In [11]:
# Librairies
%matplotlib inline
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


 
# image
from PIL import Image
import io
from io import StringIO
from skimage.io import imread, imshow

import cv2

# S3 AWS
import boto3
import configparser

# Spark
import findspark  #Findspark : Make Spark available in Jupyter notebook
findspark.init('/home/ubuntu/spark-3.3.0-bin-hadoop3')

# Pyspark.
import pyspark
from pyspark.sql import *
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import input_file_name, udf, col, pandas_udf, PandasUDFType
from pyspark.sql.types import ArrayType, StringType, IntegerType, DoubleType, DataType, FloatType
from pyspark.ml.linalg import Vectors, VectorUDT, DenseVector
from pyspark.ml.feature import StandardScaler, PCA


# Tensorflow
import tensorflow as tf
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.image import img_to_array, load_img, array_to_img
from tensorflow.keras.preprocessing import image

import warnings
warnings.filterwarnings('ignore')

In [12]:
# We need the aws credentials in order to be able to access the s3 bucket. 
# We can use the configparser package to read the credentials from the standard aws file.

config = configparser.ConfigParser()
config.read(os.path.expanduser("~/.aws/credentials"))
aws_credentials = 'default'
AWS_ACCESS_KEY_ID = config.get(aws_credentials, "aws_access_key_id") 
AWS_SECRET_ACCESS_KEY = config.get(aws_credentials, "aws_secret_access_key")

In [13]:
config = configparser.ConfigParser()
config.read(os.path.expanduser("~/.aws/config"))
aws_config = 'default'
RGION_NAME = config.get(aws_config, "region") 

In [14]:
# Retrieve the list of existing buckets
session = boto3.session.Session(aws_access_key_id=AWS_ACCESS_KEY_ID, 
                                aws_secret_access_key=AWS_SECRET_ACCESS_KEY)
s3_client = session.client(service_name='s3', region_name= RGION_NAME)

# Output the bucket names
response = s3_client.list_buckets()

# Output the bucket names
print('Existing buckets:')
for bucket in response['Buckets']:
    print(f'  {bucket["Name"]}')
BUCKET_NAME = response['Buckets'][0]["Name"]

Existing buckets:
  ocr-projet8-fruits


In [15]:
# Configuration chemins S3
DATASET_PATH = 's3://' + BUCKET_NAME + '/'
DIR_PATH = 'Training/'
# DIR_PATH = 'Test_s3/'
DATASET_PATH

's3://ocr-projet8-fruits/'

In [16]:
# Create a reusable Paginator
paginator = s3_client.get_paginator('list_objects')

# Create a PageIterator from the Paginator
page_iterator = paginator.paginate(Bucket=BUCKET_NAME)

for page in page_iterator:
    print(len(page['Contents']))

ld_set_folders = s3_client.list_objects_v2(Bucket=BUCKET_NAME,
                                           Prefix=DIR_PATH, Delimiter='/')
print(ld_set_folders.get('CommonPrefixes'))

1000
1000
1000
1000
1000
575
[{'Prefix': 'Training/apple_6/'}, {'Prefix': 'Training/apple_braeburn_1/'}, {'Prefix': 'Training/apple_crimson_snow_1/'}, {'Prefix': 'Training/apple_golden_1/'}, {'Prefix': 'Training/apple_golden_3/'}, {'Prefix': 'Training/apple_granny_smith_1/'}, {'Prefix': 'Training/apple_hit_1/'}, {'Prefix': 'Training/apple_pink_lady_1/'}, {'Prefix': 'Training/apple_red_1/'}, {'Prefix': 'Training/apple_red_2/'}, {'Prefix': 'Training/apple_red_3/'}, {'Prefix': 'Training/apple_red_yellow_1/'}, {'Prefix': 'Training/apple_rotten_1/'}, {'Prefix': 'Training/cabbage_white_1/'}, {'Prefix': 'Training/carrot_1/'}, {'Prefix': 'Training/cucumber_1/'}, {'Prefix': 'Training/cucumber_3/'}, {'Prefix': 'Training/eggplant_violet_1/'}, {'Prefix': 'Training/pear_1/'}, {'Prefix': 'Training/pear_3/'}, {'Prefix': 'Training/zucchini_1/'}, {'Prefix': 'Training/zucchini_dark_1/'}]


In [17]:
path_folders = []
result = s3_client.list_objects(Bucket=BUCKET_NAME, Prefix=DIR_PATH, Delimiter='/')
for o in result.get('CommonPrefixes'):
    #print ('sub folder : ', o.get('Prefix'))
    l= DATASET_PATH + DIR_PATH + o.get('Prefix') + '*'
    path_folders.append(l)
path_folders

['s3://ocr-projet8-fruits/Training/Training/apple_6/*',
 's3://ocr-projet8-fruits/Training/Training/apple_braeburn_1/*',
 's3://ocr-projet8-fruits/Training/Training/apple_crimson_snow_1/*',
 's3://ocr-projet8-fruits/Training/Training/apple_golden_1/*',
 's3://ocr-projet8-fruits/Training/Training/apple_golden_3/*',
 's3://ocr-projet8-fruits/Training/Training/apple_granny_smith_1/*',
 's3://ocr-projet8-fruits/Training/Training/apple_hit_1/*',
 's3://ocr-projet8-fruits/Training/Training/apple_pink_lady_1/*',
 's3://ocr-projet8-fruits/Training/Training/apple_red_1/*',
 's3://ocr-projet8-fruits/Training/Training/apple_red_2/*',
 's3://ocr-projet8-fruits/Training/Training/apple_red_3/*',
 's3://ocr-projet8-fruits/Training/Training/apple_red_yellow_1/*',
 's3://ocr-projet8-fruits/Training/Training/apple_rotten_1/*',
 's3://ocr-projet8-fruits/Training/Training/cabbage_white_1/*',
 's3://ocr-projet8-fruits/Training/Training/carrot_1/*',
 's3://ocr-projet8-fruits/Training/Training/cucumber_1/*',

### Enable access to s3 data from Spark

In order to be able to read data via S3A we need a couple of dependencies / 
we need to make sure the hadoop-aws and aws-java-sdk packages are available when we load spark:

In [18]:
# # Add an environnement variable
# os.environ['PYSPARK_SUBMIT_ARGS'] = "--packages com.amazonaws:aws-java-sdk-pom:1.10.34,org.apache.hadoop:hadoop-aws:3.3.0 pyspark-shell"

In [19]:
spark = (SparkSession
             .builder.master('local[*]')
             .appName('fruits')
             .getOrCreate()
            )
sc = SparkContext.getOrCreate()
sc.setLogLevel('WARN')

In [20]:
spark

In [None]:
zipped = zip(path_folders)
columns = ['path_folders']

df_images = spark.createDataFrame(zipped, columns)
df_images.show()

[Stage 0:>                                                          (0 + 1) / 1]

In [None]:
rdd = spark.sparkContext.parallelize(path_folders)
df = rdd.toDF(["path_folders"])
df.show()

[Stage 0:>                                                          (0 + 1) / 1]