In [15]:
import sagemaker
import tensorflow as tf

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

from sagemaker import get_execution_role
import sagemaker_pyspark


# tensorflow
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.preprocessing.image import img_to_array, load_img


# Additionl libraries
import numpy as np
import random
import os
from PIL import Image
import numpy as np
# import matplotlib.pyplot as plt
# import tarfile
import urllib
import boto3
from sklearn import decomposition
import pandas as pd

In [2]:
print('sagemaker version', sagemaker.__version__)
print('tensorflow version', tf.__version__)
print('numpy version', np.__version__)
print('boto3 version',boto3.__version__)

sagemaker version 2.59.3
tensorflow version 2.3.4
numpy version 1.18.5
boto3 version 1.18.45


In [None]:
# First, we import the necessary modules and create the SparkSession with the SageMaker-Spark dependencies attached

In [3]:
role = get_execution_role()

# Configure Spark to use the SageMaker Spark dependency jars
jars = sagemaker_pyspark.classpath_jars()

classpath = ":".join(sagemaker_pyspark.classpath_jars())

# See the SageMaker Spark Github to learn how to connect to EMR from a notebook instance
spark = (
    SparkSession.builder.config("spark.driver.extraClassPath", classpath)
    .master("local[*]")
    .getOrCreate()
)

In [4]:
# connect to s3 test

In [5]:
bucket_name = 'ocfruits17102021'

In [6]:
s3 = boto3.resource('s3')
my_bucket = s3.Bucket(bucket_name)
dir_name = ''
for image in my_bucket.objects.limit(3):
    print(image.key)

Fruits/Apple_Braeburn/0_100.jpg
Fruits/Apple_Braeburn/100_100.jpg
Fruits/Apple_Braeburn/101_100.jpg


In [7]:
# Preprocessing des images

In [8]:
# utilisation de vgg16

In [9]:
model = VGG16(weights=None, include_top=False) # La suppression des dernières couches se fait en ajoutant l'argument  include_top = False  lors de l'import du modèle pré-entraîné.
model.summary()

Model: "vgg16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None, None, 3)]   0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, None, None, 64)    1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, None, None, 64)    36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, None, None, 64)    0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, None, None, 128)   73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, None, None, 128)   147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, None, None, 128)   0     

In [10]:
s3 = boto3.resource('s3', region_name='eu-west-1')
bucket = s3.Bucket(bucket_name)
features_list = []


for file in s3.Bucket(bucket_name).objects.limit(1000):
        path = file.key
         # chargement de l'image
        response = file.get()
        file_stream = response['Body']
        im = Image.open(file_stream)
        # transformation de l'image en np.array
        image = np.asarray(im)
     
        # reshape data for the model
        image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
        # prepare the image for the VGG model
        image = preprocess_input(image)
        # predict the probability across all output classes
    
        feature = model.predict(image)
        features_np = np.array(feature) # transformation de la liste en array
        features_list.append(features_np.flatten()) # flatten : Return a copy of the array collapsed into one dimension.
    
        
        

A. Création du bag of features

In [11]:
features = np.asarray(features_list)

B. Réduction de dimension du bag of features

In [12]:
# Typically, we want the explained variance to be between 95–99%. In Scikit-learn we can set it like this:
n_comp = 0.95


In [13]:
print("Dimensions avant réduction PCA : ", features.shape)
pca = decomposition.PCA(n_components = n_comp)
features_pca = pca.fit_transform(features) # projection
print("Dimensions après réduction PCA : ", features_pca.shape)

Dimensions avant réduction PCA :  (1000, 4608)
Dimensions après réduction PCA :  (1000, 87)


In [16]:
df = pd.DataFrame(features_pca)

In [17]:
df_write = spark.createDataFrame(df)

In [20]:
df_write.write.option("header","true").csv("s3a://ocfruits17102021/csv/fruits")