In [None]:
# Libraries loading
import os
import numpy as np                                
import pandas as pd                               
import matplotlib.pyplot as plt
from IPython.display import Image                 
from IPython.display import display
from PIL import Image
from io import StringIO

# Importing AWS libraries: S3, Sagemaker, PySpark
# S3
import boto3
import botocore.session

# Sagemaker
import sagemaker
from sagemaker import get_execution_role
from sagemaker.predictor import csv_serializer
import sagemaker_pyspark

# Pyspark
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.functions import input_file_name
from pyspark.sql.functions import udf
from pyspark.sql.types import *
from pyspark.sql.functions import col
from pyspark.sql.functions import regexp_replace
from pyspark.sql.functions import split
from pyspark.sql.functions import monotonically_increasing_id, row_number
from pyspark.sql import Window
from pyspark.sql.functions import udf
from pyspark.ml.linalg import Vectors, VectorUDT

# Dimension reduction - PCA
from pyspark.ml.feature import PCA
from pyspark.ml.feature import StandardScaler

# Tensorflow
#!pip install tensorflow
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from keras.preprocessing import image

# Misc
import time
from io import BytesIO

In [None]:
# Spark session initialization
def init_spark_session(bucket=''):
    '''Create (or reuse) the local Spark session used to read images from S3.

    AWS credentials are resolved through botocore's default credential chain
    instead of being written in the notebook, so no secret ends up in the file.

    Input:
    - bucket : S3 bucket name containing images

    Output:
    - sc : SparkContext of the session
    - spark : SparkSession
    - path_img : "s3a://<bucket>/**" glob covering the objects of the bucket

    Raises:
    - ValueError if bucket is empty
    - RuntimeError if botocore cannot resolve any AWS credentials
    '''
    if not bucket:
        raise ValueError("bucket must be a non-empty S3 bucket name")

    # Remote access to the S3 bucket: build the path from the argument,
    # not from a notebook-level global.
    path_img = "s3a://" + bucket + "/**"

    # Never hardcode keys: ask botocore for whatever credentials are configured.
    credentials = botocore.session.get_session().get_credentials()
    if credentials is None:
        raise RuntimeError(
            "No AWS credentials found; configure them in the environment "
            "or attach a role before starting the Spark session"
        )
    frozen = credentials.get_frozen_credentials()

    conf = (SparkConf()
        .set("spark.driver.extraClassPath", ":".join(sagemaker_pyspark.classpath_jars())))

    builder = (
        SparkSession
        .builder
        .config(conf=conf)
        .config('fs.s3a.access.key', frozen.access_key)
        .config('fs.s3a.secret.key', frozen.secret_key)
        .config("spark.driver.memory", "32g")
        .master('local[*]')
        .appName('P8_Fruits')
    )

    # Temporary (role / STS) credentials come with a session token that the
    # S3A connector only honours with the temporary-credentials provider.
    if frozen.token:
        builder = (
            builder
            .config('fs.s3a.session.token', frozen.token)
            .config('fs.s3a.aws.credentials.provider',
                    'org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider')
        )

    spark = builder.getOrCreate()
    sc = spark.sparkContext

    return sc, spark, path_img

In [None]:
# Data loading
def load_data(path_img):
    '''Load the images into Spark dataframes.

    Uses the notebook-level `spark` session, which must already exist.

    Input:
    - path_img: path (glob) of the images to load

    Output:
    - df_img_see: dataframe with the full `image` struct, its flattened
      fields (origin, height, width, nChannels, mode, data) and `Category`
    - df_img_feat: same columns without the `image` struct
    '''
    # Timer
    start = time.time()

    # SPARK dataframe loading
    df_img = spark.read.format("image").load(path_img, inferschema=True)
    print('Images loaded - DONE')

    df_img = df_img.withColumn("fileName", regexp_replace('image.origin', 'dbfs:/mnt/images/', ''))
    # The category is the 4th '/'-separated token of the origin, i.e. the
    # folder right under the bucket for "s3a://<bucket>/<Category>/<file>".
    # NOTE(review): assumes images sit exactly one folder below the bucket root.
    split_col = split(df_img['fileName'], '/')
    df_img = df_img.withColumn('Category', split_col.getItem(3))

    df_img_see = df_img.select('image', 'image.origin', "image.height", "image.width", "image.nChannels", "image.mode", "image.data", 'Category')
    df_img_feat = df_img.select('image.origin', "image.height", "image.width", "image.nChannels", "image.mode", "image.data", 'Category')

    # Report the real elapsed time; strftime('%S') would wrap around every 60 s.
    elapsed = time.time() - start
    print('Images loaded in: {:.1f} secondes'.format(elapsed))

    return df_img_see, df_img_feat

In [None]:
# # Function returning information about a Spark dataframe
# def spark_shape(dfs):
#     '''Return information about a Spark dataframe:
#     Input:
#     - dfs: Spark dataframe
    
#     Output:
#     - number of records
#       int
#     - number of columns
#       int
#     '''
#     return (dfs.count(), len(dfs.columns))

In [None]:
# # Function determining the category of an image
# def parse_categorie(path):
#     '''Return the category of an image from its path
#     Input:
#     - full path of the image
#       string
#     Output:
#     - category of the image
#       string
#     '''
#     if len(path) > 0:
#         # Image category
#         return path.split('/')[-2]
#     else:
#         return ''

In [None]:
# Display images
def display_image(dfs, Category):
    '''Return the pixels of the first image of a category.

    Input:
    - dfs: SPARK dataframe with an `image` struct column and a `Category` column
    - Category: image category to look up

    Output:
    - numpy array of shape (height, width, nChannels); colour channels are
      reordered from the stored B, G, R order to R, G, B

    Raises:
    - ValueError if no row matches the category
    '''
    # Fetch a single row once: dimensions and pixels then come from the same
    # image, and no full column is collected to the driver.
    row = dfs.filter(dfs.Category == Category).first()
    if row is None:
        raise ValueError("No image found for category {!r}".format(Category))

    img = row.image
    pixels = np.array(img.data).reshape(img.height, img.width, img.nChannels)

    if img.nChannels == 3:
        # BGR -> RGB
        return pixels[:, :, ::-1]
    if img.nChannels > 3:
        # Swap the three colour channels, keep any extra channel in place.
        # NOTE(review): assumes extra channels (e.g. alpha) follow B, G, R.
        order = [2, 1, 0] + list(range(3, img.nChannels))
        return pixels[:, :, order]

    # Fewer than three channels: there is no channel order to fix.
    return pixels

In [None]:
# Features extraction with VGG16
def extract_features_vgg16(bucket_name):

    '''Features extraction with VGG16
    Input: S3 bucket name

    Output: list with one feature vector (flat list of floats) per object of
    the bucket whose key ends with 'jpg', in bucket listing order

    NOTE(review): the result carries no key, so callers pair it with other
    data by position only; confirm that ordering matches on the caller side.
    '''
    # Timer
    start = time.time()

    model = VGG16(include_top=False, weights='imagenet', pooling='max', input_shape=(224, 224, 3))
    model.summary()

    # AWS S3 ressources
    s3_client = boto3.client("s3")
    s3 = boto3.resource('s3')
    bucket = s3.Bucket(bucket_name)

    vgg16_features = []

    for my_bucket_object in bucket.objects.all():
        if not my_bucket_object.key.endswith('jpg'):
            continue

        file_byte_string = s3_client.get_object(Bucket=bucket_name, Key=my_bucket_object.key)['Body'].read()

        # Image loading; force 3 channels so grayscale / palette / RGBA files
        # still fit the (1, 224, 224, 3) input expected by the model.
        img = Image.open(BytesIO(file_byte_string)).convert('RGB')

        # Image redimensionning in 224*224 px
        img_redim = img.resize((224, 224))

        # Image to a batch of one array
        img_array = image.img_to_array(img_redim).reshape((1, 224, 224, 3))

        # Images pre-processing
        img_array = preprocess_input(img_array)

        # Features extraction for an image
        feature = model.predict(img_array).ravel().tolist()

        vgg16_features.append(feature)

    # Report the real elapsed time; strftime('%S') would wrap around every 60 s.
    elapsed = time.time() - start
    print('Features extraction loading time: {:.1f} secondes'.format(elapsed))

    return vgg16_features

In [None]:
# Features in SPARK dataframe 
def features_pyspark_df(features, df_img):
    '''Attach extracted features to the images dataframe.

    Both sides receive a 1-based `row_idx` (row_number over
    monotonically_increasing_id) and are joined on it, so features are paired
    with images purely by position. Uses the notebook-level `spark` session.

    Input:
    - features: list of feature vectors, one per image
    - df_img: pyspark dataframe describing the images

    Output:
    - pyspark dataframe with the image columns plus a `features` column
    '''
    def _with_row_idx(frame):
        # Number the rows in their current order
        ordering = Window.orderBy(monotonically_increasing_id())
        return frame.withColumn("row_idx", row_number().over(ordering))

    rows = [(vector,) for vector in features]
    features_df = spark.createDataFrame(rows, ['features'])

    indexed_images = _with_row_idx(df_img)
    indexed_features = _with_row_idx(features_df)

    same_position = indexed_images.row_idx == indexed_features.row_idx
    return indexed_images.join(indexed_features, same_position).drop("row_idx")

In [None]:
def preprocess_pca(dataframe):
    '''Prepare the features for PCA.

    Steps:
    - convert the `features` column into dense vectors (`features_vectors`)
    - standardize them (centered, unit standard deviation) into `features_scaled`

    Input : dataframe : Images dataframe with a `features` column
    Output : the same dataframe with `features_vectors` and `features_scaled` added
    '''
    def _to_dense(values):
        return Vectors.dense(values)

    # Features to dense vector conversion
    as_vector = udf(_to_dense, VectorUDT())
    with_vectors = dataframe.withColumn('features_vectors', as_vector('features'))

    # Standardization for PCA: fit the scaler, then apply it to the same data
    scaler = StandardScaler(
        inputCol="features_vectors",
        outputCol="features_scaled",
        withStd=True,
        withMean=True,
    )
    return scaler.fit(with_vectors).transform(with_vectors)

In [None]:
def optimal_k_search(dataframe, nb_comp=13, threshold=0.95):
    '''
       Search for the smallest number of principal components whose
       cumulative explained variance reaches `threshold`.

       param : dataframe : Images dataframe with a `features_scaled` column
       param : nb_comp : number of components fitted for the search
       param : threshold : target share of explained variance (default 95%)
       return : k : number of components (1-based count) reaching the
                threshold, or the number of fitted components when the
                threshold is not reached within nb_comp components
    '''

    pca = PCA(k=nb_comp,
              inputCol="features_scaled",
              outputCol="features_pca")

    model_pca = pca.fit(dataframe)
    cumulative = np.cumsum(model_pca.explainedVariance.toArray())

    # visual check of the cumulative explained variance
    plt.plot(np.arange(len(cumulative)) + 1, cumulative, c="red", marker='o')
    plt.xlabel("Nb components")
    plt.ylabel("% variance")
    plt.show(block=False)

    reached = np.flatnonzero(cumulative >= threshold)
    if reached.size:
        # Index i means i + 1 components are needed.
        k = int(reached[0]) + 1
        print("{} principal components explain {:.0%} of the information".format(k, threshold))
    else:
        # Threshold never reached: keep every fitted component and say so.
        k = len(cumulative)
        print("{} principal components only explain {:.1%} of the information "
              "(target {:.0%}); increase nb_comp".format(k, float(cumulative[-1]), threshold))

    return k

In [None]:
# Saving results in csv file on a S3 bucket
def save_csv_bucket_s3(pca_matrix, file_name, bucket_name):

    '''Write a pyspark dataframe as a csv object in a S3 bucket.

    The dataframe is converted to pandas, serialized to csv text in memory
    and uploaded with a single put.

    Input:
    - pca_matrix: pyspark dataframe to save
    - file_name: key (file name) of the csv object to create
    - bucket_name: S3 bucket name
    '''
    # Handle on the destination object
    destination = boto3.resource('s3').Object(bucket_name, file_name)

    # pyspark -> pandas -> csv text held in an in-memory buffer
    text_buffer = StringIO()
    pandas_frame = pca_matrix.toPandas()
    pandas_frame.to_csv(text_buffer)

    # Upload the csv content
    destination.put(Body=text_buffer.getvalue())
    

In [None]:
# Name of the S3 bucket holding the source images (used by the session setup and feature extraction cells)
bucket_name = 'h7obucket'

In [None]:
# Spark session initialization; `path` is the s3a:// glob returned for the bucket
sc, spark, path = init_spark_session(bucket=bucket_name)

In [None]:
# Display the SparkContext (shown as the cell output because it is the last expression)
sc

### Dataset Overview

In [None]:
# Dataframe loading
# load_data returns (df_img_see, df_img_feat): the first keeps the `image`
# struct for display, the second is the lighter frame used for features.
images_see, images_feat = load_data(path)

In [None]:
# Show origin and category of the first 5 images
images_feat.select('origin', 'Category').show(5)

In [None]:
# # Size of the pyspark dataframe
# spark_shape(images_feat)

In [None]:
# image_cat = display_image(images_see, "apple_rotten_1")
# print(image_cat.shape)
# Image.fromarray(image_cat, 'RGB')

In [None]:
# Features extraction: one VGG16 feature vector per object of the bucket whose key ends with 'jpg'
image_features = extract_features_vgg16(bucket_name)

In [None]:
# Adding features to the pyspark dataframe (rows and features are paired by position)
images_feat_df = features_pyspark_df(image_features, images_feat)

In [None]:
# Preview the first 5 rows of the dataframe with features attached
images_feat_df.show(5)

### PCA Dimension reduction

In [None]:
# Convert features to dense vectors and standardize them before PCA
pca_df = preprocess_pca(images_feat_df)

In [None]:
# Number of components explaining 95% of the variance (searched with the function's default nb_comp)
n_components = optimal_k_search(pca_df)

In [None]:
# Fit a PCA keeping the n_components found above, reading the standardized features
pca = PCA(k=n_components, inputCol='features_scaled', outputCol='vectors_pca')
model_pca = pca.fit(pca_df)

# Project the images onto the principal components (adds the `vectors_pca` column)
df_post_pca = model_pca.transform(pca_df)

In [None]:
# df_post_pca.show()

In [None]:
# Saving the post-PCA dataframe as a csv object in a separate results bucket
bucket_name_matrix = 'results-fruits-bucket'
save_csv_bucket_s3(df_post_pca, 'post_pca_results.csv', bucket_name_matrix)

In [None]:
# Stopping the Spark session created at the start of the notebook
spark.stop()