In [1]:
# Spark bootstrap: findspark.init() has to run before `import pyspark`,
# so the order of the statements in this cell matters.
import findspark
findspark.init()
import pyspark
# Show the PySpark version as the cell output.
pyspark.__version__

'3.1.2'

In [2]:
# Réduction de dimension - PCA
from pyspark.sql.functions import element_at, split, col, pandas_udf, PandasUDFType, udf
from pyspark.ml.feature import PCA
from pyspark.ml.feature import StandardScaler
from pyspark.ml.linalg import Vectors, VectorUDT, DenseVector

In [3]:
# context & session
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SparkSession

In [4]:
import pyarrow
pyarrow.__version__

'7.0.0'

In [5]:
# useful packages
import pandas as pd
import numpy as np
import time
import os
# deal with image
from PIL import Image

In [6]:
# data handling
from pyspark.sql.functions import element_at, col, split
from pyspark.sql.functions import pandas_udf, PandasUDFType
# from pyspark.sql.functions import col
from pyspark.sql.functions import udf
from pyspark.sql.types import *
from typing import Iterator
# ml tasks

In [7]:
# ml tasks
from pyspark.ml.image import ImageSchema
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import PCA
# transform
from pyspark.ml.linalg import Vectors, VectorUDT
# core featurizer
import tensorflow as tf
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.image import img_to_array, load_img
tf.__version__

'2.8.0'

In [8]:
# Initiate a Spark session.
# The Python workers must run with the same interpreter as this kernel. If they start from
# another installation, its keras/tensorflow versions may not match the driver's and
# unpickling the UDF on the worker fails with an ImportError raised inside `keras`.
# This has to be set before the SparkContext is created.
import sys
os.environ['PYSPARK_PYTHON'] = sys.executable
spark = SparkSession.builder.master('local[*]').appName('P8').getOrCreate()
# Enable Arrow for the data exchange between Spark and pandas (used by the pandas UDF below).
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

In [9]:
# Render the SparkSession object as the cell output.
spark

In [10]:
# define work_path: pattern selecting the image sample; each image sits in a sub-folder
# named after its label (the label is later taken from the parent folder name)
work_path = 'Sample/**'

In [11]:
# load the images matched by work_path into a Spark DataFrame using the 'image' data source
# nb. spaces were removed from the folder names beforehand; the inferschema option is optional here
start = time.perf_counter()
df_img = spark.read.format('image').load(work_path, inferschema=True) 
stop = time.perf_counter()
print(f'data load with spark.read, elapsed time: {stop - start:0.2f}s')

data load with spark.read, elapsed time: 3.20s


In [12]:
# number of images in this sample (one DataFrame row per image)
df_img.count()

13

In [13]:
# time how long it takes to display the first rows of the image DataFrame
start = time.perf_counter()
df_img.show(20)
stop = time.perf_counter()
print(f'df_img.show, elapsed time: {stop - start:0.2f}s')

+--------------------+
|               image|
+--------------------+
|{file:///c:/Users...|
|{file:///c:/Users...|
|{file:///c:/Users...|
|{file:///c:/Users...|
|{file:///c:/Users...|
|{file:///c:/Users...|
|{file:///c:/Users...|
|{file:///c:/Users...|
|{file:///c:/Users...|
|{file:///c:/Users...|
|{file:///c:/Users...|
|{file:///c:/Users...|
|{file:///c:/Users...|
+--------------------+

data load with spark.read, elapsed time: 1.65s


In [14]:
# display the DataFrame schema (a single `image` struct column at this point)
df_img.printSchema()

root
 |-- image: struct (nullable = true)
 |    |-- origin: string (nullable = true)
 |    |-- height: integer (nullable = true)
 |    |-- width: integer (nullable = true)
 |    |-- nChannels: integer (nullable = true)
 |    |-- mode: integer (nullable = true)
 |    |-- data: binary (nullable = true)



In [15]:
# origin detail: print a single record vertically, without truncating the URI
df_img.select('image.origin').show(n=1, truncate=False, vertical=True)

-RECORD 0-----------------------------------------------------------------------------------------------------------------
 origin | file:///c:/Users/steph/Documents/Formation_Data_Scientist/P8_Lanchec_Stephane/Sample/cabbage_white_1/r0_172.jpg 
only showing top 1 row



In [16]:
# the label is the name of the folder holding the image,
# i.e. the second-to-last element of the '/'-separated origin URI
df_img = df_img.withColumn(
    'label',
    element_at(split(col('image.origin'), "/"), -2),
)

In [17]:
# show the first 3 rows with every field of the image struct in its own column (values truncated)
df_img.select(
    *[f'image.{field}' for field in ('origin', 'height', 'width', 'nChannels', 'mode', 'data')],
    'label',
).show(n=3, truncate=True)

+--------------------+------+-----+---------+----+--------------------+---------------+
|              origin|height|width|nChannels|mode|                data|          label|
+--------------------+------+-----+---------+----+--------------------+---------------+
|file:///c:/Users/...|   714|  721|        3|  16|[FF FF FF FF FF F...|cabbage_white_1|
|file:///c:/Users/...|   713|  715|        3|  16|[FF FF FF FF FF F...|cabbage_white_1|
|file:///c:/Users/...|   711|  713|        3|  16|[FF FF FF FF FF F...|cabbage_white_1|
+--------------------+------+-----+---------+----+--------------------+---------------+
only showing top 3 rows



In [18]:
# model for featurization, last layers truncated.
# The driver-side model must carry the pre-trained ImageNet weights: its weights are the ones
# broadcast to the workers. With weights=None the network is randomly initialised and the
# extracted features carry no information.
# nb. spark workers need to access the model and its weights
conv_base = VGG16(
    include_top=False,
    weights='imagenet',
    pooling='max',
    input_shape=(100, 100, 3))

In [19]:
# verify that the top (classification) layers are removed: list the layers of the truncated model
conv_base.summary()

Model: "vgg16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 100, 100, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 100, 100, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 100, 100, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 50, 50, 64)        0         
                                                                 
 block2_conv1 (Conv2D)       (None, 50, 50, 128)       73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 50, 50, 128)       147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 25, 25, 128)       0     

In [20]:
# broadcast the driver model's weights so that every node gets a copy;
# conv_base_init() reads them back through conv_base_weights.value
conv_base_weights = spark.sparkContext.broadcast(conv_base.get_weights())

In [21]:
# make conv_base callable
def conv_base_init():
    """Return a VGG16 without its top layers, loaded with the broadcast weights.

    The architecture is rebuilt with ``weights=None`` and the values held by the
    ``conv_base_weights`` broadcast variable are then copied into it.
    """
    model = VGG16(
        input_shape=(100, 100, 3),
        pooling='max',
        include_top=False,
        weights=None)
    # The broadcast must not be created here: this function runs on the workers,
    # where no SparkContext is available. Only its value is read.
    model.set_weights(conv_base_weights.value)
    return model

In [22]:
# function to get tensors from batch path
def gettensorfrompath(image_path, target_size=(100, 100)):
    """Load one image from its origin URI and return the preprocessed array.

    Parameters
    ----------
    image_path : str
        Image location as stored in ``image.origin`` (e.g. ``file:///c:/dir/img.jpg``
        or ``file:///home/me/img.jpg``). A plain path without a scheme is used as is.
    target_size : tuple of int, optional
        ``(height, width)`` the image is resized to while loading. The default matches
        the ``input_shape=(100, 100, 3)`` of the featurization model, and gives every
        image the same shape so that a batch can be stacked.

    Returns
    -------
    The array produced by ``img_to_array`` after ``preprocess_input``.
    """
    path = image_path
    if path.startswith("file:"):
        # Keep exactly one leading slash: 'file:///a/b' and 'file:/a/b' both give '/a/b'.
        path = "/" + path[len("file:"):].lstrip("/")
        # Windows drive letter: '/c:/Users/...' is not a valid path, 'c:/Users/...' is.
        if len(path) > 2 and path[1].isalpha() and path[2] == ":":
            path = path[1:]
    img = load_img(path, target_size=target_size)
    x = img_to_array(img)
    x = preprocess_input(x)
    return x

In [23]:
# target pandas user defined function to make operation on dataframe with pyspark.sql
@pandas_udf('array<double>')
def featurize(images_data_iter: Iterator[pd.DataFrame]) -> Iterator[pd.Series]:
    """Iterator pandas UDF turning batches of image structs into CNN feature vectors.

    Each incoming batch is a DataFrame holding the fields of the image struct; only
    its ``origin`` field is used, to reload the image from its location. One Series
    is yielded per batch, holding one flat feature vector per image.
    """
    # Build the model once for the whole iterator, not once per batch.
    conv_base = conv_base_init()
    for image_data in images_data_iter:
        tensors = [gettensorfrompath(origin) for origin in image_data['origin']]
        batch = np.stack(tensors)
        # option is to enable batch_size
        features = conv_base.predict(batch)
        # Cast explicitly so the values match the declared 'array<double>' return type
        # (the model presumably outputs float32 - TODO confirm).
        yield pd.Series([f.flatten().astype(np.float64) for f in features])

In [24]:
# schema check before featurization: the `label` column now sits next to the `image` struct
df_img.printSchema()

root
 |-- image: struct (nullable = true)
 |    |-- origin: string (nullable = true)
 |    |-- height: integer (nullable = true)
 |    |-- width: integer (nullable = true)
 |    |-- nChannels: integer (nullable = true)
 |    |-- mode: integer (nullable = true)
 |    |-- data: binary (nullable = true)
 |-- label: string (nullable = true)



In [25]:
# apply featurization: the whole `image` struct column is handed to the UDF, which reads its
# `origin` field; cache() marks the result for reuse by the following cells
featurized_df = df_img.withColumn('features', featurize('image')).cache()

In [26]:
# fetch the first 3 featurized rows; this is where the UDF actually runs on the Python workers
featurized_df.head(3)

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "C:\Spark\spark-3.1.2-bin-hadoop3.2\python\lib\pyspark.zip\pyspark\worker.py", line 588, in main
  File "C:\Spark\spark-3.1.2-bin-hadoop3.2\python\lib\pyspark.zip\pyspark\worker.py", line 336, in read_udfs
  File "C:\Spark\spark-3.1.2-bin-hadoop3.2\python\lib\pyspark.zip\pyspark\worker.py", line 249, in read_single_udf
  File "C:\Spark\spark-3.1.2-bin-hadoop3.2\python\lib\pyspark.zip\pyspark\worker.py", line 69, in read_command
  File "C:\Spark\spark-3.1.2-bin-hadoop3.2\python\lib\pyspark.zip\pyspark\serializers.py", line 160, in _read_with_length
    return self.loads(obj)
  File "C:\Spark\spark-3.1.2-bin-hadoop3.2\python\lib\pyspark.zip\pyspark\serializers.py", line 430, in loads
    return pickle.loads(obj, encoding=encoding)
  File "C:\Users\steph\anaconda3\lib\site-packages\keras\__init__.py", line 25, in <module>
    from keras import models
  File "C:\Users\steph\anaconda3\lib\site-packages\keras\models.py", line 19, in <module>
    from keras import backend
  File "C:\Users\steph\anaconda3\lib\site-packages\keras\backend.py", line 39, in <module>
    from tensorflow.python.eager.context import get_config
ImportError: cannot import name 'get_config' from 'tensorflow.python.eager.context' (C:\Users\steph\anaconda3\lib\site-packages\tensorflow\python\eager\context.py)


In [None]:
# time how long it takes to display the featurized DataFrame
start = time.perf_counter()
featurized_df.show()
stop = time.perf_counter()
print(f'featurized_df.show, elapsed time: {stop - start:0.2f}s')

In [None]:
# get the number of partitions of the featurized DataFrame
print(featurized_df.rdd.getNumPartitions())

In [None]:
def pca_transformation(df, n_components=6, col_image='image'):
    """
    Standardize a feature column and project it onto its first principal components.

    Parameters:
    df (pyspark DataFrame): holds a column with the numeric image data (array of numbers)
    n_components (int): number of principal components to keep
    col_image (str): name of the column holding the image data

    Returns:
    pyspark DataFrame: `df` where `col_image` has been converted to a dense vector, with two
    extra columns, `scaledFeatures` (standardized vectors) and `pcaFeatures` (PCA projection);
    rows whose `pcaFeatures` is null are dropped.

    Side effect: prints the elapsed time.
    """

    # Start the timer
    start_time = time.time()

    # Spark ML estimators work on vector columns: convert the array column to dense vectors
    to_dense_vector = udf(lambda values: Vectors.dense(values), VectorUDT())
    df = df.withColumn(col_image, to_dense_vector(col_image))

    # Center and scale every dimension before the PCA
    standardizer = StandardScaler(inputCol=col_image, outputCol="scaledFeatures",
                                  withStd=True, withMean=True)
    df = standardizer.fit(df).transform(df)

    # Fit the PCA, then project the data onto the first n_components components
    pca = PCA(k=n_components, inputCol='scaledFeatures', outputCol='pcaFeatures')
    df = pca.fit(df).transform(df)

    df = df.filter(df.pcaFeatures.isNotNull())

    # Report the elapsed time
    print("Temps d'execution {:.2f} secondes".format(time.time() - start_time))

    return df

In [None]:
# the CNN feature arrays live in the 'features' column ('image' is the raw image struct)
pca_transformation(featurized_df, n_components=4, col_image='features')

In [None]:
# PCA works on ML vector columns: wrap every feature array into a dense vector
array_to_vector_udf = udf(lambda values: Vectors.dense(values), VectorUDT())
vectorized_df = featurized_df.withColumn(
    'vectors',
    array_to_vector_udf(col('features')),
)
vectorized_df.show()

In [None]:
# reduce with PCA - fit with a large k (20) first, to decide afterwards how many
# principal components are worth keeping; the fit is timed
start = time.perf_counter()
pca = PCA(k=20, inputCol='vectors', outputCol='pca_vectors')
# `model` is the fitted PCA model, `pca` stays the unfitted estimator
model = pca.fit(vectorized_df)
stop = time.perf_counter()
print(f'pca - fit best k nb, elapsed time: {stop - start:0.2f}s')

### PCA Dimension reduction

In [None]:
def display_scree_plot(pca):
    '''Display a scree plot for a fitted PCA model.

    Bars show the percentage of variance explained by each principal component,
    the red line shows the cumulative percentage.

    Parameters
    ----------
    pca : fitted PCA model
        Either a Spark ML ``PCAModel`` (exposes ``explainedVariance``) or an object
        exposing a scikit-learn style ``explained_variance_ratio_``.

    Raises
    ------
    AttributeError
        If ``pca`` exposes neither attribute, e.g. an estimator that was not fitted.
    '''
    # Imported here because none of the notebook's import cells brings in matplotlib.
    import matplotlib.pyplot as plt

    if hasattr(pca, 'explainedVariance'):
        ratios = pca.explainedVariance.toArray()
    elif hasattr(pca, 'explained_variance_ratio_'):
        ratios = pca.explained_variance_ratio_
    else:
        raise AttributeError(
            "display_scree_plot expects a fitted PCA model exposing "
            "'explainedVariance' or 'explained_variance_ratio_'")

    scree = np.asarray(ratios, dtype=float) * 100
    ranks = np.arange(1, len(scree) + 1)
    plt.bar(ranks, scree)
    plt.plot(ranks, scree.cumsum(), c="red", marker='o')
    plt.xlabel("Number of principal components")
    plt.ylabel("Percentage explained variance")
    plt.title("Scree plot")
    plt.show(block=False)

In [None]:
# Spark's PCA estimator needs k and inputCol, and fit() returns the fitted model
# (the estimator itself holds no results): keep that fitted model in `pca` for the plot.
pca = PCA(k=20, inputCol='vectors', outputCol='pca_vectors').fit(vectorized_df)

In [None]:
# scree plot of the variance explained by each principal component
display_scree_plot(pca)

In [None]:
# quick sanity check of the RDD API
# Reuse the SparkContext owned by the existing session instead of creating a second one.
sc = spark.sparkContext
data = range(1,1000)
rdd = sc.parallelize(data)
rdd.collect()