In [1]:
# https://docs.databricks.com/_static/notebooks/deep-learning/deep-learning-transfer-learning-keras.html

In [2]:
from pyspark.sql import SparkSession
import os 
# from pyspark.context import SparkContext
# # Get a reference to the Spark Session
# sc = SparkContext()
# spark = SparkSession(sc)



spark = SparkSession.builder\
        .master("local[*]")\
        .appName('PySpark_Fruits')\
        .getOrCreate()

sc = spark.sparkContext
sc.setSystemProperty('spark.executor.memory', '32g')


In [3]:
import pandas as pd
from PIL import Image
import numpy as np
import io

from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.preprocessing.image import img_to_array, load_img

from pyspark.sql.functions import col, pandas_udf, PandasUDFType, element_at, split, udf

from pyspark.ml.feature import PCA
# from pyspark.mllib.linalg import Vectors, VectorUDT
from pyspark.ml.linalg import Vectors, VectorUDT

Charger la dataframe

In [4]:
path = "Fruit-Images/"
images = spark.read.format("image") \
  .option("pathGlobFilter", "*.jpg") \
  .option("recursiveFileLookup", "true") \
  .load(path)

display(images.limit(5))

DataFrame[image: struct<origin:string,height:int,width:int,nChannels:int,mode:int,data:binary>]

In [5]:
images.printSchema()

root
 |-- image: struct (nullable = true)
 |    |-- origin: string (nullable = true)
 |    |-- height: integer (nullable = true)
 |    |-- width: integer (nullable = true)
 |    |-- nChannels: integer (nullable = true)
 |    |-- mode: integer (nullable = true)
 |    |-- data: binary (nullable = true)



In [6]:
images = images.withColumn('label',element_at(split(images['image.origin'],"/"),-2))

In [7]:
images.printSchema()

root
 |-- image: struct (nullable = true)
 |    |-- origin: string (nullable = true)
 |    |-- height: integer (nullable = true)
 |    |-- width: integer (nullable = true)
 |    |-- nChannels: integer (nullable = true)
 |    |-- mode: integer (nullable = true)
 |    |-- data: binary (nullable = true)
 |-- label: string (nullable = true)



Préparation du modèle : Chargement du modèle sans la dernière couche

In [8]:
model = VGG16(weights=None, include_top=False)
model.summary()

Model: "vgg16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None, None, 3)]   0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, None, None, 64)    1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, None, None, 64)    36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, None, None, 64)    0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, None, None, 128)   73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, None, None, 128)   147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, None, None, 128)   0     

In [9]:
#from pyspark import SparkContext
#SparkContext.setSystemProperty('spark.executor.memory', '32g')

bc_model_weights = sc.broadcast(model.get_weights())# utilisation de variables de diffusion

def model_fn():
    """
    Returns a VGG16 model with top layer removed and broadcasted pretrained weights.
    """
    model = VGG16(weights=None, include_top=False)
    model.set_weights(bc_model_weights.value)
    return model

* featurize a pd.Series of images
    * preprocess une image
    
utilisation de Scalar Iterator pandas pour amortir le coût du chargement
de gros models sur les "workers"

In [10]:
def preprocess(path_file):
    """
    Preprocesses raw image bytes for prediction.
    """
    path_file = path_file.replace("file://", "")
    img = load_img(path_file)
    arr = img_to_array(img)
    return preprocess_input(arr)

def featurize_series(model, path_series):
    """
    Featurize a pd.Series of raw images using the input model.
    :return: a pd.Series of image features
    """
    input = np.stack(path_series.map(preprocess))
    preds = model.predict(input)
    # For some layers, output features will be multi-dimensional tensors.
    # We flatten the feature tensors to vectors for easier storage in Spark DataFrames.
    output = [p.flatten() for p in preds]
    return pd.Series(output)

In [11]:
@pandas_udf('array<Double>', PandasUDFType.SCALAR_ITER)
def featurize_udf(path_series_iter):
    '''
    This method is a Scalar Iterator pandas UDF wrapping our featurization function.
    The decorator specifies that this returns a Spark DataFrame column of type ArrayType(FloatType).
  
    :param content_series_iter: This argument is an iterator over batches of data, where each batch
                              is a pandas Series of image data.
    '''
    # With Scalar Iterator pandas UDFs, we can load the model once and then re-use it
    # for multiple data batches.  This amortizes the overhead of loading big models.
    model = model_fn()
    for path_series in path_series_iter:
        yield featurize_series(model, path_series)



In [12]:
features_df = images.repartition(16).select(col("image.origin"), col("label"), featurize_udf("image.origin").alias("features"))

In [13]:
features_df.printSchema()

root
 |-- origin: string (nullable = true)
 |-- label: string (nullable = true)
 |-- features: array (nullable = true)
 |    |-- element: double (containsNull = true)



In [14]:
features_df.show()

+--------------------+--------------+--------------------+
|              origin|         label|            features|
+--------------------+--------------+--------------------+
|file:///home/jo/D...|Apple_Braeburn|[0.39042681455612...|
|file:///home/jo/D...|       Avocado|[0.41923102736473...|
|file:///home/jo/D...|       Apricot|[0.35134685039520...|
|file:///home/jo/D...|Apple_Braeburn|[0.38464885950088...|
|file:///home/jo/D...|       Apricot|[0.35411885380744...|
|file:///home/jo/D...|Apple_Braeburn|[0.37923756241798...|
|file:///home/jo/D...|       Avocado|[0.44469347596168...|
|file:///home/jo/D...|       Apricot|[0.35258254408836...|
|file:///home/jo/D...|       Avocado|[0.41870588064193...|
|file:///home/jo/D...|       Apricot|[0.35502418875694...|
|file:///home/jo/D...|       Avocado|[0.40119403600692...|
+--------------------+--------------+--------------------+



# Réduction de dimension des features extraites

In [15]:
to_vector_udf = udf(lambda a: Vectors.dense(a), VectorUDT())

In [16]:
# data = features_df.select("label", "origin", to_vector("features").alias("features_vec"))
data = features_df.withColumn("features_vec", to_vector_udf("features"))

In [17]:
data.printSchema()

root
 |-- origin: string (nullable = true)
 |-- label: string (nullable = true)
 |-- features: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- features_vec: vector (nullable = true)



In [18]:
data.show()

+--------------------+--------------+--------------------+--------------------+
|              origin|         label|            features|        features_vec|
+--------------------+--------------+--------------------+--------------------+
|file:///home/jo/D...|Apple_Braeburn|[0.39042681455612...|[0.39042681455612...|
|file:///home/jo/D...|       Avocado|[0.41923102736473...|[0.41923102736473...|
|file:///home/jo/D...|       Apricot|[0.35134685039520...|[0.35134685039520...|
|file:///home/jo/D...|Apple_Braeburn|[0.38464885950088...|[0.38464885950088...|
|file:///home/jo/D...|       Apricot|[0.35411885380744...|[0.35411885380744...|
|file:///home/jo/D...|Apple_Braeburn|[0.37923756241798...|[0.37923756241798...|
|file:///home/jo/D...|       Avocado|[0.44469347596168...|[0.44469347596168...|
|file:///home/jo/D...|       Apricot|[0.35258254408836...|[0.35258254408836...|
|file:///home/jo/D...|       Avocado|[0.41870588064193...|[0.41870588064193...|
|file:///home/jo/D...|       Apricot|[0.

In [19]:
pca = PCA(k = 8, inputCol = "features_vec", outputCol = "pca_features")
model_pca = pca.fit(data.select("features_vec"))
# 
cumValues = model_pca.explainedVariance.cumSum()

----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 59144)
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/home/jo/anaconda3/lib/python3.8/site-packages/py4j/java_gateway.py", line 1207, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/jo/anaconda3/lib/python3.8/site-packages/py4j/java_gateway.py", line 1033, in send_command
    response = connection.send_command(command)
  File "/home/jo/anaconda3/lib/python3.8/site-packages/py4j/java_gateway.py", line 1211, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while receiving
Traceback (most recent call last):
  File "/home/jo/anaconda3/lib/python3.8/socketserver.py", line 316, in _handle_request_noblock
 

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:34833)
Traceback (most recent call last):
  File "/home/jo/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3437, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-19-b7b191d697d5>", line 2, in <module>
    model_pca = pca.fit(data.select("features_vec"))
  File "/home/jo/anaconda3/lib/python3.8/site-packages/pyspark/ml/base.py", line 161, in fit
    return self._fit(dataset)
  File "/home/jo/anaconda3/lib/python3.8/site-packages/pyspark/ml/wrapper.py", line 335, in _fit
    java_model = self._fit_java(dataset)
  File "/home/jo/anaconda3/lib/python3.8/site-packages/pyspark/ml/wrapper.py", line 332, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/home/jo/anaconda3/lib/python3.8/site-packages/py4j/java_gateway.py", line 1304, in __call__
    return_value = get_return_value(
  File "/home/jo/anaconda3/lib/pyth

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:34833)
Traceback (most recent call last):
  File "/home/jo/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3437, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-19-b7b191d697d5>", line 2, in <module>
    model_pca = pca.fit(data.select("features_vec"))
  File "/home/jo/anaconda3/lib/python3.8/site-packages/pyspark/ml/base.py", line 161, in fit
    return self._fit(dataset)
  File "/home/jo/anaconda3/lib/python3.8/site-packages/pyspark/ml/wrapper.py", line 335, in _fit
    java_model = self._fit_java(dataset)
  File "/home/jo/anaconda3/lib/python3.8/site-packages/pyspark/ml/wrapper.py", line 332, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/home/jo/anaconda3/lib/python3.8/site-packages/py4j/java_gateway.py", line 1304, in __call__
    return_value = get_return_value(
  File "/home/jo/anaconda3/lib/pyth

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:34833)
Traceback (most recent call last):
  File "/home/jo/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3437, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-19-b7b191d697d5>", line 2, in <module>
    model_pca = pca.fit(data.select("features_vec"))
  File "/home/jo/anaconda3/lib/python3.8/site-packages/pyspark/ml/base.py", line 161, in fit
    return self._fit(dataset)
  File "/home/jo/anaconda3/lib/python3.8/site-packages/pyspark/ml/wrapper.py", line 335, in _fit
    java_model = self._fit_java(dataset)
  File "/home/jo/anaconda3/lib/python3.8/site-packages/pyspark/ml/wrapper.py", line 332, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/home/jo/anaconda3/lib/python3.8/site-packages/py4j/java_gateway.py", line 1304, in __call__
    return_value = get_return_value(
  File "/home/jo/anaconda3/lib/pyth

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:34833)
Traceback (most recent call last):
  File "/home/jo/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3437, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-19-b7b191d697d5>", line 2, in <module>
    model_pca = pca.fit(data.select("features_vec"))
  File "/home/jo/anaconda3/lib/python3.8/site-packages/pyspark/ml/base.py", line 161, in fit
    return self._fit(dataset)
  File "/home/jo/anaconda3/lib/python3.8/site-packages/pyspark/ml/wrapper.py", line 335, in _fit
    java_model = self._fit_java(dataset)
  File "/home/jo/anaconda3/lib/python3.8/site-packages/pyspark/ml/wrapper.py", line 332, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/home/jo/anaconda3/lib/python3.8/site-packages/py4j/java_gateway.py", line 1304, in __call__
    return_value = get_return_value(
  File "/home/jo/anaconda3/lib/pyth

Py4JError: An error occurred while calling o89.fit