In [1]:
# https://docs.databricks.com/_static/notebooks/deep-learning/deep-learning-transfer-learning-keras.html

In [2]:
from pyspark.sql import SparkSession
import os 
# from pyspark.context import SparkContext
# # Get a reference to the Spark Session
# sc = SparkContext()
# spark = SparkSession(sc)



spark = SparkSession.builder\
        .master("local[*]")\
        .appName('PySpark_Fruits')\
        .getOrCreate()

sc = spark.sparkContext
sc.setSystemProperty('spark.executor.memory', '32g')


In [3]:
import pandas as pd
from PIL import Image
import numpy as np
import io

from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.preprocessing.image import img_to_array, load_img

from pyspark.sql.functions import col, pandas_udf, PandasUDFType, element_at, split, udf

from pyspark.ml.feature import PCA
# from pyspark.mllib.linalg import Vectors, VectorUDT
from pyspark.ml.linalg import Vectors, VectorUDT

Charger la dataframe

In [4]:
path = "Fruit-Images/"
images = spark.read.format("image") \
  .option("pathGlobFilter", "*.jpg") \
  .option("recursiveFileLookup", "true") \
  .load(path)

display(images.limit(5))

DataFrame[image: struct<origin:string,height:int,width:int,nChannels:int,mode:int,data:binary>]

In [5]:
images.printSchema()

root
 |-- image: struct (nullable = true)
 |    |-- origin: string (nullable = true)
 |    |-- height: integer (nullable = true)
 |    |-- width: integer (nullable = true)
 |    |-- nChannels: integer (nullable = true)
 |    |-- mode: integer (nullable = true)
 |    |-- data: binary (nullable = true)



In [6]:
images = images.withColumn('label',element_at(split(images['image.origin'],"/"),-2))

In [7]:
images.printSchema()

root
 |-- image: struct (nullable = true)
 |    |-- origin: string (nullable = true)
 |    |-- height: integer (nullable = true)
 |    |-- width: integer (nullable = true)
 |    |-- nChannels: integer (nullable = true)
 |    |-- mode: integer (nullable = true)
 |    |-- data: binary (nullable = true)
 |-- label: string (nullable = true)



Préparation du modèle : Chargement du modèle sans la dernière couche

In [8]:
model = VGG16(weights=None, include_top=False)
model.summary()

Model: "vgg16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None, None, 3)]   0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, None, None, 64)    1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, None, None, 64)    36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, None, None, 64)    0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, None, None, 128)   73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, None, None, 128)   147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, None, None, 128)   0     

In [9]:
#from pyspark import SparkContext
#SparkContext.setSystemProperty('spark.executor.memory', '32g')

bc_model_weights = sc.broadcast(model.get_weights())# utilisation de variables de diffusion

def model_fn():
    """
    Returns a VGG16 model with top layer removed and broadcasted pretrained weights.
    """
    model = VGG16(weights=None, include_top=False)
    model.set_weights(bc_model_weights.value)
    return model

* featurize a pd.Series of images
    * preprocess une image
    
utilisation de Scalar Iterator pandas pour amortir le coût du chargement
de gros models sur les "workers"

In [10]:
def preprocess(path_file):
    """
    Preprocesses raw image bytes for prediction.
    """
    path_file = path_file.replace("file://", "")
    img = load_img(path_file)
    arr = img_to_array(img)
    return preprocess_input(arr)

def featurize_series(model, path_series):
    """
    Featurize a pd.Series of raw images using the input model.
    :return: a pd.Series of image features
    """
    input = np.stack(path_series.map(preprocess))
    preds = model.predict(input)
    # For some layers, output features will be multi-dimensional tensors.
    # We flatten the feature tensors to vectors for easier storage in Spark DataFrames.
    output = [p.flatten() for p in preds]
    return pd.Series(output)

In [11]:
@pandas_udf('array<float>', PandasUDFType.SCALAR_ITER)
def featurize_udf(path_series_iter):
    '''
    This method is a Scalar Iterator pandas UDF wrapping our featurization function.
    The decorator specifies that this returns a Spark DataFrame column of type ArrayType(FloatType).
  
    :param content_series_iter: This argument is an iterator over batches of data, where each batch
                              is a pandas Series of image data.
    '''
    # With Scalar Iterator pandas UDFs, we can load the model once and then re-use it
    # for multiple data batches.  This amortizes the overhead of loading big models.
    model = model_fn()
    for path_series in path_series_iter:
        yield featurize_series(model, path_series)



In [12]:
features_df = images.repartition(16).select(col("image.origin"), col("label"), featurize_udf("image.origin").alias("features"))

In [13]:
features_df.printSchema()

root
 |-- origin: string (nullable = true)
 |-- label: string (nullable = true)
 |-- features: array (nullable = true)
 |    |-- element: float (containsNull = true)



In [14]:
features_df.show()

+--------------------+--------------+--------------------+
|              origin|         label|            features|
+--------------------+--------------+--------------------+
|file:///home/jo/t...|       Avocado|[0.2749591, 0.327...|
|file:///home/jo/t...|       Avocado|[0.2744887, 0.336...|
|file:///home/jo/t...|       Apricot|[0.2601691, 0.218...|
|file:///home/jo/t...|Apple_Braeburn|[0.27739197, 0.27...|
|file:///home/jo/t...|       Avocado|[0.2706342, 0.330...|
|file:///home/jo/t...|Apple_Braeburn|[0.27894095, 0.28...|
|file:///home/jo/t...|       Apricot|[0.2816303, 0.259...|
|file:///home/jo/t...|       Avocado|[0.26609278, 0.32...|
|file:///home/jo/t...|Apple_Braeburn|[0.28261504, 0.29...|
|file:///home/jo/t...|       Apricot|[0.26481956, 0.23...|
|file:///home/jo/t...|       Apricot|[0.2585389, 0.225...|
+--------------------+--------------+--------------------+



# Réduction de dimension des features extraites

In [15]:
to_vector_udf = udf(lambda a: Vectors.dense(a), VectorUDT())

In [16]:
# data = features_df.select("label", "origin", to_vector("features").alias("features_vec"))
data = features_df.withColumn("features_vec", to_vector_udf("features"))

In [17]:
data.printSchema()

root
 |-- origin: string (nullable = true)
 |-- label: string (nullable = true)
 |-- features: array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- features_vec: vector (nullable = true)



In [18]:
data.show()

+--------------------+--------------+--------------------+--------------------+
|              origin|         label|            features|        features_vec|
+--------------------+--------------+--------------------+--------------------+
|file:///home/jo/t...|       Avocado|[0.2749591, 0.327...|[0.27495908737182...|
|file:///home/jo/t...|       Avocado|[0.2744887, 0.336...|[0.27448868751525...|
|file:///home/jo/t...|       Apricot|[0.2601691, 0.218...|[0.26016908884048...|
|file:///home/jo/t...|Apple_Braeburn|[0.27739197, 0.27...|[0.27739197015762...|
|file:///home/jo/t...|       Avocado|[0.2706342, 0.330...|[0.27063420414924...|
|file:///home/jo/t...|Apple_Braeburn|[0.27894095, 0.28...|[0.27894094586372...|
|file:///home/jo/t...|       Apricot|[0.2816303, 0.259...|[0.28163030743598...|
|file:///home/jo/t...|       Avocado|[0.26609278, 0.32...|[0.26609277725219...|
|file:///home/jo/t...|Apple_Braeburn|[0.28261504, 0.29...|[0.28261503577232...|
|file:///home/jo/t...|       Apricot|[0.

In [19]:
pca = PCA(k = 8, inputCol = "features_vec", outputCol = "pca_features")
model_pca = pca.fit(data.select("features_vec"))
# 
cumValues = model_pca.explainedVariance.cumSum()

Py4JJavaError: An error occurred while calling o89.fit.
: java.lang.OutOfMemoryError: Java heap space
	at breeze.linalg.svd$.breeze$linalg$svd$$doSVD_Double(svd.scala:94)
	at breeze.linalg.svd$Svd_DM_Impl$.apply(svd.scala:36)
	at breeze.linalg.svd$Svd_DM_Impl$.apply(svd.scala:35)
	at breeze.generic.UFunc.apply(UFunc.scala:46)
	at breeze.generic.UFunc.apply$(UFunc.scala:45)
	at breeze.linalg.svd$.apply(svd.scala:21)
	at org.apache.spark.mllib.linalg.distributed.RowMatrix.computePrincipalComponentsAndExplainedVariance(RowMatrix.scala:481)
	at org.apache.spark.mllib.feature.PCA.fit(PCA.scala:65)
	at org.apache.spark.ml.feature.PCA.fit(PCA.scala:93)
	at org.apache.spark.ml.feature.PCA.fit(PCA.scala:64)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.base/java.lang.Thread.run(Thread.java:829)
