In [1]:
# https://docs.databricks.com/_static/notebooks/deep-learning/deep-learning-transfer-learning-keras.html

In [2]:
from pyspark.sql import SparkSession
from pyspark import SQLContext
import os 
# from pyspark.context import SparkContext
# # Get a reference to the Spark Session
# sc = SparkContext()
# spark = SparkSession(sc)



spark = SparkSession.builder\
        .master("local[*]")\
        .appName('PySpark_Fruits')\
        .config("spark.driver.memory", "16g")\
        .getOrCreate()

sc = spark.sparkContext
#sqlContext = SQLContext(sc)


In [3]:
import pandas as pd
from PIL import Image
import numpy as np
import io

from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.preprocessing.image import img_to_array, load_img

from pyspark.sql.functions import col, pandas_udf, PandasUDFType, element_at, split, udf

from pyspark.ml.feature import PCA

from pyspark.ml.linalg import Vectors, VectorUDT

Charger la dataframe

In [4]:
path = "Fruit-Images/"
images = spark.read.format("image") \
  .option("pathGlobFilter", "*.jpg") \
  .option("recursiveFileLookup", "true") \
  .load(path)

display(images.limit(5))

DataFrame[image: struct<origin:string,height:int,width:int,nChannels:int,mode:int,data:binary>]

In [5]:
images.printSchema()

root
 |-- image: struct (nullable = true)
 |    |-- origin: string (nullable = true)
 |    |-- height: integer (nullable = true)
 |    |-- width: integer (nullable = true)
 |    |-- nChannels: integer (nullable = true)
 |    |-- mode: integer (nullable = true)
 |    |-- data: binary (nullable = true)



In [6]:
images = images.withColumn('label',element_at(split(images['image.origin'],"/"),-2))

In [7]:
images.printSchema()

root
 |-- image: struct (nullable = true)
 |    |-- origin: string (nullable = true)
 |    |-- height: integer (nullable = true)
 |    |-- width: integer (nullable = true)
 |    |-- nChannels: integer (nullable = true)
 |    |-- mode: integer (nullable = true)
 |    |-- data: binary (nullable = true)
 |-- label: string (nullable = true)



Préparation du modèle : Chargement du modèle sans la dernière couche

In [8]:
model = VGG16(weights=None, include_top=False)
model.summary()

Model: "vgg16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None, None, 3)]   0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, None, None, 64)    1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, None, None, 64)    36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, None, None, 64)    0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, None, None, 128)   73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, None, None, 128)   147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, None, None, 128)   0     

In [9]:
#from pyspark import SparkContext
#SparkContext.setSystemProperty('spark.executor.memory', '32g')

bc_model_weights = sc.broadcast(model.get_weights())# utilisation de variables de diffusion

def model_fn():
    """
    Returns a VGG16 model with top layer removed and broadcasted pretrained weights.
    """
    model = VGG16(weights=None, include_top=False)
    model.set_weights(bc_model_weights.value)
    return model

* featurize a pd.Series of images
    * preprocess une image
    
utilisation de Scalar Iterator pandas pour amortir le coût du chargement
de gros models sur les "workers"

In [10]:
def preprocess(path_file):
    """
    Preprocesses raw image bytes for prediction.
    """
    path_file = path_file.replace("file://", "")
    img = load_img(path_file)
    arr = img_to_array(img)
    return preprocess_input(arr)

def featurize_series(model, path_series):
    """
    Featurize a pd.Series of raw images using the input model.
    :return: a pd.Series of image features
    """
    input = np.stack(path_series.map(preprocess))
    preds = model.predict(input)
    # For some layers, output features will be multi-dimensional tensors.
    # We flatten the feature tensors to vectors for easier storage in Spark DataFrames.
    output = [p.flatten() for p in preds]
    return pd.Series(output)

In [11]:
@pandas_udf('array<Double>', PandasUDFType.SCALAR_ITER)
def featurize_udf(path_series_iter):
    '''
    This method is a Scalar Iterator pandas UDF wrapping our featurization function.
    The decorator specifies that this returns a Spark DataFrame column of type ArrayType(FloatType).
  
    :param content_series_iter: This argument is an iterator over batches of data, where each batch
                              is a pandas Series of image data.
    '''
    # With Scalar Iterator pandas UDFs, we can load the model once and then re-use it
    # for multiple data batches.  This amortizes the overhead of loading big models.
    model = model_fn()
    for path_series in path_series_iter:
        yield featurize_series(model, path_series)



In [12]:
features_df = images.repartition(16).select(col("image.origin"), col("label"), featurize_udf("image.origin").alias("features"))
#features_df = images.withColumn('features', featurize_udf('image')).cache()

In [13]:
features_df.printSchema()

root
 |-- origin: string (nullable = true)
 |-- label: string (nullable = true)
 |-- features: array (nullable = true)
 |    |-- element: double (containsNull = true)



In [14]:
features_df.show()

+--------------------+--------------+--------------------+
|              origin|         label|            features|
+--------------------+--------------+--------------------+
|file:///home/jo/P...|       Avocado|[0.0, 0.169608205...|
|file:///home/jo/P...|       Avocado|[0.0, 0.267576307...|
|file:///home/jo/P...|Apple_Braeburn|[0.0, 0.362186431...|
|file:///home/jo/P...|       Avocado|[0.0, 0.160421162...|
|file:///home/jo/P...|       Apricot|[0.0, 0.258375674...|
|file:///home/jo/P...|Apple_Braeburn|[0.0, 0.330986350...|
|file:///home/jo/P...|       Apricot|[0.0, 0.281262934...|
|file:///home/jo/P...|       Apricot|[0.0, 0.279605954...|
|file:///home/jo/P...|       Avocado|[0.0, 0.174274235...|
|file:///home/jo/P...|       Apricot|[0.0, 0.321444869...|
|file:///home/jo/P...|Apple_Braeburn|[0.0, 0.335478305...|
+--------------------+--------------+--------------------+



In [15]:
features_df.select('features').show(1)

+--------------------+
|            features|
+--------------------+
|[0.0, 0.174274235...|
+--------------------+
only showing top 1 row



# Réduction de dimension des features extraites

In [16]:
to_vector_udf = udf(lambda a: Vectors.dense(a), VectorUDT())

In [17]:
#data = features_df.select(to_vector_udf("features").alias("features_vec"))
data = features_df.withColumn("features_vec", to_vector_udf("features"))

In [18]:
data.printSchema()

root
 |-- origin: string (nullable = true)
 |-- label: string (nullable = true)
 |-- features: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- features_vec: vector (nullable = true)



In [19]:
data.show()

+--------------------+--------------+--------------------+--------------------+
|              origin|         label|            features|        features_vec|
+--------------------+--------------+--------------------+--------------------+
|file:///home/jo/P...|       Avocado|[0.0, 0.169608205...|[0.0,0.1696082055...|
|file:///home/jo/P...|       Avocado|[0.0, 0.267576307...|[0.0,0.2675763070...|
|file:///home/jo/P...|Apple_Braeburn|[0.0, 0.362186431...|[0.0,0.3621864318...|
|file:///home/jo/P...|       Avocado|[0.0, 0.160421162...|[0.0,0.1604211628...|
|file:///home/jo/P...|       Apricot|[0.0, 0.258375674...|[0.0,0.2583756744...|
|file:///home/jo/P...|Apple_Braeburn|[0.0, 0.330986350...|[0.0,0.3309863507...|
|file:///home/jo/P...|       Apricot|[0.0, 0.281262934...|[0.0,0.2812629342...|
|file:///home/jo/P...|       Apricot|[0.0, 0.279605954...|[0.0,0.2796059548...|
|file:///home/jo/P...|       Avocado|[0.0, 0.174274235...|[0.0,0.1742742359...|
|file:///home/jo/P...|       Apricot|[0.

In [20]:
pca = PCA(k = 20, inputCol = "features_vec", outputCol = "pca_features")
model_pca = pca.fit(data)
# 

In [21]:
data_pca = model_pca.transform(data)


In [22]:
print('Explained Variance :', model_pca.explainedVariance.toArray())

Explained Variance : [7.23664847e-01 1.56294859e-01 6.40149215e-02 4.62250302e-02
 2.44079202e-03 2.28387174e-03 1.61655465e-03 1.59219006e-03
 1.10494227e-03 7.61991792e-04 3.34184301e-15 3.27394673e-15
 2.59118532e-15 2.52493543e-15 2.33346653e-15 1.94300725e-15
 1.91356255e-15 1.86212006e-15 1.47902141e-15 1.37326795e-15]


In [23]:
data_pca.show(6)

+--------------------+--------------+--------------------+--------------------+--------------------+
|              origin|         label|            features|        features_vec|        pca_features|
+--------------------+--------------+--------------------+--------------------+--------------------+
|file:///home/jo/P...|       Avocado|[0.0, 0.169608205...|[0.0,0.1696082055...|[19.5446212229797...|
|file:///home/jo/P...|       Avocado|[0.0, 0.267576307...|[0.0,0.2675763070...|[13.0136398313400...|
|file:///home/jo/P...|Apple_Braeburn|[0.0, 0.362186431...|[0.0,0.3621864318...|[12.7893440303211...|
|file:///home/jo/P...|       Avocado|[0.0, 0.160421162...|[0.0,0.1604211628...|[19.5005920854939...|
|file:///home/jo/P...|       Apricot|[0.0, 0.258375674...|[0.0,0.2583756744...|[12.4264792738997...|
|file:///home/jo/P...|Apple_Braeburn|[0.0, 0.330986350...|[0.0,0.3309863507...|[12.8213011577409...|
+--------------------+--------------+--------------------+--------------------+------------

In [24]:
# Ecriture du dataframe dans un fichier csv

In [25]:
data_pca.printSchema()

root
 |-- origin: string (nullable = true)
 |-- label: string (nullable = true)
 |-- features: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- features_vec: vector (nullable = true)
 |-- pca_features: vector (nullable = true)



In [26]:
from pyspark.ml.functions import vector_to_array
data_pca = data_pca.withColumn('pca_features_list',vector_to_array('pca_features'))



In [27]:
def conv_arr_list(x):
    return x.tolist()

In [28]:
from pyspark.sql.types import ArrayType, FloatType, StringType
udf_conv_arr_list = udf(conv_arr_list, ArrayType(FloatType()))
udf_vector_to_array = udf(lambda v:v.toArray().tolist(),ArrayType(FloatType()))


In [29]:
df_write = data_pca.select('origin','label', udf_vector_to_array('pca_features').alias('pcaFeaturesList'))
df_write.printSchema()

root
 |-- origin: string (nullable = true)
 |-- label: string (nullable = true)
 |-- pcaFeaturesList: array (nullable = true)
 |    |-- element: float (containsNull = true)



In [32]:
df_write.write.parquet('fruits.parquet')

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:41337)
Traceback (most recent call last):
  File "/home/jo/anaconda3/lib/python3.8/site-packages/py4j/java_gateway.py", line 977, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/jo/anaconda3/lib/python3.8/site-packages/py4j/java_gateway.py", line 1115, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused


Py4JNetworkError: An error occurred while trying to connect to the Java server (127.0.0.1:41337)

In [77]:
df_write.write.mode("overwrite").csv('fruits.csv')

AnalysisException: CSV data source does not support array<float> data type.