# Deep Learning

### Downloads flower photos

In [1]:
# !wget http://download.tensorflow.org/example_images/flower_photos.tgz
# !tar xzf flower_photos.tgz

In [2]:
img_dir = './flower_photos'

### Creates Spark session with appropriate packages to read JPEGs

In [3]:
from pyspark.sql import SparkSession
spark = SparkSession\
.Builder()\
.config("spark.jars.packages", "com.twelvemonkeys.imageio:imageio-core:3.3.2,com.twelvemonkeys.imageio:imageio-jpeg:3.3.2,com.databricks:spark-xml_2.11:0.4.0,databricks:spark-deep-learning:1.0.0-spark2.3-s_2.11,JohnSnowLabs:spark-nlp:1.5.3")\
.getOrCreate()

### Loads Data

In [4]:
# Create training & test DataFrames for transfer learning - this piece of code is longer than transfer learning itself below!
from pyspark.ml.image import ImageSchema
from pyspark.sql.functions import lit

tulips_df = ImageSchema.readImages(img_dir + "/tulips").withColumn("label", lit(1))
daisy_df = ImageSchema.readImages(img_dir + "/daisy").withColumn("label", lit(0))

### Train / Test split

In [5]:
tulips_train, tulips_test = tulips_df.randomSplit([0.6, 0.4])
daisy_train, daisy_test = daisy_df.randomSplit([0.6, 0.4])
train_df = tulips_train.unionAll(daisy_train)
test_df = tulips_test.unionAll(daisy_test)
# Under the hood, each of the partitions is fully loaded in memory, which may be expensive.
# This ensure that each of the paritions has a small size.
train_df = train_df.repartition(100)
test_df = test_df.repartition(100)

### Uses InceptionV3 second-to-last layer as features to a logistic regression

In [6]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from sparkdl import DeepImageFeaturizer 

featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features", modelName="InceptionV3")
lr = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3, labelCol="label")
p = Pipeline(stages=[featurizer, lr])

p_model = p.fit(train_df)

Using TensorFlow backend.


### Checks performance of the model

In [7]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

tested_df = p_model.transform(test_df)
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test set accuracy = " + str(evaluator.evaluate(tested_df.select("prediction", "label"))))

Test set accuracy = 0.9836956521739131


In [8]:
tested_df.show()

+--------------------+-----+--------------------+--------------------+--------------------+----------+
|               image|label|            features|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+--------------------+----------+
|[file:/home/jovya...|    1|[0.0,1.6356960535...|[-5.0003982118265...|[0.00669020410947...|       1.0|
|[file:/home/jovya...|    1|[0.0,0.0,0.0,0.0,...|[-2.4530546732209...|[0.07921545361819...|       1.0|
|[file:/home/jovya...|    1|[0.09982410073280...|[-3.6911658099002...|[0.02433589836335...|       1.0|
|[file:/home/jovya...|    0|[0.0,0.5962234735...|[1.04913588186380...|[0.74060893026092...|       0.0|
|[file:/home/jovya...|    0|[0.12704770267009...|[3.51567480638898...|[0.97113048946148...|       0.0|
|[file:/home/jovya...|    0|[0.0,0.0,0.200741...|[4.93462486536401...|[0.99285821316427...|       0.0|
|[file:/home/jovya...|    1|[0.0,0.0,0.0,0.0,...|[-1.0263473400966...|[0.

In [9]:
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import expr, udf

def _p1(v):
    return float(v.array[1])
p1 = udf(_p1, DoubleType())

df = tested_df.withColumn("p_1", p1(tested_df.probability))
wrong_df = df.orderBy(expr("abs(p_1 - label)"), ascending=False)
wrong_df.select("image", "p_1", "label").limit(10).show()

+--------------------+-------------------+-----+
|               image|                p_1|label|
+--------------------+-------------------+-----+
|[file:/home/jovya...|  0.757767616815746|    0|
|[file:/home/jovya...| 0.7554068729680621|    0|
|[file:/home/jovya...| 0.7177013885251289|    0|
|[file:/home/jovya...| 0.7122823928265124|    0|
|[file:/home/jovya...| 0.3462392842804362|    1|
|[file:/home/jovya...|0.43770401132959896|    1|
|[file:/home/jovya...| 0.5277157556209117|    0|
|[file:/home/jovya...| 0.4816212879194128|    1|
|[file:/home/jovya...| 0.4948924204422959|    1|
|[file:/home/jovya...| 0.5158162043989969|    1|
+--------------------+-------------------+-----+



### Copies some photos to a sample folder

In [10]:
sample_img_dir = './flower_photos/sample'

In [11]:
#!mkdir ./flower_photos/sample

In [12]:
#!cp ./flower_photos/daisy/100080576_f52e8ee070_n.jpg ./flower_photos/sample
#!cp ./flower_photos/daisy/10140303196_b88d3d6cec.jpg ./flower_photos/sample
#!cp ./flower_photos/tulips/100930342_92e8746431_n.jpg ./flower_photos/sample
#!cp ./flower_photos/tulips/10094729603_eeca3f2cb6.jpg ./flower_photos/sample

### Makes predictions using InceptionV3

In [13]:
from sparkdl import DeepImagePredictor

image_df = ImageSchema.readImages(sample_img_dir)

predictor = DeepImagePredictor(inputCol="image", 
                               outputCol="predicted_labels",
                               modelName="InceptionV3", 
                               decodePredictions=True, 
                               topK=10)
predictions_df = predictor.transform(image_df)

predictions_df.select("image", "predicted_labels").toPandas()

Downloading data from https://github.com/fchollet/deep-learning-models/releases/download/v0.5/inception_v3_weights_tf_dim_ordering_tf_kernels.h5
INFO:tensorflow:Froze 378 variables.
Converted 378 variables to const ops.


Py4JJavaError: An error occurred while calling o291.collectToPython.
: org.apache.hadoop.mapreduce.lib.input.InvalidInputException: Input path does not exist: file:/home/jovyan/work/DSR/notebooks/flower_photos/sample
	at org.apache.hadoop.mapreduce.lib.input.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:323)
	at org.apache.hadoop.mapreduce.lib.input.FileInputFormat.listStatus(FileInputFormat.java:265)
	at org.apache.spark.input.StreamFileInputFormat.setMinPartitions(PortableDataStream.scala:51)
	at org.apache.spark.rdd.BinaryFileRDD.getPartitions(BinaryFileRDD.scala:51)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:253)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:251)
	at scala.Option.getOrElse(Option.scala:121)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:251)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:253)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:251)
	at scala.Option.getOrElse(Option.scala:121)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:251)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:253)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:251)
	at scala.Option.getOrElse(Option.scala:121)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:251)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:253)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:251)
	at scala.Option.getOrElse(Option.scala:121)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:251)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:253)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:251)
	at scala.Option.getOrElse(Option.scala:121)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:251)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:253)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:251)
	at scala.Option.getOrElse(Option.scala:121)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:251)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:253)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:251)
	at scala.Option.getOrElse(Option.scala:121)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:251)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:253)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:251)
	at scala.Option.getOrElse(Option.scala:121)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:251)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:253)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:251)
	at scala.Option.getOrElse(Option.scala:121)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:251)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:340)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
	at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply$mcI$sp(Dataset.scala:3195)
	at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset.scala:3192)
	at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset.scala:3192)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:77)
	at org.apache.spark.sql.Dataset.withNewExecutionId(Dataset.scala:3225)
	at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:3192)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)


In [None]:
df = p_model.transform(image_df)
df.select("image", (1-p1(df.probability)).alias("p_daisy")).toPandas()

### Users a Keras Pretrained model as a Transformer

In [None]:
from keras.applications import InceptionV3

model = InceptionV3(weights="imagenet")
model.save('/tmp/model-full.h5')

In [None]:
from keras.applications.inception_v3 import preprocess_input
from keras.preprocessing.image import img_to_array, load_img
import numpy as np
from pyspark.sql.types import StringType
from sparkdl import KerasImageFileTransformer

def loadAndPreprocessKerasInceptionV3(uri):
    # this is a typical way to load and prep images in keras
    image = img_to_array(load_img(uri, target_size=(299, 299)))  # image dimensions for InceptionV3
    image = np.expand_dims(image, axis=0)
    return preprocess_input(image)

transformer = KerasImageFileTransformer(inputCol="uri", outputCol="predictions",
                                        modelFile='/tmp/model-full.h5',  # local file path for model
                                        imageLoader=loadAndPreprocessKerasInceptionV3,
                                        outputMode="vector")

In [None]:
import os
from pyspark.sql import SQLContext
sc = spark.sparkContext
sqlContext = SQLContext(sc)

files = [os.path.join(sample_img_dir, f) for f in os.listdir(sample_img_dir)]
uri_df = sqlContext.createDataFrame(files, StringType()).toDF("uri")

keras_pred_df = transformer.transform(uri_df)

In [None]:
results = keras_pred_df.select("uri", "predictions").toPandas()

In [None]:
np.argmax(results.predictions.iloc[0])

### Uses a regular Keras model as a Transformer

In [None]:
from sparkdl import KerasTransformer
from keras.models import Sequential
from keras.layers import Dense
import numpy as np
from pyspark.sql.types import *
from pyspark.sql import SQLContext

sc = spark.sparkContext
sqlContext = SQLContext(sc)

# Generate random input data
num_features = 10
num_examples = 100
input_data = [{"features" : np.random.randn(num_features).astype(float).tolist()} for i in range(num_examples)]
schema = StructType([ StructField("features", ArrayType(FloatType()), True)])
input_df = sqlContext.createDataFrame(input_data, schema)

# Create and save a single-hidden-layer Keras model for binary classification
# NOTE: In a typical workflow, we'd train the model before exporting it to disk,
# but we skip that step here for brevity
model = Sequential()
model.add(Dense(units=20, input_shape=[num_features], activation='relu'))
model.add(Dense(units=1, activation='sigmoid'))
model_path = "/tmp/simple-binary-classification"
model.save(model_path)

# Create transformer and apply it to our input data
transformer = KerasTransformer(inputCol="features", outputCol="predictions", modelFile=model_path)
final_df = transformer.transform(input_df)

In [None]:
final_df.show()

### Deploy a Keras pretrained model as a UDF to be used on SQL queries

In [None]:
from keras.applications import InceptionV3
from sparkdl.udf.keras_image_model import registerKerasImageUDF

registerKerasImageUDF("inceptionV3_udf", InceptionV3(weights="imagenet"))
registerKerasImageUDF("my_custom_keras_model_udf", "/tmp/model-full.h5")

def keras_load_img(fpath):
    from keras.preprocessing.image import load_img, img_to_array
    import numpy as np
    img = load_img(fpath, target_size=(299, 299))
    return img_to_array(img).astype(np.uint8)

registerKerasImageUDF("inceptionV3_udf_with_preprocessing", InceptionV3(weights="imagenet"), keras_load_img)

In [None]:
sample_img_dir = './flower_photos/sample'
from pyspark.ml.image import ImageSchema

image_df = ImageSchema.readImages(sample_img_dir)
image_df.registerTempTable("sample_images")

In [None]:
sqlContext.sql("SELECT inceptionV3_udf(image) as predictions from sample_images").show()

In [None]:
sqlContext.sql("SELECT my_custom_keras_model_udf(image) as predictions from sample_images").show()

In [None]:
sqlContext.sql("SELECT inceptionV3_udf_with_preprocessing(image) as predictions from sample_images").show()