## Federer Vs Djokovic Image Classification using Deep Learning with Spark and Tensorflow


In [1]:
spark = SparkSession.builder \
   .master("local[*]") \
   .appName("ImageClassification") \
   .config("spark.executor.memory", "16gb") \
   .config("spark.driver.memory", "16G") \
   .config("spark.driver.offHeap.enabled", "true") \
   .config("spark.driver.offHeap.size", "16G") \
   .config("spark.executor.maxResultSize", "16gb") \
   .getOrCreate()

In [2]:
## .config("spark.memory.offHeap.enabled",true)
###     .config("spark.memory.offHeap.size","16g")  
### --executor-memory 64G

In [2]:
sc = spark.sparkContext
sc

In [3]:
import tensorflow as tf
tf.__version__

  return f(*args, **kwds)
  from ._conv import register_converters as _register_converters


'1.4.0'

In [4]:
import pyspark.sql.functions as f
import sparkdl as dl

Using TensorFlow backend.


### Load Images of Djokovic and Federer

In [5]:
dfDjokovic = dl.readImages('./tennis/small/djokovic/').withColumn('label', f.lit(0))
dfFederer = dl.readImages('./tennis/small/federer/').withColumn('label', f.lit(1))

In [6]:
dfDjokovic.show(n=10,truncate=False)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [7]:
dfFederer.show(n=10,truncate=False)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [8]:
trainDFdjokovic, testDFdjokovic = dfDjokovic.randomSplit([80.00, 20.00], seed =12)
trainDFfederer, testDFfederer = dfFederer.randomSplit([80.00, 20.00], seed=12)

In [9]:
print('The number of images in trainDFdjokovic is {}'.format(trainDFdjokovic.toPandas().shape[0]))
print('The number of images in testDFdjokovic is {}'.format(testDFdjokovic.toPandas().shape[0]))
print('The number of images in trainDFfederer is {}'.format(trainDFfederer.toPandas().shape[0]))
print('The number of images in testDFfederer is {}'.format(testDFfederer.toPandas().shape[0]))

The number of images in trainDFdjokovic is 48
The number of images in testDFdjokovic is 12
The number of images in trainDFfederer is 48
The number of images in testDFfederer is 12


In [10]:
trainDF = trainDFdjokovic.unionAll(trainDFfederer)
testDF = testDFdjokovic.unionAll(testDFfederer)

In [11]:
print('The number of images in the training data is {}' .format(trainDF.toPandas().shape[0]))
print('The number of images in the testing  data is {}' .format(testDF.toPandas().shape[0]))

The number of images in the training data is 96
The number of images in the testing  data is 24


### Training with Deep Image Featurizer + Logistic Regression

In [12]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
## Xception InceptionV3
vectorizer = dl.DeepImageFeaturizer(inputCol="image", outputCol="features", modelName='InceptionV3')
logreg = LogisticRegression(maxIter=30,regParam=0.05, elasticNetParam=0.3, labelCol = "label", featuresCol="features")
pipeline = Pipeline(stages=[vectorizer, logreg])

pipeline_model = pipeline.fit(trainDF)

INFO:tensorflow:Froze 376 variables.
Converted 376 variables to const ops.
INFO:tensorflow:Froze 0 variables.
Converted 0 variables to const ops.


### Save MODELS

In [13]:
lrModel = pipeline_model
print(lrModel)  # summary only

PipelineModel_4000a4eb4bc5f8674dba


In [15]:
lrModel.stages[1].write().overwrite().save('lr')

### Reload LR Model

In [23]:
from pyspark.ml.classification import LogisticRegressionModel
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

lr_test = LogisticRegressionModel.load('./lr')

# Use a featurizer to use trained features from an existing model
featurizer_test = dl.DeepImageFeaturizer(inputCol = "image", outputCol = "features", modelName = "InceptionV3")

# Pipeline both entities
p_lr_test = PipelineModel(stages=[featurizer_test, lr_test])

# Test and evaluate
tested_lr_test = p_lr_test.transform(testDF)
evaluator_lr_test = MulticlassClassificationEvaluator(metricName = "accuracy")
print("Logistic Regression Model: Test set accuracy = " + str(evaluator_lr_test.evaluate(tested_lr_test.select("prediction", "label"))))

tested_lr_test.select("label", "probability", "prediction").show(20, False)

INFO:tensorflow:Froze 376 variables.
Converted 376 variables to const ops.
INFO:tensorflow:Froze 0 variables.
Converted 0 variables to const ops.
Logistic Regression Model: Test set accuracy = 0.625
+-----+----------------------------------------+----------+
|label|probability                             |prediction|
+-----+----------------------------------------+----------+
|0    |[0.9816942623349085,0.01830573766509151]|0.0       |
|0    |[0.8964592926384412,0.10354070736155883]|0.0       |
|0    |[0.845914224595905,0.15408577540409504] |0.0       |
|0    |[0.6812148102329231,0.318785189767077]  |0.0       |
|0    |[0.6976636983246348,0.3023363016753652] |0.0       |
|0    |[0.72433912351231,0.27566087648769]     |0.0       |
|0    |[0.5730115632253174,0.4269884367746826] |0.0       |
|0    |[0.9110588779715431,0.08894112202845691]|0.0       |
|0    |[0.7748100240503034,0.22518997594969664]|0.0       |
|0    |[0.7604404567342387,0.23955954326576148]|0.0       |
|0    |[0.62653488761

### Decision Tree Classifier

In [12]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml import Pipeline
## Xception InceptionV3
vectorizer = dl.DeepImageFeaturizer(inputCol="image", outputCol="features", modelName='InceptionV3')

dt = DecisionTreeClassifier(labelCol = "label", featuresCol="features", maxDepth = 3)

dt_pipeline = Pipeline(stages=[vectorizer, dt])

dt_pipeline_model = dt_pipeline.fit(trainDF)


INFO:tensorflow:Froze 376 variables.
Converted 376 variables to const ops.
INFO:tensorflow:Froze 0 variables.
Converted 0 variables to const ops.


In [19]:
# Test and evaluate
tested_dt_test = dt_pipeline_model.transform(testDF)
evaluator_dt_test = MulticlassClassificationEvaluator(metricName = "accuracy")
print("Decision Tree Model: Test set accuracy = " + str(evaluator_dt_test.evaluate(tested_dt_test.select("prediction", "label"))))

tested_dt_test.select("label", "probability", "prediction").show(20, False)

INFO:tensorflow:Froze 376 variables.
Converted 376 variables to const ops.
INFO:tensorflow:Froze 0 variables.
Converted 0 variables to const ops.
Decision Tree Model: Test set accuracy = 0.4583333333333333
+-----+----------------------------------------+----------+
|label|probability                             |prediction|
+-----+----------------------------------------+----------+
|0    |[1.0,0.0]                               |0.0       |
|0    |[1.0,0.0]                               |0.0       |
|0    |[0.6428571428571429,0.35714285714285715]|0.0       |
|0    |[1.0,0.0]                               |0.0       |
|0    |[1.0,0.0]                               |0.0       |
|0    |[1.0,0.0]                               |0.0       |
|0    |[0.02564102564102564,0.9743589743589743]|1.0       |
|0    |[0.02564102564102564,0.9743589743589743]|1.0       |
|0    |[0.02564102564102564,0.9743589743589743]|1.0       |
|0    |[0.0,1.0]                               |1.0       |
|0    |[1.0,0.

### Save DT model

In [20]:
dtModel = dt_pipeline_model
print(dtModel)  # summary only
dtModel.stages[1].write().overwrite().save('dt')

PipelineModel_4c84b908e99f56ec16b2


### Reload DT Model

In [None]:
from pyspark.ml.classification import DecisionTreeModel
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

dt_test = DecisionTreeModel.load('./dt')

# Use a featurizer to use trained features from an existing model
featurizer_dt_test = dl.DeepImageFeaturizer(inputCol = "image", outputCol = "features", modelName = "InceptionV3")

# Pipeline both entities
pdt_test = PipelineModel(stages=[featurizer_dt_test, dt_test])

# Test and evaluate
tested_dt_test = pdt_test.transform(testDF)
evaluator_dt_test = MulticlassClassificationEvaluator(metricName = "accuracy")
print("Test set accuracy = " + str(evaluator_test.evaluate(tested_dt_test.select("prediction", "label"))))

tested_dt_test.select("label", "probability", "prediction").show(20, False)

### Random Forest Classifier

In [21]:
from pyspark.ml.classification import RandomForestClassifier

from pyspark.ml import Pipeline
## Xception InceptionV3
vectorizer = dl.DeepImageFeaturizer(inputCol="image", outputCol="features", modelName='InceptionV3')

rf = RandomForestClassifier(labelCol = "label", featuresCol="features")

rf_pipeline = Pipeline(stages=[vectorizer, rf])

rf_pipeline_model = rf_pipeline.fit(trainDF)





INFO:tensorflow:Froze 376 variables.
Converted 376 variables to const ops.
INFO:tensorflow:Froze 0 variables.
Converted 0 variables to const ops.


In [22]:
# Test and evaluate
tested_rf_test = rf_pipeline_model.transform(testDF)
evaluator_rf_test = MulticlassClassificationEvaluator(metricName = "accuracy")
print("Random Forest Model: Test set accuracy = " + str(evaluator_rf_test.evaluate(tested_rf_test.select("prediction", "label"))))

tested_dt_test.select("label", "probability", "prediction").show(20, False)

INFO:tensorflow:Froze 376 variables.
Converted 376 variables to const ops.
INFO:tensorflow:Froze 0 variables.
Converted 0 variables to const ops.
Random Forest Model: Test set accuracy = 0.5416666666666666
+-----+----------------------------------------+----------+
|label|probability                             |prediction|
+-----+----------------------------------------+----------+
|0    |[1.0,0.0]                               |0.0       |
|0    |[1.0,0.0]                               |0.0       |
|0    |[0.6428571428571429,0.35714285714285715]|0.0       |
|0    |[1.0,0.0]                               |0.0       |
|0    |[1.0,0.0]                               |0.0       |
|0    |[1.0,0.0]                               |0.0       |
|0    |[0.02564102564102564,0.9743589743589743]|1.0       |
|0    |[0.02564102564102564,0.9743589743589743]|1.0       |
|0    |[0.02564102564102564,0.9743589743589743]|1.0       |
|0    |[0.0,1.0]                               |1.0       |
|0    |[1.0,0.

### Save RF Model

### Reload RF Model

In [None]:
from pyspark.ml.classification import LogisticRegressionModel
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

lr_test = LogisticRegressionModel.load('./lr')

# Use a featurizer to use trained features from an existing model
featurizer_test = dl.DeepImageFeaturizer(inputCol = "image", outputCol = "features", modelName = "InceptionV3")

# Pipeline both entities
p_test = PipelineModel(stages=[featurizer_test, lr_test])

# Test and evaluate
tested_df_test = p_test.transform(testDF)
evaluator_test = MulticlassClassificationEvaluator(metricName = "accuracy")
print("Test set accuracy = " + str(evaluator_test.evaluate(tested_df_test.select("prediction", "label"))))

tested_df_test.select("label", "probability", "prediction").show(20, False)

### Gradient-Boosted Tree Classifier

In [24]:
from pyspark.ml.classification import GBTClassifier

from pyspark.ml import Pipeline
## Xception InceptionV3
vectorizer = dl.DeepImageFeaturizer(inputCol="image", outputCol="features", modelName='InceptionV3')

gbt = GBTClassifier(maxIter=10)

gbt_pipeline = Pipeline(stages=[vectorizer, gbt])

gbt_pipeline_model = gbt_pipeline.fit(trainDF)





INFO:tensorflow:Froze 376 variables.
Converted 376 variables to const ops.
INFO:tensorflow:Froze 0 variables.
Converted 0 variables to const ops.


In [25]:
# Test and evaluate
tested_gbt_test = gbt_pipeline_model.transform(testDF)
evaluator_gbt_test = MulticlassClassificationEvaluator(metricName = "accuracy")
print("GBT Model: Test set accuracy = " + str(evaluator_gbt_test.evaluate(tested_gbt_test.select("prediction", "label"))))

tested_dt_test.select("label", "probability", "prediction").show(20, False)

INFO:tensorflow:Froze 376 variables.
Converted 376 variables to const ops.
INFO:tensorflow:Froze 0 variables.
Converted 0 variables to const ops.
GBT Model: Test set accuracy = 0.5
+-----+----------------------------------------+----------+
|label|probability                             |prediction|
+-----+----------------------------------------+----------+
|0    |[1.0,0.0]                               |0.0       |
|0    |[1.0,0.0]                               |0.0       |
|0    |[0.6428571428571429,0.35714285714285715]|0.0       |
|0    |[1.0,0.0]                               |0.0       |
|0    |[1.0,0.0]                               |0.0       |
|0    |[1.0,0.0]                               |0.0       |
|0    |[0.02564102564102564,0.9743589743589743]|1.0       |
|0    |[0.02564102564102564,0.9743589743589743]|1.0       |
|0    |[0.02564102564102564,0.9743589743589743]|1.0       |
|0    |[0.0,1.0]                               |1.0       |
|0    |[1.0,0.0]                       

In [27]:
tested_gbt_test.select("label", "probability", "prediction").show(20, False)

+-----+----------------------------------------+----------+
|label|probability                             |prediction|
+-----+----------------------------------------+----------+
|0    |[0.9213679929239009,0.07863200707609908]|0.0       |
|0    |[0.9109990064336037,0.08900099356639635]|0.0       |
|0    |[0.5690384687450698,0.4309615312549302] |0.0       |
|0    |[0.917621369873649,0.08237863012635105] |0.0       |
|0    |[0.9234198064833812,0.07658019351661882]|0.0       |
|0    |[0.9341221756527827,0.06587782434721734]|0.0       |
|0    |[0.08713896088954212,0.9128610391104579]|1.0       |
|0    |[0.06587782434721742,0.9341221756527825]|1.0       |
|0    |[0.07447446412813939,0.9255255358718606]|1.0       |
|0    |[0.14395522502022787,0.8560447749797722]|1.0       |
|0    |[0.9242239200928457,0.07577607990715429]|0.0       |
|0    |[0.06587782434721742,0.9341221756527825]|1.0       |
|1    |[0.9189007369184469,0.08109926308155313]|0.0       |
|1    |[0.9141603725626289,0.08583962743

In [35]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

binaryevaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction")
binary_rate = binaryevaluator.evaluate(prediction)*100
print("accuracy: {}%" .format(round(binary_rate,2)))

accuracy: 58.33%


### KERAS - Save Model

In [1]:
from keras.applications import InceptionV3


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)
