# Classify the Galaxies

## Set Up

In [1]:
#import all necessary libraries
from sparkdl import readImages
from pyspark.sql.functions import lit
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Build (or reuse) the Spark session that every later cell relies on.
#
# Per the Spark master-URL documentation, "local" runs Spark with a single
# worker thread (no parallelism), so the four cores requested by the settings
# below would go unused. "local[4]" runs four local worker threads, matching
# the core count this configuration asks for.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("review_and_category_analytics")
    .config("spark.executor.memory", '8g')
    .config('spark.executor.cores', '4')
    .config('spark.cores.max', '4')
    .config("spark.driver.memory", '8g')
    .getOrCreate()
)

# Handles used by later cells: the Spark context, and the SQL context used to
# read the classification CSV.
sc = spark.sparkContext

sqlCtx = SQLContext(sc)

Using TensorFlow backend.


In [2]:
# Directory containing the galaxy training images; passed to readImages()
# when the image table is loaded.
img_dir = "data/images_training_rev1/"

In [23]:
# Read the galaxy classification table from CSV. The first row provides the
# column names and Spark infers each column's type from the data.
class_reader = (
    sqlCtx.read
    .format("csv")
    .options(header="true", inferSchema="true")
)
class_df = class_reader.load("data/galaxyClasses.csv")

# Preview the first ten rows.
class_df.show(10)

+--------+------+----+------+
|GalaxyID|Smooth|Edge|Spiral|
+--------+------+----+------+
|  100008|     0|   0|     1|
|  100023|     0|   0|     1|
|  100053|     1|   0|     0|
|  100078|     1|   0|     0|
|  100090|     1|   0|     0|
|  100122|     1|   0|     0|
|  100123|     0|   0|     0|
|  100128|     1|   0|     0|
|  100134|     0|   0|     1|
|  100143|     0|   1|     0|
+--------+------+----+------+
only showing top 10 rows



In [24]:
# Collapse the three indicator columns into one integer label:
#   0 = nothing, 1 = smooth, 2 = edge, 3 = spiral
# The first matching condition wins, so smooth takes precedence over edge,
# and edge over spiral.
class_label = (
    F.when(F.col("smooth") == 1, 1)
    .when(F.col("edge") == 1, 2)
    .when(F.col("spiral") == 1, 3)
    .otherwise(0)
)

# Keep only the galaxy id and its merged label.
classes = class_df.withColumn("Class", class_label).select("galaxyID", "class")

classes.show(10)

+--------+-----+
|galaxyID|class|
+--------+-----+
|  100008|    3|
|  100023|    3|
|  100053|    1|
|  100078|    1|
|  100090|    1|
|  100122|    1|
|  100123|    0|
|  100128|    1|
|  100134|    3|
|  100143|    2|
+--------+-----+
only showing top 10 rows



In [27]:
# Load the images found under img_dir into the image table, using sparkdl's
# readImages helper.
image_df = readImages(img_dir)

In [26]:
# Count the rows of the image table and print the total.
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt,
# presumably because counting the full image directory is slow -- confirm
# before re-running on the whole dataset.
image_count = image_df.count()
print(image_count)

KeyboardInterrupt: 

In [None]:
# NOTE: this reassigns img_dir (set earlier to the galaxy image folder) to a
# placeholder; replace "/PATH/TO/personalities/" with a real directory before
# running, and re-run the earlier img_dir cell before reloading galaxy images.
img_dir = "/PATH/TO/personalities/"

# Split proportions (train, test) and a fixed seed. Without a seed,
# randomSplit draws a different split on every run, so results could not be
# reproduced.
SPLIT_WEIGHTS = [0.6, 0.4]
SPLIT_SEED = 42

#Read images and create labelled DataFrames for transfer learning
#(label 1 = jobs, label 0 = zuckerberg)
jobs_df = readImages(img_dir + "/jobs").withColumn("label", lit(1))
zuckerberg_df = readImages(img_dir + "/zuckerberg").withColumn("label", lit(0))

# Split each class separately so both classes appear in train and test.
jobs_train, jobs_test = jobs_df.randomSplit(SPLIT_WEIGHTS, seed=SPLIT_SEED)
zuckerberg_train, zuckerberg_test = zuckerberg_df.randomSplit(SPLIT_WEIGHTS, seed=SPLIT_SEED)

# union() is the supported spelling of unionAll(), which Spark 2.0 deprecated;
# both append rows by position without removing duplicates.

#dataframe for training a classification model
train_df = jobs_train.union(zuckerberg_train)

#dataframe for testing the classification model
test_df = jobs_test.union(zuckerberg_test)

In [None]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from sparkdl import DeepImageFeaturizer

In [None]:
# Feature-extraction stage: reads the "image" column and writes its output to
# "features", using the model named "InceptionV3".
featurizer = DeepImageFeaturizer(
    modelName="InceptionV3",
    inputCol="image",
    outputCol="features",
)

In [None]:
# Classifier stage: logistic regression trained against the "label" column.
lr = LogisticRegression(
    labelCol="label",
    maxIter=20,
    regParam=0.05,
    elasticNetParam=0.3,
)

In [None]:
# Two-stage pipeline: the featurizer runs first and the logistic regression
# is fitted after it.
pipeline_stages = [featurizer, lr]
p = Pipeline(stages=pipeline_stages)

In [None]:
# Print the image data format reported by the Keras backend.
from keras import backend as K
print(K.image_data_format())

In [None]:
# Fit the pipeline (featurizer followed by logistic regression) on the
# training DataFrame; the fitted model is used for the predictions below.
p_model = p.fit(train_df)

In [None]:
# Run the fitted pipeline over the test split.
predictions = p_model.transform(test_df)

# Show each image's file path next to its predicted label, without truncating
# long paths.
path_and_prediction = predictions.select("filePath", "prediction")
path_and_prediction.show(truncate=False)

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Score the fitted pipeline on the test split and display the result.
df = p_model.transform(test_df)
df.show()

# The evaluator compares the "prediction" column against the "label" column.
predictionAndLabels = df.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

# The metric is computed on test_df, so it is reported as test-set accuracy
# (the message previously said "Training set accuracy").
print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))