# Classify the Galaxies

## Set Up

In [1]:
#import all necessary libraries
import os

from pyspark.sql.functions import lit
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

import IPython.display as dp
from pyspark.ml.image import ImageSchema
from sparkdl.image import imageIO

from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from sparkdl import DeepImageFeaturizer

# Build (or reuse, via getOrCreate) the notebook's Spark session on a
# "local" master with the memory/core settings below.
spark = (
    SparkSession.builder
    .master("local")
    .appName("review_and_category_analytics")
    .config("spark.executor.memory", "8g")
    .config("spark.executor.cores", "4")
    .config("spark.cores.max", "4")
    .config("spark.driver.memory", "8g")
    .getOrCreate()
)

# Context handles derived from the session.
sc = spark.sparkContext
sqlCtx = SQLContext(sc)

Using TensorFlow backend.


## Create DataFrame of Images

### Verify That Images Can Be Loaded

In [None]:
# Directory holding the small subset of images.
direc = "data/galaxy_images_classified/small_subset/"

# Names of the entries in that directory, sorted by file name.
fs = sorted(os.listdir(direc))

# One IPython display Image object per file. `direc` ends with "/", so plain
# concatenation with the file name yields the file's path.
images = [dp.Image(filename=direc + name, format="png") for name in fs]

In [None]:
# Show the first 5 images. Slicing (rather than indexing 0..4) displays
# whatever is available and avoids an IndexError when fewer than 5 images
# were loaded.
for img in images[:5]:
    dp.display_png(img)

### Create Image DataFrames

#### IMPORTANT: Change the directories to the full dataset when not testing

In [None]:
# Pattern used for each galaxy type:
#   image_df = ImageSchema.readImages("data/galaxy_images_classified/[TYPE]")


def _read_labeled_images(path, label):
    """Read the images under *path* and add a constant "label" column."""
    return ImageSchema.readImages(path).withColumn("label", lit(label))


# Label values: other = 0, smooth = 1, edge = 2, spiral = 3.
smooth_df = _read_labeled_images("data/galaxy_images_classified/smooth/", 1)
edge_df = _read_labeled_images("data/galaxy_images_classified/edge/", 2)
spiral_df = _read_labeled_images("data/galaxy_images_classified/spiral/", 3)
other_df = _read_labeled_images("data/galaxy_images_classified/other/", 0)

In [None]:
# Split each galaxy type separately into train and test DataFrames, using the
# same weights (0.6 train / 0.4 test) and the same seed for every type.
split_weights = [0.6, 0.4]
split_seed = 123

smooth_train, smooth_test = smooth_df.randomSplit(split_weights, seed=split_seed)
edge_train, edge_test = edge_df.randomSplit(split_weights, seed=split_seed)
spiral_train, spiral_test = spiral_df.randomSplit(split_weights, seed=split_seed)
other_train, other_test = other_df.randomSplit(split_weights, seed=split_seed)

In [None]:
# Combine the per-type splits into the overall train and test sets.
# Only the "smooth" and "other" types are included for now; add in the other
# types of galaxies (edge, spiral) later.
# union() is used instead of unionAll(), which PySpark 2.x marks as deprecated
# in favour of union(); both append rows by column position without
# removing duplicates.
train_df = smooth_train.union(other_train)
test_df = smooth_test.union(other_test)

In [None]:
# Redistribute both datasets over the same number of partitions.
num_partitions = 100
train_df = train_df.repartition(num_partitions)
test_df = test_df.repartition(num_partitions)

## Train the Model

In [None]:
# Stage 1: featurizer that reads the "image" column and writes a "features"
# column, configured with the "InceptionV3" model name.
featurizer = DeepImageFeaturizer(
    inputCol="image",
    outputCol="features",
    modelName="InceptionV3",
)

# Stage 2: logistic regression trained against the "label" column.
lr = LogisticRegression(
    labelCol="label",
    maxIter=10,
    regParam=0.05,
    elasticNetParam=0.3,
)

# Chain the two stages and fit them on the training set.
p = Pipeline(stages=[featurizer, lr])
p_model = p.fit(train_df)