# Classify the Galaxies

## Set Up

In [12]:
#import all necessary libraries
import os

from pyspark.sql.functions import lit
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

import IPython.display as dp
from pyspark.ml.image import ImageSchema
from sparkdl.image import imageIO

# Build (or reuse, via getOrCreate) the Spark session used throughout the notebook.
# NOTE(review): master "local" runs Spark with a single worker thread, so the
# executor/cores settings below presumably have no effect in this mode -
# confirm whether "local[4]" was intended.
spark = (
    SparkSession.builder
    .master("local")
    .appName("review_and_category_analytics")
    .config("spark.executor.memory", "8g")
    .config("spark.executor.cores", "4")
    .config("spark.cores.max", "4")
    .config("spark.driver.memory", "8g")
    .getOrCreate()
)

# Underlying SparkContext of the session.
sc = spark.sparkContext

# SQLContext wrapper around the context; used below to read the CSV table.
sqlCtx = SQLContext(sc)

Using TensorFlow backend.


In [2]:
# Relative path (with trailing slash) to the directory of training images.
img_dir = "data/images_training_rev1/"

In [3]:
# Load the galaxy classification table from CSV.
# The first row is treated as the header and column types are inferred.
class_df = (
    sqlCtx.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("data/galaxyClasses.csv")
)

# Preview the first 10 rows.
class_df.show(10)

+--------+------+----+------+
|GalaxyID|Smooth|Edge|Spiral|
+--------+------+----+------+
|  100008|     0|   0|     1|
|  100023|     0|   0|     1|
|  100053|     1|   0|     0|
|  100078|     1|   0|     0|
|  100090|     1|   0|     0|
|  100122|     1|   0|     0|
|  100123|     0|   0|     0|
|  100128|     1|   0|     0|
|  100134|     0|   0|     1|
|  100143|     0|   1|     0|
+--------+------+----+------+
only showing top 10 rows



In [4]:
# Collapse the three indicator columns into a single integer label:
#   {0: nothing, 1: smooth, 2: edge, 3: spiral}
# The branches are evaluated in order, so the first indicator equal to 1 wins;
# rows with no indicator set fall through to 0.
label_expr = (
    F.when(F.col("smooth") == 1, 1)
    .when(F.col("edge") == 1, 2)
    .when(F.col("spiral") == 1, 3)
    .otherwise(0)
)

# Keep only the galaxy id and the merged label.
classes = class_df.withColumn("Class", label_expr).select("galaxyID", "class")

classes.show(10)

+--------+-----+
|galaxyID|class|
+--------+-----+
|  100008|    3|
|  100023|    3|
|  100053|    1|
|  100078|    1|
|  100090|    1|
|  100122|    1|
|  100123|    0|
|  100128|    1|
|  100134|    3|
|  100143|    2|
+--------+-----+
only showing top 10 rows



### Create DataFrame of Images

In [None]:
# File names of every image in the image directory, sorted by name so the
# ordering is stable between runs. Uses the `img_dir` constant declared above
# instead of repeating the hard-coded path.
fs = sorted(os.listdir(img_dir))

# One IPython Image display object per file, in the same order as `fs`.
# NOTE(review): this creates an object for every file in the directory;
# presumably each one holds the file's bytes - confirm memory use is acceptable
# for the full image set.
images = [
    dp.Image(filename=os.path.join(img_dir, name), format="png")
    for name in fs
]

In [None]:
# Show the first 5 images.
# Slicing (rather than indexing range(5)) avoids an IndexError when the
# directory holds fewer than five images.
for img in images[:5]:
    dp.display_png(img)

#### IMPORTANT: Switch the directory to the full image set (`data/images_training_rev1/`) when not testing

In [9]:
# Create the images DataFrame by reading the image files from a directory.
# A small subset directory is used while testing; swap in the commented-out
# line below to read the full training set.

#image_df = ImageSchema.readImages("data/images_training_rev1/")

# Fixed: the result was previously bound to the misspelled name `image_fd`,
# so code reading `image_df` failed with a NameError on a fresh kernel.
image_df = ImageSchema.readImages("data/subset_images/")

In [10]:
# Preview the first 5 rows of the images DataFrame.
image_df.show(5)

+--------------------+
|               image|
+--------------------+
|[file:/home/jovya...|
|[file:/home/jovya...|
|[file:/home/jovya...|
|[file:/home/jovya...|
|[file:/home/jovya...|
+--------------------+
only showing top 5 rows

