# Split All Images into Classified Folders

## Set Up

In [1]:
#import all necessary libraries
import os

from pyspark.sql.functions import lit
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

import IPython.display as dp
from pyspark.ml.image import ImageSchema
from sparkdl.image import imageIO

spark = SparkSession \
    .builder \
    .master("local") \
    .appName("review_and_category_analytics") \
    .config("spark.executor.memory", '8g') \
    .config('spark.executor.cores', '4') \
    .config('spark.cores.max', '4') \
    .config("spark.driver.memory",'8g') \
    .getOrCreate()

sc = spark.sparkContext

sqlCtx = SQLContext(sc)

Using TensorFlow backend.


In [2]:
#Load classification table
class_df = sqlCtx.read.\
    format("csv").\
    option("header", "true").\
    option("inferSchema", "true").\
    load("data/galaxyClasses.csv")

class_df.show(10)

+--------+------+----+------+
|GalaxyID|Smooth|Edge|Spiral|
+--------+------+----+------+
|  100008|     0|   0|     1|
|  100023|     0|   0|     1|
|  100053|     1|   0|     0|
|  100078|     1|   0|     0|
|  100090|     1|   0|     0|
|  100122|     1|   0|     0|
|  100123|     0|   0|     0|
|  100128|     1|   0|     0|
|  100134|     0|   0|     1|
|  100143|     0|   1|     0|
+--------+------+----+------+
only showing top 10 rows



### Create List of GalaxyID for Each Type of Galaxy

In [3]:
#Find ID's of each kind of galaxy type
smoothID = class_df.where(F.col("smooth") == 1).select('galaxyID').rdd.map(lambda x: x[0]).collect()
edgeID = class_df.where(F.col("edge") == 1).select('galaxyID').rdd.map(lambda x: x[0]).collect()
spiralID = class_df.where(F.col("spiral") == 1).select('galaxyID').rdd.map(lambda x: x[0]).collect()
otherID = class_df.where((F.col("edge") == 0) & (F.col("smooth") == 0) & (F.col("spiral") == 0)).select('galaxyID').rdd.map(lambda x: x[0]).collect()

In [4]:
#Verify that there are no leftover images (should return 0)
len(smoothID) + len(edgeID) + len(spiralID) + len(otherID) - class_df.count()

0

### Move Photos to Appropriate Folder

#### Smooth Galaxies

In [None]:
#Create file directories of all smooth galaxies

smooth_dir = [] #blank list of file directories for all smooth galaxies

#loop through smooth ID's and generate corresponding file directory
for smooth in smoothID:
    smooth_dir.append("data/images_training_rev1/" + str(smooth) + ".jpg")

In [None]:
#Move all smooth galaxies to appropriate folder
for img in smooth_dir:
    !mv {img} data/galaxy_images_classified/smooth/

#### Edge Galaxies

In [5]:
#Create file directories of all edge galaxies

edge_dir = [] #blank list of file directories for all edge galaxies

#loop through edge ID's and generate corresponding file directory
for edge in edgeID:
    edge_dir.append("data/images_training_rev1/" + str(edge) + ".jpg")

In [7]:
#Move all edge galaxies to appropriate folder
for img in range(len(edge_dir) - 1):
    if img/len(edge_dir) % 0.05 == 0:
        print(img/len(edge_dir), "% completed...")
    !mv {edge_dir[img]} data/galaxy_images_classified/edge/

0.0 % completed...


#### Spiral

In [8]:
#Create file directories of all spiral galaxies

spiral_dir = [] #blank list of file directories for all spiral galaxies

#loop through spiral ID's and generate corresponding file directory
for spiral in spiralID:
    spiral_dir.append("data/images_training_rev1/" + str(spiral) + ".jpg")

In [13]:
#Move all spiral galaxies to appropriate folder
for img in range(len(spiral_dir) - 1):
    if img/15000 % 0.05 == 0:
        print(img/15000, "% completed...")
    !mv {spiral_dir[img]} data/galaxy_images_classified/spiral/

0.0 % completed...
0.05 % completed...
0.1 % completed...
0.2 % completed...
0.4 % completed...
0.8 % completed...
