## Set Up

In [2]:
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import Row
from pyspark.sql import SparkSession

# One local Spark session shared by every cell in this notebook.
# "local[4]" runs Spark in-process with 4 worker threads, matching the 4-core
# settings below; plain "local" would use a single worker thread.
# NOTE(review): the app name looks carried over from another project — confirm.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("review_and_category_analytics")
    .config("spark.executor.memory", "8g")
    .config("spark.executor.cores", "4")
    .config("spark.cores.max", "4")
    .config("spark.driver.memory", "8g")
    .getOrCreate()
)

sc = spark.sparkContext

# Legacy SQL entry point; kept because other cells call sqlCtx.read / sqlCtx.sql.
sqlCtx = SQLContext(sc)

In [3]:
# Read the training-solutions CSV; the first row is the header and column
# types are inferred from the data.
training_csv = "data/training_solutions_rev1.csv"
galaxy_df = (
    sqlCtx.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(training_csv)
)

## Summarize the Data

In [4]:
# Print the column names and the types Spark inferred when loading the CSV.
galaxy_df.printSchema()

root
 |-- GalaxyID: integer (nullable = true)
 |-- Class1.1: double (nullable = true)
 |-- Class1.2: double (nullable = true)
 |-- Class1.3: double (nullable = true)
 |-- Class2.1: double (nullable = true)
 |-- Class2.2: double (nullable = true)
 |-- Class3.1: double (nullable = true)
 |-- Class3.2: double (nullable = true)
 |-- Class4.1: double (nullable = true)
 |-- Class4.2: double (nullable = true)
 |-- Class5.1: double (nullable = true)
 |-- Class5.2: double (nullable = true)
 |-- Class5.3: double (nullable = true)
 |-- Class5.4: double (nullable = true)
 |-- Class6.1: double (nullable = true)
 |-- Class6.2: double (nullable = true)
 |-- Class7.1: double (nullable = true)
 |-- Class7.2: double (nullable = true)
 |-- Class7.3: double (nullable = true)
 |-- Class8.1: double (nullable = true)
 |-- Class8.2: double (nullable = true)
 |-- Class8.3: double (nullable = true)
 |-- Class8.4: double (nullable = true)
 |-- Class8.5: double (nullable = true)
 |-- Class8.6: double (nullable = 

In [5]:
# Preview the first five rows of the loaded table.
galaxy_df.show(n=5)

+--------+--------+--------+--------+-----------+-----------+-----------+-----------+-----------+-----------+--------+-----------+-----------+-----------+--------+--------+-----------+-----------+-----------+--------+---------+-----------+---------+-----------+---------+--------+-----------+--------+-----------+-----------+-----------+-----------+-----------+-----------+---------+---------+---------+-----------+
|GalaxyID|Class1.1|Class1.2|Class1.3|   Class2.1|   Class2.2|   Class3.1|   Class3.2|   Class4.1|   Class4.2|Class5.1|   Class5.2|   Class5.3|   Class5.4|Class6.1|Class6.2|   Class7.1|   Class7.2|   Class7.3|Class8.1| Class8.2|   Class8.3| Class8.4|   Class8.5| Class8.6|Class8.7|   Class9.1|Class9.2|   Class9.3|  Class10.1|  Class10.2|  Class10.3|  Class11.1|  Class11.2|Class11.3|Class11.4|Class11.5|  Class11.6|
+--------+--------+--------+--------+-----------+-----------+-----------+-----------+-----------+-----------+--------+-----------+-----------+-----------+--------+-----

In [54]:
galaxy_df.createOrReplaceTempView("df")  # register the frame so SQL can query it as `df`


def rows_where(condition):
    """Return the rows of the `df` temp view that satisfy `condition`.

    Parameters
    ----------
    condition : str
        SQL boolean expression placed after WHERE. The columns used here
        contain dots, so they are written back-quoted (e.g. "`Class1.1` >= 0.5").

    Returns
    -------
    The DataFrame returned by sqlCtx.sql for the resulting SELECT statement.
    """
    return sqlCtx.sql("SELECT * FROM df WHERE " + condition)


def count_where(condition):
    """Return how many rows of the `df` temp view satisfy `condition`."""
    return rows_where(condition).count()


# How many objects are there
numTotal = galaxy_df.count()
print("There are", numTotal, "objects.")

# How many are not galaxies
numNot = count_where("`Class1.3` >= 0.5")
print("There are", numNot, "non-galaxies.")

# How many smooth galaxies
numSmooth = count_where("`Class1.1` >= 0.5")
print("There are", numSmooth, "smooth galaxies.")

# How many edge-on galaxies
numEdge = count_where("`Class1.2` >= 0.5 AND `Class2.1` >= 0.5")
print("There are", numEdge, "edge-on galaxies.")

# How many are not classified (no Class1 answer reaches 0.5)
numUncertain = count_where(
    "`Class1.1` < 0.5 AND `Class1.2` < 0.5 AND `Class1.3` < 0.5"
)
print("There are", numUncertain, "unclassified objects.")

# How many spiral galaxies
numSpiral = count_where(
    "`Class1.2` >= 0.5 AND `Class2.1` < 0.5 AND `Class4.1` >= 0.5"
)
print("There are", numSpiral, "spiral galaxies.")

# How many other
numOdd = count_where(
    "`Class1.1` < 0.5 AND `Class2.1` < 0.5 AND `Class4.1` < 0.5 AND `Class6.1` >= 0.5"
)
print("There are", numOdd, "other galaxies.")

# Objects left over after subtracting the buckets above from the total.
# NOTE(review): numUncertain is not subtracted, and the filters are not mutually
# exclusive (the "other" filter puts no constraint on `Class1.2` or `Class1.3`),
# so this is not a true remainder and can be negative — confirm the intended
# bucketing.
print(numTotal-numNot-numSmooth-numEdge-numSpiral-numOdd)

# Rows matching the "other" filter except that `Class6.1` is below 0.5;
# kept as a DataFrame so it can be inspected in a later cell.
NotnumOdd = rows_where(
    "`Class1.1` < 0.5 AND `Class2.1` < 0.5 AND `Class4.1` < 0.5 AND `Class6.1` < 0.5"
)

There are 61578 objects.
There are 44 non-galaxies.
There are 25868 smooth galaxies.
There are 34114 edge-on galaxies.
There are 1555 unclassified objects.
There are 10397 spiral galaxies.
There are 4587 other galaxies.
-13432


In [42]:
# Preview five of the rows that fell outside the "other" bucket.
# NOTE(review): this cell's execution count (42) is lower than that of the cell
# defining NotnumOdd (54); re-run the notebook top to bottom to confirm it
# still works on a fresh kernel.
NotnumOdd.show(n=5)

+--------+--------+--------+--------+--------+--------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+--------+--------+-----------+-----------+-----------+----------+---------+-----------+-----------+-----------+--------+--------+--------+--------+--------+-----------+-----------+---------+-----------+-----------+---------+---------+-----------+-----------+
|GalaxyID|Class1.1|Class1.2|Class1.3|Class2.1|Class2.2|   Class3.1|   Class3.2|   Class4.1|   Class4.2|   Class5.1|   Class5.2|   Class5.3|   Class5.4|Class6.1|Class6.2|   Class7.1|   Class7.2|   Class7.3|  Class8.1| Class8.2|   Class8.3|   Class8.4|   Class8.5|Class8.6|Class8.7|Class9.1|Class9.2|Class9.3|  Class10.1|  Class10.2|Class10.3|  Class11.1|  Class11.2|Class11.3|Class11.4|  Class11.5|  Class11.6|
+--------+--------+--------+--------+--------+--------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+--------+--------+-----------