In [50]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import when
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [51]:
spark = SparkSession.builder.getOrCreate()

In [52]:
df_train = spark.read.csv("Planet_Training.csv", header=True, inferSchema=True)
df_train = df_train.select("Temperature", "Atmosphere Color", "Water", "Habitable")
df_train = df_train.na.drop()

In [53]:
df_train.show()

+-----------+----------------+------+---------+
|Temperature|Atmosphere Color| Water|Habitable|
+-----------+----------------+------+---------+
|     323488|          Yellow|Medium|        1|
|     319279|          Yellow|   Low|        1|
|     315375|          Yellow|   Low|        1|
|     302312|          Yellow|Medium|        1|
|     329687|          Yellow|   Low|        1|
|     265746|             Red|  High|        0|
|     305214|          Yellow|  High|        1|
|     299936|          Yellow|  High|        0|
|     269577|             Red|Medium|        1|
|     303631|             Red|  High|        0|
|     290051|             Red|  High|        0|
|     306122|          Yellow|   Low|        1|
|     300635|          Yellow|   Low|        1|
|     312152|            Blue|  High|        0|
|     265942|            Blue|Medium|        0|
|     307368|             Red|  High|        0|
|     276274|          Yellow|Medium|        1|
|     308531|          Yellow|Medium|   

In [54]:
df_test = spark.read.csv("Planet_Testing.csv", header=True, inferSchema=True)
df_test = df_test.select("Temperature", "Atmosphere Color", "Water", "Habitable")
df_test = df_test.na.drop()

In [55]:
df_test.show()

+-----------+----------------+------+---------+
|Temperature|Atmosphere Color| Water|Habitable|
+-----------+----------------+------+---------+
|     325145|          Yellow|  High|        1|
|     269079|             Red|Medium|        0|
|     302996|          Yellow|  High|        1|
|     312604|          Yellow|  High|        1|
|     280875|          Yellow|   Low|        1|
|     306384|             Red|  High|        0|
|     303007|          Yellow|  High|        1|
|     297965|             Red|  High|        0|
|     290305|            Blue|  High|        0|
|     316596|             Red|  High|        0|
|     266840|          Yellow|   Low|        1|
|     324187|          Yellow|   Low|        1|
|     327198|          Yellow|   Low|        1|
|     310018|          Yellow|  High|        0|
|     302718|            Blue|  High|        1|
|     301697|          Yellow|   Low|        1|
|     316071|          Yellow|Medium|        1|
|     271765|          Yellow|   Low|   

In [56]:
def parse(df):
    df = df.withColumn("Atmosphere Color", when(df["Atmosphere Color"]=="Red",0).when(df["Atmosphere Color"]=="Blue",1).when(df["Atmosphere Color"]=="Yellow",2))
    df = df.withColumn("Water", when(df["Water"]=="Low",0).when(df["Water"]=="Medium",1).when(df["Water"]=="High",2))
    
    cols = df.columns
    cols.remove("Habitable")
    df = VectorAssembler(inputCols = cols, outputCol = "Features").transform(df)
    
    scaler = StandardScaler(inputCol = "Features", outputCol = "Scaled_Features")
    df = scaler.fit(df).transform(df)
    
    return df

In [57]:
df_train = parse(df_train)
df_test = parse(df_test)

In [60]:
model = LogisticRegression(featuresCol = "Scaled_Features", labelCol = "Habitable", maxIter=10).fit(df_train)

prediction = model.transform(df_test)

In [61]:
evaluator = BinaryClassificationEvaluator(labelCol="Habitable")
acc = evaluator.evaluate(prediction) * 100
print("Accuracy: {}%".format(acc))

Accuracy: 91.71043337232418%
