In [1]:
## Import Libraries
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.types import StructType, StructField, DoubleType, IntegerType
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

## Set seed
## Single notebook-wide random seed so stochastic steps (e.g. the train/test split) are repeatable
seed = 42

In [2]:
## Create Spark Session
## getOrCreate() hands back the already-running session if there is one, otherwise starts a new one
spark = (
    SparkSession.builder
    .appName('dtConsProject')
    .getOrCreate()
)

In [3]:
## Setup Schema
## Explicit column types for the CSV: integer a, b, d; double c and spoiled. Every column is nullable.
schema = (
    StructType()
    .add('a', IntegerType(), True)
    .add('b', IntegerType(), True)
    .add('c', DoubleType(), True)
    .add('d', IntegerType(), True)
    .add('spoiled', DoubleType(), True)
)

In [4]:
## Load Data
## Read the CSV with the hand-written schema (schema inference stays off); first row is the header
df = (
    spark.read
    .schema(schema)
    .options(header=True, inferSchema=False)
    .csv('gs://spark-training-data/datasets/dog_food.csv')
)
df.show(n=5)
df.printSchema()  ## Confirm proper schema

[Stage 0:>                                                          (0 + 1) / 1]

+---+---+----+---+-------+
|  a|  b|   c|  d|spoiled|
+---+---+----+---+-------+
|  4|  2|12.0|  3|    1.0|
|  5|  6|12.0|  7|    1.0|
|  6|  2|13.0|  6|    1.0|
|  4|  2|12.0|  1|    1.0|
|  4|  2|12.0|  3|    1.0|
+---+---+----+---+-------+
only showing top 5 rows

root
 |-- a: integer (nullable = true)
 |-- b: integer (nullable = true)
 |-- c: double (nullable = true)
 |-- d: integer (nullable = true)
 |-- spoiled: double (nullable = true)



                                                                                

In [5]:
## Assembler & Create modeling df
## Pack the four raw columns into a single vector column named 'features'
assembler = VectorAssembler(
    outputCol='features',
    inputCols=['a', 'b', 'c', 'd'],
)
output_features = assembler.transform(df)

## Peek at the first assembled row (returned as a one-element list of Row)
output_features.take(1)

                                                                                

[Row(a=4, b=2, c=12.0, d=3, spoiled=1.0, features=DenseVector([4.0, 2.0, 12.0, 3.0]))]

In [7]:
## Setup Final Data
## Keep only the assembled feature vector and the label column for modeling
final_data = output_features.select('features', 'spoiled')
final_data.show(n=5)

[Stage 2:>                                                          (0 + 1) / 1]

+------------------+-------+
|          features|spoiled|
+------------------+-------+
|[4.0,2.0,12.0,3.0]|    1.0|
|[5.0,6.0,12.0,7.0]|    1.0|
|[6.0,2.0,13.0,6.0]|    1.0|
|[4.0,2.0,12.0,1.0]|    1.0|
|[4.0,2.0,12.0,3.0]|    1.0|
+------------------+-------+
only showing top 5 rows



                                                                                

In [8]:
## Split into train, test
## 70/30 random split; the shared seed keeps the partition repeatable
train_data, test_data = final_data.randomSplit(
    weights=[0.7, 0.3],
    seed=seed,
)

In [9]:
## Setup Classification Models & fit training data
## Each estimator is given the notebook-wide `seed` explicitly. Without it PySpark falls back
## to a default seed derived from a hash of the estimator's class name, which can differ between
## Python sessions, so the fitted models (notably the random forest, which samples rows and
## features) would not be reproducible after a kernel restart.
dtc = DecisionTreeClassifier(labelCol='spoiled', featuresCol='features', seed=seed)
dtc_model = dtc.fit(train_data)

rfc = RandomForestClassifier(labelCol='spoiled', featuresCol='features', seed=seed)
rfc_model = rfc.fit(train_data)

gbt = GBTClassifier(labelCol='spoiled', featuresCol='features', seed=seed)
gbt_model = gbt.fit(train_data)

                                                                                

In [10]:
## Make Predictions for test data
## Score the held-out split with each fitted model (decision tree, random forest, GBT) ...
dtc_preds, rfc_preds, gbt_preds = (
    fitted.transform(test_data)
    for fitted in (dtc_model, rfc_model, gbt_model)
)

## ... then preview the first five scored rows of each, in the same order
for preds in (dtc_preds, rfc_preds, gbt_preds):
    preds.show(5)

+------------------+-------+-------------+--------------------+----------+
|          features|spoiled|rawPrediction|         probability|prediction|
+------------------+-------+-------------+--------------------+----------+
|[1.0,1.0,12.0,4.0]|    1.0|   [0.0,92.0]|           [0.0,1.0]|       1.0|
| [1.0,3.0,8.0,3.0]|    0.0|  [243.0,1.0]|[0.99590163934426...|       0.0|
| [1.0,3.0,9.0,8.0]|    0.0|  [243.0,1.0]|[0.99590163934426...|       0.0|
| [1.0,4.0,8.0,1.0]|    0.0|  [243.0,1.0]|[0.99590163934426...|       0.0|
| [1.0,4.0,9.0,6.0]|    0.0|  [243.0,1.0]|[0.99590163934426...|       0.0|
+------------------+-------+-------------+--------------------+----------+
only showing top 5 rows

+------------------+-------+--------------------+--------------------+----------+
|          features|spoiled|       rawPrediction|         probability|prediction|
+------------------+-------+--------------------+--------------------+----------+
|[1.0,1.0,12.0,4.0]|    1.0|[0.16666666666666...|[0.00

In [11]:
## Evaluate Models using Binary
## No metricName is passed, so the evaluator's default metric is reported
## (areaUnderROC according to the PySpark docs)
my_binary_eval = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='spoiled',
)

## One line per model, in the order DTC, RFC, GBT
print(
    f'DTC Eval: {my_binary_eval.evaluate(dtc_preds)}',
    f'RFC Eval: {my_binary_eval.evaluate(rfc_preds)}',
    f'GBT Eval: {my_binary_eval.evaluate(gbt_preds)}',
    sep='\n',
)

DTC Eval: 0.9936507936507937
RFC Eval: 0.9898412698412699
GBT Eval: 0.9952380952380951


In [12]:
## Evaluate Models using Multi
## Accuracy: compares the 'prediction' column against the 'spoiled' label
accuracy_eval = MulticlassClassificationEvaluator(
    metricName='accuracy',
    predictionCol='prediction',
    labelCol='spoiled',
)

## One line per model, in the order DTC, RFC, GBT
print(
    f'DTC Accuracy: {accuracy_eval.evaluate(dtc_preds)}',
    f'RFC Accuracy: {accuracy_eval.evaluate(rfc_preds)}',
    f'GBT Accuracy: {accuracy_eval.evaluate(gbt_preds)}',
    sep='\n',
)

DTC Accuracy: 0.992
RFC Accuracy: 0.992
GBT Accuracy: 0.992
