In [0]:
from pyspark.sql import SparkSession

In [0]:
# Obtain the Spark session for this notebook (reuses an existing one if present).
spark = (
    SparkSession.builder
    .appName('dog_food')
    .getOrCreate()
)

In [0]:
# Load the dog-food CSV: the first row is treated as the header and
# column types are inferred from the values.
data = spark.read.csv(
    '/FileStore/tables/dog_food.csv',
    header=True,
    inferSchema=True,
)

In [0]:
# Show the column names and types of the loaded DataFrame.
data.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)



In [0]:
# Peek at the first ten rows as a list of Row objects.
data.head(n=10)

Out[7]: [Row(A=4, B=2, C=12.0, D=3, Spoiled=1.0),
 Row(A=5, B=6, C=12.0, D=7, Spoiled=1.0),
 Row(A=6, B=2, C=13.0, D=6, Spoiled=1.0),
 Row(A=4, B=2, C=12.0, D=1, Spoiled=1.0),
 Row(A=4, B=2, C=12.0, D=3, Spoiled=1.0),
 Row(A=10, B=3, C=13.0, D=9, Spoiled=1.0),
 Row(A=8, B=5, C=14.0, D=5, Spoiled=1.0),
 Row(A=5, B=8, C=12.0, D=8, Spoiled=1.0),
 Row(A=6, B=5, C=12.0, D=9, Spoiled=1.0),
 Row(A=3, B=3, C=12.0, D=1, Spoiled=1.0)]

In [0]:
from pyspark.ml.feature import VectorAssembler

In [0]:
# Combine columns A-D into a single vector column named 'features'.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=['A', 'B', 'C', 'D'],
)

In [0]:
# Apply the assembler to `data`, producing a DataFrame that also carries the assembled output column.
data_features = assembler.transform(data)

In [0]:
# Keep only the assembled feature vector and the label column for modelling.
final_data = data_features.select(
    'features',
    'Spoiled',
)

In [0]:
# Split roughly 80% / 20% into training and test sets. The fixed seed makes
# the split repeatable, so the fitted models and the metrics computed from
# them do not change between Restart & Run All executions.
train_data, test_data = final_data.randomSplit([0.8, 0.2], seed=42)

In [0]:
from pyspark.ml.classification import (DecisionTreeClassifier,  RandomForestClassifier, GBTClassifier)

In [0]:
# Three tree-based classifiers, all reading the 'features' vector and
# predicting the 'Spoiled' label.
DTC_model = DecisionTreeClassifier(
    featuresCol='features',
    labelCol='Spoiled',
)
RFC_model = RandomForestClassifier(
    featuresCol='features',
    labelCol='Spoiled',
)
GBT_model = GBTClassifier(
    featuresCol='features',
    labelCol='Spoiled',
)

In [0]:
# Fit each classifier on the training split, in the order:
# decision tree, random forest, gradient-boosted trees.
dtc, rfc, gbt = [
    estimator.fit(train_data)
    for estimator in (DTC_model, RFC_model, GBT_model)
]

In [0]:
# Score the held-out test split with each fitted model (same order as above:
# decision tree, random forest, gradient-boosted trees).
dtc_predictions, rfc_predictions, gbt_predictions = [
    fitted.transform(test_data)
    for fitted in (dtc, rfc, gbt)
]

In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [0]:
# Evaluator that scores predictions against the 'Spoiled' label column.
# No metric is specified here, so the evaluator's default metric applies.
my_binary_evaluator = BinaryClassificationEvaluator(
    labelCol='Spoiled',
)

In [0]:
# Print the evaluator's score for each model's test predictions on one line,
# in the order: decision tree, random forest, gradient-boosted trees.
print(*[
    my_binary_evaluator.evaluate(predictions)
    for predictions in (dtc_predictions, rfc_predictions, gbt_predictions)
])

1.0 1.0 1.0


In [0]:
# Display the decision tree's predictions on the test split as a table.
dtc_predictions.show()

+-------------------+-------+-------------+--------------------+----------+
|           features|Spoiled|rawPrediction|         probability|prediction|
+-------------------+-------+-------------+--------------------+----------+
| [1.0,1.0,12.0,2.0]|    1.0|   [0.0,78.0]|           [0.0,1.0]|       1.0|
| [1.0,1.0,13.0,3.0]|    1.0|   [0.0,78.0]|           [0.0,1.0]|       1.0|
|  [1.0,3.0,9.0,8.0]|    0.0|  [241.0,1.0]|[0.99586776859504...|       0.0|
|  [1.0,4.0,9.0,3.0]|    0.0|  [241.0,1.0]|[0.99586776859504...|       0.0|
|[1.0,5.0,13.0,10.0]|    1.0|   [0.0,78.0]|           [0.0,1.0]|       1.0|
|  [1.0,6.0,8.0,1.0]|    0.0|  [241.0,1.0]|[0.99586776859504...|       0.0|
|  [1.0,6.0,8.0,9.0]|    0.0|  [241.0,1.0]|[0.99586776859504...|       0.0|
|  [1.0,7.0,7.0,2.0]|    0.0|  [241.0,1.0]|[0.99586776859504...|       0.0|
|  [1.0,7.0,8.0,2.0]|    0.0|  [241.0,1.0]|[0.99586776859504...|       0.0|
|  [1.0,7.0,8.0,4.0]|    0.0|  [241.0,1.0]|[0.99586776859504...|       0.0|
|  [1.0,8.0,

In [0]:
# Feature importances from the fitted random forest. Vector indices follow the
# assembler's inputCols order: 0 -> A, 1 -> B, 2 -> C, 3 -> D.
rfc.featureImportances

Out[27]: SparseVector(4, {0: 0.0252, 1: 0.0243, 2: 0.9182, 3: 0.0323})

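## Chemical C causes early spoilage

Feature index 2 (column `C`) accounts for about 0.92 of the random forest's feature importance, far more than A, B, or D (each around 0.02–0.03).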