In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession

In [2]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .master('local[4]')
    .appName('spoiled_food')
    .getOrCreate()
)

### Use RF to Determine Which Chemical Causes Early Spoiling

In [4]:
# we probably don't need machine learning to solve this
# a train/test split doesn't matter much here, since we are not evaluating predictions
# what we really want is the impurity-based (entropy / Gini) feature importances that decision trees provide

In [8]:
# Load the dog-food CSV: the first line holds the column names and the
# column types are inferred from the values.
data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('../data/dog_food.csv')
)

In [9]:
# Inspect the column names and the types that inferSchema assigned.
data.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)



In [10]:
# Preview the data (show() prints only the top 20 rows here).
data.show()

+---+---+----+---+-------+
|  A|  B|   C|  D|Spoiled|
+---+---+----+---+-------+
|  4|  2|12.0|  3|    1.0|
|  5|  6|12.0|  7|    1.0|
|  6|  2|13.0|  6|    1.0|
|  4|  2|12.0|  1|    1.0|
|  4|  2|12.0|  3|    1.0|
| 10|  3|13.0|  9|    1.0|
|  8|  5|14.0|  5|    1.0|
|  5|  8|12.0|  8|    1.0|
|  6|  5|12.0|  9|    1.0|
|  3|  3|12.0|  1|    1.0|
|  9|  8|11.0|  3|    1.0|
|  1| 10|12.0|  3|    1.0|
|  1|  5|13.0| 10|    1.0|
|  2| 10|12.0|  6|    1.0|
|  1| 10|11.0|  4|    1.0|
|  5|  3|12.0|  2|    1.0|
|  4|  9|11.0|  8|    1.0|
|  5|  1|11.0|  1|    1.0|
|  4|  9|12.0| 10|    1.0|
|  5|  8|10.0|  9|    1.0|
+---+---+----+---+-------+
only showing top 20 rows



In [11]:
# Summary statistics (count, mean, stddev, min, max) for every column.
data.describe().show()

+-------+------------------+------------------+------------------+------------------+-------------------+
|summary|                 A|                 B|                 C|                 D|            Spoiled|
+-------+------------------+------------------+------------------+------------------+-------------------+
|  count|               490|               490|               490|               490|                490|
|   mean|  5.53469387755102| 5.504081632653061| 9.126530612244897| 5.579591836734694| 0.2857142857142857|
| stddev|2.9515204234399057|2.8537966089662063|2.0555451971054275|2.8548369309982857|0.45221563164613465|
|    min|                 1|                 1|               5.0|                 1|                0.0|
|    max|                10|                10|              14.0|                10|                1.0|
+-------+------------------+------------------+------------------+------------------+-------------------+



In [14]:
# check for missing data: drop every row that contains a null and recompute the
# summary; a count that is still 490 in every column means nothing was missing
data.dropna().describe().show()

+-------+------------------+------------------+------------------+------------------+-------------------+
|summary|                 A|                 B|                 C|                 D|            Spoiled|
+-------+------------------+------------------+------------------+------------------+-------------------+
|  count|               490|               490|               490|               490|                490|
|   mean|  5.53469387755102| 5.504081632653061| 9.126530612244897| 5.579591836734694| 0.2857142857142857|
| stddev|2.9515204234399057|2.8537966089662063|2.0555451971054275|2.8548369309982857|0.45221563164613465|
|    min|                 1|                 1|               5.0|                 1|                0.0|
|    max|                10|                10|              14.0|                10|                1.0|
+-------+------------------+------------------+------------------+------------------+-------------------+



### Create Features

In [15]:
from pyspark.ml.feature import VectorAssembler

In [16]:
# The four chemical columns are packed into a single vector column named
# 'features'; their order here fixes the index of each one inside the vector.
chemical_columns = ['A', 'B', 'C', 'D']
assembler = VectorAssembler(inputCols=chemical_columns, outputCol='features')

In [19]:
# Add the 'features' vector column, then keep only that column and the label.
assembled = assembler.transform(data)
with_features = assembled.select('features', 'Spoiled')

In [20]:
# Confirm the assembled frame holds just the feature vector and the label.
with_features.printSchema()

root
 |-- features: vector (nullable = true)
 |-- Spoiled: double (nullable = true)



In [22]:
# Look at the first five assembled rows.
with_features.head(5)

[Row(features=DenseVector([4.0, 2.0, 12.0, 3.0]), Spoiled=1.0),
 Row(features=DenseVector([5.0, 6.0, 12.0, 7.0]), Spoiled=1.0),
 Row(features=DenseVector([6.0, 2.0, 13.0, 6.0]), Spoiled=1.0),
 Row(features=DenseVector([4.0, 2.0, 12.0, 1.0]), Spoiled=1.0),
 Row(features=DenseVector([4.0, 2.0, 12.0, 3.0]), Spoiled=1.0)]

In [24]:
# number of rows labelled as spoiled (Spoiled == 1)
with_features.where(with_features.Spoiled == 1).count()

140

In [25]:
# number of rows labelled as not spoiled (Spoiled == 0)
with_features.where(with_features.Spoiled == 0).count()

350

In [26]:
# we see there is some class imbalance: 140 rows are spoiled and 350 are not

### Train RF Classifier with All Data

In [27]:
# we don't need train / test split, just trying to find important attributes

In [28]:
from pyspark.ml.classification import RandomForestClassifier

In [30]:
# Random forest with 100 trees predicting the 'Spoiled' label from the default
# 'features' column. The seed is fixed so that the random sampling done while
# growing the trees, and therefore the reported feature importances, are
# reproducible when the notebook is re-run.
rfc = RandomForestClassifier(numTrees=100, labelCol='Spoiled', seed=42)

In [31]:
# Fit on the full dataset: there is no train/test split because only the
# feature importances are of interest, not predictive accuracy.
rfc_model = rfc.fit(with_features)

In [32]:
# Importance of each input column; indices 0-3 follow the assembler's
# inputCols order (A, B, C, D).
rfc_model.featureImportances

SparseVector(4, {0: 0.021, 1: 0.0216, 2: 0.9344, 3: 0.023})

In [33]:
# we see that the 3rd attribute (C) accounts for 93.44% of the total feature importance,
# so C is by far the strongest predictor of spoilage

### Does Attribute C Indicate Spoilage?

In [53]:
# Rows where the food spoiled, reduced to chemical C and the label.
spoiled_data = (
    data
    .where(data.Spoiled == 1)
    .select('C', 'Spoiled')
)

In [55]:
# Rows where the food did not spoil, reduced to chemical C and the label.
non_spoiled_data = (
    data
    .where(data.Spoiled == 0)
    .select('C', 'Spoiled')
)

In [57]:
# Summary statistics of C for the spoiled rows.
spoiled_data.describe().show()

+-------+------------------+-------+
|summary|                 C|Spoiled|
+-------+------------------+-------+
|  count|               140|    140|
|   mean|11.914285714285715|    1.0|
| stddev|0.9706907300060253|    0.0|
|    min|               9.0|    1.0|
|    max|              14.0|    1.0|
+-------+------------------+-------+



In [58]:
# Summary statistics of C for the non-spoiled rows, for comparison.
non_spoiled_data.describe().show()

+-------+-----------------+-------+
|summary|                C|Spoiled|
+-------+-----------------+-------+
|  count|              350|    350|
|   mean| 8.01142857142857|    0.0|
| stddev|1.086455140730764|    0.0|
|    min|              5.0|    0.0|
|    max|             11.0|    0.0|
+-------+-----------------+-------+



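- spoiled data has chemical C with a mean of 11.9 (range 9 to 14)
- non-spoiled data has chemical C with a mean of 8.0 (range 5 to 11)
- looks like we need to reduce chemical C: no spoiled sample has C below 9, so keeping C below 9 avoids every spoiled case in this data (the two groups overlap for C between 9 and 11, so `C <= 9` would still include some spoiled samples)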