# Tree Methods Consulting Project 

In [2]:
import os

import findspark

# Locate the Spark installation. Prefer the SPARK_HOME environment variable so the
# notebook is not tied to one machine's directory layout; fall back to the original
# local install path when the variable is not set.
findspark.init(os.environ.get('SPARK_HOME', '/home/hale/spark-2.4.3-bin-hadoop2.7/'))
from pyspark.sql import SparkSession

In [3]:
# Create the Spark session for this notebook (or reuse one that is already running).
spark = (
    SparkSession.builder
    .appName('dogfood')
    .getOrCreate()
)

In [4]:
# Load the dog food measurements; the first line of the CSV holds the column names
# and the column types are inferred from the values.
data = spark.read.csv(
    'dog_food.csv',
    header=True,
    inferSchema=True,
)

In [5]:
# Print the column names and the types Spark inferred for them.
data.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)



In [6]:
# Preview the first 20 rows of the raw data.
data.show()

+---+---+----+---+-------+
|  A|  B|   C|  D|Spoiled|
+---+---+----+---+-------+
|  4|  2|12.0|  3|    1.0|
|  5|  6|12.0|  7|    1.0|
|  6|  2|13.0|  6|    1.0|
|  4|  2|12.0|  1|    1.0|
|  4|  2|12.0|  3|    1.0|
| 10|  3|13.0|  9|    1.0|
|  8|  5|14.0|  5|    1.0|
|  5|  8|12.0|  8|    1.0|
|  6|  5|12.0|  9|    1.0|
|  3|  3|12.0|  1|    1.0|
|  9|  8|11.0|  3|    1.0|
|  1| 10|12.0|  3|    1.0|
|  1|  5|13.0| 10|    1.0|
|  2| 10|12.0|  6|    1.0|
|  1| 10|11.0|  4|    1.0|
|  5|  3|12.0|  2|    1.0|
|  4|  9|11.0|  8|    1.0|
|  5|  1|11.0|  1|    1.0|
|  4|  9|12.0| 10|    1.0|
|  5|  8|10.0|  9|    1.0|
+---+---+----+---+-------+
only showing top 20 rows



In [None]:
# Many machine learning models produce some sort of coefficient value for each feature involved,
# indicating its 'importance', i.e. its predictive power with respect to the label.
# Tree-method classifiers expose this through a ".featureImportances" attribute on the fitted model.
# So we can create a model, fit it on all the data, and then check which feature (preservative)
# had the most predictive power for the label and is therefore the most likely cause of the spoilage.

In [9]:
# .featureImportances returns something like:
# SparseVector(4, {0: 0.0026, 1: 0.0089, 2: 0.9686, 3: 0.0199}), where the first number is the number of features used.
# The second entry works like a dictionary that maps each feature's index to its importance.
# So here we can see that the most important feature is by far feature index 2.
# The corresponding row of the features column would look something like:
# Row(features=DenseVector([4.0, 2.0, 12.0, 3.0]), Spoiled=1.0)
# We input the features as a dense vector.
# There are many different ways to solve this problem, including just using "pure" statistics instead of a machine learning model.
# In this case we don't care about train/test splits or deployments
# All we care about is the actual relationship between a feature and its label.
# So again what we really want to fundamentally understand is which feature is important for predicting the label.

In [11]:
# The basic idea of our approach is to fit a random forest classifier on the data itself.
data.head(1)
# The first four columns of this row are the preservative percentage amounts A, B, C and D,
# and the last column says whether or not the batch was spoiled.
# We build a random forest classifier that predicts whether a particular batch of dog food
# is spoiled based on its preservative amounts.
# Then we request the feature importances of that model to see whether one preservative stands out.


[Row(A=4, B=2, C=12.0, D=3, Spoiled=1.0)]

In [12]:
from pyspark.ml.feature import VectorAssembler

In [13]:
# List the column names so we know which ones to assemble into the feature vector.
data.columns

['A', 'B', 'C', 'D', 'Spoiled']

In [14]:
# Spark ML expects all inputs in a single vector column, so combine the four
# preservative columns into one column named 'features'.
assembler = VectorAssembler(
    inputCols=['A', 'B', 'C', 'D'],
    outputCol='features',
)

In [15]:
# Apply the assembler: the result keeps the original columns and adds the 'features' vector column.
output = assembler.transform(data)

In [16]:
from pyspark.ml.classification import RandomForestClassifier

In [17]:
# Random forest that predicts the 'Spoiled' label from the assembled 'features' vector.
# The seed is fixed so that the trained forest, and therefore the feature importances
# reported below, are reproducible from one run of the notebook to the next.
rfc = RandomForestClassifier(labelCol='Spoiled', featuresCol='features', seed=42)

In [18]:
# Confirm that the assembled 'features' vector column was added next to the original columns.
output.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)
 |-- features: vector (nullable = true)



In [19]:
# The classifier only needs two columns: the feature vector and the 'Spoiled' label.
final_data = output.select(['features', 'Spoiled'])

In [20]:
# Preview the first 20 rows of the model input (feature vector plus label).
final_data.show()

+-------------------+-------+
|           features|Spoiled|
+-------------------+-------+
| [4.0,2.0,12.0,3.0]|    1.0|
| [5.0,6.0,12.0,7.0]|    1.0|
| [6.0,2.0,13.0,6.0]|    1.0|
| [4.0,2.0,12.0,1.0]|    1.0|
| [4.0,2.0,12.0,3.0]|    1.0|
|[10.0,3.0,13.0,9.0]|    1.0|
| [8.0,5.0,14.0,5.0]|    1.0|
| [5.0,8.0,12.0,8.0]|    1.0|
| [6.0,5.0,12.0,9.0]|    1.0|
| [3.0,3.0,12.0,1.0]|    1.0|
| [9.0,8.0,11.0,3.0]|    1.0|
|[1.0,10.0,12.0,3.0]|    1.0|
|[1.0,5.0,13.0,10.0]|    1.0|
|[2.0,10.0,12.0,6.0]|    1.0|
|[1.0,10.0,11.0,4.0]|    1.0|
| [5.0,3.0,12.0,2.0]|    1.0|
| [4.0,9.0,11.0,8.0]|    1.0|
| [5.0,1.0,11.0,1.0]|    1.0|
|[4.0,9.0,12.0,10.0]|    1.0|
| [5.0,8.0,10.0,9.0]|    1.0|
+-------------------+-------+
only showing top 20 rows



In [21]:
# Train the classifier on the full dataset. There is no train/test split because we only
# want to inspect the feature importances, not evaluate predictions.
rfc_model = rfc.fit(final_data)

In [22]:
# Look at one row to recall the order of the values inside the feature vector (A, B, C, D).
final_data.head(1)

[Row(features=DenseVector([4.0, 2.0, 12.0, 3.0]), Spoiled=1.0)]

In [23]:
# Now that we have a fitted model, we can request its feature importances.
rfc_model.featureImportances
# Index 0 corresponds to A, 1 to B, 2 to C and 3 to D.
# Notice that one of these is by far the most important feature: index 2, chemical C.
# So C is the strongest predictor of spoilage and the prime suspect for the early spoilage.
# Note that feature importance measures predictive power in this model; it is not, on its own, proof of causation.

SparseVector(4, {0: 0.0193, 1: 0.0222, 2: 0.9406, 3: 0.0179})

In [None]:
# All we really wanted to know was which feature is the main driver of whether or not a batch spoils.
