In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
'''
This dataset describes universities through a set of features, and each
university is labeled as either Private or Public.

We will try several tree-based methods to build a model that predicts,
from those features, whether a university is private or public.

'''

from pyspark.sql import SparkSession

# Reuse an already-running Spark session if there is one; otherwise start
# a new one registered under the application name "tree".
spark = (
    SparkSession.builder
    .appName("tree")
    .getOrCreate()
)

In [3]:
## Lets load the data

import os

# The folder holding the CSV files differs from machine to machine, so it
# can be overridden with the TREE_METHODS_DATA_DIR environment variable.
# When the variable is not set, the original location is used unchanged.
DATA_DIR = os.environ.get(
    "TREE_METHODS_DATA_DIR",
    "/Users/jaskiratsinghp/Desktop/Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Tree_Methods",
)

# header = True takes the column names from the first line of the file;
# inferSchema = True lets Spark work out each column's type from the data.
data = spark.read.csv(os.path.join(DATA_DIR , "College.csv") ,
                      inferSchema = True , header = True)

data.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)



In [4]:
## Lets look at the first row to see what a record contains

data.head(1)

[Row(School='Abilene Christian University', Private='Yes', Apps=1660, Accept=1232, Enroll=721, Top10perc=23, Top25perc=52, F_Undergrad=2885, P_Undergrad=537, Outstate=7440, Room_Board=3300, Books=450, Personal=2200, PhD=70, Terminal=78, S_F_Ratio=18.1, perc_alumni=12, Expend=7041, Grad_Rate=60)]

In [5]:
## Lets list the column names so we can pick the feature columns

data.columns

['School',
 'Private',
 'Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate']

In [6]:
## Lets convert the dataset into the desired format:
## one vector column that holds all of the feature columns.

from pyspark.ml.feature import VectorAssembler

# Every column except "School" and "Private", in the dataset's own order.
featureColumns = [
    'Apps', 'Accept', 'Enroll',
    'Top10perc', 'Top25perc',
    'F_Undergrad', 'P_Undergrad',
    'Outstate', 'Room_Board', 'Books', 'Personal',
    'PhD', 'Terminal', 'S_F_Ratio',
    'perc_alumni', 'Expend', 'Grad_Rate',
]

assembler = VectorAssembler(inputCols = featureColumns , outputCol = "features")

In [7]:
## Lets add the assembled "features" vector column to the data

output = assembler.transform(data)

In [8]:
'''
The label column "Private" is a string column whose values are either
Yes or No. We convert it into a numeric index, stored in a new column
called "PrivateIndex", so that it can be used as the label.

'''
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(inputCol = "Private" , outputCol = "PrivateIndex")

# Learn the string -> index mapping first, then apply it to the same rows.
indexerModel = indexer.fit(output)
outputFixed = indexerModel.transform(output)

In [9]:
## Lets confirm that "features" and "PrivateIndex" are now in the schema

outputFixed.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- PrivateIndex: double (nullable = false)



In [10]:
## Lets keep only the feature vector and the numeric label

finalData = outputFixed.select("features" , "PrivateIndex")

In [11]:
## Lets split the dataset into train and test dataset.
## The seed is fixed so that re-running the notebook on the same data
## produces the same split, and therefore comparable metrics further down.

train_data , test_data = finalData.randomSplit([0.7 , 0.3] , seed = 42)

In [12]:
## Lets import the classifiers

from pyspark.ml.classification import (DecisionTreeClassifier ,
                                       GBTClassifier ,
                                       RandomForestClassifier)

# The regression counterparts (e.g. RandomForestRegressor) are imported
# from pyspark.ml.regression instead.

from pyspark.ml import Pipeline

# One explicit seed shared by all three learners, so that the models (and
# the scores computed from them) can be reproduced on a re-run.
modelSeed = 42

# All three read the assembled "features" vector and the indexed label.
decisionTreeClassifier = DecisionTreeClassifier(featuresCol = "features" ,
                                                labelCol = "PrivateIndex" ,
                                                seed = modelSeed)

randomForestClassifier = RandomForestClassifier(featuresCol = "features" ,
                                                labelCol = "PrivateIndex" ,
                                                seed = modelSeed)

gradientBoostingClassifier = GBTClassifier(featuresCol = "features" ,
                                           labelCol = "PrivateIndex" ,
                                           seed = modelSeed)

In [13]:
## Lets fit the models: each classifier is trained on the same training
## data, in the order decision tree, random forest, gradient boosting.

(decisionTreeModel ,
 randomForestModel ,
 gradientBoostingModel) = [classifier.fit(train_data)
                           for classifier in (decisionTreeClassifier ,
                                              randomForestClassifier ,
                                              gradientBoostingClassifier)]

In [14]:
## Lets do some predictions on test data: every fitted model scores the
## same test rows, so the results can be compared with each other.

(decisionTreePredictions ,
 randomForestPredictions ,
 gradientBoostingPredictions) = [fittedModel.transform(test_data)
                                 for fittedModel in (decisionTreeModel ,
                                                     randomForestModel ,
                                                     gradientBoostingModel)]

In [15]:
## Lets set up the evaluation metric

from pyspark.ml.evaluation import BinaryClassificationEvaluator

# The metric is spelled out for the reader: "areaUnderROC" is what this
# evaluator reports when no metricName is given, so scores are unchanged.
myBinaryEval = BinaryClassificationEvaluator(labelCol = "PrivateIndex" ,
                                             metricName = "areaUnderROC")

In [16]:
## myBinaryEval reports the area under the ROC curve (the default metric of
## BinaryClassificationEvaluator), not accuracy, so the values are labeled
## as AUC. Accuracy is computed separately with a multiclass evaluator.

print("Decision Tree AUC: " , myBinaryEval.evaluate(decisionTreePredictions))

print("\nRandom Forest AUC: " , myBinaryEval.evaluate(randomForestPredictions))

print("\nGradient Boosting AUC: " , myBinaryEval.evaluate(gradientBoostingPredictions))

Decision Tree Accuracy:  0.9250021916367142

Random Forest Accuracy:  0.9832558955027619

Gradient Boosting Accuracy:  0.948978697291137


In [17]:
'''
BinaryClassificationEvaluator only reports the area under a curve
(areaUnderROC or areaUnderPR). You can NOT grab Accuracy, Precision or
Recall from it DIRECTLY.

For those, use MulticlassClassificationEvaluator and choose the metric
with metricName (for example "accuracy", "weightedPrecision",
"weightedRecall" or "f1"). It works with a two-class label as well, which
is how it is used here.

'''

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

accEval = MulticlassClassificationEvaluator(labelCol = "PrivateIndex" ,
                                            metricName = "accuracy")


In [18]:
## Lets compute the accuracy of the random forest predictions on the test data

randomForestAccuracy = accEval.evaluate(randomForestPredictions)

randomForestAccuracy

0.9395161290322581

# Random Forest - Classification Consulting Project

In [22]:
'''
There is a Dog Food Company in St. Louis, Missouri.

* The company wants to understand why some batches of its dog food
  are spoiling much more quickly than intended.

* Unfortunately, the company has not upgraded to the latest machinery,
  so the amounts of the 5 chemicals in the preservative mix can vary
  a lot from batch to batch. OUR JOB IS TO FIND OUT WHICH CHEMICAL HAS
  THE STRONGEST EFFECT.

* The company first mixes up a batch of preservative that contains
  4 different preservative chemicals (A, B, C, D), and the batch is
  then completed with a "filler" chemical.

* The food scientists believe one of the A, B, C or D preservatives
  is causing the problem, but they need your help to figure out
  which one.

* Use Machine Learning with a Random Forest (RF) to find out which
  parameter has the most predictive power, and thereby which chemical
  causes the early spoiling.

* So: create a model, and then work out how to decide from it which
  chemical is the problem.


Features in the dataset:

1.) Pres_A: Percentage of preservative A in the mix (column "A").
2.) Pres_B: Percentage of preservative B in the mix (column "B").
3.) Pres_C: Percentage of preservative C in the mix (column "C").
4.) Pres_D: Percentage of preservative D in the mix (column "D").
5.) Spoiled: Label indicating whether or not the Dog Food batch was spoiled.


THERE ARE MANY DIFFERENT WAYS TO SOLVE THIS PROBLEM, BUT WE WILL SOLVE
IT USING STATISTICS.

$ This won't be a typical workflow: we won't be working with train_data
  and test_data. Instead we will feed the model the complete dataset and
  use its feature-importance functionality to find out which feature
  is causing the food to get spoiled.

'''

from pyspark.sql import SparkSession

# getOrCreate() hands back the session that is already running when one
# exists; a new session named "RF_consult" is only started otherwise.
spark = (
    SparkSession.builder
    .appName("RF_consult")
    .getOrCreate()
)

In [23]:
## Lets read the data

import os

# The folder holding the CSV files differs from machine to machine, so it
# can be overridden with the TREE_METHODS_DATA_DIR environment variable.
# When the variable is not set, the original location is used unchanged.
DATA_DIR = os.environ.get(
    "TREE_METHODS_DATA_DIR",
    "/Users/jaskiratsinghp/Desktop/Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Tree_Methods",
)

# header = True takes the column names from the first line of the file;
# inferSchema = True lets Spark work out each column's type from the data.
data = spark.read.csv(os.path.join(DATA_DIR , "dog_food.csv") ,
                      inferSchema = True , header = True)

data.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)



In [24]:
## Lets preview the first rows of the data

data.show()

+---+---+----+---+-------+
|  A|  B|   C|  D|Spoiled|
+---+---+----+---+-------+
|  4|  2|12.0|  3|    1.0|
|  5|  6|12.0|  7|    1.0|
|  6|  2|13.0|  6|    1.0|
|  4|  2|12.0|  1|    1.0|
|  4|  2|12.0|  3|    1.0|
| 10|  3|13.0|  9|    1.0|
|  8|  5|14.0|  5|    1.0|
|  5|  8|12.0|  8|    1.0|
|  6|  5|12.0|  9|    1.0|
|  3|  3|12.0|  1|    1.0|
|  9|  8|11.0|  3|    1.0|
|  1| 10|12.0|  3|    1.0|
|  1|  5|13.0| 10|    1.0|
|  2| 10|12.0|  6|    1.0|
|  1| 10|11.0|  4|    1.0|
|  5|  3|12.0|  2|    1.0|
|  4|  9|11.0|  8|    1.0|
|  5|  1|11.0|  1|    1.0|
|  4|  9|12.0| 10|    1.0|
|  5|  8|10.0|  9|    1.0|
+---+---+----+---+-------+
only showing top 20 rows



In [28]:
## Lets look at a single record

data.head(1)

[Row(A=4, B=2, C=12.0, D=3, Spoiled=1.0)]

In [29]:
## Lets list the column names to choose the feature columns from

data.columns

['A', 'B', 'C', 'D', 'Spoiled']

In [30]:
## Lets create an assembler that packs the four preservative columns
## into a single "features" vector column, and apply it to the data.

from pyspark.ml.feature import VectorAssembler

# The order matters later: feature index i refers to the i-th name here.
preservativeColumns = ["A" , "B" , "C" , "D"]

assembler = VectorAssembler(
    inputCols = preservativeColumns ,
    outputCol = "features" ,
)

output = assembler.transform(data)

In [31]:
from pyspark.ml.classification import RandomForestClassifier

# The feature importances of this forest are the whole result of the
# analysis, so the seed is fixed to make them reproducible on a re-run.
randomForestClassifier = RandomForestClassifier(featuresCol = "features" ,
                                                labelCol = "Spoiled" ,
                                                seed = 42)

In [32]:
## Lets confirm that the "features" vector column is now in the schema

output.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)
 |-- features: vector (nullable = true)



In [33]:
## Lets keep only the feature vector and the label

finalData = output.select("features" , "Spoiled")

In [35]:
## Lets preview the data that will be fed to the model

finalData.show()

+-------------------+-------+
|           features|Spoiled|
+-------------------+-------+
| [4.0,2.0,12.0,3.0]|    1.0|
| [5.0,6.0,12.0,7.0]|    1.0|
| [6.0,2.0,13.0,6.0]|    1.0|
| [4.0,2.0,12.0,1.0]|    1.0|
| [4.0,2.0,12.0,3.0]|    1.0|
|[10.0,3.0,13.0,9.0]|    1.0|
| [8.0,5.0,14.0,5.0]|    1.0|
| [5.0,8.0,12.0,8.0]|    1.0|
| [6.0,5.0,12.0,9.0]|    1.0|
| [3.0,3.0,12.0,1.0]|    1.0|
| [9.0,8.0,11.0,3.0]|    1.0|
|[1.0,10.0,12.0,3.0]|    1.0|
|[1.0,5.0,13.0,10.0]|    1.0|
|[2.0,10.0,12.0,6.0]|    1.0|
|[1.0,10.0,11.0,4.0]|    1.0|
| [5.0,3.0,12.0,2.0]|    1.0|
| [4.0,9.0,11.0,8.0]|    1.0|
| [5.0,1.0,11.0,1.0]|    1.0|
|[4.0,9.0,12.0,10.0]|    1.0|
| [5.0,8.0,10.0,9.0]|    1.0|
+-------------------+-------+
only showing top 20 rows



In [36]:
## Lets fit the model on the complete dataset (there is no train/test split
## here, because we only want the feature importances, not predictions)

randomForestModel = randomForestClassifier.fit(finalData)

In [37]:
## Lets read the importance of each feature. The indices follow the order
## of the assembler's inputCols: 0 -> A, 1 -> B, 2 -> C, 3 -> D.

randomForestModel.featureImportances

SparseVector(4, {0: 0.0177, 1: 0.0199, 2: 0.9481, 3: 0.0142})

##  Outcome:
### Feature index 2 (preservative C) carries almost all of the feature importance (about 0.95), so we can see that Preservative C is the one causing the spoiling of the food.