In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://mirrors.sonic.net/apache/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz
!tar xzf spark-3.1.2-bin-hadoop3.2.tgz
!pip install -q findspark


import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop3.2"


import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

Load the data:

In [None]:
# Read the CSV, treating the first row as a header and inferring column types.
df = spark.read.csv(
    "drive/MyDrive/Colab Notebooks/dog_food.csv",
    header=True,
    inferSchema=True,
)
df

DataFrame[A: int, B: int, C: double, D: int, Spoiled: double]

Analyze each column:

In [None]:
df.printSchema() # Every column is numeric (no string values), so no StringIndexer step is needed.

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)



In [None]:
# List the column names available for building the feature vector.
df.columns

['A', 'B', 'C', 'D', 'Spoiled']

Create features column:

In [None]:
from pyspark.ml.feature import VectorAssembler, StringIndexer

In [None]:
# Combine the four input columns into a single 'features' vector column.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=['A', 'B', 'C', 'D'],
)
final_data = assembler.transform(df)
final_data.toPandas()

Unnamed: 0,A,B,C,D,Spoiled,features
0,4,2,12.0,3,1.0,"[4.0, 2.0, 12.0, 3.0]"
1,5,6,12.0,7,1.0,"[5.0, 6.0, 12.0, 7.0]"
2,6,2,13.0,6,1.0,"[6.0, 2.0, 13.0, 6.0]"
3,4,2,12.0,1,1.0,"[4.0, 2.0, 12.0, 1.0]"
4,4,2,12.0,3,1.0,"[4.0, 2.0, 12.0, 3.0]"
...,...,...,...,...,...,...
485,8,3,6.0,6,0.0,"[8.0, 3.0, 6.0, 6.0]"
486,6,4,9.0,10,0.0,"[6.0, 4.0, 9.0, 10.0]"
487,1,3,8.0,3,0.0,"[1.0, 3.0, 8.0, 3.0]"
488,6,6,8.0,3,0.0,"[6.0, 6.0, 8.0, 3.0]"


In [None]:
from pyspark.ml.classification import (RandomForestClassifier)

In [None]:
# 70/30 train/test split with a fixed seed.
train, test = final_data.randomSplit(weights=[0.7, 0.3], seed=42)

Create our classifier:

In [None]:
# Random forest of 100 trees predicting the 'Spoiled' label from the default
# 'features' column. The seed is fixed so the trained forest (and therefore the
# reported metrics and feature importances) is reproducible across kernels.
rfc = RandomForestClassifier(numTrees=100, labelCol='Spoiled', seed=42)

Train and predict:

In [None]:
# Fit the classifier on the training split, then score the held-out test split.
rfc_model = rfc.fit(train)
predictions = rfc_model.transform(test)

Check the predictions:

In [None]:
# Convert the scored test rows to pandas for a readable table.
predictions.toPandas()

Unnamed: 0,A,B,C,D,Spoiled,features,rawPrediction,probability,prediction
0,1,1,12.0,4,1.0,"[1.0, 1.0, 12.0, 4.0]","[2.349140802177153, 97.65085919782285]","[0.02349140802177153, 0.9765085919782286]",1.0
1,1,3,8.0,3,0.0,"[1.0, 3.0, 8.0, 3.0]","[97.83426096457283, 2.1657390354271744]","[0.9783426096457282, 0.021657390354271742]",0.0
2,1,3,9.0,8,0.0,"[1.0, 3.0, 9.0, 8.0]","[96.44343979761874, 3.556560202381255]","[0.9644343979761875, 0.03556560202381256]",0.0
3,1,4,8.0,1,0.0,"[1.0, 4.0, 8.0, 1.0]","[97.76110680801415, 2.2388931919858694]","[0.9776110680801413, 0.02238893191985869]",0.0
4,1,4,9.0,6,0.0,"[1.0, 4.0, 9.0, 6.0]","[97.3707336127964, 2.629266387203602]","[0.973707336127964, 0.02629266387203602]",0.0
...,...,...,...,...,...,...,...,...,...
120,10,9,8.0,6,0.0,"[10.0, 9.0, 8.0, 6.0]","[98.67553516846924, 1.3244648315307936]","[0.9867553516846921, 0.013244648315307932]",0.0
121,10,9,9.0,1,0.0,"[10.0, 9.0, 9.0, 1.0]","[98.68758166240269, 1.3124183375973466]","[0.9868758166240265, 0.01312418337597346]",0.0
122,10,9,11.0,9,1.0,"[10.0, 9.0, 11.0, 9.0]","[2.5479036598880302, 97.45209634011196]","[0.025479036598880303, 0.9745209634011196]",1.0
123,10,10,7.0,4,0.0,"[10.0, 10.0, 7.0, 4.0]","[97.8090439765592, 2.1909560234408256]","[0.9780904397655918, 0.021909560234408255]",0.0


Evaluate our classifier:

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Score with the model's per-class raw scores rather than the hard 0/1
# 'prediction' column: hard labels collapse the ROC curve to a single
# threshold and throw away the ranking information the metric relies on.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='Spoiled',
    metricName='areaUnderROC',
)

In [None]:
# Compute the evaluator's metric on the test-set predictions.
evaluator.evaluate(predictions)

0.9944444444444445

## ¿Cuál es el conservante que más efecto tiene?

Para ello, empleamos el atributo `featureImportances` del modelo entrenado. Este atributo nos muestra la importancia de cada 'feature' respecto a la predicción. Podemos ver, sin ninguna duda, que la columna con índice 2 (C) es la que más importancia tiene con muchísima diferencia.

Por tanto, podemos concluir que el conservante C es el que más impacto tiene en que se estropee la comida para perros.

In [None]:
# Importance of each feature, indexed in the order the columns were assembled (A, B, C, D).
rfc_model.featureImportances

SparseVector(4, {0: 0.0186, 1: 0.0246, 2: 0.9312, 3: 0.0256})

### BONUS TRACK

Let's do it with a pipeline.

In [None]:
# First, import the Pipeline class.
from pyspark.ml import Pipeline

Create the stages that make up our pipeline:

In [None]:
# Pipeline stage 1: assemble the input columns into a 'features' vector.
pip_assem = VectorAssembler(
    outputCol='features',
    inputCols=['A', 'B', 'C', 'D'],
)

In [None]:
# Pipeline stage 2: random forest of 100 trees predicting 'Spoiled'. The seed
# is fixed so that re-running the notebook trains the same forest.
pip_rfc = RandomForestClassifier(numTrees=100, labelCol='Spoiled', seed=42)

In [None]:
# Chain the assembler and the classifier so they are fitted and applied together.
pipeline = Pipeline(stages = [pip_assem, pip_rfc])

In [None]:
# Split the raw DataFrame 70/30 with a fixed seed; the pipeline builds the features itself.
pip_train, pip_test = df.randomSplit(weights=[0.7, 0.3], seed=42)

In [None]:
# Fit every pipeline stage on the training split, then run the fitted pipeline on the test split.
model = pipeline.fit(pip_train)
pip_predic = model.transform(pip_test)

In [None]:
# Convert the pipeline's scored test rows to pandas for a readable table.
pip_predic.toPandas()

Unnamed: 0,A,B,C,D,Spoiled,features,rawPrediction,probability,prediction
0,1,1,12.0,4,1.0,"[1.0, 1.0, 12.0, 4.0]","[2.349140802177153, 97.65085919782285]","[0.02349140802177153, 0.9765085919782286]",1.0
1,1,3,8.0,3,0.0,"[1.0, 3.0, 8.0, 3.0]","[97.83426096457283, 2.1657390354271744]","[0.9783426096457282, 0.021657390354271742]",0.0
2,1,3,9.0,8,0.0,"[1.0, 3.0, 9.0, 8.0]","[96.44343979761874, 3.556560202381255]","[0.9644343979761875, 0.03556560202381256]",0.0
3,1,4,8.0,1,0.0,"[1.0, 4.0, 8.0, 1.0]","[97.76110680801415, 2.2388931919858694]","[0.9776110680801413, 0.02238893191985869]",0.0
4,1,4,9.0,6,0.0,"[1.0, 4.0, 9.0, 6.0]","[97.3707336127964, 2.629266387203602]","[0.973707336127964, 0.02629266387203602]",0.0
...,...,...,...,...,...,...,...,...,...
120,10,9,8.0,6,0.0,"[10.0, 9.0, 8.0, 6.0]","[98.67553516846924, 1.3244648315307936]","[0.9867553516846921, 0.013244648315307932]",0.0
121,10,9,9.0,1,0.0,"[10.0, 9.0, 9.0, 1.0]","[98.68758166240269, 1.3124183375973466]","[0.9868758166240265, 0.01312418337597346]",0.0
122,10,9,11.0,9,1.0,"[10.0, 9.0, 11.0, 9.0]","[2.5479036598880302, 97.45209634011196]","[0.025479036598880303, 0.9745209634011196]",1.0
123,10,10,7.0,4,0.0,"[10.0, 10.0, 7.0, 4.0]","[97.8090439765592, 2.1909560234408256]","[0.9780904397655918, 0.021909560234408255]",0.0


Evaluate our model (the evaluator reports the area under the ROC curve, not accuracy):

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [None]:
# Score with the model's per-class raw scores rather than the hard 0/1
# 'prediction' column: hard labels collapse the ROC curve to a single
# threshold and throw away the ranking information the metric relies on.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='Spoiled',
    metricName='areaUnderROC',
)
# Keep only the columns needed for evaluation; 'rawPrediction' must be kept
# because the evaluator reads it.
pip_predic = pip_predic.select('Spoiled', 'rawPrediction', 'prediction')

In [None]:
# Compute the evaluator's metric on the pipeline's test-set predictions.
evaluator.evaluate(pip_predic)

0.9944444444444445