In [1]:
import os

import findspark

# Point findspark at the Spark installation. Prefer the SPARK_HOME environment
# variable so the notebook is not tied to one machine; fall back to the
# original author's local install path when it is unset or empty.
findspark.init(os.environ.get("SPARK_HOME") or "/home/jean/spark-2.4.4-bin-hadoop2.7")

In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import (VectorAssembler, VectorIndexer, OneHotEncoder, StringIndexer)

In [3]:
# Start (or reuse) the Spark session for this notebook, registered under the
# application name 'dog'.
spark = (
    SparkSession.builder
    .appName('dog')
    .getOrCreate()
)

In [4]:
# Load the dataset: the first row supplies the column names and the column
# types are inferred from the values.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("dog_food.csv")
)

In [5]:
# Show each column's name, inferred type and nullability.
df.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)



In [6]:
# List the column names of the loaded DataFrame.
df.columns

['A', 'B', 'C', 'D', 'Spoiled']

In [7]:
# Combine the four input columns into one vector column named "features",
# in the order A, B, C, D (so vector index 2 corresponds to column C).
assembler = VectorAssembler(
    outputCol="features",
    inputCols=['A', 'B', 'C', 'D'],
)

In [8]:
# Append the assembled vector column to `df`. Because this cell reassigns
# `df`, running it a second time would fail: the assembler's output column
# would already exist. Dropping that column first (a no-op when it is absent)
# keeps the cell safe to re-run.
df = assembler.transform(df.drop(assembler.getOutputCol()))

In [9]:
# Confirm the schema now includes the assembled `features` vector column.
df.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)
 |-- features: vector (nullable = true)



In [10]:
# Keep only what the model needs: the feature vector and the "Spoiled" column
# that is used as the label.
final_data = df.select("features", "Spoiled")

In [11]:
from pyspark.ml.classification import RandomForestClassifier

In [12]:
# Random forest of 100 trees that predicts the "Spoiled" label. The explicit
# seed removes the forest's own source of run-to-run variation (bootstrap and
# feature sampling), so the same training data yields the same model.
rf = RandomForestClassifier(numTrees=100, labelCol="Spoiled", seed=42)

In [14]:
# Split the rows roughly 70/30 into training and test sets. Without a seed the
# rows land in different splits on every run, so any number computed from them
# cannot be reproduced; a fixed seed pins the partition.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [15]:
# Train the random forest on the training split only.
rf_fit = rf.fit(train_data)

In [16]:
# Score the held-out test split with the fitted model; `results` is what the
# evaluator consumes below.
results = rf_fit.transform(test_data)

In [17]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [18]:
# Binary-classification evaluator comparing the model's raw prediction column
# with the "Spoiled" label. The raw-prediction column and metric are the
# library defaults, written out so the reported number is unambiguous.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="Spoiled",
    metricName="areaUnderROC",
)

In [19]:
# Compute the evaluator's metric on the scored test rows. With no metricName
# override this is the area under the ROC curve (1.0 = perfect ranking).
evaluator.evaluate(results)

0.9961389961389961

In [20]:
# `featureImportances` is an attribute of the fitted model (not a function): a
# vector with one importance score per input feature, indexed in the order the
# columns were given to the assembler (0=A, 1=B, 2=C, 3=D).
# In the recorded run, index 2 (column C) carries almost all of the importance,
# so C is the variable the model relies on to predict spoilage. This shows a
# strong association with "Spoiled"; on its own it does not prove causation.
rf_fit.featureImportances

SparseVector(4, {0: 0.0235, 1: 0.0307, 2: 0.92, 3: 0.0258})