### Import the necessary libraries

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer,VectorAssembler
from pyspark.ml.pipeline import Pipeline

### Instantiate Spark session

In [None]:
# Start (or attach to) the Spark session used throughout this notebook.
spark = (
    SparkSession.builder
    .appName('project_tree')
    .getOrCreate()
)

### Import csv as a dataframe and analyze data

In [None]:
# Load the dog-food data set: the first row supplies the column names and the
# column types are inferred from the values.
data = spark.read.csv(
    'dog_food.csv',
    header=True,
    inferSchema=True,
)

In [None]:
data.show(2)

In [None]:
data.printSchema()

In [None]:
data.describe().show()

### Transform data into the format understood by Spark and build a random forest classifier

In [None]:
# Random forest of 500 trees that reads the assembled 'features' vector and
# the 'label' column.
rfc = RandomForestClassifier(
    numTrees=500,
    labelCol='label',
    featuresCol='features',
)

In [None]:
# Pack the four input columns into the single 'features' vector column that
# the classifiers consume.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=['A', 'B', 'C', 'D'],
)

In [None]:
# Add the 'features' vector and expose the target column 'Spoiled' under the
# name 'label', which the classifier and evaluator cells rely on.
df = assembler.transform(data).withColumnRenamed('Spoiled', 'label')

# Hold out roughly 30% of the rows for testing.  The seed is fixed so that the
# same rows land in the train/test sets each time the notebook is re-run;
# without it every run evaluates on a different random split.
train, test = df.randomSplit([0.7, 0.3], seed=42)

In [None]:
model = rfc.fit(train)

In [None]:
pred = model.transform(test)

In [None]:
acc = MulticlassClassificationEvaluator(metricName = 'accuracy').evaluate(pred)

In [None]:
acc

### Using the `featureImportances` attribute of the fitted classifier, plot the relative importance of the features A, B, C, and D for predicting the output (label)

In [None]:
# `featureImportances` is a property, so `model.featureImportances.__doc__`
# reads the docstring of the vector object the property returns.  Look the
# property up on the model's class to print its own documentation instead.
print(type(model).featureImportances.__doc__)

In [None]:
from matplotlib import pyplot as plt

In [None]:
# Convert the importance vector to a dense array and draw one labelled bar per
# feature.  A line plot would connect unrelated features and label the x-axis
# with positions rather than feature names.
importances = model.featureImportances.toArray()
plt.bar(assembler.getInputCols(), importances)
plt.xlabel('feature')
plt.ylabel('importance')
plt.show()

### For comparison with the initial random forest model, build and assess simple decision tree models for four different values of the tree-depth hyperparameter. Use 5-fold cross-validation to estimate their respective accuracies.

In [None]:
from pyspark.ml.tuning import CrossValidator,ParamGridBuilder

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier

In [None]:
dtc = DecisionTreeClassifier()

In [None]:
paramgrid = ParamGridBuilder().addGrid(dtc.maxDepth,[1,3,5,7]).build()

In [None]:
pipeline = Pipeline(stages=[dtc])

In [None]:
# 5-fold cross-validation over the depth grid, scored by accuracy.
crossval = CrossValidator(
    numFolds=5,
    estimator=pipeline,
    estimatorParamMaps=paramgrid,
    evaluator=MulticlassClassificationEvaluator(metricName='accuracy'),
)

In [None]:
model = crossval.fit(df)

In [None]:
# Accuracy of the decision tree classifier with the depth parameter = [1,3,5,7], respectively.
model.avgMetrics

## Conclusions: 
 1. The chemical "C" is clearly the one causing the problem
 2. For the given data, a simple decision tree classifier with depth = 1 is the most accurate model. That said, the models considered do not differ significantly in accuracy: all accuracies lie in the range 0.96-0.99.