# Week 4 - Evaluation of Machine Learning Models

# Hands-on: Evaluation of Decision Tree in Spark

In [1]:
from pyspark.sql import SQLContext
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

In [2]:
# Wrap the SparkContext in a SQLContext so we can read DataFrames.
# NOTE(review): `sc` is not created in this notebook; presumably the PySpark
# kernel/shell provides it -- confirm before running elsewhere.
sqlContext = SQLContext(sc)

# Load the saved predictions through the spark-csv reader. The options are
# passed as strings: treat the first row as a header and infer column types.
predictions = (sqlContext.read
               .format('com.databricks.spark.csv')
               .options(header='true', inferSchema='true')
               .load('file:///home/cloudera/Downloads/big-data-4/predictions.csv'))

### Compute accuracy
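Let's create an instance of `MulticlassClassificationEvaluator` to determine the accuracy of the predictions. Here the `precision` metric is the overall precision across all classes, which for single-label predictions is the same as accuracy (the fraction of rows where the prediction matches the label).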

In [3]:
# Evaluator that scores the 'prediction' column against the 'label' column.
# The 'precision' metric is what this notebook reports as accuracy.
# NOTE(review): newer Spark releases appear to expose this metric as
# metricName='accuracy' instead -- confirm against the Spark version in use.
evaluator = MulticlassClassificationEvaluator(
    predictionCol='prediction',
    labelCol='label',
    metricName='precision')

In [4]:
# Run the evaluator over the predictions DataFrame; the result is a single score.
accuracy = evaluator.evaluate(predictions)

# %g prints the score compactly (at most six significant digits).
print('Accuracy = %g ' % accuracy)

Accuracy = 0.795122 


### Display confusion matrix
The `MulticlassMetrics` class can be used to generate a confusion matrix for our classifier model. However, unlike `MulticlassClassificationEvaluator`, `MulticlassMetrics` works with RDDs of numeric `(prediction, label)` pairs rather than DataFrames, so we need to convert our predictions DataFrame into an RDD.

In [5]:
# Peek at the first two elements of the DataFrame's underlying RDD; each one is a Row.
predictions.rdd.take(2)

[Row(prediction=1.0, label=1.0), Row(prediction=0.0, label=1.0)]

In [6]:
# map(tuple) turns each Row into a plain tuple of its values, in column order.
predictions.rdd.map(tuple).take(2)

[(1.0, 1.0), (0.0, 1.0)]

In [7]:
# MulticlassMetrics consumes an RDD of (prediction, label) pairs. Select the two
# columns by name so the pair order is explicit and does not depend on the
# column order (or extra columns) of the loaded CSV.
metrics = MulticlassMetrics(predictions.select('prediction', 'label').rdd.map(tuple))

The `confusionMatrix()` function returns a Spark Matrix, which we can convert to a NumPy array to view:

In [8]:
# Convert the Spark Matrix returned by confusionMatrix() into a NumPy array for display.
# NOTE(review): Spark documents rows as actual labels and columns as predicted
# labels -- confirm the orientation for the Spark version in use.
metrics.confusionMatrix().toArray()

array([[ 85.,  23.],
       [ 19.,  78.]])