In [None]:
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.mllib.tree.model.DecisionTreeModel
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics

In [None]:
// Read the CSV file as an RDD of raw text lines (RDD[String]).
val input = sc.textFile("/asnumbers.csv")
// MLlib needs LabeledPoint records rather than raw strings: split every line
// on commas, use the first 12 fields as the dense feature vector and the field
// at index 13 as the label.
// NOTE(review): the field at index 12 is neither a feature nor the label --
// confirm that skipping it is intentional.
val data = input.map(_.split(',')).map { fields =>
    val features = Vectors.dense(fields.take(12).map(_.toDouble))
    LabeledPoint(fields(13).toDouble, features)
}.cache()

In [None]:
// Randomly partition the data: weight 0.7 for training, weight 0.3 held out for testing.
val splits = data.randomSplit(Array(0.7, 0.3))
val trainingData = splits(0)
val testData = splits(1)

// Hyperparameters for the DecisionTree classifier.
val numClasses: Int = 2
// An empty map declares no categorical features, so every feature is treated as continuous.
val categoricalFeaturesInfo: Map[Int, Int] = Map.empty[Int, Int]
val impurity: String = "gini"
val maxDepth: Int = 5
val maxBins: Int = 50

In [None]:
// Fit the decision-tree classifier on the training split. The class count is
// taken from numClasses (previously a repeated literal 2) so that all
// hyperparameters are controlled from their declarations.
val model = DecisionTree.trainClassifier(
    trainingData, numClasses, categoricalFeaturesInfo, impurity, maxDepth, maxBins)

In [None]:
// Score every held-out point with the trained model, pairing the true label
// with the predicted one as (label, prediction).
val labelAndPreds = testData.map(point => (point.label, model.predict(point.features)))

In [None]:
// Test error = fraction of held-out points whose prediction differs from the label.
val testErr = {
    val misclassified = labelAndPreds.filter { case (label, predicted) => label != predicted }.count()
    misclassified.toDouble / testData.count()
}
println(s"Test Error = $testErr")
// Dump the learned tree in its human-readable debug form.
println(s"Learned classification tree model:\n${model.toDebugString}")

In [None]:
// BinaryClassificationMetrics expects (score, label) pairs, whereas
// labelAndPreds holds (label, prediction). Swap each pair so the model's
// prediction is used as the score and the true label as the ground truth;
// without the swap every curve below is computed on inverted inputs.
val metrics = new BinaryClassificationMetrics(labelAndPreds.map(_.swap))
// ROC curve points and the area under the ROC curve.
val roc = metrics.roc
val auROC = metrics.areaUnderROC

In [None]:
// Precision-recall curve and the area underneath it.
val PR = metrics.pr()
val auPR = metrics.areaUnderPR()

In [None]:
// Precision per threshold; collect() brings every pair back to the driver for display.
val precision = metrics.precisionByThreshold()
precision.collect()

In [None]:
// Recall per threshold; only the first five pairs are fetched for inspection.
val recall = metrics.recallByThreshold()
recall.take(5)

In [None]:
// F-measure per threshold; only the first five pairs are fetched for inspection.
val f1Score = metrics.fMeasureByThreshold()
f1Score.take(5)

In [None]:
// Persist the trained tree, then read it back from the same location.
val modelPath = "/models/binaryclassificationmodel"
model.save(sc, modelPath)
val sameModel = DecisionTreeModel.load(sc, modelPath)