# Higgs Boson Machine Learning classification

### Importing MLlib libraries

In [None]:
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.mllib.tree.configuration.Algo
import org.apache.spark.mllib.tree.impurity.Entropy
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import sqlContext.implicits._
import org.apache.spark.sql.functions._

### Read data and pre-processing

File descriptions

- training.csv: Training set of 250000 events, with an ID column, 30 feature columns, a weight column and a label column.
 
- test.csv: Test set of 550000 events with an ID column and 30 feature columns. It has no label column, so it is not useful in this example; it is only needed for submission to the Kaggle competition.


For detailed information on the semantics of the features, labels, and weights, see the technical documentation at https://www.kaggle.com/c/higgs-boson/

Some details to get started:

- All variables are floating point, except PRI_jet_num which is integer
- Variables prefixed with PRI (for PRImitives) are “raw” quantities about the bunch collision as measured by the detector.
- Variables prefixed with DER (for DERived) are quantities computed from the primitive features, which were selected by the physicists of ATLAS
- It can happen that for some entries some variables are meaningless or cannot be computed; in this case, their value is −999.0, which is outside the normal range of all variables

In [None]:
// Load the Higgs training CSV as an RDD holding one string per line.
val rawData = sc.textFile("data/Higgs/training.csv")
// Preview the first five lines, then report the total number of lines.
for (line <- rawData.take(5)) println(line)
rawData.count()

In [None]:
// Tokenize every CSV line into an array of its comma-separated fields.
val splitlines = rawData.map(_.split(','))
// Show the first tokenized row.
splitlines.first()

In [None]:
// Discard the header row, recognized by "EventId" in its first field.
val temp = splitlines.filter(fields => fields(0) != "EventId")
// Show the first remaining (data) row.
temp.first()

Drop the EventId and Weight columns, and encode the label (`s` → 1, anything else → 0)

In [None]:
// Convert each parsed row into a LabeledPoint:
//   - label:    1 when the last column equals "s", otherwise 0
//   - features: every column except the first one and the last two,
//               parsed as doubles
val Data = temp.map { fields =>
  val lastIndex = fields.length - 1
  val isPositive = fields(lastIndex) == "s"
  val featureValues = fields.slice(1, lastIndex - 1).map(_.toDouble)
  LabeledPoint(if (isPositive) 1.0 else 0.0, Vectors.dense(featureValues))
}
// Preview the first five labeled points.
for (point <- Data.take(5)) println(point)

### Split the data into training and test sets (40% held out for testing)

In [None]:
// Randomly split the labeled points 60% / 40%. The fixed seed makes the
// partition reproducible across runs.
val splits = Data.randomSplit(Array(0.6, 0.4), seed = 13L)
// The training split is reused by model fitting and by the training-accuracy
// cells, so keep it in memory.
val trainingData = splits(0).cache()
// Cache the held-out split as well: several later actions scan it (preview,
// correct-prediction sum, counts, MulticlassMetrics), and without caching each
// of them would re-read and re-parse the CSV from scratch.
val testData = splits(1).cache()
println("Training Data")
trainingData.take(5).foreach(println)
println("Test Data")
testData.take(5).foreach(println)

### Train a Decision Tree

In [None]:
// Decision-tree settings for the two-class problem.
val numClasses = 2
// Empty map: no feature is declared categorical.
val categoricalFeaturesInfo = Map.empty[Int, Int]
val impurity = "entropy"
val maxDepth = 3
val maxBins = 10
// Fit the classifier on the training split and print the learned tree.
val dtModel = DecisionTree.trainClassifier(
  trainingData,
  numClasses,
  categoricalFeaturesInfo,
  impurity,
  maxDepth,
  maxBins
)
println(dtModel.toDebugString)

In [None]:
// Count the training points whose predicted class equals the true label:
// each point contributes 1 when correct and 0 otherwise, and the
// contributions are summed.
val dtTotalCorrect = trainingData.map { case LabeledPoint(label, features) =>
  if (dtModel.predict(features) == label) 1 else 0
}.sum()

// Print the number of correct predictions next to the training-set size.
println(dtTotalCorrect)
println(trainingData.count())

In [None]:
// Training accuracy: correct predictions divided by the training-set size.
val dtAccuracy = {
  val trainingSize = trainingData.count()
  dtTotalCorrect / trainingSize
}
println(dtAccuracy)

### Test

In [None]:
// Count the held-out test points whose predicted class equals the true label:
// each point contributes 1 when correct and 0 otherwise, and the
// contributions are summed.
val dtTotalCorrect = testData.map { case LabeledPoint(label, features) =>
  if (dtModel.predict(features) == label) 1 else 0
}.sum()
// Print the number of correct predictions next to the test-set size.
println(dtTotalCorrect)
println(testData.count())

In [None]:
// Test accuracy: correct predictions divided by the test-set size.
val dtAccuracy = {
  val testSize = testData.count()
  dtTotalCorrect / testSize
}
println(dtAccuracy)

In [None]:
// Pair each test point's predicted class with its true label.
val predictionAndLabels = testData.map { point =>
  (dtModel.predict(point.features), point.label)
}

// Build the metrics object from the (prediction, label) pairs.
val metrics = new MulticlassMetrics(predictionAndLabels)

// Confusion matrix
println("Confusion matrix:")
println(metrics.confusionMatrix)

// Overall statistics
// NOTE(review): the no-argument precision / recall / fMeasure appear to be
// deprecated in newer Spark releases in favor of `accuracy` — confirm against
// the Spark version this notebook runs on before upgrading.
val precision = metrics.precision
val recall = metrics.recall // same as true positive rate
val f1Score = metrics.fMeasure
println("Summary Statistics")
println(s"Precision = $precision")
println(s"Recall = $recall")
println(s"F1 Score = $f1Score")

// Distinct labels known to the metrics object; used for the per-label reports.
val labels = metrics.labels

// Precision by label
for (l <- labels) println(s"Precision($l) = ${metrics.precision(l)}")

// Recall by label
for (l <- labels) println(s"Recall($l) = ${metrics.recall(l)}")

// False positive rate by label
for (l <- labels) println(s"FPR($l) = ${metrics.falsePositiveRate(l)}")

// F-measure by label
for (l <- labels) println(s"F1-Score($l) = ${metrics.fMeasure(l)}")

// Weighted stats
val weightedPrecision = metrics.weightedPrecision
val weightedRecall = metrics.weightedRecall
val weightedF1 = metrics.weightedFMeasure
val weightedFpr = metrics.weightedFalsePositiveRate
println(s"Weighted precision: $weightedPrecision")
println(s"Weighted recall: $weightedRecall")
println(s"Weighted F1 score: $weightedF1")
println(s"Weighted false positive rate: $weightedFpr")