# Spark & Python: MLlib Decision Trees

In [3]:
import pyspark
from pyspark import SparkContext
import urllib
from pyspark.mllib.regression import LabeledPoint
from numpy import array
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.sql import SQLContext
from time import time
#  Custom imports
import MySQLConnection


## Getting the training data and creating the RDD

In [6]:
# NOTE(review): `sc` is not created in the cells visible here; this assumes the
# kernel (e.g. a notebook launched through pyspark) already provides a SparkContext.
sqlContext = SQLContext(sc)

# Credentials are read from a file in this format:
#   {"user":"yourusername","password":"yourpassword"}
# NOTE(review): this is an absolute, user-specific path -- adjust it for your machine.
connectionProperties = MySQLConnection.getDBConnectionProps('/home/erik/mysql_credentials.txt')

# Get training data from the SensorTrainingReadings table of the biosensor
# database, keeping only the columns selected below.
data = sqlContext.read.jdbc(
    "jdbc:mysql://localhost/biosensor",
    "SensorTrainingReadings",
    properties=connectionProperties
).selectExpr("deviceID", "metricTypeID", "uomID", "positionID", "actualPitch", "actualYaw")

# Single-argument print(...) produces the same output on Python 2 and Python 3,
# and matches the print(...) form used by the later cells of this notebook.
print("Train data size is {}".format(data.count()))

Train data size is 505


## Split training data into training set and test set


In [8]:
# Split data into training and test datasets (weights 0.7 / 0.3; the resulting
# sizes are approximate because rows are sampled randomly).
# A fixed seed makes the sampling repeatable, so re-running the notebook does not
# silently change the training set and the test error reported further down.
SPLIT_SEED = 42
(trainingDataTable, testDataTable) = data.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

# Preview the first rows of each split.
trainingDataTable.show()
testDataTable.show()


+--------------------+------------+-----+----------+-----------+---------+
|            deviceID|metricTypeID|uomID|positionID|actualPitch|actualYaw|
+--------------------+------------+-----+----------+-----------+---------+
|5d681c54e66ff4a56...|           6|    4|         0|   -14.8309|     18.0|
|5d681c54e66ff4a56...|           6|    4|         0|   -14.6938|     18.0|
|5d681c54e66ff4a56...|           6|    4|         0|   -14.3637|     18.0|
|5d681c54e66ff4a56...|           6|    4|         0|   -14.2678|     18.0|
|5d681c54e66ff4a56...|           6|    4|         0|   -13.8651|     18.0|
|5d681c54e66ff4a56...|           6|    4|         0|   -13.8577|     18.0|
|5d681c54e66ff4a56...|           6|    4|         0|   -13.8351|     18.0|
|5d681c54e66ff4a56...|           6|    4|         0|   -13.4711|     18.0|
|5d681c54e66ff4a56...|           6|    4|         0|   -13.4314|     18.0|
|5d681c54e66ff4a56...|           6|    4|         0|   -13.3352|     18.0|
|5d681c54e66ff4a56...|   

## Create an RDD of LabeledPoints
The featurize method returns a LabeledPoint containing the label and a vector of features.  

An example for a reading from the stooped position would be:
*  0, [-40,15]

In [19]:
# The model requires LabeledPoints: a row with a label and a vector of features.
def featurize(t):
    """Convert one sensor reading into a LabeledPoint.

    t -- a row exposing the attributes positionID, actualPitch and actualYaw.

    Returns a LabeledPoint whose label is t.positionID and whose features are
    [t.actualPitch, t.actualYaw], in that order.
    """
    return LabeledPoint(t.positionID, [t.actualPitch, t.actualYaw])

# Map over the underlying RDD explicitly: DataFrame.map only exists in Spark 1.x
# (where it simply delegates to .rdd.map), while .rdd.map also works on Spark 2.x.
trainingData = trainingDataTable.rdd.map(featurize)

## Training the model
For this example we are choosing a Random Forest model, which combines the predictions of multiple decision trees.  In this case, since we know there will only be 3 distinct "label" values, numClasses = 3. 

In [24]:
# Train the classifier / build the model.
# A fixed seed makes the forest's internal randomness repeatable across runs.
FOREST_SEED = 42

startTime = time()

# Random Forest model: 6 trees, 3 possible labels, no feature declared categorical.
model = RandomForest.trainClassifier(
    trainingData,
    numClasses=3,
    categoricalFeaturesInfo={},
    numTrees=6,
    featureSubsetStrategy="auto",
    impurity='gini',
    maxDepth=4,
    maxBins=32,
    seed=FOREST_SEED
)

elapsedTime = time() - startTime

# Single-argument print(...) produces the same output on Python 2 and Python 3.
print("Classifier trained in {} seconds".format(round(elapsedTime, 3)))

# Save the model for use in evaluating readings.
# NOTE(review): save() presumably fails when the target path already exists --
# confirm, and clear the directory before re-running this cell if so.
model.save(sc, "models/IoTBackBraceRandomForest.model")

Classifier trained in 1.045 seconds


## Evaluating the accuracy of the model
Since we use 70% of the training data for actually training the model, we have the remaining 30% that we can use as a test dataset.  Since these values are still known, we can see if the model does a good job of classifying. 

In [34]:
# Evaluate model on test instances and compute test error.
# .rdd.map is used instead of DataFrame.map so the cell also runs on Spark 2.x.
testData = testDataTable.rdd.map(featurize)

# Predict on the feature vectors only, then pair each true label with its prediction.
predictions = model.predict(testData.map(lambda lp: lp.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)

# Index into the (label, prediction) pair rather than unpacking it in the lambda
# signature: `lambda (v, p): ...` is Python 2-only syntax.
misclassified = labelsAndPredictions.filter(lambda pair: pair[0] != pair[1]).count()

# float(...) keeps the division from truncating to an integer on Python 2.
testErr = misclassified / float(testData.count())
print('Test Error = ' + str(testErr))

Test Error = 0.0208333333333


Another handy feature is that you can view the model's decision logic, tree by tree, by using the "toDebugString()" method.

In [33]:
# Print a heading followed by the textual dump of the trained ensemble
# (one If/Else rule listing per tree).
heading = 'Random Forest Classifcation Model:'
forestDescription = model.toDebugString()
print(heading)
print(forestDescription)

Random Forest Classifcation Model:
TreeEnsembleModel classifier with 6 trees

  Tree 0:
    If (feature 0 <= -14.6938)
     If (feature 0 <= -26.1529)
      Predict: 2.0
     Else (feature 0 > -26.1529)
      If (feature 0 <= -24.0257)
       Predict: 1.0
      Else (feature 0 > -24.0257)
       Predict: 1.0
    Else (feature 0 > -14.6938)
     Predict: 0.0
  Tree 1:
    If (feature 0 <= -14.6938)
     If (feature 0 <= -26.1529)
      Predict: 2.0
     Else (feature 0 > -26.1529)
      If (feature 0 <= -24.0257)
       Predict: 1.0
      Else (feature 0 > -24.0257)
       If (feature 0 <= -19.0393)
        Predict: 1.0
       Else (feature 0 > -19.0393)
        Predict: 1.0
    Else (feature 0 > -14.6938)
     Predict: 0.0
  Tree 2:
    If (feature 0 <= -14.6938)
     If (feature 0 <= -24.0257)
      If (feature 0 <= -26.1529)
       Predict: 2.0
      Else (feature 0 > -26.1529)
       Predict: 2.0
     Else (feature 0 > -24.0257)
      If (feature 0 <= -19.0393)
       Predict: 1.0
 

## Using the model for analysis of raw data
Once the model is saved, it can be loaded again in any script by referring to the path where it was saved.

In [26]:
# Reload the forest from the same path it was saved to by the training cell.
loadedModel = RandomForestModel.load(
    sc,
    "models/IoTBackBraceRandomForest.model"
)

The example below passes pitch values to the model, ranging from -50 degrees (stooped) up to, but not including, +10 degrees (standing).

In [29]:
# Human-readable names for the three class labels. Built once, outside the
# loop, because it does not change between iterations.
positions = {
    0: "upright",
    1: "back bent",
    2: "stooped"
}

# featurize() trains the model on two features, [actualPitch, actualYaw], so a
# full two-element vector is passed here. A one-element vector only works as
# long as no visited tree node splits on feature 1 (yaw). 18.0 is the yaw
# value of the training rows displayed earlier in this notebook.
REFERENCE_YAW = 18.0

# Sweep the pitch over the integers -50 .. 9 (range() excludes its upper bound).
for i in range(-50, 10):
    prediction = loadedModel.predict([i, REFERENCE_YAW])
    # The prediction appears to come back as a float (the tree dump shows
    # "Predict: 2.0"); a float such as 2.0 still matches the integer key 2.
    print(str(i) + " => " + str(positions[prediction]))

-50 => stooped
-49 => stooped
-48 => stooped
-47 => stooped
-46 => stooped
-45 => stooped
-44 => stooped
-43 => stooped
-42 => stooped
-41 => stooped
-40 => stooped
-39 => stooped
-38 => stooped
-37 => stooped
-36 => stooped
-35 => stooped
-34 => stooped
-33 => stooped
-32 => stooped
-31 => stooped
-30 => stooped
-29 => stooped
-28 => stooped
-27 => stooped
-26 => back bent
-25 => back bent
-24 => back bent
-23 => back bent
-22 => back bent
-21 => back bent
-20 => back bent
-19 => back bent
-18 => back bent
-17 => back bent
-16 => back bent
-15 => back bent
-14 => upright
-13 => upright
-12 => upright
-11 => upright
-10 => upright
-9 => upright
-8 => upright
-7 => upright
-6 => upright
-5 => upright
-4 => upright
-3 => upright
-2 => upright
-1 => upright
0 => upright
1 => upright
2 => upright
3 => upright
4 => upright
5 => upright
6 => upright
7 => upright
8 => upright
9 => upright
