# Random Forest Classifier with PySpark MLLib

To run the notebook with a Spark Context (sc) enter: 'IPYTHON_OPTS="notebook --certfile=~/cert/mycert.pem --keyfile ~/cert/mykey.key" $SPARK_HOME/bin/pyspark --master spark://spark1:7077 --jars $SPARK_HOME/jars/elasticsearch-hadoop-2.2.0.jar'



In [1]:
#ML Lib libraries
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.util import MLUtils
from pyspark.mllib.regression import LabeledPoint
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StringIndexer

#Python Libraries
import sys
sys.path.append('../Infrastructure_Capstone')
import os
import random

from elasticsearch import Elasticsearch
from elasticsearch import helpers
from elasticsearch.client import indices

import ConfigParser

#read in the config file
config = ConfigParser.ConfigParser()
config.read('../Infrastructure_Capstone/config/capstone_config.ini')

ES_url = config.get('ElasticSearch','host')
ES_password = config.get('ElasticSearch','password')
ES_username= config.get('ElasticSearch','username')

print ES_url
seed = random.seed(0)

169.53.138.84


In [2]:
#Configuration for reading from and writing to Elasticsearch
es_read_conf = { 
        "es.resource" : "dataframe_plus_weather/rows", 
        "es.nodes" : ES_url,
        "es.port" : "9200",
        "es.net.http.auth.user" : ES_username,
        "es.net.http.auth.pass" : ES_password 
    }

es_write_conf = {
        "es.resource" : "rf_output/results",
        "es.nodes" : ES_url,
        "es.port" : "9200",
        "es.net.http.auth.user" : ES_username, 
        "es.net.http.auth.pass" : ES_password
        #"es.mapping.id" : "grid_id"
    } 

### Functions for MapReduce

In [3]:
#Function to convert dataframe rows to Labeled Points (label, features)
def getLabeledPoint(row): 
    #input: SparkSQL dataframe row element
    #output: Spark LabeledPoint
    
    #zip code
    zipcode = row.grid_zipcodeIdx
    
    #date time fields
    dayOfMonth = long(row.grid_day)
    dayOfWeek = long(row.grid_dayOfWeek)
    hour = long(row.grid_hourOfDay)
    month = long(row.grid_month)
    
    #weather fields
    fog = row.weather_FogIdx
    rain = row.weather_RainIdx
    snow = row.weather_SnowHailIceIdx
    
    if float(row.weather_WetBulbFarenheit) <> 99999:
        temp = row.weather_WetBulbFarenheit  
    else:
        temp = 70
        
    if float(row.weather_HourlyPrecip) <> 99999:
        precip = row.weather_HourlyPrecip  
    else:
        precip = 0
        
    if float(row.weather_Visibility) <> 99999:
        vis = row.weather_Visibility  
    else:
        vis = 10
        
    if float(row.weather_WindSpeed) <> 99999:
        windspeed = row.weather_WindSpeed  
    else:
        windspeed = 0
    
    #truth label
    label = long(row.grid_isAccident)    
    
    return LabeledPoint(label,[zipcode,dayOfMonth,dayOfWeek,hour,month,fog,rain,snow,temp,precip,vis,windspeed])

### Gather data from Elasticsearch grid index

In [4]:
#get RDD of the collisions grid
grid_rdd = sc.newAPIHadoopRDD(
    inputFormatClass="org.elasticsearch.hadoop.mr.EsInputFormat",
    keyClass="org.apache.hadoop.io.NullWritable", 
    valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable", 
    conf=es_read_conf).map(lambda row: row[1])

grid_rdd.first()
#grid_test = sc.parallelize(grid_rdd.take(100000)) #start with a small test set

{u'grid_day': 6,
 u'grid_dayOfWeek': 3,
 u'grid_fullDate': u'2013-02-06T12:00:00-06:00',
 u'grid_hourOfDay': 13,
 u'grid_id': u'2013-02-06T13:00:00_10306',
 u'grid_isAccident': 0,
 u'grid_month': 2,
 u'grid_zipcode': 10306,
 u'weather_Fog': 0,
 u'weather_HourlyPrecip': 99999.0,
 u'weather_Rain': 0,
 u'weather_SkyCondition': u'SCT042',
 u'weather_SnowHailIce': 0,
 u'weather_Visibility': 10.0,
 u'weather_WeatherType': None,
 u'weather_WetBulbFarenheit': 31.0,
 u'weather_WindSpeed': 8.0}

In [5]:
#get set of accident and non-accident records
accidents = grid_rdd.filter(lambda row: row['grid_isAccident'] == 1)
accN = accidents.count()

In [6]:
no_accidents = grid_rdd.filter(lambda row: row['grid_isAccident'] == 0)
noaccN = no_accidents.count()

In [7]:
print "%s out of %s grid elements have an accident." % (str(accN),str(noaccN))

570174 out of 6238371 grid elements have an accident.


### Sample Data
We will now randomly sample from the accidents and non-accidents RDD's to get approximately 50-50 accidents and non-accidents

In [9]:
fraction = 0.75
sub_acc = accidents.sample(withReplacement=False,fraction=fraction,seed=seed)
sub_noacc = no_accidents.sample(withReplacement=False,fraction=fraction*accN/noaccN,seed=seed)

print "Total accidents: %s" % str(sub_acc.count())
print "Total non-accidents: %s" % str(sub_noacc.count())
full_rdd = sub_acc.union(sub_noacc)

Total accidents: 427088
Total non-accidents: 427836


### Convert categorical features

In [10]:
#create a dataframe for encoding categorical variables
#df = sqlContext.createDataFrame(full_rdd) #complete dataset
df = sqlContext.createDataFrame(full_rdd.sample(withReplacement=False,fraction=0.25,seed=seed)) #let's start with a 1/4 of the data



In [11]:
#define categorical indexers for the data
zipIndexer =  StringIndexer(inputCol='grid_zipcode', outputCol='grid_zipcodeIdx')
fogIndexer =  StringIndexer(inputCol='weather_Fog', outputCol='weather_FogIdx')
rainIndexer =  StringIndexer(inputCol='weather_Rain', outputCol='weather_RainIdx')
snowIndexer =  StringIndexer(inputCol='weather_SnowHailIce', outputCol='weather_SnowHailIceIdx')

In [12]:
#index the zip codes and weather categories
model1 = zipIndexer.fit(df)
td1 = model1.transform(df)

model2 = fogIndexer.fit(td1)
td2 = model2.transform(td1)

model3 = rainIndexer.fit(td2)
td3 = model3.transform(td2)

model4 = snowIndexer.fit(td3)
td4 = model4.transform(td3)

td4.take(1)

[Row(grid_collision_counter=1, grid_day=7, grid_dayOfWeek=2, grid_fullDate=u'2015-07-07T21:00:00-05:00', grid_hourOfDay=22, grid_id=u'2015-07-07T22:00:00_10031', grid_isAccident=1, grid_month=7, grid_zipcode=10031, weather_Fog=0, weather_HourlyPrecip=99999.0, weather_Rain=0, weather_SkyCondition=u'SCT049 SCT090 SCT200', weather_SnowHailIce=0, weather_Visibility=10.0, weather_WeatherType=None, weather_WetBulbFarenheit=73.0, weather_WindSpeed=10.0, grid_zipcodeIdx=112.0, weather_FogIdx=0.0, weather_RainIdx=0.0, weather_SnowHailIceIdx=0.0)]

In [15]:
zipEncoder = OneHotEncoder(dropLast=False, inputCol="grid_zipcodeIdx", outputCol="grid_zipcodeVec")
zipEncoded = zipEncoder.transform(td4)

fogEncoder = OneHotEncoder(dropLast=False, inputCol="weather_FogIdx", outputCol="weather_FogVec")
fogEncoded = fogEncoder.transform(zipEncoded)

rainEncoder = OneHotEncoder(dropLast=False, inputCol="weather_RainIdx", outputCol="grid_RainVec")
rainEncoded = rainEncoder.transform(fogEncoded)

snowEncoder = OneHotEncoder(dropLast=False, inputCol="weather_SnowHailIceIdx", outputCol="weather_SnowHailIceVec")
fullEncoded = snowEncoder.transform(rainEncoded)


### Train and Run the Model
The following Random Forest code comes directly from the Spark MLLib programming guide: 
http://spark.apache.org/docs/latest/mllib-ensembles.html#random-forests

In [19]:
#convert the dataframe to labeled points
labeled_pts = fullEncoded.map(getLabeledPoint)
labeled_pts.take(3)

[LabeledPoint(1.0, [2.0,10.0,2.0,9.0,11.0,0.0,1.0,0.0,53.0,0.0,10.0,9.0]),
 LabeledPoint(1.0, [2.0,27.0,4.0,21.0,3.0,0.0,0.0,0.0,32.0,0.0,10.0,15.0]),
 LabeledPoint(1.0, [22.0,8.0,3.0,8.0,1.0,0.0,0.0,0.0,7.0,0.0,10.0,14.0])]

In [20]:
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = labeled_pts.randomSplit([0.7, 0.3])

# Train a RandomForest model.
#  Empty categoricalFeaturesInfo indicates all features are continuous.
#  Note: Use larger numTrees in practice.
#  Setting featureSubsetStrategy="auto" lets the algorithm choose.
model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                     numTrees=3, featureSubsetStrategy="auto",
                                     impurity='gini', maxDepth=4, maxBins=32)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification forest model:')
print(model.toDebugString())

# Save and load model
#model.save(sc, "target/tmp/myRandomForestClassificationModel")
#sameModel = RandomForestModel.load(sc, "target/tmp/myRandomForestClassificationModel")

Test Error = 0.296343690017
Learned classification forest model:
TreeEnsembleModel classifier with 3 trees

  Tree 0:
    If (feature 4 <= 3.0)
     If (feature 2 <= 6.0)
      If (feature 0 <= 131.0)
       If (feature 8 <= 39.0)
        Predict: 1.0
       Else (feature 8 > 39.0)
        Predict: 1.0
      Else (feature 0 > 131.0)
       If (feature 1 <= 14.0)
        Predict: 0.0
       Else (feature 1 > 14.0)
        Predict: 0.0
     Else (feature 2 > 6.0)
      If (feature 3 <= 10.0)
       If (feature 11 <= 5.0)
        Predict: 0.0
       Else (feature 11 > 5.0)
        Predict: 0.0
      Else (feature 3 > 10.0)
       If (feature 9 <= 0.2)
        Predict: 0.0
       Else (feature 9 > 0.2)
        Predict: 1.0
    Else (feature 4 > 3.0)
     If (feature 0 <= 131.0)
      If (feature 3 <= 7.0)
       If (feature 3 <= 6.0)
        Predict: 0.0
       Else (feature 3 > 6.0)
        Predict: 1.0
      Else (feature 3 > 7.0)
       If (feature 0 <= 55.0)
        Predict: 1.0
      

In [None]:
updated_rdd.saveAsNewAPIHadoopFile(
            path='-', 
            outputFormatClass="org.elasticsearch.hadoop.mr.EsOutputFormat",
            keyClass="org.apache.hadoop.io.NullWritable", 
            valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable", 
            conf=es_write_conf)
    

### Extra Stuff