# Accident Grid Exploration with PySpark

To run this notebook with a SparkContext (`sc`) available, launch it with: 'IPYTHON_OPTS="notebook --certfile=~/cert/mycert.pem --keyfile ~/cert/mykey.key" $SPARK_HOME/bin/pyspark --master spark://spark1:7077 --jars $SPARK_HOME/jars/elasticsearch-hadoop-2.2.0.jar'



In [1]:
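#ML Lib libraries -- NOTE(review): LabeledPoint, StringIndexer and RandomForest are used in later cells but no import for them is visible here; confirm they are imported before running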
#Python Libraries
import sys
sys.path.append('../Infrastructure_Capstone')
import os
import random

from elasticsearch import Elasticsearch
from elasticsearch import helpers
from elasticsearch.client import indices

import ConfigParser

#read in the config file
config = ConfigParser.ConfigParser()
config_path = '../Infrastructure_Capstone/config/capstone_config.ini'
#ConfigParser.read() skips files it cannot open and returns the list of files
#it actually parsed, so an empty result means the config was not loaded
if not config.read(config_path):
    raise IOError('could not read config file: ' + config_path)

#Elasticsearch connection settings from the [ElasticSearch] section
ES_url = config.get('ElasticSearch','host')
ES_password = config.get('ElasticSearch','password')
ES_username= config.get('ElasticSearch','username')

#parenthesized form prints the same text under Python 2 and also parses under Python 3
print(ES_url)

#random.seed() returns None, so keep the seed value itself in `seed`
#and seed the generator as a separate step
seed = 0
random.seed(seed)

169.53.138.84


In [2]:
#Configuration for reading from and writing to Elasticsearch
#settings used when reading the grid documents
es_read_conf = {
    "es.resource": "dataframe_plus_weather/rows",
    "es.nodes": ES_url,
    "es.port": "9200",
    "es.net.http.auth.user": ES_username,
    "es.net.http.auth.pass": ES_password,
}

#the write settings use the same node, port and credentials as the read
#settings; only the target resource differs
es_write_conf = dict(es_read_conf)
es_write_conf["es.resource"] = "rf_output/results"
#es_write_conf["es.mapping.id"] = "grid_id"   (left disabled, as before)

### Functions for MapReduce

In [3]:
#Function to convert ES rdd rows to Labeled Points (label, features)
def getLabeledPoint(row):
    """Build a LabeledPoint from one grid row accessed by key (row['field']).

    The label is row['grid_isAccident'] converted with long(). The features
    are, in order: zip code index, day of month, day of week, hour of day,
    month, fog index, rain index, snow/hail/ice index, wet-bulb temperature,
    hourly precipitation, visibility and wind speed.

    A weather reading whose float value equals 99999 (presumably the data
    set's missing-value marker -- confirm) is replaced by a fixed default:
    70 for temperature, 0 for precipitation, 10 for visibility and 0 for
    wind speed.

    LabeledPoint must already be in scope when this function is called.
    Any error raised by a field lookup or by long()/float() propagates.
    """
    def _reading(field, default):
        #keep the stored value unless it equals the 99999 marker
        value = row[field]
        return value if float(value) != 99999 else default

    #zip code
    zipcode = row['grid_zipcodeIdx']

    #date time fields
    dayOfMonth = long(row['grid_day'])
    dayOfWeek = long(row['grid_dayOfWeek'])
    hour = long(row['grid_hourOfDay'])
    month = long(row['grid_month'])

    #weather fields
    fog = row['weather_FogIdx']
    rain = row['weather_RainIdx']
    snow = row['weather_SnowHailIceIdx']

    temp = _reading('weather_WetBulbFarenheit', 70)
    #the field name must not carry a trailing space, otherwise the lookup
    #misses the field that was just tested
    precip = _reading('weather_HourlyPrecip', 0)
    vis = _reading('weather_Visibility', 10)
    windspeed = _reading('weather_WindSpeed', 0)

    #truth label
    label = long(row['grid_isAccident'])

    return LabeledPoint(label,[zipcode,dayOfMonth,dayOfWeek,hour,month,fog,rain,snow,temp,precip,vis,windspeed])

#Function to convert dataframe rows to Labeled Points (label, features)
def getLabeledPointDF(row):
    """Build a LabeledPoint from one row accessed by attribute (row.field).

    The label is row.grid_isAccident converted with long(). The features
    are, in order: zip code index, day of month, day of week, hour of day,
    month, fog index, rain index, snow/hail/ice index, wet-bulb temperature,
    hourly precipitation, visibility and wind speed.

    A weather reading whose float value equals 99999 (presumably a
    missing-value marker -- confirm) is replaced by 70 (temperature),
    0 (precipitation), 10 (visibility) or 0 (wind speed).
    """
    MISSING = 99999

    features = [
        #zip code
        row.grid_zipcodeIdx,
        #date time fields
        long(row.grid_day),
        long(row.grid_dayOfWeek),
        long(row.grid_hourOfDay),
        long(row.grid_month),
        #weather category indexes
        row.weather_FogIdx,
        row.weather_RainIdx,
        row.weather_SnowHailIceIdx,
        #weather readings, with defaults substituted for the 99999 marker
        row.weather_WetBulbFarenheit if float(row.weather_WetBulbFarenheit) != MISSING else 70,
        row.weather_HourlyPrecip if float(row.weather_HourlyPrecip) != MISSING else 0,
        row.weather_Visibility if float(row.weather_Visibility) != MISSING else 10,
        row.weather_WindSpeed if float(row.weather_WindSpeed) != MISSING else 0,
    ]

    #truth label
    truth = long(row.grid_isAccident)

    return LabeledPoint(truth, features)

### Gather data from Elasticsearch grid index

In [7]:
#get RDD of the collisions grid
#the ES input format yields (key, document) pairs; values() keeps only the
#document, i.e. element [1] of each pair
grid_rdd = sc.newAPIHadoopRDD(
    inputFormatClass="org.elasticsearch.hadoop.mr.EsInputFormat",
    keyClass="org.apache.hadoop.io.NullWritable",
    valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable",
    conf=es_read_conf,
).values()

#fetch one document; the result is not kept
grid_rdd.first()

#start with a small test set: take up to 100000 documents and redistribute them
grid_test = sc.parallelize(grid_rdd.take(100000))

In [10]:
#get set of accident and non-accident records
#keep only the grid rows whose 'grid_isAccident' field equals the integer 1
#NOTE(review): if the field comes back from ES as a string, this comparison
#matches nothing -- confirm the field type
accidents = grid_rdd.filter(lambda row: row['grid_isAccident'] == 1)
#N is the number of accident rows
N = accidents.count()
N
#the matching non-accident filter is currently disabled
#no_accidents = grid_test.filter(lambda row: row['grid_isAccident'] == 0)

570174

In [9]:
#display the current value of N (the accident row count)
N

0

In [35]:
#take a small sample of grid documents and redistribute it as an RDD
test_rdd = sc.parallelize(grid_rdd.take(2000))
#grid_rdd already holds the documents themselves (the (key, document) pairs
#are unwrapped where grid_rdd is built), so the rows are passed through
#unchanged; indexing each one with [1] again would look up a key named 1
#inside the document instead of selecting it
test_df = sqlContext.createDataFrame(test_rdd, samplingRatio = 0.5)

### Convert categorical features

In [36]:
#define categorical indexers for the data
#one StringIndexer per categorical column; each writes to the listed
#"...Idx" output column
zipIndexer, fogIndexer, rainIndexer, snowIndexer = (
    StringIndexer(inputCol=source, outputCol=target)
    for source, target in (
        ('grid_zipcode', 'grid_zipcodeIdx'),
        ('weather_Fog', 'weather_FogIdx'),
        ('weather_Rain', 'weather_RainIdx'),
        ('weather_SnowHailIce', 'weather_SnowHailIceIdx'),
    )
)

In [43]:
#index the zip codes and weather categories
#each step fits one indexer on the current DataFrame and applies it, so the
#output of one step is the input of the next and td4 carries all four
#index columns

#zip code -> grid_zipcodeIdx
model1 = zipIndexer.fit(test_df)
td1 = model1.transform(test_df)

#fog -> weather_FogIdx
model2 = fogIndexer.fit(td1)
td2 = model2.transform(td1)

#rain -> weather_RainIdx
model3 = rainIndexer.fit(td2)
td3 = model3.transform(td2)

#snow/hail/ice -> weather_SnowHailIceIdx
model4 = snowIndexer.fit(td3)
td4 = model4.transform(td3)

#show one fully indexed row as a sanity check
td4.take(1)

### Train and Run the Model

In [61]:
#convert the dataframe to labeled points
#td4 is a DataFrame, so its rows go through the DataFrame-row converter;
#mapping over td4.rdd also works on Spark versions where DataFrame itself
#has no map() method
labeled_pts = td4.rdd.map(getLabeledPointDF)

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = labeled_pts.randomSplit([0.7, 0.3])

# Train a RandomForest model.
#  Empty categoricalFeaturesInfo indicates all features are continuous.
#  NOTE(review): the indexed zip code and weather category columns are
#  therefore treated as continuous as well -- confirm this is intended.
#  Note: Use larger numTrees in practice.
#  Setting featureSubsetStrategy="auto" lets the algorithm choose.
model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                     numTrees=3, featureSubsetStrategy="auto",
                                     impurity='gini', maxDepth=4, maxBins=32)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
# Index the (label, prediction) pair instead of unpacking it in the lambda
# signature; tuple parameters are only accepted by Python 2.
numWrong = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count()
numTest = testData.count()
# An empty held-out split has no defined error rate; report NaN instead of
# dividing by zero.
testErr = numWrong / float(numTest) if numTest else float('nan')
print('Test Error = ' + str(testErr))
print('Learned classification forest model:')
print(model.toDebugString())

# Save and load model
#model.save(sc, "target/tmp/myRandomForestClassificationModel")
#sameModel = RandomForestModel.load(sc, "target/tmp/myRandomForestClassificationModel")

Test Error = 0.0
Learned classification forest model:
TreeEnsembleModel classifier with 3 trees

  Tree 0:
    Predict: 0.0
  Tree 1:
    Predict: 0.0
  Tree 2:
    Predict: 0.0



In [None]:
#write an RDD to Elasticsearch through the ES output format, using the
#settings in es_write_conf
#NOTE(review): updated_rdd is not defined in any cell shown here -- it must
#be created before this cell is run
#NOTE(review): path='-' looks like a placeholder; presumably the ES output
#format takes its target from es_write_conf rather than from path -- confirm
updated_rdd.saveAsNewAPIHadoopFile(
            path='-', 
            outputFormatClass="org.elasticsearch.hadoop.mr.EsOutputFormat",
            keyClass="org.apache.hadoop.io.NullWritable", 
            valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable", 
            conf=es_write_conf)
    