<h1>Accident Risk Predictions using Gradient Boosting</h1>
<h3>Title:</h3> <em>Processing data using Traffic and Weather datasets to prepare train and test sets for models</em> 
<h3>Author:</h3> <em>Uttam Kumar</em>

In [1]:
import pandas as pd
import numpy as np
import sys
from multiprocessing import cpu_count
import warnings; warnings.simplefilter('ignore')
from feature_extraction import *

In [2]:
from sklearn.metrics import recall_score,precision_score,f1_score,accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import make_scorer
from sklearn.ensemble import GradientBoostingClassifier as GBC

In [15]:
#city whose pre-generated dataset is used for this run; the name is interpolated
#into the .npy input paths and into the results CSV filename further down
city='Austin'  #other names listed by the author: Atlanta, Houston, LosAngeles, Charlotte, Dallas

In [16]:
#defining class with functions for GradientBoosting model
class base_model(object):
    """Shared load / grid-search / evaluate workflow for a scikit-learn classifier.

    The class itself never sets ``self.model`` or ``self.tuned_parameters``;
    ``train`` reads both, so a subclass (or the caller) has to assign them
    before ``train`` is called.
    """

    #metric names accepted by __init__
    _VALID_METRICS = ('precision', 'recall', 'f1_score')

    def __init__(self, n_jobs=-1, metric='f1_score'):
        """Store the parallelism setting and build the scorer used by the grid search.

        Parameters
        ----------
        n_jobs : int
            Forwarded to ``GridSearchCV(n_jobs=...)`` in ``train``.
        metric : str
            One of 'precision', 'recall' or 'f1_score'; each is wrapped with
            ``make_scorer(..., average='weighted')``.

        Raises
        ------
        ValueError
            If ``metric`` is not one of the supported names. Previously this
            only printed a message and left ``self.metric`` unset, so the
            failure surfaced later inside ``train`` as an AttributeError.
        """
        #validate first so a bad metric fails here and not deep inside train()
        if metric not in self._VALID_METRICS:
            raise ValueError('not valid metric: %r (expected one of %s)'
                             % (metric, ', '.join(self._VALID_METRICS)))
        self.n_jobs = n_jobs
        score_functions = {
            'precision': precision_score,
            'recall': recall_score,
            'f1_score': f1_score,
        }
        self.metric = make_scorer(score_functions[metric], average='weighted')

    #function to load train and test data (pre-generated as numpy arrays from feature extraction script)
    def load_data(self, category=None):
        """Load the train/test arrays for the module-level ``city`` and print their shapes.

        Parameters
        ----------
        category : iterable or None
            When not None, every entry is passed to ``reshape_cat`` for both
            feature matrices and the results are concatenated along axis 1.
            When None the loaded feature matrices are used as they are.
        """
        print ('loading and testing: shapes for train and test X and Y')
        #NOTE(review): allow_pickle=True can execute code embedded in the file; only load trusted .npy files
        #NOTE(review): the test arrays are also read from the train_set directory - confirm this is the intended layout
        #last item [-1] is the geocode index only used when we have the embedding
        self.X_train = np.load('../data_files/train_set/X_train_'+city+'.npy',allow_pickle=True)[:,0:-1]
        self.y_train = np.load('../data_files/train_set/y_train_'+city+'.npy',allow_pickle=True)
        self.X_test = np.load('../data_files/train_set/X_test_'+city+'.npy',allow_pickle=True)[:,0:-1]
        self.y_test = np.load('../data_files/train_set/y_test_'+city+'.npy',allow_pickle=True)

        #identity check: `!= None` would be evaluated element-wise if an array were passed
        if category is not None:
            #reshape_cat is not defined in this notebook (presumably from feature_extraction - confirm)
            l_train = [reshape_cat(self.X_train, cat) for cat in category]
            l_test = [reshape_cat(self.X_test, cat) for cat in category]
            self.X_train = np.concatenate(l_train,axis=1)
            self.X_test = np.concatenate(l_test,axis=1)

        print(self.X_train.shape)
        print(self.y_train.shape)
        print(self.X_test.shape)
        print(self.y_test.shape)

    #function to performs training process and find best hyper-parameters
    def train(self):
        """Grid-search ``self.model`` over ``self.tuned_parameters`` and return the best parameters.

        Uses 5 shuffled splits with 20% held out (``random_state=0``), scores
        with ``self.metric`` and prints the mean score and twice the standard
        deviation for every parameter combination. The fitted search object is
        kept in ``self.clf``.

        Returns
        -------
        dict
            ``best_params_`` of the fitted ``GridSearchCV``.
        """
        cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
        #using GridSearchCV to get the best parameters for our model
        self.clf = GridSearchCV(self.model, self.tuned_parameters, cv=cv, scoring=self.metric,n_jobs=self.n_jobs)
        self.clf.fit(self.X_train, self.y_train)
        print("Best parameters set is:")
        print(self.clf.best_params_)
        print("Grid scores:")
        means = self.clf.cv_results_['mean_test_score']
        stds = self.clf.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, self.clf.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r"
                  % (mean, std * 2, params))
        return self.clf.best_params_

    #function to perform testing
    def evaluate(self):
        """Predict on the test set, print the classification report and return it as a dict.

        Requires ``train`` to have been called, because it uses ``self.clf``.
        """
        y_true, y_pred = self.y_test, self.clf.predict(self.X_test)
        print(classification_report(y_true, y_pred))
        dict_out = classification_report(y_true, y_pred,output_dict=True)
        return dict_out

In [17]:
#Gradient Boosting Classifier variant of base_model (scikit-learn implementation)
class model_GBC_SKlearn(base_model):
    """Supplies the estimator and the hyper-parameter grid that ``train`` searches over."""

    def create_model(self):
        """Set ``self.model`` to a GBC with ``n_iter_no_change=15`` and ``self.tuned_parameters`` to its search grid."""
        self.model = GBC(n_iter_no_change=15)
        #4 x 4 x 4 grid; key order kept as learning_rate, n_estimators, max_depth
        search_grid = dict(
            learning_rate=[0.1, 0.15, 0.05, 0.01],
            n_estimators=[100, 200, 300, 400],
            max_depth=[3, 4, 5, 6],
        )
        self.tuned_parameters = search_grid

In [18]:
#function to set up the model, load data, perform training and testing
def make_models(city=city,model='GBC',category=None,metric='precision'):
    """Run one full load -> grid search -> evaluation cycle for the GBC model.

    Parameters
    ----------
    city, model :
        Accepted but never read inside this function; a ``model_GBC_SKlearn``
        is always built.
    category : list or None
        Forwarded unchanged to ``load_data``.
    metric : str
        Forwarded to the ``model_GBC_SKlearn`` constructor.

    Returns
    -------
    tuple
        ``(pandas.DataFrame built from the evaluation report dict, best parameters from train())``.
    """
    #one worker per CPU core for the grid search
    predictor = model_GBC_SKlearn(metric=metric, n_jobs=cpu_count())
    predictor.load_data(category)
    predictor.create_model()
    tuned_params = predictor.train()
    report = predictor.evaluate()
    report_frame = pd.DataFrame(report)
    return report_frame, tuned_params

In [19]:
#the following categories of input data are evaluated, one run per list entry
#(None means no category filtering is applied in load_data)
#traffic data: 7 attributes(accident,broken-vehicle,congestion,construction,event,lane-blocked and flow-incident)
#time data: 7 attributes(weekday/weekend,5 time-bins of day, day/night)
#weather data: 10 attributes(temperature,pressure,humidity,visibility,wind-speed,precipitation,rain,snow,fog,hail)
#geohash data: POI data that includes 13 attributes(amenity,speed bump,crossing,give-way,junction,no-exit, 
#                      railway,roundabout,station,stop,traffic_calming)
#NOTE(review): the geohash list above names 11 attributes, not 13 - confirm the count
#NLP: Desc2Vec data that includes 100 attributes
categories = [None, ['traffic'], ['time'], ['weather'], ['geohash'], ['NLP']] 

In [20]:
#Train/evaluate the GBC model for every feature category and write the averaged
#classification-report metrics to ../data_files/GBC_<city>.csv

#Report rows are selected by label, in the same order as the CSV header columns
#Precision,Recall,F1,Support. Reading them by position is fragile: the row order of
#a DataFrame built from a dict of dicts depends on the pandas version, so values
#could end up under the wrong header.
METRIC_ROWS = ['precision', 'recall', 'f1-score', 'support']
N_RUNS = 3          #number of repeated train/evaluate runs averaged per category
ROW_FORMAT = '{},{},{},{},{},{},{}\n'
results_path = '../data_files/GBC_{}.csv'.format(city)

#start a fresh results file containing only the header
with open(results_path, 'w') as writer:
    writer.write('Model,Category,ReportType,Precision,Recall,F1,Support\n')

for c in categories: # when using None as a category, it will use the entire set of input features(314)
    print('\n','GBC',c)
    No_Acc   = []
    Acc      = []
    Macro    = []
    Weighted = []

    #running the model N_RUNS times to take average predictions
    for i in range(N_RUNS):
        df,best_params = make_models(model='GBC', metric='f1_score', category=c)
        #each entry is [precision, recall, f1-score, support (i.e. sample size)]
        No_Acc.append(df.loc[METRIC_ROWS, '0'].to_numpy(dtype=float))              # 0 is for no accidents
        Acc.append(df.loc[METRIC_ROWS, '1'].to_numpy(dtype=float))                 # 1 is for accidents
        Macro.append(df.loc[METRIC_ROWS, 'macro avg'].to_numpy(dtype=float))
        Weighted.append(df.loc[METRIC_ROWS, 'weighted avg'].to_numpy(dtype=float))

    #element-wise mean over the runs
    No_Acc    = np.mean(No_Acc, axis=0)
    Acc       = np.mean(Acc, axis=0)
    Macro     = np.mean(Macro, axis=0)
    Weighted  = np.mean(Weighted, axis=0)

    #label used in the CSV; the loop variable itself is left untouched
    category_name = c[0] if c is not None else 'All'

    # to write the output result in terms of a CSV file; `with` closes the file even if a write fails
    with open(results_path, 'a') as writer:
        for report_type, avg in (('No-Accident', No_Acc), ('Accident', Acc),
                                 ('MacroAvg', Macro), ('WeightedAvg', Weighted)):
            writer.write(ROW_FORMAT.format('GBC', category_name, report_type,
                                           round(avg[0], 4), round(avg[1], 4), round(avg[2], 4), int(avg[3])))


 GBC None
loading and testing: shapes for train and test X and Y
(22890, 314)
(22890,)
(4664, 314)
(4664,)
Best parameters set is:
{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 400}
Grid scores:
0.881 (+/-0.011) for {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
0.880 (+/-0.009) for {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}
0.881 (+/-0.011) for {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 300}
0.882 (+/-0.008) for {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 400}
0.880 (+/-0.008) for {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100}
0.880 (+/-0.009) for {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 200}
0.881 (+/-0.006) for {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 300}
0.880 (+/-0.010) for {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 400}
0.880 (+/-0.008) for {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}
0.881 (+/-0.009) for {'learning_rate': 0.1, 'max_depth': 5, 'n_est

loading and testing: shapes for train and test X and Y
(22890, 314)
(22890,)
(4664, 314)
(4664,)
Best parameters set is:
{'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 200}
Grid scores:
0.880 (+/-0.009) for {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
0.880 (+/-0.007) for {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}
0.880 (+/-0.010) for {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 300}
0.879 (+/-0.010) for {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 400}
0.880 (+/-0.009) for {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100}
0.880 (+/-0.009) for {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 200}
0.881 (+/-0.012) for {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 300}
0.880 (+/-0.005) for {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 400}
0.880 (+/-0.005) for {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}
0.879 (+/-0.008) for {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 

(22890, 64)
(22890,)
(4664, 64)
(4664,)
Best parameters set is:
{'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 300}
Grid scores:
0.829 (+/-0.007) for {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
0.830 (+/-0.003) for {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}
0.830 (+/-0.004) for {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 300}
0.829 (+/-0.005) for {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 400}
0.831 (+/-0.005) for {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100}
0.831 (+/-0.004) for {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 200}
0.831 (+/-0.009) for {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 300}
0.831 (+/-0.006) for {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 400}
0.831 (+/-0.009) for {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}
0.832 (+/-0.006) for {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}
0.830 (+/-0.007) for {'learning_rate': 0.1, 'max_dept

(22890, 56)
(22890,)
(4664, 56)
(4664,)
Best parameters set is:
{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}
Grid scores:
0.782 (+/-0.008) for {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
0.782 (+/-0.008) for {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}
0.782 (+/-0.008) for {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 300}
0.782 (+/-0.008) for {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 400}
0.782 (+/-0.008) for {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100}
0.782 (+/-0.008) for {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 200}
0.782 (+/-0.008) for {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 300}
0.782 (+/-0.008) for {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 400}
0.782 (+/-0.008) for {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}
0.783 (+/-0.008) for {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}
0.782 (+/-0.008) for {'learning_rate': 0.1, 'max_dept

(22890, 56)
(22890,)
(4664, 56)
(4664,)
Best parameters set is:
{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 300}
Grid scores:
0.782 (+/-0.008) for {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
0.782 (+/-0.008) for {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}
0.782 (+/-0.008) for {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 300}
0.782 (+/-0.008) for {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 400}
0.782 (+/-0.008) for {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100}
0.782 (+/-0.008) for {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 200}
0.782 (+/-0.008) for {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 300}
0.782 (+/-0.008) for {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 400}
0.782 (+/-0.008) for {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}
0.782 (+/-0.008) for {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}
0.783 (+/-0.008) for {'learning_rate': 0.1, 'max_dept

(22890, 80)
(22890,)
(4664, 80)
(4664,)
Best parameters set is:
{'learning_rate': 0.15, 'max_depth': 6, 'n_estimators': 100}
Grid scores:
0.787 (+/-0.006) for {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
0.793 (+/-0.010) for {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}
0.792 (+/-0.011) for {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 300}
0.794 (+/-0.012) for {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 400}
0.793 (+/-0.010) for {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100}
0.797 (+/-0.010) for {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 200}
0.797 (+/-0.009) for {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 300}
0.798 (+/-0.015) for {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 400}
0.799 (+/-0.010) for {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}
0.799 (+/-0.014) for {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}
0.801 (+/-0.012) for {'learning_rate': 0.1, 'max_dep

(22890, 14)
(22890,)
(4664, 14)
(4664,)
Best parameters set is:
{'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 400}
Grid scores:
0.840 (+/-0.006) for {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
0.840 (+/-0.007) for {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}
0.840 (+/-0.006) for {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 300}
0.840 (+/-0.006) for {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 400}
0.840 (+/-0.007) for {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100}
0.841 (+/-0.009) for {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 200}
0.840 (+/-0.006) for {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 300}
0.840 (+/-0.007) for {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 400}
0.841 (+/-0.011) for {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}
0.840 (+/-0.007) for {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}
0.840 (+/-0.006) for {'learning_rate': 0.1, 'max_dep

(22890, 14)
(22890,)
(4664, 14)
(4664,)
Best parameters set is:
{'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100}
Grid scores:
0.840 (+/-0.007) for {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
0.840 (+/-0.007) for {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}
0.842 (+/-0.005) for {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 300}
0.841 (+/-0.009) for {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 400}
0.840 (+/-0.006) for {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100}
0.840 (+/-0.006) for {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 200}
0.843 (+/-0.010) for {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 300}
0.840 (+/-0.007) for {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 400}
0.840 (+/-0.006) for {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}
0.840 (+/-0.007) for {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}
0.842 (+/-0.007) for {'learning_rate': 0.1, 'max_dep

(22890, 100)
(22890,)
(4664, 100)
(4664,)
Best parameters set is:
{'learning_rate': 0.15, 'max_depth': 4, 'n_estimators': 400}
Grid scores:
0.840 (+/-0.006) for {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
0.840 (+/-0.006) for {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}
0.840 (+/-0.007) for {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 300}
0.841 (+/-0.009) for {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 400}
0.840 (+/-0.007) for {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100}
0.840 (+/-0.007) for {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 200}
0.840 (+/-0.007) for {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 300}
0.840 (+/-0.007) for {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 400}
0.841 (+/-0.009) for {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}
0.840 (+/-0.007) for {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}
0.840 (+/-0.006) for {'learning_rate': 0.1, 'max_d