# Part 3: Machine Learning Algorithms

Import all the packages and the combined data frame.

In [1]:
import numpy as np
import pandas as pd
import csv
from collections import Counter
from sklearn.datasets import make_classification
#!conda install py-xgboost --y
import xgboost as xgb
import operator
from random import choices
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from imblearn.over_sampling import SMOTE
from sklearn import linear_model
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
import xgboost 

#from imblearn.over_sampling import SMOTE, ADASYN

# No warnings
import warnings
warnings.filterwarnings('ignore')

df = pd.read_csv('DF.csv')
#df.columns

Select the features of interest based on the preliminary analysis from Part 1.

In [2]:
df= df.loc[:,[ 'WEATHER_CONDITION',
       'CRASH_DAY_OF_WEEK', 'CRASH_MONTH', 'CRASH_HOUR', 'LIGHTING_CONDITION',
       'MANEUVER', 'TRAFFICWAY_TYPE', 'PRIM_CONTRIBUTORY_CAUSE',
       'POSTED_SPEED_LIMIT','COMBINED_DANGER_SCORE']]

df.head()
df_test_interest = df

In [None]:
# Imbalanced dataset


We used four different techniques to address the issue of imbalanced dataset.

In [3]:
# Approach 1: Upsampling or known as oversampling

## do RANDOM SAMPLING TO PICK EQUAL NUMBER OF DATA IN EACH Y GROUP
# down sampling
df4=df.reset_index()
X=df4.drop(columns = 'COMBINED_DANGER_SCORE')
y=df4.COMBINED_DANGER_SCORE

# find the number of levels in y and number of entries associated with each level

unique_levels = np.unique(y)
unique_counts = {level: sum(y == level) for level in unique_levels}
print(unique_counts)

# find the target number of data points
unique_counts.items()
max_level = max(unique_counts.items(), key=operator.itemgetter(1))[0]
min_level = min(unique_counts.items(), key=operator.itemgetter(1))[0]
target_number = unique_counts[max_level]
target_number_min = unique_counts[min_level]

# find which data points are associated with which group

grouped_levels = {}
for ii, level in enumerate(unique_levels):
    obs_idx = [idx for idx, val in enumerate(y) if val == level]
    grouped_levels[level] = obs_idx

grouped_levels

#oversampling
sampled_levels={}

# sample indices
for i in list(unique_levels):
    if i != max_level:
        sampled_levels[i] = choices(grouped_levels[i], k=target_number )
    else:
        sampled_levels[i] = grouped_levels[i]

first = df4.iloc[sampled_levels[1]].reset_index()
second = df4.iloc[sampled_levels[2]].reset_index()
third = df4.iloc[sampled_levels[3]].reset_index()

new_oversampled = pd.concat([first,second,third], axis = 0)
new_oversampled = new_oversampled.drop(columns = ['level_0','index'])


{1.0: 93876, 2.0: 111759, 3.0: 20533}


In [4]:
# Approach 2: Downsampling 

sampled_levels={}

# sample indices
for i in list(unique_levels):
    if i != min_level:
        sampled_levels[i] = choices(grouped_levels[i], k=target_number_min )
    else:
        sampled_levels[i] = grouped_levels[i]

first = df4.iloc[sampled_levels[1]].reset_index()
second = df4.iloc[sampled_levels[2]].reset_index()
third = df4.iloc[sampled_levels[3]].reset_index()

new_downsampled = pd.concat([first,second,third], axis = 0)
new_downsampled = new_downsampled.drop(columns = ['level_0','index'])

In [5]:
df_train_interest = new_downsampled

We have to convert categorical variables to categories for one-hot encoding.


In [6]:
# Define the lambda function: categorize_label
categorize_label = lambda x: x.astype('category')
labels = ['WEATHER_CONDITION', 'CRASH_DAY_OF_WEEK','LIGHTING_CONDITION','MANEUVER', 'TRAFFICWAY_TYPE', 'PRIM_CONTRIBUTORY_CAUSE']

# Convert df[LABELS] to a categorical type
print(df_train_interest[labels].dtypes)

WEATHER_CONDITION           int64
CRASH_DAY_OF_WEEK           int64
LIGHTING_CONDITION          int64
MANEUVER                   object
TRAFFICWAY_TYPE            object
PRIM_CONTRIBUTORY_CAUSE    object
dtype: object


In [7]:
df_train_interest = pd.get_dummies(df_train_interest, drop_first = True)
df_test_interest = pd.get_dummies(df_test_interest, drop_first = True)


Split the data into test and training datasets. While testing, remember to use the real data to obtain a more realistic result.

In [8]:
# machine learning algorithm
accuracy_list_log=[]
recall_list_log = []
precision_list_log = []

accuracy_list_forest=[]
recall_list_forest = []
precision_list_forest = []

accuracy_list_boost=[]
recall_list_boost = []
precision_list_boost = []


for i in range(0,20):
    train_df1, test_df1 = train_test_split(df_train_interest, test_size=0.2)
    train_df2, test_df2 = train_test_split(df_test_interest, test_size=0.2)

    X_train = train_df1.drop(columns = 'COMBINED_DANGER_SCORE')
    Y_train = train_df1.COMBINED_DANGER_SCORE
    X_test  = test_df2.drop(columns = 'COMBINED_DANGER_SCORE')
    Y_test = test_df2.COMBINED_DANGER_SCORE
    
    LogisticRegressionModel = linear_model.LogisticRegression()

    LogisticRegressionModel.fit(X_train,Y_train)
    accuracy = LogisticRegressionModel.score(X_test,Y_test)
    accuracy_list_log.append(accuracy)
    #print("Accuracy: %.2f%%" % (accuracy * 100.0))

    Y_true = Y_test
    Y_pred = LogisticRegressionModel.predict(X_test)
    ConfusionMatrix = pd.DataFrame(confusion_matrix(Y_true,Y_pred), columns = ['Predicted 1', 'Predicted 2','Predicted 3'], index = ['Actual 1', 'Actual 2','Actual 3'])
    #print ('Confusion matrix of test data is: \n',ConfusionMatrix)
    
    recall = recall_score(Y_true, Y_pred, average = None)
    recall_list_log.append(np.array(recall))
    #print("Average recall for the 3 classes is - ", recall)
    
    precision = precision_score(Y_true, Y_pred, average = None)
    precision_list_log.append(np.array(precision))
    #print("Average precision for the 3 classes is - ", precision)

    # random forest
    classifier = RandomForestClassifier(n_estimators = 30, criterion = 'entropy')
    classifier.fit(X_train, Y_train)
    Y_pred_forest = classifier.predict(X_test)
    Y_true_forest = Y_test

    accuracy = classifier.score(X_test,Y_test)
    accuracy_list_forest.append(accuracy)

    ConfusionMatrix = pd.DataFrame(confusion_matrix(Y_true_forest,Y_pred_forest), columns = ['Predicted 1', 'Predicted 2','Predicted 3'], index = ['Actual 1', 'Actual 2','Actual 3'])
    #print ('Confusion matrix of test data is: \n',ConfusionMatrix)
    #print("Average recall for the 3 classes is - ", recall_score(Y_true_forest, Y_pred_forest, average = None))
    #print("Average precision for the 3 classes is - ", precision_score(Y_true_forest, Y_pred_forest, average = None))
    
    recall = recall_score(Y_true_forest, Y_pred_forest, average = None)
    recall_list_forest.append(np.array(recall))
    #print("Average recall for the 3 classes is - ", recall)
    
    precision = precision_score(Y_true_forest, Y_pred_forest, average = None)
    precision_list_forest.append(np.array(precision))
    
      #xgboost
    import os
    os.environ['KMP_DUPLICATE_LIB_OK']='True'
    from xgboost import XGBClassifier

    model = XGBClassifier()
    model.fit(X_train, Y_train)

    Y_pred = model.predict(X_test)
    predictions = [round(value) for value in Y_pred]

    from sklearn.metrics import accuracy_score

    # evaluate predictions
    boost_accuracy = accuracy_score(Y_test, predictions)
    accuracy_list_boost.append(boost_accuracy)
    #print("Accuracy: %.2f%%" % (accuracy * 100.0)) 

    boost_confusionMatrix = pd.DataFrame(confusion_matrix(Y_true,Y_pred), columns = ['Predicted 1', 'Predicted 2','Predicted 3'], index = ['Actual 1', 'Actual 2','Actual 3'])
    #print ('Confusion matrix of test data using random forest is: \n',boost_confusionMatrix )
    
    recall = recall_score(Y_true, Y_pred, average = None)
    recall_list_boost.append(np.array(recall))
    precision = precision_score(Y_true, Y_pred, average = None)
    precision_list_boost.append(np.array(precision))

print(np.mean(accuracy_list_log))
print(np.mean(recall_list_log, axis = 0))
print(np.mean(precision_list_log, axis = 0))


print(np.mean(accuracy_list_forest))
print(np.mean(recall_list_forest, axis = 0))
print(np.mean(precision_list_forest, axis = 0))

print(np.mean(accuracy_list_boost))
print(np.mean(recall_list_boost, axis = 0))
print(np.mean(precision_list_boost, axis = 0))


0.33662731573595084
[0.46721094 0.16117249 0.67683508]
[0.50576758 0.52558824 0.13518016]
0.5053720652606446
[0.50553995 0.44387824 0.82959962]
[0.58740914 0.64015349 0.25550499]
0.3369368174382102
[0.4317977  0.18620068 0.707102  ]
[0.52241678 0.52901986 0.13586409]


In [9]:
from imblearn.over_sampling import SMOTE

#Approach 3: SMOTE
categorize_label = lambda x: x.astype('category')
labels = ['WEATHER_CONDITION', 'CRASH_DAY_OF_WEEK','LIGHTING_CONDITION','MANEUVER', 'TRAFFICWAY_TYPE', 'PRIM_CONTRIBUTORY_CAUSE']
# Convert df[LABELS] to a categorical type
df[labels] = df[labels].astype('category')
df = pd.get_dummies(df, drop_first = True)


In [10]:
accuracy_list_log=[]
recall_list_log = []
precision_list_log = []

accuracy_list_forest=[]
recall_list_forest = []
precision_list_forest = []

accuracy_list_boost=[]
recall_list_boost = []
precision_list_boost = []

for i in range(0,20):
    train_df1, test_df1 = train_test_split(df, test_size=0.2)
    X_train_SMOTE = train_df1.drop(columns = 'COMBINED_DANGER_SCORE')
    Y_train_SMOTE = train_df1.COMBINED_DANGER_SCORE
    X_test_SMOTE = test_df1.drop(columns = 'COMBINED_DANGER_SCORE')
    Y_test_SMOTE = test_df1.COMBINED_DANGER_SCORE

    X_resampled_SMOTE, Y_resampled_SMOTE = SMOTE().fit_sample(X_train_SMOTE, Y_train_SMOTE)
    #print(sorted(Counter(Y_resampled_SMOTE).items()))

    LogisticRegressionModel.fit(X_resampled_SMOTE,Y_resampled_SMOTE)
    log_accuracy = LogisticRegressionModel.score(X_test_SMOTE,Y_test_SMOTE)
    accuracy_list_log.append(log_accuracy)
    #print("Logistic accuracy: %.2f%%" % (log_accuracy * 100.0))

    Y_true_SMOTE = Y_test_SMOTE
    Y_pred_SMOTE = LogisticRegressionModel.predict(X_test_SMOTE)
    Log_ConfusionMatrix = pd.DataFrame(confusion_matrix(Y_true_SMOTE,Y_pred_SMOTE), columns = ['Predicted 1', 'Predicted 2','Predicted 3'], index = ['Actual 1', 'Actual 2','Actual 3'])
    #print ('Confusion matrix of test data using Logistic regression is: \n',Log_ConfusionMatrix)

    recall = recall_score(Y_true_SMOTE, Y_pred_SMOTE, average = None)
    recall_list_log.append(np.array(recall))
    precision = precision_score(Y_true_SMOTE, Y_pred_SMOTE, average = None)
    precision_list_log.append(np.array(precision))
    
    #print("Average recall for the 3 classes using Logistic regression  is - ", recall_score(Y_true_SMOTE,Y_pred_SMOTE, average = None))
    #print("Average precision for the 3 classes using Logistic regression is - ", precision_score(Y_true_SMOTE,Y_pred_SMOTE, average = None))

    # random forest
    from sklearn.ensemble import RandomForestClassifier
    classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy')
    classifier.fit(X_train_SMOTE, Y_train_SMOTE)
    Y_pred_SMOTE_forest = classifier.predict(X_test_SMOTE)
    Y_true_SMOTE_forest = Y_test_SMOTE


    forest_accuracy = classifier.score(X_test_SMOTE,Y_test_SMOTE)
    accuracy_list_forest.append(forest_accuracy)
    #print("Random Forest accuracy: %.2f%%" % (forest_accuracy * 100.0))

    forest_confusionMatrix = pd.DataFrame(confusion_matrix(Y_true_SMOTE_forest,Y_pred_SMOTE_forest), columns = ['Predicted 1', 'Predicted 2','Predicted 3'], index = ['Actual 1', 'Actual 2','Actual 3'])
    #print ('Confusion matrix of test data using random forest is: \n',forest_confusionMatrix )
    recall = recall_score(Y_true_SMOTE_forest, Y_pred_SMOTE_forest, average = None)
    recall_list_forest.append(np.array(recall))
    #print("Average recall for the 3 classes is - ", recall)
    
    precision = precision_score(Y_true_SMOTE_forest, Y_pred_SMOTE_forest, average = None)
    precision_list_forest.append(np.array(precision))
    #print("Average recall for the 3 classes using Logistic regression  is - ", recall_score(Y_true_SMOTE_forest,Y_pred_SMOTE_forest, average = None))
    #print("Average precision for the 3 classes using Logistic regression is - ", precision_score(Y_true_SMOTE_forest,Y_pred_SMOTE_forest, average = None))
    
    #xgboost
    import os
    os.environ['KMP_DUPLICATE_LIB_OK']='True'
    from xgboost import XGBClassifier

    model = XGBClassifier()
    model.fit(X_train_SMOTE, Y_train_SMOTE)

    Y_pred = model.predict(X_test_SMOTE)
    predictions = [round(value) for value in Y_pred]

    from sklearn.metrics import accuracy_score

    # evaluate predictions
    boost_accuracy = accuracy_score(Y_test_SMOTE, predictions)
    accuracy_list_boost.append(boost_accuracy)
    #print("Accuracy: %.2f%%" % (accuracy * 100.0)) 

    boost_confusionMatrix = pd.DataFrame(confusion_matrix(Y_true,Y_pred), columns = ['Predicted 1', 'Predicted 2','Predicted 3'], index = ['Actual 1', 'Actual 2','Actual 3'])
    #print ('Confusion matrix of test data using random forest is: \n',boost_confusionMatrix )
    
    recall = recall_score(Y_true_SMOTE, Y_pred, average = None)
    recall_list_boost.append(np.array(recall))
    precision = precision_score(Y_true_SMOTE, Y_pred, average = None)
    precision_list_boost.append(np.array(precision))

    
print(np.mean(accuracy_list_log))
print(np.mean(recall_list_log, axis = 0))
print(np.mean(precision_list_log, axis = 0))

print(np.mean(accuracy_list_forest))
print(np.mean(recall_list_forest, axis = 0))
print(np.mean(precision_list_forest, axis = 0))

print(np.mean(accuracy_list_boost))
print(np.mean(recall_list_boost, axis = 0))
print(np.mean(precision_list_boost, axis = 0))

0.33806428792501214
[0.45988791 0.17775793 0.65612071]
[0.50262513 0.53017184 0.13096915]
0.5430870584073927
[0.53738991 0.61067441 0.20077878]
[0.52568922 0.57672516 0.34751474]
0.5259539284608923
[0.34763811 0.77168379 0.        ]
[0.53724326 0.5218363  0.        ]


In [11]:
# Approach 4: ADASYN

# sm = ADASYN()
# X_resampled_ADASYN, Y_resampled_ADASYN = sm.fit_sample(X_train, Y_train)
# print(sorted(Counter(Y_resampled_ADASYN).items()))


# LogisticRegressionModel.fit(X_resampled_ADASYN,Y_resampled_ADASYN)
# print(LogisticRegressionModel.score(X_test,Y_test))

# Y_true = Y_test
# Y_pred = LogisticRegressionModel.predict(X_test)
# ConfusionMatrix = pd.DataFrame(confusion_matrix(Y_true,Y_pred), columns = ['Predicted 1', 'Predicted 2','Predicted 3'], index = ['Actual 1', 'Actual 2','Actual 3'])
# ConfusionMatrix
