# Imports

In [1]:
import pandas as pd
import psycopg2

%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn import neighbors, datasets
from matplotlib.colors import ListedColormap

# Read Data Set

In [2]:
df = pd.read_csv("mergeData_full_v7.csv")
df.head()

Unnamed: 0,pedId,speed_total_mean,steering_total_mean,brake_total_mean,throttle_total_mean,acceleration_total_mean,speed_total_var,steering_total_var,brake_total_var,throttle_total_var,...,gender,age,job,LatestDegree,oftenDrive,accidents,yearsDriving,lastDrive(years),speedingTickets,hadCollision
0,1.0,5.919151,0.503649,0.965743,0.820576,0.030731,13.796202,0.000655,0.014468,0.028719,...,0,32,0,0,2,0,10,1.0,0,0
1,3.0,7.580378,0.499771,0.891302,0.878839,-0.026652,31.451253,0.000345,0.058767,0.010391,...,0,32,0,0,2,0,10,1.0,0,0
2,15.0,9.474048,0.494557,0.952182,0.781126,0.006292,53.873833,0.001231,0.022506,0.045416,...,0,32,0,0,2,0,10,1.0,0,1
3,2.0,11.669419,0.500661,0.891913,0.522365,0.008028,47.209285,0.000396,0.055982,0.112551,...,0,32,0,0,2,0,10,1.0,0,0
4,4.0,12.187044,0.499769,0.861132,0.55812,0.001881,42.031423,0.00043,0.102442,0.079023,...,0,32,0,0,2,0,10,1.0,0,0


### Distribution

In [3]:
num_obs = len(df)
num_true = len(df.loc[df['hadCollision'] == 1])
num_false = len(df.loc[df['hadCollision'] == 0])
print("Number of True cases:  {0} ({1:2.2f}%)".format(num_true, (num_true/num_obs) * 100))
print("Number of False cases: {0} ({1:2.2f}%)".format(num_false, (num_false/num_obs) * 100))

Number of True cases:  54 (9.66%)
Number of False cases: 505 (90.34%)


## Split data set

In [4]:
from sklearn.model_selection import train_test_split

data = df.copy()
X = data.drop('hadCollision', axis=1) 
Y = data['hadCollision']

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=1, stratify=Y)

In [5]:
print("Training True  : {0} ({1:0.2f}%)".format(len(y_train[y_train[:] == 1]), (len(y_train[y_train[:] == 1])/len(y_train) * 100.0)))
print("Training False : {0} ({1:0.2f}%)".format(len(y_train[y_train[:] == 0]), (len(y_train[y_train[:] == 0])/len(y_train) * 100.0)))
print("")
print("Test True      : {0} ({1:0.2f}%)".format(len(y_test[y_test[:] == 1]), (len(y_test[y_test[:] == 1])/len(y_test) * 100.0)))
print("Test False     : {0} ({1:0.2f}%)".format(len(y_test[y_test[:] == 0]), (len(y_test[y_test[:] == 0])/len(y_test) * 100.0)))

Training True  : 32 (9.55%)
Training False : 303 (90.45%)

Test True      : 22 (9.82%)
Test False     : 202 (90.18%)


# Over sampling data

## Random Over Sampler

In [6]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_sample(X_train, y_train)

print("Training True  : {0} ({1:0.2f}%)".format(len(y_resampled[y_resampled[:] == 1]), (len(y_resampled[y_resampled[:] == 1])/len(y_resampled) * 100.0)))
print("Training False : {0} ({1:0.2f}%)".format(len(y_resampled[y_resampled[:] == 0]), (len(y_resampled[y_resampled[:] == 0])/len(y_resampled) * 100.0)))


Training True  : 303 (50.00%)
Training False : 303 (50.00%)


## Random forest

In [7]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(random_state=42, n_estimators=10, class_weight={0:1, 1:10000})      # Create random forest object
rf_model.fit(X_resampled, y_resampled.ravel())

RandomForestClassifier(bootstrap=True, class_weight={0: 1, 1: 10000},
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=42,
            verbose=0, warm_start=False)

### Predict Test Data

In [8]:
from sklearn import metrics

rf_predict_test = rf_model.predict(X_test)

# training metrics
print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, rf_predict_test)))

Accuracy: 0.9107


In [9]:
print(metrics.confusion_matrix(y_test, rf_predict_test) )
print("")
print("Classification Report")
print(metrics.classification_report(y_test, rf_predict_test))

[[200   2]
 [ 18   4]]

Classification Report
             precision    recall  f1-score   support

          0       0.92      0.99      0.95       202
          1       0.67      0.18      0.29        22

avg / total       0.89      0.91      0.89       224



## SMOT

In [10]:
from imblearn.over_sampling import SMOTE, ADASYN
X_resampled, y_resampled = SMOTE().fit_sample(X_train, y_train)

print("Training True  : {0} ({1:0.2f}%)".format(len(y_resampled[y_resampled[:] == 1]), (len(y_resampled[y_resampled[:] == 1])/len(y_resampled) * 100.0)))
print("Training False : {0} ({1:0.2f}%)".format(len(y_resampled[y_resampled[:] == 0]), (len(y_resampled[y_resampled[:] == 0])/len(y_resampled) * 100.0)))

Training True  : 303 (50.00%)
Training False : 303 (50.00%)


## Random forest

In [11]:
rf_model = RandomForestClassifier(random_state=42, n_estimators=10, class_weight={0:.99, 1:.01})      # Create random forest object
rf_model.fit(X_resampled, y_resampled.ravel())

RandomForestClassifier(bootstrap=True, class_weight={0: 0.99, 1: 0.01},
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=42,
            verbose=0, warm_start=False)

### Predict Test Data

In [12]:
rf_predict_test = rf_model.predict(X_test)

# training metrics
print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, rf_predict_test)))

Accuracy: 0.9062


In [13]:
print(metrics.confusion_matrix(y_test, rf_predict_test) )
print("")
print("Classification Report")
print(metrics.classification_report(y_test, rf_predict_test))

[[188  14]
 [  7  15]]

Classification Report
             precision    recall  f1-score   support

          0       0.96      0.93      0.95       202
          1       0.52      0.68      0.59        22

avg / total       0.92      0.91      0.91       224



## ADASYN

In [22]:
from imblearn.over_sampling import SMOTE, ADASYN

X_resampled, y_resampled = ADASYN().fit_sample(X_train, y_train)

print("Training True  : {0} ({1:0.2f}%)".format(len(y_resampled[y_resampled[:] == 1]), (len(y_resampled[y_resampled[:] == 1])/len(y_resampled) * 100.0)))
print("Training False : {0} ({1:0.2f}%)".format(len(y_resampled[y_resampled[:] == 0]), (len(y_resampled[y_resampled[:] == 0])/len(y_resampled) * 100.0)))

Training True  : 304 (50.08%)
Training False : 303 (49.92%)


## Random forest

In [23]:
rf_model = RandomForestClassifier(random_state=42, n_estimators=10, class_weight={0:10, 1:1})      # Create random forest object
rf_model.fit(X_resampled, y_resampled.ravel())

RandomForestClassifier(bootstrap=True, class_weight={0: 10, 1: 1},
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=42,
            verbose=0, warm_start=False)

### Predict Test Data

In [24]:
rf_predict_test = rf_model.predict(X_test)

# training metrics
print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, rf_predict_test)))

Accuracy: 0.8884


In [25]:
print(metrics.confusion_matrix(y_test, rf_predict_test) )
print("")
print("Classification Report")
print(metrics.classification_report(y_test, rf_predict_test))

[[183  19]
 [  6  16]]

Classification Report
             precision    recall  f1-score   support

          0       0.97      0.91      0.94       202
          1       0.46      0.73      0.56        22

avg / total       0.92      0.89      0.90       224

