# Imports

In [9]:
import pandas as pd
import psycopg2

%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn import neighbors, datasets
from matplotlib.colors import ListedColormap

# Read Data Set

In [10]:
df = pd.read_csv("mergeData_v6_with_scenario_data.csv")
df.head()

Unnamed: 0,speed_total_mean,steering_total_mean,brake_total_mean,throttle_total_mean,acceleration_total_mean,speed_total_var,steering_total_var,brake_total_var,throttle_total_var,acceleration_total_var,...,distancePed,max_speed,PKE,PKE_Steering,speed_react,reaction_time,pedestriansLevel,visibilityLevel,trafficLevel,hadCollision
0,5.919151,0.503649,0.965743,0.820576,0.030731,13.796202,0.000655,0.014468,0.028719,0.03937,...,89.99245,11.669766,1.93229,-0.00015,7.75488,1.048791,0,1,0,0
1,7.580378,0.499771,0.891302,0.878839,-0.026652,31.451253,0.000345,0.058767,0.010391,0.06348,...,85.06386,13.49971,0.878493,0.000274,13.472353,2.106615,0,1,0,0
2,9.474048,0.494557,0.952182,0.781126,0.006292,53.873833,0.001231,0.022506,0.045416,0.106281,...,789.2128,25.851397,2.857169,0.000108,25.585112,0.079211,0,1,0,1
3,11.669419,0.500661,0.891913,0.522365,0.008028,47.209285,0.000396,0.055982,0.112551,0.159198,...,88.01161,20.05507,2.969647,-0.000258,19.412087,1.161592,2,1,0,0
4,12.187044,0.499769,0.861132,0.55812,0.001881,42.031423,0.00043,0.102442,0.079023,0.158822,...,105.973686,19.697004,4.033468,6.6e-05,18.461056,1.275896,2,1,0,0


### Distribution

In [11]:
num_obs = len(df)
num_true = len(df.loc[df['hadCollision'] == 1])
num_false = len(df.loc[df['hadCollision'] == 0])
print("Number of True cases:  {0} ({1:2.2f}%)".format(num_true, (num_true/num_obs) * 100))
print("Number of False cases: {0} ({1:2.2f}%)".format(num_false, (num_false/num_obs) * 100))

Number of True cases:  54 (9.66%)
Number of False cases: 505 (90.34%)


## Split data set

In [12]:
from sklearn.model_selection import train_test_split

data = df.copy()
X = data.drop('hadCollision', axis=1) 
Y = data['hadCollision']

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.5, random_state=1, stratify=Y)

In [13]:
print("Training True  : {0} ({1:0.2f}%)".format(len(y_train[y_train[:] == 1]), (len(y_train[y_train[:] == 1])/len(y_train) * 100.0)))
print("Training False : {0} ({1:0.2f}%)".format(len(y_train[y_train[:] == 0]), (len(y_train[y_train[:] == 0])/len(y_train) * 100.0)))
print("")
print("Test True      : {0} ({1:0.2f}%)".format(len(y_test[y_test[:] == 1]), (len(y_test[y_test[:] == 1])/len(y_test) * 100.0)))
print("Test False     : {0} ({1:0.2f}%)".format(len(y_test[y_test[:] == 0]), (len(y_test[y_test[:] == 0])/len(y_test) * 100.0)))

Training True  : 27 (9.68%)
Training False : 252 (90.32%)

Test True      : 27 (9.64%)
Test False     : 253 (90.36%)


# Over sampling data

## Random Over Sampler

In [14]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_sample(X_train, y_train)

print("Training True  : {0} ({1:0.2f}%)".format(len(y_resampled[y_resampled[:] == 1]), (len(y_resampled[y_resampled[:] == 1])/len(y_resampled) * 100.0)))
print("Training False : {0} ({1:0.2f}%)".format(len(y_resampled[y_resampled[:] == 0]), (len(y_resampled[y_resampled[:] == 0])/len(y_resampled) * 100.0)))


Training True  : 252 (50.00%)
Training False : 252 (50.00%)


## Random forest

In [15]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(random_state=42, n_estimators=10)      # Create random forest object
rf_model.fit(X_resampled, y_resampled.ravel())

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

### Predict Test Data

In [16]:
from sklearn import metrics

rf_predict_test = rf_model.predict(X_test)

# training metrics
print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, rf_predict_test)))

Accuracy: 0.9036


In [17]:
print(metrics.confusion_matrix(y_test, rf_predict_test) )
print("")
print("Classification Report")
print(metrics.classification_report(y_test, rf_predict_test))

[[247   6]
 [ 21   6]]

Classification Report
             precision    recall  f1-score   support

          0       0.92      0.98      0.95       253
          1       0.50      0.22      0.31        27

avg / total       0.88      0.90      0.89       280



## SMOT

In [18]:
from imblearn.over_sampling import SMOTE, ADASYN
X_resampled, y_resampled = SMOTE().fit_sample(X_train, y_train)

print("Training True  : {0} ({1:0.2f}%)".format(len(y_resampled[y_resampled[:] == 1]), (len(y_resampled[y_resampled[:] == 1])/len(y_resampled) * 100.0)))
print("Training False : {0} ({1:0.2f}%)".format(len(y_resampled[y_resampled[:] == 0]), (len(y_resampled[y_resampled[:] == 0])/len(y_resampled) * 100.0)))

Training True  : 252 (50.00%)
Training False : 252 (50.00%)


## Random forest

In [19]:
rf_model = RandomForestClassifier(random_state=42, n_estimators=10)      # Create random forest object
rf_model.fit(X_resampled, y_resampled.ravel())

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

### Predict Test Data

In [20]:
rf_predict_test = rf_model.predict(X_test)

# training metrics
print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, rf_predict_test)))

Accuracy: 0.8964


In [21]:
print(metrics.confusion_matrix(y_test, rf_predict_test) )
print("")
print("Classification Report")
print(metrics.classification_report(y_test, rf_predict_test))

[[240  13]
 [ 16  11]]

Classification Report
             precision    recall  f1-score   support

          0       0.94      0.95      0.94       253
          1       0.46      0.41      0.43        27

avg / total       0.89      0.90      0.89       280



## ADASYN

In [37]:
from imblearn.over_sampling import SMOTE, ADASYN

X_resampled, y_resampled = ADASYN().fit_sample(X_train, y_train)

print("Training True  : {0} ({1:0.2f}%)".format(len(y_resampled[y_resampled[:] == 1]), (len(y_resampled[y_resampled[:] == 1])/len(y_resampled) * 100.0)))
print("Training False : {0} ({1:0.2f}%)".format(len(y_resampled[y_resampled[:] == 0]), (len(y_resampled[y_resampled[:] == 0])/len(y_resampled) * 100.0)))

Training True  : 254 (50.20%)
Training False : 252 (49.80%)


## Random forest

In [38]:
rf_model = RandomForestClassifier(random_state=42, n_estimators=10)      # Create random forest object
rf_model.fit(X_resampled, y_resampled.ravel())

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

### Predict Test Data

In [39]:
rf_predict_test = rf_model.predict(X_test)

# training metrics
print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, rf_predict_test)))

Accuracy: 0.8821


In [40]:
print(metrics.confusion_matrix(y_test, rf_predict_test) )
print("")
print("Classification Report")
print(metrics.classification_report(y_test, rf_predict_test))

[[234  19]
 [ 14  13]]

Classification Report
             precision    recall  f1-score   support

          0       0.94      0.92      0.93       253
          1       0.41      0.48      0.44        27

avg / total       0.89      0.88      0.89       280

