# Imports

In [4]:
import pandas as pd
import psycopg2

%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn import neighbors, datasets
from matplotlib.colors import ListedColormap

# Read Data Set

In [5]:
df = pd.read_csv("mergeData_v4.csv")
df.head()

Unnamed: 0,speed_total_mean,steering_total_mean,brake_total_mean,throttle_total_mean,acceleration_total_mean,speed_total_var,steering_total_var,brake_total_var,throttle_total_var,acceleration_total_var,total_time,distancePed,max_speed,PKE,PKE_Steering,speed_react,reaction_time,hadCollision
0,5.919151,0.503649,0.965743,0.820576,0.030731,13.796202,0.000655,0.014468,0.028719,0.03937,15.405173,89.99245,11.669766,1.93229,-0.00015,7.75488,1.048791,0
1,7.580378,0.499771,0.891302,0.878839,-0.026652,31.451253,0.000345,0.058767,0.010391,0.06348,11.412381,85.06386,13.49971,0.878493,0.000274,13.472353,2.106615,0
2,9.474048,0.494557,0.952182,0.781126,0.006292,53.873833,0.001231,0.022506,0.045416,0.106281,102.356492,789.2128,25.851397,2.857169,0.000108,25.585112,0.079211,1
3,11.669419,0.500661,0.891913,0.522365,0.008028,47.209285,0.000396,0.055982,0.112551,0.159198,7.505478,88.01161,20.05507,2.969647,-0.000258,19.412087,1.161592,0
4,12.187044,0.499769,0.861132,0.55812,0.001881,42.031423,0.00043,0.102442,0.079023,0.158822,8.681609,105.973686,19.697004,4.033468,6.6e-05,18.461056,1.275896,0


### Distribution

In [6]:
num_obs = len(df)
num_true = len(df.loc[df['hadCollision'] == 1])
num_false = len(df.loc[df['hadCollision'] == 0])
print("Number of True cases:  {0} ({1:2.2f}%)".format(num_true, (num_true/num_obs) * 100))
print("Number of False cases: {0} ({1:2.2f}%)".format(num_false, (num_false/num_obs) * 100))

Number of True cases:  54 (9.66%)
Number of False cases: 505 (90.34%)


## Split data set

In [7]:
from sklearn.model_selection import train_test_split

data = df.copy()
X = data.drop('hadCollision', axis=1) 
Y = data['hadCollision']

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=1, stratify=Y)

In [8]:
print("Training True  : {0} ({1:0.2f}%)".format(len(y_train[y_train[:] == 1]), (len(y_train[y_train[:] == 1])/len(y_train) * 100.0)))
print("Training False : {0} ({1:0.2f}%)".format(len(y_train[y_train[:] == 0]), (len(y_train[y_train[:] == 0])/len(y_train) * 100.0)))
print("")
print("Test True      : {0} ({1:0.2f}%)".format(len(y_test[y_test[:] == 1]), (len(y_test[y_test[:] == 1])/len(y_test) * 100.0)))
print("Test False     : {0} ({1:0.2f}%)".format(len(y_test[y_test[:] == 0]), (len(y_test[y_test[:] == 0])/len(y_test) * 100.0)))

Training True  : 32 (9.55%)
Training False : 303 (90.45%)

Test True      : 22 (9.82%)
Test False     : 202 (90.18%)


## SGD Classifier

### Default set up

In [9]:
import numpy as np
from sklearn import linear_model

clf = linear_model.SGDClassifier(max_iter=1000, tol=1e-3)
clf.fit(X_train, y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=1000, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=0.001, verbose=0, warm_start=False)

### Predict Test Data

In [10]:
from sklearn import metrics


rf_predict_test = clf.predict(X_test)

# training metrics
print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, rf_predict_test)))

Accuracy: 0.1339


In [11]:
print(metrics.confusion_matrix(y_test, rf_predict_test) )
print("")
print("Classification Report")
print(metrics.classification_report(y_test, rf_predict_test))

[[  8 194]
 [  0  22]]

Classification Report
             precision    recall  f1-score   support

          0       1.00      0.04      0.08       202
          1       0.10      1.00      0.18        22

avg / total       0.91      0.13      0.09       224



## SGD with class weights

In [20]:
class_weights = {1: .9, 0: .1}


clf = linear_model.SGDClassifier(max_iter=1000, tol=1e-3, class_weight=class_weights)
clf.fit(X_train, y_train)


SGDClassifier(alpha=0.0001, average=False, class_weight={1: 0.9, 0: 0.1},
       epsilon=0.1, eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=1000, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=0.001, verbose=0, warm_start=False)

In [21]:
### Predict Test Data

from sklearn import metrics


rf_predict_test = clf.predict(X_test)

# training metrics
print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, rf_predict_test)))

print(metrics.confusion_matrix(y_test, rf_predict_test) )
print("")
print("Classification Report")
print(metrics.classification_report(y_test, rf_predict_test))

Accuracy: 0.8929
[[187  15]
 [  9  13]]

Classification Report
             precision    recall  f1-score   support

          0       0.95      0.93      0.94       202
          1       0.46      0.59      0.52        22

avg / total       0.91      0.89      0.90       224

