# Imports

In [5]:
import pandas as pd
import psycopg2

%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn import neighbors, datasets
from matplotlib.colors import ListedColormap

# Read Data Set

In [6]:
df = pd.read_csv("mergeData.csv")
df.head()

Unnamed: 0,speed_total_mean,steering_total_mean,brake_total_mean,throttle_total_mean,acceleration_total_mean,speed_total_var,steering_total_var,brake_total_var,throttle_total_var,acceleration_total_var,total_time,distancePed,max_speed,hadCollision
0,5.919151,0.503649,0.965743,0.820576,0.030731,13.796202,0.000655,0.014468,0.028719,0.03937,15.405173,89.99245,11.669766,0
1,7.580378,0.499771,0.891302,0.878839,-0.026652,31.451253,0.000345,0.058767,0.010391,0.06348,11.412381,85.06386,13.49971,0
2,9.474048,0.494557,0.952182,0.781126,0.006292,53.873833,0.001231,0.022506,0.045416,0.106281,102.356492,789.2128,25.851397,1
3,3.398595,0.523305,0.960227,0.858795,0.027363,5.036717,0.000101,0.018643,0.07586,0.027735,14.3151,47.977978,10.266865,0
4,11.669419,0.500661,0.891913,0.522365,0.008028,47.209285,0.000396,0.055982,0.112551,0.159198,7.505478,88.01161,20.05507,0


### Distribution

In [7]:
num_obs = len(df)
num_true = len(df.loc[df['hadCollision'] == 1])
num_false = len(df.loc[df['hadCollision'] == 0])
print("Number of True cases:  {0} ({1:2.2f}%)".format(num_true, (num_true/num_obs) * 100))
print("Number of False cases: {0} ({1:2.2f}%)".format(num_false, (num_false/num_obs) * 100))

Number of True cases:  61 (9.31%)
Number of False cases: 594 (90.69%)


## Split data set

In [17]:
from sklearn.model_selection import train_test_split

data = df.copy()
X = data.drop('hadCollision', axis=1) 
Y = data['hadCollision']

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=1, stratify=Y)

## KNN classifier (n = 3)

In [35]:
from sklearn.neighbors import KNeighborsClassifier
# Create KNN classifier
knn = KNeighborsClassifier(n_neighbors = 3)
# Fit the classifier to the data
knn.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

In [36]:
#check accuracy of our model on the test data
from sklearn import metrics

print(knn.score(X_test, y_test))
print("Confusion Matrix")
print("{0}".format(metrics.confusion_matrix(y_test, knn.predict(X_test))))
print("")

print("Classification Report")
print(metrics.classification_report(y_test, knn.predict(X_test)))

0.9007633587786259
Confusion Matrix
[[117   2]
 [ 11   1]]

Classification Report
             precision    recall  f1-score   support

          0       0.91      0.98      0.95       119
          1       0.33      0.08      0.13        12

avg / total       0.86      0.90      0.87       131



Baja certeza para casos verdaderos de choque

## KNN classifier (n = 5)

In [39]:
from sklearn.neighbors import KNeighborsClassifier
# Create KNN classifier
knn = KNeighborsClassifier(n_neighbors = 5)
# Fit the classifier to the data
knn.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [40]:
print(knn.score(X_test, y_test))
print("Confusion Matrix")
print("{0}".format(metrics.confusion_matrix(y_test, knn.predict(X_test))))
print("")

print("Classification Report")
print(metrics.classification_report(y_test, knn.predict(X_test)))

0.9083969465648855
Confusion Matrix
[[119   0]
 [ 12   0]]

Classification Report
             precision    recall  f1-score   support

          0       0.91      1.00      0.95       119
          1       0.00      0.00      0.00        12

avg / total       0.83      0.91      0.86       131



  'precision', 'predicted', average, warn_for)


Nula certeza para casos verdaderos de choque

## KNN with cv score

In [25]:
from sklearn.model_selection import cross_val_score
import numpy as np
#create a new KNN model
knn_cv = KNeighborsClassifier(n_neighbors=3)
#train model with cv of 5 
cv_scores = cross_val_score(knn_cv, X, Y, cv=5)
#print each cv score (accuracy) and average them
print(cv_scores)
print('cv_scores mean:{}'.format(np.mean(cv_scores)))

[0.91666667 0.90839695 0.91603053 0.91603053 0.90769231]
cv_scores mean:0.9129633979252301


## GridSearchCV n neighbors

In [28]:
from sklearn.model_selection import GridSearchCV
#create new a knn model
knn2 = KNeighborsClassifier()
#create a dictionary of all values we want to test for n_neighbors
param_grid = {'n_neighbors': np.arange(1, 25)}
#use gridsearch to test all values for n_neighbors
knn_gscv = GridSearchCV(knn2, param_grid, cv=5)
#fit model to data
knn_gscv.fit(X, Y)

GridSearchCV(cv=5, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [29]:
#check top performing n_neighbors value
knn_gscv.best_params_

{'n_neighbors': 2}

In [30]:
#check mean score for the top performing value of n_neighbors
knn_gscv.best_score_

0.917557251908397

## KNN classifier (n = 2)

In [41]:
from sklearn.neighbors import KNeighborsClassifier
# Create KNN classifier
knn = KNeighborsClassifier(n_neighbors = 2)
# Fit the classifier to the data
knn.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=2, p=2,
           weights='uniform')

In [42]:
print(knn.score(X_test, y_test))
print("Confusion Matrix")
print("{0}".format(metrics.confusion_matrix(y_test, knn.predict(X_test))))
print("")

print("Classification Report")
print(metrics.classification_report(y_test, knn.predict(X_test)))

0.9083969465648855
Confusion Matrix
[[118   1]
 [ 11   1]]

Classification Report
             precision    recall  f1-score   support

          0       0.91      0.99      0.95       119
          1       0.50      0.08      0.14        12

avg / total       0.88      0.91      0.88       131



Resultados parecidos a n = 3