In [1]:
import sklearn
import os
import numpy as np
import pandas as pd


In [2]:
# Upsample the minority class (brakes applied)
from sklearn.utils import resample

def upsample(input_df,col="Brake",majority_value=0):
    '''
    Creates a balanced data set from the dataframe provided to it, using the column_name
    '''    
    #Split by row based on the data class
    df_majority = input_df[input_df[col]==majority_value]
    df_minority = input_df[input_df[col]!=majority_value]
    
    df_minority_upsampled = resample(df_minority,
                                  replace=True,
                                  n_samples=df_majority.shape[0],
                                  random_state=444)
    
    df_upsampled = pd.concat([df_majority, df_minority_upsampled])

    #print("Before upsampling:\n",df.Brake.value_counts())
    #print("After upsampling:\n",df_upsampled.Brake.value_counts())
    #print(df_upsampled.describe())
    
    return df_upsampled


In [3]:
data_path = "data/2018-01-29.csv"

# Get our 3 target columns = accel in each direction plus boolean (1/0) for the class braking/not braking
df = pd.read_csv(data_path, sep=",",header=0)
df = df.iloc[:,3:] #don't use first two columns
df.head()

Unnamed: 0,X,Y,Z,Brake
0,13,59,67,0
1,12,61,66,0
2,14,61,65,0
3,16,60,64,0
4,15,60,64,0


In [4]:
from sklearn import metrics
from sklearn import cross_validation
from sklearn.cross_validation import KFold
from sklearn.metrics import confusion_matrix

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier


def fit_and_evaluate(df_data,model,label):
    
    #setup output variables
    scores={'precision':[],'recall':[],'accuracy':[], 'f1':[]}
    cm_list=[]
    cm=np.array([[0,0],[0,0]])
    
    #Perform cross validation training
    for train, test in KFold(df_data.shape[0], n_folds=12,shuffle=True):
        df_train, df_test = df_data.iloc[train], df_data.iloc[test]
        
        df_train_upsampled = upsample(df_train,col="Brake",majority_value=0)
        
        X_train, y_train = df_train_upsampled.iloc[:,:-1], df_train_upsampled.iloc[:,-1]
        X_test, y_test = df_test.iloc[:,:-1], df_test.iloc[:,-1]
        
        estimator = model()
        estimator.fit(X_train, y_train)
        
        expected  = y_test
        predicted = estimator.predict(X_test)
        
        
        
        
        # Append our scores to the tracker
        scores['precision'].append(metrics.precision_score(expected, predicted, average="weighted"))
        scores['recall'].append(metrics.recall_score(expected, predicted, average="weighted"))
        scores['accuracy'].append(metrics.accuracy_score(expected, predicted))
        scores['f1'].append(metrics.f1_score(expected, predicted, average="weighted"))
        
        cm_list.append(confusion_matrix(expected,predicted))
    
    for c in cm_list:
        cm = np.add(cm,cm_list[0])
    
    
    print(label)
    print('-----------------')
    print(pd.DataFrame(scores).mean())
    print(cm)
    
    return model,scores,cm



In [5]:
model, scores, cm = fit_and_evaluate(df,KNeighborsClassifier,"KN classifier")

KN classifier
-----------------
accuracy     0.758045
f1           0.794588
precision    0.863603
recall       0.758045
dtype: float64
[[30636  8580]
 [ 1932  3732]]


In [8]:
model, scores, cm = fit_and_evaluate(df,RandomForestClassifier,"RandomForestClassifier")

RandomForestClassifier
-----------------
accuracy     0.798672
f1           0.820543
precision    0.853771
recall       0.798672
dtype: float64
[[32604  6840]
 [ 2628  2808]]


In [9]:
model, scores, cm = fit_and_evaluate(df,SVC,"SVC")

SVC
-----------------
accuracy     0.788911
f1           0.818029
precision    0.871922
recall       0.788911
dtype: float64
[[31944  7272]
 [ 1884  3780]]
