In [1]:
import sklearn
import os
import numpy as np
import pandas as pd


# Our evaluation functions

In [2]:
# Upsample the minority class (brakes applied)
from sklearn.utils import resample

def upsample(input_df,col="Brake",majority_value=0):
    '''
    Creates a balanced data set from the dataframe provided to it by upsampling the
    minority class, using col as the column_name of classes to be balanced
    '''    
    #Split by row based on the data class
    df_majority = input_df[input_df[col]==majority_value]
    df_minority = input_df[input_df[col]!=majority_value]
    
    df_minority_upsampled = resample(df_minority,
                                  replace=True,
                                  n_samples=df_majority.shape[0],
                                  random_state=444)
    
    df_upsampled = pd.concat([df_majority, df_minority_upsampled])

    #print("Before upsampling:\n",df.Brake.value_counts())
    #print("After upsampling:\n",df_upsampled.Brake.value_counts())
    #print(df_upsampled.describe())
    
    return df_upsampled


In [3]:
from sklearn import metrics
from sklearn import cross_validation
from sklearn.cross_validation import KFold
from sklearn.metrics import confusion_matrix

def fit_and_evaluate(df_data,model,label):
    """
    Performs K-fold cross validation to create our evaluation scores, and then retrains the model
    on the entire data set. 
    
    df_data: a dataframe of the data to be modeled, with 'y' as the last column
    model: the sklearn model class that we want to create a new instance of
    label: string printed above the output; not stored in any way
    """
    
    #setup output variables
    scores={'precision':[],'recall':[],'accuracy':[], 'f1':[]}
    cm_list=[]
    cm=np.array([[0,0],[0,0]])
    
    #Perform cross validation training
    for train, test in KFold(df_data.shape[0], n_folds=12,shuffle=True):
        df_train, df_test = df_data.iloc[train], df_data.iloc[test]
        
        df_train_upsampled = upsample(df_train,col="Brake",majority_value=0)
        
        X_train, y_train = df_train_upsampled.iloc[:,:-1], df_train_upsampled.iloc[:,-1]
        X_test, y_test = df_test.iloc[:,:-1], df_test.iloc[:,-1]
        
        estimator = model()
        estimator.fit(X_train, y_train)
        
        expected  = y_test
        predicted = estimator.predict(X_test)
        
        # Append our scores to the tracker
        scores['precision'].append(metrics.precision_score(expected, predicted, average="weighted"))
        scores['recall'].append(metrics.recall_score(expected, predicted, average="weighted"))
        scores['accuracy'].append(metrics.accuracy_score(expected, predicted))
        scores['f1'].append(metrics.f1_score(expected, predicted, average="weighted"))
        
        cm_list.append(confusion_matrix(expected,predicted))
    
    for c in cm_list:
        cm = np.add(cm,cm_list[0])
    
    # Retrain the model on the whole data set
    estimator = model()
    df_train_upsampled = upsample(df_train,col="Brake",majority_value=0)
    estimator.fit(df_train_upsampled.iloc[:,:-1], df_train_upsampled.iloc[:,-1])
    
    print(label)
    print('-----------------')
    print(pd.DataFrame(scores).mean())
    print(cm)
    
    return estimator,scores,cm



In [4]:
def create_comparison_file(estimator,output_name):
    """
    Takes a fitted model, runs it on our unseen data set, and outputs the resulting predictions to a file.
    """
    test_data_path = 'data/2018-01-31.csv'
    test_df = pd.read_csv(test_data_path, sep=',', header=0)
    test_df = test_df.iloc[:,3:]
    
    test_X = test_df.iloc[:,:-1]
    test_y_actual = test_df.iloc[:,-1]
    
    test_y_predicted = estimator.predict(test_X)
    
    print(confusion_matrix(test_y_actual,test_y_predicted))
    
    test_comparison_df = pd.DataFrame({'actual':test_y_actual,'predicted':test_y_predicted})
    output_path = 'outputs/' + output_name
    test_comparison_df.to_csv(output_path);

# Get the actual data


In [5]:
data_path = "data/2018-01-29.csv"

# Get our 3 target columns = accel in each direction plus boolean (1/0) for the class braking/not braking
df = pd.read_csv(data_path, sep=",",header=0)
df = df.iloc[:,3:] #don't use first two columns
df.head()

Unnamed: 0,X,Y,Z,Brake
0,13,59,67,0
1,12,61,66,0
2,14,61,65,0
3,16,60,64,0
4,15,60,64,0


# Train various models and compare them to a separate dataset

In [6]:
from sklearn.neighbors import KNeighborsClassifier

kn_model, scores, cm = fit_and_evaluate(df,KNeighborsClassifier,"KN classifier")
create_comparison_file(kn_model,'kn_model_comparison.csv')

KN classifier
-----------------
accuracy     0.761834
f1           0.797487
precision    0.864764
recall       0.761834
dtype: float64
[[30900  8508]
 [ 2040  3432]]
[[8245 2972]
 [ 316  629]]


In [7]:
from sklearn.ensemble import RandomForestClassifier

rf_estimator, scores, cm = fit_and_evaluate(df,RandomForestClassifier,"Random Forest")
create_comparison_file(rf_estimator,'rf_model_comparison.csv')

Random Forest
-----------------
accuracy     0.799897
f1           0.821383
precision    0.853915
recall       0.799897
dtype: float64
[[33132  6228]
 [ 2448  3072]]
[[9079 2138]
 [ 469  476]]


In [9]:
from sklearn.svm import SVC
svc_estimator, scores, cm = fit_and_evaluate(df,SVC,"SVM")
create_comparison_file(svc_estimator,'svc_model_comparison.csv')

SVM
-----------------
accuracy     0.788020
f1           0.817363
precision    0.871732
recall       0.788020
dtype: float64
[[31728  7728]
 [ 1956  3468]]
[[8755 2462]
 [ 330  615]]
