In [93]:
import pandas as pd
import numpy as np
import datetime as dt

from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier


from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import VarianceThreshold
from yellowbrick.features import RFECV

from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")


In [94]:
tr_time = pd.read_csv("train_time_series.csv").drop(columns = ["timestamp","UTC time","accuracy"])
tr_lab = pd.read_csv("train_labels.csv").drop(columns = ["timestamp","UTC time"])

In [76]:
tr_time.rename(columns = {"Unnamed: 0":"measurement"}, inplace=True)
tr_lab.rename(columns = {"Unnamed: 0":"measurement"}, inplace=True)

df = tr_time.merge(tr_lab,on="measurement")  #remove / move??

In [77]:
tr_time.tail()

Unnamed: 0,measurement,x,y,z
3739,24325,0.024384,-0.710709,0.030304
3740,24326,0.487228,-1.099136,-0.015213
3741,24327,0.369446,-0.968506,0.036713
3742,24328,0.167877,-0.802826,0.049805
3743,24329,0.689346,-0.991043,0.034973


In [78]:
tr_lab

Unnamed: 0,measurement,label
0,20589,1
1,20599,1
2,20609,1
3,20619,1
4,20629,1
...,...,...
370,24289,4
371,24299,4
372,24309,4
373,24319,4


In [86]:
#Standardise X,Y,Z accels
std = preprocessing.StandardScaler()
for k in ["x", "y", "z"]:
    tr_time[k] = std.fit_transform(tr_time[k].values.reshape(-1,1))
tr_time.tail()

Unnamed: 0,measurement,x,y,z
3739,24325,-0.433313,0.652926,-0.15432
3740,24326,0.736537,-0.208107,-0.284082
3741,24327,0.438839,0.081464,-0.136049
3742,24328,-0.07063,0.448729,-0.098726
3743,24329,1.247394,0.031505,-0.141008


In [87]:
def accel_mag(accels):
    """takes an array containing x,y,z accelarations, calculates their magnitude"""
    return np.sqrt(np.sum(accels**2))


In [88]:
#if the above doesn't work aggregate the x's, the y's, the z's togehter - maybe use a sum or mean? then use x, y,z as input X for machine learning
previous = 0

for i in tr_lab.measurement:
    boolinds = pd.Series((tr_time.measurement.values <= i) & (tr_time.measurement.values > previous))

    step = tr_time[boolinds.values]

    #calculate some values of x, y, z from the step

    sums = []
    means = []
    abs_sums = []
    abs_means = []


    for j in ["x", "y", "z"]:
        accel_sum = np.sum(step[j].values)
        sums.append(accel_sum)
        accel_mean = np.mean(step[j].values)
        means.append(accel_mean)
        abs_sum = np.sum(np.absolute(step[j].values))
        abs_sums.append(abs_sum)
        abs_mean = np.mean(np.absolute(step[j].values))
        abs_means.append(abs_mean)

    tr_lab.loc[tr_lab.measurement == i, "x_sum"] = sums[0]
    tr_lab.loc[tr_lab.measurement == i, "y_sum"] = sums[1]
    tr_lab.loc[tr_lab.measurement == i, "z_sum"] = sums[2]
    tr_lab.loc[tr_lab.measurement == i, "x_mean"] = means[0]
    tr_lab.loc[tr_lab.measurement == i, "y_mean"] = means[1]
    tr_lab.loc[tr_lab.measurement == i, "z_mean"] = means[2]
    tr_lab.loc[tr_lab.measurement == i, "x_abs_sum"] = abs_sums[0]
    tr_lab.loc[tr_lab.measurement == i, "y_abs_sum"] = abs_sums[1]
    tr_lab.loc[tr_lab.measurement == i, "z_abs_sum"] = abs_sums[2]
    tr_lab.loc[tr_lab.measurement == i, "x_abs_mean"] = abs_means[0]
    tr_lab.loc[tr_lab.measurement == i, "y_abs_mean"] = abs_means[1]
    tr_lab.loc[tr_lab.measurement == i, "z_abs_mean"] = abs_means[2]


    #calculate magnitude of acceleration for the step - maybe delete later    
    accels = step.loc[:,("x","y","z")]
    mags = accels.apply(accel_mag, axis="columns")
    tr_lab.loc[tr_lab.measurement == i, "sum_accel_mag"] = np.sum(mags)
    tr_lab.loc[tr_lab.measurement == i, "mean_accel_mag"] = np.mean(mags)    

    previous = i

In [89]:
classification_target = 'label'
all_covariates = ['x_abs_sum', 'y_abs_sum', 'z_abs_sum', 'sum_accel_mag']

X_train, X_test, y_train, y_test = train_test_split(tr_lab[all_covariates], tr_lab[classification_target])


In [90]:
rfc = RandomForestClassifier()


parameters = {
    "n_estimators":[5,10,50,100,250],
    "max_depth":[2,4,8,16,32,None]   
}

cv = GridSearchCV(rfc,parameters,cv=5)
cv.fit(X_train,y_train)
best_params = cv.best_params_ 
best_params

{'max_depth': 4, 'n_estimators': 250}

In [91]:
#initialise the random forest model with best hyperparameters and train
rfc = RandomForestClassifier(max_depth = best_params['max_depth'], n_estimators = best_params['n_estimators'])
rfc.fit(new_covariates, classification_outcome)

RandomForestClassifier(max_depth=4, n_estimators=250)

In [92]:
print("Accuracy on training set: {:.3f}".format(rfc.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(rfc.score(X_test, y_test)))

Accuracy on training set: 0.776
Accuracy on test set: 0.723
