In [2]:
import pandas as pd
import numpy as np
import datetime as dt

from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import VarianceThreshold

from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): with no category or module given, this also hides NumPy,
# pandas and scikit-learn warnings raised by later cells — consider
# narrowing the filter to the specific warnings that are known to be noise.
warnings.filterwarnings("ignore")


In [3]:
# Read the two training CSVs, then drop the columns that the later cells do
# not use (time stamps for both tables, plus "accuracy" for the time series).
time_series_raw = pd.read_csv("train_time_series.csv")
labels_raw = pd.read_csv("train_labels.csv")

tr_time = time_series_raw.drop(columns=["timestamp", "UTC time", "accuracy"])
tr_lab = labels_raw.drop(columns=["timestamp", "UTC time"])

In [4]:
# "Unnamed: 0" (the name pandas gives a CSV column with an empty header) is
# the key both tables are joined on, so give it a meaningful name.
# The frames are reassigned instead of renamed with inplace=True: the cell
# stays re-runnable and no previously displayed object is mutated.
measurement_rename = {"Unnamed: 0": "measurement"}
tr_time = tr_time.rename(columns=measurement_rename)
tr_lab = tr_lab.rename(columns=measurement_rename)

# Samples joined to their label; only measurements present in both tables
# survive the (default inner) merge.
# TODO: no other visible cell reads `df` — remove it or move it to where it is used.
df = tr_time.merge(tr_lab, on="measurement")

In [5]:
# Show the last rows of the accelerometer table after the column clean-up.
tr_time.tail()

Unnamed: 0,measurement,x,y,z
3739,24325,0.024384,-0.710709,0.030304
3740,24326,0.487228,-1.099136,-0.015213
3741,24327,0.369446,-0.968506,0.036713
3742,24328,0.167877,-0.802826,0.049805
3743,24329,0.689346,-0.991043,0.034973


In [6]:
# Display the label table: one row per labelled measurement id.
tr_lab

Unnamed: 0,measurement,label
0,20589,1
1,20599,1
2,20609,1
3,20619,1
4,20629,1
...,...,...
370,24289,4
371,24299,4
372,24309,4
373,24319,4


In [7]:
# Standardise the x, y, z accelerations (zero mean, unit variance per axis).
# StandardScaler scales each column independently, so fitting the three
# columns in one call gives the same scaling as fitting them one at a time,
# but leaves `std` holding the statistics of all three axes. Refitting the
# same scaler inside a per-column loop overwrote them, leaving only z, so
# the scaler could not be reused to transform new data.
accel_cols = ["x", "y", "z"]
std = preprocessing.StandardScaler()
tr_time[accel_cols] = std.fit_transform(tr_time[accel_cols])
tr_time.tail()

Unnamed: 0,measurement,x,y,z
3739,24325,-0.433313,0.652926,-0.15432
3740,24326,0.736537,-0.208107,-0.284082
3741,24327,0.438839,0.081464,-0.136049
3742,24328,-0.07063,0.448729,-0.098726
3743,24329,1.247394,0.031505,-0.141008


In [8]:
def accel_mag(accels, axis=None):
    """Return the Euclidean magnitude of acceleration components.

    Parameters
    ----------
    accels : numpy.ndarray, pandas.Series or pandas.DataFrame
        Acceleration components, e.g. a single ``[x, y, z]`` sample, or a
        2-D table holding one sample per row. Must support ``** 2``.
    axis : int or None, optional
        Axis along which the squared components are summed. With the default
        ``None`` the call behaves exactly like the one-argument form (the
        value is forwarded to ``np.sum`` unchanged). ``axis=1`` on a 2-D
        input yields one magnitude per row without a Python-level loop.

    Returns
    -------
    scalar or array-like
        Square root of the sum of squared components along ``axis``.
    """
    return np.sqrt(np.sum(accels**2, axis=axis))


In [67]:
# Build one feature row per labelled measurement. The label at measurement
# id `i` summarises the accelerometer samples whose id lies in the window
# (previous label id, i]; the first window starts after id 0.
#
# Idea kept from the original notes: if the magnitude features do not work,
# use the per-axis aggregates (sums or means of x, y and z) as model input.
#
# Feature values are collected in plain dicts and written to tr_lab once at
# the end. The earlier version re-evaluated `tr_lab.measurement == i` for
# each of the 14 columns on every iteration and computed magnitudes with a
# row-wise DataFrame.apply.
axes = ["x", "y", "z"]
sample_ids = tr_time["measurement"].values
axis_values = {axis: tr_time[axis].values for axis in axes}

feature_rows = []
previous = 0

for i in tr_lab["measurement"]:
    in_window = (sample_ids <= i) & (sample_ids > previous)
    window = {axis: axis_values[axis][in_window] for axis in axes}
    abs_window = {axis: np.absolute(window[axis]) for axis in axes}

    # Column creation order matches the original: sums, means, absolute
    # sums, absolute means (x, y, z within each group), then magnitudes.
    row = {}
    for suffix, reducer, source in (
        ("sum", np.sum, window),
        ("mean", np.mean, window),
        ("abs_sum", np.sum, abs_window),
        ("abs_mean", np.mean, abs_window),
    ):
        for axis in axes:
            row[axis + "_" + suffix] = reducer(source[axis])

    # Per-sample acceleration magnitude sqrt(x^2 + y^2 + z^2) — the quantity
    # accel_mag returns for a single sample — computed for the whole window
    # at once, then aggregated. (Maybe delete later.)
    mags = np.sqrt(window["x"] ** 2 + window["y"] ** 2 + window["z"] ** 2)
    row["sum_accel_mag"] = np.sum(mags)
    row["mean_accel_mag"] = np.mean(mags)

    feature_rows.append(row)
    previous = i

# Rows were produced in tr_lab order, so each column is assigned positionally.
# NOTE(review): this assumes one row per measurement id in tr_lab — confirm.
if feature_rows:
    for column in feature_rows[0]:
        tr_lab[column] = [row[column] for row in feature_rows]

In [185]:
# Name of the target column in tr_lab.
classification_target = 'label'
# Full list of engineered feature columns, kept for reference; the line
# below selects the reduced subset that is actually used.
#all_covariates = ['x_sum', 'y_sum', 'z_sum', 'x_mean', 'y_mean', 'z_mean', 'x_abs_sum', 'y_abs_sum', 'z_abs_sum', 'x_abs_mean', 'y_abs_mean', 'z_abs_mean', 'sum_accel_mag', 'mean_accel_mag']
all_covariates = ['x_abs_sum', 'y_abs_sum', 'z_abs_sum', 'sum_accel_mag']
# NOTE(review): train_set is only assigned in later cells, so when the cells
# run top to bottom this line raises NameError, and in an already-used kernel
# it reads the labels of a previous split — TODO move it after the split.
classification_outcome = train_set[classification_target]

In [186]:
# Split the augmented label table: a random 75% of the rows form the training
# set and every row whose index label was not sampled forms the test set.
train_set = tr_lab.sample(frac=0.75)
held_out = ~tr_lab.index.isin(train_set.index)
test_set = tr_lab[held_out]



In [190]:
# Determine the best hyper-parameters for the random forest with a 5-fold
# grid search over the training split.
rfc = RandomForestClassifier()

parameters = {
    "n_estimators": [5, 10, 50, 100, 250],
    "max_depth": [2, 4, 8, 16, 32, None],
}

# Fit on the named feature columns of the current split. `new_covariates` is
# only assigned inside a later resampling loop, so using it here depended on
# leftover kernel state and was not tied to this train_set. Predictions are
# made from test_set[all_covariates], so training uses the same columns.
cv = GridSearchCV(rfc, parameters, cv=5)
cv.fit(train_set[all_covariates], train_set[classification_target])
best_params = cv.best_params_
best_params

{'max_depth': 2, 'n_estimators': 10}

In [199]:
# Refit a forest with the grid-searched hyper-parameters. Features and labels
# are both taken from the current train_set so they are guaranteed to be
# aligned with each other and with the columns used at prediction time
# (test_set[all_covariates]); `new_covariates` and `classification_outcome`
# could come from a different split left over in the kernel.
rfc = RandomForestClassifier(**best_params)
rfc.fit(train_set[all_covariates], train_set[classification_target])

RandomForestClassifier(max_depth=2, n_estimators=10)

In [205]:
# Accuracy on the held-out rows: fraction of test labels predicted exactly.
# accuracy_score (already imported at the top of the notebook) replaces the
# Python-level sum() over a NumPy boolean array.
predictions = rfc.predict(test_set[all_covariates])
accuracy_score(test_set.label.values, predictions)


0.574468085106383

In [162]:
# Repeated random sub-sampling validation: 50 independent 70/30 splits, each
# training a fresh forest and recording its accuracy on the held-out rows.
outcomes = []
classification_target = 'label'
all_covariates = ['x_abs_sum', 'y_abs_sum', 'z_abs_sum', 'sum_accel_mag']

# NOTE(review): this cell previously relied on a selector named `sel` that no
# visible cell defines. VarianceThreshold is the only feature selector the
# notebook imports, so a default-threshold one is created here — confirm the
# intended selector and threshold.
selector = VarianceThreshold()

for it in range(50):
    train_set = tr_lab.sample(frac=0.70)
    test_set = tr_lab.drop(train_set.index)

    classification_outcome = train_set[classification_target]

    # Fit the selector on the training rows only, then apply the very same
    # column selection to the test rows. Predicting from the raw, unselected
    # test columns breaks as soon as the selector drops a feature.
    new_covariates = selector.fit_transform(train_set[all_covariates])
    test_covariates = selector.transform(test_set[all_covariates])

    rfc = RandomForestClassifier(max_depth=16, n_estimators=50)
    rfc.fit(new_covariates, classification_outcome)

    predictions = rfc.predict(test_covariates)
    outcome = accuracy_score(test_set[classification_target], predictions)

    outcomes.append(outcome)

In [158]:
# Average of the accuracies collected in `outcomes`.
np.mean(outcomes)

0.6453097345132742

In [159]:
# Individual accuracies, one entry per resampled split.
outcomes

[0.6460176991150443,
 0.6548672566371682,
 0.6460176991150443,
 0.6371681415929203,
 0.5398230088495575,
 0.5929203539823009,
 0.6637168141592921,
 0.6194690265486725,
 0.6814159292035398,
 0.6283185840707964,
 0.6106194690265486,
 0.6194690265486725,
 0.6991150442477876,
 0.6460176991150443,
 0.6814159292035398,
 0.7699115044247787,
 0.6106194690265486,
 0.6902654867256637,
 0.6460176991150443,
 0.6283185840707964,
 0.6283185840707964,
 0.6371681415929203,
 0.7256637168141593,
 0.6283185840707964,
 0.5929203539823009,
 0.7079646017699115,
 0.6814159292035398,
 0.6371681415929203,
 0.6814159292035398,
 0.6371681415929203,
 0.6902654867256637,
 0.584070796460177,
 0.6460176991150443,
 0.5929203539823009,
 0.584070796460177,
 0.6637168141592921,
 0.6460176991150443,
 0.7168141592920354,
 0.6194690265486725,
 0.6194690265486725,
 0.6194690265486725,
 0.6460176991150443,
 0.6106194690265486,
 0.6548672566371682,
 0.672566371681416,
 0.6283185840707964,
 0.6283185840707964,
 0.6725663716814

In [105]:
# Show the selected feature columns of the current test split.
# NOTE(review): new_covariate_columns is not assigned in any visible cell —
# TODO confirm where it is defined, or replace it with all_covariates.
test_set[new_covariate_columns]

Unnamed: 0,x_sum,y_sum,z_sum,x_abs_sum,y_abs_sum,z_abs_sum,sum_accel_mag
0,-2.410060,0.136912,-0.198931,2.410060,0.253006,0.764265,2.650540
2,-2.908349,0.075912,-3.747814,2.908349,0.233755,3.747814,4.955108
6,-5.718872,0.190712,0.618705,5.718872,0.190712,0.993945,5.823314
9,-0.690291,0.330846,1.746288,3.659627,2.984590,4.301834,7.544168
12,-2.755277,1.038858,2.305534,5.902071,3.609867,5.692205,9.851252
...,...,...,...,...,...,...,...
361,5.443810,-4.203818,-0.591356,10.285451,8.058047,4.792261,15.236714
363,-1.064737,2.872071,5.105200,5.395330,7.594407,7.629509,14.520268
364,-3.303814,-1.162364,-1.401775,13.605613,8.723880,7.208120,18.654406
369,-0.452410,-3.587538,-2.161907,11.544880,9.726101,5.581281,17.357899
