In [1]:
import pandas as pd
import numpy as np
import datetime as dt

from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import VarianceThreshold

from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")


In [2]:
tr_time = pd.read_csv("train_time_series.csv").drop(columns = ["timestamp","UTC time","accuracy"])
tr_lab = pd.read_csv("train_labels.csv").drop(columns = ["timestamp","UTC time"])

In [3]:
tr_time.rename(columns = {"Unnamed: 0":"measurement"}, inplace=True)
tr_lab.rename(columns = {"Unnamed: 0":"measurement"}, inplace=True)

df = tr_time.merge(tr_lab,on="measurement")  #remove / move??

In [4]:
tr_time.tail()

Unnamed: 0,measurement,x,y,z
3739,24325,0.024384,-0.710709,0.030304
3740,24326,0.487228,-1.099136,-0.015213
3741,24327,0.369446,-0.968506,0.036713
3742,24328,0.167877,-0.802826,0.049805
3743,24329,0.689346,-0.991043,0.034973


In [5]:
tr_lab

Unnamed: 0,measurement,label
0,20589,1
1,20599,1
2,20609,1
3,20619,1
4,20629,1
...,...,...
370,24289,4
371,24299,4
372,24309,4
373,24319,4


In [6]:
#Standardise X,Y,Z accels
std = preprocessing.StandardScaler()
for k in ["x", "y", "z"]:
    tr_time[k] = std.fit_transform(tr_time[k].values.reshape(-1,1))
tr_time.tail()

Unnamed: 0,measurement,x,y,z
3739,24325,-0.433313,0.652926,-0.15432
3740,24326,0.736537,-0.208107,-0.284082
3741,24327,0.438839,0.081464,-0.136049
3742,24328,-0.07063,0.448729,-0.098726
3743,24329,1.247394,0.031505,-0.141008


In [7]:
def accel_mag(accels):
    """takes an array containing x,y,z accelarations, calculates their magnitude"""
    return np.sqrt(np.sum(accels**2))


In [9]:
previous = 0

for i in tr_lab.measurement:
    boolinds = pd.Series((tr_time.measurement.values <= i) & (tr_time.measurement.values > previous))

    step = tr_time[boolinds.values]
    
    accels = step.loc[:,("x","y","z")]
    mags = accels.apply(accel_mag, axis="columns")
    print(i)

20589
20599
20609
20619
20629
20639
20649
20659
20669
20679
20689
20699
20709
20719
20729
20739
20749
20759
20769
20779
20789
20799
20809
20819
20829
20839
20849
20859
20869
20879
20889
20899
20909
20919
20929
20939
20949
20959
20969
20979
20989
20999
21009
21019
21029
21039
21049
21059
21069
21079
21089
21099
21109
21119


KeyboardInterrupt: 

In [8]:
#if the above doesn't work aggregate the x's, the y's, the z's togehter - maybe use a sum or mean? then use x, y,z as input X for machine learning
previous = 0

for i in tr_lab.measurement:
    boolinds = pd.Series((tr_time.measurement.values <= i) & (tr_time.measurement.values > previous))

    step = tr_time[boolinds.values]

    #calculate some values of x, y, z from the step

    sums = []
    means = []
    abs_sums = []
    abs_means = []


    for j in ["x", "y", "z"]:
        accel_sum = np.sum(step[j].values)
        sums.append(accel_sum)
        accel_mean = np.mean(step[j].values)
        means.append(accel_mean)
        abs_sum = np.sum(np.absolute(step[j].values))
        abs_sums.append(abs_sum)
        abs_mean = np.mean(np.absolute(step[j].values))
        abs_means.append(abs_mean)

    tr_lab.loc[tr_lab.measurement == i, "x_sum"] = sums[0]
    tr_lab.loc[tr_lab.measurement == i, "y_sum"] = sums[1]
    tr_lab.loc[tr_lab.measurement == i, "z_sum"] = sums[2]
    tr_lab.loc[tr_lab.measurement == i, "x_mean"] = means[0]
    tr_lab.loc[tr_lab.measurement == i, "y_mean"] = means[1]
    tr_lab.loc[tr_lab.measurement == i, "z_mean"] = means[2]
    tr_lab.loc[tr_lab.measurement == i, "x_abs_sum"] = abs_sums[0]
    tr_lab.loc[tr_lab.measurement == i, "y_abs_sum"] = abs_sums[1]
    tr_lab.loc[tr_lab.measurement == i, "z_abs_sum"] = abs_sums[2]
    tr_lab.loc[tr_lab.measurement == i, "x_abs_mean"] = abs_means[0]
    tr_lab.loc[tr_lab.measurement == i, "y_abs_mean"] = abs_means[1]
    tr_lab.loc[tr_lab.measurement == i, "z_abs_mean"] = abs_means[2]


    #calculate magnitude of acceleration for the step - maybe delete later    
    accels = step.loc[:,("x","y","z")]
    mags = accels.apply(accel_mag, axis="columns")
    tr_lab.loc[tr_lab.measurement == i, "sum_accel_mag"] = np.sum(mags)
    tr_lab.loc[tr_lab.measurement == i, "mean_accel_mag"] = np.mean(mags)    

    previous = i

In [51]:
classification_target = 'label'
all_covariates = ['x_sum', 'y_sum', 'z_sum', 'x_mean', 'y_mean', 'z_mean', 'x_abs_sum', 'y_abs_sum', 'z_abs_sum', 'x_abs_mean', 'y_abs_mean', 'z_abs_mean', 'sum_accel_mag', 'mean_accel_mag']
#all_covariates = ['x_mean', 'y_mean', 'z_mean', 'x_abs_sum', 'y_abs_sum', 'z_abs_sum', 'x_abs_mean', 'y_abs_mean', 'z_abs_mean', 'sum_accel_mag', 'mean_accel_mag']


In [52]:
train_set = tr_lab

#figure out most important variables on whole data set 
sel = VarianceThreshold(threshold=.8)
classification_outcome = train_set[classification_target]
new_covariates = sel.fit_transform(train_set[all_covariates])

In [53]:
#determine the best hyperparameters for the random forest
parameters = {
    "n_estimators":[5,10,50,100,250],
    "max_depth":[2,4,8,16,32,None]
    
}

cv = GridSearchCV(RandomForestClassifier(),parameters,cv=5)
cv.fit(new_covariates,train_set.label.values.ravel())
best_params = cv.best_params_ 
best_params

{'max_depth': 2, 'n_estimators': 10}

In [54]:
#initialise the random forest model with best hyperparameters and train
rfc = RandomForestClassifier(max_depth = best_params['max_depth'], n_estimators = best_params['n_estimators'])
rfc.fit(new_covariates, classification_outcome)

RandomForestClassifier(max_depth=2, n_estimators=10)

In [55]:
acc = []
for i in np.arange(100):
    a = cross_val_score(rfc, new_covariates, classification_outcome, cv=5)
    acc.append(np.mean(a))


In [56]:
np.mean(acc)

0.6217600000000001