In [82]:
# Papers used for reference:
# https://www.sciencedirect.com/science/article/pii/S0021929015005096
# https://ieeexplore.ieee.org/abstract/document/6981909


In [171]:
import pandas as pd
import numpy as np
import datetime as dt

from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")


In [172]:
# Load the time-series samples and the labels, then discard the columns that
# the cells below never read.
tr_time = pd.read_csv("train_time_series.csv")
tr_time = tr_time.drop(columns=["timestamp", "UTC time", "accuracy"])
tr_lab = pd.read_csv("train_labels.csv")
tr_lab = tr_lab.drop(columns=["timestamp", "UTC time"])

In [173]:
# "Unnamed: 0" is the name pandas assigns to a CSV column without a header;
# call it "measurement" in both frames so they share a join key.
tr_time = tr_time.rename(columns={"Unnamed: 0": "measurement"})
tr_lab = tr_lab.rename(columns={"Unnamed: 0": "measurement"})

# Default (inner) join: keeps only the samples whose measurement id has a label.
# TODO: `df` is not referenced by the cells shown below and is built before the
# accelerations are standardised - decide whether to remove it or move it.
df = pd.merge(tr_time, tr_lab, on="measurement")

In [174]:
# Show the last rows of the time-series frame after the rename.
tr_time.tail()

Unnamed: 0,measurement,x,y,z
3739,24325,0.024384,-0.710709,0.030304
3740,24326,0.487228,-1.099136,-0.015213
3741,24327,0.369446,-0.968506,0.036713
3742,24328,0.167877,-0.802826,0.049805
3743,24329,0.689346,-0.991043,0.034973


In [175]:
# Display the label frame (one row per labelled measurement id).
tr_lab

Unnamed: 0,measurement,label
0,20589,1
1,20599,1
2,20609,1
3,20619,1
4,20629,1
...,...,...
370,24289,4
371,24299,4
372,24309,4
373,24319,4


In [176]:
# Standardise the x, y, z accelerations.
# StandardScaler works column by column, so fitting it once on all three axes
# yields the same per-axis scaling as fitting each column separately. Unlike a
# scaler that is re-fitted inside a loop, `std` now keeps the statistics of
# every axis, so the identical scaling can be applied to other data later with
# `std.transform`.
std = preprocessing.StandardScaler()
tr_time[["x", "y", "z"]] = std.fit_transform(tr_time[["x", "y", "z"]].values)
tr_time.tail()

Unnamed: 0,measurement,x,y,z
3739,24325,-0.433313,0.652926,-0.15432
3740,24326,0.736537,-0.208107,-0.284082
3741,24327,0.438839,0.081464,-0.136049
3742,24328,-0.07063,0.448729,-0.098726
3743,24329,1.247394,0.031505,-0.141008


In [177]:
def accel_mag(accels):
    """Return the magnitude of an acceleration vector.

    Every value in ``accels`` (the x, y, z accelerations of one sample) is
    squared, the squares are added together and the square root of that total
    is returned.
    """
    squared = accels ** 2
    total = np.sum(squared)
    return np.sqrt(total)

def accel_var(accels):
    """Return the variance of the accelerations measured along one direction.

    The calculation is delegated to ``np.var`` with its default arguments.
    """
    variance = np.var(accels)
    return variance

def calc_energy(accels, num_observations=10):
    """Return one third of the sum of the squared values in ``accels``.

    ``num_observations`` is accepted but not used by the calculation; any
    scaling by the number of observations has to be done by the caller.
    """
    squared_total = np.sum(accels ** 2)
    return squared_total / 3
            

In [178]:
# Build one row of summary features per labelled measurement. Each label id `i`
# is paired with the samples recorded since the previous label, i.e. the rows
# of tr_time whose measurement id lies in the window (previous, i].
# Fallback idea if these features do not work: aggregate the x's, the y's and
# the z's together (sum or mean) and use x, y, z directly as the model input X.
axes = ["x", "y", "z"]
time_ids = tr_time.measurement.values  # loop-invariant, so looked up only once

# One dict of features per label, in tr_lab row order. Collecting the records
# and writing the columns once at the end avoids re-scanning tr_lab.measurement
# for every single feature of every single label.
window_features = []
previous = 0

for i in tr_lab.measurement:
    in_window = (time_ids <= i) & (time_ids > previous)
    num_observations = int(in_window.sum())
    step = tr_time[in_window]

    features = {}

    # Per-axis totals and averages of the signed and of the absolute values.
    # The four loops keep the original column order (all sums, then all means, ...).
    for j in axes:
        features[j + "_sum"] = np.sum(step[j].values)
    for j in axes:
        features[j + "_mean"] = np.mean(step[j].values)
    for j in axes:
        features[j + "_abs_sum"] = np.sum(np.absolute(step[j].values))
    for j in axes:
        features[j + "_abs_mean"] = np.mean(np.absolute(step[j].values))

    # Magnitude of every sample in the window, then its total and average.
    # (Marked "maybe delete later" by the original author.)
    accels = step[axes]
    mags = accels.apply(accel_mag, axis="columns")
    features["sum_accel_mag"] = np.sum(mags)
    features["mean_accel_mag"] = np.mean(mags)

    # Spread of each axis and of the magnitude within the window.
    for j in axes:
        features["var_" + j] = accel_var(accels[j])
    features["var_mag"] = accel_var(mags)

    # Per-sample energy, already divided by the window size.
    # NOTE(review): because of that division "sum_energy" is the window's
    # average energy and "mean_energy" is divided by the window size twice -
    # confirm this is intended before relying on "mean_energy".
    energy = accels.apply(calc_energy, axis="columns") / num_observations
    features["mean_energy"] = np.mean(energy)
    features["sum_energy"] = np.sum(energy)

    window_features.append(features)
    previous = i

# Attach the feature columns to tr_lab. The records are in tr_lab row order, so
# the raw values are assigned positionally (no index alignment involved).
if window_features:
    feature_table = pd.DataFrame(window_features, columns=list(window_features[0]))
    for name in feature_table.columns:
        tr_lab[name] = feature_table[name].values


In [179]:
# Inspect the first rows of the label frame with the new per-window feature columns.
tr_lab.head(15)

Unnamed: 0,measurement,label,x_sum,y_sum,z_sum,x_mean,y_mean,z_mean,x_abs_sum,y_abs_sum,...,y_abs_mean,z_abs_mean,sum_accel_mag,mean_accel_mag,var_x,var_y,var_z,var_mag,mean_energy,sum_energy
0,20589,1,-2.41006,0.136912,-0.198931,-0.602515,0.034228,-0.049733,2.41006,0.253006,...,0.063252,0.191066,2.65054,0.662635,0.003195,0.005737,0.063894,0.000409,0.036625,0.146498
1,20599,1,-5.34273,0.743808,3.005809,-0.534273,0.074381,0.300581,5.34273,1.810367,...,0.181037,0.349219,7.086467,0.708647,0.030539,0.044506,0.071647,0.025842,0.017601,0.176007
2,20609,1,-2.908349,0.075912,-3.747814,-0.290835,0.007591,-0.374781,2.908349,0.233755,...,0.023376,0.374781,4.955108,0.495511,0.015768,0.000734,0.012715,0.008791,0.008477,0.084774
3,20619,1,-5.095824,0.001228,0.680694,-0.509582,0.000123,0.068069,5.095824,1.575684,...,0.157568,0.315224,6.448828,0.644883,0.04338,0.047616,0.115639,0.05507,0.015698,0.156981
4,20629,1,-3.954437,0.15912,-0.69654,-0.395444,0.015912,-0.069654,3.954437,0.532057,...,0.053206,0.19511,4.821807,0.482181,0.033002,0.009473,0.045976,0.017434,0.008331,0.083311
5,20639,1,-3.462633,0.029031,-3.888017,-0.346263,0.002903,-0.388802,3.462633,0.129403,...,0.01294,0.388802,5.291966,0.529197,0.002748,0.00029,0.011957,0.00602,0.009536,0.095356
6,20649,1,-5.718872,0.190712,0.618705,-0.571887,0.019071,0.061871,5.718872,0.190712,...,0.019071,0.099395,5.823314,0.582331,0.002156,0.000232,0.007624,0.002149,0.011375,0.113753
7,20659,1,-4.521525,0.064953,4.903052,-0.452152,0.006495,0.490305,4.521525,2.68264,...,0.268264,0.490305,7.911322,0.791132,0.016143,0.174442,0.095908,0.105486,0.024379,0.243792
8,20669,1,-2.774368,0.480519,0.621489,-0.277437,0.048052,0.062149,2.912229,0.534783,...,0.053478,0.287566,4.561646,0.456165,0.047936,0.00404,0.123455,0.050488,0.008619,0.086191
9,20679,2,-0.690291,0.330846,1.746288,-0.069029,0.033085,0.174629,3.659627,2.98459,...,0.298459,0.430183,7.544168,0.754417,0.21636,0.13715,0.268431,0.089152,0.021943,0.219432


In [187]:
# Target vector and feature matrix used by the models below.
y_train = tr_lab["label"]
cols = [
    "mean_accel_mag",
    "var_x",
    "var_y",
    "var_z",
    "var_mag",
    "sum_energy",
]
# Alternative feature set kept from the original notebook: cols = ['sum_accel_mag']
X_train = tr_lab[cols]



In [188]:
# Variance of one feature column - presumably a sanity check against the
# VarianceThreshold cut-off used in the next cell (TODO confirm).
np.var(X_train.mean_accel_mag)

0.23144457099259982

In [189]:
# Variance filter fitted on the whole training set: VarianceThreshold keeps the
# features whose variance is above the threshold (0.6 here) and drops the rest.
# The boolean mask shown as the cell output marks the kept columns of X_train.
# NOTE: `new_covariates` is not used by the model cells shown below, which
# still train on the full X_train.
sel = VarianceThreshold(threshold=0.6)
sel.fit(X_train)
new_covariates = sel.transform(X_train)
sel.get_support()

array([False,  True, False,  True,  True, False])

In [190]:
# Tune the random forest with an exhaustive grid search (5-fold cross-validation)
# over the number of trees and the maximum tree depth.
# random_state fixes the forest's internal randomness; without it the selected
# hyper-parameters can change from one run of the notebook to the next.
rfc = RandomForestClassifier(random_state=42)

parameters = {
    "n_estimators": [5, 10, 50, 100, 250],
    "max_depth": [2, 4, 8, 16, 32, None],
}

cv = GridSearchCV(rfc, parameters, cv=5)
cv.fit(X_train, y_train)
best_params = cv.best_params_
best_params

{'max_depth': 8, 'n_estimators': 50}

In [191]:
# Initialise the random forest with the best hyper-parameters found by the grid
# search and train it on the full training set.
# random_state makes the fitted forest - and any clone of it that is evaluated
# later - reproducible between runs.
rfc = RandomForestClassifier(
    max_depth=best_params["max_depth"],
    n_estimators=best_params["n_estimators"],
    random_state=42,
)
rfc.fit(X_train, y_train)

RandomForestClassifier(max_depth=8, n_estimators=50)

In [192]:
# 10-fold cross-validation scores of the tuned forest on the training data.
# NOTE(review): the hyper-parameters were selected on this same data, so these
# scores are likely optimistic - confirm on held-out data.
cross_val_score(rfc,X_train, y_train, cv=10)

array([0.68421053, 0.65789474, 0.73684211, 0.65789474, 0.44736842,
       0.72972973, 0.56756757, 0.78378378, 0.64864865, 0.62162162])