In [223]:
#https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6023581/
#https://journals.sagepub.com/doi/full/10.1177/0020294018813692

NameError: name 'de' is not defined

In [203]:
import pandas as pd
import numpy as np
import datetime as dt
import timeit
from numpy.fft import fft, fftfreq, ifft
from scipy.signal import welch


from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")


In [204]:
#initialise start timer
start = timeit.default_timer()

#read in files as DFs
tr_time = pd.read_csv("train_time_series.csv")
tr_lab = pd.read_csv("train_labels.csv")

In [205]:
#Change DF columns
tr_time = tr_time.drop(columns = ["accuracy", "UTC time", "timestamp"]).rename(columns = {"Unnamed: 0":"measurement"})
tr_lab = tr_lab.rename(columns = {"Unnamed: 0":"measurement"}).drop(columns = ["UTC time", "timestamp"])

In [206]:
def accel_mag(accels):
    """takes an array containing x,y,z accelerations, calculates their magnitude"""
    return np.sqrt(np.sum(accels**2))

def rotation(a):
    """for roll use a=y, b=z; for pitch use a=x, b=z; for yaw use a=y, b=x"""
    return np.arctan(a)

#mean freq ignores the large peak inthe
def mean_freq(fft_vals, fft_freqs):
    """Calculates  the mean frequency of fourier transformed data"""
    return sum(fft_freqs[1 : int(len(fft_vals)/2+1)]*fft_vals[1 : int(len(fft_vals)/2+1)])/(sum(fft_vals[1 : int(len(fft_vals)/2+1)]))

In [207]:
#set sampling and calculate sampling rate for FFT
samp_int = 0.1 #in seconds
samp_rate = 1/samp_int #in hz or per sec


In [208]:
def calculate_features(time_df, label_df):
    """Takes in the time series and labelled data and calculates features used for machine learning analysis"""
    #overlapping windows of ~50%
    #skip 1st value and last value, because of insufficient surrounding data.

    previous = time_df.measurement[0]

    for i in label_df.measurement:
        boolinds = pd.Series((time_df.measurement.values <= (i + 9)) & (time_df.measurement.values > previous))
        frame = time_df[boolinds.values]
        
        #calculate magnitude of acceleration and variation of acceleration for the step - maybe delete later    
        accels = frame.loc[:,("x","y","z")]
        mags = accels.apply(accel_mag, axis="columns")
        
        
        #calculate rolls, pitch, yaws
        rolls = rotation([accels.y, accels.z])
        pitches = rotation([accels.x, accels.z])
        yaws = rotation([accels.y, accels.x])
        
        #mean mag
        label_df.loc[label_df.measurement == i, "mean_accel_mag"] = np.mean(mags)
        #SD mag
        label_df.loc[label_df.measurement == i, "SD_mag"] = np.std(mags)
        #mag var
        label_df.loc[label_df.measurement == i, "var_mag"] = np.var(mags)
        #Coeff of Variation / Relative SD
        label_df.loc[label_df.measurement == i, "RSD_mag"] = np.std(mags)/np.mean(mags)
        #mag min
        label_df.loc[label_df.measurement == i, "min_mag"] = np.amin(mags)
        #mag max
        label_df.loc[label_df.measurement == i, "max_mag"] = np.amax(mags)
        # mag 25, 50, 75 percentile
        label_df.loc[label_df.measurement == i, "per_25_mag"] = np.percentile(mags,25)
        label_df.loc[label_df.measurement == i, "per_50_mag"] = np.percentile(mags,50)
        label_df.loc[label_df.measurement == i, "per_75_mag"] = np.percentile(mags,75)
        #mean rotations
        label_df.loc[label_df.measurement == i, "mean_roll"] = np.mean(rolls)
        label_df.loc[label_df.measurement == i, "mean_pitch"] = np.mean(pitches)
        label_df.loc[label_df.measurement == i, "mean_yaw"] = np.mean(yaws)
        #SD rotations
        label_df.loc[label_df.measurement == i, "SD_roll"] = np.std(rolls)
        label_df.loc[label_df.measurement == i, "SD_pitch"] = np.std(pitches)
        label_df.loc[label_df.measurement == i, "SD_yaw"] = np.std(yaws)

        #Fourier transform each step
        n = len(mags)
        mags_windowed = np.hanning(len(mags))*mags
        f = np.linspace(0, samp_rate, n)
        fft_vals = np.abs(fft(mags_windowed))
        label_df.loc[label_df.measurement == i, "mean_freq"] = mean_freq(fft_vals,f)
        label_df.loc[label_df.measurement == i, "max_intensity"] = np.max(fft_vals[1:])

        #calculate power spectrum density 
        f, psd = welch(mags, fs=samp_rate)
        #find index corresponding to max power between 0.3 and 0.8Hz
        #find max p
        label_df.loc[label_df.measurement == i, "max_power"] = np.max(psd[(f > 0.3) & (psd < 0.8)])
        #find frequency of max power
        label_df.loc[label_df.measurement == i, "max_p_freq"] = f[np.argmax(psd)]
        #total power in signal
        label_df.loc[label_df.measurement == i, "power"] = np.trapz(psd, f) 

        previous = i
    return label_df

In [209]:
#calculate features in the traianing data
calculate_features(tr_time, tr_lab)
#drop unused labelled data.
tr_lab = tr_lab.drop([tr_lab.index[0], tr_lab.index[len(tr_lab) - 1]])

In [210]:
covariates = tr_lab.drop(columns = ['measurement', 'label']).columns

y_train = tr_lab['label']
X_train = tr_lab[covariates]

In [211]:
#Determine best set of hyperparameters for a RFC on this dataset
rfc = RandomForestClassifier()

parameters = {
    "n_estimators":[5,10,50,100,250],
    "max_depth":[2,4,8,16,32,None]   
}

cv = GridSearchCV(rfc,parameters,cv=10)
cv.fit(X_train, y_train)
best_params = cv.best_params_ 

In [212]:
#initialise random forest model with best hyperparameters and train
rfc = RandomForestClassifier(max_depth = best_params['max_depth'], n_estimators = best_params['n_estimators'])
rfc.fit(X_train, y_train)

RandomForestClassifier(max_depth=8, n_estimators=250)

In [213]:
accuraccy = np.mean(cross_val_score(rfc,X_train, y_train, cv=5))
print(f"The RFC has an accuraccy of approximately: {accuraccy}")

The RFC has an accuraccy of approximately: 0.7532612612612614


In [214]:
#The RFC is now trained. 

In [215]:
#read in files as DFs
te_time = pd.read_csv("test_time_series.csv")
te_lab = pd.read_csv("test_labels.csv")

In [216]:
#change test data 
te_time = te_time.drop(columns = ["accuracy", "UTC time", "timestamp"]).rename(columns = {"Unnamed: 0":"measurement"})
te_lab = te_lab.rename(columns = {"Unnamed: 0":"measurement"}).drop(columns = ["UTC time", "timestamp"])

In [217]:
# calculate features for classification of test data
features = calculate_features(te_time, te_lab).drop(columns = ['measurement', 'label'])
features

Unnamed: 0,mean_accel_mag,SD_mag,var_mag,RSD_mag,min_mag,max_mag,per_25_mag,per_50_mag,per_75_mag,mean_roll,mean_pitch,mean_yaw,SD_roll,SD_pitch,SD_yaw,mean_freq,max_intensity,max_power,max_p_freq,power
0,1.050928,0.507928,0.257991,0.483314,0.310324,2.702354,0.832883,1.042505,1.233758,-0.259721,0.221563,-0.217013,0.497047,0.230338,0.537962,1.676593,5.205213,0.039964,1.666667,0.059783
1,1.069435,0.524165,0.274749,0.490132,0.291176,2.702354,0.889090,1.179666,1.277415,-0.287569,0.209458,-0.217684,0.492570,0.211406,0.550688,2.443980,5.601576,0.128695,3.684211,0.318193
2,1.043749,0.300503,0.090302,0.287907,0.291176,1.380144,0.914435,1.162418,1.215965,-0.325588,0.174246,-0.241392,0.456209,0.177818,0.533945,1.827619,3.940700,0.083467,2.105263,0.150836
3,1.088787,0.360690,0.130097,0.331276,0.355175,2.161288,0.914435,1.061572,1.204882,-0.290932,0.197818,-0.244853,0.494256,0.253236,0.531009,1.709890,4.935070,0.144780,1.578947,0.238561
4,1.040534,0.341633,0.116713,0.328324,0.355175,2.161288,0.902923,1.006377,1.098176,-0.295644,0.155349,-0.265543,0.475805,0.271762,0.499654,1.178035,4.850812,0.009121,1.052632,0.013191
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120,1.190278,0.460750,0.212291,0.387095,0.255789,2.507202,0.891365,1.161030,1.338375,-0.408058,0.047849,-0.321779,0.453211,0.363637,0.548692,2.401213,5.882671,0.203097,3.157895,0.414663
121,1.377015,0.817359,0.668076,0.593573,0.300963,4.099950,0.891365,1.224534,1.350859,-0.342135,0.146272,-0.320339,0.566346,0.417421,0.601219,2.360387,6.668274,0.409910,2.105263,0.991190
122,1.252640,0.845256,0.714457,0.674780,0.300963,4.099950,0.828172,0.953782,1.266122,-0.340187,0.112632,-0.273298,0.519710,0.432016,0.573834,1.859420,5.491404,0.182841,1.578947,0.234408
123,1.061209,0.472317,0.223084,0.445075,0.339535,2.008265,0.828172,0.953782,1.262206,-0.340109,0.037013,-0.274712,0.464062,0.363181,0.493115,1.690967,5.846705,0.091020,1.578947,0.119500


In [218]:
predictions = rfc.predict(features)

In [219]:
test_labels_done = pd.read_csv('test_labels.csv')
test_labels_done.label = predictions

In [220]:
test_labels_done.to_csv('test_labels_done.csv' ,index=False)

In [221]:
#finish timer
finish = timeit.default_timer()
run_time = finish - start
print(f"Total run time: {run_time}")

Total run time: 112.73216719999618


In [222]:
predictions

array([4, 4, 4, 4, 3, 2, 4, 3, 2, 4, 4, 3, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4,
       2, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 3, 3, 4, 2, 2, 3, 3,
       3, 3, 3, 3, 2, 2, 2, 3, 3, 3, 3, 3, 3, 2, 2, 2, 3, 3, 3, 3, 3, 3,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)