## Handcrafted Features

This notebook fits several ML models on hand-crafted features: the mean, standard deviation, and range of the x, y, z accelerometer and gyroscope measurements for each sample (a 2-second window).

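The train/test split is by subject: samples from subjects with ID ≤ 7 are used for training, and samples from subjects with ID ≥ 8 for testing.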

In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

import pickle

In [3]:
# Load the cleaned recordings from a pickled DataFrame (presumably thigh-mounted
# sensor data, judging by the file name). The path is relative, so the notebook
# must be run from the directory that contains ./data.
# NOTE(review): unpickling can execute arbitrary code — only load a file you
# produced yourself or otherwise trust.
thigh100 = pd.read_pickle('./data/thigh_clean.pkl')

In [4]:
# Preview the first rows: one row per raw sensor reading, tagged with its
# activity, label, subject and sample_num (the window it belongs to).
thigh100.head()

Unnamed: 0,attr_time_gyr,attr_x_gyr,attr_y_gyr,attr_z_gyr,attr_time_acc,attr_x_acc,attr_y_acc,attr_z_acc,activity,label,subject,sample_num
800,1435996984038,-0.786994,-0.328964,0.244081,1435996984038,0.469264,4.292205,1.912369,climbingdown,0,1,8
801,1435996984056,-0.539593,-0.340265,0.329907,1435996984056,0.769736,5.878364,2.906561,climbingdown,0,1,8
802,1435996984076,-0.334953,-0.454802,0.37786,1435996984076,1.098939,7.167043,3.312977,climbingdown,0,1,8
803,1435996984099,-0.086331,-0.496036,0.33907,1435996984099,1.409586,6.937798,4.225766,climbingdown,0,1,8
804,1435996984118,0.157099,-0.525052,0.350982,1435996984118,1.442507,6.427235,5.054759,climbingdown,0,1,8


In [9]:
# List the raw column names available for feature building.
thigh100.columns

Index(['attr_time_gyr', 'attr_x_gyr', 'attr_y_gyr', 'attr_z_gyr',
       'attr_time_acc', 'attr_x_acc', 'attr_y_acc', 'attr_z_acc', 'activity',
       'label', 'subject', 'sample_num'],
      dtype='object')

In [11]:
def Range(x):
    """Return the spread of ``x``: its largest value minus its smallest.

    ``x`` may be any object exposing ``.max()`` and ``.min()`` whose results
    support subtraction (e.g. a NumPy array, a pandas Series, or a DataFrame,
    in which case the result is computed per column).
    """
    highest = x.max()
    lowest = x.min()
    return highest - lowest

In [15]:
# Per-window range (max - min) of every non-key column, one row per
# (activity, subject, sample_num) window, with a `range_` prefix on each column.
#
# `.agg` is used instead of `.apply`: it runs `Range` on the value columns only,
# so the grouping keys (including the string-valued `activity`) never reach the
# subtraction. With `.apply(Range)` the whole group frame is passed, and on
# pandas versions where that frame still contains the key columns the
# `str - str` subtraction raises TypeError.
# NOTE(review): integer columns (timestamps, label) may keep an integer dtype
# here rather than being upcast to float; those columns are not used as features.
ranges = thigh100.groupby(['activity','subject','sample_num']).agg(Range).add_prefix('range_')

In [16]:
# Per-window summary statistics: one row per (activity, subject, sample_num)
# group and one column per remaining field, prefixed with the statistic's name.
means = (
    thigh100
    .groupby(['activity', 'subject', 'sample_num'])
    .mean()
    .add_prefix('mean_')
)
st_devs = (
    thigh100
    .groupby(['activity', 'subject', 'sample_num'])
    .std()
    .add_prefix('std_')
)

In [6]:
# Preview the per-window means (indexed by activity, subject, sample_num).
means.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,mean_attr_time_gyr,mean_attr_x_gyr,mean_attr_y_gyr,mean_attr_z_gyr,mean_attr_time_acc,mean_attr_x_acc,mean_attr_y_acc,mean_attr_z_acc,mean_label
activity,subject,sample_num,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
climbingdown,1,0,1435997000000.0,-0.021411,-0.07507,0.191922,1435997000000.0,-0.744758,9.274814,2.047569,0.0
climbingdown,1,1,1435997000000.0,-0.020886,-0.074762,0.192169,1435997000000.0,-0.744351,9.27473,2.046779,0.0
climbingdown,1,2,1435997000000.0,-0.022025,-0.076054,0.191228,1435997000000.0,-0.739617,9.276885,2.045229,0.0
climbingdown,1,3,1435997000000.0,-0.021219,-0.07485,0.192484,1435997000000.0,-0.736684,9.278334,2.041135,0.0
climbingdown,1,4,1435997000000.0,-0.015914,-0.079395,0.20121,1435997000000.0,-0.720661,9.271325,2.042553,0.0


In [17]:
# Assemble the feature table: the std_, mean_ and range_ frames share the same
# (activity, subject, sample_num) index, so they are joined side by side.
df = pd.concat([st_devs,means,ranges],axis=1)

In [18]:
# Turn the (activity, subject, sample_num) index levels back into ordinary
# columns so they can be used for labelling and for the subject-based split.
df = df.reset_index()

In [19]:
# Preview the assembled feature table (one row per window).
df.head()

Unnamed: 0,activity,subject,sample_num,std_attr_time_gyr,std_attr_x_gyr,std_attr_y_gyr,std_attr_z_gyr,std_attr_time_acc,std_attr_x_acc,std_attr_y_acc,...,mean_label,range_attr_time_gyr,range_attr_x_gyr,range_attr_y_gyr,range_attr_z_gyr,range_attr_time_acc,range_attr_x_acc,range_attr_y_acc,range_attr_z_acc,range_label
0,climbingdown,1,8,581.291714,0.942661,0.85509,0.374873,582.030385,1.470079,3.465349,...,0.0,1985.0,4.308738,5.279708,1.747991,1985.0,9.214684,15.316306,18.915391,0.0
1,climbingdown,1,9,579.635665,0.976116,0.803382,0.449999,579.373594,1.56684,3.024721,...,0.0,1981.0,4.679228,4.421137,2.178345,1982.0,7.89069,13.982736,21.457434,0.0
2,climbingdown,1,10,582.685275,0.995383,1.088734,0.45814,582.662663,2.012424,3.810429,...,0.0,2024.0,4.831944,7.441561,2.543032,2035.0,14.829087,15.386336,20.637421,0.0
3,climbingdown,1,11,580.33192,1.016299,1.11355,0.605436,581.116097,1.996506,3.389655,...,0.0,1959.0,4.633718,6.475477,2.649017,1969.0,11.089942,14.787786,16.913838,0.0
4,climbingdown,1,13,581.322631,1.065192,0.959656,0.362656,581.596644,1.581883,3.813839,...,0.0,1993.0,4.612033,8.074722,1.933694,1993.0,7.879318,15.717933,21.53764,0.0


In [10]:
# Average every per-window feature within each activity, for a quick look at
# how the activities differ.
# NOTE(review): `subject` and `sample_num` are averaged too, since they are
# ordinary columns of `df` here; those two averages are not meaningful features.
mean_by_activity = df.groupby('activity').mean()

In [11]:
# Preview the per-activity feature averages.
mean_by_activity.head()

Unnamed: 0_level_0,subject,sample_num,std_attr_time_gyr,std_attr_x_gyr,std_attr_y_gyr,std_attr_z_gyr,std_attr_time_acc,std_attr_x_acc,std_attr_y_acc,std_attr_z_acc,std_label,mean_attr_time_gyr,mean_attr_x_gyr,mean_attr_y_gyr,mean_attr_z_gyr,mean_attr_time_acc,mean_attr_x_acc,mean_attr_y_acc,mean_attr_z_acc,mean_label
activity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
climbingdown,5.355433,9655.521639,585.12097,1.449157,1.6594,0.953792,586.639058,3.072917,4.408697,3.754533,0.0,1436668000000.0,0.005054,-0.064086,0.023899,1436668000000.0,-0.070666,9.211616,0.888937,0.0
climbingup,5.611293,10470.072495,584.356165,1.161388,1.139453,0.903214,584.681147,2.478625,3.578465,2.809748,0.0,1436406000000.0,-0.048676,0.049494,0.012684,1436406000000.0,-1.581276,8.432709,0.589573,1.0
jumping,5.572093,10521.206977,584.046446,1.4223,1.715647,0.974118,584.479848,4.085402,7.713975,5.077086,0.0,1437219000000.0,-0.029698,-0.002621,0.085623,1437219000000.0,0.947861,5.941585,1.011493,2.0
lying,5.50702,10569.773479,585.328988,0.017869,0.025548,0.017131,585.694989,0.076381,0.081158,0.075653,0.0,1437206000000.0,-0.015563,-0.003519,0.040072,1437206000000.0,-6.184426,-0.506557,4.820264,3.0
running,5.476879,10849.622438,584.204995,2.060803,2.353661,1.578219,583.730403,5.147105,6.883348,5.949692,0.0,1436475000000.0,-0.037049,0.012423,0.094349,1436475000000.0,0.555622,7.99295,-0.074964,4.0


In [12]:
# List the columns of the per-activity summary.
mean_by_activity.columns

Index(['subject', 'sample_num', 'std_attr_time_gyr', 'std_attr_x_gyr',
       'std_attr_y_gyr', 'std_attr_z_gyr', 'std_attr_time_acc',
       'std_attr_x_acc', 'std_attr_y_acc', 'std_attr_z_acc', 'std_label',
       'mean_attr_time_gyr', 'mean_attr_x_gyr', 'mean_attr_y_gyr',
       'mean_attr_z_gyr', 'mean_attr_time_acc', 'mean_attr_x_acc',
       'mean_attr_y_acc', 'mean_attr_z_acc', 'mean_label'],
      dtype='object')

In [20]:
from statsmodels.api import add_constant,MNLogit
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV
from sklearn.ensemble import BaggingClassifier,RandomForestClassifier,ExtraTreesClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score,accuracy_score
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB,GaussianNB

In [21]:
# List every column of the feature table before choosing the model inputs.
df.columns

Index(['activity', 'subject', 'sample_num', 'std_attr_time_gyr',
       'std_attr_x_gyr', 'std_attr_y_gyr', 'std_attr_z_gyr',
       'std_attr_time_acc', 'std_attr_x_acc', 'std_attr_y_acc',
       'std_attr_z_acc', 'std_label', 'mean_attr_time_gyr', 'mean_attr_x_gyr',
       'mean_attr_y_gyr', 'mean_attr_z_gyr', 'mean_attr_time_acc',
       'mean_attr_x_acc', 'mean_attr_y_acc', 'mean_attr_z_acc', 'mean_label',
       'range_attr_time_gyr', 'range_attr_x_gyr', 'range_attr_y_gyr',
       'range_attr_z_gyr', 'range_attr_time_acc', 'range_attr_x_acc',
       'range_attr_y_acc', 'range_attr_z_acc', 'range_label'],
      dtype='object')

In [22]:
# Target variable: the activity name of each window.
# NOTE(review): the scikit-learn cells below build y_train / y_test directly from
# `df.activity` and never read this `y`.
y = df.activity

In [23]:
# The 18 hand-crafted model inputs: std / mean / range of each gyroscope and
# accelerometer axis. The time and label summaries are not included.
columns = [
    'std_attr_x_gyr', 'std_attr_y_gyr', 'std_attr_z_gyr',
    'std_attr_x_acc', 'std_attr_y_acc', 'std_attr_z_acc',
    'mean_attr_x_gyr', 'mean_attr_y_gyr', 'mean_attr_z_gyr',
    'mean_attr_x_acc', 'mean_attr_y_acc', 'mean_attr_z_acc',
    'range_attr_x_gyr', 'range_attr_y_gyr', 'range_attr_z_gyr',
    'range_attr_x_acc', 'range_attr_y_acc', 'range_attr_z_acc',
]

# Split by subject so that no person contributes windows to both sets:
# subject IDs up to 7 train the models, IDs from 8 upwards are held out.
train_mask = df.subject <= 7
test_mask = df.subject >= 8

X_train, y_train = df.loc[train_mask, columns], df.loc[train_mask, 'activity']
X_test, y_test = df.loc[test_mask, columns], df.loc[test_mask, 'activity']

In [24]:
# Standardise the features using statistics learned from the training subjects
# only; the held-out subjects are transformed with those same statistics.
# Both X_train and X_test are rebound to the scaled arrays.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [25]:
# Baseline: logistic regression with scikit-learn's default settings,
# scored with macro-averaged F1 on the training and held-out subjects.
model = LogisticRegression().fit(X_train, y_train)
y_pred_train, y_pred = model.predict(X_train), model.predict(X_test)
print(f'Training F1 score is: {f1_score(y_train,y_pred_train,average="macro"):.3f}')
print(f'Testing F1 score is: {f1_score(y_test,y_pred,average="macro"):.3f}')



Training F1 score is: 0.839
Testing F1 score is: 0.606


In [26]:
# k-nearest neighbours with k = 4 on the standardised features,
# scored with macro-averaged F1 on both splits.
model = KNeighborsClassifier(n_neighbors=4).fit(X_train, y_train)
y_pred_train, y_pred = model.predict(X_train), model.predict(X_test)
print(f'Training F1 score is: {f1_score(y_train,y_pred_train,average="macro"):.3f}')
print(f'Testing F1 score is: {f1_score(y_test,y_pred,average="macro"):.3f}')

Training F1 score is: 0.974
Testing F1 score is: 0.564


In [27]:
# Single decision tree, depth-limited to 22 levels.
# random_state is pinned (42, matching the ensemble cells in this notebook) so
# the tree — and therefore the printed scores — is the same on every run;
# without it, ties between candidate splits can be broken differently each time.
model = DecisionTreeClassifier(max_depth=22, random_state=42)
model.fit(X_train,y_train)
y_pred_train = model.predict(X_train)
y_pred = model.predict(X_test)
# Macro-averaged F1: every activity counts equally, regardless of how many
# windows it has.
print(f'Training F1 score is: {f1_score(y_train,y_pred_train,average="macro"):.3f}')
print(f'Testing F1 score is: {f1_score(y_test,y_pred,average="macro"):.3f}')

Training F1 score is: 0.999
Testing F1 score is: 0.516


In [28]:
# Bagging ensemble: 100 estimators, each drawing max_samples=1.0 of the rows
# and 80% of the features; seeded for reproducibility.
model = BaggingClassifier(
    n_estimators=100,
    max_samples=1.0,
    max_features=.8,
    random_state=42,
)
model.fit(X_train, y_train)
y_pred_train, y_pred = model.predict(X_train), model.predict(X_test)
print(f'Training F1 score is: {f1_score(y_train,y_pred_train,average="macro"):.3f}')
print(f'Testing F1 score is: {f1_score(y_test,y_pred,average="macro"):.3f}')

Training F1 score is: 1.000
Testing F1 score is: 0.559


In [29]:
# Random forest: 100 trees, each at most 20 levels deep; seeded for
# reproducibility. Scored with macro-averaged F1 on both splits.
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=20,
    min_samples_leaf=1,
    random_state=42,
)
model.fit(X_train, y_train)
y_pred_train, y_pred = model.predict(X_train), model.predict(X_test)
print(f'Training F1 score is: {f1_score(y_train,y_pred_train,average="macro"):.3f}')
print(f'Testing F1 score is: {f1_score(y_test,y_pred,average="macro"):.3f}')

Training F1 score is: 1.000
Testing F1 score is: 0.558


In [30]:
# Extremely randomised trees: 100 trees, each at most 20 levels deep.
# random_state is pinned (42, matching the other ensemble cells in this
# notebook) because split thresholds are drawn at random; without a seed the
# printed scores change from run to run.
model = ExtraTreesClassifier(max_depth=20,max_leaf_nodes=None,n_estimators=100,random_state=42)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
y_pred_train = model.predict(X_train)
# Macro-averaged F1: every activity counts equally, regardless of how many
# windows it has.
print(f'Training F1 score is: {f1_score(y_train,y_pred_train,average="macro"):.3f}')
print(f'Testing F1 score is: {f1_score(y_test,y_pred,average="macro"):.3f}')

Training F1 score is: 1.000
Testing F1 score is: 0.627


In [33]:
# AdaBoost whose base learner is itself a 100-tree random forest (depth <= 20);
# both the booster and the forest are seeded for reproducibility.
model = AdaBoostClassifier(
    RandomForestClassifier(
        n_estimators=100,
        max_depth=20,
        min_samples_leaf=1,
        random_state=42,
    ),
    learning_rate=1.0,
    random_state=42,
)
model.fit(X_train, y_train)
y_pred_train, y_pred = model.predict(X_train), model.predict(X_test)
print(f'Training F1 score is: {f1_score(y_train,y_pred_train,average="macro"):.3f}')
print(f'Testing F1 score is: {f1_score(y_test,y_pred,average="macro"):.3f}')

Training F1 score is: 0.793
Testing F1 score is: 0.404


  'precision', 'predicted', average, warn_for)


In [31]:
# Support-vector classifier with an RBF kernel (gamma='scale', C=2.0),
# scored with macro-averaged F1 on both splits.
model = svm.SVC(C=2.0, kernel='rbf', gamma='scale').fit(X_train, y_train)
y_pred_train, y_pred = model.predict(X_train), model.predict(X_test)
print(f'Training F1 score is: {f1_score(y_train,y_pred_train,average="macro"):.3f}')
print(f'Testing F1 score is: {f1_score(y_test,y_pred,average="macro"):.3f}')

Training F1 score is: 0.965
Testing F1 score is: 0.549


In [32]:
# Gaussian naive Bayes on the standardised features, scored with
# macro-averaged F1 on both splits.
# NOTE: dropping the columns for std of gyroscope x,y,z values increased the
# train F1 (the original note ends before stating the resulting value).
model = GaussianNB().fit(X_train, y_train)
y_pred_train, y_pred = model.predict(X_train), model.predict(X_test)
print(f'Training F1 score is: {f1_score(y_train,y_pred_train,average="macro"):.3f}')
print(f'Testing F1 score is: {f1_score(y_test,y_pred,average="macro"):.3f}')

Training F1 score is: 0.659
Testing F1 score is: 0.522


In [66]:
# (Stale cell removed.) This cell used to run `X = add_constant(X)` before `X`
# had been assigned anywhere in the notebook, so it raised NameError under
# Restart Kernel -> Run All. The design matrix `X` is built, and given its
# intercept column, in the cells that follow.

  return ptp(axis=axis, out=out, **kwargs)


In [52]:
# Target variable for the multinomial-logit section: the activity of each window.
# NOTE(review): `y` is reassigned from `lr_data.activity` in the next cell before
# it is used, so this assignment looks redundant.
y = df.activity

In [54]:
# Design matrix for the statsmodels multinomial logit.
# `subject` is one-hot encoded with the first level dropped as the reference
# category. dtype=float is requested explicitly: newer pandas releases emit
# boolean dummy columns by default, and mixing bool with float columns yields an
# object-dtype array that statsmodels refuses to fit.
lr_data = pd.get_dummies(data=df, columns=['subject'], drop_first=True, dtype=float)

# Model inputs: std and mean of each gyroscope / accelerometer axis, plus the
# subject indicators. The range_ features are not part of this model.
# NOTE(review): this rebinds `columns`, replacing the feature list used for the
# scikit-learn models earlier in the notebook.
columns=['std_attr_x_gyr', 'std_attr_y_gyr', 'std_attr_z_gyr',
         'std_attr_x_acc', 'std_attr_y_acc', 'std_attr_z_acc',
         'mean_attr_x_gyr', 'mean_attr_y_gyr', 'mean_attr_z_gyr',
         'mean_attr_x_acc', 'mean_attr_y_acc', 'mean_attr_z_acc',
         'subject_2', 'subject_3', 'subject_4', 'subject_5', 'subject_6',
         'subject_7', 'subject_8', 'subject_9', 'subject_10']
X = lr_data[columns]
y = lr_data.activity

In [57]:
# Add an intercept column to the design matrix so the logit model has a
# constant term.
X = add_constant(X)

  return ptp(axis=axis, out=out, **kwargs)


In [58]:
# Multinomial logit of activity (y) on the design matrix (X); nothing is
# fitted yet at this point.
smodel = MNLogit(y,X)

In [59]:
# Fit the model with the BFGS optimiser, allowing up to 5000 iterations.
# NOTE(review): `._results` is a private attribute; presumably it is used to get
# the unwrapped results object — confirm it is still needed rather than using
# the object returned by `fit` directly.
model_fitted=smodel.fit(method='bfgs',maxiter=5000)._results

Optimization terminated successfully.
         Current function value: 0.632951
         Iterations: 865
         Function evaluations: 869
         Gradient evaluations: 869


In [60]:
# Show the fitted coefficients and fit statistics.
model_fitted.summary()

0,1,2,3
Dep. Variable:,activity,No. Observations:,21949.0
Model:,MNLogit,Df Residuals:,21795.0
Method:,MLE,Df Model:,147.0
Date:,"Tue, 21 May 2019",Pseudo R-squ.:,0.6823
Time:,16:02:29,Log-Likelihood:,-13893.0
converged:,True,LL-Null:,-43734.0
,,LLR p-value:,0.0

activity=climbingup,coef,std err,z,P>|z|,[0.025,0.975]
const,36.8542,1.216,30.313,0.000,34.471,39.237
std_attr_x_gyr,1.8677,0.182,10.255,0.000,1.511,2.225
std_attr_y_gyr,-4.4518,0.164,-27.166,0.000,-4.773,-4.131
std_attr_z_gyr,4.6366,0.210,22.105,0.000,4.225,5.048
std_attr_x_acc,-1.2845,0.082,-15.581,0.000,-1.446,-1.123
std_attr_y_acc,0.0780,0.065,1.194,0.233,-0.050,0.206
std_attr_z_acc,0.0491,0.083,0.592,0.554,-0.114,0.212
mean_attr_x_gyr,-2.4333,0.269,-9.043,0.000,-2.961,-1.906
mean_attr_y_gyr,2.5574,0.178,14.337,0.000,2.208,2.907
mean_attr_z_gyr,-0.4690,0.368,-1.276,0.202,-1.190,0.252
