In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import svm
from sklearn.preprocessing import scale
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV, train_test_split

import hervpd as hp

In [2]:
# Load the combined 12-feature dataset, list the activity labels it contains,
# and end the cell with the summary statistics so they are displayed.
df = pd.read_csv('data/12features/combined.csv')
print(df['activity'].unique())
df.describe()

['eat' 'focused-active' 'focused-passive' 'household-chores'
 'leisure-active' 'leisure-passive' 'movement' 'sleep' 'rest-active']


Unnamed: 0,user,AVGNN,SDNN,MeanHR,MinHR,MaxHR,RMSSD,pNNxx,TINN,powerVLF,powerLF,powerHF,ratioHFLF
count,544.0,544.0,544.0,544.0,544.0,544.0,544.0,544.0,544.0,544.0,544.0,544.0,544.0
mean,1.1875,758.6,72.865018,81.37714,67.362033,108.560417,46.409227,20.082227,358.020221,2877.387599,1423.919671,1000.824001,2.753764
std,0.913249,129.785003,32.960406,13.75457,10.84968,36.385572,32.537407,18.316263,146.885899,3619.576847,1443.217366,1841.656665,2.296974
min,0.0,475.73,17.833,51.628,41.437,58.117,4.8825,0.0,83.0,85.654,57.852,7.9777,0.15468
25%,0.0,670.3625,51.8715,73.23225,60.29275,91.47025,27.43875,6.593075,258.0,995.1825,591.3075,258.9175,1.257
50%,2.0,750.545,67.057,79.942,66.1815,100.52,37.9945,15.4175,339.0,1807.95,1040.35,512.09,2.03735
75%,2.0,819.3075,85.82,89.504,74.28525,112.58,54.76225,26.7905,426.25,3304.25,1697.95,960.045,3.579975
max,2.0,1162.2,264.25,126.12,104.9,550.32,240.47,98.232,1103.0,29716.0,12958.0,16103.0,18.228


In [None]:
# Number of examples recorded for each activity label.
df.groupby('activity')['user'].count()

### Scale all features to mean = 0 and SD = 1. This keeps the SVM kernels from putting too much weight on features with larger values.

In [None]:
# Earlier approach, kept for reference: scale the 12 feature columns over the whole dataset.
#df.iloc[:, 3:15] = df.iloc[:, 3:15].apply(lambda x: scale(x))
# NOTE(review): presumably standardizes the features separately for each user — confirm in hervpd.
# This rebinds `df`, so re-running the cell feeds already-processed data back into the helper.
df = hp.scaleWithinUser(df)
df.describe()

### Test dataset will have 20% of examples, the other 80% will be for training (with cross-validation)

In [None]:
# Hold out 20% of the rows as the final test set; the remaining 80% is used for
# training and cross-validated model selection.
# A fixed random_state makes the split (and every score derived from it)
# reproducible across kernel restarts.
train, test = train_test_split(df, test_size=0.2, random_state=42)
print(len(train), len(test))

### 3 - Model selection: perform an exhaustive grid search, with cross-validation, over the parameter space consisting of:
* $C = 10^{i}$, with $-1 \leq i \leq 2$, for both linear and RBF kernels
* $\gamma = 10^{i}$, with $-2 \leq i \leq 1$, for the RBF kernel.

In [None]:
# Cross-validation scheme for the grid searches: 4 stratified shuffles, each
# holding out 20% of the rows it is given. random_state is fixed so the folds,
# and therefore the selected hyper-parameters, are reproducible.
crossval = StratifiedShuffleSplit(n_splits=4, test_size=0.2, random_state=42)

# Log-spaced search grids: C in {0.1, 1, 10, 100}, gamma in {0.01, 0.1, 1, 10}.
c_range = np.logspace(-1, 2, 4)
gamma_range = np.logspace(-2, 1, 4)

# The linear kernel only tunes C; the RBF kernel tunes C and gamma jointly.
param_lin = dict(C=c_range)
param_rbf = dict(C=c_range, gamma=gamma_range)


In [None]:
# Exhaustive search over C for a linear-kernel SVM, cross-validated on the
# 12 feature columns (positions 3..14) of the training set.
grid_lin = GridSearchCV(
    estimator=svm.SVC(kernel='linear', cache_size=1000),
    param_grid=param_lin,
    cv=crossval,
)
grid_lin.fit(X=train.iloc[:, 3:15], y=train['activity'])

# Report the winning parameter set and its mean cross-validation score.
print("Kernel linear --- ")
print(
    "Best params: %s with score %0.5f"
    % (grid_lin.best_params_, grid_lin.best_score_)
)

In [None]:
# Exhaustive search over (C, gamma) for an RBF-kernel SVM, cross-validated on
# the 12 feature columns (positions 3..14) of the training set.
grid_rbf = GridSearchCV(
    estimator=svm.SVC(kernel='rbf', cache_size=1000),
    param_grid=param_rbf,
    cv=crossval,
)
grid_rbf.fit(X=train.iloc[:, 3:15], y=train['activity'])

# Report the winning parameter set and its mean cross-validation score.
print("Kernel RBF --- ")
print(
    "Best params: %s with score %0.5f"
    % (grid_rbf.best_params_, grid_rbf.best_score_)
)

In [None]:
# Refit each kernel on the whole training set using the hyper-parameters chosen
# by its grid search, then report its predictions on the held-out test set.

# Linear kernel.
clf1 = svm.SVC(kernel='linear', cache_size=1000, C=grid_lin.best_params_['C'])
clf1.fit(X=train.iloc[:, 3:15], y=train['activity'])
hp.printResults(test['activity'].values, clf1.predict(test.iloc[:, 3:15]))

# RBF kernel.
clf2 = svm.SVC(
    kernel='rbf',
    cache_size=1000,
    C=grid_rbf.best_params_['C'],
    gamma=grid_rbf.best_params_['gamma'],
)
clf2.fit(X=train.iloc[:, 3:15], y=train['activity'])
# Fixed: this previously called clf1.predict, so the linear model's results were
# printed twice and the RBF model was never evaluated.
hp.printResults(test['activity'].values, clf2.predict(test.iloc[:, 3:15]))


## Now, we can run the same steps above with multiple experiments! Time to have fun!!

In [40]:
# Exclude from df the activities with too few examples.
# The CSV is re-read here, so `df` starts again from the file contents rather
# than from whatever earlier cells assigned to it.

df = pd.read_csv( 'data/12features/combined.csv')
df = hp.excludeActivities(df, ['leisure-active',  'rest-active'])
# NOTE(review): the recorded output looks like a train/test classification report — confirm in hervpd what runFlow does.
hp.runFlow(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


49 out of 103 right! :)
                  precision    recall  f1-score   support

             eat       0.12      0.29      0.17         7
  focused-active       0.63      0.63      0.63        19
 focused-passive       0.25      0.13      0.17        15
household-chores       0.38      0.31      0.34        16
 leisure-passive       0.53      0.59      0.56        17
        movement       0.54      0.47      0.50        15
           sleep       0.73      0.79      0.76        14

     avg / total       0.49      0.48      0.47       103



### 2 - Removing activities from the list

In [41]:
# Drop every activity in the list below (the recorded output shows three
# labels remaining), then show how many examples each remaining label has.
df_reduced = hp.excludeActivities(
    df,
    ['leisure-active', 'rest-active', 'sleep', 'movement', 'household-chores', 'eat'],
)
df_reduced.groupby('activity')['user'].count()

activity
focused-active     88
focused-passive    73
leisure-passive    89
Name: user, dtype: int64

In [42]:
# Run the hervpd flow on the reduced activity subset built in the previous cell.
hp.runFlow(df_reduced)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


18 out of 50 right! :)
                 precision    recall  f1-score   support

 focused-active       0.33      0.18      0.24        22
focused-passive       0.41      0.50      0.45        14
leisure-passive       0.33      0.50      0.40        14

    avg / total       0.36      0.36      0.34        50



### 1 - All activities per user

In [32]:
# One DataFrame per user id (0, 1 and 2).
# NOTE(review): presumably hp.userRows keeps the rows whose `user` equals the given id — confirm in hervpd.
ju = hp.userRows(df, 0)
ron = hp.userRows(df, 1)
edu = hp.userRows(df, 2)

# Summary statistics for user 0 only.
ju.describe()

Unnamed: 0,user,AVGNN,SDNN,MeanHR,MinHR,MaxHR,RMSSD,pNNxx,TINN,powerVLF,powerLF,powerHF,ratioHFLF
count,171.0,171.0,171.0,171.0,171.0,171.0,171.0,171.0,171.0,171.0,171.0,171.0,171.0
mean,0.0,705.760117,78.435041,87.234982,70.86617,129.230216,64.659298,21.486519,408.152047,2947.679649,1539.862433,1732.321988,1.932863
std,0.0,115.58983,42.775841,14.019577,11.197753,54.078833,44.097554,19.449443,185.021049,4435.316039,2005.189948,2836.653377,1.748349
min,0.0,478.32,23.498,51.628,41.437,71.208,14.743,0.23866,116.0,155.13,84.136,29.782,0.15468
25%,0.0,618.915,48.248,77.871,64.014,99.4045,28.7885,5.4167,275.0,696.53,473.085,258.845,0.71657
50%,0.0,707.75,70.417,84.776,72.15,111.19,54.254,16.935,397.0,1601.1,841.63,600.95,1.3738
75%,0.0,770.5,94.7935,96.9435,78.5235,144.05,88.97,32.69,505.0,3245.75,1695.8,2033.55,2.80945
max,0.0,1162.2,264.25,125.44,92.937,550.32,240.47,79.351,1103.0,29716.0,12958.0,16103.0,11.995


In [37]:
# Run the hervpd flow on user 0's rows only.
hp.runFlow(ju)



16 out of 35 right! :)
                  precision    recall  f1-score   support

             eat       0.00      0.00      0.00         1
  focused-active       0.33      0.50      0.40         2
 focused-passive       0.43      0.75      0.55         8
household-chores       0.00      0.00      0.00         4
 leisure-passive       0.40      0.33      0.36         6
        movement       0.60      0.38      0.46         8
           sleep       0.80      0.67      0.73         6

     avg / total       0.46      0.46      0.44        35



  'precision', 'predicted', average, warn_for)


In [38]:
# Run the hervpd flow on user 1's rows only.
hp.runFlow(ron)



10 out of 15 right! :)
                 precision    recall  f1-score   support

            eat       0.50      0.33      0.40         3
 focused-active       0.56      1.00      0.71         5
leisure-passive       1.00      0.50      0.67         6
       movement       1.00      1.00      1.00         1

    avg / total       0.75      0.67      0.65        15



In [39]:
# Run the hervpd flow on user 2's rows only.
hp.runFlow(edu)



26 out of 54 right! :)
                  precision    recall  f1-score   support

             eat       0.25      0.43      0.32         7
  focused-active       0.50      0.86      0.63         7
 focused-passive       0.25      0.50      0.33         4
household-chores       0.50      0.25      0.33        12
 leisure-passive       1.00      0.10      0.18        10
        movement       0.50      0.50      0.50         6
           sleep       0.89      1.00      0.94         8

     avg / total       0.60      0.48      0.45        54



### 2 - Splitting database into movement categories (movement and household chores) and all others

In [29]:
# Activities passed to addPartition as the "move" group; per the recorded
# output, the added `move` column takes the values 'move' and 'still'.
includelist = ['movement', 'household-chores']

df_move = hp.addPartition(
    df,
    includelist,
    pname='move',
    labelIn='move',
    labelOut='still',
)
df_move.groupby('move')['user'].count()

move
move     133
still    378
Name: user, dtype: int64

In [30]:
# Run the hervpd flow using the binary `move` column as the label instead of `activity`.
hp.runFlow(df_move, labelName='move')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


408 train examples and 103 test examples
Best params for linear kernel: {'C': 1.0} with score 0.90549
Best params for RBF kernel: {'C': 1.0, 'gamma': 0.10000000000000001} with score 0.90244
--- test results for linear kernel:
93 out of 103 right! :)
--- test results for RBF kernel:
93 out of 103 right! :)


In [None]:
# Same move/still experiment, restricted to user 0's rows.
jumove = hp.addPartition(ju, includelist, pname='move', labelIn='move', labelOut='still')
hp.runFlow(jumove, labelName='move')

In [None]:
# Same move/still experiment, restricted to user 2's rows.
edumove = hp.addPartition(edu, includelist, pname='move', labelIn='move', labelOut='still')
hp.runFlow(edumove, labelName='move')

In [None]:
# Same move/still experiment, restricted to user 1's rows.
ronmove = hp.addPartition(ron, includelist, pname='move', labelIn='move', labelOut='still')
hp.runFlow(ronmove, labelName='move')

In [None]:
# Drop the activities named in `includelist` and run the flow on what is left.
dfnomove = hp.excludeActivities(df, includelist)
# Only the last expression of a cell is echoed, so the summary is printed
# explicitly; as a bare mid-cell expression it was silently discarded.
# (The stray bare `includelist` expression was removed for the same reason.)
print(dfnomove.describe())
hp.runFlow(dfnomove)

In [None]:
# Show the activity label of every row as a plain Python list.
df['activity'].tolist()

In [47]:
# Per the recorded output, this reports a separate classifier per user id
# (linear and RBF kernels each).
hp.runFlowByUser(df)

CLASSFIER FOR USER 0
136 train examples and 35 test examples




Best params for linear kernel: {'C': 1.0} with score 0.56250
--- test results for linear kernel:
                  precision    recall  f1-score   support

  focused-active       0.00      0.00      0.00         3
 focused-passive       0.43      0.50      0.46         6
household-chores       0.57      0.57      0.57         7
 leisure-passive       1.00      0.29      0.44         7
        movement       0.67      0.50      0.57         8
           sleep       0.33      1.00      0.50         4

     avg / total       0.58      0.49      0.47        35

Best params for RBF kernel: {'C': 10.0, 'gamma': 0.10000000000000001} with score 0.61607
--- test results for RBF kernel:
                  precision    recall  f1-score   support

             eat       0.00      0.00      0.00         0
  focused-active       0.25      0.33      0.29         3
 focused-passive       0.50      0.67      0.57         6
household-chores       0.80      0.57      0.67         7
 leisure-passive       

  'recall', 'true', average, warn_for)


Best params for RBF kernel: {'C': 10.0, 'gamma': 0.01} with score 0.75000
--- test results for RBF kernel:
                 precision    recall  f1-score   support

            eat       0.50      0.50      0.50         2
 focused-active       0.71      0.83      0.77         6
leisure-passive       0.40      0.40      0.40         5
       movement       1.00      0.50      0.67         2

    avg / total       0.62      0.60      0.60        15

CLASSFIER FOR USER 2
214 train examples and 54 test examples




Best params for linear kernel: {'C': 1.0} with score 0.49419
--- test results for linear kernel:
                  precision    recall  f1-score   support

             eat       0.11      0.33      0.17         3
  focused-active       0.47      0.78      0.58         9
 focused-passive       0.29      0.25      0.27         8
household-chores       0.50      0.40      0.44        10
 leisure-passive       0.50      0.25      0.33         8
        movement       1.00      0.29      0.44         7
           sleep       0.89      0.89      0.89         9

     avg / total       0.57      0.48      0.48        54

Best params for RBF kernel: {'C': 10.0, 'gamma': 0.01} with score 0.41279
--- test results for RBF kernel:
                  precision    recall  f1-score   support

             eat       0.09      0.33      0.14         3
  focused-active       0.43      0.67      0.52         9
 focused-passive       0.33      0.25      0.29         8
household-chores       0.50      0.40 