### 1. Carry out feature selection with a gradient boosting classifier

In [1]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from parameter_feature_search import *

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import glob

import DataProcess as DP

Load the data and extract the features:

In [2]:
protocol_datadir = '../../TuringMachines/PAMAP2_Dataset/Protocol/'

In [3]:
profiles = list(glob.glob(protocol_datadir+'*.dat'))

In [5]:
all_dfs = []
for fname in profiles:
    print fname
    subject_index = int(fname.split('/')[-1].split('.')[0][-1])
    dp = DP.dataprocess(fname,T=512,stride=512)
     #For LOSO, we need subjectID in the final dataframe
    dp.df['subjectID'] = int(subject_index)*np.ones(len(dp.df))
    all_dfs.append(dp.df)
    
feature_df = pd.concat(all_dfs)

../../TuringMachines/PAMAP2_Dataset/Protocol/subject105.dat
../../TuringMachines/PAMAP2_Dataset/Protocol/subject101.dat
../../TuringMachines/PAMAP2_Dataset/Protocol/subject107.dat
../../TuringMachines/PAMAP2_Dataset/Protocol/subject106.dat
../../TuringMachines/PAMAP2_Dataset/Protocol/subject103.dat
../../TuringMachines/PAMAP2_Dataset/Protocol/subject102.dat
../../TuringMachines/PAMAP2_Dataset/Protocol/subject109.dat
../../TuringMachines/PAMAP2_Dataset/Protocol/subject108.dat
../../TuringMachines/PAMAP2_Dataset/Protocol/subject104.dat


In [6]:
Y = feature_df['activityID']
X = feature_df.drop(['activityID','subjectID'],axis=1)

Scale the features:

In [7]:
sc = StandardScaler()
X_scaled = pd.DataFrame(sc.fit_transform(X),columns=X.columns)

#### 1.1 Find best hyperparameters using tree-selected features

In [8]:
model = GradientBoostingClassifier()

parameters = {
    'classify__n_estimators': (85,95,100,105),
    'classify__max_depth': (10,20,30,50,None),
    'classify__learning_rate':([0.0001,0.0005,0.001,0.005,0.01,0.05,0.1,0.5])
}

In [11]:
%time X_new, best_classifier_fselect = test_model_initial(model,X_scaled,Y,parameters)

Performing grid search...
('pipeline:', ['select', 'classify'])
parameters:
{'classify__learning_rate': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5], 'classify__n_estimators': (85, 95, 100, 105), 'classify__max_depth': (10, 20, 30, 50, None)}
Fitting 5 folds for each of 160 candidates, totalling 800 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed: 22.4min
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed: 100.6min
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed: 232.9min
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed: 394.3min
[Parallel(n_jobs=4)]: Done 800 out of 800 | elapsed: 396.6min finished


done in 23868.118s
()
Best score: 0.969
Best parameters set:
	classify__learning_rate: 0.05
	classify__max_depth: 10
	classify__n_estimators: 85
Hold out score: 0.965
CPU times: user 2min, sys: 6.89 s, total: 2min 7s
Wall time: 6h 37min 48s


In [12]:
len(X_new.columns)

76

#### 1.2 Find the best classifier without reducing features

In [9]:
%time best_classifier_nodrop = test_model_initial_noselection(model,X_scaled,Y,parameters)

Performing grid search...
('pipeline:', ['classify'])
parameters:
{'classify__learning_rate': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5], 'classify__n_estimators': (85, 95, 100, 105), 'classify__max_depth': (10, 20, 30, 50, None)}
Fitting 5 folds for each of 160 candidates, totalling 800 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed: 35.6min
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed: 149.4min
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed: 350.1min
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed: 510.6min
[Parallel(n_jobs=4)]: Done 800 out of 800 | elapsed: 511.2min finished


done in 30742.003s
()
Best score: 0.968
Best parameters set:
	classify__learning_rate: 0.1
	classify__max_depth: 50
	classify__n_estimators: 105
Hold out score: 0.979
             precision    recall  f1-score   support

        1.0       1.00      1.00      1.00       103
        2.0       0.99      1.00      0.99        81
        3.0       0.99      0.96      0.97        91
        4.0       0.98      1.00      0.99       129
        5.0       1.00      0.98      0.99        56
        6.0       1.00      0.98      0.99        91
        7.0       1.00      0.99      1.00       112
       12.0       0.91      0.90      0.91        48
       13.0       1.00      1.00      1.00        37
       16.0       0.96      0.96      0.96       100
       17.0       0.93      0.97      0.95       129
       24.0       1.00      1.00      1.00        22

avg / total       0.98      0.98      0.98       999

CPU times: user 1min 45s, sys: 5.94 s, total: 1min 51s
Wall time: 8h 32min 22s


#### 1.3 Find the best classifier using the genetic algorithm

In [None]:
%time GA = Run_GA(X_scaled,Y,best_classifier_fselect)

In [None]:
GA.best_fitness

In [None]:
len(GA.feature_selection.columns)

In [None]:
plt.style.use('ggplot')
fig = plt.figure(figsize=(8,6))
ax = fig.add_subplot(111)
ax.plot(GA.fitness_evolution)
ax.set_xlabel('GA Iteration')
ax.set_ylabel('Test score')
plt.grid()
fig.savefig('GA_iterations_random_forest.png',dpi=400)

### 2. Gradient Boosting with LOSO Cross-Validation

#### 2.1 LOSO on all original features

In [None]:
sc = StandardScaler()
X_features_scaled = pd.DataFrame(sc.fit_transform(X),columns=X.columns)
X_features_scaled['activityID'] = feature_df['activityID'].values
X_features_scaled['subjectID'] = feature_df['subjectID'].values

First using the classifier optimized on all features:

In [None]:
subject_scores_no_selection_1 = LOSO(X_features_scaled,best_classifier_nodrop)

In [None]:
mean_LOSO_no_selection_1 = np.mean([subject_scores_no_selection_1[e] for e in list(subject_scores_no_selection_1.keys())])

Now using classifier optimized on features selected by tree-based method:

In [None]:
subject_scores_no_selection_2 = LOSO(X_features_scaled,best_classifier_fselect)

In [None]:
subject_scores_no_selection_2

In [None]:
mean_LOSO_no_selection_2 = np.mean([subject_scores_no_selection_2[e] for e in list(subject_scores_no_selection_2.keys())])

#### 2.2 LOSO on features selected by genetic algorithm

In [None]:
new_features_GA = list(GA.feature_columns) + ['activityID','subjectID']
X_feature_select_GA = feature_df[new_features_GA]

sc = StandardScaler()
X_feature_select_GA_scaled = pd.DataFrame(sc.fit_transform(X_feature_select_GA),columns=X_feature_select_GA.columns)
X_feature_select_GA_scaled['activityID'] = X_feature_select_GA['activityID'].values
X_feature_select_GA_scaled['subjectID'] = X_feature_select_GA['subjectID'].values

In [None]:
subject_scores_GA_selection = LOSO(X_feature_select_GA_scaled,best_classifier_fselect)

In [None]:
subject_scores_GA_selection

In [None]:
mean_LOSO_GA = np.mean([subject_scores_GA_selection[e] for e in list(subject_scores_GA_selection.keys())])

#### 2.3 LOSO on tree-based-selected features

In [None]:
new_features_Treebased = list(X_new.columns) + ['activityID','subjectID']
X_feature_select_Treebased = feature_df[new_features_Treebased]

sc = StandardScaler()
X_feature_select_Treebased_scaled = pd.DataFrame(sc.fit_transform(X_feature_select_Treebased),columns=X_feature_select_Treebased.columns)
X_feature_select_Treebased_scaled['activityID'] = X_feature_select_Treebased['activityID']
X_feature_select_Treebased_scaled['subjectID'] = X_feature_select_Treebased['subjectID']

In [None]:
subject_scores_Treebased_selection = LOSO(X_feature_select_Treebased,best_classifier_fselect)

In [None]:
subject_scores_Treebased_selection

In [None]:
mean_LOSO_Treebased = np.mean([subject_scores_Treebased_selection[e] for e in list(subject_scores_Treebased_selection.keys())])

In [None]:
print('Mean LOSO score for tree-based selection: %0.3f' %mean_LOSO_Treebased)
print('Mean LOSO score for GA-based selection: %0.3f' %mean_LOSO_GA)
print('Mean LOSO score for no feature selection 1: %0.3f' %mean_LOSO_no_selection_1)
print('Mean LOSO score for no feature selection 2: %0.3f' %mean_LOSO_no_selection_1)