# Tuning hyperparameters to find the best classification model

## This study is based on [Chiu-Yun Lin's Classification.ipynb](https://github.com/kcmeehan/BiometricsActivityClassification/blob/master/Classification.ipynb)

Import the necessary libraries:

In [10]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
import DataProcess as DP

Load the pre-processed, segmented, feature-extracted data:

In [11]:
# Run DP.dataprocess on each subject file (subject101 ... subject109) and cache
# the feature labels and segmented data on disk as data<N>.npy, so that later
# cells can reload them without reprocessing the raw recordings.
for subj_n in range(1, 10):
    subj_filename = '../PAMAP2_Dataset/Protocol/subject10' + str(subj_n) + '.dat'
    dp = DP.dataprocess(subj_filename)

    # Pack both items into an explicit 2-element object array. Handing the bare
    # tuple to np.save makes NumPy build the array implicitly, which raises
    # ValueError on NumPy >= 1.24 when the two items have different shapes.
    # The file still unpacks as `feat_labels, data_segmented` when loaded.
    payload = np.empty(2, dtype=object)
    payload[0] = dp.feat_labels
    payload[1] = dp.data_segmented
    np.save('data' + str(subj_n) + '.npy', payload)

In [12]:
# Reload the cached per-subject files and collect each subject's segmented
# data in a list. feature_names is overwritten on every iteration, so it ends
# up holding the labels stored in the last file.
datalabels = []
for i in range(1, 10):
    # Each file stores a pickled object array, so pickle loading has to be
    # enabled explicitly (np.load defaults to allow_pickle=False since
    # NumPy 1.16.3). Only load files generated locally by this notebook:
    # unpickling untrusted data can execute arbitrary code.
    feature_names, datalabelsi = np.load('data' + str(i) + '.npy', allow_pickle=True)
    datalabels.append(datalabelsi)

In [13]:
# Stack the per-subject arrays collected in `datalabels` row-wise into a single
# array, so that all subjects are handled as one dataset:
dataset=np.vstack(datalabels)

# Decision Tree Classifier

In [14]:
# Instantiate a Decision Tree classifier: tree
# NOTE(review): `tree` is assigned again a couple of cells further down, before
# its first use, so this cell looks redundant.
tree = DecisionTreeClassifier()

In [15]:
# Define the hyperparameter search space.
# Each array lists the candidate values for one DecisionTreeClassifier setting.
max_depth = np.arange(3, 11)                          # 3 .. 10
max_feat = np.arange(50, 151)                         # 50 .. 150
min_samples_leaf = np.arange(1, 11)                   # 1 .. 10
min_imp_decrease = np.arange(0.0, 0.05, 0.005)        # 0.0 .. 0.045 in steps of 0.005
max_leaf_nodes = np.arange(10, 20)                    # 10 .. 19

# Map each estimator parameter name to its candidate values; this dict is what
# the search objects consume.
param_dist = dict(
    max_depth=max_depth,
    max_features=max_feat,
    min_samples_leaf=min_samples_leaf,
    criterion=["gini", "entropy"],
    min_impurity_decrease=min_imp_decrease,
    max_leaf_nodes=max_leaf_nodes,
)

In [16]:
# Instantiate the classifier that the hyperparameter searches will tune.
# The seed is fixed for reproducibility: scikit-learn permutes the features
# randomly at every split, so an unseeded tree can give different
# cross-validation scores and a different "best" parameter set on each run,
# even though the train/test split and the randomized search are seeded.
tree = DecisionTreeClassifier(random_state=42)

In [17]:
# Separate predictors from target: every column but the last goes to X, the
# last column goes to y (presumably the activity label; confirm against
# DataProcess).
X, y = dataset[:, :-1], dataset[:, -1]

# Hold out 30% of the rows as a test set; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [32]:
# Build the search object. The randomized variant is tried first to find out
# how time-consuming the tuning is; the number of sampled candidates is left
# at the library default.
tree_rcv = RandomizedSearchCV(
    estimator=tree,
    param_distributions=param_dist,
    cv=9,  # 9-fold cross-validation for now
    random_state=42,
)

In [34]:
# Fit model to the training data. The IPython %time magic reports how long the
# complete randomized search takes.
%time tree_rcv.fit(X_train, y_train)

CPU times: user 24.1 s, sys: 4 ms, total: 24.1 s
Wall time: 24.1 s


RandomizedSearchCV(cv=9, error_score='raise',
          estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          fit_params=None, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'min_impurity_decrease': array([ 0.   ,  0.005,  0.01 ,  0.015,  0.02 ,  0.025,  0.03 ,  0.035,
        0.04 ,  0.045]), 'max_leaf_nodes': array([10, 11, 12, 13, 14, 15, 16, 17, 18, 19]), 'min_samples_leaf': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10]), 'criterion': ['gini', 'entropy'], 'max_features': array([ 50,  51, ..., 149, 150]), 'max_depth': array([ 3,  4,  5,  6,  7,  8,  9, 10])},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [None]:
# Report the outcome of the randomized search: the best parameter combination
# it found, followed by that combination's cross-validation score.
print(
    "RCV Tuned Decision Tree Parameters: {}".format(
        tree_rcv.best_params_
    )
)
print(
    "RCV Best score is {}".format(
        tree_rcv.best_score_
    )
)

In [None]:
# Now I will try with regular (exhaustive) grid search over the same
# parameter space.
# NOTE(review): param_dist spans 8 * 101 * 10 * 2 * 10 * 10 = 1,616,000
# combinations, i.e. roughly 14.5 million fits with cv=9. The randomized search
# above took about 24 s for only 10 candidates, so this cell is unlikely to
# finish in a practical amount of time as written. Consider a coarser grid
# and/or n_jobs=-1 before running it.
tree_cv = GridSearchCV(tree, param_dist, cv=9)
%time tree_cv.fit(X_train, y_train)

In [None]:
# Report the outcome of the exhaustive grid search: the best parameter
# combination, followed by that combination's cross-validation score.
print(
    "Tuned Decision Tree Parameters: {}".format(
        tree_cv.best_params_
    )
)
print(
    "Best score is {}".format(
        tree_cv.best_score_
    )
)