In [2]:
#construct a decision tree classifier

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
import DataProcess as DP

### Data Processing & Feature extraction

In [3]:
datalabels=[]
for i in range(1,10):
    feature_names,datalabelsi=np.load('data'+str(i)+'.npy')
    datalabels.append(datalabelsi)

In [4]:
[x.shape for x in datalabels]

[(426, 198),
 (453, 198),
 (292, 198),
 (399, 198),
 (472, 198),
 (432, 198),
 (398, 198),
 (450, 198),
 (8, 198)]

In [5]:
# Stack data from different subjects into one chunk:
datalabels=np.vstack(datalabels)

# 1) Decision Tree Classification:

In [61]:
#shuffle data
np.random.shuffle(datalabels)

traindata=datalabels[:,:-1]
trainlabels=datalabels[:,-1]

In [62]:
cls=DecisionTreeClassifier(max_depth=8,min_impurity_decrease=0.015,max_leaf_nodes=14)
cls.fit(traindata,trainlabels)
cross_val_score(cls, traindata, trainlabels)

array([0.92619543, 0.89853556, 0.90566038])

#### Find out important nodes:

In [63]:
y=cls.feature_importances_
a=[[feature_names[i],y[i]] for i in range(len(y))]
a.sort(key=lambda x:-x[1])
a[:15]

[['ankle_acc16g_y_median', 0.12772201096576985],
 ['chest_acc16g_y_std', 0.1271225366929746],
 ['ankle_gyro_z_std', 0.12171610853890512],
 ['hand_acc16g_y_peak', 0.1156391030995954],
 ['chest_mag_x_std', 0.10585548059556159],
 ['hand_acc16g_x_median', 0.09546583378341676],
 ['ankle_acc16g_x_std', 0.09193876824886076],
 ['chest_acc_power_ratio', 0.07997891003082246],
 ['hand_acc16g_z_mean', 0.07143771040202103],
 ['chest_gyro_y_mean', 0.0631235376420725],
 ['heart_rate_mean', 0.0],
 ['hand_temp_mean', 0.0],
 ['hand_acc16g_x_mean', 0.0],
 ['hand_acc16g_y_mean', 0.0],
 ['hand_gyro_x_mean', 0.0]]

In [64]:
from sklearn import tree
from sklearn.externals.six import StringIO
import pydotplus   #using pydotplus in windows10, python 3.6.X

In [69]:
dot_data = StringIO()
class_name=[DP.activity_dict[x] for x in np.unique(trainlabels)]

tree.export_graphviz(cls, out_file=dot_data, 
                         feature_names=feature_names[:-1],  
                         class_names=class_name, 
                         filled=True, rounded=True,  
                         special_characters=True)  
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
# graph = pydot.graph_from_dot_data(dot_data.getvalue())
# Image(graph.create_png())
file_path='tree.png'
pydotplus.graph_from_dot_data(dot_data.getvalue()).write_png(file_path)
# i = misc.imread(file_path)
# plt.imshow(i)

True

In [66]:
graph

<pydotplus.graphviz.Dot at 0x2021c68fb38>

#### LOSO cross validation:

In [10]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

In [12]:
#check LOSO cross validation!

mean_scores=[]
for a in [14]:
    scores=[]
    for iout in range(1,9):

        datalabels=[]
        for i in range(1,iout):
            feature_names,datalabelsi=np.load('data'+str(i)+'.npy')
            datalabels.append(datalabelsi)

        for i in range(iout+1,9):
            feature_names,datalabelsi=np.load('data'+str(i)+'.npy')
            datalabels.append(datalabelsi)

        datalabels=np.vstack(datalabels)

        feature_names,validdatalabels=np.load('data'+str(iout)+'.npy')
        validdata=validdatalabels[:,:-1]
        validlabels=validdatalabels[:,-1]

        #shuffle data
        np.random.shuffle(datalabels)

        traindata=datalabels[:,:-1]
        trainlabels=datalabels[:,-1]

        cls=DecisionTreeClassifier(max_depth=8,min_impurity_decrease=0.015,max_leaf_nodes=14)
        cls.fit(traindata,trainlabels)

        scores.append(cls.score(validdata,validlabels))
    mean_scores.append(np.mean(scores))
    print('a=',a,'mean_score=',np.mean(scores))

a= 14 mean_score= 0.8129711229360876


In [13]:
print('validation scores:', scores)

validation scores: [0.7652582159624414, 0.8035320088300221, 0.8938356164383562, 0.7593984962406015, 0.8347457627118644, 0.7916666666666666, 0.8442211055276382, 0.8111111111111111]


In [14]:
y_pred=cls.predict(validdata)

confusion_matrix(validlabels, y_pred)

array([[ 0, 43,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0, 40,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [20,  2, 21,  0,  0,  0,  0,  0,  0,  0,  2,  0],
       [ 0,  0,  0, 56,  0,  0,  1,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0, 28,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0, 42,  0,  0,  0,  3,  0,  0],
       [ 0,  0,  0,  0,  0,  0, 52,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  1,  0,  0,  0, 13,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0, 10,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  2,  0,  1,  0, 40,  0,  0],
       [ 0,  0,  1,  0,  0,  0,  0,  0,  0,  0, 59,  0],
       [ 0,  0,  0,  1,  8,  0,  0,  0,  0,  0,  0,  4]], dtype=int64)

### Changing parameters in DecisionTreeClassifier:

In [11]:
X=datalabels[:,:-1]
y=datalabels[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

**Change depth of tree:**

In [16]:
cv_scores = []
for n in [5, 10, 15]:
    cls=DecisionTreeClassifier(max_depth=n, random_state=42)
    cv_scores.append(cross_val_score(cls, X_train, y_train))
    
cv_scores

[array([0.90779221, 0.92307692, 0.90131579]),
 array([0.93506494, 0.95827901, 0.94342105]),
 array([0.93506494, 0.95827901, 0.94342105])]

**Change minimum number of sample required for a leaf node: (default = 1)**

In [17]:
cv_scores = []
for n in [1, 4, 8]:
    cls=DecisionTreeClassifier(max_depth=12, min_samples_leaf=n, random_state=42)
    cv_scores.append(cross_val_score(cls, X_train, y_train))
    
cv_scores

[array([0.93506494, 0.95827901, 0.94342105]),
 array([0.93636364, 0.94915254, 0.93947368]),
 array([0.93246753, 0.93872229, 0.92763158])]

** Change maximum number of leaf nodes: (default = None)**

In [18]:
cv_scores = []
for n in [12, 20, 30, 40, None]:
    cls=DecisionTreeClassifier(max_depth=12, min_samples_leaf=1, max_leaf_nodes=n, random_state=42)
    cv_scores.append(cross_val_score(cls, X_train, y_train))
    
cv_scores

[array([0.91428571, 0.91916558, 0.89473684]),
 array([0.93766234, 0.93872229, 0.93026316]),
 array([0.94805195, 0.95436767, 0.93815789]),
 array([0.94155844, 0.95958279, 0.94078947]),
 array([0.93506494, 0.95827901, 0.94342105])]

**Testing with test samples:**

In [19]:
cls=DecisionTreeClassifier(max_depth=12, min_samples_leaf=1, max_leaf_nodes=None, random_state=42)
cls.fit(X_train,y_train)
cls.score(X_test,y_test)

0.96

# 2) Random Forest Classifier:

In [20]:
from sklearn.ensemble import RandomForestClassifier

#### Changing number of trees: (default n_estimators=10)

In [43]:
cv_scores = []
for n in [5, 10, 15, 20]:
    cls=RandomForestClassifier(n_estimators=n, random_state=42)
    cv_scores.append(cross_val_score(cls, X_train, y_train))
    
cv_scores

[array([0.95714286, 0.97653194, 0.96315789]),
 array([0.97272727, 0.98435463, 0.97631579]),
 array([0.98311688, 0.98435463, 0.98684211]),
 array([0.98571429, 0.98696219, 0.98421053])]

**Change depth of trees: (default max_depth=None)**

In [22]:
cv_scores = []
for n in [1, 2, 3, 4, 7, 10, 15, None]:
    cls=RandomForestClassifier(n_estimators=12, max_depth=n, random_state=42)
    cv_scores.append(cross_val_score(cls, X_train, y_train))
    
cv_scores

[array([0.41298701, 0.410691  , 0.42368421]),
 array([0.59220779, 0.55410691, 0.58289474]),
 array([0.8       , 0.80573664, 0.79342105]),
 array([0.92337662, 0.93611473, 0.925     ]),
 array([0.97532468, 0.98565841, 0.98289474]),
 array([0.98311688, 0.98696219, 0.98289474]),
 array([0.98311688, 0.98565841, 0.98157895]),
 array([0.98311688, 0.98565841, 0.98157895])]

**Testing with test samples:**

In [70]:
cls=RandomForestClassifier(n_estimators=13, max_depth=8, max_leaf_nodes=25, random_state=42)
cls.fit(X_train,y_train)
cls.score(X_test,y_test)

0.9686956521739131

In [73]:
y=cls.feature_importances_
a=[[feature_names[i],y[i]] for i in range(len(y))]
a.sort(key=lambda x:-x[1])
a[:50]

[['ankle_acc16g_x_mean', 0.04579098236012126],
 ['ankle_acc16g_x_std', 0.0428716667704422],
 ['chest_mag_y_median', 0.040187762216172665],
 ['hand_acc16g_y_peak', 0.03687652252553823],
 ['chest_mag_y_mean', 0.028191026976163436],
 ['chest_acc_spectrum_energy', 0.02260209371667907],
 ['chest_acc16g_y_peak', 0.022151749348904314],
 ['chest_acc16g_x_std', 0.020969962876016972],
 ['hand_acc16g_y_kurtosis', 0.02036078596616982],
 ['ankle_gyro_z_std', 0.020123250645034577],
 ['hand_acc16g_x_median', 0.019909564447120054],
 ['ankle_gyro_z_median', 0.019734539007480713],
 ['hand_acc16g_y_std', 0.019651365344506063],
 ['chest_mag_y_peak', 0.019321737965442794],
 ['hand_gyro_z_std', 0.019180750453920203],
 ['ankle_acc16g_y_median', 0.017552059411974838],
 ['chest_mag_x_median', 0.01622981157052565],
 ['hand_acc16g_y_mean', 0.015789425016589063],
 ['chest_gyro_z_std', 0.0156535080904836],
 ['hand_mag_x_std', 0.015463975325369835],
 ['chest_gyro_x_std', 0.015066909992379559],
 ['ankle_acc16g_x_pea

In [56]:
#check LOSO cross validation!

mean_scores=[]
for a in [1]:
    scores=[]
    for iout in range(1,9):

        datalabels=[]
        for i in range(1,iout):
            feature_names,datalabelsi=np.load('data'+str(i)+'.npy')
            datalabels.append(datalabelsi)

        for i in range(iout+1,9):
            feature_names,datalabelsi=np.load('data'+str(i)+'.npy')
            datalabels.append(datalabelsi)

        datalabels=np.vstack(datalabels)

        feature_names,validdatalabels=np.load('data'+str(iout)+'.npy')
        validdata=validdatalabels[:,:-1]
        validlabels=validdatalabels[:,-1]

        #shuffle data
        np.random.shuffle(datalabels)

        traindata=datalabels[:,:-1]
        trainlabels=datalabels[:,-1]

        cls=RandomForestClassifier(n_estimators=13, max_depth=8, max_leaf_nodes=25, random_state=42)
        cls.fit(traindata,trainlabels)

        scores.append(cls.score(validdata,validlabels))
    mean_scores.append(np.mean(scores))
    print('a=',a,'mean_score=',np.mean(scores))

a= 1 mean_score= 0.9331191791311483


In [57]:
print('validation scores:', scores)

validation scores: [0.8779342723004695, 0.9028697571743929, 0.928082191780822, 0.9624060150375939, 0.9491525423728814, 0.9027777777777778, 0.957286432160804, 0.9844444444444445]


# 3) K-Nearest Neighbors Classifier:

In [24]:
from sklearn.neighbors import KNeighborsClassifier

#### Change number of nearest neighbors: (default n_neighbors=5)

In [25]:
cv_scores = []
for n in [1, 2, 3, 5, 7, 9]:
    cls=KNeighborsClassifier(n_neighbors=n)
    cv_scores.append(cross_val_score(cls, X_train, y_train))
    
cv_scores

[array([0.97012987, 0.96610169, 0.97368421]),
 array([0.94545455, 0.95045632, 0.95526316]),
 array([0.95584416, 0.96349413, 0.96315789]),
 array([0.95064935, 0.9517601 , 0.95131579]),
 array([0.94285714, 0.94002608, 0.95263158]),
 array([0.94285714, 0.93872229, 0.94078947])]

**Change the weight function used in prediction: (default='uniform')**

In [26]:
cv_scores = []
for n in ['uniform', 'distance']:
    cls=KNeighborsClassifier(n_neighbors=3, weights=n)
    cv_scores.append(cross_val_score(cls, X_train, y_train))
    
cv_scores

[array([0.95584416, 0.96349413, 0.96315789]),
 array([0.95844156, 0.96740548, 0.96447368])]

**Testing with test samples:**

In [27]:
cls=KNeighborsClassifier(n_neighbors=3, weights='distance')
cls.fit(X_train,y_train)
cls.score(X_test,y_test)

0.9617391304347827

# 4) Support Vector Machine:

In [6]:
from sklearn.svm import SVC

In [18]:
np.arange(-10,10)

array([-10,  -9,  -8,  -7,  -6,  -5,  -4,  -3,  -2,  -1,   0,   1,   2,
         3,   4,   5,   6,   7,   8,   9])

In [22]:
for a in np.arange(-7,-2):
    cls=SVC(kernel='rbf',gamma=(10.0**a))
    print(cross_val_score(cls, X_train, y_train))

[0.28555431 0.2765237  0.31638418]
[0.81858903 0.83408578 0.80451977]
[0.95856663 0.94808126 0.94011299]
[0.96976484 0.96726862 0.97062147]
[0.65845465 0.66139955 0.68135593]


**Change kernel type: [‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’]**

In [29]:
cv_scores = []
for n in ['linear','poly', 'rbf', 'sigmoid']:
    cls=SVC(kernel=n)
    cv_scores.append(cross_val_score(cls, X_train, y_train))
    
cv_scores

[array([0.98311688, 0.97653194, 0.97894737]),
 array([0.98441558, 0.97392438, 0.98157895]),
 array([0.25324675, 0.26205997, 0.26447368]),
 array([0.12597403, 0.12646675, 0.12763158])]

**Change parameters correspond to different kernel: gamma and degree. Both parameters don't seem to affect too much.**

In [30]:
cv_scores = []
for n in ['auto', 1/100, 1/50, 1/200]:
    cls=SVC(kernel='linear', gamma=n)
    cv_scores.append(cross_val_score(cls, X_train, y_train))
    
cv_scores

[array([0.98311688, 0.97653194, 0.97894737]),
 array([0.98311688, 0.97653194, 0.97894737]),
 array([0.98311688, 0.97653194, 0.97894737]),
 array([0.98311688, 0.97653194, 0.97894737])]

In [31]:
cv_scores = []
for n in [1, 2, 3, 4, 5]:
    cls=SVC(kernel='poly', degree=n)
    cv_scores.append(cross_val_score(cls, X_train, y_train))
    
cv_scores

[array([0.98311688, 0.97653194, 0.97763158]),
 array([0.98571429, 0.97653194, 0.98289474]),
 array([0.98441558, 0.97392438, 0.98157895]),
 array([0.98051948, 0.97392438, 0.98289474]),
 array([0.97922078, 0.97131682, 0.98421053])]

**Testing with test samples:**

In [32]:
cls=SVC(kernel='poly', degree=2)
cls.fit(X_train,y_train)
cls.score(X_test,y_test)

0.9860869565217392

# 5) Gaussian Naive Bayes:

In [33]:
from sklearn.naive_bayes import GaussianNB

In [34]:
cls=GaussianNB()
cls.fit(X_train,y_train)
cls.score(X_test,y_test)

0.9495652173913044

**It seems that GaussianNB is not doing super well. Is it because it assumes that the features are independent?**

# 6) Gradient Boost classifier:

In [35]:
from sklearn.ensemble import GradientBoostingClassifier

**Change learning_rate: default = 0.1**

In [36]:
cv_scores = []
for n in [0.05, 0.1, 0.5]:
    cls=GradientBoostingClassifier(learning_rate=n)
    cv_scores.append(cross_val_score(cls, X_train, y_train))
    
cv_scores

[array([0.97402597, 0.98305085, 0.97368421]),
 array([0.97792208, 0.98435463, 0.97631579]),
 array([0.97272727, 0.97653194, 0.96842105])]

**Change the number of boosting stages to perform: default n_estimators = 100**

In [37]:
cv_scores = []
for n in [20, 50, 120]:
    cls=GradientBoostingClassifier(learning_rate=0.1, n_estimators=n)
    cv_scores.append(cross_val_score(cls, X_train, y_train))
    
cv_scores

[array([0.95714286, 0.97131682, 0.97105263]),
 array([0.97272727, 0.98305085, 0.975     ]),
 array([0.98051948, 0.98435463, 0.975     ])]

**Change the maximum depth of the individual regression estimators. The maximum depth limits the number of nodes in the tree.**

In [38]:
cv_scores = []
for n in [1, 3, 5]:
    cls=GradientBoostingClassifier(learning_rate=0.2, n_estimators=75, max_depth=n)
    cv_scores.append(cross_val_score(cls, X_train, y_train))
    
cv_scores

[array([0.98311688, 0.98174707, 0.98026316]),
 array([0.97922078, 0.98305085, 0.97763158]),
 array([0.96883117, 0.96870926, 0.96973684])]

**Tesing with the test samples:**

In [39]:
cls=GradientBoostingClassifier(learning_rate=0.2, n_estimators=75, max_depth=3)
cls.fit(X_train,y_train)
cls.score(X_test,y_test)

0.9860869565217392

# 7) Neural Network Multi-layer Perceptron classifier:

In [40]:
from sklearn.neural_network import MLPClassifier

In [41]:
clf = MLPClassifier(activation='relu', solver='adam', alpha=1e-5, random_state=42)
cls.fit(X_train,y_train)
cls.score(X_test,y_test)

0.9860869565217392

In [42]:
y_pred=cls.predict(X_test)

confusion_matrix(y_test, y_pred)

array([[54,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0, 55,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0],
       [ 0,  0, 62,  0,  0,  0,  0,  0,  0,  0,  1,  0],
       [ 0,  0,  0, 90,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0, 27,  0,  0,  0,  0,  0,  0,  1],
       [ 0,  0,  0,  0,  0, 49,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0, 53,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0, 22,  0,  0,  0,  0],
       [ 0,  0,  0,  2,  0,  0,  0,  0, 24,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0, 42,  1,  0],
       [ 0,  0,  1,  0,  0,  0,  0,  0,  0,  1, 77,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 12]], dtype=int64)