# Supervised Machine Learning Pipeline - Multi-class Classification

# Overview

Use activity recognition test dataset to:
1. Perform multi-class classification of activity recognition tasks (6 classes) using 131 features in the time and frequency domain.
2. Compare linear classifiers using a machine learning pipeline.

Linear classifiers
- k nearest neighbors
- logistic regression
- SVM - linearSVC
- SVM - SVC which uses nonlinear SVM by default

# Import packages

In [63]:
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
import sklearn.datasets
import pandas as pd
import numpy as np

# Loading data

In [17]:
# load test set
testfile = r'//FS2.smpp.local\RTO\Inpatient Sensors -Stroke\Data analysis\Analysis_ActivityRecognition\testfeaturematrix.csv'
df = pd.read_csv(testfile)

In [70]:
df['task'].str.get_dummies().values.tolist()

[[1, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [0, 1, 0, 0,

In [85]:
# once I have X and y, modify below
X = df.drop(df.columns[0:8], axis=1)
# adjust X to drop 'target column'
#################################

# y = df[['task']]
# create dummy variables, did NOT drop one column
# y = pd.get_dummies(df[['task']])
df['target'] = df['task'].str.get_dummies().values.tolist()

# create dummy variables and drop first column since it can be implied
# y_dummy = pd.get_dummies(y, drop_first=True)
# y_dummy = pd.get_dummies(y)

# stratified sampling
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [86]:
df

Unnamed: 0,subject,date,test,task,trial,location,sensor,rawdata,meanX,meanY,...,meanpower_bin12_z,meanpower_bin13_z,meanpower_bin14_z,meanpower_bin15_z,meanpower_bin16_z,meanpower_bin17_z,meanpower_bin18_z,meanpower_bin19_z,meanpower_bin20_z,target
0,HC02,temp date,activity recognition,LYING,0,tibialis_anterior_left,accel,Accel X (g) Accel Y (g) Acce...,-0.049779,-0.901753,...,1.571855e-06,1.570076e-06,1.759060e-06,1.658028e-06,1.511982e-06,1.167052e-06,1.633379e-06,1.997187e-06,1.479819e-06,"[1, 0, 0, 0, 0, 0]"
1,HC02,temp date,activity recognition,LYING,0,gastrocnemius_right,accel,Accel X (g) Accel Y (g) Acce...,-0.102802,-0.829551,...,2.090799e-06,1.601778e-06,1.505428e-06,1.082194e-06,1.385893e-06,1.225574e-06,1.330999e-06,1.440693e-06,1.559841e-06,"[1, 0, 0, 0, 0, 0]"
2,HC02,temp date,activity recognition,LYING,0,sacrum,accel,Accel X (g) Accel Y (g) Acce...,-0.058348,0.183076,...,2.174058e-08,2.343769e-08,1.544247e-08,1.614271e-08,1.672363e-08,2.054926e-08,1.836407e-08,1.183496e-08,1.365652e-08,"[1, 0, 0, 0, 0, 0]"
3,HC02,temp date,activity recognition,LYING,0,distal_lateral_shank_right,accel,Accel X (g) Accel Y (g) Acce...,0.105776,0.906628,...,2.416130e-08,2.409428e-08,3.310691e-08,3.517773e-08,2.358800e-08,3.311520e-08,2.889551e-08,2.031967e-08,2.555851e-08,"[1, 0, 0, 0, 0, 0]"
4,HC02,temp date,activity recognition,LYING,0,tibialis_anterior_right,accel,Accel X (g) Accel Y (g) Acce...,-0.198152,0.909473,...,1.362341e-06,2.060577e-06,2.465715e-06,1.308655e-06,1.561659e-06,1.610429e-06,1.532966e-06,1.301695e-06,1.341989e-06,"[1, 0, 0, 0, 0, 0]"
5,HC02,temp date,activity recognition,LYING,0,posterior_forearm_right,accel,Accel X (g) Accel Y (g) Acce...,0.066610,-0.720693,...,5.722675e-08,7.993909e-08,1.099146e-07,1.722232e-07,2.150785e-07,2.915800e-07,4.447235e-07,4.919889e-07,5.704279e-07,"[1, 0, 0, 0, 0, 0]"
6,HC02,temp date,activity recognition,LYING,0,bicep_right,accel,Accel X (g) Accel Y (g) Acce...,-0.389766,-0.169622,...,1.640974e-06,1.355730e-06,2.136291e-06,2.489741e-06,1.838155e-06,2.137969e-06,1.935105e-06,3.030136e-06,4.187565e-06,"[1, 0, 0, 0, 0, 0]"
7,HC02,temp date,activity recognition,LYING,0,rectus_femoris_left,accel,Accel X (g) Accel Y (g) Acce...,-0.194103,-0.182047,...,1.452148e-06,1.247615e-06,2.470176e-06,1.506914e-06,2.123494e-06,1.861959e-06,1.387050e-06,1.561775e-06,1.637523e-06,"[1, 0, 0, 0, 0, 0]"
8,HC02,temp date,activity recognition,LYING,0,biceps_femoris_right,accel,Accel X (g) Accel Y (g) Acce...,0.109888,0.013114,...,1.453238e-06,1.725766e-06,1.397455e-06,1.249781e-06,1.522370e-06,1.580323e-06,1.036328e-06,1.347126e-06,1.226288e-06,"[1, 0, 0, 0, 0, 0]"
9,HC02,temp date,activity recognition,LYING,0,posterior_forearm_left,accel,Accel X (g) Accel Y (g) Acce...,-0.081090,0.903993,...,9.869932e-08,8.375677e-08,5.267274e-08,7.915480e-08,4.847091e-08,1.185819e-07,1.436963e-07,1.849029e-07,1.820151e-07,"[1, 0, 0, 0, 0, 0]"


## Dealing with categorical features:
- scikit-learn: OneHotEncoder()
- pandas: get_dummies()

In [None]:
OneHotEncoder()
LabelEncoder()

In [46]:
# one-hot encoding

# create dummy variables and drop first column since it can be implied
y_dummy = pd.get_dummies(y, drop_first=True)
print(y_dummy.head(3))


   task_LYING  task_SITTING  task_STAIRS DOWN  task_STAIRS UP  task_STANDING
0           1             0                 0               0              0
1           1             0                 0               0              0
2           1             0                 0               0              0


# Exploratory data analysis (EDA)

In [52]:
# check dimensions
print('AR data dimensions: ', X.shape)
print('AR target dimensions: ', y.shape)

AR data dimensions:  (242, 131)
AR target dimensions:  (242, 6)


In [53]:
# check dimensions
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(181, 131) (181, 6) (61, 131) (61, 6)


# 1. k nearest neighbors (knn)

In [72]:
from sklearn.neighbors import KNeighborsClassifier

# Create and fit the model with default hyperparameters
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [73]:
############# reference code for StandardScaler
# CV and scaling in a pipeline using Normalization
steps = [('scaler', StandardScaler()),
         ('svc', LinearSVC())]
pipeline = Pipeline(steps)

# Specify hyperparameter space using a dictionary
parameters = {'svc__C':[0.1, 1, 10]}

X_train_svc, X_test_svc, y_train_svc, y_test_svc = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

cv = GridSearchCV(pipeline, param_grid=parameters)
cv.fit(X_train_svc, y_train_svc)
y_pred_svc = cv.predict(X_test_svc)

# Compute and print metrics
# print best parameters
print("Tuned Model Parameters: {}".format(cv.best_params_))
print("LinearSVC training accuracy:", linearsvm.score(X_train, y_train))
print("Test Accuracy: {}".format(cv.score(X_test_svc, y_test_svc)))
print(classification_report(y_test_svc, y_pred_svc))
print("Confusion matrix:\n", confusion_matrix(y_test_svc, y_pred_svc))

ValueError: bad input shape (128, 6)

In [55]:
knn.score(X_test, y_test)

0.4426229508196721

In [56]:
# Predict on the test features, print the results
pred = knn.predict(X_test)[0]
# Prediction for test example 0: 1.0
print("Prediction for test example 0:", pred)

Prediction for test example 0: [0 1 0 0 0 0]


In [57]:
# test k=6 with parameter n_neighbors
knn = KNeighborsClassifier(n_neighbors=6)
knn.fit(X_train, y_train)
knn.score(X_test, y_test)

0.32786885245901637

In [60]:
# hyperparameter tuning for n_neighbors
param_grid = {'n_neighbors': np.arange(1, 30)}
knn = KNeighborsClassifier()
# args: model, grid, number of folds for cross validation
knn_cv = GridSearchCV(knn, param_grid, cv=5)
knn_cv.fit(X_train, y_train)

# Print the tuned parameters and score
print("Tuned KNN Parameters: {}".format(knn_cv.best_params_)) 
print("Best score is {}".format(knn_cv.best_score_))

print("knn training accuracy:", knn_cv.score(X_train, y_train))
print("knn test accuracy    :", knn_cv.score(X_test, y_test))

Tuned KNN Parameters: {'n_neighbors': 1}
Best score is 0.6906077348066298
knn training accuracy: 1.0
knn test accuracy    : 0.6721311475409836


## knn Overfit
- add scaling

# 2. Logistic Regression - multi-class
Key hyperparameters:
- C (inverse regularization strength)
- penalty (type of regularization - L1 and L2)
- multi_class (type of multi-class)

## 2.1 One-vs-Rest

In [74]:
# LogisticRegression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train, y_train)
# lr.predict(X_test)
# lr.score(X_test, y_test)

ValueError: bad input shape (181, 6)

In [None]:
lr.score(X_test, y_test)

In [67]:
# Fit one-vs-rest logistic regression classifier
# lr_ovr = LogisticRegression()
lr_ovr = OneVsRestClassifier(LogisticRegression()) 
lr_ovr.fit(X_train, y_train)

print("OVR training accuracy:", lr_ovr.score(X_train, y_train))
print("OVR test accuracy    :", lr_ovr.score(X_test, y_test))

OVR training accuracy: 0.850828729281768
OVR test accuracy    : 0.5081967213114754


In [65]:
y_pred_lr_ovr = lr_ovr.predict(X_test)
print(classification_report(y_test, y_pred_lr_ovr))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_lr_ovr))

             precision    recall  f1-score   support

          0       0.50      0.50      0.50         6
          1       0.57      0.36      0.44        11
          2       0.67      0.40      0.50         5
          3       0.50      0.17      0.25         6
          4       0.75      0.82      0.78        22
          5       0.62      0.73      0.67        11

avg / total       0.64      0.59      0.60        61



ValueError: multilabel-indicator is not supported

In [None]:
# confidence intervals
lr.predict_proba(X_test[:1])
# array([[ 9.951e-01, 4.357e-03, 5.339e-04]])

## 2.2 Softmax/Multinomial/Cross-Entropy Loss

In [66]:
lr_mn = LogisticRegression(multi_class="multinomial",solver="lbfgs")
lr_mn.fit(X_train, y_train)

print("Softmax training accuracy:", lr_mn.score(X_train, y_train))
print("Softmax test accuracy    :", lr_mn.score(X_test, y_test))

ValueError: bad input shape (181, 6)

In [None]:
y_pred_lr_mn = lr_mn.predict(X_test)
print(classification_report(y_test, y_pred_lr_mn))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_lr_mn))

## Optional: L1 regularization

In [None]:
# Specify L1 regularization
lr = LogisticRegression()#penalty='l1')

# Instantiate the GridSearchCV object and run the search
searcher = GridSearchCV(lr, {'C':[0.001, 0.01, 0.1, 1, 10, 100]})
searcher.fit(X_train, y_train)

# Report the best parameters
print("Best CV params", searcher.best_params_)

# Find the number of nonzero coefficients (selected features)
best_lr = searcher.best_estimator_
coefs = best_lr.coef_
print("Total number of features:", coefs.size)
print("Number of selected features:", np.count_nonzero(coefs))

# with l1 reg - C=1
# without l1 reg - C=10

# 3. LinearSVC for SVM

In [68]:
# LinearSVC
import sklearn.datasets
wine = sklearn.datasets.load_wine()
from sklearn.svm import LinearSVC

# svm.ft(wine.data, wine.target)
# svm.score(wine.data, wine.target)

In [None]:
linearsvm = LinearSVC()
linearsvm.fit(X_train, y_train)

In [None]:
print("LinearSVC training accuracy:", linearsvm.score(X_train, y_train))
print("LinearSVC test accuracy    :", linearsvm.score(X_test, y_test))

In [None]:
LinearSVC().get_params().keys()

In [None]:
# CV and scaling in a pipeline using Normalization
steps = [('scaler', StandardScaler()),
         ('svc', LinearSVC())]
pipeline = Pipeline(steps)

# Specify hyperparameter space using a dictionary
parameters = {'svc__C':[0.1, 1, 10]}

X_train_svc, X_test_svc, y_train_svc, y_test_svc = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

cv = GridSearchCV(pipeline, param_grid=parameters)
cv.fit(X_train_svc, y_train_svc)
y_pred_svc = cv.predict(X_test_svc)

# Compute and print metrics
# print best parameters
print("Tuned Model Parameters: {}".format(cv.best_params_))
print("LinearSVC training accuracy:", linearsvm.score(X_train, y_train))
print("Test Accuracy: {}".format(cv.score(X_test_svc, y_test_svc)))
print(classification_report(y_test_svc, y_pred_svc))
print("Confusion matrix:\n", confusion_matrix(y_test_svc, y_pred_svc))

# 4. SVC - default nonlinear SVM

In [None]:
# SVC
import sklearn.datasets
wine = sklearn.datasets.load_wine()
from sklearn.svm import SVC
svm = SVC()
# svm.fit(wine.data, wine.target)
# svm.score(wine.data, wine.target)

In [None]:
svm.fit(X_train, y_train)

In [None]:
print("SVC training accuracy:", svm.score(X_train, y_train))
print("SVC test accuracy    :", svm.score(X_test, y_test))

# overfit model

## 4.1 SVC: Tune hyperparameters to improve test accuracy

In [None]:

# Instantiate an RBF SVM
svm = SVC()

# Instantiate the GridSearchCV object and run the search
parameters = {'C':[0.1, 1, 10], 'gamma':[0.00001, 0.0001, 0.001, 0.01, 0.1]}
searcher = GridSearchCV(svm, parameters)
searcher.fit(X_train,y_train)

# Report the best parameters and the corresponding score
print("Best CV params", searcher.best_params_)
print("Best CV accuracy", searcher.best_score_)

# Report the test accuracy using these best parameters
print("Train accuracy of best grid search hypers:", 
      searcher.score(X_train,y_train))
# Report the test accuracy using these best parameters
print("Test accuracy of best grid search hypers:", 
      searcher.score(X_test, y_test))

## 4.2 SVC: normalize data, tune hyperparameters and check final result

In [None]:
# CV and scaling in a pipeline
steps = [('scaler', StandardScaler()),
         ('svm', SVC())]
pipeline = Pipeline(steps)

# Specify hyperparameter space using a dictionary
parameters = {'svm__C':[0.1, 1, 10],
              'svm__gamma':[0.00001, 0.0001, 0.001, 0.01, 0.1]}

X_train_svm, X_test_svm, y_train_svm, y_test_svm = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

cv = GridSearchCV(pipeline, param_grid=parameters)
cv.fit(X_train_svm, y_train_svm)
y_pred_svm = cv.predict(X_test_svm)

# Compute and print metrics
# print best parameters
print("Tuned Model Parameters: {}".format(cv.best_params_))
print("Test Accuracy: {}".format(cv.score(X_test_svm, y_test_svm)))
print(classification_report(y_test_svm, y_pred_svm))
print("Confusion matrix:\n", confusion_matrix(y_test_svm, y_pred_svm))

# 5. SGDClassifier (SGD=stochastic gradient descent)
- scales better to larger data sets
- specify loss
- hyperparameter 'alpha' is like '1/C'
    - bigger alpha > more regularization

In [None]:
# Review: SGDClassfier for logreg vs SVM
from skelearn.linear_model import SGDClassifier
logreg = SGDClassifier(loss='log')
linsvm = SGDClassifier(loss='hinge')

In [None]:
from sklearn.linear_model import SGDClassifier

# We set random_state=0 for reproducibility 
linear_classifier = SGDClassifier(random_state=0)

# Instantiate the GridSearchCV object and run the search
# Search over the regularization strength, the hinge vs. log losses,
# and L1 vs. L2 regularization.
parameters = {'alpha':[0.00001, 0.0001, 0.001, 0.01, 0.1, 1], 
             'loss':['hinge','log'], 'penalty':['l1','l2']}
searcher = GridSearchCV(linear_classifier, parameters, cv=10)
searcher.fit(X_train, y_train)

# Report the best parameters and the corresponding train and test scores
print("Best CV params", searcher.best_params_)
print("Best CV accuracy", searcher.best_score_)
print("Train accuracy of best grid search hypers:", 
      searcher.score(X_train,y_train))
print("Test accuracy of best grid search hypers:", 
      searcher.score(X_test, y_test))

# Decision Tree models - RandomForest

# 

# 

# 

# 