In [39]:
import os, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [66]:
# Check if extracted data folder exists
usingColab = 0
if usingColab:
  if(not os.path.exists('SYSC4415W23_A3_dataset')):

    # Download and extract the dataset if the zip file does not exist
    if (not os.path.isfile('SYSC4415W23_A3_dataset.zip')):
      !wget https://github.com/jrgreen7/SYSC4906/releases/download/Assignment3/SYSC4415W23_A3_dataset.zip
      !unzip SYSC4415W23_A3_dataset.zip

  datasetPath = r"SYSC4415W23_A3_dataset"

else:
  datasetPath = r"C:/Users/googl/Documents/ML/A3_Dataset/SYSC4415W23_A3_dataset"

In [77]:
train_labels = pd.read_csv(f'{datasetPath}/train/labels.csv')
train_features = pd.read_csv(f'{datasetPath}/train/extracted_features.csv')

train_features.set_index(keys='sample_id', inplace=True)
train_features.sort_values(by=['sample_id'], inplace=True)
train_labels.set_index(keys='sample_id', inplace=True)
train_labels.sort_values(by=['sample_id'], inplace=True)

print(f'Shape of original features: {train_features.shape}')

Shape of original features: (1621, 7047)


In [None]:
# Data visualization
counts = train_labels.value_counts(subset=['label'])
keys = ['Normal Walk', 'Fast Walk', 'Ascent', 'Descent', 'Jumping Jacks']
vals = [counts[0],counts[1], counts[2], counts[3], counts[4]]

fig = plt.figure(figsize=(10,5))
plt.bar(x=keys, height=vals, width=0.5)
plt.xlabel("Data Label")
plt.ylabel("Num samples")
plt.title("Number of each type of training sample")
plt.show()

# No class imbalance

### Feature Selection 
Using variance thresholding, dropping any features with NaN values and using univariate stats to determince the ***__top 50% of features__*** for classification to determine which features are useful.

In [78]:
# Remove features that have very low variance
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif

# Drop columns with nan values
toDrop = train_features.columns[train_features.isnull().any()].tolist()
train_selected = train_features.drop(toDrop, axis=1)
print(f'Number of reduced features from removing NaN values: {train_selected.shape}')

# Drop columns with very low variance
sel = VarianceThreshold(threshold=(0.95 * (1 - 0.95)))
sel.fit_transform(train_selected)
cols = [column for column in train_selected.columns 
          if column not in train_selected.columns[sel.get_support()]]
train_selected.drop(columns=cols, inplace=True)
print(f'Shape after dropping columns with variance lower than .9525: {train_selected.shape}')

# # Find indexes of columns with correlation greater than 0.8
# corr_matrix = train_selected.corr().abs()
# upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
# to_drop = []

# to_drop = [column for column in upper.columns if any(upper[column] > 0.8)]
# train_selected.drop(columns=to_drop, inplace=True)
# print(f'Number of features after removing columns with high correlation: {train_selected.shape}')

# Using top 1750 features
numFeatures = 2000
nextSel = SelectKBest(score_func=f_classif, k=numFeatures)
nextSel.fit_transform(train_selected, train_labels['label'])
cols = [column for column in train_selected.columns if column not in train_selected.columns[nextSel.get_support()]]
dataset = train_selected.drop(columns=cols)
print(f'Feature data shape after SelectKBest using f-value stats: {dataset.shape}')

# Free up memory for next tasks
del toDrop, cols, sel, nextSel


Number of reduced features from removing NaN values: (1621, 6993)
Shape after dropping columns with variance lower than .9525: (1621, 5402)
Feature data shape after SelectKBest using f-value stats: (1621, 2000)


#### Data Loading and organization

In [79]:
# from torch.utils.data import DataLoader
batchSize = 32
tr_split = 1300 # ~80% of labelled data
te_split = 321  # ~10% of labelled data

dataset['label'] = train_labels['label']
dataset = dataset.sample(frac=1) # Shuffle rows before breaking into sets

print(dataset.shape, '- Dataset shape prior to training')

trainSet = dataset.iloc[:tr_split] # Shape: 1400, numFeatures
testSet = dataset.iloc[tr_split:]  # Shape: 221, numFeatures

(1621, 2001) - Dataset shape prior to training


### Building model
**Hyperparameters**: 
- Learning rate: Using effective LR calculated from base LR using:
$$\eta_{eff} = \frac{B\eta_{base}}{256}$$
- Input size: 2700
- Output size: 5 (1 per class label)
- Hiddel layers: 1
- Nodes/hidden layer: 1800 (2/3 of input size, since input size > num samples)

In [None]:
from sklearn.neural_network import MLPClassifier
import time
lr_base = 0.0001
maxIterations = 1500

lr = (batchSize*lr_base)/256
MLP = MLPClassifier((1000,), 'relu', solver='adam', learning_rate_init=lr, max_iter=maxIterations, batch_size=batchSize, tol=0.00001)
trainData = trainSet.iloc[:, :-1] # 1300, 1500
trainLabels = trainSet.iloc[:,-1] # 1300, 1

start = time.time()
# MLP.fit(trainData, trainLabels)
totalTime = time.time() - start
print(f'Training time: {round(totalTime, ndigits=2)} s')

In [None]:
# losses = MLP.loss_curve_
# valScores = MLP.validation_scores_
# iterations = np.linspace(0, MLP.n_iter_, len(losses))

# print(f'Number of iterations: {MLP.n_iter_}')
# figs, ax = plt.subplots(nrows=1, ncols=1)
# ax.plot(iterations, losses)
# ax.set_title('Training losses'); ax.set_xlabel('Iteration'); ax.set_ylabel('Loss')

In [101]:
from sklearn.ensemble import ExtraTreesClassifier
import sklearn.ensemble as en
import time

eTree = ExtraTreesClassifier(n_estimators=900, min_samples_split=2, max_features=None, n_jobs=10, criterion='entropy')

trainData = trainSet.iloc[:, :-1] # 1400, 1700
trainLabels = trainSet.iloc[:,-1] # 1400, 1

start = time.time()
eTree.fit(trainData, trainLabels)
totalTime = time.time() - start
print(f'ETrees Training time: {round(totalTime, ndigits=2)} s')

ETrees Training time: 24.76 s


In [103]:
from sklearn.metrics import accuracy_score

testCols = testSet.columns
for col in trainSet.columns:
  if col not in testCols:
    testSet.drop(columns=col)

testData = testSet.iloc[:, :-1] # 321, 1500
testLabels = testSet.iloc[:,-1] # 321, 1

# Extra Trees
trainLabelsEx_ert = pd.Series(data=eTree.predict(X=trainData), index=trainData.index) # output 1400, 1
testLabelsEx_ert = pd.Series(data=eTree.predict(X=testData), index=testData.index)    # output 221, 1

tr_score = accuracy_score(trainLabels, trainLabelsEx_ert)
te_score = accuracy_score(testLabels, testLabelsEx_ert)

print(f"Train accuracy ERT: {round(tr_score*100, 2)}%")
print(f"Test accuracy ERT: {round(te_score*100, 2)}%")

Train accuracy ERT: 100.0%
Test accuracy ERT: 77.26%


In [100]:
from sklearn.model_selection import cross_val_score

score = cross_val_score(eTree, X=testData, y=testLabels, cv=5, n_jobs=10, scoring='accuracy')
print("5-fold Cross validation score on eTree model")
print(f"Score: {score.mean(): .2f}\t[± {score.std(): .2f}]")

5-fold Cross validation score on eTree model
Score:  0.63	[±  0.08]
