In [None]:
import os, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Check if extracted data folder exists
if(not os.path.exists('SYSC4415W23_A3_dataset')):

  # Download and extract the dataset if the zip file does not exist
  if (not os.path.isfile('SYSC4415W23_A3_dataset.zip')):
    !wget https://github.com/jrgreen7/SYSC4906/releases/download/Assignment3/SYSC4415W23_A3_dataset.zip
    !unzip SYSC4415W23_A3_dataset.zip

datasetPath = r"SYSC4415W23_A3_dataset"

In [None]:
# Load the training labels and the extracted features, key each table on
# sample_id and order the rows by that key so both tables share a row order.
train_labels = (
    pd.read_csv(f'{datasetPath}/train/labels.csv')
    .set_index(keys='sample_id')
    .sort_values(by=['sample_id'])
)
train_features = (
    pd.read_csv(f'{datasetPath}/train/extracted_features.csv')
    .set_index(keys='sample_id')
    .sort_values(by=['sample_id'])
)

print(f'Shape of original features: {train_features.shape}')

In [None]:
# Data visualization: bar chart of the number of training samples per class.

# Display names for the classes, listed in ascending label order.
# NOTE(review): assumes the sorted label values correspond to these activities
# in exactly this order -- confirm against the dataset description.
keys = ['Normal Walk', 'Fast Walk', 'Ascent', 'Descent', 'Jumping Jacks']

# value_counts() orders by frequency, so re-order by label value to pair each
# count with its class name deterministically (instead of indexing counts[i]).
counts = train_labels['label'].value_counts().sort_index()
if len(counts) != len(keys):
    raise ValueError(f'Expected {len(keys)} classes but found {len(counts)}: {counts.index.tolist()}')
vals = counts.tolist()

fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(x=keys, height=vals, width=0.5)
ax.set_xlabel("Data Label")
ax.set_ylabel("Num samples")
ax.set_title("Number of each type of training sample")
plt.show()

# Author's observation from the chart: no class imbalance

### Feature Selection 
Feature selection combines variance thresholding, dropping any features that contain NaN values, and univariate statistics to determine the ***__top 50% of features__*** that are most useful for classification.

In [None]:
# Feature selection in three passes: drop columns containing NaN, drop
# low-variance columns, then keep the k columns with the best ANOVA F-score.
# NOTE(review): the F-score ranking below is fitted on every labelled row,
# including rows that are later held out for testing, so the held-out accuracy
# may be optimistic. Consider fitting the selectors on the training split only.
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif

# Drop columns with nan values
toDrop = train_features.columns[train_features.isnull().any()].tolist()
train_selected = train_features.drop(toDrop, axis=1)
print(f'Number of reduced features from removing NaN values: {train_selected.shape}')

# Drop columns with very low variance
varThreshold = 0.95 * (1 - 0.95)  # evaluates to 0.0475
sel = VarianceThreshold(threshold=varThreshold)
sel.fit(train_selected)
# get_support() is a boolean mask over the columns the selector keeps.
train_selected = train_selected.loc[:, sel.get_support()]
print(f'Shape after dropping columns with variance at or below {varThreshold:.4f}: {train_selected.shape}')

# Using top k features
numFeatures = 1250
# Never ask for more features than are left after the previous passes.
kBest = min(numFeatures, train_selected.shape[1])
nextSel = SelectKBest(score_func=f_classif, k=kBest)
# Rows are matched positionally, relying on features and labels both being
# sorted by sample_id when they were loaded.
nextSel.fit(train_selected, train_labels['label'])
# Copy so that columns can be added to `dataset` without touching train_selected.
dataset = train_selected.loc[:, nextSel.get_support()].copy()
print(f'Feature data shape after SelectKBest using f-value stats: {dataset.shape}')

# Free up memory for next tasks
del toDrop, sel, nextSel, varThreshold, kBest


#### Data Loading and organization

In [None]:
# Split the labelled data into a training part and a held-out part.
tr_split = 1300 # ~80% of labelled data

# Attach the class label as the last column; assignment aligns on the
# sample_id index shared by both frames.
dataset['label'] = train_labels['label']
# Shuffle rows before breaking into sets. The fixed seed makes the split
# reproducible after a kernel restart.
dataset = dataset.sample(frac=1, random_state=42)

print(dataset.shape, '- Dataset shape prior to training')

# Fail early if the split point would leave one of the two sets empty.
assert 0 < tr_split < len(dataset), f'tr_split={tr_split} must be between 1 and {len(dataset) - 1}'

trainSet = dataset.iloc[:tr_split] # first tr_split rows; selected features + label
testSet = dataset.iloc[tr_split:]  # remaining rows; selected features + label

### Building model
**Model**

Samples are classified with an Extremely Randomized Trees (Extra-Trees) classifier, an ensemble learning method akin to Random Forests, which determines the class label of each example.

**Hyperparameters** 
- Number of trees in Forest: 1000
- Input size: 1250
- Output size: 5 (1 per class label)

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
import time

# Extra-Trees ensemble of 1000 trees; max_features=None lets every split
# consider all features. random_state pins the tree randomisation so the
# reported accuracies are reproducible between runs.
eTree = ExtraTreesClassifier(n_estimators=1000, min_samples_split=2, max_features=None, n_jobs=10,
                             criterion='log_loss', random_state=42)

trainData = trainSet.iloc[:, :-1] # every column except the last (the features)
trainLabels = trainSet.iloc[:,-1] # last column (the class label)

# perf_counter() is a monotonic clock, so the measured duration is not
# affected by system clock adjustments.
start = time.perf_counter()
eTree.fit(trainData, trainLabels)
totalTime = time.perf_counter() - start
print(f'ETrees Training time: {round(totalTime, ndigits=2)} s')

In [None]:
from sklearn.metrics import accuracy_score

# Split the held-out rows into targets (final column) and inputs (all other columns).
testLabels = testSet.iloc[:, -1]
testData = testSet.iloc[:, :-1]

# Extra Trees: predict both splits, keeping each row's index on its prediction.
trainLabelsEx_ert = pd.Series(eTree.predict(trainData), index=trainData.index)
testLabelsEx_ert = pd.Series(eTree.predict(testData), index=testData.index)

# Fraction of correctly classified samples in each split.
tr_score = accuracy_score(y_true=trainLabels, y_pred=trainLabelsEx_ert)
te_score = accuracy_score(y_true=testLabels, y_pred=testLabelsEx_ert)

for splitName, splitScore in (("Train", tr_score), ("Test", te_score)):
    print(f"{splitName} accuracy ERT: {round(splitScore*100, 2)}%")

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the eTree configuration on the training split.
# cross_val_score fits fresh clones of the estimator on each fold, so running
# it on the held-out test rows (as before) trained small models on test data
# instead of validating the model; the test split stays untouched here.
score = cross_val_score(eTree, X=trainData, y=trainLabels, cv=5, n_jobs=10, scoring='accuracy')
print("5-fold Cross validation score on eTree model")
print(f"Score: {score.mean(): .2f}\t[± {score.std(): .2f}]")