In [None]:
import os, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Loading Data

Download the zip file from GitHub and unzip it into the mounted location. Once unzipped, read the data files and load them into dataframes.
Plot a bar chart to show label occurrences.

In [None]:
# Check if extracted data folder exists
usingColab = False
if usingColab:
  if(not os.path.exists('SYSC4415W23_A3_dataset')):

    # Download and extract the dataset if the zip file does not exist
    if (not os.path.isfile('SYSC4415W23_A3_dataset.zip')):
      !wget https://github.com/jrgreen7/SYSC4906/releases/download/Assignment3/SYSC4415W23_A3_dataset.zip
      !unzip SYSC4415W23_A3_dataset.zip


In [None]:
# Original training data: labels and extracted features, each indexed by
# sample_id and ordered by that index.
train_labels = (
    pd.read_csv('SYSC4415W23_A3_dataset/train/labels.csv')
    .set_index(keys='sample_id')
    .sort_values(by=['sample_id'])
)
train_features = (
    pd.read_csv('SYSC4415W23_A3_dataset/train/extracted_features.csv')
    .set_index(keys='sample_id')
    .sort_values(by=['sample_id'])
)

print(f'Shape of original features: {train_features.shape}')

# Evaluation test data, prepared the same way as the training data.
test_features = (
    pd.read_csv('SYSC4415W23_A3_dataset/test/extracted_features.csv')
    .set_index('sample_id')
    .sort_values(by=['sample_id'])
)
test_labels = (
    pd.read_csv('SYSC4415W23_A3_dataset/test/labels.csv')
    .set_index('sample_id')
    .sort_values(by=['sample_id'])
)

In [None]:
# Data visualization: bar chart of how many training samples carry each label
counts = train_labels.value_counts(subset=['label'])
keys = ['Normal Walk', 'Fast Walk', 'Ascent', 'Descent', 'Jumping Jacks']
# One bar height per display name, looked up as counts[0] .. counts[4].
# NOTE(review): this presumes label values 0-4 map to the names in `keys`
# in this order - confirm against the dataset description.
vals = [counts[i] for i in range(len(keys))]

fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(x=keys, height=vals, width=0.5)
ax.set_xlabel("Data Label")
ax.set_ylabel("Num samples")
ax.set_title("Number of each type of training sample")
plt.show()

# No class imbalance

### Feature Selection 
Using variance thresholding, dropping any features with NaN values, and using univariate statistics to determine the ***__top 50% of features__*** that are useful for classification.

In [None]:
# Remove features that have very low variance
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif

# Feature Selection, done as a function to accomodate different features and methods needed by different models
def trimDataset(numFeatures: int, useCorr=False) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Select a reduced feature set from the notebook's training/test features.

    Reads the notebook-level ``train_features``, ``test_features`` and
    ``train_labels`` frames (they are not modified) and applies, in order:

    1. drop every column that has a NaN in ``train_features``;
    2. drop columns whose training variance does not exceed
       ``0.95 * (1 - 0.95)`` (``VarianceThreshold``);
    3. optionally drop one column of every pair whose absolute training
       correlation is above 0.8;
    4. keep the ``numFeatures`` best columns by ANOVA F-value
       (``SelectKBest`` with ``f_classif``) against ``train_labels['label']``.

    Every selection is fitted on the training data only, and the same columns
    are then removed from the test data.

    Args:
        numFeatures: number of columns to keep in the final SelectKBest step.
        useCorr: when True, also run the correlation filter (step 3).

    Returns:
        ``(train_selected, test_selected)``: new frames holding the selected
        columns for the training and test features respectively.
    """
    # Drop columns with nan values (decided on the training data only)
    nan_cols = train_features.columns[train_features.isnull().any()]
    train_selected = train_features.drop(columns=nan_cols)
    test_selected = test_features.drop(columns=nan_cols)

    # Drop columns with very low variance
    variance_sel = VarianceThreshold(threshold=(0.95 * (1 - 0.95)))
    variance_sel.fit(train_selected)
    # get_support() is a boolean mask of kept columns; invert it once instead
    # of rebuilding the kept-column index for every column
    low_variance = train_selected.columns[~variance_sel.get_support()]
    train_selected = train_selected.drop(columns=low_variance)
    test_selected = test_selected.drop(columns=low_variance)

    if useCorr:
        # Compute the absolute correlation matrix and keep only its strict
        # upper triangle so each feature pair is considered once
        corr_matrix = train_selected.corr().abs()
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
        # Drop every column correlated above 0.8 with an earlier column
        correlated = [column for column in upper.columns if (upper[column] > 0.8).any()]
        train_selected = train_selected.drop(columns=correlated)
        test_selected = test_selected.drop(columns=correlated)

    # Keep the numFeatures best features according to the ANOVA F-value
    kbest_sel = SelectKBest(score_func=f_classif, k=numFeatures)
    kbest_sel.fit(train_selected, train_labels['label'])
    not_selected = train_selected.columns[~kbest_sel.get_support()]
    train_selected = train_selected.drop(columns=not_selected)
    test_selected = test_selected.drop(columns=not_selected)

    print(f'Feature data shape after SelectKBest using f-value stats: {train_selected.shape}')

    return train_selected, test_selected

# Extra Trees Model
**Model**

Using an Extremely Randomized Trees approach to classify samples: an ensemble learning method, akin to Random Forests, is used to determine class labels for examples.

### Training

**Hyperparameters** 

The following hyperparameters were determined experimentally using a grid search method 
- Number of trees in Forest: 1000
- Loss Criterion: Log Loss
- Input size: 1250
- Output size: 5

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier
import time

# Dataset Preparation
dataset, evalSet = trimDataset(1250, False)
dataset = dataset.sort_values(by=['sample_id'])
evalSet = evalSet.sort_values(by=['sample_id'])

# Labels aligned row-for-row with the feature frame via the sample_id index.
# They are kept out of `dataset` so the target is never used as an input feature.
datasetLabels = train_labels.loc[dataset.index, 'label']

print(dataset.shape, ' --> Dataset shape prior to training')

# train_test_split returns (X_train, X_test, y_train, y_test) in that order.
# Features and labels are shuffled together here, so they stay aligned.
trainData, testData, trainLabels, testLabels = train_test_split(
    dataset, datasetLabels, shuffle=True, test_size=0.2, random_state=42)

# Model Training (seeded so repeated runs build the same forest)
eTree = ExtraTreesClassifier(n_estimators=1000, min_samples_split=2, max_features=None,
                             n_jobs=10, criterion='log_loss', random_state=42)

start = time.time()
eTree.fit(trainData, trainLabels)
totalTime = time.time() - start
print(f'ETrees Training time: {round(totalTime, ndigits=2)} s')

### Testing

Testing the model using the 20% of the original dataset that was held out for validation.

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# Extra Trees: predict on both splits, keeping each split's index on the result
trainLabelsEx = pd.Series(eTree.predict(trainData), index=trainData.index)
testLabelsEx = pd.Series(eTree.predict(testData), index=testData.index)

# Accuracy of the fitted model on the data it saw and on the holdout
tr_score = accuracy_score(y_true=trainLabels, y_pred=trainLabelsEx)
te_score = accuracy_score(y_true=testLabels, y_pred=testLabelsEx)

print(f"Train accuracy ETrees: {round(tr_score*100, 2)}%")
print(f"Test accuracy ETrees:  {round(te_score*100, 2)}%")

# NOTE(review): cross-validation is run on the holdout split only
# (testData/testLabels), not on the full dataset - confirm this is intended.
score = cross_val_score(
    eTree,
    X=testData,
    y=testLabels,
    scoring='accuracy',
    cv=5,
    n_jobs=10,
)
print("5-fold Cross validation score on eTree model")
print(f"Score: {score.mean(): .2f}\t[± {score.std(): .2f}]")

# SVM Classifier

In [None]:
from sklearn.model_selection import train_test_split

# Keep the 1750 best features (no correlation filter), then hold out 20% for validation
dataset, evalSet = trimDataset(numFeatures=1750, useCorr=False)
trainX, testX, trainY, testY = train_test_split(
    dataset,
    train_labels,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

from sklearn import svm
svm_model = svm.SVC(C=1.0, kernel='linear')

# trainY is a DataFrame; flatten its values to the 1-D array passed to fit
svm_model.fit(trainX, trainY.values.ravel())

In [None]:
from sklearn.metrics import accuracy_score

# Predict on the training and validation splits with the fitted SVM
train_pred, test_pred = (svm_model.predict(split) for split in (trainX, testX))

print("====== Training dataset results ======")
print(f"Training accuracy: {accuracy_score(trainY, train_pred)}")
print("====== Validation dataset results ======")
print(f"Testing accuracy: {accuracy_score(testY, test_pred)}")

In [None]:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validation of the SVM over the full selected-feature dataset
scores_2 = cross_val_score(
    estimator=svm_model,
    X=dataset,
    y=train_labels.values.ravel(),
    cv=5,
)
print("%0.5f accuracy with a standard deviation of %0.5f" % (np.mean(scores_2), np.std(scores_2)))

# XGBoost Classifier

In [None]:
import xgboost as xgb
from sklearn.metrics import accuracy_score

import time  # imported here so this cell does not rely on an earlier cell's import

tr_split = 1300 # ~80% of labelled data

# Dataset Preparation (correlation filter enabled for this model)
dataset, evalSet = trimDataset(1250, True)
dataset = dataset.sort_values(by=['sample_id'])
evalSet = evalSet.sort_values(by=['sample_id'])

# Attach labels by sample_id so features and labels are shuffled together
dataset['label'] = train_labels['label']
dataset = dataset.sample(frac=1, random_state=42) # Seeded shuffle before breaking into sets

print(dataset.shape, '- Dataset shape prior to training')

trainSet = dataset.iloc[:tr_split] # first tr_split shuffled rows
testSet = dataset.iloc[tr_split:]  # remaining rows, used as the holdout

# Only XGBoost parameters are passed; the scikit-learn tree options
# (min_samples_split, max_features, criterion) are not XGBoost parameters
xgb_cl = xgb.XGBClassifier(n_estimators=1000, n_jobs=10)

# Split each set into feature columns and the label column by name
trainData = trainSet.drop(columns='label')
trainLabels = trainSet['label']

testData = testSet.drop(columns='label')
testLabels = testSet['label']

start = time.time()
xgb_cl.fit(trainData, trainLabels)
totalTime = time.time() - start
print(f'XGBoost Training time: {round(totalTime, ndigits=2)} s')

test_preds = xgb_cl.predict(testData)

# Accuracy on the holdout rows
a_s1 = accuracy_score(testLabels, test_preds)

print(a_s1)

In [None]:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validation of the XGBoost classifier on its training split
scores_2 = cross_val_score(
    estimator=xgb_cl,
    X=trainData,
    y=trainLabels,
    cv=5,
)
print("%0.5f accuracy with a standard deviation of %0.5f" % (np.mean(scores_2), np.std(scores_2)))