In [3]:
import pandas as pd
import torch
import numpy as np

In [9]:
# torch tensors
accelaration = torch.load("data/acceleration.pt", weights_only=True)
gestures = torch.load("data/gestures.pt", weights_only=True)
users = torch.load("data/users.pt", weights_only=True)

# df
u_wave_gestures = pd.read_parquet("data/uWaveGestures.parquet", engine="pyarrow")

### Task 1

#### Preprocessing

In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import neighbors

from xgboost import XGBClassifier

random_state = 0

In [6]:
# drop NAs
u_wave_gestures = u_wave_gestures.dropna(axis=0, how="any")

user = u_wave_gestures["user"]

X = u_wave_gestures.drop(["gesture", "user"], axis=1)
y = u_wave_gestures["gesture"]

In [7]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Scale numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Scale MinMaxScaler
scaler_minmax = MinMaxScaler()
X_train_minmaxscaled = scaler_minmax.fit_transform(X_train)
X_test_minmaxscaled = scaler_minmax.transform(X_test)

# logistic regg acc
# MinMaxScaler = 0.7310267857142857
# StandardScaler = 0.8191964285714286

#### Logistic Regression

In [92]:
lr_clf = LogisticRegression(
    random_state=random_state,
    tol=0.0001,
    solver="lbfgs",  # [‘l2’, None]
    max_iter=10000,
)

lr_clf.fit(X_train_scaled, y_train)

# predicted labels for test set
lr_y_predict = lr_clf.predict(X_test_scaled)

print(classification_report(y_true=y_test, y_pred=lr_y_predict, digits=5))

              precision    recall  f1-score   support

           1    0.83019   0.89796   0.86275        98
           2    0.97222   0.92920   0.95023       113
           3    0.72414   0.73684   0.73043       114
           4    0.76068   0.74790   0.75424       119
           5    0.80357   0.84906   0.82569       106
           6    0.87255   0.79464   0.83178       112
           7    0.78512   0.81897   0.80169       116
           8    0.82456   0.79661   0.81034       118

    accuracy                        0.81920       896
   macro avg    0.82163   0.82140   0.82089       896
weighted avg    0.82095   0.81920   0.81947       896



#### Random Forest Classifier

In [97]:
rf_clf = RandomForestClassifier(
    random_state=random_state,
    n_estimators=100,
    criterion="gini",
    max_depth=None,
    bootstrap=True,
)

# random forest doens't require normalization

rf_clf.fit(X_train, y_train)

# predicted labels for test set
rf_y_predict = rf_clf.predict(X_test)

print(classification_report(y_true=y_test, y_pred=rf_y_predict, digits=5))

              precision    recall  f1-score   support

           1    0.92929   0.93878   0.93401        98
           2    0.93860   0.94690   0.94273       113
           3    0.84167   0.88596   0.86325       114
           4    0.88393   0.83193   0.85714       119
           5    0.79464   0.83962   0.81651       106
           6    0.83810   0.78571   0.81106       112
           7    0.88596   0.87069   0.87826       116
           8    0.88333   0.89831   0.89076       118

    accuracy                        0.87388       896
   macro avg    0.87444   0.87474   0.87422       896
weighted avg    0.87430   0.87388   0.87371       896



#### K-Nearest-Neigbours

In [101]:
knn = neighbors.KNeighborsClassifier(
    n_neighbors=5, weights="uniform", algorithm="auto", leaf_size=30, p=2, metric="minkowski"
)

knn.fit(X_train_scaled, y_train)

# predicted labels for test set
knn_y_predict = knn.predict(X_test_scaled)

print(classification_report(y_true=y_test, y_pred=knn_y_predict, digits=5))

              precision    recall  f1-score   support

           1    0.86916   0.94898   0.90732        98
           2    0.96552   0.99115   0.97817       113
           3    0.83193   0.86842   0.84979       114
           4    0.88073   0.80672   0.84211       119
           5    0.83333   0.84906   0.84112       106
           6    0.87736   0.83036   0.85321       112
           7    0.92105   0.90517   0.91304       116
           8    0.92308   0.91525   0.91915       118

    accuracy                        0.88839       896
   macro avg    0.88777   0.88939   0.88799       896
weighted avg    0.88872   0.88839   0.88798       896



#### XGBoost 

In [9]:
import xgboost as xgb
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Subtract 1 from y_train and y_test to ensure zero-based indexing
y_train_zero_indexed = y_train - 1
y_test_zero_indexed = y_test - 1

# Define XGBoost classifier
xgb_classifier = xgb.XGBClassifier()

In [110]:
# Define parameter grid for hyperparameter tuning
param_grid = {
    "clf__n_estimators": [100, 200, 300, 400, 500],
    "clf__max_depth": [3, 5, 7, 9, 11],
    "clf__learning_rate": [0.1, 0.01, 0.001],
}

# Create pipeline
pipeline = Pipeline([("clf", xgb_classifier)])

# Perform GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)

# Fit the model using zero-indexed y_train
grid_search.fit(X_train, y_train_zero_indexed)

# Best parameters and best score
print("Best parameters found:", grid_search.best_params_)
print("Best score found:", grid_search.best_score_)

# Evaluate on test set using zero-indexed y_test
test_score = grid_search.score(X_test, y_test_zero_indexed)
print("Test set score:", test_score)



Best parameters found: {'clf__learning_rate': 0.1, 'clf__max_depth': 5, 'clf__n_estimators': 400}
Best score found: 0.8874956172132489
Test set score: 0.8995535714285714


In [10]:
xgb_clf = xgb.XGBClassifier(n_estimators=400, max_depth=5, learning_rate=0.1)

xgb_clf.fit(X_train, y_train_zero_indexed)

xgb_y_predict = xgb_clf.predict(X_test)

print(classification_report(y_true=y_test_zero_indexed, y_pred=xgb_y_predict, digits=5))

              precision    recall  f1-score   support

           0    0.92233   0.96939   0.94527        98
           1    0.97345   0.97345   0.97345       113
           2    0.84800   0.92982   0.88703       114
           3    0.92661   0.84874   0.88596       119
           4    0.84545   0.87736   0.86111       106
           5    0.87850   0.83929   0.85845       112
           6    0.90351   0.88793   0.89565       116
           7    0.90435   0.88136   0.89270       118

    accuracy                        0.89955       896
   macro avg    0.90028   0.90092   0.89995       896
weighted avg    0.90051   0.89955   0.89938       896



---

In [131]:
# Save 5 test data
X_test.iloc[0:10].to_pickle("test_data/X_test_1.pkl")
X_test.iloc[10:20].to_pickle("test_data/X_test_2.pkl")
X_test.iloc[20:30].to_pickle("test_data/X_test_3.pkl")
X_test.iloc[30:40].to_pickle("test_data/X_test_4.pkl")
X_test.iloc[40:50].to_pickle("test_data/X_test_5.pkl")

In [11]:
# Save extend data
u_wave_gestures.iloc[0:10].to_pickle("extend_data/extend_1.pkl")