### Install and import packages

In [1]:
from google.colab import drive
drive.mount('/content/drive')
%cd '/content/drive/MyDrive/cs610'

Mounted at /content/drive
/content/drive/MyDrive/cs610


In [2]:
# !git clone https://github.com/ivanckng/CS610_AML_Group_Project.git
# !git pull

In [3]:
import cudf
import numpy as np
import pandas as pd
from PIL import Image
import cv2
import os
import tqdm
from skimage.feature import hog
from cuml.model_selection import train_test_split
from cuml.metrics import accuracy_score

import torch
from torchvision import models, transforms, datasets
from torch.utils.data import DataLoader

In [4]:
import cuml
print(cuml.__version__)
%load_ext cuml.accel

25.02.01
[2025-06-18 01:21:21.200] [CUML] [info] cuML: Installed accelerator for sklearn.
[2025-06-18 01:21:37.108] [CUML] [info] cuML: Installed accelerator for umap.
[2025-06-18 01:21:37.216] [CUML] [info] cuML: Installed accelerator for hdbscan.
[2025-06-18 01:21:37.216] [CUML] [info] cuML: Successfully initialized accelerator.


### Image Processing

#### CNN Feature

In [5]:
#Process image data for feature extraction using CNN
base_output_dir = '/content/drive/MyDrive/cs610/CS610_AML_Group_Project/resized_images'
img_transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize(mean=[0.485,0.456,0.406],std=[0.229,0.224,0.225])]) #mean and std based on ImageNet - normalise image data closer to normal distribution
img_dataset = datasets.ImageFolder(base_output_dir, transform=img_transform)
data_loader = DataLoader(img_dataset, batch_size=32, num_workers=4)

input_dir = '/content/drive/MyDrive/cs610/CS610_AML_Group_Project/grayscale_images'

In [6]:
#define function for CNN feature extraction
def cnn_feature_extract(cnn_feature_extractor, data_loader):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    #prepare cnn model to use for feature extraction
    cnn_feature_extractor.eval()
    cnn_feature_extractor.fc = torch.nn.Identity() #replace fully connected layer of pretrained cnn with Identity layer
    for para in cnn_feature_extractor.parameters():
        para.requires_grad = False #freeze weights
    #feature extraction
    features_list, labels_list = [], []
    cnn_feature_extractor.to(device)
    with torch.no_grad():
        for images, labels in data_loader:
            images = images.to(device)
            feature = cnn_feature_extractor(images)
            feature = feature.view(feature.size(0),-1) #flatten into (n_samples, n_features) for non-CNN models
            #convert tensors into numpy for fitting into non-CNN models and add into lists
            features_list.append(feature.cpu().numpy())
            labels_list.append(labels.numpy())

    return cnn_feature_extractor, np.vstack(features_list), np.hstack(labels_list)

In [7]:
#initialise and extract features using CNN feature extractor
weights = models.ResNet50_Weights.IMAGENET1K_V2
resnet50_extractor = models.resnet50(weights=weights)
resnet50_extractor, X, y = cnn_feature_extract(resnet50_extractor, data_loader) #X = features, y =labels
#no need labelling as the numpy array is generated from the data_loader

Downloading: "https://download.pytorch.org/models/resnet50-11ad3fa6.pth" to /root/.cache/torch/hub/checkpoints/resnet50-11ad3fa6.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 194MB/s]


#### Data Splitting

In [8]:
#CNN training and test split
x_train, x_valid, y_train, y_valid = train_test_split(X,y, test_size=0.2, random_state=42, stratify=y)
x_train, x_test, y_train, y_test = train_test_split(x_train,y_train, test_size=0.2, random_state=42, stratify=y)
x_train = pd.DataFrame(x_train, dtype=np.float32)
y_train = pd.Series(y_train, dtype=np.int32)
x_valid = pd.DataFrame(x_valid, dtype=np.float32)
y_valid = pd.Series(y_valid, dtype=np.int32)
x_test = pd.DataFrame(x_test, dtype=np.float32)
y_test = pd.Series(y_test, dtype=np.int32)
#same as original flow
print("Number of Samples:", len(y_train))
print("Number of Labels:", len(np.unique(y_train)))
counts = y_train.value_counts()
print("Label Distribution:")
print(counts)

Number of Samples: 3811
Number of Labels: 50
Label Distribution:
10    99
7     99
33    99
16    98
0     98
5     97
37    97
36    97
18    97
42    96
43    96
21    96
29    95
2     95
15    94
48    94
41    94
26    94
39    93
49    93
13    92
20    92
28    91
45    90
27    72
47    70
14    69
25    68
34    65
12    62
22    62
40    62
3     61
17    61
44    61
31    61
35    60
32    59
46    59
8     59
1     58
30    57
19    55
38    53
6     53
24    50
11    48
23    48
9     47
4     45
Name: count, dtype: int64


#### XGBoost

In [9]:
import time
import xgboost as xgb
from scipy.stats import randint, uniform
from sklearn.model_selection import RandomizedSearchCV
from sklearn.utils.class_weight import compute_sample_weight

# Start timing
start_time = time.time()

# Balance class weights
sample_weights = compute_sample_weight(
    class_weight="balanced",
    y=y_train
)

# Base model
base_model = xgb.XGBClassifier(
    device="cuda",
    tree_method="hist",
    objective="multi:softprob",
    num_class=len(np.unique(y_train)),
    early_stopping_rounds=10,
    eval_metric=['merror','mlogloss'],
    random_state=42
)

# Hyperparameters
param_dist = {
    'n_estimators': randint(50, 500),
    'max_depth': randint(3, 12),
    'learning_rate': uniform(0.01, 0.19),  # range: 0.01 to 0.2
    'subsample': uniform(0.7, 0.3),        # range: 0.7 to 1.0
    'colsample_bytree': uniform(0.7, 0.3)  # range: 0.7 to 1.0
}


# Randomized search tuning
random_search = RandomizedSearchCV(
    base_model,
    param_dist,
    n_iter=20,
    scoring='accuracy',
    cv=5,
    verbose=2,
    random_state=42,
    error_score='raise'
)
random_search.fit(
    x_train, y_train,
    sample_weight=sample_weights,
    eval_set=[(x_valid, y_valid)],
    verbose=0)

# End timing
end_time = time.time()
training_time = end_time - start_time

Fitting 5 folds for each of 20 candidates, totalling 100 fits


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




[CV] END colsample_bytree=0.8123620356542087, learning_rate=0.19063571821788408, max_depth=10, n_estimators=238, subsample=0.879055047383946; total time=  25.0s
[CV] END colsample_bytree=0.8123620356542087, learning_rate=0.19063571821788408, max_depth=10, n_estimators=238, subsample=0.879055047383946; total time=  25.0s
[CV] END colsample_bytree=0.8123620356542087, learning_rate=0.19063571821788408, max_depth=10, n_estimators=238, subsample=0.879055047383946; total time=  27.7s
[CV] END colsample_bytree=0.8123620356542087, learning_rate=0.19063571821788408, max_depth=10, n_estimators=238, subsample=0.879055047383946; total time=  27.6s
[CV] END colsample_bytree=0.8123620356542087, learning_rate=0.19063571821788408, max_depth=10, n_estimators=238, subsample=0.879055047383946; total time=  24.7s
[CV] END colsample_bytree=0.8337498258560773, learning_rate=0.028995234005420548, max_depth=10, n_estimators=422, subsample=0.8803345035229626; total time= 1.1min
[CV] END colsample_bytree=0.8337

In [10]:
# Best model
best_model = random_search.best_estimator_
print("Best Parameters:", random_search.best_params_)
print(f"Best Accuracy: {random_search.best_score_:.6f}", )
print(f"Total Training Time: {training_time/60:.2f} minutes")

Best Parameters: {'colsample_bytree': np.float64(0.7692681476866446), 'learning_rate': np.float64(0.05579483854494223), 'max_depth': 9, 'n_estimators': 477, 'subsample': np.float64(0.848553073033381)}
Best Accuracy: 0.480193
Total Training Time: 51.30 minutes


In [11]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, fbeta_score

beta = 0.5  # mis-labelled sneakers are more costly than missing labels

# Predictions
y_pred_train = best_model.predict(x_train)
y_pred_test = best_model.predict(x_test)

# --- Train Scores ---
print("TRAIN METRICS")
print("Accuracy:", accuracy_score(y_train, y_pred_train))
print("Precision (macro):", precision_score(y_train, y_pred_train, average='macro'))
print("Recall (macro):", recall_score(y_train, y_pred_train, average='macro'))
print(f"F{beta}-Score (macro):", fbeta_score(y_train, y_pred_train, beta=beta, average='macro'))

# --- Test Scores ---
print("\n TEST METRICS")
print("Accuracy:", accuracy_score(y_test, y_pred_test))
print("Precision (macro):", precision_score(y_test, y_pred_test, average='macro'))
print("Recall (macro):", recall_score(y_test, y_pred_test, average='macro'))
print(f"F{beta}-Score (macro):", fbeta_score(y_test, y_pred_test, beta=beta, average='macro'))

TRAIN METRICS
Accuracy: 0.9986880083967462
Precision (macro): 0.9980447662936142
Recall (macro): 0.998463719663305
F0.5-Score (macro): 0.9981037815524161

 TEST METRICS
Accuracy: 0.49789915966386555
Precision (macro): 0.49190364467781644
Recall (macro): 0.4849040774557971
F0.5-Score (macro): 0.4810487944253243
