### Install and import packages

In [None]:
import cudf
import numpy as np
import pandas as pd
from PIL import Image
import cv2
import os
import tqdm
from skimage.feature import hog
from cuml.model_selection import train_test_split
from cuml.metrics import accuracy_score

In [None]:
import cuml
print(cuml.__version__)

25.02.01


In [None]:
# from google.colab import drive
# drive.mount('/content/drive')
# os.chdir('/content/drive/MyDrive/cs610')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Image Processing

#### HOG

In [None]:
from skimage.feature import hog

def extract_hog_features_recursive(input_dir, pixels_per_cell=(16, 16), cells_per_block=(2, 2)):
    features = []
    filenames = []
    supported_formats = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.webp')
    for root, dirs, files in os.walk(input_dir):
        for filename in files:
            if filename.lower().endswith(supported_formats):
                img_path = os.path.join(root, filename)
                img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
                if img is None:
                    continue
                # Extract HOG features
                hog_feature = hog(img, pixels_per_cell=pixels_per_cell, cells_per_block=cells_per_block, feature_vector=True)
                features.append(hog_feature)
                rel_path = os.path.relpath(img_path, input_dir)
                filenames.append(rel_path)
    hogged = np.array(features)
    return hogged, filenames

# Example usage:
input_dir = 'grayscale_images'
hogged, filenames = extract_hog_features_recursive(input_dir)
print(hogged.shape)  # (num_images, hog_feature_dim)

#### Labelling

In [None]:
y = [f.split(os.sep)[0] for f in filenames]

#### Data Splitting

In [None]:
x = hogged.astype(np.float32)
y = np.array(y)
y_encoded, uniques = pd.factorize(y)

In [None]:
x_train, x_valid, y_encoded_train, y_encoded_valid = train_test_split(x, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)
x_train, x_test, y_encoded_train, y_encoded_test = train_test_split(x_train, y_encoded_train, test_size=0.2, random_state=42, stratify=y_encoded)

In [None]:
x_train_gpu = cudf.DataFrame(x_train, dtype=np.float32)
y_train_gpu = cudf.Series(y_encoded_train, dtype=np.int32)
x_valid_gpu = cudf.DataFrame(x_valid, dtype=np.float32)
y_valid_gpu = cudf.Series(y_encoded_valid, dtype=np.int32)
x_test_gpu = cudf.DataFrame(x_test, dtype=np.float32)
y_test_gpu = cudf.Series(y_encoded_test, dtype=np.int32)

x_train_cpu = pd.DataFrame(x_train, dtype=np.float32)
y_train_cpu = pd.Series(y_encoded_train, dtype=np.int32)
x_valid_cpu = pd.DataFrame(x_valid, dtype=np.float32)
y_valid_cpu = pd.Series(y_encoded_valid, dtype=np.int32)
x_test_cpu = pd.DataFrame(x_test, dtype=np.float32)
y_test_cpu = pd.Series(y_encoded_test, dtype=np.int32)

In [None]:
print("Number of Samples:", len(y_train_cpu))
print("Number of Labels:", len(np.unique(y_train_cpu)))

counts = y_train_cpu.value_counts()
print("Label Distribution:")
print(counts)

Number of Samples: 2974
Number of Labels: 40
Label Distribution:
37    101
35    101
13    101
17    100
24     97
8      97
14     96
22     96
28     96
30     96
11     95
2      95
18     94
29     94
4      94
21     92
9      89
32     71
26     69
12     66
33     65
31     64
20     64
5      63
7      62
34     61
1      61
16     60
36     60
0      58
27     57
15     56
19     55
10     54
23     51
38     51
39     51
25     50
3      48
6      43
Name: count, dtype: int64


#### XGBoost

In [None]:
%load_ext cuml.accel
import time
import xgboost as xgb
from scipy.stats import randint, uniform
from sklearn.model_selection import RandomizedSearchCV
from sklearn.utils.class_weight import compute_sample_weight

# Start timing
start_time = time.time()

# Balance class weights
sample_weights = compute_sample_weight(
    class_weight="balanced",
    y=y_train_cpu
)

# Base model
base_model = xgb.XGBClassifier(
    device="cuda",
    tree_method="hist",
    objective="multi:softprob",
    num_class=len(np.unique(y_train_cpu)),
    early_stopping_rounds=10,
    eval_metric=['merror','mlogloss'],
    random_state=42
)

# Hyperparameters
param_dist = {
    'n_estimators': randint(50, 500),
    'max_depth': randint(3, 12),
    'learning_rate': uniform(0.01, 0.19),  # range: 0.01 to 0.2
    'subsample': uniform(0.7, 0.3),        # range: 0.7 to 1.0
    'colsample_bytree': uniform(0.7, 0.3)  # range: 0.7 to 1.0
}


# Randomized search tuning
random_search = RandomizedSearchCV(
    base_model,
    param_dist,
    n_iter=50,
    scoring='accuracy',
    cv=5,
    verbose=2,
    random_state=42,
    error_score='raise'
)
random_search.fit(
    x_train_cpu, y_train_cpu,
    sample_weight=sample_weights,
    eval_set=[(x_valid_cpu, y_valid_cpu)],
    verbose=0)

# End timing
end_time = time.time()
training_time = end_time - start_time

The cuml.accel extension is already loaded. To reload it, use:
  %reload_ext cuml.accel
Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] END colsample_bytree=0.8123620356542087, learning_rate=0.19063571821788408, max_depth=10, n_estimators=238, subsample=0.879055047383946; total time=  31.7s
[CV] END colsample_bytree=0.8123620356542087, learning_rate=0.19063571821788408, max_depth=10, n_estimators=238, subsample=0.879055047383946; total time=  34.9s
[CV] END colsample_bytree=0.8123620356542087, learning_rate=0.19063571821788408, max_depth=10, n_estimators=238, subsample=0.879055047383946; total time=  30.3s
[CV] END colsample_bytree=0.8123620356542087, learning_rate=0.19063571821788408, max_depth=10, n_estimators=238, subsample=0.879055047383946; total time=  35.6s
[CV] END colsample_bytree=0.8123620356542087, learning_rate=0.19063571821788408, max_depth=10, n_estimators=238, subsample=0.879055047383946; total time=  34.4s
[CV] END colsample_bytree=0.8337498258560773,

In [2]:
# Best model
best_model = random_search.best_estimator_
print("Best Parameters:", random_search.best_params_)
print(f"Best Accuracy: {random_search.best_score_:.6f}", )
print(f"Total Training Time: {training_time/60:.2f} minutes")

Best Parameters: {'colsample_bytree': np.float64(0.9950269422684528), 'learning_rate': np.float64(0.08577664406446507), 'max_depth': 3, 'n_estimators': 250, 'subsample': np.float64(0.7962340194915207)}
Best Accuracy: 0.2938822397645927
Total Training Time: 193.18 minutes


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, fbeta_score

beta = 0.5  # mis-labelled sneakers are more costly than missing labels

# Predictions
y_pred_train = best_model.predict(x_train_cpu)
y_pred_test = best_model.predict(x_test_cpu)

# --- Train Scores ---
print("TRAIN METRICS")
print("Accuracy:", accuracy_score(y_train_cpu, y_pred_train))
print("Precision (macro):", precision_score(y_train_cpu, y_pred_train, average='macro'))
print("Recall (macro):", recall_score(y_train_cpu, y_pred_train, average='macro'))
print(f"F{beta}-Score (macro):", fbeta_score(y_train_cpu, y_pred_train, beta=beta, average='macro'))

# --- Test Scores ---
print("\n TEST METRICS")
print("Accuracy:", accuracy_score(y_test_cpu, y_pred_test))
print("Precision (macro):", precision_score(y_test_cpu, y_pred_test, average='macro'))
print("Recall (macro):", recall_score(y_test_cpu, y_pred_test, average='macro'))
print(f"F{beta}-Score (macro):", fbeta_score(y_test_cpu, y_pred_test, beta=beta, average='macro'))

TRAIN METRICS
Accuracy: 0.9996637525218561
Precision (macro): 0.9995192307692307
Recall (macro): 0.9995967741935484
F0.5-Score (macro): 0.9995322002674942

 TEST METRICS
Accuracy: 0.3041722745625841
Precision (macro): 0.3239029516593595
Recall (macro): 0.2903715446023294
F0.5-Score (macro): 0.30795705037248194
