### Install and import packages

In [1]:
import cudf
import numpy as np
import pandas as pd
from PIL import Image
import cv2
import os
import tqdm
from skimage.feature import hog
from cuml.model_selection import train_test_split
from cuml.metrics import accuracy_score

In [2]:
import cuml
print(cuml.__version__)

25.02.01


In [3]:
from google.colab import drive
drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/cs610')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Image Processing

#### HOG

In [4]:
from skimage.feature import hog

def extract_hog_features_recursive(input_dir, pixels_per_cell=(16, 16), cells_per_block=(2, 2)):
    features = []
    filenames = []
    supported_formats = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.webp')
    for root, dirs, files in os.walk(input_dir):
        for filename in files:
            if filename.lower().endswith(supported_formats):
                img_path = os.path.join(root, filename)
                img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
                if img is None:
                    continue
                # Extract HOG features
                hog_feature = hog(img, pixels_per_cell=pixels_per_cell, cells_per_block=cells_per_block, feature_vector=True)
                features.append(hog_feature)
                rel_path = os.path.relpath(img_path, input_dir)
                filenames.append(rel_path)
    hogged = np.array(features)
    return hogged, filenames

# Example usage:
input_dir = 'grayscale_images'
hogged, filenames = extract_hog_features_recursive(input_dir)
print(hogged.shape)  # (num_images, hog_feature_dim)

(5953, 6084)


#### Labelling

In [5]:
y = [f.split(os.sep)[0] for f in filenames]

#### Data Splitting

In [6]:
x = hogged.astype(np.float32)
y = np.array(y)
y_encoded, uniques = pd.factorize(y)

In [7]:
x_train, x_valid, y_encoded_train, y_encoded_valid = train_test_split(x, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)
x_train, x_test, y_encoded_train, y_encoded_test = train_test_split(x_train, y_encoded_train, test_size=0.2, random_state=42, stratify=y_encoded)

In [8]:
x_train_gpu = cudf.DataFrame(x_train, dtype=np.float32)
y_train_gpu = cudf.Series(y_encoded_train, dtype=np.int32)
x_valid_gpu = cudf.DataFrame(x_valid, dtype=np.float32)
y_valid_gpu = cudf.Series(y_encoded_valid, dtype=np.int32)
x_test_gpu = cudf.DataFrame(x_test, dtype=np.float32)
y_test_gpu = cudf.Series(y_encoded_test, dtype=np.int32)

x_train_cpu = pd.DataFrame(x_train, dtype=np.float32)
y_train_cpu = pd.Series(y_encoded_train, dtype=np.int32)
x_valid_cpu = pd.DataFrame(x_valid, dtype=np.float32)
y_valid_cpu = pd.Series(y_encoded_valid, dtype=np.int32)
x_test_cpu = pd.DataFrame(x_test, dtype=np.float32)
y_test_cpu = pd.Series(y_encoded_test, dtype=np.int32)


In [9]:
print("Number of Samples:", len(y_train_cpu))
print("Number of Labels:", len(np.unique(y_train_cpu)))

counts = y_train_cpu.value_counts()
print("Label Distribution:")
print(counts)

Number of Samples: 3811
Number of Labels: 50
Label Distribution:
18    100
9      99
26     99
31     98
23     97
30     97
41     97
44     97
4      97
14     96
22     96
38     96
43     96
16     95
2      95
20     95
11     95
7      95
10     94
46     94
49     93
39     93
33     92
17     92
12     70
45     70
32     69
34     65
25     65
27     64
47     63
24     62
6      61
15     61
37     61
3      59
1      59
28     58
42     57
40     57
19     57
8      55
29     53
13     52
21     51
36     51
48     50
0      49
35     49
5      45
Name: count, dtype: int64


#### KNN

In [10]:
%load_ext cuml.accel
import time
import xgboost as xgb
from scipy.stats import randint, uniform
from sklearn.model_selection import RandomizedSearchCV
from sklearn.utils.class_weight import compute_sample_weight

# Start timing
start_time = time.time()

# Base model
base_model = cuml.neighbors.KNeighborsClassifier()

# Hyperparameters
param_dist = {
    'n_neighbors': randint(1, 30),
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'cosine']
}


# Randomized search tuning
random_search = RandomizedSearchCV(
    base_model,
    param_dist,
    n_iter=50,
    scoring='accuracy',
    cv=5,
    verbose=2,
    random_state=42,
    error_score='raise'
)
random_search.fit(x_train_cpu, y_train_cpu)

# End timing
end_time = time.time()
training_time = end_time - start_time

[2025-06-16 19:29:27.663] [CUML] [info] cuML: Installed accelerator for sklearn.
[2025-06-16 19:29:38.496] [CUML] [info] cuML: Installed accelerator for umap.
[2025-06-16 19:29:38.510] [CUML] [info] cuML: Installed accelerator for hdbscan.
[2025-06-16 19:29:38.510] [CUML] [info] cuML: Successfully initialized accelerator.
[2025-06-16 19:29:38.530] [CUML] [info] Unused keyword parameter: handle during CPU estimator initialization
[2025-06-16 19:29:38.530] [CUML] [info] Unused keyword parameter: verbose during CPU estimator initialization
[2025-06-16 19:29:38.530] [CUML] [info] Unused keyword parameter: output_type during CPU estimator initialization
[2025-06-16 19:29:38.533] [CUML] [info] Unused keyword parameter: leaf_size during cuML estimator initialization
[2025-06-16 19:29:38.533] [CUML] [info] Unused keyword parameter: n_jobs during cuML estimator initialization
Fitting 5 folds for each of 50 candidates, totalling 250 fits
[2025-06-16 19:29:38.540] [CUML] [info] Unused keyword par

In [11]:
# Best model
best_model = random_search.best_estimator_
print("Best Parameters:", random_search.best_params_)
print(f"Best Accuracy: {random_search.best_score_:.6f}", )
print(f"Total Training Time: {training_time/60:.2f} minutes")

Best Parameters: {'metric': 'manhattan', 'n_neighbors': 1, 'weights': 'distance'}
Best Accuracy: 0.271317
Total Training Time: 0.31 minutes


In [12]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, fbeta_score

beta = 0.5  # mis-labelled sneakers are more costly than missing labels

# Predictions
y_pred_train = best_model.predict(x_train_cpu)
y_pred_test = best_model.predict(x_test_cpu)

# --- Train Scores ---
print("TRAIN METRICS")
print("Accuracy:", accuracy_score(y_train_cpu, y_pred_train))
print("Precision (macro):", precision_score(y_train_cpu, y_pred_train, average='macro'))
print("Recall (macro):", recall_score(y_train_cpu, y_pred_train, average='macro'))
print(f"F{beta}-Score (macro):", fbeta_score(y_train_cpu, y_pred_train, beta=beta, average='macro'))

# --- Test Scores ---
print("\n TEST METRICS")
print("Accuracy:", accuracy_score(y_test_cpu, y_pred_test))
print("Precision (macro):", precision_score(y_test_cpu, y_pred_test, average='macro'))
print("Recall (macro):", recall_score(y_test_cpu, y_pred_test, average='macro'))
print(f"F{beta}-Score (macro):", fbeta_score(y_test_cpu, y_pred_test, beta=beta, average='macro'))

TRAIN METRICS
Accuracy: 0.9981632117554448
Precision (macro): 0.9975538024564198
Recall (macro): 0.9975573294089424
F0.5-Score (macro): 0.9975452255511602

 TEST METRICS
Accuracy: 0.27941176470588236
Precision (macro): 0.30356876286853507
Recall (macro): 0.2732286067098424
F0.5-Score (macro): 0.28696642446362525
