### Install and import packages

In [None]:
# Install the packages that the import cell below may be missing.
# NOTE(review): `!pip` runs whichever pip is first on the shell PATH, which may not be the
# kernel's environment; the `%pip install` magic targets the running kernel — consider switching.
!pip install opencv-python torch torchvision



In [3]:
# import cudf
import numpy as np
import pandas as pd
from PIL import Image
import cv2
import os
import tqdm
import xgboost as xgb
import time
from skimage.feature import hog
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, fbeta_score
import torch
from torch import nn, optim
from torchvision import models, transforms, datasets
from torch.utils.data import DataLoader, random_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
from scipy.stats import randint, uniform
from sklearn.model_selection import RandomizedSearchCV
from sklearn.utils.class_weight import compute_sample_weight
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.preprocessing import LabelEncoder
import warnings
import shutil
warnings.filterwarnings("ignore")

In [None]:
# GPU-only cell (RAPIDS cuML): skip it when running on CPU.
# NOTE(review): the two `from cuml...` imports below rebind `train_test_split` and
# `accuracy_score`, replacing the scikit-learn versions imported in the import cell;
# later cells then use whichever import ran last.
import cuml
print(cuml.__version__)
from cuml.model_selection import train_test_split
from cuml.metrics import accuracy_score
%load_ext cuml.accel

## Check Data Source

In [None]:
def count_images(datasource_path):
    """Count the image files found directly inside each subfolder of ``datasource_path``.

    Args:
        datasource_path: Directory whose immediate subfolders are scanned.

    Returns:
        dict mapping subfolder name -> number of regular files in that subfolder
        whose extension (case-insensitive) is one of the known image extensions.
        An empty dict is returned (after printing an error) when
        ``datasource_path`` is not a directory.
    """
    image_extensions = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp'}
    image_counts = {}

    if not os.path.isdir(datasource_path):
        print(f"Error: Path '{datasource_path}' is not a directory.")
        return image_counts

    def _is_image_file(folder, name):
        # Only regular files count; a directory named "x.jpg" is ignored.
        if not os.path.isfile(os.path.join(folder, name)):
            return False
        return os.path.splitext(name)[1].lower() in image_extensions

    for subfolder_name in os.listdir(datasource_path):
        subfolder_path = os.path.join(datasource_path, subfolder_name)
        if not os.path.isdir(subfolder_path):
            # Files sitting directly in the root are not attributed to any class.
            continue
        image_counts[subfolder_name] = sum(
            1
            for file_name in os.listdir(subfolder_path)
            if _is_image_file(subfolder_path, file_name)
        )
    return image_counts

# Count the images in every class folder of the raw data source and print a per-folder summary.
# NOTE(review): this relative path differs from the '../CS610_AML_Group_Project/datasource'
# path used by the resizing cell further down — confirm which working directory is expected.
image_dir = 'datasource'
print(f"Scanning: {image_dir}")
counts = count_images(image_dir)

# An empty dict means the path was not a directory or it contained no subfolders.
if counts:
    for folder, count in counts.items():
        print(f"{folder}: {count} images")
else:
    print("No images found or path is incorrect/empty.")

Scanning: datasource
Error: Path 'datasource' is not a directory.
No images found or path is incorrect/empty.


## Image Processing

### Resizing

In [None]:
# resize images
def resize_image_in_folder(input_dir, output_dir, size=(224, 224), desc='resizing images'):
    """Resize every supported image in ``input_dir`` and save it under ``output_dir``.

    Files are matched by extension (case-insensitive) and written to
    ``output_dir`` under their original file name. Problems with individual
    files are printed and the file is skipped; they never abort the loop.

    Args:
        input_dir: Folder containing the source images (not scanned recursively).
        output_dir: Destination folder; created if it does not exist.
        size: Target size passed to ``cv2.resize`` as ``dsize``.
        desc: Currently unused; kept so existing callers that pass it keep working.

    Returns:
        None.
    """
    if not os.path.exists(input_dir):
        print(f"Input directory {input_dir} does not exist. Please check the path.")
        return

    os.makedirs(output_dir, exist_ok=True)
    supported_formats = ('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff', '.webp')
    for filename in os.listdir(input_dir):
        if not filename.lower().endswith(supported_formats):
            continue

        img_input_path = os.path.join(input_dir, filename)
        img_output_path = os.path.join(output_dir, filename)
        try:
            # IMREAD_UNCHANGED keeps the alpha channel and bit depth of the source file.
            img = cv2.imread(img_input_path, cv2.IMREAD_UNCHANGED)

            if img is None:
                print(f"Error loading {img_input_path}")
                continue
            resized_img = cv2.resize(img, size, interpolation=cv2.INTER_LANCZOS4)

            # JPEG cannot store an alpha channel. Check ndim first: for a 2-D
            # (grayscale) image shape[-1] is the width, not the channel count.
            has_alpha = resized_img.ndim == 3 and resized_img.shape[-1] == 4
            if has_alpha and img_output_path.lower().endswith(('.jpg', '.jpeg')):
                resized_img = cv2.cvtColor(resized_img, cv2.COLOR_BGRA2BGR)

            # cv2.imwrite reports failure through its return value, so check it.
            if not cv2.imwrite(img_output_path, resized_img):
                print(f"Error writing {img_output_path}")
        except Exception as e:
            print(f"Error processing {img_input_path}: {e}")

In [None]:
# process all folders
def batch_resize_images(base_input_dir, base_output_dir, size=(128, 128)):
    """Resize the images of every subfolder of ``base_input_dir``.

    Each subfolder is processed by ``resize_image_in_folder`` and written to a
    subfolder of the same name under ``base_output_dir``. Entries that are not
    directories are reported and skipped.

    Args:
        base_input_dir: Root folder holding one subfolder per class.
        base_output_dir: Root folder for the resized copies; created if missing.
        size: Target size forwarded to ``resize_image_in_folder``.

    Returns:
        None.
    """
    if not os.path.exists(base_input_dir):
        print(f"Base directory {base_input_dir} does not exist. Please check the path.")
        return

    # Make sure the destination root exists before any subfolder is processed.
    os.makedirs(base_output_dir, exist_ok=True)

    for entry in tqdm.tqdm(os.listdir(base_input_dir)):
        source_subfolder = os.path.join(base_input_dir, entry)
        if not os.path.isdir(source_subfolder):
            print(f"Skipping {source_subfolder} as it is not a directory.")
            continue
        target_subfolder = os.path.join(base_output_dir, entry)
        resize_image_in_folder(source_subfolder, target_subfolder, size=size)

    print("Batch resizing completed.")

In [None]:
# Resize every class folder of the raw data source to 128x128 and write the
# results to a parallel folder tree under `resized_images`.
input_dir = '../CS610_AML_Group_Project/datasource'
output_dir = '../CS610_AML_Group_Project/resized_images'
batch_resize_images(input_dir, output_dir, size=(128, 128))

100%|██████████| 50/50 [01:47<00:00,  2.15s/it]

Batch resizing completed.





### Gray Scaling

In [None]:
def grayscale_image_in_folder(input_dir, output_dir):
    """Convert every supported image in ``input_dir`` to grayscale.

    Images are matched by extension (case-insensitive), loaded with
    ``cv2.imread``, converted with ``COLOR_BGR2GRAY`` and written to
    ``output_dir`` under their original file name. Per-file failures are
    printed and the file is skipped.

    Args:
        input_dir: Folder containing the source images (not scanned recursively).
        output_dir: Destination folder; created if it does not exist.

    Returns:
        None.
    """
    if not os.path.exists(input_dir):
        print(f"Input directory {input_dir} does not exist. Please check the path.")
        return

    os.makedirs(output_dir, exist_ok=True)
    supported_formats = ('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff', '.webp')

    # Keep only the entries whose extension marks them as images.
    image_names = [
        name for name in os.listdir(input_dir)
        if name.lower().endswith(supported_formats)
    ]

    for filename in image_names:
        img_input_path = os.path.join(input_dir, filename)
        img_output_path = os.path.join(output_dir, filename)
        try:
            img = cv2.imread(img_input_path)
            if img is None:
                print(f"Error loading {img_input_path}")
                continue
            # Collapse the BGR channels into a single luminance channel and save.
            cv2.imwrite(img_output_path, cv2.cvtColor(img, cv2.COLOR_BGR2GRAY))
        except Exception as e:
            print(f"Error processing {img_input_path}: {e}")

In [None]:
def batch_grayscale_images(base_input_dir, base_output_dir):
    """Convert the images of every subfolder of ``base_input_dir`` to grayscale.

    Each subfolder is processed by ``grayscale_image_in_folder`` and written to
    a subfolder of the same name under ``base_output_dir``. Entries that are
    not directories are reported and skipped.

    Args:
        base_input_dir: Root folder holding one subfolder per class.
        base_output_dir: Root folder for the grayscale copies; created if missing.

    Returns:
        None.
    """
    if not os.path.exists(base_input_dir):
        print(f"Base directory {base_input_dir} does not exist. Please check the path.")
        return

    os.makedirs(base_output_dir, exist_ok=True)

    for entry in tqdm.tqdm(os.listdir(base_input_dir)):
        source_subfolder = os.path.join(base_input_dir, entry)
        if not os.path.isdir(source_subfolder):
            print(f"Skipping {source_subfolder} as it is not a directory.")
            continue
        target_subfolder = os.path.join(base_output_dir, entry)
        grayscale_image_in_folder(source_subfolder, target_subfolder)

    print("Batch grayscale completed.")

In [None]:
# Convert the resized images to grayscale, mirroring the class-folder layout
# under `grayscale_images`.
base_input_dir = '../CS610_AML_Group_Project/resized_images'
base_output_dir = '../CS610_AML_Group_Project/grayscale_images'
batch_grayscale_images(base_input_dir, base_output_dir)

100%|██████████| 50/50 [01:29<00:00,  1.79s/it]

Batch grayscale completed.





In [None]:
def image_to_df(data_dir):
    """
    Create a DataFrame with image paths and their corresponding class labels.

    ``data_dir`` is expected to contain one subfolder per class; the subfolder
    name becomes the class label. Only files ending in .jpg/.jpeg/.png
    (case-insensitive) are listed, and entries of ``data_dir`` that are not
    directories are ignored.

    Args:
        data_dir: Root directory holding one subfolder per class.

    Returns:
        pd.DataFrame with the columns ``image_path`` and ``class``. The columns
        are present even when no image is found, and rows are ordered by class
        name and then file name.
    """
    columns = ['image_path', 'class']
    image_data = []

    # Sort both listings so the row order (and therefore any seeded split made
    # from it) does not depend on the filesystem's listing order.
    for class_name in sorted(os.listdir(data_dir)):
        class_path = os.path.join(data_dir, class_name)

        # Skip if not a directory
        if not os.path.isdir(class_path):
            continue

        # Get all image files in the class directory
        for filename in sorted(os.listdir(class_path)):
            if filename.lower().endswith(('.jpg', '.jpeg', '.png')):
                image_data.append({
                    'image_path': os.path.join(class_path, filename),
                    'class': class_name
                })

    # Passing `columns` keeps the schema stable for an empty result, so callers
    # can still index df['class'] instead of hitting a KeyError.
    return pd.DataFrame(image_data, columns=columns)

def _copy_split_files(split_df, split_dir):
    """Copy each image listed in ``split_df`` into ``split_dir/<class>/`` under its original file name."""
    for _, row in tqdm.tqdm(split_df.iterrows(), total=len(split_df)):
        class_dir = os.path.join(split_dir, row['class'])
        os.makedirs(class_dir, exist_ok=True)

        filename = os.path.basename(row['image_path'])
        shutil.copy2(row['image_path'], os.path.join(class_dir, filename))


def split_dataset(data_dir, output_dir, test_size=0.2, val_size=0.1, random_state=42):
    """
    Split the dataset into stratified training and test sets and copy the files.

    The images found by ``image_to_df`` are split with ``train_test_split``
    (stratified on the encoded class), copied to ``output_dir/train/<class>/``
    and ``output_dir/test/<class>/``, and described by three CSV files written
    to ``output_dir``: ``train_split.csv``, ``test_split.csv`` and
    ``split_info.csv``.

    Args:
        data_dir: Directory containing the original dataset (one subfolder per class)
        output_dir: Directory to save the split datasets
        test_size: Proportion of data for test set (default: 0.2)
        val_size: Currently unused: no validation set is created. The parameter
            is kept so existing calls that pass it keep working (default: 0.1)
        random_state: Random seed for reproducibility

    Returns:
        Tuple ``(train_df, test_df, label_encoder)``: the two split DataFrames
        (columns ``image_path``, ``class``, ``encoded_class``) and the fitted
        ``LabelEncoder``.

    Raises:
        ValueError: If no images are found under ``data_dir``.
    """

    print("Creating dataset DataFrame...")
    df = image_to_df(data_dir)

    # Fail early with a clear message instead of a KeyError / split error later on.
    if df.empty:
        raise ValueError(f"No images found under '{data_dir}'; nothing to split.")

    print(f"Total images found: {len(df)}")
    print(f"Number of classes: {df['class'].nunique()}")
    print("\nClass distribution:")
    print(df['class'].value_counts())

    # Encode class labels
    label_encoder = LabelEncoder()
    df['encoded_class'] = label_encoder.fit_transform(df['class'])

    # Display class mapping
    print("\nClass to encoded label mapping:")
    for i, class_name in enumerate(label_encoder.classes_):
        print(f"{class_name}: {i}")

    # Separate the test set; stratify so every class keeps the same train/test ratio
    train_df, test_df = train_test_split(
        df,
        test_size=test_size,
        random_state=random_state,
        stratify=df['encoded_class']
    )

    # Create output directories
    train_dir = os.path.join(output_dir, 'train')
    test_dir = os.path.join(output_dir, 'test')

    for dir_path in [train_dir, test_dir]:
        os.makedirs(dir_path, exist_ok=True)

    # Copy files to respective directories
    print("\nCopying files to split directories...")

    print("Copying training files...")
    _copy_split_files(train_df, train_dir)

    print("Copying test files...")
    _copy_split_files(test_df, test_dir)

    # Summary of the split, including the integer code -> class name mapping
    split_info = {
        'total_images': len(df),
        'train_images': len(train_df),
        'test_images': len(test_df),
        'num_classes': len(label_encoder.classes_),
        'class_mapping': dict(zip(range(len(label_encoder.classes_)), label_encoder.classes_))
    }

    # Save DataFrames
    train_df.to_csv(os.path.join(output_dir, 'train_split.csv'), index=False)
    test_df.to_csv(os.path.join(output_dir, 'test_split.csv'), index=False)

    # Save split information
    split_info_df = pd.DataFrame([split_info])
    split_info_df.to_csv(os.path.join(output_dir, 'split_info.csv'), index=False)

    # Print summary
    print("\n" + "="*50)
    print("DATASET SPLIT SUMMARY")
    print("="*50)
    print(f"Total images: {len(df)}")
    print(f"Training set: {len(train_df)} images ({len(train_df)/len(df)*100:.1f}%)")
    print(f"Test set: {len(test_df)} images ({len(test_df)/len(df)*100:.1f}%)")
    print(f"Number of classes: {len(label_encoder.classes_)}")
    print(f"\nSplit datasets saved to: {output_dir}")

    return train_df, test_df, label_encoder

In [None]:
# Split the grayscale images into train/test folders under `split_images`.
# The call is the last expression of the cell, so the returned
# (train_df, test_df, label_encoder) tuple is echoed as the cell output.
data_dir = '../CS610_AML_Group_Project/grayscale_images'
out_dir = '../CS610_AML_Group_Project/split_images'
split_dataset(data_dir, out_dir)

Creating dataset DataFrame...
Total images found: 6480
Number of classes: 50

Class distribution:
class
adidas_forum_high                      150
nike_air_jordan_4                      150
nike_air_max_90                        150
new_balance_992                        150
new_balance_574                        150
new_balance_550                        150
adidas_ultraboost                      150
nike_cortez                            150
converse_one_star                      150
nike_dunk_high                         150
nike_air_vapormax_flyknit              149
nike_air_max_270                       149
vans_sk8-hi                            149
adidas_gazelle                         149
converse_chuck_70_low                  148
reebok_club_c_85                       148
vans_authentic                         148
yeezy_boost_350_v2                     148
nike_air_force_1_mid                   148
puma_suede_classic                     148
salomon_xt-6                        

100%|██████████| 5184/5184 [00:03<00:00, 1614.43it/s]


Copying test files...


100%|██████████| 1296/1296 [00:00<00:00, 1657.68it/s]


DATASET SPLIT SUMMARY
Total images: 6480
Training set: 5184 images (80.0%)
Test set: 1296 images (20.0%)
Number of classes: 50

Split datasets saved to: ../CS610_AML_Group_Project/split_images





(                                             image_path  \
 6177  ../CS610_AML_Group_Project/grayscale_images\ye...   
 288   ../CS610_AML_Group_Project/grayscale_images\ad...   
 6340  ../CS610_AML_Group_Project/grayscale_images\ye...   
 3096  ../CS610_AML_Group_Project/grayscale_images\ni...   
 3640  ../CS610_AML_Group_Project/grayscale_images\ni...   
 ...                                                 ...   
 2769  ../CS610_AML_Group_Project/grayscale_images\ni...   
 5597  ../CS610_AML_Group_Project/grayscale_images\va...   
 5385  ../CS610_AML_Group_Project/grayscale_images\re...   
 4353  ../CS610_AML_Group_Project/grayscale_images\ni...   
 1725  ../CS610_AML_Group_Project/grayscale_images\co...   
 
                           class  encoded_class  
 6177      yeezy_700_wave_runner             47  
 288              adidas_gazelle              2  
 6340                yeezy_slide             49  
 3096      nike_air_jordan_1_low             24  
 3640           nike_air_max

## Data Augmentation

In [None]:
def image_augmentation(image, augmentation_type,angle_range=(-15, 15), brightness_range=(0.7, 1.3)):
    """Apply one augmentation to ``image`` and return the result.

    Args:
        image: Image array as used by OpenCV / NumPy.
        augmentation_type: ``'flip'``, ``'rotate'`` or ``'brightness'``; any
            other value returns ``image`` unchanged.
        angle_range: ``(min, max)`` rotation angle in degrees, both inclusive;
            an integer angle is drawn with ``np.random.randint``.
        brightness_range: ``(low, high)`` bounds for the multiplicative
            brightness factor drawn with ``np.random.uniform``.

    Returns:
        The augmented image. The brightness result is clipped to [0, 255] and
        cast to ``uint8``.
    """
    if augmentation_type == 'flip':
        # flipCode=1: horizontal (left-right) flip
        return cv2.flip(image, 1)

    if augmentation_type == 'rotate':
        angle = np.random.randint(angle_range[0], angle_range[1] + 1)
        height, width = image.shape[:2]
        # Rotate about the image centre at scale 1.0, keeping the original canvas
        # size; pixels outside the source are filled by reflecting the border.
        rotation = cv2.getRotationMatrix2D((width // 2, height // 2), angle, 1.0)
        return cv2.warpAffine(image, rotation, (width, height), borderMode=cv2.BORDER_REFLECT)

    if augmentation_type == 'brightness':
        brightness_factor = np.random.uniform(brightness_range[0], brightness_range[1])
        scaled = image * brightness_factor
        return np.clip(scaled, 0, 255).astype(np.uint8)

    # Unknown augmentation type: return the original image
    return image

In [None]:
# Create a directory to store the augmented images
aug_dir = 'augmented_train_images'
os.makedirs(aug_dir, exist_ok = True)
image_dir = '../CS610_AML_Group_Project/split_images/train'

# Seed NumPy's global RNG so the random rotation angles and brightness factors
# drawn by image_augmentation are the same on every run.
np.random.seed(42)

# (file-name suffix, augmentation type) for each copy written per source image;
# None means "save the image unchanged".
augmentation_plan = [
    ('original', None),
    ('flipped', 'flip'),
    ('rotated', 'rotate'),
    ('brightened', 'brightness'),
]

all_images_paths = []
all_images_labels = []

# Sorted listings keep the processing order (and so the seeded random draws) stable.
sneaker_names_list = sorted(os.listdir(image_dir))
print("====== Image Augmentation Starts ======")
for sneaker_name in tqdm.tqdm(sneaker_names_list, desc="Augmenting"):
    original_path = os.path.join(image_dir, sneaker_name)
    if not os.path.isdir(original_path):
        continue

    aug_path = os.path.join(aug_dir, sneaker_name)
    os.makedirs(aug_path, exist_ok = True)

    for image_name in sorted(os.listdir(original_path)):
        image_full_path = os.path.join(original_path, image_name)
        original_image = cv2.imread(image_full_path)
        if original_image is None:
            print(f'WARNING: CANNOT READ IMAGE {image_full_path}, SKIPPED!')
            continue

        base, ext = os.path.splitext(image_name)

        # Write the original plus one copy per augmentation, recording the
        # saved path and class label of each.
        for suffix, augmentation_type in augmentation_plan:
            if augmentation_type is None:
                augmented_image = original_image
            else:
                augmented_image = image_augmentation(original_image, augmentation_type = augmentation_type)

            saved_path = os.path.join(aug_path, f'{base}_{suffix}{ext}')
            cv2.imwrite(saved_path, augmented_image)
            all_images_paths.append(saved_path)
            all_images_labels.append(sneaker_name)

print("====== Image Augmentation Completed ======")

image_df_augmented = pd.DataFrame({
    'path': all_images_paths,
    'label': all_images_labels
})

print(f"We have now {len(image_df_augmented)} images for modelling")

image_df_augmented



Augmenting: 100%|██████████| 50/50 [01:16<00:00,  1.53s/it]

We have now 20736 images for modelling





Unnamed: 0,path,label
0,augmented_train_images\adidas_forum_high\0001_...,adidas_forum_high
1,augmented_train_images\adidas_forum_high\0001_...,adidas_forum_high
2,augmented_train_images\adidas_forum_high\0001_...,adidas_forum_high
3,augmented_train_images\adidas_forum_high\0001_...,adidas_forum_high
4,augmented_train_images\adidas_forum_high\0004_...,adidas_forum_high
...,...,...
20731,augmented_train_images\yeezy_slide\0144_bright...,yeezy_slide
20732,augmented_train_images\yeezy_slide\0145_origin...,yeezy_slide
20733,augmented_train_images\yeezy_slide\0145_flippe...,yeezy_slide
20734,augmented_train_images\yeezy_slide\0145_rotate...,yeezy_slide


### Pipeline Models using Feature Extraction Method 1 - By HOG

#### Feature Extraction by HOG

In [4]:
def extract_hog_features_recursive(input_dir, force_size = (128, 128), pixels_per_cell=(16, 16), cells_per_block=(2, 2)):
    """Extract HOG feature vectors from every supported image under ``input_dir``.

    The directory tree is walked recursively. Each image is read as grayscale,
    resized to ``force_size``, scaled to [0, 1] and passed to
    ``skimage.feature.hog``. Images that cannot be read are skipped silently;
    images whose HOG extraction raises are reported and skipped.

    Args:
        input_dir: Root directory to walk.
        force_size: Size every image is resized to before extraction, so that
            all feature vectors have the same length.
        pixels_per_cell: Forwarded to ``hog``.
        cells_per_block: Forwarded to ``hog``.

    Returns:
        Tuple ``(hogged, filenames)``: ``hogged`` is ``np.array`` of the
        collected feature vectors (one row per image), and ``filenames`` lists
        each image's path relative to ``input_dir``, in the same order.
    """
    features = []
    filenames = []
    supported_formats = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.webp')
    for root, dirs, files in tqdm.tqdm(os.walk(input_dir), desc="folders"):
        # leave=False clears each per-folder bar once it finishes, so the cell
        # output is not flooded with one completed bar per folder.
        for filename in tqdm.tqdm(files, leave=False):
            if not filename.lower().endswith(supported_formats):
                continue

            img_path = os.path.join(root, filename)
            img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
            if img is None:
                continue
            # force resized in case feature extraction failed
            img_resized = cv2.resize(img, force_size, interpolation=cv2.INTER_AREA)
            # pixel normalisation
            img_normalised = img_resized.astype(np.float32) / 255.0
            # Extract HOG features
            try:
                hog_feature = hog(img_normalised, pixels_per_cell=pixels_per_cell, cells_per_block=cells_per_block, feature_vector=True)
            except Exception as e:
                # f-string so the failing path (and the reason) is actually printed
                print(f"WARNING: {img_path} Failed with HOG feature extraction! ({e})")
                continue

            features.append(hog_feature)
            filenames.append(os.path.relpath(img_path, input_dir))

    hogged = np.array(features)
    return hogged, filenames


In [5]:
# train set: HOG features of the augmented training images
# NOTE(review): these are absolute '/content/...' paths, whereas the preprocessing
# cells above use relative paths — confirm they point at the same data.
input_dir = '/content/CS610_AML_Group_Project/augmented_train_images'
print('====== HOG Extraction Starts! ======')
hogged_train, filenames_train = extract_hog_features_recursive(input_dir)
print('====== HOG Extraction Completed! ======')
print(hogged_train.shape)  # (num_images, hog_feature_dim)

# test set: HOG features of the held-out test images (not augmented)
input_dir = '/content/CS610_AML_Group_Project/split_images/test'
print('====== HOG Extraction Starts! ======')
hogged_test, filenames_test = extract_hog_features_recursive(input_dir)
print('====== HOG Extraction Completed! ======')
print(hogged_test.shape)  # (num_images, hog_feature_dim)




0it [00:00, ?it/s]
0it [00:00, ?it/s]

  0%|          | 0/472 [00:00<?, ?it/s][A
  5%|▍         | 22/472 [00:00<00:02, 216.46it/s][A
 12%|█▎        | 59/472 [00:00<00:01, 302.83it/s][A
 20%|██        | 96/472 [00:00<00:01, 332.16it/s][A
 28%|██▊       | 133/472 [00:00<00:00, 343.91it/s][A
 36%|███▌      | 170/472 [00:00<00:00, 350.87it/s][A
 44%|████▍     | 207/472 [00:00<00:00, 356.44it/s][A
 51%|█████▏    | 243/472 [00:00<00:00, 355.70it/s][A
 60%|█████▉    | 281/472 [00:00<00:00, 360.83it/s][A
 67%|██████▋   | 318/472 [00:00<00:00, 363.14it/s][A
 75%|███████▌  | 355/472 [00:01<00:00, 363.30it/s][A
 83%|████████▎ | 392/472 [00:01<00:00, 364.07it/s][A
 91%|█████████ | 430/472 [00:01<00:00, 365.81it/s][A
100%|██████████| 472/472 [00:01<00:00, 353.22it/s]
2it [00:01,  1.49it/s]
  0%|          | 0/476 [00:00<?, ?it/s][A
  8%|▊         | 38/476 [00:00<00:01, 378.03it/s][A
 16%|█▌        | 76/476 [00:00<00:01, 379.04it/s][A
 24%|██▍       | 115/476 [00:00<00:00, 380.13it/s]

(20736, 1764)


0it [00:00, ?it/s]
0it [00:00, ?it/s]

100%|██████████| 30/30 [00:00<00:00, 378.22it/s]

100%|██████████| 30/30 [00:00<00:00, 389.12it/s]
3it [00:00, 17.91it/s]
100%|██████████| 30/30 [00:00<00:00, 404.65it/s]

100%|██████████| 23/23 [00:00<00:00, 401.69it/s]
5it [00:00, 16.09it/s]
100%|██████████| 30/30 [00:00<00:00, 396.59it/s]

100%|██████████| 30/30 [00:00<00:00, 351.20it/s]
7it [00:00, 14.08it/s]
100%|██████████| 21/21 [00:00<00:00, 348.02it/s]

100%|██████████| 23/23 [00:00<00:00, 387.15it/s]
9it [00:00, 14.68it/s]
100%|██████████| 23/23 [00:00<00:00, 387.10it/s]

100%|██████████| 23/23 [00:00<00:00, 359.91it/s]
11it [00:00, 14.92it/s]
100%|██████████| 23/23 [00:00<00:00, 384.45it/s]

100%|██████████| 23/23 [00:00<00:00, 372.12it/s]
13it [00:00, 15.12it/s]
100%|██████████| 30/30 [00:00<00:00, 392.62it/s]

100%|██████████| 29/29 [00:00<00:00, 378.18it/s]
15it [00:01, 14.18it/s]
100%|██████████| 23/23 [00:00<00:00, 334.50it/s]

100%|██████████| 23/23 [00:00<00:00, 384.98it/s]
17it 

(1296, 1764)





In [6]:
# Labelling: the class name is the first component of each path relative to the dataset root
train_class_names = np.array([rel_path.split(os.sep)[0] for rel_path in filenames_train])

# Integer-encode the class names; `uniques` holds the class name for each code
y_train, uniques = pd.factorize(train_class_names)
y_train = pd.Series(y_train, dtype=np.int32)

# Training features as a float32 DataFrame
x_train = pd.DataFrame(hogged_train.astype(np.float32), dtype=np.float32)

In [7]:
# Labelling: the class name is the first component of each path relative to the test root
test_class_names = [f.split(os.sep)[0] for f in filenames_test]

# Encode the test labels with the SAME codes as the training labels.
# pd.factorize numbers classes by order of first appearance, so factorizing the
# test set on its own can assign different codes to the same class whenever the
# two folders are walked in a different order. `uniques` is the class list
# produced by the training labelling cell and is deliberately not overwritten.
label_to_code = {label: code for code, label in enumerate(uniques)}
unseen_classes = sorted(set(test_class_names) - set(label_to_code))
if unseen_classes:
    raise ValueError(f"Test classes missing from the training set: {unseen_classes}")

# Test features and labels in the same container types as the training set
x_test = pd.DataFrame(hogged_test.astype(np.float32), dtype=np.float32)
y_test = pd.Series([label_to_code[name] for name in test_class_names], dtype=np.int32)

In [8]:
# Sanity check: sample count, number of distinct labels and label distribution per split
for split_name, split_labels in (("Train", y_train), ("Test", y_test)):
    print(f"Number of {split_name} Samples:", len(split_labels))
    print(f"Number of {split_name} Labels:", len(np.unique(split_labels)))
    counts = split_labels.value_counts()
    print(f"{split_name} Label Distribution:")
    print(counts)

Number of Train Samples: 20736
Number of Train Labels: 50
Train Label Distribution:
16    480
17    480
34    480
35    480
40    480
39    480
41    480
49    480
47    480
25    480
12    476
1     476
22    476
36    476
13    472
4     472
0     472
2     472
30    472
5     472
21    472
29    472
44    472
32    464
19    368
10    368
3     368
37    368
27    368
15    368
7     368
8     368
11    368
20    368
26    368
33    368
28    368
9     364
48    364
42    364
31    364
18    364
38    364
14    360
6     348
23    348
43    344
46    340
45    320
24    292
Name: count, dtype: int64
Number of Test Samples: 1296
Number of Test Labels: 50
Test Label Distribution:
0     30
1     30
2     30
4     30
5     30
12    30
17    30
16    30
47    30
44    30
49    30
35    30
36    30
39    30
41    30
25    30
22    30
21    30
40    30
34    30
30    29
13    29
29    29
32    29
18    23
10    23
3     23
14    23
11    23
15    23
7     23
8     23
9     23
20    23
42  

#### Feature Standardisation

In [9]:
print("\n====== Feature Standardisation Started! ======")
# Fit the scaler on the training features only, then apply the same
# transformation to both splits.
scaler = StandardScaler().fit(x_train)

x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

print("\n====== Feature Standardisation Completed! ======")
for split_name, scaled in (("Training", x_train_scaled), ("Testing", x_test_scaled)):
    print(f"The Shape for {split_name} Set after Feature Standardisation: {scaled.shape}")



The Shape for Training Set after Feature Standardisation: (20736, 1764)
The Shape for Testing Set after Feature Standardisation: (1296, 1764)


In [10]:
print("\n====== Dimensionality Reduction by PCA Started! ======")
# n_components=0.85 is a fraction: keep as many components as needed to reach
# that share of the variance. Fitted on the training set only.
pca = PCA(n_components=0.85, random_state=42).fit(x_train_scaled)

x_train_pca = pca.transform(x_train_scaled)
x_test_pca = pca.transform(x_test_scaled)

print("\n====== Dimensionality Reduction by PCA Completed! ======")
for split_name, reduced in (("Training", x_train_pca), ("Testing", x_test_pca)):
    print(f"The Shape for {split_name} Set after Dimensionality Reduction by PCA: {reduced.shape}")
print(f"The Number of Chosen PCA: {pca.n_components_}")
explained_variance = np.sum(pca.explained_variance_ratio_)
print(f"The Explained Variance Ratio: {explained_variance:.4f}")



The Shape for Training Set after Dimensionality Reduction by PCA: (20736, 241)
The Shape for Testing Set after Dimensionality Reduction by PCA: (1296, 241)
The Number of Chosen PCA: 241
The Explained Variance Ratio: 0.8505


#### 1) RandomForestClassifier - feature extraction by hog

In [None]:
# Set to False to re-run the hyper-parameter search instead of loading the saved model.
skip_train = True

if skip_train:
    # Import previous model
    print("Training skipped, importing model trained previously...")
    # The search below samples n_iter=10 candidates with cv=3, i.e. 30 fits in total.
    print("Fitted 3 folds for each of 10 candidates, totalling 30 fits")
    # NOTE(review): this relative path differs from the '../CS610_AML_Group_Project/model_bank'
    # directory used by the export cell — confirm both resolve to the same folder.
    # pickle.load executes code from the file: only load model files you created yourself.
    with open('model_bank/best_hog_rf_model.pkl', 'rb') as file:
        best_hog_rf = pickle.load(file)

else:
    # Start timing
    start_time = time.time()

    # Base model
    base_model = RandomForestClassifier(random_state=42)

    # Hyperparameters
    param_dist = {
        'n_estimators': [50, 100, 150, 200],
        'max_depth': [10, 20, 30, 40],
        'max_features': ['sqrt', 'log2', 0.5, 0.8, 1.0]
    }

    # Randomized search tuning
    random_search = RandomizedSearchCV(
        base_model,
        param_dist,
        n_iter=10,
        scoring='accuracy',
        cv=3,
        verbose=2,
        random_state=42,
        error_score='raise',
        n_jobs=-1
    )
    random_search.fit(x_train_pca, y_train)

    # End timing (seconds; the reporting cell converts to minutes)
    end_time = time.time()
    training_time = end_time - start_time

Training skipped, importing model trained previously...
Fitted 3 folds for each of 30 candidates, totalling 30 fits


In [None]:
if skip_train:
    # The model was loaded from disk, so no `random_search` object exists in this
    # session: report the results recorded from the earlier training run and keep
    # the `best_hog_rf` loaded by the previous cell.
    print("Training skipped, printing model trained previously...\n")
    print("Best Parameters: {'n_estimators': 150, 'max_features': 0.5, 'max_depth': 40}")
    print("Best Accuracy: 0.316551")
    training_time = 26.8
else:
    best_hog_rf = random_search.best_estimator_
    print("Best Parameters:", random_search.best_params_)
    print(f"Best Accuracy: {random_search.best_score_:.6f}")
    # Derive the minutes from the recorded timestamps instead of rescaling
    # `training_time` in place, so re-running this cell does not keep dividing by 60.
    training_time = round((end_time - start_time) / 60, 2)
print(f"Total Training Time: {training_time} minutes")

Training skipped, printing model trained previously...

Best Parameters: {'n_estimators': 150, 'max_features': 0.5, 'max_depth': 40}
Best Accuracy: 0.316551
Total Training Time: 26.8 minutes


In [None]:
# Predict on both splits with the tuned random forest
y_train_pred = best_hog_rf.predict(x_train_pca)
y_test_pred = best_hog_rf.predict(x_test_pca)

# One list per output column; add_metric appends to all of them in lock-step
metrics = {column: [] for column in ("Metric", "Average Type", "Train", "Test")}

def add_metric(name, avg_type, train_value, test_value):
    """Append one row (metric name, averaging mode, train value, test value) to ``metrics``."""
    row = (name, avg_type, train_value, test_value)
    for column, value in zip(("Metric", "Average Type", "Train", "Test"), row):
        metrics[column].append(value)

averaging_modes = ['macro', 'micro', 'weighted']
beta = 0.5 # mis-labelled sneakers are more costly than missing labels

# Training time
add_metric("Training time (minutes)", "N/A", training_time, "N/A")

# Accuracy
add_metric("Accuracy", "N/A",
           accuracy_score(y_train, y_train_pred),
           accuracy_score(y_test, y_test_pred))

# Precision, then Recall, each under every averaging mode
for metric_name, scorer in (("Precision", precision_score), ("Recall", recall_score)):
    for avg in averaging_modes:
        add_metric(metric_name, avg,
                   scorer(y_train, y_train_pred, average=avg),
                   scorer(y_test, y_test_pred, average=avg))

# F0.5-Score
for avg in averaging_modes:
    add_metric(f"F{beta}-Score", avg,
               fbeta_score(y_train, y_train_pred, beta=beta, average=avg),
               fbeta_score(y_test, y_test_pred, beta=beta, average=avg))

# Display metrics
hog_rf_metrics = pd.DataFrame(metrics)
pd.set_option('display.precision', 6)
display(hog_rf_metrics)

NameError: name 'best_hog_rf' is not defined

In [None]:
# Set to False to leave the model bank untouched.
export = True

if export:
    # Pickle the selected random forest into the shared model bank folder
    model_bank_dir = '../CS610_AML_Group_Project/model_bank'
    os.makedirs(model_bank_dir, exist_ok=True)
    model_filename_pickle = 'best_hog_rf_model.pkl'
    model_path = os.path.join(model_bank_dir, model_filename_pickle)
    with open(model_path, 'wb') as file:
        pickle.dump(best_hog_rf, file)
    print(f"Model Saved Successfully {model_path}")
else:
    print("Model not exported")

Model Saved Successfully ../CS610_AML_Group_Project/model_bank\best_hog_rf_model.pkl


#### 2) KNNClassifier - feature extraction by hog

In [None]:
# Set to True to load the previously pickled KNN model instead of re-running the search.
skip_train = False

if not skip_train:
    # Start timing
    start_time = time.time()

    # Base model
    base_model = KNeighborsClassifier()

    # Search space: n_neighbors is sampled from 1..29 (scipy's randint excludes the upper bound)
    param_dist = {
        'n_neighbors': randint(1, 30),
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan', 'cosine']
    }

    # Randomized search tuning: 10 sampled candidates x 3 folds
    search_settings = dict(
        n_iter=10,
        scoring='accuracy',
        cv=3,
        verbose=2,
        random_state=42,
        error_score='raise',
        n_jobs=-1,
    )
    random_search = RandomizedSearchCV(base_model, param_dist, **search_settings)
    random_search.fit(x_train_pca, y_train)

    # End timing (seconds; the reporting cell converts to minutes)
    end_time = time.time()
    training_time = end_time - start_time

else:
    # Import previous model
    print("Training skipped, importing model trained previously...")
    print("Fitted 3 folds for each of 10 candidates, totalling 30 fits")
    with open('model_bank/best_hog_knn_model.pkl', 'rb') as file:
        best_hog_knn = pickle.load(file)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [None]:
skip_train = False
if skip_train:
    # Results recorded from an earlier training run.
    # NOTE(review): the recorded output of this notebook shows a best accuracy of
    # 0.279900, not 0.675010 — confirm which run these hard-coded values come from.
    print("Training skipped, printing model trained previously...\n")
    print("Best Parameters: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'distance'}")
    print("Best Accuracy: 0.675010")
    training_time = 1.03
else:
    best_hog_knn = random_search.best_estimator_
    print("Best Parameters:", random_search.best_params_)
    print(f"Best Accuracy: {random_search.best_score_:.6f}")
    # Derive the minutes from the recorded timestamps instead of rescaling
    # `training_time` in place, so re-running this cell does not keep dividing by 60.
    training_time = round((end_time - start_time) / 60, 2)
print(f"Total Training Time: {training_time} minutes")

Best Parameters: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'distance'}
Best Accuracy: 0.279900
Total Training Time: 0.0 minutes


In [None]:
# Predict on both splits with the tuned KNN model
y_train_pred = best_hog_knn.predict(x_train_pca)
y_test_pred = best_hog_knn.predict(x_test_pca)

# One list per output column; add_metric appends to all of them in lock-step
metrics = {column: [] for column in ("Metric", "Average Type", "Train", "Test")}

def add_metric(name, avg_type, train_value, test_value):
    """Append one row (metric name, averaging mode, train value, test value) to ``metrics``."""
    row = (name, avg_type, train_value, test_value)
    for column, value in zip(("Metric", "Average Type", "Train", "Test"), row):
        metrics[column].append(value)

averaging_modes = ['macro', 'micro', 'weighted']
beta = 0.5 # mis-labelled sneakers are more costly than missing labels

# Training time
add_metric("Training time (minutes)", "N/A", training_time, "N/A")

# Accuracy
add_metric("Accuracy", "N/A",
           accuracy_score(y_train, y_train_pred),
           accuracy_score(y_test, y_test_pred))

# Precision, then Recall, each under every averaging mode
for metric_name, scorer in (("Precision", precision_score), ("Recall", recall_score)):
    for avg in averaging_modes:
        add_metric(metric_name, avg,
                   scorer(y_train, y_train_pred, average=avg),
                   scorer(y_test, y_test_pred, average=avg))

# F0.5-Score
for avg in averaging_modes:
    add_metric(f"F{beta}-Score", avg,
               fbeta_score(y_train, y_train_pred, beta=beta, average=avg),
               fbeta_score(y_test, y_test_pred, beta=beta, average=avg))

# Display metrics
hog_knn_metrics = pd.DataFrame(metrics)
pd.set_option('display.precision', 6)
display(hog_knn_metrics)

Unnamed: 0,Metric,Average Type,Train,Test
0,Training time (minutes),,0.0,
1,Accuracy,,0.998939,0.368827
2,Precision,macro,0.998861,0.379593
3,Precision,micro,0.998939,0.368827
4,Precision,weighted,0.998971,0.385752
5,Recall,macro,0.998801,0.367517
6,Recall,micro,0.998939,0.368827
7,Recall,weighted,0.998939,0.368827
8,F0.5-Score,macro,0.998841,0.371319
9,F0.5-Score,micro,0.998939,0.368827


In [None]:
export = True

if export:
    # Pickle the tuned estimator into the model bank, creating the folder on first use
    model_bank_dir = '../CS610_AML_Group_Project/model_bank'
    model_filename_pickle = 'best_hog_knn_model.pkl'
    os.makedirs(model_bank_dir, exist_ok=True)
    model_path = os.path.join(model_bank_dir, model_filename_pickle)
    with open(model_path, 'wb') as file:
        pickle.dump(best_hog_knn, file)
    print(f"Model Saved Successfully {model_path}")
else:
    print("Model not exported")

Model Saved Successfully ../CS610_AML_Group_Project/model_bank\best_hog_knn_model.pkl


#### 3) XGBoostClassifier - feature extraction by hog

In [11]:
# Tune an XGBoost classifier on the PCA-reduced HOG features, or reload a saved one.
skip_train = False

if skip_train:
    # Import previous model
    # NOTE(review): pickle.load executes arbitrary code; only load files produced by this notebook.
    print("Training skipped, importing model trained previously...")
    print("Fitted 3 folds for each of 10 candidates, totalling 30 fits")
    with open('model_bank/best_hog_xgb_model.pkl', 'rb') as file:
        best_hog_xgb = pickle.load(file)

else:

    # Start timing
    start_time = time.time()

    # Balance class weights
    sample_weights = compute_sample_weight(
        class_weight="balanced",
        y=y_train
    )

    # Base model
    # Use the GPU when one is visible and fall back to CPU otherwise, mirroring the
    # device selection in cnn_feature_extract; a hard-coded "cuda" fails on CPU-only runs.
    base_model = xgb.XGBClassifier(
        device="cuda" if torch.cuda.is_available() else "cpu",
        tree_method="hist",
        objective="multi:softprob",
        num_class=len(np.unique(y_train)),
        eval_metric=['merror','mlogloss'],
        random_state=42,
    )

    # Hyperparameters
    param_dist = {
        'n_estimators': randint(50, 500),
        'max_depth': randint(3, 12),
        'learning_rate': uniform(0.01, 0.19),  # range: 0.01 to 0.2
        'subsample': uniform(0.7, 0.3),        # range: 0.7 to 1.0
        'colsample_bytree': uniform(0.7, 0.3)  # range: 0.7 to 1.0
    }

    # Randomized search tuning
    # NOTE(review): with n_jobs=-1 the parallel CV fits share one GPU when device is "cuda";
    # consider n_jobs=1 if GPU memory errors appear.
    random_search = RandomizedSearchCV(
        base_model,
        param_dist,
        n_iter=10,
        scoring='accuracy',
        cv=3,
        verbose=2,
        random_state=42,
        error_score='raise',
        n_jobs=-1
    )
    random_search.fit(
        x_train_pca, y_train,
        sample_weight = sample_weights)

    # End timing (seconds; the next cell converts to minutes)
    end_time = time.time()
    training_time = end_time - start_time

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [None]:
# Report the outcome of the HOG + XGBoost search, or the recorded figures of an earlier run.
if skip_train:
    print("Training skipped, printing model trained previously...\n")
    print("Best Parameters: {'colsample_bytree': 0.7692681476866446, 'learning_rate': 0.05579483854494223, 'max_depth': 9, 'n_estimators': 477, 'subsample': 0.848553073033381}")
    print("Best Accuracy: 0.569396")
    training_time = 42.59
else:
    best_hog_xgb = random_search.best_estimator_
    print("Best Parameters:", random_search.best_params_)
    print(f"Best Accuracy: {random_search.best_score_:.6f}")
    # Derive minutes from the raw timestamps rather than from `training_time` itself,
    # so re-running this cell does not divide by 60 a second time.
    training_time = round((end_time - start_time) / 60, 2)
print(f"Total Training Time: {training_time} minutes")

Best Parameters: {'colsample_bytree': 0.7692681476866446, 'learning_rate': 0.05579483854494223, 'max_depth': 9, 'n_estimators': 477, 'subsample': 0.848553073033381}
Best Accuracy: 0.256028
Total Training Time: 48.2 minutes


In [None]:
# Predict
y_train_pred = best_hog_xgb.predict(x_train_pca)
y_test_pred = best_hog_xgb.predict(x_test_pca)

# Column store behind the metrics table; add_metric appends one row across all columns
metrics = {column: [] for column in ("Metric", "Average Type", "Train", "Test")}
def add_metric(name, avg_type, train_value, test_value):
    """Append one row (metric name, averaging mode, train value, test value) to `metrics`."""
    for column, value in zip(metrics, (name, avg_type, train_value, test_value)):
        metrics[column].append(value)

# Rows without an averaging mode: search time and plain accuracy
add_metric("Training time (minutes)", "N/A", training_time, "N/A")
add_metric("Accuracy", "N/A",
           accuracy_score(y_train, y_train_pred),
           accuracy_score(y_test, y_test_pred))

# Precision, Recall and F0.5-Score, each under every averaging mode (row order preserved)
beta = 0.5 # mis-labelled sneakers are more costly than missing labels
for _metric_name, _score_fn, _extra in (
    ("Precision", precision_score, {}),
    ("Recall", recall_score, {}),
    (f"F{beta}-Score", fbeta_score, {"beta": beta}),
):
    for avg in ['macro', 'micro', 'weighted']:
        add_metric(_metric_name, avg,
                   _score_fn(y_train, y_train_pred, average=avg, **_extra),
                   _score_fn(y_test, y_test_pred, average=avg, **_extra))

# Display metrics
pd.set_option('display.precision', 6)
hog_xgb_metrics = pd.DataFrame(metrics)
display(hog_xgb_metrics)

Unnamed: 0,Metric,Average Type,Train,Test
0,Training time (minutes),,48.2,
1,Accuracy,,0.999035,0.341821
2,Precision,macro,0.998918,0.353889
3,Precision,micro,0.999035,0.341821
4,Precision,weighted,0.999046,0.353453
5,Recall,macro,0.998934,0.337784
6,Recall,micro,0.999035,0.341821
7,Recall,weighted,0.999035,0.341821
8,F0.5-Score,macro,0.998919,0.345981
9,F0.5-Score,micro,0.999035,0.341821


In [None]:
export = True

if export:
    # Pickle the tuned estimator into the model bank, creating the folder on first use
    model_bank_dir = '../CS610_AML_Group_Project/model_bank'
    model_filename_pickle = 'best_hog_xgb_model.pkl'
    os.makedirs(model_bank_dir, exist_ok=True)
    model_path = os.path.join(model_bank_dir, model_filename_pickle)
    with open(model_path, 'wb') as file:
        pickle.dump(best_hog_xgb, file)
    print(f"Model Saved Successfully {model_path}")
else:
    print("Model not exported")

Model Saved Successfully ../CS610_AML_Group_Project/model_bank\best_hog_xgb_model.pkl


### Pipeline Models using Feature Extraction Method 2 - Using pretrained CNN

ResNet50 is used as the feature extractor because it ships with weights pre-trained on a large dataset (ImageNet) and is a popular backbone for computer vision tasks such as image classification.
References:
1) https://medium.com/@meetkalathiya1301/feature-extraction-using-pre-trained-models-for-image-classification-16e6ff43f268
2) https://stackoverflow.com/questions/62117707/extract-features-from-pretrained-resnet50-in-pytorch

In [15]:
# Build the image pipeline that feeds the CNN feature extractor
input_dir = '/content/CS610_AML_Group_Project/resized_images'
# Convert to tensors, then standardise each channel with the ImageNet mean and std
img_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
img_dataset = datasets.ImageFolder(root=input_dir, transform=img_transform)
data_loader = DataLoader(dataset=img_dataset, batch_size=32, num_workers=4)

In [16]:
#define function for CNN feature extraction
def cnn_feature_extract(cnn_feature_extractor, data_loader):
    """Run every batch of ``data_loader`` through a CNN and collect its outputs.

    The model is modified in place: it is switched to eval mode, its ``fc``
    attribute is replaced by ``torch.nn.Identity`` (so the forward pass returns
    whatever used to feed that layer), all parameters are frozen, and it is moved
    to CUDA when available, CPU otherwise.

    Args:
        cnn_feature_extractor: torch module exposing an ``fc`` attribute.
        data_loader: iterable yielding ``(images, labels)`` tensor batches.

    Returns:
        Tuple ``(model, features, labels)``: the modified model, a 2-D numpy array
        with one flattened feature row per sample, and a 1-D numpy array of labels,
        both in the order the loader produced them.

    Raises:
        ValueError: raised by numpy when ``data_loader`` yields no batches.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    #prepare cnn model to use for feature extraction
    cnn_feature_extractor.eval()
    cnn_feature_extractor.fc = torch.nn.Identity() #replace fully connected layer of pretrained cnn with Identity layer
    for para in cnn_feature_extractor.parameters():
        para.requires_grad = False #freeze weights
    #feature extraction
    features_list, labels_list = [], []
    cnn_feature_extractor.to(device)
    with torch.no_grad():
        for images, labels in data_loader:
            images = images.to(device)
            feature = cnn_feature_extractor(images)
            #flatten into (n_samples, n_features) for non-CNN models; reshape (unlike view)
            #also accepts non-contiguous outputs, copying only when it has to
            feature = feature.reshape(feature.size(0), -1)
            #convert tensors into numpy for fitting into non-CNN models and add into lists
            features_list.append(feature.cpu().numpy())
            labels_list.append(labels.numpy())

    return cnn_feature_extractor, np.vstack(features_list), np.hstack(labels_list)

In [17]:
# Build an ImageNet-pretrained ResNet50 and run the whole dataset through it.
# cnn_feature_extract hands back the same model object together with the
# features (X) and the labels (y) taken from the data_loader batches.
weights = models.ResNet50_Weights.IMAGENET1K_V2
resnet50_extractor, X, y = cnn_feature_extract(
    models.resnet50(weights=weights),
    data_loader,
)

Downloading: "https://download.pytorch.org/models/resnet50-11ad3fa6.pth" to /root/.cache/torch/hub/checkpoints/resnet50-11ad3fa6.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 215MB/s]


In [18]:
# Stratified 80/20 train/test split of the CNN features
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# Wrap the arrays as float32 feature frames and int32 label series
x_train, x_test = (pd.DataFrame(split, dtype=np.float32) for split in (x_train, x_test))
y_train, y_test = (pd.Series(split, dtype=np.int32) for split in (y_train, y_test))

# Same summary as the original flow: sample count, class count, per-class counts
print("Number of Samples:", len(y_train))
print("Number of Labels:", len(np.unique(y_train)))
print("Label Distribution:")
counts = y_train.value_counts()
print(counts)

Number of Samples: 5184
Number of Labels: 50
Label Distribution:
37    120
16    120
7     120
13    120
26    120
36    120
15    120
18    120
29    120
0     120
45    119
2     119
28    119
33    119
42    118
20    118
10    118
41    118
39    118
5     118
21    118
48    118
43    118
49    116
31     92
1      92
38     92
4      92
9      92
35     92
24     92
30     92
46     92
40     92
3      92
19     92
32     92
44     91
6      91
11     91
17     91
12     91
23     91
22     90
14     87
47     87
34     86
27     85
25     80
8      73
Name: count, dtype: int64


#### 1) RandomForestClassifier - feature extraction by CNN

In [None]:
# Tune a random forest on the CNN features, or reload a previously saved one.
skip_train = False

if skip_train:
    # Import previous model
    # NOTE(review): pickle.load executes arbitrary code; only load files produced by this notebook.
    print("Training skipped, importing model trained previously...")
    # The search below samples n_iter=10 candidates with cv=3, i.e. 30 fits in total
    print("Fitted 3 folds for each of 10 candidates, totalling 30 fits")
    with open('model_bank/best_cnn_rf_model.pkl', 'rb') as file:
        best_cnn_rf = pickle.load(file)

else:
    # Start timing
    start_time = time.time()

    # Base model
    base_model = RandomForestClassifier(random_state=42)

    # Hyperparameters
    param_dist = {
        'n_estimators': [50, 100, 150, 200],
        'max_depth': [10, 20, 30, 40],
        'max_features': ['sqrt', 'log2', 0.5, 0.8, 1.0]
    }

    # Randomized search tuning
    random_search = RandomizedSearchCV(
        base_model,
        param_dist,
        n_iter=10,
        scoring='accuracy',
        cv=3,
        verbose=2,
        random_state=42,
        error_score='raise',
        n_jobs=-1
    )
    random_search.fit(x_train, y_train)

    # End timing (seconds; the next cell converts to minutes)
    end_time = time.time()
    training_time = end_time - start_time

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [None]:
# Report the outcome of the CNN + random forest search, or the recorded figures of an earlier run.
if skip_train:
    print("Training skipped, printing model trained previously...\n")
    print("Best Parameters: {'n_estimators': 150, 'max_features': 'sqrt', 'max_depth': 20}")
    print("Best Accuracy: 0.719473")
    training_time = 87.56
else:
    best_cnn_rf = random_search.best_estimator_
    print("Best Parameters:", random_search.best_params_)
    print(f"Best Accuracy: {random_search.best_score_:.6f}")
    # Derive minutes from the raw timestamps rather than from `training_time` itself,
    # so re-running this cell does not divide by 60 a second time.
    training_time = round((end_time - start_time) / 60, 2)
print(f"Total Training Time: {training_time} minutes")

Best Parameters: {'n_estimators': 150, 'max_features': 0.5, 'max_depth': 40}
Best Accuracy: 0.316551
Total Training Time: 49.15 minutes


In [None]:
# Predict
y_train_pred = best_cnn_rf.predict(x_train)
y_test_pred = best_cnn_rf.predict(x_test)

# Column store behind the metrics table; add_metric appends one row across all columns
metrics = {column: [] for column in ("Metric", "Average Type", "Train", "Test")}
def add_metric(name, avg_type, train_value, test_value):
    """Append one row (metric name, averaging mode, train value, test value) to `metrics`."""
    for column, value in zip(metrics, (name, avg_type, train_value, test_value)):
        metrics[column].append(value)

# Rows without an averaging mode: search time and plain accuracy
add_metric("Training time (minutes)", "N/A", training_time, "N/A")
add_metric("Accuracy", "N/A",
           accuracy_score(y_train, y_train_pred),
           accuracy_score(y_test, y_test_pred))

# Precision, Recall and F0.5-Score, each under every averaging mode (row order preserved)
beta = 0.5 # mis-labelled sneakers are more costly than missing labels
for _metric_name, _score_fn, _extra in (
    ("Precision", precision_score, {}),
    ("Recall", recall_score, {}),
    (f"F{beta}-Score", fbeta_score, {"beta": beta}),
):
    for avg in ['macro', 'micro', 'weighted']:
        add_metric(_metric_name, avg,
                   _score_fn(y_train, y_train_pred, average=avg, **_extra),
                   _score_fn(y_test, y_test_pred, average=avg, **_extra))

# Display metrics
pd.set_option('display.precision', 6)
cnn_rf_metrics = pd.DataFrame(metrics)
display(cnn_rf_metrics)

Unnamed: 0,Metric,Average Type,Train,Test
0,Training time (minutes),,49.15,
1,Accuracy,,0.998071,0.368827
2,Precision,macro,0.997978,0.391356
3,Precision,micro,0.998071,0.368827
4,Precision,weighted,0.998166,0.38563
5,Recall,macro,0.997817,0.357844
6,Recall,micro,0.998071,0.368827
7,Recall,weighted,0.998071,0.368827
8,F0.5-Score,macro,0.997926,0.356892
9,F0.5-Score,micro,0.998071,0.368827


In [None]:
export = True

if export:
    # Pickle the tuned estimator into the model bank, creating the folder on first use
    model_bank_dir = '../CS610_AML_Group_Project/model_bank'
    model_filename_pickle = 'best_cnn_rf_model.pkl'
    os.makedirs(model_bank_dir, exist_ok=True)
    model_path = os.path.join(model_bank_dir, model_filename_pickle)
    with open(model_path, 'wb') as file:
        pickle.dump(best_cnn_rf, file)
    print(f"Model Saved Successfully {model_path}")
else:
    print("Model not exported")

Model Saved Successfully ../CS610_AML_Group_Project/model_bank\best_cnn_rf_model.pkl


#### 2) KNNClassifier - feature extraction by CNN

In [None]:
skip_train = False

if not skip_train:

    # Wall-clock timing of the whole hyperparameter search
    start_time = time.time()

    # Estimator to tune
    base_model = KNeighborsClassifier()

    # Search space sampled by the randomized search
    param_dist = dict(
        n_neighbors=randint(1, 30),
        weights=['uniform', 'distance'],
        metric=['euclidean', 'manhattan', 'cosine'],
    )

    # 10 sampled candidates x 3-fold CV on the CNN features
    random_search = RandomizedSearchCV(
        estimator=base_model,
        param_distributions=param_dist,
        n_iter=10,
        cv=3,
        scoring='accuracy',
        random_state=42,
        n_jobs=-1,
        verbose=2,
        error_score='raise',
    )
    random_search.fit(x_train, y_train)

    # Elapsed seconds; the next cell converts this to minutes
    end_time = time.time()
    training_time = end_time - start_time

else:
    # Reuse the estimator pickled by an earlier run instead of searching again
    print("Training skipped, importing model trained previously...")
    print("Fitted 3 folds for each of 10 candidates, totalling 30 fits")
    with open('model_bank/best_cnn_knn_model.pkl', 'rb') as file:
        best_cnn_knn = pickle.load(file)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [None]:
# Report the outcome of the CNN + KNN search, or the recorded figures of an earlier run.
if skip_train:
    print("Training skipped, printing model trained previously...\n")
    print("Best Parameters: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'distance'}")
    print("Best Accuracy: 0.896123")
    training_time = 3.96
else:
    best_cnn_knn = random_search.best_estimator_
    print("Best Parameters:", random_search.best_params_)
    print(f"Best Accuracy: {random_search.best_score_:.6f}")
    # Derive minutes from the raw timestamps rather than from `training_time` itself,
    # so re-running this cell does not divide by 60 a second time.
    training_time = round((end_time - start_time) / 60, 2)
print(f"Total Training Time: {training_time} minutes")

Best Parameters: {'metric': 'cosine', 'n_neighbors': 11, 'weights': 'distance'}
Best Accuracy: 0.302083
Total Training Time: 0.22 minutes


In [None]:
# Predict
y_train_pred = best_cnn_knn.predict(x_train)
y_test_pred = best_cnn_knn.predict(x_test)

# Column store behind the metrics table; add_metric appends one row across all columns
metrics = {column: [] for column in ("Metric", "Average Type", "Train", "Test")}
def add_metric(name, avg_type, train_value, test_value):
    """Append one row (metric name, averaging mode, train value, test value) to `metrics`."""
    for column, value in zip(metrics, (name, avg_type, train_value, test_value)):
        metrics[column].append(value)

# Rows without an averaging mode: search time and plain accuracy
add_metric("Training time (minutes)", "N/A", training_time, "N/A")
add_metric("Accuracy", "N/A",
           accuracy_score(y_train, y_train_pred),
           accuracy_score(y_test, y_test_pred))

# Precision, Recall and F0.5-Score, each under every averaging mode (row order preserved)
beta = 0.5 # mis-labelled sneakers are more costly than missing labels
for _metric_name, _score_fn, _extra in (
    ("Precision", precision_score, {}),
    ("Recall", recall_score, {}),
    (f"F{beta}-Score", fbeta_score, {"beta": beta}),
):
    for avg in ['macro', 'micro', 'weighted']:
        add_metric(_metric_name, avg,
                   _score_fn(y_train, y_train_pred, average=avg, **_extra),
                   _score_fn(y_test, y_test_pred, average=avg, **_extra))

# Display metrics
pd.set_option('display.precision', 6)
cnn_knn_metrics = pd.DataFrame(metrics)
display(cnn_knn_metrics)

Unnamed: 0,Metric,Average Type,Train,Test
0,Training time (minutes),,0.22,
1,Accuracy,,0.998071,0.367284
2,Precision,macro,0.997988,0.389494
3,Precision,micro,0.998071,0.367284
4,Precision,weighted,0.998176,0.393255
5,Recall,macro,0.997819,0.361929
6,Recall,micro,0.998071,0.367284
7,Recall,weighted,0.998071,0.367284
8,F0.5-Score,macro,0.997929,0.37711
9,F0.5-Score,micro,0.998071,0.367284


In [None]:
export = True

if export:
    # Pickle the tuned estimator into the model bank, creating the folder on first use
    model_bank_dir = '../CS610_AML_Group_Project/model_bank'
    model_filename_pickle = 'best_cnn_knn_model.pkl'
    os.makedirs(model_bank_dir, exist_ok=True)
    model_path = os.path.join(model_bank_dir, model_filename_pickle)
    with open(model_path, 'wb') as file:
        pickle.dump(best_cnn_knn, file)
    print(f"Model Saved Successfully {model_path}")
else:
    print("Model not exported")

Model Saved Successfully ../CS610_AML_Group_Project/model_bank\best_cnn_knn_model.pkl


#### 3) XGBoostClassifier - feature extraction by CNN

In [19]:
# Tune an XGBoost classifier on the CNN features, or reload a previously saved one.
skip_train = False

if skip_train:
    # Import previous model
    # NOTE(review): pickle.load executes arbitrary code; only load files produced by this notebook.
    # NOTE(review): this relative path differs from the absolute directory the export cell
    # writes to; confirm both resolve to the same folder.
    print("Training skipped, importing model trained previously...")
    # The search below samples n_iter=10 candidates with cv=3, i.e. 30 fits in total
    print("Fitted 3 folds for each of 10 candidates, totalling 30 fits")
    with open('model_bank/best_cnn_xgb_model.pkl', 'rb') as file:
        best_cnn_xgb = pickle.load(file)

else:

    # Start timing
    start_time = time.time()

    # Balance class weights
    sample_weights = compute_sample_weight(
        class_weight="balanced",
        y=y_train
    )

    # Base model
    # Use the GPU when one is visible and fall back to CPU otherwise, mirroring the
    # device selection in cnn_feature_extract; a hard-coded "cuda" fails on CPU-only runs.
    base_model = xgb.XGBClassifier(
        device="cuda" if torch.cuda.is_available() else "cpu",
        tree_method="hist",
        objective="multi:softprob",
        num_class=len(np.unique(y_train)),
        eval_metric=['merror','mlogloss'],
        random_state=42,
    )

    # Hyperparameters
    param_dist = {
        'n_estimators': randint(50, 500),
        'max_depth': randint(3, 12),
        'learning_rate': uniform(0.01, 0.19),  # range: 0.01 to 0.2
        'subsample': uniform(0.7, 0.3),        # range: 0.7 to 1.0
        'colsample_bytree': uniform(0.7, 0.3)  # range: 0.7 to 1.0
    }

    # Randomized search tuning
    # NOTE(review): with n_jobs=-1 the parallel CV fits share one GPU when device is "cuda";
    # consider n_jobs=1 if GPU memory errors appear.
    random_search = RandomizedSearchCV(
        base_model,
        param_dist,
        n_iter=10,
        scoring='accuracy',
        cv=3,
        verbose=2,
        random_state=42,
        error_score='raise',
        n_jobs=-1
    )
    random_search.fit(
        x_train, y_train,
        sample_weight = sample_weights)

    # End timing (seconds; the next cell converts to minutes)
    end_time = time.time()
    training_time = end_time - start_time

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [23]:
# Report the outcome of the CNN + XGBoost search, or the recorded figures of an earlier run.
if skip_train:
    print("Training skipped, printing model trained previously...\n")
    print("Best Parameters: {'colsample_bytree': 0.7195154778955838, 'learning_rate': 0.19028825207813332, 'max_depth': 4, 'n_estimators': 314, 'subsample': 0.7047898756660642}")
    print("Best Accuracy: 0.796345")
    training_time = 216.38
else:
    best_cnn_xgb = random_search.best_estimator_
    print("Best Parameters:", random_search.best_params_)
    print(f"Best Accuracy: {random_search.best_score_:.6f}")
    # Derive minutes from the raw timestamps rather than from `training_time` itself,
    # so re-running this cell does not divide by 60 a second time.
    training_time = round((end_time - start_time) / 60, 2)
print(f"Total Training Time: {training_time} minutes")

Best Parameters: {'colsample_bytree': np.float64(0.7692681476866446), 'learning_rate': np.float64(0.05579483854494223), 'max_depth': 9, 'n_estimators': 477, 'subsample': np.float64(0.848553073033381)}
Best Accuracy: 0.410687
Total Training Time: 0.79 minutes


In [24]:
# Predict
y_train_pred = best_cnn_xgb.predict(x_train)
y_test_pred = best_cnn_xgb.predict(x_test)

# Column store behind the metrics table; add_metric appends one row across all columns
metrics = {column: [] for column in ("Metric", "Average Type", "Train", "Test")}
def add_metric(name, avg_type, train_value, test_value):
    """Append one row (metric name, averaging mode, train value, test value) to `metrics`."""
    for column, value in zip(metrics, (name, avg_type, train_value, test_value)):
        metrics[column].append(value)

# Rows without an averaging mode: search time and plain accuracy
add_metric("Training time (minutes)", "N/A", training_time, "N/A")
add_metric("Accuracy", "N/A",
           accuracy_score(y_train, y_train_pred),
           accuracy_score(y_test, y_test_pred))

# Precision, Recall and F0.5-Score, each under every averaging mode (row order preserved)
beta = 0.5 # mis-labelled sneakers are more costly than missing labels
for _metric_name, _score_fn, _extra in (
    ("Precision", precision_score, {}),
    ("Recall", recall_score, {}),
    (f"F{beta}-Score", fbeta_score, {"beta": beta}),
):
    for avg in ['macro', 'micro', 'weighted']:
        add_metric(_metric_name, avg,
                   _score_fn(y_train, y_train_pred, average=avg, **_extra),
                   _score_fn(y_test, y_test_pred, average=avg, **_extra))

# Display metrics
pd.set_option('display.precision', 6)
cnn_xgb_metrics = pd.DataFrame(metrics)
display(cnn_xgb_metrics)

Unnamed: 0,Metric,Average Type,Train,Test
0,Training time (minutes),,0.79,
1,Accuracy,,0.998071,0.491512
2,Precision,macro,0.997954,0.493303
3,Precision,micro,0.998071,0.491512
4,Precision,weighted,0.998204,0.494231
5,Recall,macro,0.997872,0.486433
6,Recall,micro,0.998071,0.491512
7,Recall,weighted,0.998071,0.491512
8,F0.5-Score,macro,0.997909,0.48728
9,F0.5-Score,micro,0.998071,0.491512


In [26]:
export = True

if export:
    # Pickle the tuned estimator into the model bank, creating the folder on first use
    model_bank_dir = '/content/CS610_AML_Group_Project/model_bank'
    model_filename_pickle = 'best_cnn_xgb_model.pkl'
    os.makedirs(model_bank_dir, exist_ok=True)
    model_path = os.path.join(model_bank_dir, model_filename_pickle)
    with open(model_path, 'wb') as file:
        pickle.dump(best_cnn_xgb, file)
    print(f"Model Saved Successfully {model_path}")
else:
    print("Model not exported")

Model Saved Successfully /content/CS610_AML_Group_Project/model_bank/best_cnn_xgb_model.pkl
