### Install and import packages

In [1]:
!pip install opencv-python torch torchvision



In [1]:
# import cudf
import numpy as np
import pandas as pd
from PIL import Image
import cv2
import os
import tqdm
import xgboost as xgb
import time
from skimage.feature import hog
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, fbeta_score
import torch
from torch import nn, optim
from torchvision import models, transforms, datasets
from torch.utils.data import DataLoader, random_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
from scipy.stats import randint, uniform
from sklearn.model_selection import RandomizedSearchCV
from sklearn.utils.class_weight import compute_sample_weight
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.preprocessing import LabelEncoder
import warnings
import shutil
warnings.filterwarnings("ignore")

In [None]:
# Running On CPU, Please skip this cell
import cuml
print(cuml.__version__)
from cuml.model_selection import train_test_split
from cuml.metrics import accuracy_score
%load_ext cuml.accel

## Check Data Source

In [3]:
def count_images(datasource_path):
    image_counts = {}
    image_extensions = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp'}

    if not os.path.isdir(datasource_path):
        print(f"Error: Path '{datasource_path}' is not a directory.")
        return image_counts

    for subfolder_name in os.listdir(datasource_path):
        subfolder_path = os.path.join(datasource_path, subfolder_name)

        if os.path.isdir(subfolder_path):
            count = 0
            for file_name in os.listdir(subfolder_path):
                file_path = os.path.join(subfolder_path, file_name)
                if os.path.isfile(file_path):
                    _, ext = os.path.splitext(file_name)
                    if ext.lower() in image_extensions:
                        count += 1
            image_counts[subfolder_name] = count
    return image_counts

image_dir = 'datasource'
print(f"Scanning: {image_dir}")
counts = count_images(image_dir)

if counts:
    for folder, count in counts.items():
        print(f"{folder}: {count} images")
else:
    print("No images found or path is incorrect/empty.")

Scanning: datasource
adidas_forum_high: 150 images
adidas_forum_low: 115 images
adidas_gazelle: 149 images
adidas_nmd_r1: 115 images
adidas_samba: 115 images
adidas_stan_smith: 147 images
adidas_superstar: 114 images
adidas_ultraboost: 150 images
asics_gel-lyte_iii: 91 images
converse_chuck_70_high: 115 images
converse_chuck_70_low: 148 images
converse_chuck_taylor_all-star_high: 114 images
converse_chuck_taylor_all-star_low: 114 images
converse_one_star: 150 images
new_balance_327: 108 images
new_balance_550: 150 images
new_balance_574: 150 images
new_balance_990: 113 images
new_balance_992: 150 images
nike_air_force_1_high: 115 images
nike_air_force_1_low: 147 images
nike_air_force_1_mid: 148 images
nike_air_jordan_11: 113 images
nike_air_jordan_1_high: 114 images
nike_air_jordan_1_low: 115 images
nike_air_jordan_3: 100 images
nike_air_jordan_4: 150 images
nike_air_max_1: 106 images
nike_air_max_270: 149 images
nike_air_max_90: 150 images
nike_air_max_95: 115 images
nike_air_max_97: 

## Image Processing

### Resizing

In [4]:
# resize images
def resize_image_in_folder(input_dir, output_dir, size=(224, 224), desc='resizing images'):
    if not os.path.exists(input_dir):
        print(f"Input directory {input_dir} does not exist. Please check the path.")
        return

    os.makedirs(output_dir, exist_ok=True)
    supported_formats = ('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff', '.webp')
    for filename in os.listdir(input_dir):
        if filename.lower().endswith(supported_formats):
            img_input_path = os.path.join(input_dir, filename)
            img_output_path = os.path.join(output_dir, filename)
            try:
                img = cv2.imread(img_input_path, cv2.IMREAD_UNCHANGED)

                if img is None:
                    print(f"Error loading {img_input_path}")
                    continue
                resized_img = cv2.resize(img, size, interpolation=cv2.INTER_LANCZOS4)

                if img_output_path.lower().endswith(('.jpg', '.jpeg')) and resized_img.shape[-1] == 4:
                    resized_img = cv2.cvtColor(resized_img, cv2.COLOR_BGRA2BGR)
                cv2.imwrite(img_output_path, resized_img)
            except Exception as e:
                print(f"Error processing {img_input_path}: {e}")

In [5]:
# process all folders
def batch_resize_images(base_input_dir, base_output_dir, size=(128, 128)):
    if not os.path.exists(base_input_dir):
        print(f"Base directory {base_input_dir} does not exist. Please check the path.")
        return

    os.makedirs(base_output_dir, exist_ok=True) # if output directory does not exist, create it.

    for folder in tqdm.tqdm(os.listdir(base_input_dir)):
        current_input_subfolder = os.path.join(base_input_dir, folder)
        current_output_subfolder = os.path.join(base_output_dir, folder)

        if os.path.isdir(current_input_subfolder):
            resize_image_in_folder(current_input_subfolder, current_output_subfolder, size=size)
        else:
            print(f"Skipping {current_input_subfolder} as it is not a directory.")

    print("Batch resizing completed.")

In [7]:
input_dir = '../CS610_AML_Group_Project/datasource'
output_dir = '../CS610_AML_Group_Project/resized_images'
batch_resize_images(input_dir, output_dir, size=(128, 128))

100%|██████████| 50/50 [01:47<00:00,  2.15s/it]

Batch resizing completed.





### Gray Scaling

In [8]:
def grayscale_image_in_folder(input_dir, output_dir):
    if not os.path.exists(input_dir):
        print(f"Input directory {input_dir} does not exist. Please check the path.")
        return

    os.makedirs(output_dir, exist_ok=True)
    supported_formats = ('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff', '.webp')
    for filename in os.listdir(input_dir):
        if filename.lower().endswith(supported_formats):
            img_input_path = os.path.join(input_dir, filename)
            img_output_path = os.path.join(output_dir, filename)
            try:
                img = cv2.imread(img_input_path)
                if img is None:
                    print(f"Error loading {img_input_path}")
                    continue
                # Convert to grayscale
                gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
                cv2.imwrite(img_output_path, gray_img)
            except Exception as e:
                print(f"Error processing {img_input_path}: {e}")

In [9]:
def batch_grayscale_images(base_input_dir, base_output_dir):
    if not os.path.exists(base_input_dir):
        print(f"Base directory {base_input_dir} does not exist. Please check the path.")
        return

    os.makedirs(base_output_dir, exist_ok=True)

    for folder in tqdm.tqdm(os.listdir(base_input_dir)):
        current_input_subfolder = os.path.join(base_input_dir, folder)
        current_output_subfolder = os.path.join(base_output_dir, folder)

        if os.path.isdir(current_input_subfolder):
            grayscale_image_in_folder(current_input_subfolder, current_output_subfolder)
        else:
            print(f"Skipping {current_input_subfolder} as it is not a directory.")

    print("Batch grayscale completed.")

In [10]:
base_input_dir = '../CS610_AML_Group_Project/resized_images'
base_output_dir = '../CS610_AML_Group_Project/grayscale_images'
batch_grayscale_images(base_input_dir, base_output_dir)

100%|██████████| 50/50 [01:29<00:00,  1.79s/it]

Batch grayscale completed.





In [34]:
def image_to_df(data_dir):
    """
    Create a DataFrame with image paths and their corresponding class labels
    """
    image_data = []
    
    # Walk through all subdirectories in the data directory
    for class_name in os.listdir(data_dir):
        class_path = os.path.join(data_dir, class_name)
        
        # Skip if not a directory
        if not os.path.isdir(class_path):
            continue
            
        # Get all image files in the class directory
        for filename in os.listdir(class_path):
            if filename.lower().endswith(('.jpg', '.jpeg', '.png')):
                image_path = os.path.join(class_path, filename)
                image_data.append({
                    'image_path': image_path,
                    'class': class_name
                })
    
    return pd.DataFrame(image_data)

def split_dataset(data_dir, output_dir, test_size=0.2, val_size=0.1, random_state=42):
    """
    Split the dataset into training, validation, and test sets
    
    Args:
        data_dir: Directory containing the original dataset
        output_dir: Directory to save the split datasets
        test_size: Proportion of data for test set (default: 0.2)
        val_size: Proportion of remaining data for validation set (default: 0.2)
        random_state: Random seed for reproducibility
    """
    
    print("Creating dataset DataFrame...")
    df = image_to_df(data_dir)
    
    print(f"Total images found: {len(df)}")
    print(f"Number of classes: {df['class'].nunique()}")
    print("\nClass distribution:")
    print(df['class'].value_counts())
    
    # Encode class labels
    label_encoder = LabelEncoder()
    df['encoded_class'] = label_encoder.fit_transform(df['class'])
    
    # Display class mapping
    print("\nClass to encoded label mapping:")
    for i, class_name in enumerate(label_encoder.classes_):
        print(f"{class_name}: {i}")
    
    # First split: separate test set
    train_df, test_df = train_test_split(
        df,
        test_size=test_size,
        random_state=random_state,
        stratify=df['encoded_class']
    )
    
    # Create output directories
    train_dir = os.path.join(output_dir, 'train')
    test_dir = os.path.join(output_dir, 'test')
    
    for dir_path in [train_dir, test_dir]:
        os.makedirs(dir_path, exist_ok=True)
    
    # Copy files to respective directories
    print("\nCopying files to split directories...")
    
    # Copy training files
    print("Copying training files...")
    for _, row in tqdm.tqdm(train_df.iterrows(), total=len(train_df)):
        class_dir = os.path.join(train_dir, row['class'])
        os.makedirs(class_dir, exist_ok=True)
        
        filename = os.path.basename(row['image_path'])
        dest_path = os.path.join(class_dir, filename)
        shutil.copy2(row['image_path'], dest_path)
    
    # Copy test files
    print("Copying test files...")
    for _, row in tqdm.tqdm(test_df.iterrows(), total=len(test_df)):
        class_dir = os.path.join(test_dir, row['class'])
        os.makedirs(class_dir, exist_ok=True)
        
        filename = os.path.basename(row['image_path'])
        dest_path = os.path.join(class_dir, filename)
        shutil.copy2(row['image_path'], dest_path)
    
    # Save split information
    split_info = {
        'total_images': len(df),
        'train_images': len(train_df),
        'test_images': len(test_df),
        'num_classes': len(label_encoder.classes_),
        'class_mapping': dict(zip(range(len(label_encoder.classes_)), label_encoder.classes_))
    }
    
    # Save DataFrames
    train_df.to_csv(os.path.join(output_dir, 'train_split.csv'), index=False)
    test_df.to_csv(os.path.join(output_dir, 'test_split.csv'), index=False)
    
    # Save split information
    split_info_df = pd.DataFrame([split_info])
    split_info_df.to_csv(os.path.join(output_dir, 'split_info.csv'), index=False)
    
    # Print summary
    print("\n" + "="*50)
    print("DATASET SPLIT SUMMARY")
    print("="*50)
    print(f"Total images: {len(df)}")
    print(f"Training set: {len(train_df)} images ({len(train_df)/len(df)*100:.1f}%)")
    print(f"Test set: {len(test_df)} images ({len(test_df)/len(df)*100:.1f}%)")
    print(f"Number of classes: {len(label_encoder.classes_)}")
    print(f"\nSplit datasets saved to: {output_dir}")
    
    return train_df, test_df, label_encoder

In [35]:
data_dir = '../CS610_AML_Group_Project/grayscale_images'
out_dir = '../CS610_AML_Group_Project/split_images'
split_dataset(data_dir, out_dir)

Creating dataset DataFrame...
Total images found: 6480
Number of classes: 50

Class distribution:
class
adidas_forum_high                      150
nike_air_jordan_4                      150
nike_air_max_90                        150
new_balance_992                        150
new_balance_574                        150
new_balance_550                        150
adidas_ultraboost                      150
nike_cortez                            150
converse_one_star                      150
nike_dunk_high                         150
nike_air_vapormax_flyknit              149
nike_air_max_270                       149
vans_sk8-hi                            149
adidas_gazelle                         149
converse_chuck_70_low                  148
reebok_club_c_85                       148
vans_authentic                         148
yeezy_boost_350_v2                     148
nike_air_force_1_mid                   148
puma_suede_classic                     148
salomon_xt-6                        

100%|██████████| 5184/5184 [00:03<00:00, 1614.43it/s]


Copying test files...


100%|██████████| 1296/1296 [00:00<00:00, 1657.68it/s]


DATASET SPLIT SUMMARY
Total images: 6480
Training set: 5184 images (80.0%)
Test set: 1296 images (20.0%)
Number of classes: 50

Split datasets saved to: ../CS610_AML_Group_Project/split_images





(                                             image_path  \
 6177  ../CS610_AML_Group_Project/grayscale_images\ye...   
 288   ../CS610_AML_Group_Project/grayscale_images\ad...   
 6340  ../CS610_AML_Group_Project/grayscale_images\ye...   
 3096  ../CS610_AML_Group_Project/grayscale_images\ni...   
 3640  ../CS610_AML_Group_Project/grayscale_images\ni...   
 ...                                                 ...   
 2769  ../CS610_AML_Group_Project/grayscale_images\ni...   
 5597  ../CS610_AML_Group_Project/grayscale_images\va...   
 5385  ../CS610_AML_Group_Project/grayscale_images\re...   
 4353  ../CS610_AML_Group_Project/grayscale_images\ni...   
 1725  ../CS610_AML_Group_Project/grayscale_images\co...   
 
                           class  encoded_class  
 6177      yeezy_700_wave_runner             47  
 288              adidas_gazelle              2  
 6340                yeezy_slide             49  
 3096      nike_air_jordan_1_low             24  
 3640           nike_air_max

## Data Augmentation

In [36]:
def image_augmentation(image, augmentation_type,angle_range=(-15, 15), brightness_range=(0.7, 1.3)):
    if augmentation_type == 'flip':
        return cv2.flip(image, 1)
    elif augmentation_type == 'rotate':
        angle = np.random.randint(angle_range[0], angle_range[1] + 1)
        (h, w) = image.shape[:2]
        center = (w // 2, h // 2)
        M = cv2.getRotationMatrix2D(center, angle, 1.0)
        return cv2.warpAffine(image, M, (w, h), borderMode=cv2.BORDER_REFLECT)
    elif augmentation_type == 'brightness':
        brightness_factor = np.random.uniform(brightness_range[0], brightness_range[1])
        return np.clip(image * brightness_factor, 0, 255).astype(np.uint8)
    return image # return orginal image

In [37]:
# Create a directory to store the augmented images
aug_dir = 'augmented_train_images'
os.makedirs(aug_dir, exist_ok = True)
image_dir = '../CS610_AML_Group_Project/split_images/train'

all_images_paths = []
all_images_labels = []



sneaker_names_list = os.listdir(image_dir)
print("====== Image Augmentation Starts ======")
for sneaker_name in tqdm.tqdm(sneaker_names_list, desc="Augmenting"):
    original_path = os.path.join(image_dir, sneaker_name)
    if os.path.isdir(original_path):
        aug_path = os.path.join(aug_dir, sneaker_name)
        os.makedirs(aug_path, exist_ok = True)
        
        for image in os.listdir(original_path):

            # design saved path
            # 1. orinal
            image_full_path = os.path.join(original_path, image)
            original_image = cv2.imread(image_full_path)
            if original_image is None:
                print(f'WARNING: CANNOT READ IMAGE {image_full_path}, SKIPPED!')
                continue
            
            base, ext = os.path.splitext(image)

            # design saved path
            # 1. orinal
            image_name_original = f'{base}_original{ext}'
            original_image_saved_path = os.path.join(aug_path,image_name_original)
            # 2. flipped
            image_name_flipped = f'{base}_flipped{ext}'
            flipped_image_saved_path = os.path.join(aug_path,image_name_flipped)
            # 3. rotated
            image_name_rotated = f'{base}_rotated{ext}'
            rotated_image_saved_path = os.path.join(aug_path,image_name_rotated)
            # 4. bright
            image_name_brightened = f'{base}_brightened{ext}'
            brightened_image_saved_path = os.path.join(aug_path,image_name_brightened)

            # augmentation operations
            # 1. original
            cv2.imwrite(original_image_saved_path, original_image)
            all_images_paths.append(original_image_saved_path)
            all_images_labels.append(sneaker_name)

            # 2. flipped
            img_flipped = image_augmentation(original_image, augmentation_type = 'flip')
            cv2.imwrite(flipped_image_saved_path, img_flipped)
            all_images_paths.append(flipped_image_saved_path)
            all_images_labels.append(sneaker_name)

            # 3. rotated
            img_rotated = image_augmentation(original_image, augmentation_type = 'rotate')
            cv2.imwrite(rotated_image_saved_path, img_rotated)
            all_images_paths.append(rotated_image_saved_path)
            all_images_labels.append(sneaker_name)

            # 4. brightness
            img_bright = image_augmentation(original_image, augmentation_type = 'brightness')
            cv2.imwrite(brightened_image_saved_path, img_bright)
            all_images_paths.append(brightened_image_saved_path)
            all_images_labels.append(sneaker_name)

print("====== Image Augmentation Starts ======")

image_df_augmented = pd.DataFrame({
    'path': all_images_paths,
    'label': all_images_labels
})

print(f"We have now {len(image_df_augmented)} images for modelling")

image_df_augmented



Augmenting: 100%|██████████| 50/50 [01:16<00:00,  1.53s/it]

We have now 20736 images for modelling





Unnamed: 0,path,label
0,augmented_train_images\adidas_forum_high\0001_...,adidas_forum_high
1,augmented_train_images\adidas_forum_high\0001_...,adidas_forum_high
2,augmented_train_images\adidas_forum_high\0001_...,adidas_forum_high
3,augmented_train_images\adidas_forum_high\0001_...,adidas_forum_high
4,augmented_train_images\adidas_forum_high\0004_...,adidas_forum_high
...,...,...
20731,augmented_train_images\yeezy_slide\0144_bright...,yeezy_slide
20732,augmented_train_images\yeezy_slide\0145_origin...,yeezy_slide
20733,augmented_train_images\yeezy_slide\0145_flippe...,yeezy_slide
20734,augmented_train_images\yeezy_slide\0145_rotate...,yeezy_slide


### Pipeline Models using Feature Extraction Method 1 - By HOG

#### Feature Extraction by HOG

In [2]:
def extract_hog_features_recursive(input_dir, force_size = (128, 128), pixels_per_cell=(16, 16), cells_per_block=(2, 2)):
    features = []
    filenames = []
    supported_formats = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.webp')
    for root, dirs, files in tqdm.tqdm(os.walk(input_dir)):
        for filename in tqdm.tqdm(files):
            if filename.lower().endswith(supported_formats):
                img_path = os.path.join(root, filename)
                img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
                if img is None:
                    continue
                # force resized in case feature extraction failed
                img_resized = cv2.resize(img, force_size, interpolation=cv2.INTER_AREA)
                # pixel normalisation
                img_normalised = img_resized.astype(np.float32) / 255.0
                # Extract HOG features
                try:
                    hog_feature = hog(img_normalised, pixels_per_cell=pixels_per_cell, cells_per_block=cells_per_block, feature_vector=True)
                    features.append(hog_feature)
                    rel_path = os.path.relpath(img_path, input_dir)
                    filenames.append(rel_path)
                except Exception as e:
                    print("WARNING: {img_path} Failed with HOG feature extraction!")
                    continue
    hogged = np.array(features)
    return hogged, filenames


In [3]:
# train set:
input_dir = '../CS610_AML_Group_Project/augmented_train_images'
print('====== HOG Extraction Starts! ======')
hogged_train, filenames_train = extract_hog_features_recursive(input_dir)
print('====== HOG Extraction Completed! ======')
print(hogged_train.shape)  # (num_images, hog_feature_dim)

# test set:
input_dir = '../CS610_AML_Group_Project/split_images/test'
print('====== HOG Extraction Starts! ======')
hogged_test, filenames_test = extract_hog_features_recursive(input_dir)
print('====== HOG Extraction Completed! ======')
print(hogged_test.shape)  # (num_images, hog_feature_dim)




0it [00:00, ?it/s]
100%|██████████| 480/480 [00:09<00:00, 50.06it/s]
100%|██████████| 368/368 [00:08<00:00, 45.84it/s]
100%|██████████| 476/476 [00:10<00:00, 47.13it/s]
100%|██████████| 368/368 [00:07<00:00, 49.97it/s]
100%|██████████| 368/368 [00:06<00:00, 53.65it/s]
100%|██████████| 472/472 [00:08<00:00, 53.44it/s]
100%|██████████| 364/364 [00:05<00:00, 69.42it/s]
100%|██████████| 480/480 [00:09<00:00, 53.22it/s]
100%|██████████| 292/292 [00:05<00:00, 52.74it/s]
100%|██████████| 368/368 [00:06<00:00, 56.24it/s]
100%|██████████| 472/472 [00:09<00:00, 50.76it/s]
100%|██████████| 364/364 [00:07<00:00, 51.49it/s]
100%|██████████| 364/364 [00:07<00:00, 51.38it/s]
100%|██████████| 480/480 [00:07<00:00, 62.27it/s]
100%|██████████| 348/348 [00:06<00:00, 50.87it/s]
100%|██████████| 480/480 [00:08<00:00, 54.70it/s]
100%|██████████| 480/480 [00:09<00:00, 51.28it/s]
100%|██████████| 364/364 [00:07<00:00, 49.23it/s]
100%|██████████| 480/480 [00:09<00:00, 49.68it/s]
100%|██████████| 368/368 [00:06

(20736, 1764)


0it [00:00, ?it/s]
100%|██████████| 30/30 [00:00<00:00, 57.56it/s]
100%|██████████| 23/23 [00:00<00:00, 61.19it/s]
100%|██████████| 30/30 [00:00<00:00, 73.18it/s]
100%|██████████| 23/23 [00:00<00:00, 79.64it/s]
100%|██████████| 23/23 [00:00<00:00, 62.87it/s]
100%|██████████| 29/29 [00:00<00:00, 55.80it/s]
100%|██████████| 23/23 [00:00<00:00, 54.24it/s]
100%|██████████| 30/30 [00:00<00:00, 66.47it/s]
100%|██████████| 18/18 [00:00<00:00, 59.36it/s]
100%|██████████| 23/23 [00:00<00:00, 61.93it/s]
100%|██████████| 30/30 [00:00<00:00, 69.47it/s]
100%|██████████| 23/23 [00:00<00:00, 59.57it/s]
100%|██████████| 23/23 [00:00<00:00, 66.64it/s]
100%|██████████| 30/30 [00:00<00:00, 62.60it/s]
100%|██████████| 21/21 [00:00<00:00, 56.54it/s]
100%|██████████| 30/30 [00:00<00:00, 62.05it/s]
100%|██████████| 30/30 [00:00<00:00, 65.88it/s]
100%|██████████| 22/22 [00:00<00:00, 54.84it/s]
100%|██████████| 30/30 [00:00<00:00, 60.43it/s]
100%|██████████| 23/23 [00:00<00:00, 58.58it/s]
100%|██████████| 29/2

(1296, 1764)





In [4]:
#Labeling
y_train = [f.split(os.sep)[0] for f in filenames_train]

#split data into train_test split
x_train = hogged_train.astype(np.float32)
y_train = np.array(y_train)
y_train, uniques = pd.factorize(y_train)
x_train = pd.DataFrame(x_train, dtype=np.float32)
y_train = pd.Series(y_train, dtype=np.int32)

In [5]:
#Labeling
y_test = [f.split(os.sep)[0] for f in filenames_test]

#split data into train_test split
x_test = hogged_test.astype(np.float32)
y_test = np.array(y_test)
y_test, uniques = pd.factorize(y_test)
x_test = pd.DataFrame(x_test, dtype=np.float32)
y_test = pd.Series(y_test, dtype=np.int32)

In [6]:
#Check if data is prepared successfully
print("Number of Train Samples:", len(y_train))
print("Number of Train Labels:", len(np.unique(y_train)))
counts = y_train.value_counts()
print("Train Label Distribution:")
print(counts)

print("Number of Test Samples:", len(y_test))
print("Number of Test Labels:", len(np.unique(y_test)))
counts = y_test.value_counts()
print("Test Label Distribution:")
print(counts)

Number of Train Samples: 20736
Number of Train Labels: 50
Train Label Distribution:
0     480
26    480
29    480
18    480
16    480
15    480
7     480
36    480
13    480
37    480
33    476
28    476
45    476
2     476
10    472
41    472
42    472
43    472
5     472
48    472
20    472
21    472
39    472
49    464
24    368
40    368
3     368
46    368
4     368
19    368
30    368
31    368
32    368
35    368
9     368
1     368
38    368
44    364
6     364
12    364
11    364
17    364
23    364
22    360
14    348
47    348
34    344
27    340
25    320
8     292
Name: count, dtype: int64
Number of Test Samples: 1296
Number of Test Labels: 50
Test Label Distribution:
0     30
37    30
28    30
18    30
29    30
16    30
15    30
33    30
13    30
36    30
10    30
21    30
39    30
41    30
7     30
43    30
45    30
48    30
2     30
26    30
42    29
49    29
20    29
5     29
22    23
35    23
3     23
46    23
4     23
6     23
40    23
9     23
38    23
11    23
23  

#### Feature Standardisation

In [7]:
print("\n====== Feature Standardisation Started! ======")
scaler = StandardScaler()
scaler.fit(x_train) 

x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

print("\n====== Feature Standardisation Completed! ======")
print(f"The Shape for Training Set after Feature Standardisation: {x_train_scaled.shape}")
print(f"The Shape for Testing Set after Feature Standardisation: {x_test_scaled.shape}")



The Shape for Training Set after Feature Standardisation: (20736, 1764)
The Shape for Testing Set after Feature Standardisation: (1296, 1764)


In [8]:
print("\n====== Dimensionality Reduction by PCA Started! ======")
pca = PCA(n_components=0.85, random_state=42) 
pca.fit(x_train_scaled)


x_train_pca = pca.transform(x_train_scaled)
x_test_pca = pca.transform(x_test_scaled)

print("\n====== Dimensionality Reduction by PCA Completed! ======")
print(f"The Shape for Training Set after Dimensionality Reduction by PCA: {x_train_pca.shape}")
print(f"The Shape for Testing Set after Dimensionality Reduction by PCA: {x_test_pca.shape}")
print(f"The Number of Chosen PCA: {pca.n_components_}")
print(f"The Explained Variance Ratio: {np.sum(pca.explained_variance_ratio_):.4f}")



The Shape for Training Set after Dimensionality Reduction by PCA: (20736, 241)
The Shape for Testing Set after Dimensionality Reduction by PCA: (1296, 241)
The Number of Chosen PCA: 241
The Explained Variance Ratio: 0.8505


#### 1) RandomForestClassifier - feature extraction by hog

In [47]:
skip_train = False

if skip_train:
    # Import previous model
    print("Training skipped, importing model trained previously...")
    print("Fitted 3 folds for each of 30 candidates, totalling 30 fits")
    with open('model_bank/best_hog_rf_model.pkl', 'rb') as file:
        best_hog_rf = pickle.load(file)

else:
    start_time = time.time()

    # Base model
    base_model = RandomForestClassifier(random_state=42)

    # Hyperparameters
    param_dist = {
        'n_estimators': [50, 100, 150, 200],
        'max_depth': [10, 20, 30, 40],
        'max_features': ['sqrt', 'log2', 0.5, 0.8, 1.0]
    }

    # Randomized search tuning
    random_search = RandomizedSearchCV(
        base_model,
        param_dist,
        n_iter=10,
        scoring='accuracy',
        cv=3,
        verbose=2,
        random_state=42,
        error_score='raise',
        n_jobs=-1
    )
    random_search.fit(x_train_pca, y_train)

    # End timing
    end_time = time.time()
    training_time = end_time - start_time

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [48]:
if skip_train:
    print("Training skipped, printing model trained previously...\n")
    print("Best Parameters: {'n_estimators': 150, 'max_features': 0.5, 'max_depth': 40}")
    print("Best Accuracy: 0.563416")
    training_time = 47.19
else:
    best_hog_rf = random_search.best_estimator_
    print("Best Parameters:", random_search.best_params_)
    print(f"Best Accuracy: {random_search.best_score_:.6f}")
    training_time = round(training_time /60, 2)
print(f"Total Training Time: {training_time} minutes")

Best Parameters: {'n_estimators': 150, 'max_features': 0.5, 'max_depth': 40}
Best Accuracy: 0.209008
Total Training Time: 26.8 minutes


In [51]:
# Predict
y_train_pred = best_hog_rf.predict(x_train_pca)
y_test_pred = best_hog_rf.predict(x_test_pca)

# Function to save metrics
metrics = {"Metric": [], "Average Type": [], "Train": [], "Test": []}
def add_metric(name, avg_type, train_value, test_value):
    metrics["Metric"].append(name)
    metrics["Average Type"].append(avg_type)
    metrics["Train"].append(train_value)
    metrics["Test"].append(test_value)

# Training time
add_metric("Training time (minutes)", "N/A", training_time, "N/A")

# Accuracy
add_metric("Accuracy", "N/A",
           accuracy_score(y_train, y_train_pred),
           accuracy_score(y_test, y_test_pred))

# Precision
for avg in ['macro', 'micro', 'weighted']:
    add_metric("Precision", avg,
               precision_score(y_train, y_train_pred, average=avg),
               precision_score(y_test, y_test_pred, average=avg))

# Recall
for avg in ['macro', 'micro', 'weighted']:
    add_metric("Recall", avg,
               recall_score(y_train, y_train_pred, average=avg),
               recall_score(y_test, y_test_pred, average=avg))

# F0.5-Score 
beta = 0.5 # mis-labelled sneakers are more costly than missing labels
for avg in ['macro', 'micro', 'weighted']:
    add_metric(f"F{beta}-Score", avg,
               fbeta_score(y_train, y_train_pred, beta=beta, average=avg),
               fbeta_score(y_test, y_test_pred, beta=beta, average=avg))

# Display metrics
hog_rf_metrics = pd.DataFrame(metrics)
pd.set_option('display.precision', 6)
display(hog_rf_metrics)

Unnamed: 0,Metric,Average Type,Train,Test
0,Training time (minutes),,26.8,
1,Accuracy,,0.999035,0.307099
2,Precision,macro,0.99895,0.328319
3,Precision,micro,0.999035,0.307099
4,Precision,weighted,0.999048,0.322049
5,Recall,macro,0.998908,0.301281
6,Recall,micro,0.999035,0.307099
7,Recall,weighted,0.999035,0.307099
8,F0.5-Score,macro,0.998939,0.313624
9,F0.5-Score,micro,0.999035,0.307099


In [53]:
export = True

if not export:
    print("Model not exported")
else:
    model_bank_dir = '../CS610_AML_Group_Project/model_bank'
    os.makedirs(model_bank_dir, exist_ok=True)
    model_filename_pickle = 'best_hog_rf_model.pkl'
    model_path = os.path.join(model_bank_dir, model_filename_pickle)
    with open(model_path, 'wb') as file: 
        pickle.dump(best_hog_rf, file)
    print(f"Model Saved Successfully {model_path}")

Model Saved Successfully ../CS610_AML_Group_Project/model_bank\best_hog_rf_model.pkl


#### 2) KNNClassifier - feature extraction by hog

In [9]:
skip_train = False

if skip_train:
    # Import previous model
    print("Training skipped, importing model trained previously...")
    print("Fitted 3 folds for each of 10 candidates, totalling 30 fits")
    with open('model_bank/best_hog_knn_model.pkl', 'rb') as file:
        best_hog_knn = pickle.load(file)

else:

    # Start timing
    start_time = time.time()

    # Base model
    base_model = KNeighborsClassifier()

    # Hyperparameters
    param_dist = {
        'n_neighbors': randint(1, 30),
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan', 'cosine']
    }

    # Randomized search tuning
    random_search = RandomizedSearchCV(
        base_model,
        param_dist,
        n_iter=10,
        scoring='accuracy',
        cv=3,
        verbose=2,
        random_state=42,
        error_score='raise',
        n_jobs=-1
    )
    random_search.fit(x_train_pca, y_train)

    # End timing
    end_time = time.time()
    training_time = end_time - start_time

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [15]:
skip_train = False
if skip_train:
    print("Training skipped, printing model trained previously...\n")
    print("Best Parameters: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'distance'}")
    print("Best Accuracy: 0.675010")
    training_time = 1.03
else:
    best_hog_knn = random_search.best_estimator_
    print("Best Parameters:", random_search.best_params_)
    print(f"Best Accuracy: {random_search.best_score_:.6f}")
    training_time = round(training_time / 60, 2)
print(f"Total Training Time: {training_time} minutes")

Best Parameters: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'distance'}
Best Accuracy: 0.279900
Total Training Time: 0.0 minutes


In [17]:
# Predict
y_train_pred = best_hog_knn.predict(x_train_pca)
y_test_pred = best_hog_knn.predict(x_test_pca)

# Function to save metrics
metrics = {"Metric": [], "Average Type": [], "Train": [], "Test": []}
def add_metric(name, avg_type, train_value, test_value):
    metrics["Metric"].append(name)
    metrics["Average Type"].append(avg_type)
    metrics["Train"].append(train_value)
    metrics["Test"].append(test_value)

# Training time
add_metric("Training time (minutes)", "N/A", training_time, "N/A")

# Accuracy
add_metric("Accuracy", "N/A",
           accuracy_score(y_train, y_train_pred),
           accuracy_score(y_test, y_test_pred))

# Precision
for avg in ['macro', 'micro', 'weighted']:
    add_metric("Precision", avg,
               precision_score(y_train, y_train_pred, average=avg),
               precision_score(y_test, y_test_pred, average=avg))

# Recall
for avg in ['macro', 'micro', 'weighted']:
    add_metric("Recall", avg,
               recall_score(y_train, y_train_pred, average=avg),
               recall_score(y_test, y_test_pred, average=avg))

# F0.5-Score 
beta = 0.5 # mis-labelled sneakers are more costly than missing labels
for avg in ['macro', 'micro', 'weighted']:
    add_metric(f"F{beta}-Score", avg,
               fbeta_score(y_train, y_train_pred, beta=beta, average=avg),
               fbeta_score(y_test, y_test_pred, beta=beta, average=avg))

# Display metrics
hog_knn_metrics = pd.DataFrame(metrics)
pd.set_option('display.precision', 6)
display(hog_knn_metrics)

Unnamed: 0,Metric,Average Type,Train,Test
0,Training time (minutes),,0.0,
1,Accuracy,,0.998939,0.368827
2,Precision,macro,0.998861,0.379593
3,Precision,micro,0.998939,0.368827
4,Precision,weighted,0.998971,0.385752
5,Recall,macro,0.998801,0.367517
6,Recall,micro,0.998939,0.368827
7,Recall,weighted,0.998939,0.368827
8,F0.5-Score,macro,0.998841,0.371319
9,F0.5-Score,micro,0.998939,0.368827


In [18]:
export = True

if not export:
    print("Model not exported")
else:
    model_bank_dir = '../CS610_AML_Group_Project/model_bank'
    os.makedirs(model_bank_dir, exist_ok=True)
    model_filename_pickle = 'best_hog_knn_model.pkl'
    model_path = os.path.join(model_bank_dir, model_filename_pickle)
    with open(model_path, 'wb') as file: 
        pickle.dump(best_hog_knn, file)
    print(f"Model Saved Successfully {model_path}")

Model Saved Successfully ../CS610_AML_Group_Project/model_bank\best_hog_knn_model.pkl


#### 3) XGBoostClassifier - feature extraction by hog

In [20]:
skip_train = False

if skip_train:
    # Import previous model
    print("Training skipped, importing model trained previously...")
    print("Fitted 3 folds for each of 10 candidates, totalling 30 fits")
    with open('model_bank/best_hog_xgb_model.pkl', 'rb') as file:
        best_hog_xgb = pickle.load(file)

else:

    # Start timing
    start_time = time.time()

    # Balance class weights
    sample_weights = compute_sample_weight(
        class_weight="balanced",
        y=y_train
    )

    # Base model
    base_model = xgb.XGBClassifier(
        device="cuda",
        tree_method="hist",
        objective="multi:softprob",
        num_class=len(np.unique(y_train)),
        eval_metric=['merror','mlogloss'],
        random_state=42,
    )

    # Hyperparameters
    param_dist = {
        'n_estimators': randint(50, 500),
        'max_depth': randint(3, 12),
        'learning_rate': uniform(0.01, 0.19),  # range: 0.01 to 0.2
        'subsample': uniform(0.7, 0.3),        # range: 0.7 to 1.0
        'colsample_bytree': uniform(0.7, 0.3)  # range: 0.7 to 1.0
    }

    # Randomized search tuning
    random_search = RandomizedSearchCV(
        base_model,
        param_dist,
        n_iter=10,
        scoring='accuracy',
        cv=3,
        verbose=2,
        random_state=42,
        error_score='raise',
        n_jobs=-1
    )
    random_search.fit(
        x_train_pca, y_train,
        sample_weight = sample_weights)

    # End timing
    end_time = time.time()
    training_time = end_time - start_time

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [21]:
if skip_train:
    print("Training skipped, printing model trained previously...\n")
    print("Best Parameters: {'colsample_bytree': 0.7692681476866446, 'learning_rate': 0.05579483854494223, 'max_depth': 9, 'n_estimators': 477, 'subsample': 0.848553073033381}")
    print("Best Accuracy: 0.569396")
    training_time = 42.59
else:
    best_hog_xgb = random_search.best_estimator_
    print("Best Parameters:", random_search.best_params_)
    print(f"Best Accuracy: {random_search.best_score_:.6f}")
    training_time = round(training_time / 60, 2)
print(f"Total Training Time: {training_time} minutes")

Best Parameters: {'colsample_bytree': 0.7692681476866446, 'learning_rate': 0.05579483854494223, 'max_depth': 9, 'n_estimators': 477, 'subsample': 0.848553073033381}
Best Accuracy: 0.256028
Total Training Time: 48.2 minutes


In [22]:
# Predict
y_train_pred = best_hog_xgb.predict(x_train_pca)
y_test_pred = best_hog_xgb.predict(x_test_pca)

# Function to save metrics
metrics = {"Metric": [], "Average Type": [], "Train": [], "Test": []}
def add_metric(name, avg_type, train_value, test_value):
    metrics["Metric"].append(name)
    metrics["Average Type"].append(avg_type)
    metrics["Train"].append(train_value)
    metrics["Test"].append(test_value)

# Training time
add_metric("Training time (minutes)", "N/A", training_time, "N/A")

# Accuracy
add_metric("Accuracy", "N/A",
           accuracy_score(y_train, y_train_pred),
           accuracy_score(y_test, y_test_pred))

# Precision
for avg in ['macro', 'micro', 'weighted']:
    add_metric("Precision", avg,
               precision_score(y_train, y_train_pred, average=avg),
               precision_score(y_test, y_test_pred, average=avg))

# Recall
for avg in ['macro', 'micro', 'weighted']:
    add_metric("Recall", avg,
               recall_score(y_train, y_train_pred, average=avg),
               recall_score(y_test, y_test_pred, average=avg))

# F0.5-Score 
beta = 0.5 # mis-labelled sneakers are more costly than missing labels
for avg in ['macro', 'micro', 'weighted']:
    add_metric(f"F{beta}-Score", avg,
               fbeta_score(y_train, y_train_pred, beta=beta, average=avg),
               fbeta_score(y_test, y_test_pred, beta=beta, average=avg))

# Display metrics
hog_xgb_metrics = pd.DataFrame(metrics)
pd.set_option('display.precision', 6)
display(hog_xgb_metrics)

Unnamed: 0,Metric,Average Type,Train,Test
0,Training time (minutes),,48.2,
1,Accuracy,,0.999035,0.341821
2,Precision,macro,0.998918,0.353889
3,Precision,micro,0.999035,0.341821
4,Precision,weighted,0.999046,0.353453
5,Recall,macro,0.998934,0.337784
6,Recall,micro,0.999035,0.341821
7,Recall,weighted,0.999035,0.341821
8,F0.5-Score,macro,0.998919,0.345981
9,F0.5-Score,micro,0.999035,0.341821


In [23]:
export = True

if not export:
    print("Model not exported")
else:
    model_bank_dir = '../CS610_AML_Group_Project/model_bank'
    os.makedirs(model_bank_dir, exist_ok=True)
    model_filename_pickle = 'best_hog_xgb_model.pkl'
    model_path = os.path.join(model_bank_dir, model_filename_pickle)
    with open(model_path, 'wb') as file: 
        pickle.dump(best_hog_xgb, file)
    print(f"Model Saved Successfully {model_path}")

Model Saved Successfully ../CS610_AML_Group_Project/model_bank\best_hog_xgb_model.pkl


### Pipeline Models using Feature Extraction Method 2 - Using pretrained CNN

ResNet50 will be used as the feature extractor due to its pre-trained weights, derived from large datasets like ImageNet, and is a popular choice to use for computer vision applications such as image classification.
Reference:
1) https://medium.com/@meetkalathiya1301/feature-extraction-using-pre-trained-models-for-image-classification-16e6ff43f268
2) https://stackoverflow.com/questions/62117707/extract-features-from-pretrained-resnet50-in-pytorch

In [24]:
#Process image data for feature extraction using CNN
input_dir = '../CS610_AML_Group_Project/resized_images'
img_transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize(mean=[0.485,0.456,0.406],std=[0.229,0.224,0.225])]) #mean and std based on ImageNet - normalise image data closer to normal distribution
img_dataset = datasets.ImageFolder(input_dir, transform=img_transform)
data_loader = DataLoader(img_dataset, batch_size=32, num_workers=4)

In [25]:
#define function for CNN feature extraction
def cnn_feature_extract(cnn_feature_extractor, data_loader):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    #prepare cnn model to use for feature extraction
    cnn_feature_extractor.eval()
    cnn_feature_extractor.fc = torch.nn.Identity() #replace fully connected layer of pretrained cnn with Identity layer
    for para in cnn_feature_extractor.parameters():
        para.requires_grad = False #freeze weights
    #feature extraction
    features_list, labels_list = [], []
    cnn_feature_extractor.to(device)
    with torch.no_grad():
        for images, labels in data_loader:
            images = images.to(device)
            feature = cnn_feature_extractor(images)
            feature = feature.view(feature.size(0),-1) #flatten into (n_samples, n_features) for non-CNN models
            #convert tensors into numpy for fitting into non-CNN models and add into lists
            features_list.append(feature.cpu().numpy())
            labels_list.append(labels.numpy())

    return cnn_feature_extractor, np.vstack(features_list), np.hstack(labels_list)

In [26]:
#initialise and extract features using CNN feature extractor
weights = models.ResNet50_Weights.IMAGENET1K_V2
resnet50_extractor = models.resnet50(weights=weights)
resnet50_extractor, X, y = cnn_feature_extract(resnet50_extractor, data_loader) #X = features, y =labels
#no need labelling as the numpy array is generated from the data_loader

In [27]:
#CNN training and test split
x_train, x_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42, stratify=y)
x_train = pd.DataFrame(x_train, dtype=np.float32)
y_train = pd.Series(y_train, dtype=np.int32)
x_test = pd.DataFrame(x_test, dtype=np.float32)
y_test = pd.Series(y_test, dtype=np.int32)
#same as original flow
print("Number of Samples:", len(y_train))
print("Number of Labels:", len(np.unique(y_train)))
counts = y_train.value_counts()
print("Label Distribution:")
print(counts)

Number of Samples: 5184
Number of Labels: 50
Label Distribution:
36    120
18    120
29    120
13    120
7     120
15    120
16    120
26    120
0     120
37    120
33    119
2     119
45    119
28    119
5     118
21    118
43    118
48    118
20    118
10    118
42    118
39    118
41    118
49    116
46     92
19     92
3      92
24     92
9      92
1      92
4      92
30     92
31     92
35     92
40     92
38     92
32     92
6      91
11     91
23     91
12     91
17     91
44     91
22     90
14     87
47     87
34     86
27     85
25     80
8      73
Name: count, dtype: int64


#### 1) RandomForestClassifier - feature extraction by CNN

In [28]:
skip_train = False

if skip_train:
    # Import previous model
    print("Training skipped, importing model trained previously...")
    print("Fitted 3 folds for each of 30 candidates, totalling 30 fits")
    with open('model_bank/best_cnn_rf_model.pkl', 'rb') as file:
        best_cnn_rf = pickle.load(file)

else:
    start_time = time.time()

    # Base model
    base_model = RandomForestClassifier(random_state=42)

    # Hyperparameters
    param_dist = {
        'n_estimators': [50, 100, 150, 200],
        'max_depth': [10, 20, 30, 40],
        'max_features': ['sqrt', 'log2', 0.5, 0.8, 1.0]
    }

    # Randomized search tuning
    random_search = RandomizedSearchCV(
        base_model,
        param_dist,
        n_iter=10,
        scoring='accuracy',
        cv=3,
        verbose=2,
        random_state=42,
        error_score='raise',
        n_jobs=-1
    )
    random_search.fit(x_train, y_train)

    # End timing
    end_time = time.time()
    training_time = end_time - start_time

Fitting 3 folds for each of 10 candidates, totalling 30 fits


KeyboardInterrupt: 

In [None]:
if skip_train:
    print("Training skipped, printing model trained previously...\n")
    print("Best Parameters: {'n_estimators': 150, 'max_features': 'sqrt', 'max_depth': 20}")
    print("Best Accuracy: 0.719473")
    training_time = 87.56
else:
    best_cnn_rf = random_search.best_estimator_
    print("Best Parameters:", random_search.best_params_)
    print(f"Best Accuracy: {random_search.best_score_:.6f}")
    training_time = round(training_time / 60, 2)
print(f"Total Training Time: {training_time} minutes")

In [None]:
# Predict
y_train_pred = best_cnn_rf.predict(x_train)
y_test_pred = best_cnn_rf.predict(x_test)

# Function to save metrics
metrics = {"Metric": [], "Average Type": [], "Train": [], "Test": []}
def add_metric(name, avg_type, train_value, test_value):
    metrics["Metric"].append(name)
    metrics["Average Type"].append(avg_type)
    metrics["Train"].append(train_value)
    metrics["Test"].append(test_value)

# Training time
add_metric("Training time (minutes)", "N/A", training_time, "N/A")

# Accuracy
add_metric("Accuracy", "N/A",
           accuracy_score(y_train, y_train_pred),
           accuracy_score(y_test, y_test_pred))

# Precision
for avg in ['macro', 'micro', 'weighted']:
    add_metric("Precision", avg,
               precision_score(y_train, y_train_pred, average=avg),
               precision_score(y_test, y_test_pred, average=avg))

# Recall
for avg in ['macro', 'micro', 'weighted']:
    add_metric("Recall", avg,
               recall_score(y_train, y_train_pred, average=avg),
               recall_score(y_test, y_test_pred, average=avg))

# F0.5-Score 
beta = 0.5 # mis-labelled sneakers are more costly than missing labels
for avg in ['macro', 'micro', 'weighted']:
    add_metric(f"F{beta}-Score", avg,
               fbeta_score(y_train, y_train_pred, beta=beta, average=avg),
               fbeta_score(y_test, y_test_pred, beta=beta, average=avg))

# Display metrics
cnn_rf_metrics = pd.DataFrame(metrics)
pd.set_option('display.precision', 6)
display(cnn_rf_metrics)

In [None]:
export = True

if not export:
    print("Model not exported")
else:
    model_bank_dir = '../CS610_AML_Group_Project/model_bank'
    os.makedirs(model_bank_dir, exist_ok=True)
    model_filename_pickle = 'best_cnn_rf_model.pkl'
    model_path = os.path.join(model_bank_dir, model_filename_pickle)
    with open(model_path, 'wb') as file: 
        pickle.dump(best_cnn_rf, file)
    print(f"Model Saved Successfully {model_path}")

#### 2) KNNClassifier - feature extraction by CNN

In [29]:
skip_train = False

if skip_train:
    # Import previous model
    print("Training skipped, importing model trained previously...")
    print("Fitted 3 folds for each of 10 candidates, totalling 30 fits")
    with open('model_bank/best_cnn_knn_model.pkl', 'rb') as file:
        best_cnn_knn = pickle.load(file)

else:

    # Start timing
    start_time = time.time()

    # Base model
    base_model = KNeighborsClassifier()

    # Hyperparameters
    param_dist = {
        'n_neighbors': randint(1, 30),
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan', 'cosine']
    }

    # Randomized search tuning
    random_search = RandomizedSearchCV(
        base_model,
        param_dist,
        n_iter=10,
        scoring='accuracy',
        cv=3,
        verbose=2,
        random_state=42,
        error_score='raise',
        n_jobs=-1
    )
    random_search.fit(x_train, y_train)

    # End timing
    end_time = time.time()
    training_time = end_time - start_time

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [30]:
if skip_train:
    print("Training skipped, printing model trained previously...\n")
    print("Best Parameters: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'distance'}")
    print("Best Accuracy: 0.896123")
    training_time = 3.96
else:
    best_cnn_knn = random_search.best_estimator_
    print("Best Parameters:", random_search.best_params_)
    print(f"Best Accuracy: {random_search.best_score_:.6f}")
    training_time = round(training_time / 60, 2)
print(f"Total Training Time: {training_time} minutes")

Best Parameters: {'metric': 'cosine', 'n_neighbors': 11, 'weights': 'distance'}
Best Accuracy: 0.302083
Total Training Time: 0.22 minutes


In [31]:
# Predict
y_train_pred = best_cnn_knn.predict(x_train)
y_test_pred = best_cnn_knn.predict(x_test)

# Function to save metrics
metrics = {"Metric": [], "Average Type": [], "Train": [], "Test": []}
def add_metric(name, avg_type, train_value, test_value):
    metrics["Metric"].append(name)
    metrics["Average Type"].append(avg_type)
    metrics["Train"].append(train_value)
    metrics["Test"].append(test_value)

# Training time
add_metric("Training time (minutes)", "N/A", training_time, "N/A")

# Accuracy
add_metric("Accuracy", "N/A",
           accuracy_score(y_train, y_train_pred),
           accuracy_score(y_test, y_test_pred))

# Precision
for avg in ['macro', 'micro', 'weighted']:
    add_metric("Precision", avg,
               precision_score(y_train, y_train_pred, average=avg),
               precision_score(y_test, y_test_pred, average=avg))

# Recall
for avg in ['macro', 'micro', 'weighted']:
    add_metric("Recall", avg,
               recall_score(y_train, y_train_pred, average=avg),
               recall_score(y_test, y_test_pred, average=avg))

# F0.5-Score 
beta = 0.5 # mis-labelled sneakers are more costly than missing labels
for avg in ['macro', 'micro', 'weighted']:
    add_metric(f"F{beta}-Score", avg,
               fbeta_score(y_train, y_train_pred, beta=beta, average=avg),
               fbeta_score(y_test, y_test_pred, beta=beta, average=avg))

# Display metrics
cnn_knn_metrics = pd.DataFrame(metrics)
pd.set_option('display.precision', 6)
display(cnn_knn_metrics)

Unnamed: 0,Metric,Average Type,Train,Test
0,Training time (minutes),,0.22,
1,Accuracy,,0.998071,0.367284
2,Precision,macro,0.997988,0.389494
3,Precision,micro,0.998071,0.367284
4,Precision,weighted,0.998176,0.393255
5,Recall,macro,0.997819,0.361929
6,Recall,micro,0.998071,0.367284
7,Recall,weighted,0.998071,0.367284
8,F0.5-Score,macro,0.997929,0.37711
9,F0.5-Score,micro,0.998071,0.367284


In [32]:
export = True

if not export:
    print("Model not exported")
else:
    model_bank_dir = '../CS610_AML_Group_Project/model_bank'
    os.makedirs(model_bank_dir, exist_ok=True)
    model_filename_pickle = 'best_cnn_knn_model.pkl'
    model_path = os.path.join(model_bank_dir, model_filename_pickle)
    with open(model_path, 'wb') as file: 
        pickle.dump(best_cnn_knn, file)
    print(f"Model Saved Successfully {model_path}")

Model Saved Successfully ../CS610_AML_Group_Project/model_bank\best_cnn_knn_model.pkl


#### 3) XGBoostClassifier - feature extraction by CNN

In [None]:
skip_train = False

if skip_train:
    # Import previous model
    print("Training skipped, importing model trained previously...")
    print("Fitted 3 folds for each of 30 candidates, totalling 30 fits")
    with open('model_bank/best_cnn_xgb_model.pkl', 'rb') as file:
        best_cnn_xgb = pickle.load(file)

else:

    # Start timing
    start_time = time.time()

    # Balance class weights
    sample_weights = compute_sample_weight(
        class_weight="balanced",
        y=y_train
    )

    # Base model
    base_model = xgb.XGBClassifier(
        device="cuda",
        tree_method="hist",
        objective="multi:softprob",
        num_class=len(np.unique(y_train)),
        eval_metric=['merror','mlogloss'],
        random_state=42,
    )

    # Hyperparameters
    param_dist = {
        'n_estimators': randint(50, 500),
        'max_depth': randint(3, 12),
        'learning_rate': uniform(0.01, 0.19),  # range: 0.01 to 0.2
        'subsample': uniform(0.7, 0.3),        # range: 0.7 to 1.0
        'colsample_bytree': uniform(0.7, 0.3)  # range: 0.7 to 1.0
    }

    # Randomized search tuning
    random_search = RandomizedSearchCV(
        base_model,
        param_dist,
        n_iter=10,
        scoring='accuracy',
        cv=3,
        verbose=2,
        random_state=42,
        error_score='raise',
        n_jobs=-1
    )
    random_search.fit(
        x_train, y_train,
        sample_weight = sample_weights)

    # End timing
    end_time = time.time()
    training_time = end_time - start_time

In [None]:
if skip_train:
    print("Training skipped, printing model trained previously...\n")
    print("Best Parameters: {'colsample_bytree': 0.7195154778955838, 'learning_rate': 0.19028825207813332, 'max_depth': 4, 'n_estimators': 314, 'subsample': 0.7047898756660642}")
    print("Best Accuracy: 0.796345")
    training_time = 216.38
else:
    best_cnn_xgb = random_search.best_estimator_
    print("Best Parameters:", random_search.best_params_)
    print(f"Best Accuracy: {random_search.best_score_:.6f}")
    training_time = round(training_time / 60, 2)
print(f"Total Training Time: {training_time} minutes")

In [None]:
# Predict
y_train_pred = best_cnn_xgb.predict(x_train)
y_test_pred = best_cnn_xgb.predict(x_test)

# Function to save metrics
metrics = {"Metric": [], "Average Type": [], "Train": [], "Test": []}
def add_metric(name, avg_type, train_value, test_value):
    metrics["Metric"].append(name)
    metrics["Average Type"].append(avg_type)
    metrics["Train"].append(train_value)
    metrics["Test"].append(test_value)

# Training time
add_metric("Training time (minutes)", "N/A", training_time, "N/A")

# Accuracy
add_metric("Accuracy", "N/A",
           accuracy_score(y_train, y_train_pred),
           accuracy_score(y_test, y_test_pred))

# Precision
for avg in ['macro', 'micro', 'weighted']:
    add_metric("Precision", avg,
               precision_score(y_train, y_train_pred, average=avg),
               precision_score(y_test, y_test_pred, average=avg))

# Recall
for avg in ['macro', 'micro', 'weighted']:
    add_metric("Recall", avg,
               recall_score(y_train, y_train_pred, average=avg),
               recall_score(y_test, y_test_pred, average=avg))

# F0.5-Score 
beta = 0.5 # mis-labelled sneakers are more costly than missing labels
for avg in ['macro', 'micro', 'weighted']:
    add_metric(f"F{beta}-Score", avg,
               fbeta_score(y_train, y_train_pred, beta=beta, average=avg),
               fbeta_score(y_test, y_test_pred, beta=beta, average=avg))

# Display metrics
cnn_xgb_metrics = pd.DataFrame(metrics)
pd.set_option('display.precision', 6)
display(cnn_xgb_metrics)

In [None]:
export = True

if not export:
    print("Model not exported")
else:
    model_bank_dir = '../CS610_AML_Group_Project/model_bank'
    os.makedirs(model_bank_dir, exist_ok=True)
    model_filename_pickle = 'best_cnn_xgb_model.pkl'
    model_path = os.path.join(model_bank_dir, model_filename_pickle)
    with open(model_path, 'wb') as file: 
        pickle.dump(best_cnn_xgb, file)
    print(f"Model Saved Successfully {model_path}")

### Model Stacking

Stacking is a method that help to improve the overall performance of models as the weakness of a certain models can be compensated by the strengths of other models. Hence, we decided to utilise stacking to improve the overall performance of the model. For this technique, only the CNN-feature extraction method will be used as it has been proven to provide better model performance (in terms of accuracy).
<br>
<br>
Using the CNN extracted feature set and the models earlier in the code, they will be used in this stacking technique to determine if stacking improves the overall performance. 

#### Import the models

#### Set up stacking

In [None]:
skip_train = True

if skip_train:
    # Import previous model
    print("Training skipped, importing model trained previously...")
    with open('model_bank/stacked_model_pipeline.pkl', 'rb') as file:
        stacking_cf = pickle.load(file)
    training_time = 135.15

else:

    # Start timing
    start_time = time.time()

    # Stacked model
    estimators = [('rcf_model',RandomForestClassifier(n_estimators=150, 
                                                    max_features="sqrt", 
                                                    max_depth=20, 
                                                    random_state=42)),
                ("xgboost",xgb.XGBClassifier(colsample_bytree=0.7195154778955838, 
                                            learning_rate= 0.19028825207813332, 
                                            max_depth= 4, n_estimators= 314, 
                                            subsample=0.7047898756660642,
                                            device="cuda",
                                            tree_method="hist", 
                                            objective="multi:softprob", 
                                            num_class=len(np.unique(y_train)),
                                            eval_metric=['merror','mlogloss'],
                                            random_state=42)),
                ("knn", KNeighborsClassifier(metric= "euclidean", 
                                            n_neighbors= 1, 
                                            weights="distance"))]

    stacking_cf = StackingClassifier(estimators=estimators, 
                                    final_estimator=LogisticRegression(), 
                                    cv=3, 
                                    passthrough=False, 
                                    verbose=1)

    # Train
    start_time = time.time()
    stacking_cf.fit(x_train,y_train)

    # End timing
    end_time = time.time()
    training_time = end_time - start_time
    training_time = round(training_time / 60, 2)

print(f"Total Training Time: {training_time} minutes")

In [None]:
# Predict
y_train_pred = stacking_cf.predict(x_train)
y_test_pred = stacking_cf.predict(x_test)

# Function to save metrics
metrics = {"Metric": [], "Average Type": [], "Train": [], "Test": []}
def add_metric(name, avg_type, train_value, test_value):
    metrics["Metric"].append(name)
    metrics["Average Type"].append(avg_type)
    metrics["Train"].append(train_value)
    metrics["Test"].append(test_value)

# Training time
add_metric("Training time (minutes)", "N/A", training_time, "N/A")

# Accuracy
add_metric("Accuracy", "N/A",
           accuracy_score(y_train, y_train_pred),
           accuracy_score(y_test, y_test_pred))

# Precision
for avg in ['macro', 'micro', 'weighted']:
    add_metric("Precision", avg,
               precision_score(y_train, y_train_pred, average=avg),
               precision_score(y_test, y_test_pred, average=avg))

# Recall
for avg in ['macro', 'micro', 'weighted']:
    add_metric("Recall", avg,
               recall_score(y_train, y_train_pred, average=avg),
               recall_score(y_test, y_test_pred, average=avg))

# F0.5-Score 
beta = 0.5 # mis-labelled sneakers are more costly than missing labels
for avg in ['macro', 'micro', 'weighted']:
    add_metric(f"F{beta}-Score", avg,
               fbeta_score(y_train, y_train_pred, beta=beta, average=avg),
               fbeta_score(y_test, y_test_pred, beta=beta, average=avg))

# Display metrics
stack_metrics = pd.DataFrame(metrics)
pd.set_option('display.precision', 6)
display(stack_metrics)

In [None]:
export = True

if not export:
    print("Model not exported")
else:
    model_bank_dir = '../CS610_AML_Group_Project/model_bank'
    os.makedirs(model_bank_dir, exist_ok=True)
    model_filename_pickle = 'stacked_model_pipeline.pkl'
    model_path = os.path.join(model_bank_dir, model_filename_pickle)
    with open(model_path, 'wb') as file: 
        pickle.dump(stacking_cf, file)
    print(f"Model Saved Successfully {model_path}")