### **Unsupervised Learning**

**Data preparation**
- convert images into feature vectors (ensure normalization for normalization and preprocessing for consistency)

**Clustering**
- explore K-means, hierarchical clustering, or DBSCAN to see which one is best at identify patterns
- choose optimal number of clusters (K) using techniques like the elbow method or silhouette score
- evaluate performance and visualize clustering results for insights into data distribution and cluster relationships

**PCA**
- reduce dimensionality and select number of principal components based on variance ratio
- analyze contribution of each component and assess impact on data structure

**(Optional) combination of clustering and pca**
- compare performance using original features vs. PCA-reduced features

**Integration with supervised learning**
- explore clustering and pca results (and the optional combination of them) as additional inputs to P2_supervised model (based off EfficientNet)
- evaluate enhancement of supervised learning performance

**Analyze Model Performance**
- plot loss and accuracy over epochs to visualize training progress and identify potential overfitting or underfitting.
- create a confusion matrix to examine how well the model distinguishes between classes.

In [73]:
# importing all necessary libraries
import glob
import warnings
import random
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

import torch
import torchvision 
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm

from PIL import Image
from typing import Dict, List, Tuple
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from datasets import BrainTumorMRIDataset

In [53]:
class CFG:
    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
    NUM_DEVICES = torch.cuda.device_count()
    NUM_WORKERS = os.cpu_count()
    NUM_CLASSES = 4
    EPOCHS = 16
    BATCH_SIZE = (
        32 if torch.cuda.device_count() < 2 
        else (32 * torch.cuda.device_count())
    )
    LR = 0.001
    APPLY_SHUFFLE = True
    SEED = 768
    HEIGHT = 224
    WIDTH = 224
    CHANNELS = 3
    IMAGE_SIZE = (224, 224, 3)
    
    # Define paths
    DATASET_PATH = './data/original/'
    TRAIN_PATH = './data/original//Training/'
    TEST_PATH = './data/original/Testing/'
    
# Mute warnings
warnings.filterwarnings("ignore", "is_categorical_dtype")
warnings.filterwarnings("ignore", "use_inf_as_na")

In [54]:
train_images = glob.glob(f"{CFG.TRAIN_PATH}**/*.jpg")
test_images = glob.glob(f"{CFG.TEST_PATH}**/*.jpg")

In [55]:
def generate_labels(image_paths):
    return [(_.split('/')[-2:][0]).replace('-', '_') for _ in image_paths]


def build_df(image_paths, labels):
    df = pd.DataFrame({'image_path': image_paths, 'label': generate_labels(labels)})
    return df

def _load(image_path, as_tensor=True, target_size=(224,224)):
    # Read and decode an image file to a uint8 tensor
    image = Image.open(image_path)
    
    # Resize image to target size and convert to RGB
    image = image.resize(target_size)
    image = image.convert('RGB')
    
    if as_tensor:
        converter = transforms.Compose([transforms.ToTensor(), transforms.Grayscale()])
        return converter(image)
    else:
        return image

In [56]:
train_df = build_df(train_images, generate_labels(train_images))
test_df = build_df(test_images, generate_labels(test_images))

In [57]:
train_df_sample03 = train_df.sample(frac=0.3, random_state=42).reset_index(drop=True)
train_split_idx, val_split_idx, _, _ = (train_test_split(
    train_df_sample03.index,
    train_df_sample03.label,
    test_size=0.35, 
    stratify=train_df_sample03.label,
    random_state=CFG.SEED))

In [58]:
train_new_df = train_df_sample03.iloc[train_split_idx].reset_index(drop=True)
val_df = train_df_sample03.iloc[val_split_idx].reset_index(drop=True)
train_new_df.shape, val_df.shape

((1114, 2), (600, 2))

In [59]:
train_size = len(train_new_df)
val_size = len(val_df)
test_size = len(test_df)
total = train_size + val_size + test_size

# View the counts
print(f'train samples count:\t\t{train_size}\t({(100 * train_size/total):.2f}%)')
print(f'validation samples count:\t{val_size}\t({(100 * val_size/total):.2f}%)')
print(f'test samples count:\t\t{test_size}\t({(100 * test_size/total):.2f}%)')
print('================================================')
print(f'TOTAL:\t\t\t\t{total}\t({(100 * total/total):.2f}%)')

train samples count:		1114	(36.83%)
validation samples count:	600	(19.83%)
test samples count:		1311	(43.34%)
TOTAL:				3025	(100.00%)


In [60]:
# Data transoformation definitions
train_transforms = transforms.Compose([
    transforms.Resize((CFG.HEIGHT, CFG.WIDTH)),
    transforms.Grayscale(num_output_channels=3),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomVerticalFlip(p=0.5),
    transforms.ToTensor()
])
test_transforms = transforms.Compose([
    transforms.Resize((CFG.HEIGHT, CFG.WIDTH)),
    transforms.Grayscale(num_output_channels=3),
    transforms.ToTensor()
])
idx = random.sample(train_df_sample03.index.to_list(), 1)[0]
aug_image = _load(train_df_sample03.image_path[idx], as_tensor=False)

In [61]:
train_ds = BrainTumorMRIDataset(train_new_df, transform=train_transforms)
val_ds = BrainTumorMRIDataset(val_df, transform=test_transforms)
test_ds = BrainTumorMRIDataset(test_df, transform=test_transforms)

In [62]:
train_loader = DataLoader(
    dataset=train_ds, 
    batch_size=CFG.BATCH_SIZE,
    num_workers=CFG.NUM_WORKERS,
    shuffle=CFG.APPLY_SHUFFLE
)
val_loader = DataLoader(
    dataset=val_ds,
    batch_size=CFG.BATCH_SIZE,
    num_workers=CFG.NUM_WORKERS,
    shuffle=False
)
test_loader = DataLoader(
    dataset=test_ds,
    batch_size=CFG.BATCH_SIZE,
    num_workers=CFG.NUM_WORKERS,
    shuffle=False
)

## Defining and Training the EfficientNetV2 Model

In [63]:
class EfficientNetV2Model(nn.Module):
    def __init__(self, backbone_model, name='efficientnet-v2-large', 
                 num_classes=CFG.NUM_CLASSES, device=CFG.DEVICE):
        super(EfficientNetV2Model, self).__init__()
        
        self.backbone_model = backbone_model
        self.device = device
        self.num_classes = num_classes
        self.name = name
        
        classifier = nn.Sequential(
            nn.Flatten(),
            nn.Dropout(p=0.2, inplace=True), 
            nn.Linear(in_features=1280, out_features=256, bias=True),
            nn.GELU(),
            nn.Dropout(p=0.2, inplace=True),
            nn.Linear(in_features=256, out_features=num_classes, bias=False)
        ).to(device)
        
        self._set_classifier(classifier)
        
    def _set_classifier(self, classifier:nn.Module) -> None:
        self.backbone_model.classifier = classifier
    
    def forward(self, image):
        return self.backbone_model(image)

In [64]:
def get_efficientnetv2_model(
    device: torch.device=CFG.NUM_CLASSES) -> nn.Module:
    torch.manual_seed(CFG.SEED)
    torch.cuda.manual_seed(CFG.SEED)
    model_weights = (
        torchvision
        .models
        .EfficientNet_V2_L_Weights
        .DEFAULT
    )
    model = (torchvision.models.efficientnet_v2_l(weights=model_weights)).to(device) 
    for param in model.features.parameters():
        param.requires_grad = False
    return model

In [65]:
backbone_model = get_efficientnetv2_model(CFG.DEVICE)
efficientnetv2_params = {
    'backbone_model'    : backbone_model,
    'name'              : 'efficientnet-v2-large',
    'device'            : CFG.DEVICE
}
efficientnet_model = EfficientNetV2Model(**efficientnetv2_params)

## Extracting Features
This following method and definition is the extraction of features from the training data using the EfficientNet model. We will use train_features in our exploration of clustering and pca (disjoint and joint).

In [66]:
def extract_features(loader, model):
    model.eval()
    features = []
    with torch.no_grad():
        for images, _ in loader:
            images = images.to(CFG.DEVICE)
            outputs = model(images)
            features.extend(outputs.cpu().numpy())
    return np.array(features)

In [67]:
train_features = extract_features(train_loader, backbone_model)

## Clustering

In [68]:
def perform_clustering(features, method='kmeans'):
    if method == 'kmeans':
        kmeans = KMeans(n_clusters=CFG.NUM_CLASSES, random_state=42)
        clusters = kmeans.fit_predict(features)
    elif method == 'hierarchical':
        hierarchical = AgglomerativeClustering(n_clusters=CFG.NUM_CLASSES)
        clusters = hierarchical.fit_predict(features)
    elif method == 'dbscan':
        dbscan = DBSCAN(eps=0.5, min_samples=5)
        clusters = dbscan.fit_predict(features)
    else:
        raise ValueError("invalid clustering method. choose from 'kmeans', 'hierarchical', or 'dbscan'")
    return clusters

In [69]:
# Perform clustering on training features
train_clusters_kmeans = perform_clustering(train_features, method='kmeans')
train_clusters_hierarchical = perform_clustering(train_features, method='hierarchical')
train_clusters_dbscan = perform_clustering(train_features, method='dbscan')

In [70]:
# Incorporate clustering labels into the training dataset
train_new_df['cluster_kmeans'] = train_clusters_kmeans
train_new_df['cluster_hierarchical'] = train_clusters_hierarchical
train_new_df['cluster_dbscan'] = train_clusters_dbscan

In [75]:
print(train_new_df.columns)

Index(['image_path', 'label', 'cluster_kmeans', 'cluster_hierarchical',
       'cluster_dbscan'],
      dtype='object')


In [74]:
# Define new data loaders with clustering labels
train_ds_cluster_kmeans = BrainTumorMRIDataset(train_new_df, transform=train_transforms, clustering='cluster_kmeans')
train_ds_cluster_hierarchical = BrainTumorMRIDataset(train_new_df, transform=train_transforms, clustering='cluster_hierarchical')
train_ds_cluster_dbscan = BrainTumorMRIDataset(train_new_df, transform=train_transforms, clustering='cluster_dbscan')

TypeError: __init__() got an unexpected keyword argument 'clustering'

## PCA

## Training the Model

In [None]:
loss_fn = nn.CrossEntropyLoss(label_smoothing=0.1)
optimizer = torch.optim.AdamW(efficientnet_model.parameters(), lr=CFG.LR)

In [None]:
def execute_epoch(
    model:torch.nn.Module,
    dataloader:torch.utils.data.DataLoader,
    optimizer:torch.optim.Optimizer,
    loss_fn:torch.nn.Module,
    device:torch.device) -> Tuple[float, float]:
    
    model.train()
    train_loss, train_acc = 0, 0
    for batch, (X, y) in enumerate(tqdm(dataloader)):
        X, y = X.to(device), y.to(device)
        y_pred = model(X)
        loss = loss_fn(y_pred, y)
        train_loss += loss.item() 
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        predicted_class = torch.argmax(torch.softmax(y_pred, dim=1), dim=1)
        train_acc += (predicted_class == y).sum().item() / len(y_pred)
        
    train_loss = train_loss / len(dataloader)
    train_acc = train_acc / len(dataloader)
    return train_loss, train_acc

In [None]:
def evaluate(
    model:torch.nn.Module,
    dataloader:torch.utils.data.DataLoader,
    loss_fn:torch.nn.Module,
    device:torch.device) -> Tuple[float, float]:
    
    model.eval()
    eval_loss, eval_acc = 0, 0
    with torch.inference_mode():
        for batch, (X, y) in enumerate(dataloader):
            X, y = X.to(device), y.to(device)
            y_pred = model(X)
            loss = loss_fn(y_pred, y)
            eval_loss += loss.item()
            predicted_class = torch.argmax(torch.softmax(y_pred, dim=1), dim=1)
            eval_acc += (predicted_class == y).sum().item() / len(y_pred)

    eval_loss = eval_loss / len(dataloader)
    eval_acc = eval_acc / len(dataloader)
    return eval_loss, eval_acc

In [None]:
def train(
    model:torch.nn.Module,
    train_dataloader:torch.utils.data.DataLoader,
    eval_dataloader:torch.utils.data.DataLoader,
    optimizer:torch.optim.Optimizer,
    loss_fn:torch.nn.Module,
    epochs:int,
    device:torch.device) -> Dict[str, List]:
    
    session = {
        'loss'          : [],
        'accuracy'      : [],
        'eval_loss'     : [],
        'eval_accuaracy': []}
    
    for epoch in tqdm(range(epochs)):
        print(f'\nEpoch {epoch + 1}/{epochs}')
        train_loss, train_acc = execute_epoch(model, train_dataloader, optimizer, loss_fn, device)
        eval_loss, eval_acc = evaluate(model, eval_dataloader, loss_fn, device)
        print(f'loss: {train_loss:.4f} - acc: {train_acc:.4f} - eval_loss: {eval_loss:.4f} - eval_acc: {eval_acc:.4f}')
        session['loss'].append(train_loss)
        session['accuracy'].append(train_acc)
        session['eval_loss'].append(eval_loss)
        session['eval_accuaracy'].append(eval_acc)
    return session

In [None]:
print('Training EfficientNet Model')
print(f'Train on {len(train_new_df)} samples, validate on {len(val_df)} samples.')
print('----------------------------------')

efficientnet_session_config = {
    'model'               : efficientnet_model,
    'train_dataloader'    : train_loader,
    'eval_dataloader'     : val_loader,
    'optimizer'           : optimizer,
    'loss_fn'             : loss_fn,
    'epochs'              : CFG.EPOCHS,
    'device'              : CFG.DEVICE
}

efficientnet_session_history = train(**efficientnet_session_config)

In [None]:
def predict(
    model:nn.Module, 
    sample_loader:torch.utils.data.DataLoader,
    device:torch.device) -> np.ndarray:
    model.eval() 
    predictions = []
    with torch.inference_mode():
        for batch, (X, y) in enumerate(tqdm(sample_loader)):
            X, y = X.to(device), y.to(device)
            y_pred = model(X) 
            predicted_probs = torch.softmax(y_pred, dim=1).cpu().numpy()
            predictions.append(predicted_probs) 
    return np.vstack(predictions)