#### 1. Data Loading and Preprocessing

In [2]:
from config import *
from LoadDataset import CustomImageDataset
from torch.utils.data import DataLoader
from torchvision import transforms
import numpy as np
from config import dataset_test_path, dataset_train_path, dataset_val_path

# Define the transformations.
# Training keeps the random 224x224 crop as augmentation.
train_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.RandomCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Validation and test use a deterministic center crop, so repeated evaluation
# of the same image always sees the same pixels and metrics are reproducible.
eval_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Backward-compatible alias: `transform` still names the training pipeline.
transform = train_transform

# Create datasets (one split file each, all sharing the same image root folder)
train_dataset = CustomImageDataset(file_path=dataset_train_path, folder_path=dataset_root, transform=train_transform)
val_dataset = CustomImageDataset(file_path=dataset_val_path, folder_path=dataset_root, transform=eval_transform)
test_dataset = CustomImageDataset(file_path=dataset_test_path, folder_path=dataset_root, transform=eval_transform)

# Create data loaders; only the training split is shuffled
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

In [3]:
# Sample count of each split, followed by the combined total.
(
    len(train_dataset),
    len(val_dataset),
    len(test_dataset),
    "total:",
    sum(len(split) for split in (train_dataset, val_dataset, test_dataset)),
)

(674373, 207499, 155625, 'total:', 1037497)

#### 2. Model Definition

In [4]:
import torch
import torch.nn as nn
import torchvision.models as models

class ResNet50(nn.Module):
    """ResNet-50 wrapper whose final fully connected layer is replaced.

    The wrapped network is created with the ``IMAGENET1K_V1`` weights and its
    ``fc`` layer is swapped for a new ``nn.Linear`` with ``num_classes`` outputs.
    """

    def __init__(self, num_classes=14):
        super().__init__()
        # TODO: research if there are other weights to be set (like V2 if it exists)
        backbone = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
        # Keep the input width of the original head, change only the output width.
        backbone.fc = nn.Linear(backbone.fc.in_features, num_classes)
        self.model = backbone

    def forward(self, x):
        """Pass ``x`` through the wrapped network and return its output."""
        return self.model(x)

# Use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# One output per entry of class_names; the module is moved to the chosen device.
model = ResNet50(num_classes=len(class_names)).to(device)


#### 3. Prototype Selection

##### Procedure
1. Calculate cosine similarity $S_{i j}$ of the data points' deep features
2. Compute the density $\rho_i$ of each image by counting the number of images whose similarity to it exceeds the threshold $S_c$
3. Calculate how diverse and representative an image is based on its density and cosine similarity resulting in an overall score $\eta_i$
4. Select the highest scoring images as prototypes


##### Background Theory About Prototypes

Determine the **density** $\rho_i$ of each data point (the number of samples similar to sample $i$): samples in high-density regions are likely to have correct labels, making them good candidates for prototypes.
Identify similar data points ($i$ and $j$) using their **cosine similarity** $S_{ij}$ and a **threshold** value $S_c$. Here $\operatorname{sign}\left(S_{i j}-S_c\right)$ is used as an indicator that returns 1 if the threshold is exceeded (else 0):
$S_{i j}=\frac{\mathcal{G}\left(\mathbf{x}_i\right)^T \mathcal{G}\left(\mathbf{x}_j\right)}{\left\|\mathcal{G}\left(\mathbf{x}_i\right)\right\|_2\left\|\mathcal{G}\left(\mathbf{x}_j\right)\right\|_2}$, where $\left\|\mathcal{G}\left(\mathbf{x}_i\right)\right\|_2$ is the Euclidean norm (length) of the deep features of sample $i$.

$\rho_i=\sum_{j=1}^m \operatorname{sign}\left(S_{i j}-S_c\right)$

**select** diverse and representative prototypes with $\eta_i$, ensuring that the chosen prototypes cover a wide range of characteristics within a class, which improves the accuracy and robustness of the label correction process

If the density of a data point $i$ is **smaller** than the maximum density ($\rho_i<\rho_{\max }$), we look at the **maximum** **similarity** between $i$ and any denser data point $j$ to identify how well $i$ is connected to a denser and more reliable region of the feature space. This $\eta_i$ indicates whether the data sample is a **representative** prototype.

if the data point $i$ has the **highest density**, we need to check that the prototypes are **not too similar**. Thus, we take the smallest similarity value to select prototypes that are **spread out** in the feature space to avoid redundancy and ensure **diversity**.

$\eta_i= \begin{cases}\max_{j, \rho_j>\rho_i} S_{i j}, & \rho_i<\rho_{\max } \\ \min_j S_{i j}, & \rho_i=\rho_{\max }\end{cases}$

In [5]:
def select_prototypes(features, labels, num_prototypes=num_prototypes, similarity_threshold=similarity_threshold, eta_threshold=0.95):
    """Select up to ``num_prototypes`` prototype feature vectors per class.

    For every distinct label the pairwise cosine similarity of that class's
    features is computed, then

    * ``rho_i`` (density) = number of class members whose similarity to ``i``
      is above ``similarity_threshold``;
    * ``eta_i`` = the largest similarity to any strictly denser sample, or,
      for samples of maximum density, the smallest similarity to any sample.

    Candidates are the samples with ``eta_i < eta_threshold``; among them the
    ones with the highest density are kept.

    Args:
        features: 2-D array-like, one feature vector per row.
        labels: 1-D array-like with one label per row of ``features``.
        num_prototypes: maximum number of prototypes returned per class.
        similarity_threshold: cosine-similarity cutoff used for the density.
        eta_threshold: upper bound on ``eta_i`` for a sample to be a candidate
            (previously the hard-coded value 0.95).

    Returns:
        dict mapping each label to an array holding the selected (unnormalised)
        feature rows; a class may get fewer than ``num_prototypes`` rows, or
        none, when too few samples satisfy the ``eta`` condition.

    Note:
        A full ``n_c x n_c`` similarity matrix is built for every class, so
        memory grows quadratically with the class size.
    """
    features = np.asarray(features)
    labels = np.asarray(labels)

    prototypes = {}
    for label in np.unique(labels):  # Loop through each unique label
        class_features = features[labels == label]  # Features of the current label

        # Cosine similarity matrix. Zero-length vectors are left as zeros
        # instead of being divided by zero (which would spread NaN everywhere).
        norms = np.linalg.norm(class_features, axis=1, keepdims=True)
        norm_class_features = class_features / np.where(norms > 0, norms, 1)
        similarity_matrix = np.matmul(norm_class_features, norm_class_features.T)

        # Densities (rho) based on the similarity threshold
        densities = np.sum(similarity_matrix > similarity_threshold, axis=1)
        max_density = np.max(densities)

        # eta balances representativeness and diversity
        eta = np.empty(len(densities), dtype=float)
        for i in range(len(densities)):
            if densities[i] < max_density:
                eta[i] = np.max(similarity_matrix[i][densities > densities[i]])
            else:
                # Minimum over ALL samples. Restricting this to samples of equal
                # density would give a unique densest sample eta = S_ii = 1 and
                # wrongly exclude it from the candidates.
                eta[i] = np.min(similarity_matrix[i])

        # Highest-density candidates first; the stable sort keeps the original
        # index order among samples of equal density.
        candidate_indices = np.flatnonzero(eta < eta_threshold)
        order = np.argsort(-densities[candidate_indices], kind="stable")
        selected_prototypes = candidate_indices[order[:num_prototypes]]

        prototypes[label] = class_features[selected_prototypes]
    return prototypes


#### 4. Label Correction

##### Procedure
1. Determine the highest average similarity score $\sigma_c$ for each image based on the deep features of the class prototypes 
2. Assign the corresponding prototype’s label as the corrected, new pseudo label $\hat{y}$
3. Start new iteration (with new training and label correction)

##### Background Theory About Label Correction
Determine the **similarity** of a data instance to the prototypes of each class $c$ with the average similarity score $\sigma_c$: $\sigma_c=\frac{1}{p} \sum_{l=1}^p \cos \left(\mathcal{G}(\mathbf{x}), \mathcal{G}\left(\mathbf{x}_{c l}\right)\right), c=1 \ldots K$

($p$ := #prototypes for each class) 

($\cos(\cdot, \cdot)$ := cosine similarity between the deep features of the sample $\mathbf{x}$ and the prototype $\mathbf{x}_{c l}$)

select the class $c$ that has the highest average similarity score $\sigma_c$: 
$\hat{y}=\operatorname{argmax}_c \sigma_c, c=1 \ldots K$

In [6]:
def correct_labels(features, prototypes):
    """Assign each feature vector the class whose prototypes it resembles most.

    For every class the score is the mean cosine similarity between the
    feature vector and that class's prototypes; the class with the highest
    score becomes the corrected label. Ties go to the class that comes first
    in ``prototypes``.

    Args:
        features: 2-D array-like, one feature vector per row.
        prototypes: dict mapping a class label to a 2-D array-like of prototype
            feature vectors. Classes without any prototype are ignored.

    Returns:
        np.ndarray with one corrected label per row of ``features``. Every
        entry is ``-1`` if no class has a prototype.
    """
    def _unit_rows(matrix):
        # Scale rows to unit length; all-zero rows stay zero (no division by 0).
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        return matrix / np.where(norms > 0, norms, 1.0)

    features = np.asarray(features, dtype=float)
    num_samples = len(features)

    class_labels = []
    class_directions = []
    for label, proto_features in prototypes.items():
        proto_features = np.asarray(proto_features, dtype=float)
        if proto_features.size == 0:
            continue  # nothing to compare against for this class
        proto_features = proto_features.reshape(len(proto_features), -1)
        class_labels.append(label)
        # The mean of the cosines to the prototypes equals the dot product with
        # the mean of the unit-length prototypes, so one vector per class suffices.
        class_directions.append(_unit_rows(proto_features).mean(axis=0))

    if num_samples == 0 or not class_labels:
        return np.full(num_samples, -1)

    # (num_samples, num_classes) matrix of average cosine similarities
    scores = _unit_rows(features) @ np.stack(class_directions).T
    # argmax returns the first maximum, so ties go to the earlier class
    return np.asarray(class_labels)[np.argmax(scores, axis=1)]


#### 5. Training Loop with Iterative Self-Learning

##### Loss Function
1st iteration: cross-entropy loss to quantify how well the model's predictions match the true labels: $\mathcal{L}(\mathcal{F}(\theta, \mathbf{x}), y)=-\frac{1}{n} \sum_{i=1}^n \log \left(\mathcal{F}\left(\theta, \mathbf{x}_i\right)_{y_i}\right)$

we use this because it is a normalized measure of performance that is independent of the batch size or dataset size (unlike the total loss) → easier to interpret and compare the model's effectiveness

In the **subsequent iterations** we need to **combine the losses** of the original noisy $y$ and corrected labels $\hat{y}$ with the weight factor $\alpha$ for training: 
$\mathcal{L}_{\text {total }}=(1-\alpha) \mathcal{L}(\mathcal{F}(\theta, \mathbf{x}), y)+\alpha \mathcal{L}(\mathcal{F}(\theta, \mathbf{x}), \hat{y})$

##### Parameters

1st iteration: We are looking for the **optimal parameters** $\theta^*$ (weights and biases) by **minimizing the loss function** (truth value $Y$ and prediction $\mathcal{F}(\theta, \mathbf{X})$): $\theta^*=\operatorname{argmin}_\theta \mathcal{L}(Y, \mathcal{F}(\theta, \mathbf{X}))$

In the **subsequent iterations** **update** the optimal parameters because of the corrected labels $\hat{Y}$ using the prototypes $\mathbf{X}_s$: $\theta^*=\operatorname{argmin}_\theta \mathcal{L}\left(Y, \hat{Y}\left(\mathbf{X}, \mathbf{X}_s\right), \mathcal{F}(\theta, \mathbf{X})\right)$

In [7]:
import time
from torch.optim import SGD
from torch.optim.lr_scheduler import StepLR
from torch.utils.tensorboard import SummaryWriter
import torch.nn as nn
import numpy as np
from tqdm import tqdm

# Module-level TensorBoard writer: train_model logs the mean training loss of
# each epoch to it under the 'Loss/train' tag and closes it when training ends.
writer = SummaryWriter()

def train_model(model, train_loader, num_epochs=num_epochs, learning_rate=lr, momentum=momentum, weight_decay=weight_decay, step_size=step_size, gamma=gamma, alpha=alpha, num_prototypes=num_prototypes, device=device):
    """Train ``model`` with iterative prototype-based label correction.

    Until prototypes exist (the first epoch) the loss is plain cross-entropy
    against the given labels. Afterwards each batch also receives corrected
    labels from ``correct_labels`` and the loss becomes
    ``(1 - alpha) * CE(outputs, labels) + alpha * CE(outputs, corrected)``.
    After every fifth epoch (epoch index 0, 5, 10, ...) the model outputs for
    the whole ``train_loader`` are collected and the prototypes are re-selected
    with ``select_prototypes``.

    Args:
        model: network to optimise; called as ``model(inputs)``.
        train_loader: iterable of ``(inputs, labels)`` batches supporting ``len``.
        num_epochs: number of passes over ``train_loader``.
        learning_rate, momentum, weight_decay: passed to ``SGD``.
        step_size, gamma: passed to ``StepLR``.
        alpha: weight of the corrected-label loss term.
        num_prototypes: prototypes per class passed to ``select_prototypes``.
        device: device the batches are moved to.

    Returns:
        None. The mean epoch loss is logged to the module-level ``writer``
        (tag ``'Loss/train'``) and printed; ``writer`` is closed at the end.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = SGD(model.parameters(), lr=learning_rate, momentum=momentum, weight_decay=weight_decay)
    scheduler = StepLR(optimizer, step_size=step_size, gamma=gamma)

    prototypes = None  # set after the first feature-extraction pass
    total_start_time = time.time()

    for epoch in range(num_epochs):
        epoch_start_time = time.time()
        model.train()
        running_loss = 0.0

        # Initialize progress bar for the epoch
        progress_bar = tqdm(train_loader, total=len(train_loader), desc=f'Epoch {epoch+1}/{num_epochs}', ncols=100)

        for inputs, labels in progress_bar:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            if prototypes is not None:  # Subsequent iterations
                # Correct the labels of THIS batch only. Correcting the features
                # of the whole dataset here would give a target whose length
                # differs from the batch and would repeat that work every step.
                batch_features = outputs.detach().cpu().numpy()
                corrected = torch.as_tensor(
                    correct_labels(batch_features, prototypes),
                    dtype=torch.long,
                    device=device,
                )
                # A negative value means "no corrected label available";
                # fall back to the given label for those samples.
                corrected = torch.where(corrected >= 0, corrected, labels)
                loss = (1 - alpha) * loss + alpha * criterion(outputs, corrected)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

            # Update progress bar with batch loss
            progress_bar.set_postfix(batch_loss=loss.item())

        avg_loss = running_loss / len(train_loader)
        writer.add_scalar('Loss/train', avg_loss, epoch)

        if epoch % 5 == 0:
            # Refresh the prototypes from the current state of the model.
            # NOTE(review): the model's final outputs serve as the "features"
            # here, not an intermediate layer - confirm this is intended.
            model.eval()
            feature_chunks = []
            label_chunks = []
            with torch.no_grad():
                for batch_inputs, batch_labels in train_loader:
                    batch_inputs = batch_inputs.to(device)
                    feature_chunks.append(model(batch_inputs).cpu().numpy())
                    label_chunks.append(batch_labels.cpu().numpy())

            all_features = np.concatenate(feature_chunks)
            all_labels = np.concatenate(label_chunks)

            prototypes = select_prototypes(all_features, all_labels, num_prototypes=num_prototypes, similarity_threshold=similarity_threshold)

        scheduler.step()

        epoch_duration = time.time() - epoch_start_time
        total_elapsed_time = time.time() - total_start_time
        # Remaining time = remaining epochs * mean duration of the finished ones
        estimated_remaining_time = (num_epochs - epoch - 1) * (total_elapsed_time / (epoch + 1))

        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}, Time per Epoch: {epoch_duration:.2f}s, Estimated Time Remaining: {estimated_remaining_time:.2f}s')

    writer.close()


2024-07-28 16:59:20.718151: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [8]:
# Start training with train_model's default hyper-parameters.
train_model(model=model, train_loader=train_loader)

Epoch 1/15:   0%|                                                          | 0/5269 [00:17<?, ?it/s]


KeyboardInterrupt: 