## Load the Data

Mount Google Drive to access data and other repo files

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


Set the seed for reproducability

In [None]:
import torch
import numpy as np
import random

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)  # if you are using multi-GPU.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)

Delete redundant sample data from Google Colab's session

In [None]:
import shutil
import os

def delete_folder(folder_path):
    if os.path.exists(folder_path):
        shutil.rmtree(folder_path)
        print(f"Folder '{folder_path}' has been deleted.")
    else:
        print(f"Folder '{folder_path}' does not exist.")

delete_folder('sample_data')


Folder 'sample_data' has been deleted.


Clone the repository to access the other relevant files

In [None]:
# clone the repo
!git clone https://github.com/lgiesen/Deep-Self-Learning-From-Noisy-Labels.git

# go to directory
%cd Deep-Self-Learning-From-Noisy-Labels

Cloning into 'Deep-Self-Learning-From-Noisy-Labels'...
remote: Enumerating objects: 128, done.[K
remote: Counting objects: 100% (128/128), done.[K
remote: Compressing objects: 100% (82/82), done.[K
remote: Total 128 (delta 70), reused 100 (delta 46), pack-reused 0[K
Receiving objects: 100% (128/128), 4.20 MiB | 18.52 MiB/s, done.
Resolving deltas: 100% (70/70), done.
/content/Deep-Self-Learning-From-Noisy-Labels


Define the dataset

In [2]:
from config import batch_size, dataset_test_path, dataset_train_path, dataset_val_path
from LoadDataset import CustomImageDataset
from torch.utils.data import DataLoader
from torchvision import transforms

# Define the transformations
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.RandomCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    # "These exact values are used for normalizing data that has been pre-trained
    # on the ImageNet dataset. They are based on the statistics of the ImageNet
    # dataset, which consists of a large number of natural images."
    # https://moiseevigor.github.io/software/2022/12/18/one-pager-training-resnet-on-imagenet/

])

# Create datasets
train_dataset = CustomImageDataset(file_path=dataset_train_path, transform=transform)
val_dataset = CustomImageDataset(file_path=dataset_val_path, transform=transform)
test_dataset = CustomImageDataset(file_path=dataset_test_path, transform=transform)

# Create data loaders
# pinned memory can significantly speed up the transfer of data between the host and the device (GPU) because the GPU can directly access it
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Prepare dataloaders dictionary
dataloaders = {
    'train': train_loader,
    'val': val_loader
}

dataset_sizes = {'train': len(train_dataset), 'val': len(val_dataset)}
print(dataset_sizes)
%cd ../../

{'train': 674373, 'val': 207499}
/Users/leori/Code


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


Extract the image files

In [None]:
%%time
import tarfile
import os
from config import shared_folder_path, dataset_img

# Function to extract and process files
def extract_and_process(tar_file_path, extract_to):
    with tarfile.open(tar_file_path, 'r') as tar_ref:
        tar_ref.extractall(extract_to)
        print(f"Extracted {tar_file_path} to {extract_to}")

parallel_extraction = True

from concurrent.futures import ThreadPoolExecutor

# Function to extract and process files
def extract_and_process(tar_file_path, extract_to):
    with tarfile.open(tar_file_path, 'r') as tar_ref:
        tar_ref.extractall(extract_to)
        print(f"Extracted {tar_file_path} to {extract_to}")

# Create the extraction directory if it doesn't exist
os.makedirs(dataset_img, exist_ok=True)

# List of tar files to extract
tar_files = [os.path.join(shared_folder_path, f"{i}.tar") for i in range(10)]

# Function to handle extraction in parallel
def extract_tar_files_parallel(tar_files, extract_to):
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(extract_and_process, tar_file, extract_to) for tar_file in tar_files if os.path.exists(tar_file)]
        for future in futures:
            try:
                future.result()  # Wait for the result to ensure any exceptions are raised
            except Exception as e:
                print(f"An error occurred: {e}")

# Extract tar files in parallel
extract_tar_files_parallel(tar_files, dataset_img)
print("The extracted tar files should result in the folders 0 to 9:")
!ls "{dataset_img}"

Extracted /content/drive/MyDrive/Colab_Notebooks/Deep_Self_Learning_From_Noisy_Labels/images/1.tar to ../drive/MyDrive/Colab_Notebooks/Deep_Self_Learning_From_Noisy_Labels/extracted_images/
Extracted /content/drive/MyDrive/Colab_Notebooks/Deep_Self_Learning_From_Noisy_Labels/images/3.tar to ../drive/MyDrive/Colab_Notebooks/Deep_Self_Learning_From_Noisy_Labels/extracted_images/
Extracted /content/drive/MyDrive/Colab_Notebooks/Deep_Self_Learning_From_Noisy_Labels/images/4.tar to ../drive/MyDrive/Colab_Notebooks/Deep_Self_Learning_From_Noisy_Labels/extracted_images/
Extracted /content/drive/MyDrive/Colab_Notebooks/Deep_Self_Learning_From_Noisy_Labels/images/0.tar to ../drive/MyDrive/Colab_Notebooks/Deep_Self_Learning_From_Noisy_Labels/extracted_images/
Extracted /content/drive/MyDrive/Colab_Notebooks/Deep_Self_Learning_From_Noisy_Labels/images/2.tar to ../drive/MyDrive/Colab_Notebooks/Deep_Self_Learning_From_Noisy_Labels/extracted_images/
Extracted /content/drive/MyDrive/Colab_Notebooks/D

## Training

In [None]:
import torch
from torchvision import models
from config import lr, momentum, weight_decay, gamma, step_size
from sklearn.utils.class_weight import compute_class_weight

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"device: {device}")

# Initialize the model
#model = models.resnet50(pretrained=True)
model = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)

# Define a hook function to capture the output before the FC layer
def hook_function(module, input, output):
    global features
    features = output

# Register the hook to the layer before the FC layer (AdaptiveAvgPool2d)
hook = model.avgpool.register_forward_hook(hook_function)

# Parallelize training across multiple GPUs
model = torch.nn.DataParallel(model).to(device)

# Assuming y_train contains the labels for the training dataset
labels = [label for _, label in train_dataset]
class_weights = compute_class_weight('balanced', classes=np.unique(labels), y=labels)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)
del labels

# Define the loss function and optimizer
criterion = torch.nn.CrossEntropyLoss(weight=class_weights)
optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=momentum, weight_decay=weight_decay)

# Initialize the learning rate scheduler: Decay LR by a factor of 0.1 every 5 epochs
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)

device: cuda:0




In [None]:

########## Test the hook
# Define the input image with the shape [1, 3, 224, 224]
input_image = torch.randn(1, 3, 224, 224)

# Pass the input image through the model
output = model(input_image)

# Unregister the hook
hook.remove()

# Print the captured features
print(features.shape)
print(features)

# The shape of `features` should be [1, 2048, 1, 1] before it is flattened by the FC layer


In [None]:
model.summary()

In [None]:
# Function to calculate accuracy
def calculate_accuracy(outputs, labels):
    _, predicted = outputs.max(1)
    correct = predicted.eq(labels).sum().item()
    return correct

# Function to evaluate the model
def evaluate_model(loader, model, criterion, device):
    model.eval()  # Set model to evaluation mode
    running_loss = 0.0
    correct_predictions = 0
    total_samples = 0

    with torch.no_grad():  # No need to calculate gradients during evaluation
        for inputs, labels in loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            running_loss += loss.item()
            correct_predictions += calculate_accuracy(outputs, labels)
            total_samples += labels.size(0)

    avg_loss = running_loss / len(loader)
    accuracy = correct_predictions / total_samples
    return avg_loss, accuracy

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Function to select prototypes based on density and similarity
def select_prototypes(features, labels, num_prototypes, similarity_threshold):
    similarity_matrix = cosine_similarity(features)
    densities = (similarity_matrix > similarity_threshold).sum(axis=1)
    prototypes = []
    for c in np.unique(labels):
        class_indices = np.where(labels == c)[0]
        class_densities = densities[class_indices]
        sorted_indices = class_indices[np.argsort(-class_densities)]
        prototypes.append(sorted_indices[:num_prototypes])
    return np.concatenate(prototypes)

# Function to correct labels based on prototypes
def correct_labels(features, prototypes, prototype_labels):
    corrected_labels = []
    for feature in features:
        similarities = cosine_similarity([feature], prototypes)[0]
        class_similarities = np.zeros(np.max(prototype_labels) + 1)
        for j, proto_label in enumerate(prototype_labels):
            class_similarities[proto_label] += similarities[j]
        corrected_labels.append(np.argmax(class_similarities))
    return np.array(corrected_labels)

In [None]:
import time
from config import num_epochs, dataset_root, similarity_threshold, num_prototypes
from torch.utils.tensorboard import SummaryWriter

# Initialize TensorBoard writer
writer_path = dataset_root.replace("..", "/content") + 'runs/resnet50_experiment'
writer = SummaryWriter(writer_path)

# 1. Train the Model
for epoch in range(num_epochs):
    model.train()  # Set model to training mode
    epoch_start_time = time.time()  # Start time for the epoch

    running_loss = 0.0
    correct_predictions = 0
    total_samples = 0

    for inputs, labels in train_loader:
        # Move input and label tensors to the device
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Zero out the optimizer
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        correct_predictions += calculate_accuracy(outputs, labels)
        total_samples += labels.size(0)

    epoch_duration = time.time() - epoch_start_time  # End time for the epoch
    avg_loss = running_loss / len(train_loader)  # Average loss for the epoch
    accuracy = correct_predictions / total_samples  # Accuracy for the epoch

    # Log the training loss, accuracy, and duration to TensorBoard
    writer.add_scalar('Loss/train', avg_loss, epoch)
    writer.add_scalar('Accuracy/train', accuracy, epoch)
    writer.add_scalar('Time/train', epoch_duration, epoch)

    # Validate the model
    val_loss, val_accuracy = evaluate_model(val_loader, model, criterion, device)
    writer.add_scalar('Loss/val', val_loss, epoch)
    writer.add_scalar('Accuracy/val', val_accuracy, epoch)

    # Print the loss, accuracy, and time for every epoch
    print(f'Epoch {epoch+1}/{num_epochs}, '
          f'Train Loss: {avg_loss:.4f}, Train Accuracy: {accuracy:.4f}, '
          f'Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}, '
          f'Time: {epoch_duration:.2f} sec')

    # Step the scheduler
    scheduler.step()

    # 2. Label Correction Phase (except in last iteration)
    if epoch != num_epochs-1:
        # 2.1 Prototype Selection
        # sample m=1280 images for each class
        # extract features with hook
        # calculate similarity of features
        # calculate density of features
        # select 8 prototypes for each class
        # get right labels 



        all_features = []
        all_labels = []
        with torch.no_grad():
            for images, labels in train_loader:
                features = feature_extractor(images)
                all_features.append(features)
                all_labels.append(labels)
        
        all_features = torch.cat(all_features).cpu().numpy()
        all_labels = torch.cat(all_labels).cpu().numpy()
        prototypes_indices = select_prototypes(all_features, all_labels, num_prototypes, similarity_threshold)
        prototypes = all_features[prototypes_indices]

        # Update dataset labels
        train_loader.dataset.labels = corrected_labels
        # 2.2 label correction
        prototype_labels = all_labels[prototypes_indices]
        corrected_labels = correct_labels(all_features, prototypes, prototype_labels)

        # TODO ADJUST ALL LABELS train_dataset (CHECK IF TRAIN LOADER IS ALSO ADJUSTED) OR UPDATE NEXT (COUPLE OF?) TRAINING TRAIN LOADERS

print(f'Finished Training, Final Train Loss: {avg_loss:.4f}, Final Train Accuracy: {accuracy:.4f}')

# Test the model
test_loss, test_accuracy = evaluate_model(test_loader, model, criterion, device)
writer.add_scalar('Loss/test', test_loss, 0)
writer.add_scalar('Accuracy/test', test_accuracy, 0)
print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}')

# Close the TensorBoard writer
writer.close()

Epoch 1/15, Train Loss: 0.7485, Train Accuracy: 0.7680, Val Loss: 0.6389, Val Accuracy: 0.7987, Time: 1951.03 sec


In [None]:
from google.colab import files

files.download(writer_path)

In [None]:
# Save the trained model
model_path = f'{dataset_root}models/resnet50_clothing1m.pth'
torch.save(model.state_dict(), model_path)
print("Model saved.")

In [None]:
from google.colab import files

files.download(model_path)

In [None]:
import os

if os.path.isfile(model_path):
    print("The file exists.")
else:
    print("The file does not exist.")

#### Evaluation

In [None]:
import torch
from torchvision import models
from torchviz import make_dot

# Define the model (assuming you have already defined and loaded it as before)
weights = models.ResNet50_Weights.IMAGENET1K_V1
model = models.resnet50(weights=weights)
num_ftrs = model.fc.in_features
model.fc = torch.nn.Linear(num_ftrs, len(class_names))

# Create a dummy input tensor with the same shape as your input data
dummy_input = torch.randn(1, 3, 224, 224)

# Perform a forward pass using the dummy input
output = model(dummy_input)

# Visualize the model
dot = make_dot(output, params=dict(model.named_parameters()))
dot.format = 'png'
dot.render(f'{dataset_root}models/resnet50_model')

# Display the model graph
from IPython.display import Image
Image(f'{dataset_root}models/resnet50_model.png')

Evaluate models on validation data with accuracy, precision, recall, F1 score

In [None]:
import torch
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the model (assuming it's saved as a PyTorch model)
model = torch.load(model_path)
model.eval()  # Set the model to evaluation mode

# Define a function to evaluate the model
def evaluate_model(loader):
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for images, labels in loader:
            outputs = model(images)
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average='weighted')
    recall = recall_score(all_labels, all_preds, average='weighted')
    f1 = f1_score(all_labels, all_preds, average='weighted')

    return accuracy, precision, recall, f1

# Assuming val_loader and test_loader are defined and contain the data
val_accuracy, val_precision, val_recall, val_f1 = evaluate_model(val_loader)
test_accuracy, test_precision, test_recall, test_f1 = evaluate_model(test_loader)

# Create a DataFrame to store the evaluation results
evaluation_results = pd.DataFrame({
    'Dataset': ['Validation', 'Test'],
    'Accuracy': [val_accuracy, test_accuracy],
    'Precision': [val_precision, test_precision],
    'Recall': [val_recall, test_recall],
    'F1 Score': [val_f1, test_f1]
})

# Export the evaluation results to a CSV file
evaluation_results.to_csv(f'{dataset_root}models/model_evaluation_results.csv', index=False)

print("Model evaluation completed and results exported.")


# To Do

- handle class imbalance (oversampling, undersampling, or class weighting to balance the dataset)
optional
- Hyperparameter tuning to verify statements in the paper