## Load the Data

Mount Google Drive to access data and other repo files

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


Set the seed for reproducability

In [2]:
import torch
import numpy as np
import random

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)  # if you are using multi-GPU.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)

Delete redundant sample data from Google Colab's session

In [3]:
import shutil
import os

def delete_folder(folder_path):
    if os.path.exists(folder_path):
        shutil.rmtree(folder_path)
        print(f"Folder '{folder_path}' has been deleted.")
    else:
        print(f"Folder '{folder_path}' does not exist.")

delete_folder('sample_data')


Folder 'sample_data' has been deleted.


Clone the repository to access the other relevant files

In [4]:
# clone the repo
!git clone https://github.com/lgiesen/Deep-Self-Learning-From-Noisy-Labels.git

# go to directory
%cd Deep-Self-Learning-From-Noisy-Labels

Cloning into 'Deep-Self-Learning-From-Noisy-Labels'...
remote: Enumerating objects: 164, done.[K
remote: Counting objects: 100% (164/164), done.[K
remote: Compressing objects: 100% (106/106), done.[K
remote: Total 164 (delta 91), reused 127 (delta 58), pack-reused 0[K
Receiving objects: 100% (164/164), 4.22 MiB | 8.56 MiB/s, done.
Resolving deltas: 100% (91/91), done.
/content/Deep-Self-Learning-From-Noisy-Labels


Define the dataset

In [5]:
from config import batch_size, dataset_test_path, dataset_train_path, dataset_val_path
from LoadDataset import CustomImageDataset
from torch.utils.data import DataLoader
from torchvision import transforms

# Define the transformations
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.RandomCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    # "These exact values are used for normalizing data that has been pre-trained
    # on the ImageNet dataset. They are based on the statistics of the ImageNet
    # dataset, which consists of a large number of natural images."
    # https://moiseevigor.github.io/software/2022/12/18/one-pager-training-resnet-on-imagenet/

])

# Create datasets
train_dataset = CustomImageDataset(file_path=dataset_train_path, transform=transform)
val_dataset = CustomImageDataset(file_path=dataset_val_path, transform=transform)
test_dataset = CustomImageDataset(file_path=dataset_test_path, transform=transform)

# Create data loaders
# pinned memory can significantly speed up the transfer of data between the host and the device (GPU) because the GPU can directly access it
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Prepare dataloaders dictionary
dataloaders = {
    'train': train_loader,
    'val': val_loader
}

dataset_sizes = {'train': len(train_dataset), 'val': len(val_dataset)}
print(dataset_sizes)
%cd ../../

{'train': 674373, 'val': 207499}
/


Extract the image files

In [6]:
%%time
import tarfile
import os
from config import shared_folder_path, dataset_img

# Function to extract and process files
def extract_and_process(tar_file_path, extract_to):
    with tarfile.open(tar_file_path, 'r') as tar_ref:
        tar_ref.extractall(extract_to)
        print(f"Extracted {tar_file_path} to {extract_to}")

parallel_extraction = True

from concurrent.futures import ThreadPoolExecutor

# Function to extract and process files
def extract_and_process(tar_file_path, extract_to):
    with tarfile.open(tar_file_path, 'r') as tar_ref:
        tar_ref.extractall(extract_to)
        print(f"Extracted {tar_file_path} to {extract_to}")

# Create the extraction directory if it doesn't exist
os.makedirs(dataset_img, exist_ok=True)

# List of tar files to extract
tar_files = [os.path.join(shared_folder_path, f"{i}.tar") for i in range(10)]

# Function to handle extraction in parallel
def extract_tar_files_parallel(tar_files, extract_to):
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(extract_and_process, tar_file, extract_to) for tar_file in tar_files if os.path.exists(tar_file)]
        for future in futures:
            try:
                future.result()  # Wait for the result to ensure any exceptions are raised
            except Exception as e:
                print(f"An error occurred: {e}")

# Extract tar files in parallel
extract_tar_files_parallel(tar_files, dataset_img)
print("The extracted tar files should result in the folders 0 to 9:")
!ls "{dataset_img}"

Extracted /content/drive/MyDrive/Colab_Notebooks/Deep_Self_Learning_From_Noisy_Labels/images/0.tar to ../drive/MyDrive/Colab_Notebooks/Deep_Self_Learning_From_Noisy_Labels/extracted_images/
Extracted /content/drive/MyDrive/Colab_Notebooks/Deep_Self_Learning_From_Noisy_Labels/images/5.tar to ../drive/MyDrive/Colab_Notebooks/Deep_Self_Learning_From_Noisy_Labels/extracted_images/
Extracted /content/drive/MyDrive/Colab_Notebooks/Deep_Self_Learning_From_Noisy_Labels/images/9.tar to ../drive/MyDrive/Colab_Notebooks/Deep_Self_Learning_From_Noisy_Labels/extracted_images/
Extracted /content/drive/MyDrive/Colab_Notebooks/Deep_Self_Learning_From_Noisy_Labels/images/4.tar to ../drive/MyDrive/Colab_Notebooks/Deep_Self_Learning_From_Noisy_Labels/extracted_images/
Extracted /content/drive/MyDrive/Colab_Notebooks/Deep_Self_Learning_From_Noisy_Labels/images/2.tar to ../drive/MyDrive/Colab_Notebooks/Deep_Self_Learning_From_Noisy_Labels/extracted_images/
Extracted /content/drive/MyDrive/Colab_Notebooks/D

## Training

In [7]:
import torch
from torchvision import models
from config import lr, momentum, weight_decay, gamma, step_size, dataset, num_classes
from sklearn.utils.class_weight import compute_class_weight
import pandas as pd
import numpy as np
import torch
from collections import Counter

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"device: {device}")

# Initialize the model
#model = models.resnet50(pretrained=True)
model = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
# Modify the final fully connected layer to output 14 classes
num_ftrs = model.fc.in_features
model.fc = torch.nn.Linear(num_ftrs, num_classes)
# Parallelize training across multiple GPUs
model = torch.nn.DataParallel(model)

# Calculate the balanced class weights because of an imbalanced dataset
# Read the data again for higher efficiency
data = pd.read_csv(dataset.replace("../","/content/"), header=None, sep=' ', usecols=[1], names=['label'])
# Convert the labels to a numpy array
labels = data['label'].values
# Compute class weights
class_weights = compute_class_weight('balanced', classes=np.unique(labels), y=labels)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)
del labels, data

# Define the loss function and optimizer
criterion = torch.nn.CrossEntropyLoss(weight=class_weights)
optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=momentum, weight_decay=weight_decay)

# Initialize the learning rate scheduler: Decay LR by a factor of 0.1 every 5 epochs
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)

model = model.to(device)

device: cuda:0


Downloading: "https://download.pytorch.org/models/resnet50-11ad3fa6.pth" to /root/.cache/torch/hub/checkpoints/resnet50-11ad3fa6.pth
100%|██████████| 97.8M/97.8M [00:01<00:00, 74.1MB/s]


In [8]:
# Function to calculate accuracy
def calculate_accuracy(outputs, labels):
    _, predicted = outputs.max(1)
    correct = predicted.eq(labels).sum().item()
    return correct

# Function to evaluate the model
def evaluate_model(loader, model, criterion, device):
    model.eval()  # Set model to evaluation mode
    running_loss = 0.0
    correct_predictions = 0
    total_samples = 0

    with torch.no_grad():  # No need to calculate gradients during evaluation
        for inputs, labels in loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            running_loss += loss.item()
            correct_predictions += calculate_accuracy(outputs, labels)
            total_samples += labels.size(0)

    avg_loss = running_loss / len(loader)
    accuracy = correct_predictions / total_samples
    return avg_loss, accuracy

In [9]:
import time
from config import num_epochs, dataset_root, checkpoint_path
from torch.utils.tensorboard import SummaryWriter

# Initialize TensorBoard writer
writer_path = dataset_root.replace("..", "/content") + 'runs/resnet50_experiment'
writer = SummaryWriter(writer_path)

# Early stopping parameters
patience = 5
best_val_loss = float('inf')
epochs_no_improve = 0
early_stop = False

# Train the model
for epoch in range(num_epochs):
    if early_stop:
        break

    model.train()  # Set model to training mode
    epoch_start_time = time.time()  # Start time for the epoch

    running_loss = 0.0
    correct_predictions = 0
    total_samples = 0

    for inputs, labels in train_loader:
        # Move input and label tensors to the device
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Zero out the optimizer
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        correct_predictions += calculate_accuracy(outputs, labels)
        total_samples += labels.size(0)

    epoch_duration = time.time() - epoch_start_time  # End time for the epoch
    avg_loss = running_loss / len(train_loader)  # Average loss for the epoch
    accuracy = correct_predictions / total_samples  # Accuracy for the epoch

    # Log the training loss, accuracy, and duration to TensorBoard
    writer.add_scalar('Loss/train', avg_loss, epoch)
    writer.add_scalar('Accuracy/train', accuracy, epoch)
    writer.add_scalar('Time/train', epoch_duration, epoch)

    # Validate the model
    val_loss, val_accuracy = evaluate_model(val_loader, model, criterion, device)
    writer.add_scalar('Loss/val', val_loss, epoch)
    writer.add_scalar('Accuracy/val', val_accuracy, epoch)

    # Print the loss, accuracy, and time for every epoch
    print(f'Epoch {epoch+1}/{num_epochs}, '
          f'Train Loss: {avg_loss:.4f}, Train Accuracy: {accuracy:.4f}, '
          f'Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}, '
          f'Time: {epoch_duration:.2f} sec')

    # Check for early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print(f'Early stopping at epoch {epoch+1}')
            early_stop = True

    # Save checkpoint every epoch
    torch.save(model.state_dict(), checkpoint_path + f'model_epoch_{epoch+1}.pth')

    # Step the scheduler
    scheduler.step()

print(f'Finished Training, Final Train Loss: {avg_loss:.4f}, Final Train Accuracy: {accuracy:.4f}')

# Test the model
test_loss, test_accuracy = evaluate_model(test_loader, model, criterion, device)
writer.add_scalar('Loss/test', test_loss, 0)
writer.add_scalar('Accuracy/test', test_accuracy, 0)
print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}')

# Close the TensorBoard writer
writer.close()

model_path = f'{dataset_root}models/resnet50_clothing1m.pth'
torch.save(model.state_dict(), model_path)
print("Model saved.")

Epoch 1/15, Train Loss: 0.8193, Train Accuracy: 0.7395, Val Loss: 0.6566, Val Accuracy: 0.7911, Time: 2209.52 sec
Epoch 2/15, Train Loss: 0.6084, Train Accuracy: 0.8045, Val Loss: 0.6293, Val Accuracy: 0.7987, Time: 1997.41 sec
Epoch 3/15, Train Loss: 0.6120, Train Accuracy: 0.8038, Val Loss: 0.6734, Val Accuracy: 0.7913, Time: 1954.85 sec
Epoch 4/15, Train Loss: 0.6619, Train Accuracy: 0.7893, Val Loss: 0.8744, Val Accuracy: 0.7246, Time: 1975.22 sec
Epoch 5/15, Train Loss: 0.7066, Train Accuracy: 0.7775, Val Loss: 0.8256, Val Accuracy: 0.7476, Time: 1967.66 sec
Epoch 6/15, Train Loss: 0.5371, Train Accuracy: 0.8306, Val Loss: 0.5273, Val Accuracy: 0.8344, Time: 1982.76 sec
Epoch 7/15, Train Loss: 0.4845, Train Accuracy: 0.8473, Val Loss: 0.5134, Val Accuracy: 0.8348, Time: 1970.28 sec
Epoch 8/15, Train Loss: 0.4581, Train Accuracy: 0.8560, Val Loss: 0.5120, Val Accuracy: 0.8388, Time: 1973.72 sec
Epoch 9/15, Train Loss: 0.4409, Train Accuracy: 0.8621, Val Loss: 0.5125, Val Accuracy: 

RuntimeError: Parent directory ../drive/MyDrive/Colab_Notebooks/Deep_Self_Learning_From_Noisy_Labels/models does not exist.

RuntimeError: Parent directory ../drive/MyDrive/Colab_Notebooks/Deep_Self_Learning_From_Noisy_Labels/models does not exist.

In [12]:
from config import model_path_standard as model_path
torch.save(model.state_dict(), model_path)
print("Model saved.")

Model saved.


In [13]:
from google.colab import files

files.download(model_path)
files.download(writer_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

The training has ended after epoch 2 because the laptop was offline. Here I continue training the model.

In [None]:
def save_checkpoint(epoch, model, optimizer, scheduler, best_val_loss, epochs_no_improve, checkpoint_path):
    checkpoint_path = os.path.join(checkpoint_path, f'checkpoint_epoch_{epoch}.pth')
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
        'best_val_loss': best_val_loss,
        'epochs_no_improve': epochs_no_improve
    }, checkpoint_path)
    print(f'Checkpoint saved at epoch {epoch}')

def load_checkpoint(model, optimizer, scheduler, checkpoint_path):
    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
    return checkpoint['epoch'], checkpoint['best_val_loss'], checkpoint['epochs_no_improve']

In [None]:
import time
from config import num_epochs, dataset_root, checkpoint_path
from torch.utils.tensorboard import SummaryWriter
# Initialize TensorBoard writer
writer_path = dataset_root.replace("..", "/content") + 'runs/resnet50_experiment'
writer = SummaryWriter(writer_path)

# Early stopping parameters
patience = 5
best_val_loss = float('inf')
epochs_no_improve = 0
early_stop = False

# Load specific checkpoint if available
start_epoch = 2
current_checkpoint_path = os.path.join(checkpoint_path, f'checkpoint_epoch_{start_epoch}.pth')
if os.path.exists(current_checkpoint_path):
    start_epoch, best_val_loss, epochs_no_improve = load_checkpoint(model, optimizer, scheduler, current_checkpoint_path)
    start_epoch += 1  # start from the next epoch
    print(f'Resumed from checkpoint {current_checkpoint_path}')

# Train the model
for epoch in range(start_epoch, num_epochs):
    if early_stop:
        break

    model.train()  # Set model to training mode
    epoch_start_time = time.time()  # Start time for the epoch

    running_loss = 0.0
    correct_predictions = 0
    total_samples = 0

    for inputs, labels in train_loader:
        # Move input and label tensors to the device
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Zero out the optimizer
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        correct_predictions += calculate_accuracy(outputs, labels)
        total_samples += labels.size(0)

    epoch_duration = time.time() - epoch_start_time  # End time for the epoch
    avg_loss = running_loss / len(train_loader)  # Average loss for the epoch
    accuracy = correct_predictions / total_samples  # Accuracy for the epoch

    # Log the training loss, accuracy, and duration to TensorBoard
    writer.add_scalar('Loss/train', avg_loss, epoch)
    writer.add_scalar('Accuracy/train', accuracy, epoch)
    writer.add_scalar('Time/train', epoch_duration, epoch)

    # Validate the model
    val_loss, val_accuracy = evaluate_model(val_loader, model, criterion, device)
    writer.add_scalar('Loss/val', val_loss, epoch)
    writer.add_scalar('Accuracy/val', val_accuracy, epoch)

    # Print the loss, accuracy, and time for every epoch
    print(f'Epoch {epoch+1}/{num_epochs}, '
          f'Train Loss: {avg_loss:.4f}, Train Accuracy: {accuracy:.4f}, '
          f'Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}, '
          f'Time: {epoch_duration:.2f} sec')

    # Check for early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print(f'Early stopping at epoch {epoch+1}')
            early_stop = True

    # Save checkpoint
    save_checkpoint(epoch, model, optimizer, scheduler, best_val_loss, epochs_no_improve, checkpoint_path)

    # Step the scheduler
    scheduler.step()

print(f'Finished Training, Final Train Loss: {avg_loss:.4f}, Final Train Accuracy: {accuracy:.4f}')

# Test the model
test_loss, test_accuracy = evaluate_model(test_loader, model, criterion, device)
writer.add_scalar('Loss/test', test_loss, 0)
writer.add_scalar('Accuracy/test', test_accuracy, 0)
print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}')

# Close the TensorBoard writer
writer.close()

#### Evaluation

Evaluate models on validation data with accuracy, precision, recall, F1 score