In [1]:
import time
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torchvision import datasets, transforms
from torchvision.transforms import InterpolationMode
from torch.utils.data import DataLoader, Subset

import matplotlib.pyplot as plt
import logging
import itertools
from collections import defaultdict
import pandas as pd
import random
import os

from myutils import seed_worker

In [2]:
SEED = 1812

# Configuration
config = {
    'BATCH_SIZE': 32,               # Batch size
    'INITIAL_LR': 0.001,            # Initial Learning rate
    'LR_DECOY': 0.9,                # Learning rate decay factor applied each epoch
    'NUM_EPOCHS': 10,               # Number of epochs for model training
    'DATA_DIR': '../Data/raw',      # Folder with images in initial format
    'IMAGE_SIZE': 242,              # Target image size (width and height) for preprocessing
    'NUM_CLASSES': 6,               # Number of classes
    'TEST_SPLIT_RATIO': 0.2,        # Share of testing data
    'VALID_SPLIT_RATIO': 0.1,       # Share of validation data
    'NORMALIZE': True,              # Images normalization
    'DEBUG': True,                  # Whether to output debug info or not
    'NUMBER_OF_TRIALS': 1,          # Number of trials to calculate the mean time per image
    'NUM_WORKER': 8,                # Number of workers for DataLoader
    'PREFETCH_FACTOR': 2,           # Prefetch factor for DataLoader
}

# ANSI escape codes
RED = '\033[91m'
GREEN = '\033[92m'
RESET = '\033[0m'

# Handlers are looked up by name before being created: loggers outlive a cell
# execution, so adding a fresh handler on every re-run of this cell would print
# each message once per re-run.
ROOT_HANDLER_NAME = 'notebook_root'
MODULE_HANDLER_NAME = 'notebook_module'

# Logging configuration for all modules, including Python's packages
root_logger = logging.getLogger()
root_logger.setLevel(logging.WARNING)
root_handler = next((handler for handler in root_logger.handlers
                     if handler.get_name() == ROOT_HANDLER_NAME), None)
if root_handler is None:
    root_handler = logging.StreamHandler()
    root_handler.set_name(ROOT_HANDLER_NAME)
    root_logger.addHandler(root_handler)
root_handler.setFormatter(logging.Formatter('%(asctime)s - ROOT - %(levelname)s - %(message)s'))

# Logging configuration for my module
my_module_logger = logging.getLogger(__name__)
my_module_logger.setLevel(logging.DEBUG if config['DEBUG'] else logging.INFO)
# Do not forward records to the root logger, otherwise they would be printed twice
my_module_logger.propagate = False

# Handler for my module
my_module_handler = next((handler for handler in my_module_logger.handlers
                          if handler.get_name() == MODULE_HANDLER_NAME), None)
if my_module_handler is None:
    my_module_handler = logging.StreamHandler()
    my_module_handler.set_name(MODULE_HANDLER_NAME)
    my_module_logger.addHandler(my_module_handler)
my_module_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))

In [3]:
def reset_seeds(seed = 1812):
    """Re-seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Value handed to every generator (1812 by default).
    """
    # Same seed for the three generic generators
    for seeding_function in (random.seed, np.random.seed, torch.manual_seed):
        seeding_function(seed)

    # The MPS backend (Apple-silicon GPU) is seeded explicitly when it is present
    mps_present = torch.backends.mps.is_available()
    if mps_present:
        torch.mps.manual_seed(seed)

    # cuDNN flags: request deterministic kernels and switch off the auto-tuner
    cudnn_backend = torch.backends.cudnn
    cudnn_backend.deterministic = True
    cudnn_backend.benchmark = False
    # Left disabled on purpose:
    # torch.use_deterministic_algorithms(True)
    # torch.backends.mps.deterministic = True

def get_device():
    """Detect the available device type, preferring CUDA, then MPS, then CPU.

    Returns:
        torch.device: 'cuda', 'mps' or 'cpu'.
    """
    if torch.cuda.is_available():
        device_name = 'cuda'
    elif torch.backends.mps.is_available():
        device_name = 'mps'
    else:
        device_name = 'cpu'
    return torch.device(device_name)

In [4]:
class ImagesData:
    """Training, validation and test datasets and DataLoaders built from one image folder.

    The folder is wrapped in three ``ImageFolder`` datasets that differ only in
    their transforms (augmentation for training, plain preprocessing otherwise).
    A seeded shuffle of the sample indices then hands every selected index to
    exactly one of the three splits.

    Args:
        data_dir: Root folder given to ``ImageFolder``. ``None`` means
            ``config['DATA_DIR']``.
        test_split: Share of the selected samples used for testing. ``None``
            means ``config['TEST_SPLIT_RATIO']``.
        validation_split: Share of the selected samples used for validation.
            ``None`` means ``config['VALID_SPLIT_RATIO']``.
        max_samples: Upper bound on the number of samples used across all three
            splits (a debugging cap, 200 by default). ``None`` uses every sample.
        seed: Seed for the index shuffle and for the DataLoader generator.
    """

    def __init__(self, data_dir: str = None,
                 test_split: float = None,
                 validation_split: float = None,
                 max_samples: int = 200,
                 seed: int = 1812):
        # Missing arguments are resolved from config here, at call time, so that
        # values set through change_config() before construction are honoured.
        self.data_dir = config['DATA_DIR'] if data_dir is None else data_dir
        self.test_split = config['TEST_SPLIT_RATIO'] if test_split is None else test_split
        self.valid_split = config['VALID_SPLIT_RATIO'] if validation_split is None else validation_split
        self.max_samples = max_samples
        self.seed = seed

        self.train_dataset, self.valid_dataset, self.test_dataset = self.get_datasets(self.data_dir)
        self.train_loader, self.valid_loader, self.test_loader = self.get_dataloaders(self.train_dataset,
                                                                                      self.valid_dataset,
                                                                                      self.test_dataset
                                                                                      )

    # Composing transformers for training, validation and testing
    def get_transforms(self, train: bool = False):
        """Build the preprocessing pipeline.

        Args:
            train: When True, random augmentation steps are inserted after the resize.

        Returns:
            transforms.Compose: resize, optional augmentation, tensor conversion
            and (if ``config['NORMALIZE']``) normalization.
        """
        image_size = config['IMAGE_SIZE']
        # The common part for all kinds of transformers
        steps = [transforms.Resize((image_size, image_size))]

        # Specifically for training - augmentation
        if train:
            steps += [
                transforms.RandomHorizontalFlip(p = 0.5),     # Horizontal flip
                transforms.RandomVerticalFlip(p = 0.5),       # Vertical flip
                transforms.RandomAffine(
                    degrees = 36,                             # Rotation
                    translate = (0.2, 0.2),                   # Move
                    scale = (0.7, 1.3),                       # Zoom
                    interpolation = InterpolationMode.NEAREST # Nearest-neighbour interpolation
                ),
                transforms.ColorJitter(brightness = 0.2, contrast = 0.2)    # Brightness and contrast
            ]
        # Conversion to tensor for all kinds of transformers
        steps.append(transforms.ToTensor())
        # Normalization
        if config['NORMALIZE']:
            steps.append(
                transforms.Normalize(mean = [0.485, 0.456, 0.406],  # Normalization parameters the same as for ImageNet
                                     std = [0.229, 0.224, 0.225])
            )
        return transforms.Compose(steps)

    def get_datasets(self, data_dir: str):
        """Create three ``ImageFolder`` views of ``data_dir``.

        Returns:
            tuple: (train_dataset, valid_dataset, test_dataset). Only the first
            one uses the augmenting transforms.
        """
        train_transforms = self.get_transforms(train = True)
        valid_transforms = self.get_transforms(train = False)
        test_transforms  = self.get_transforms(train = False)

        train_dataset = datasets.ImageFolder(root = data_dir, transform = train_transforms)
        valid_dataset = datasets.ImageFolder(root = data_dir, transform = valid_transforms)
        test_dataset  = datasets.ImageFolder(root = data_dir, transform = test_transforms)
        return train_dataset, valid_dataset, test_dataset

    @staticmethod
    def _log_split(name: str, indices: list):
        """Log the index range and size of one split; tolerates an empty split."""
        if indices:
            my_module_logger.debug(f'{name} dataset indices: {min(indices):d}-{max(indices):d}, '
                                   f'Number of instances: {len(indices)}')
        else:
            my_module_logger.debug(f'{name} dataset indices: none, Number of instances: 0')

    @staticmethod
    def _make_loader(dataset, indices, shuffle, generator, batch_size, num_workers, prefetch_factor):
        """Wrap ``indices`` of ``dataset`` in a DataLoader with the shared settings."""
        loader_kwargs = dict(
            batch_size = batch_size,
            shuffle = shuffle,
            num_workers = num_workers,          # Number of subprocesses for data loading
            # pin_memory = True,                # Pinned memory for faster transfers to GPU
            persistent_workers = False,
            generator = generator,
            worker_init_fn = seed_worker,
        )
        # DataLoader only accepts prefetch_factor together with worker processes
        if num_workers > 0:
            loader_kwargs['prefetch_factor'] = prefetch_factor   # Number of batches to prefetch per worker
        return DataLoader(Subset(dataset, indices), **loader_kwargs)

    def get_dataloaders(self, train_dataset, valid_dataset, test_dataset):
        """Split the sample indices and build one DataLoader per split.

        The indices of ``train_dataset`` are shuffled with ``self.seed``, cut to
        ``self.max_samples`` and divided into test, validation and training
        parts (in that order) using ``self.test_split`` and ``self.valid_split``.

        Returns:
            tuple: (train_loader, valid_loader, test_loader). Only the training
            loader shuffles its samples.
        """
        num_workers = config['NUM_WORKER']
        prefetch_factor = config['PREFETCH_FACTOR']
        batch_size = config['BATCH_SIZE']
        my_module_logger.debug(f'{GREEN}Number of workers: {num_workers:d}, '
                               f'Prefetch factor: {prefetch_factor:d}, '
                               f'Batch size: {batch_size:d}{RESET}')

        generator = torch.Generator()
        generator.manual_seed(self.seed)

        # A private RandomState yields the same permutation as seeding the global
        # NumPy generator with the same value, without clobbering global state.
        indices = list(range(len(train_dataset)))
        np.random.RandomState(self.seed).shuffle(indices)
        if self.max_samples is not None:
            indices = indices[:self.max_samples]
        # Split sizes follow the number of samples really available
        subset_size = len(indices)

        test_count = int(np.floor(self.test_split * subset_size))
        valid_count = int(np.floor(self.valid_split * subset_size))

        test_indices = indices[:test_count]
        train_val_indices = indices[test_count:]
        valid_indices = train_val_indices[:valid_count]
        train_indices = train_val_indices[valid_count:]

        self._log_split('Training', train_indices)
        self._log_split('Validation', valid_indices)
        self._log_split('Test', test_indices)

        shared = dict(generator = generator, batch_size = batch_size,
                      num_workers = num_workers, prefetch_factor = prefetch_factor)
        train_loader = self._make_loader(train_dataset, train_indices, shuffle = True, **shared)
        valid_loader = self._make_loader(valid_dataset, valid_indices, shuffle = False, **shared)
        test_loader  = self._make_loader(test_dataset, test_indices, shuffle = False, **shared)
        return train_loader, valid_loader, test_loader

In [5]:
def plot_metrics(train_metrics, val_metrics, ylabel: str, title: str):
    """Plot a per-epoch training curve against the validation curve.

    Args:
        train_metrics: Sequence of training values, one per epoch.
        val_metrics: Sequence of validation values, one per epoch.
        ylabel: Y-axis label. The value 'Accuracy' fixes the axis to 0-100.
        title: Figure title.
    """
    plt.figure(figsize=(8, 4))
    plt.plot(train_metrics, label='Training')
    plt.plot(val_metrics, label='Validation')
    if ylabel == 'Accuracy':
        plt.ylim([0, 100])
    else:
        # Only anchor the axis at zero: a fixed 0-1 range cuts off every value
        # above 1, and the loss of this model starts well above that.
        plt.ylim(bottom=0)
    plt.xlabel('Epochs')
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()
    plt.show()
    
def denormalize(image, mean, std):
    """Undo a per-channel normalization: ``image * std + mean``.

    Args:
        image: Tensor whose third-from-last dimension is the channel dimension,
            for example (C, H, W) or (N, C, H, W).
        mean: Per-channel means, one value per channel.
        std: Per-channel standard deviations, one value per channel.

    Returns:
        Tensor of the same shape as ``image``, on the same device.
    """
    # Build the statistics on the image's device so GPU/MPS tensors work too
    mean = torch.tensor(mean, device=image.device).reshape(-1, 1, 1)
    std = torch.tensor(std, device=image.device).reshape(-1, 1, 1)
    return image * std + mean

def plot_samples_from_loader(loader, num:int):
    """Show the first ``num`` images of the loader's first batch as a grid.

    Args:
        loader: Iterable of batches whose first element is the image tensor.
        num: Number of images to show (at most the batch size is available).
    """
    first_batch = next(iter(loader))

    # first_batch[0] holds the images; honour `num` instead of a fixed 16
    images = first_batch[0][:num].cpu()
    denorm_images = denormalize(images, [0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    # Create a grid of images
    grid_img = torchvision.utils.make_grid(denorm_images, nrow = 4)  # nrow defines the number of images in each row

    # The grid image will have shape (3, H, W), so we need to permute dimensions
    grid_img = grid_img.permute(1, 2, 0).cpu().numpy()

    # Four images per row; 16 images give the 16 x 16 figure used before
    rows = max(1, (images.shape[0] + 3) // 4)
    plt.figure(figsize=(16, 4 * rows))
    plt.imshow(grid_img)
    plt.axis('off')
    plt.show()

def plot_samples(images, num:int):
    """Show the first ``num`` images of a batch as a grid with four images per row.

    Args:
        images: Batch of normalized images; the slice and the channel handling
            below assume a (N, 3, H, W) tensor.
        num: Number of images to show.
    """
    images = images[:num]
    images = images.to(torch.device('cpu'))
    denorm_images = denormalize(images, [0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    # Create a grid of images
    grid_img = torchvision.utils.make_grid(denorm_images, nrow = 4)  # nrow defines the number of images in each row

    # The grid image will have shape (3, H, W), so we need to permute dimensions
    grid_img = grid_img.permute(1, 2, 0).cpu().numpy()

    # Round the row count up and keep it at least 1: `num // 4` is 0 for fewer
    # than four images, which is not a valid figure height.
    rows = max(1, (images.shape[0] + 3) // 4)
    plt.figure(figsize=(12, rows * 3))
    plt.imshow(grid_img)
    plt.axis('off')
    plt.show()

In [6]:
def build_model(num_classes: int):
    """Create a ConvNeXt-Base model with a new classification layer.

    The model is loaded with the IMAGENET1K_V1 weights, the layer at
    ``classifier[2]`` is replaced by a fresh ``nn.Linear`` with ``num_classes``
    outputs, and all parameters under ``model.features`` are frozen.

    Args:
        num_classes: Number of outputs of the new final layer.

    Returns:
        The modified model.
    """
    pretrained_weights = torchvision.models.ConvNeXt_Base_Weights.IMAGENET1K_V1
    model = torchvision.models.convnext_base(weights = pretrained_weights)

    # Swap the last classifier layer for one sized to our classes
    head_position = 2
    head_inputs = model.classifier[head_position].in_features
    model.classifier[head_position] = nn.Linear(head_inputs, num_classes)

    # Freeze feature extractor
    for feature_param in model.features.parameters():
        feature_param.requires_grad = False
    return model


def train_one_epoch(model, loader, criterion, optimizer, device):
    """Train the model for one pass over ``loader`` and collect debug checksums.

    Args:
        model: Model to train; switched to training mode.
        loader: Iterable yielding (images, labels) batches.
        criterion: Loss function called as ``criterion(outputs, labels)``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batches (and the checksum weights) are moved to.

    Returns:
        tuple: (mean loss per sample, accuracy in percent, list of per-batch
        dicts with the keys 'Batch', 'Images_check', 'Outputs_check', 'Loss').

    Raises:
        ZeroDivisionError: If the loader yields no samples.
    """
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    batches_debuging_info = []

    for batch, (images, labels) in enumerate(loader):
        images, labels = images.to(device), labels.to(device)
        batch_size = images.shape[0]

        # Position-dependent weights make the checksum sensitive to the order of
        # the images inside the batch. They are created on `device` (the one the
        # images were moved to), not on whatever get_device() would pick.
        weights = 7 * torch.arange(1, batch_size + 1, device = device).view(batch_size, 1, 1, 1)
        images_check_sum = (images * weights).sum().item()

        outputs = model(images)
        loss = criterion(outputs, labels)

        # Read each scalar from the device once and reuse it below
        outputs_check_sum = outputs.sum().item()
        loss_value = loss.item()

        my_module_logger.debug(f'Images chechsum: {images_check_sum:12,.0f}, '        # Checksum for all images in the batch to compare with other iterations
                               f'Outputs checksum: {outputs_check_sum:8.4f}, '        # Checksum for outputs to compare with other iterations
                               f'Loss: {loss_value:2.6f}')                            # The loss to compare with other iterations

        # plot_samples(images, 4)                               # Display the first four images of the batch to check that they are the same on each iteration

        batches_debuging_info.append({
            'Batch': batch,
            'Images_check': images_check_sum,
            'Outputs_check': outputs_check_sum,
            'Loss': loss_value,
        })

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # The criterion result is weighted by the batch size so the epoch value
        # is a per-sample mean even when the last batch is smaller.
        running_loss += loss_value * images.size(0)
        _, predicted = torch.max(outputs, 1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)
    return running_loss / total, 100 * correct / total, batches_debuging_info


def evaluate(model, loader, criterion, device):
    """Compute mean loss and accuracy of ``model`` over ``loader`` without gradients.

    Args:
        model: Model to evaluate; switched to evaluation mode.
        loader: Iterable yielding (images, labels) batches.
        criterion: Loss function called as ``criterion(outputs, labels)``.
        device: Device the batches are moved to.

    Returns:
        tuple: (mean loss per sample, accuracy in percent).
    """
    model.eval()
    loss_sum, hits, seen = 0.0, 0, 0
    with torch.no_grad():
        for batch_images, batch_labels in loader:
            batch_images = batch_images.to(device)
            batch_labels = batch_labels.to(device)

            logits = model(batch_images)
            batch_loss = criterion(logits, batch_labels)

            # Weight the batch loss by its size to get a per-sample mean at the end
            loss_sum += batch_loss.item() * batch_images.size(0)
            best_class = torch.max(logits, 1)[1]
            hits += (best_class == batch_labels).sum().item()
            seen += batch_labels.size(0)
    mean_loss = loss_sum / seen
    accuracy = 100 * hits / seen
    return mean_loss, accuracy

In [7]:
def get_param_grid(parameters: list):
    """Expand a list of parameter grids into the list of all single combinations.

    Args:
        parameters: List of dicts. A value that is a list is treated as the set
            of alternatives for its key; any other value counts as one alternative.

    Returns:
        list: One dict per combination, grid by grid, in ``itertools.product`` order.
    """
    expanded = []
    for grid in parameters:
        names = list(grid.keys())
        # Every value becomes a list of alternatives
        alternatives = []
        for name in names:
            value = grid[name]
            alternatives.append(value if isinstance(value, list) else [value])

        # Cartesian product of the alternatives of this grid
        for chosen in itertools.product(*alternatives):
            expanded.append(dict(zip(names, chosen)))

    return expanded

def change_config(combination: dict):
    """Write every key/value pair of ``combination`` into the global ``config`` dict."""
    config.update(combination.items())

def log_results(file_name, config, test_loss, test_acc, elapsed_time):
    """Append one result row to a CSV file, writing the header only for a new file.

    Args:
        file_name: Path of the CSV file; created when it does not exist yet.
        config: Mapping whose items become the first columns of the row.
        test_loss: Value stored in the 'test_loss' column.
        test_acc: Value stored in the 'test_acc' column.
        elapsed_time: Duration; it is divided by 60 before being stored in the
            'elapsed_time' column (pass seconds to get minutes in the file).
    """
    row = defaultdict(list)
    row.update(config.items())
    row.update(test_loss = test_loss,
               test_acc = test_acc,
               elapsed_time = elapsed_time /60)

    # A header line is needed only when this call creates the file
    file_is_new = not os.path.exists(file_name)
    frame = pd.DataFrame.from_dict([row])
    frame.to_csv(file_name, mode = 'a', header = file_is_new, index = False)

def flatten_defaultdict(data):
    """Flatten a mapping of ``key -> list of records`` into one DataFrame.

    Every record becomes a row; the key it was stored under is placed in an
    'Iteration' column (a record's own 'Iteration' entry, if any, wins).

    Args:
        data: Mapping from a key to a list of dict-like records.

    Returns:
        pandas.DataFrame with one row per record.
    """
    def as_row(iteration, record):
        row = {'Iteration': iteration}
        row.update(record)
        return row

    flat_rows = [as_row(iteration, record)
                 for iteration, records in data.items()
                 for record in records]
    return pd.DataFrame(flat_rows)

In [8]:
def main():
    """Run one complete experiment: train, validate each epoch, then time the test pass.

    Seeds are reset first, data and model are built from ``config``, the model
    is trained for ``config['NUM_EPOCHS']`` epochs with Adam and an exponential
    learning-rate decay, and the test split is evaluated
    ``config['NUMBER_OF_TRIALS']`` times to measure the time per image.

    Returns:
        tuple: (test_loss, test_acc, debug_info_list) where the last item is the
        list of per-batch debug dicts of all epochs, each extended by 'Epoch'.
    """
    reset_seeds(1812)

    device = get_device()
    my_module_logger.debug(f'Using device: {device}')
    my_module_logger.debug(f'Config: {config}')

    data = ImagesData()

    model = build_model(config['NUM_CLASSES'])
    model.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr = config['INITIAL_LR'])
    scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma = config['LR_DECOY'])

    epoch_train_loss, epoch_train_acc = [], []
    epoch_val_loss, epoch_val_acc = [], []
    debug_info_list = []

    num_epochs = config['NUM_EPOCHS']
    for epoch in range(num_epochs):
        torch.cuda.empty_cache()
        start_time = time.time()

        train_loss, train_acc, debug_info = train_one_epoch(model, data.train_loader, criterion, optimizer, device)
        val_loss, val_acc = evaluate(model, data.valid_loader, criterion, device)

        # Adding debugging info for the epoch
        debug_info = [{'Epoch': epoch + 1, **d} for d in debug_info]
        debug_info_list.extend(debug_info)

        epoch_train_loss.append(train_loss)
        epoch_train_acc.append(train_acc)
        epoch_val_loss.append(val_loss)
        epoch_val_acc.append(val_acc)

        scheduler.step()
        elapsed = time.time() - start_time

        my_module_logger.debug(f'Epoch [{epoch + 1:2d}/{num_epochs}]: '
                    f'Train Loss: {RED}{train_loss:.4f}{RESET}, Train Acc: {train_acc:.2f}%, '
                    f'Val Loss: {RED}{val_loss:.4f}{RESET}, Val Acc: {val_acc:.2f}%, '
                    f'Time: {elapsed:3.0f} sec')

    # Save the model
    # torch.save(model, '../Models/convNeXt_pytorch_base_v5_BS' +
    #            str(config['BATCH_SIZE']) + '_LR' + str(config['INITIAL_LR']))

    # Evaluate on test set with timing measurements
    processing_times = []

    # Number of images the test loader really serves. data.test_dataset is the
    # whole image folder, so using its length would understate the time per image.
    total_images = len(data.test_loader.dataset)

    for i in range(config['NUMBER_OF_TRIALS']):
        start_trial = time.time()
        test_loss, test_acc = evaluate(model, data.test_loader, criterion, device)
        elapsed_trial = time.time() - start_trial

        avg_time_per_image = elapsed_trial / total_images

        my_module_logger.debug(f'Trial: {i + 1}, '
                               f'Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.2f}%, '
                               f'Avg Time/Image: {avg_time_per_image:.3f} sec')
        processing_times.append(avg_time_per_image)

    my_module_logger.info(f'Final Test: Loss: {test_loss:.4f}, Acc: {test_acc:.2f}%')
    my_module_logger.info(f'Mean processing time per image: {np.mean(processing_times):.3f} sec')

    # plot_metrics(epoch_train_acc, epoch_val_acc, 'Accuracy', 'Training and Validation Accuracy')
    # plot_metrics(epoch_train_loss, epoch_val_loss, 'Loss', 'Training and Validation Loss')

    # Drop the references before the next run builds a new model
    del model
    del optimizer
    del criterion
    del scheduler
    torch.cuda.empty_cache()

    return test_loss, test_acc, debug_info_list

In [9]:
if __name__ == '__main__':
    # Repeat the identical experiment several times and keep the per-batch
    # checksums of every run so the runs can be compared afterwards.
    file_name = f"../results_{time.strftime('%y%m%d_%H%M', time.localtime())}.csv"
    debug_summary = defaultdict(list)

    # param_grid = [
    #     {
    #         'BATCH_SIZE': 32,
    #         'PREFETCH_FACTOR': [1, 2],
    #         'INITIAL_LR': 0.004
    #     },
    #     {
    #         'BATCH_SIZE': [40, 48],
    #         'PREFETCH_FACTOR': [2, 4],
    #         'INITIAL_LR': [0.004, 0.002, 0.001, 0.0005],
    #     },
    # ]

    # Generating all combinations
    # for combination in get_param_grid(param_grid):
    #     change_config(combination)

    for i in range(4):
        my_module_logger.debug(f'{GREEN}Iteration {i + 1:d}{RESET}')
        cycle_start_time = time.time()

        test_loss, test_acc, debug_data = main()
        debug_summary[i + 1] = debug_data

        elapsed_time = (time.time() - cycle_start_time)
        my_module_logger.debug(f'Total cycle elapsed time: {elapsed_time // 60:2.0f} min {elapsed_time % 60:2.0f} sec')
        # log_results() divides by 60 itself, so it gets the raw seconds;
        # dividing here as well would store seconds / 3600.
        log_results(file_name, config, test_loss, test_acc, elapsed_time)


    # Convert debug_summary to flat DataFrame
    df = flatten_defaultdict(debug_summary)
    df = df[['Iteration',  'Epoch', 'Batch', 'Images_check', 'Outputs_check', 'Loss']].sort_values(by = ['Iteration', 'Epoch', 'Batch'])

    # Calculate differences for debugging: within each (Epoch, Batch) group the
    # rows are ordered by Iteration, so diff() compares a run with the previous one.
    df_diff = df.copy()
    same_batch = df.groupby(['Epoch', 'Batch'])
    for checked_column in ('Images_check', 'Outputs_check', 'Loss'):
        df_diff[f'{checked_column}_diff'] = same_batch[checked_column].diff()

2025-02-05 19:24:38,118 - DEBUG - [92mIteration 1[0m
2025-02-05 19:24:38,139 - DEBUG - Using device: mps
2025-02-05 19:24:38,139 - DEBUG - Config: {'BATCH_SIZE': 32, 'INITIAL_LR': 0.001, 'LR_DECOY': 0.9, 'NUM_EPOCHS': 10, 'DATA_DIR': '../Data/raw', 'IMAGE_SIZE': 242, 'NUM_CLASSES': 6, 'TEST_SPLIT_RATIO': 0.2, 'VALID_SPLIT_RATIO': 0.1, 'NORMALIZE': True, 'DEBUG': True, 'NUMBER_OF_TRIALS': 1, 'NUM_WORKER': 8, 'PREFETCH_FACTOR': 2}
2025-02-05 19:24:38,151 - DEBUG - [92mNumber of workers: 8, Prefetch factor: 2, Batch size: 32[0m
2025-02-05 19:24:38,152 - DEBUG - Training dataset indices: 4-2493, Number of instances: 140
2025-02-05 19:24:38,152 - DEBUG - Validation dataset indices: 115-2484, Number of instances: 20
2025-02-05 19:24:38,152 - DEBUG - Test dataset indices: 72-2526, Number of instances: 40
2025-02-05 19:24:38,152 - DEBUG - Nun_workers: 8, prefetch_factor: 2
2025-02-05 19:24:50,463 - DEBUG - Images chechsum:   96,798,416, Outputs checksum:  -5.7422, Loss: 1.778519
2025-02-05

In [10]:
# Print the per-batch checksum table with a wider text display (200 characters)
# and a row limit of 200, applied only inside this `with` block.
with pd.option_context('display.width', 200, 'display.max_rows', 200):
    print(df_diff)

     Iteration  Epoch  Batch  Images_check  Outputs_check      Loss  Images_check_diff  Outputs_check_diff  Loss_diff
0            1      1      0    96798416.0      -5.742249  1.778519                NaN                 NaN        NaN
1            1      1      1   108718832.0      -1.645323  1.741768                NaN                 NaN        NaN
2            1      1      2    77164784.0      -8.356722  1.693612                NaN                 NaN        NaN
3            1      1      3    56238016.0     -11.779107  1.635712                NaN                 NaN        NaN
4            1      1      4   -21610722.0      -5.067947  1.621746                NaN                 NaN        NaN
5            1      2      0    84747416.0     -23.493286  1.471361                NaN                 NaN        NaN
6            1      2      1   119699776.0     -16.554253  1.390118                NaN                 NaN        NaN
7            1      2      2   174519712.0      -9.80784