# Advanced ML Pipeline

Putting it all together from preceding lessons and labs

Dataset (CIFAR-10): https://www.cs.toronto.edu/~kriz/cifar.html

Model

In [60]:
# imports

import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

import os
import numpy as np
import pandas as pd
import pickle

import matplotlib.pyplot as plt

In [50]:
# Data inspection

# play around with just one file to understand how to load and process the data

file_path = './data/CIFAR-10-batches-py/data_batch_1'   # batch 1
with open(file_path, 'rb') as f:
    # batch_data_dict = pickle.load(f, encoding='bytes') # python dict keys would be binary and inaccessible using natural language
    batch_data_dict = pickle.load(f, encoding='latin1')
# print(batch_data_dict)

for key in batch_data_dict:
    print(key)

# batch_data_dict['data']

batch_label
labels
data
filenames


  batch_data_dict = pickle.load(f, encoding='latin1')


In [70]:
# Data Access

# Dataset: CIFAR-10

class CIFAR10Dataset(Dataset):

    '''
    CIFAR-10 Python dataset of downloaded blobs in 5 batches
    '''

    def __init__(self, root_dir, train=True, transform=None):
        
        self.root_dir = root_dir
        self.train = train
        self.transform = transform

        if not os.path.exists(self.root_dir):
            raise FileNotFoundError(f"CIFAR-10 directory not found: {self.root_dir}")
        
        self.data, self.labels = self._load_data()

    def _load_data(self):
        images_list = []
        labels_list = []

        num_batches = 5

        if self.train:
            batch_file_names = [f"data_batch_{i}" for i in range(1, num_batches+1)]
        else:
            batch_file_names = ["test_batch"]

        for batch_file_name in batch_file_names:
            path = os.path.join(self.root_dir, batch_file_name)
            images, labels = self._load_batch(path)
            images_list.append(images)
            labels_list.extend(labels)
        
        images = np.concatenate(images_list, axis=0)    # convert all image batches into one array - TODO NECESSARY?
        labels = np.array(labels_list, dtype=np.float32)    # TODO: or int64?

        return images, labels

    def _load_batch(self, file_path):
        
        """ Load a single batch from corresponding file """

        with open(file_path, 'rb') as f:
            batch_data_dict = pickle.load(f, encoding='latin1')   # TODO: right encoding? bytes (in cifar10 docs), latin1 (llm suggested)
        
        images = batch_data_dict['data']  # shape: [10000, 3072] since docs say has 10000 x 3072 numpy array of uint8s per batch
        labels = batch_data_dict['labels']    # length: 10000

        # Reshape to (N, 3, 32, 32) array
        images = np.reshape(images, (-1, 3, 32, 32))
        
        return images, labels


    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        image = self.data[idx]
        label = self.labels[idx]

        # convert to float tensor?

        # optional individual image transforms

        return image, label




In [71]:
# Data Preprocessing

# define transformations for each split dataset

transform_training_fn = transforms.Compose([
    # data augmentations
    transforms.RandomHorizontalFlip(p=0.5),

    # standard preprocessing
    transform.Resize(),
    transformsCenterCrop(),
    transforms.ToTensor()
    transforms.Normalize(
        mean=[],    # TODO: util to calc mean and stds of dataset
        std=[]
    )
])

transform_validation_fn = transforms.Compose([
    transform.Resize(),
    transformsCenterCrop(),
    transforms.ToTensor()
    transforms.Normalize(
        mean=[],    # TODO: util to calc mean and stds of dataset 
        std=[]
    )
]) 

SyntaxError: invalid syntax. Perhaps you forgot a comma? (728913531.py, line 12)

In [None]:
# Utils
# put in own file for future usage

def denormalize(img_denorm, mean_list=[0.485, 0.456, 0.406], std_list=[0.229, 0.224, 0.225]):

    # @param [] mean_list: calculated list of means for the dataset such as [0.485, 0.456, 0.406] for OxfordFlowers
    # @param [] std_list: calculated list of standard deviations for the dataset such as [0.229, 0.224, 0.225] for OxfordFlowers

    mean = torch.tensor(mean_list).view(3, 1, 1)
    std = torch.tensor(std_list).view(3, 1, 1)

    img_denorm = img_denorm * std + mean
    img_denorm = img_denorm.clamp(0, 1)

    return img_denorm

def visualize_processed_image(loader_data):

    # @param DataLoader loader_data: data loader obj of dataset such as training loader data 

    images, labels = next(iter(loader_data))   # gets one batch

    processed_img_tensor = images[0].cpu()    # move image to cpu from gpu if needed for further operations
    denormalized_img_tensor = denormalize(processed_img_tensor) # denormalize image first

    # visualize processed image
    plt.imshow(denormalized_img_tensor.permute(1, 2, 0))  # convert C,H,W (channel, heigth, width) -> H,W,C since PyTorch tensors are [C, H, W], but matplotlib expects [H, W, C]:
    plt.axis("off")
    plt.show()


def visualize_data_augmentations(dataset, idx=0, num_versions=8):
    """ See what data augmentations look like for a sample image """
    # @param Tensor dataset: just raw dataset tensor, not passed into DataLoader

    fix, axes = plt.subplots(2, 4, figsize=(12, 6))
    axes = axes.flatten()

    for i in range(num_versions):
        img, label = dataset[idx]   # retrieve augmented version

        # denormalize for display
        img = denormalize(img)

        axes[i].imshow(img.permute(1, 2, 0))  # CHW to HWC
        axes[i].set_title(f"Label: {label}")
        axes[i].axis('off')

    plt.tight_layout()
    plt.show()

In [69]:
# Data Access

# sanity check by testing transforms on raw dataset and loaded dataset

root_dir = './data/CIFAR-10-batches-py'
train_dataset = CIFAR10Dataset(
    root_dir,
    train=True
)
train_data_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True
)
# visualize_processed_image(train_data_loader)
# visualize_data_augmentations(train_dataset)

  batch_data_dict = pickle.load(f, encoding='latin1')   # TODO: right encoding? bytes (in cifar10 docs), latin1 (llm suggested)


In [None]:
# Model

# Basic model


class NNBasic(nn.Module):
    
    # go with CNN

    def __init__(self):

        super().__init__()

        # input image size is ??
        # image_size = ??

        # conv block 1
        self.conv1 = self.Conv2d(
            in_channels=1,    # input channel = color channel (grayscale is 1, RGB is 3)
            out_channels=32,  # how many feature maps to produce
            kernel_size=3,   # filter size
            padding=1,  # filter padding
        )
        self.activation1 = nn.ReLU()
        self.pool1 = nn.MaxPool2d(
            kernel_size=2,
            stride=2
        )


        # conv block 2
        # image size is halved due to preceding pooling




        # flatten
        self.flatten = nn.Flatten()

        # fully connected blocks
        flattened_size = 0  # ??
        self.fc1 = nn.Linear(
            flattened_size,
            128
        )

        # output layer
        output_classes = 10
        # self.fc_output = nn.Linear(

        # )







In [None]:
# Model

# Inspection