### Make sure you follow the preprocessing instructions in the README.md file!

In [1]:
import os

# Lets see the directory structure of imagenet1k
def list_files(startpath):
    for root, dirs, files in os.walk(startpath):
        level = root.replace(startpath, '').count(os.sep)
        indent = ' ' * 4 * (level)
        print('{}{}/'.format(indent, os.path.basename(root)))
        subindent = ' ' * 4 * (level + 1)
        jpeg_files = [f for f in files if f.endswith('.JPEG')]
        if jpeg_files:  # if the list is not empty
            print('{}Number of JPEG files: {}'.format(subindent, len(jpeg_files)))
        for f in files:
            if f.endswith('.txt'):
                print('{}{}'.format(subindent, f))

In [2]:
#list_files('/mnt/imagenet1k_resized/ILSVRC2012_img_train/')

In [3]:
#list_files('/mnt/imagenet1k_resized/ILSVRC2012_img_val/')

In [4]:
#list_files('/mnt/imagenet21k_resized_new/imagenet21k_train/')

In [5]:
#list_files('/mnt/imagenet21k_resized_new/imagenet21k_val/')

### It is clear from the output of the above cells that preprocessing worked!

We are looking to see if the validation and training sets are organized in the same manner and that they are ordered the same.

This makes input into the `torchvision.datasets.ImageFolder` class work without a hitch!

In [6]:
# Importing necessary libraries to unzip `tiny-imagenet-200.zip`
import zipfile
import random
from PIL import Image
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np

# Importing pytorch libraries
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import CosineAnnealingLR, OneCycleLR
from torch.optim import Adam
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

In [7]:
# Importing custom VisionTransformer Model

from models.vit import VisionTransformer

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#device = torch.device('cuda')

# Hyperparameters
batch_size = 512 # should be 4096 for ViT paper
criterion = nn.CrossEntropyLoss()

patch_size_ = 32

num_layers_ = 12
D_ = 768
mlp_size_ = 3072
num_heads_ = 12

num_classes_ = 1000

num_epochs = 1

model = VisionTransformer(patch_size=patch_size_, D=D_, num_layers=num_layers_, num_classes=num_classes_, num_heads=num_heads_, mlp_size=mlp_size_)
model.to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)


# Define a transform for training data
train_transform = transforms.Compose([
    transforms.Pad(4),  # Pad the image by 4 pixels
    transforms.RandomCrop(224),  # Randomly crop a 224x224 region from the padded image
    transforms.RandomHorizontalFlip(),  # Randomly flip the image horizontally
    transforms.ToTensor(),  # Convert the image to a tensor
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))  # Normalize to range [-1, 1]
])

# Define a transform for validation data
val_transform = transforms.Compose([
    transforms.ToTensor(),  # Convert the image to a tensor
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))  # Normalize to range [-1, 1]
])

In [9]:
print("Number of available CPU cores:", os.cpu_count())

Number of available CPU cores: 24


In [10]:
# Load ImageNet1k dataset and make DataLoaders
train_dataset1k = datasets.ImageFolder(root='/mnt/imagenet1k_resized/ILSVRC2012_img_train', transform=train_transform)
val_dataset1k = datasets.ImageFolder(root='/mnt/imagenet1k_resized/ILSVRC2012_img_val', transform=val_transform)

train_loader1k = DataLoader(dataset=train_dataset1k, batch_size=batch_size, shuffle=True, num_workers=20, pin_memory=True)
val_loader1k = DataLoader(dataset=val_dataset1k, batch_size=batch_size, shuffle=False, num_workers=20, pin_memory=True)

#Calculate total steps
total_steps = len(train_loader1k) * num_epochs

# StepLR that decays the learning rate every 30 epochs
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=500, gamma=0.9)

print(f"ImageNet-1k has {len(train_loader1k)*batch_size:,} training images and {len(val_loader1k)*batch_size:,} validation images!")

# Load ImageNet21k dataset and make DataLoaders
#train_dataset21k = datasets.ImageFolder(root='/mnt/imagenet21k_resized_new/imagenet21k_train', transform=train_transform)
#val_dataset21k = datasets.ImageFolder(root='/mnt/imagenet21k_resized_new/imagenet21k_val', transform=val_transform)

#train_loader21k = DataLoader(dataset=train_dataset21k, batch_size=batch_size, shuffle=True, num_workers=2, pin_memory=True)
#val_loader21k = DataLoader(dataset=val_dataset21k, batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=True)

#print(f"ImageNet-21k has {len(train_loader21k)*batch_size:,} training images and {len(val_loader21k)*batch_size:,} validation images!")

ImageNet-1k has 1,281,168 training images and 50,096 validation images!


In [11]:
total_steps

2583

In [None]:
# Inspect a batch from train_loader1k
train_images, train_labels = next(iter(train_loader1k))
train_images, train_labels = train_images.to(device), train_labels.to(device)

print("Train images batch shape:", train_images.shape)
print("Train labels batch shape:", train_labels.shape)
print("Train images data type:", train_images.dtype)
print("Train labels data type:", train_labels.dtype)

# Inspect a batch from val_loader1k
val_images, val_labels = next(iter(val_loader1k))

print("Validation images batch shape:", val_images.shape)
print("Validation labels batch shape:", val_labels.shape)
print("Validation images data type:", val_images.dtype)
print("Validation labels data type:", val_labels.dtype)

### TESTING COMPONENTS OF vit.py IN IPYNB BEFORE MOVING TO .PY FILE

In [None]:
# Importing required PyTorch libraries
import torch
import torch.nn as nn
import torch.nn.functional as F

# Class for Image Preprocessing
class ImagePreprocessor(nn.Module):
    def __init__(self, patch_size):
        super(ImagePreprocessor, self).__init__()
        self.patch_size = patch_size  # Size of each patch

    def forward(self, x):
        # Dynamically get the batch size and channel dimensions
        batch_size, channel, _, _ = x.size()

        # Using unfold to create patches
        x_p = x.unfold(2, self.patch_size, self.patch_size).unfold(3, self.patch_size, self.patch_size)

        # Reshape into the desired shape
        x_p = x_p.permute(0, 2, 3, 1, 4, 5).contiguous()
        x_p = x_p.view(batch_size, -1, self.patch_size * self.patch_size * channel)

        # Now x_p should have shape [batch_size, (Height * Width) / (patch_size * patch_size), (patch_size * patch_size * channel)]
        
        return x_p

# Class for Patch Embedding
class PatchEmbedding(nn.Module):
    def __init__(self, patch_dim, D):
        super(PatchEmbedding, self).__init__()
        self.D = D  # Dimension to project to
        self.linear = nn.Linear(patch_dim, D)  # Linear projection layer

    def forward(self, x_p):
        # Project patches to D dimensions
        x_emb = self.linear(x_p)
        return x_emb

# Class for adding a Class Token
class ClassToken(nn.Module):
    def __init__(self, D):
        super(ClassToken, self).__init__()
        self.class_token_embedding = nn.Parameter(torch.randn(1, 1, D))  # Learnable class token

    def forward(self, x_emb):
        # Prepend class token to patch embeddings
        batch_size = x_emb.size(0)
        class_token = self.class_token_embedding.repeat(batch_size, 1, 1)
        x_class = torch.cat([class_token, x_emb], dim=1)
        return x_class

# Class for Position Embeddings
class PositionEmbedding(nn.Module):
    def __init__(self, seq_len, D):
        super(PositionEmbedding, self).__init__()
        self.position_embeddings = nn.Parameter(torch.randn(1, seq_len, D))  # Learnable position embeddings

    def forward(self, x_class):
        # Add position embeddings
        x_pos = x_class + self.position_embeddings
        return x_pos

# Class for Transformer Encoder
class TransformerEncoder(nn.Module):
    def __init__(self, D, num_layers):
        super(TransformerEncoder, self).__init__()
        self.num_layers = num_layers
        self.layer_norm = nn.LayerNorm(D)
        self.multihead_attention = nn.MultiheadAttention(D, num_heads=4, batch_first=True)
        self.mlp = nn.Sequential(
            nn.Linear(D, D),
            nn.GELU(),
            nn.Linear(D, D)
        )

    def forward(self, x_pos):
        # Transformer Encoder Logic
        for _ in range(self.num_layers):
            x_norm = self.layer_norm(x_pos)
            x_att, _ = self.multihead_attention(x_norm, x_norm, x_norm)
            x_pos = x_pos + x_att
            x_pos = x_pos + self.mlp(self.layer_norm(x_pos))
        return x_pos

# Class for Classification Head
class ClassificationHead(nn.Module):
    def __init__(self, D, num_classes):
        super(ClassificationHead, self).__init__()
        self.linear = nn.Linear(D, num_classes)  # Linear layer for classification

    def forward(self, x_transformed):
        # Take the class token and perform classification
        x_class_token = x_transformed[:, 0, :]
        print("x_class_token.shape: ", x_class_token.shape)
        output = self.linear(x_class_token)
        return output

# Main Vision Transformer Class
class VisionTransformerTest(nn.Module):
    def __init__(self, patch_size, D, num_layers, num_classes):
        super(VisionTransformerTest, self).__init__()
        self.image_preprocessor = ImagePreprocessor(patch_size)
        self.patch_embedding = PatchEmbedding(patch_size * patch_size * 3, D)  # 3 channels, patch_size x patch_size patches
        self.class_token = ClassToken(D)
        self.position_embedding = PositionEmbedding(197, D)  # 196 patches + 1 class token
        self.transformer_encoder = TransformerEncoder(D, num_layers)
        self.classification_head = ClassificationHead(D, num_classes)

    def forward(self, x):
        print("x.shape",x.shape)
        
        # Preprocess the image into patches
        x_p = self.image_preprocessor(x)
        print("x_p.shape: ", x_p.shape)

        # Generate patch embeddings
        x_emb = self.patch_embedding(x_p)
        print("x_emb.shape: ", x_emb.shape)

        # Prepend the class token
        x_class = self.class_token(x_emb)
        print("x_class.shape: ", x_class.shape)

        # Add position embeddings
        x_pos = self.position_embedding(x_class)
        print("x_pos.shape: ", x_pos.shape)

        # Pass through the Transformer Encoder
        x_transformed = self.transformer_encoder(x_pos)
        print("x_transformed.shape: ", x_transformed.shape)
    
        # Perform classification
        output = self.classification_head(x_transformed)
        print("output.shape: ", output.shape)
    
        return output

In [None]:
model_test = VisionTransformerTest(patch_size=patch_size_, D=D_, num_layers=num_layers_, num_classes=num_classes_)
model_test.to(device)
output = model_test(train_images)
print("Above was the transformation path of the data")

### TESTING COMPONENTS OF vit.py IN IPYNB BEFORE MOVING TO .PY FILE

In [12]:
train_losses = []
learning_rates = []

# Training Loop
for epoch_idx in range(num_epochs):
    model.train()
    for batch_idx, (train_images, train_labels) in enumerate(train_loader1k):
        train_images, train_labels = train_images.to(device), train_labels.to(device)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        train_outputs = model(train_images)

        # Compute the loss
        train_loss = criterion(train_outputs, train_labels)

        # Backward pass and optimization
        train_loss.backward()

        # Gradient Clip
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Step optimizer and the scheduler
        optimizer.step()
        scheduler.step()

        # Store metrics
        train_losses.append(train_loss.item())
        learning_rates.append(scheduler.get_last_lr()[0])  # Assumes optimizer has a single param group

        print(f"Epoch [{epoch_idx+1}/{num_epochs}], Step [{batch_idx+1}/{len(train_loader1k)}], Loss: {train_loss.item():.4f}")

: 

In [None]:
# Validation Loop
# NOTE: LOGITS TO MAX LOGIT FUNCTION MIGHT CHANGE DUE TO SPECIFIC NATURE OF VISION TRANSFORMER ALGORITHM

model.eval()
with torch.no_grad():

    correct_count = 0
    total_count = 0

    for val_images, val_labels in val_loader1k:
        val_images, val_labels = val_images.to(device), val_labels.to(device)

        # Logits
        val_outputs = model(val_images)

        # Let the index of the highest logit be the predicted class 
        _, val_predicted = torch.max(val_outputs.data, 1)

        # Update counts from this batch's values
        total_count += val_labels.size(0)
        correct_count += (val_predicted == val_labels).sum().item()

    # Print accuracy score
    print(f'Accuracy of the model on the validation images: {100 * correct_count / total_count}%')

In [None]:
# Save
torch.save(model, './models/vit-base-32p.pth')