### Make sure you follow the preprocessing instructions in the README.md file!

In [1]:
import os

# Lets see the directory structure of imagenet1k
def list_files(startpath):
    for root, dirs, files in os.walk(startpath):
        level = root.replace(startpath, '').count(os.sep)
        indent = ' ' * 4 * (level)
        print('{}{}/'.format(indent, os.path.basename(root)))
        subindent = ' ' * 4 * (level + 1)
        jpeg_files = [f for f in files if f.endswith('.JPEG')]
        if jpeg_files:  # if the list is not empty
            print('{}Number of JPEG files: {}'.format(subindent, len(jpeg_files)))
        for f in files:
            if f.endswith('.txt'):
                print('{}{}'.format(subindent, f))

In [23]:
#list_files('/mnt/imagenet1k_resized/ILSVRC2012_img_train/')

In [24]:
#list_files('/mnt/imagenet1k_resized/ILSVRC2012_img_val/')

In [25]:
#list_files('/mnt/imagenet21k_resized_new/imagenet21k_train/')

In [26]:
#list_files('/mnt/imagenet21k_resized_new/imagenet21k_val/')

### It is clear from the output of the above cells that preprocessing worked!

We are looking to see if the validation and training sets are organized in the same manner and that they are ordered the same.

This makes input into the `torchvision.datasets.ImageFolder` class work without a hitch!

In [12]:
# Importing necessary libraries to unzip `tiny-imagenet-200.zip`
import zipfile
import random
from PIL import Image
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np

# Importing pytorch libraries
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

In [55]:
# Importing custom VisionTransformer Model

from models.vit import VisionTransformer

In [56]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyperparameters
batch_size = 100
criterion = nn.CrossEntropyLoss()

patch_size_ = 16     # to be changed
D_ = 768             # to be changed
num_layers_ = 12     # to be changed
num_classes_ = 1000

num_epochs = 10

model = VisionTransformer(patch_size=patch_size_, D=D_, num_layers=num_layers_, num_classes=num_classes_)
model.to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

# Define a transform for training data
train_transform = transforms.Compose([
    transforms.Pad(4),  # Pad the image by 4 pixels
    transforms.RandomCrop(224),  # Randomly crop a 224x224 region from the padded image
    transforms.RandomHorizontalFlip(),  # Randomly flip the image horizontally
    transforms.ToTensor(),  # Convert the image to a tensor
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))  # Normalize to range [-1, 1]
])

# Define a transform for validation data
val_transform = transforms.Compose([
    transforms.ToTensor(),  # Convert the image to a tensor
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))  # Normalize to range [-1, 1]
])

In [57]:
print("Number of available CPU cores:", os.cpu_count())

Number of available CPU cores: 24


In [33]:
# Load ImageNet1k dataset and make DataLoaders
train_dataset1k = datasets.ImageFolder(root='/mnt/imagenet1k_resized/ILSVRC2012_img_train', transform=train_transform)
val_dataset1k = datasets.ImageFolder(root='/mnt/imagenet1k_resized/ILSVRC2012_img_val', transform=val_transform)

train_loader1k = DataLoader(dataset=train_dataset1k, batch_size=batch_size, shuffle=True, num_workers=20, pin_memory=True)
val_loader1k = DataLoader(dataset=val_dataset1k, batch_size=batch_size, shuffle=False, num_workers=20, pin_memory=True)

print(f"ImageNet-1k has {len(train_loader1k)*batch_size:,} training images and {len(val_loader1k)*batch_size:,} validation images!")

# Load ImageNet21k dataset and make DataLoaders
#train_dataset21k = datasets.ImageFolder(root='/mnt/imagenet21k_resized_new/imagenet21k_train', transform=train_transform)
#val_dataset21k = datasets.ImageFolder(root='/mnt/imagenet21k_resized_new/imagenet21k_val', transform=val_transform)

#train_loader21k = DataLoader(dataset=train_dataset21k, batch_size=batch_size, shuffle=True, num_workers=2, pin_memory=True)
#val_loader21k = DataLoader(dataset=val_dataset21k, batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=True)

#print(f"ImageNet-21k has {len(train_loader21k)*batch_size:,} training images and {len(val_loader21k)*batch_size:,} validation images!")

ImageNet-1k has 1,281,200 training images and 50,000 validation images!


In [34]:
# Inspect a batch from train_loader1k
train_images, train_labels = next(iter(train_loader1k))
print("Train images batch shape:", train_images.shape)
print("Train labels batch shape:", train_labels.shape)
print("Train images data type:", train_images.dtype)
print("Train labels data type:", train_labels.dtype)

# Inspect a batch from val_loader1k
val_images, val_labels = next(iter(val_loader1k))
print("Validation images batch shape:", val_images.shape)
print("Validation labels batch shape:", val_labels.shape)
print("Validation images data type:", val_images.dtype)
print("Validation labels data type:", val_labels.dtype)

Train images batch shape: torch.Size([100, 3, 224, 224])
Train labels batch shape: torch.Size([100])
Train images data type: torch.float32
Train labels data type: torch.int64
Validation images batch shape: torch.Size([100, 3, 224, 224])
Validation labels batch shape: torch.Size([100])
Validation images data type: torch.float32
Validation labels data type: torch.int64


### TESTING COMPONENTS OF vit.py IN IPYNB BEFORE MOVING TO .PY FILE

In [45]:
# Importing required PyTorch libraries
import torch
import torch.nn as nn
import torch.nn.functional as F

# Class for Image Preprocessing
class ImagePreprocessor(nn.Module):
    def __init__(self, patch_size):
        super(ImagePreprocessor, self).__init__()
        self.patch_size = patch_size  # Size of each patch

    def forward(self, x):
        # Reshape image into patches
        # We are using unfold to break the image into patches.
        # The unfold operation will take non-overlapping blocks of size patch_size x patch_size
        x_p = x.unfold(2, self.patch_size, self.patch_size).unfold(3, self.patch_size, self.patch_size)
        
        # Flattening patches
        # We need to flatten the patches while keeping the batch and channel dimensions intact
        # The view operation reshapes the tensor
        x_p = x_p.contiguous().view(x_p.size(0), x_p.size(1), -1, x_p.size(4) * x_p.size(5))
        
        return x_p

In [50]:
image_preprocessor = ImagePreprocessor(patch_size=16)
x_p = image_preprocessor(train_images)
x_p.shape

# I dont think this is the right shape the color dim needs to collapse into the last one also

torch.Size([100, 3, 196, 256])

### TESTING COMPONENTS OF vit.py IN IPYNB BEFORE MOVING TO .PY FILE

In [58]:
# Training Loop

for epoch_idx in range(num_epochs):
    model.train()
    for batch_idx, (train_images, train_labels) in enumerate(train_loader1k):
        train_images, train_labels = train_images.to(device), train_labels.to(device)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        train_outputs = model(train_images)

        # Compute the loss
        train_loss = criterion(train_outputs, train_labels)

        # Backward pass and optimization
        train_loss.backward()
        optimizer.step()

        print(f"Epoch [{epoch_idx+1}/{num_epochs}], Step [{batch_idx+1}/{len(train_loader1k)}], Loss: {train_loss.item():.4f}")

RuntimeError: mat1 and mat2 shapes cannot be multiplied (58800x256 and 768x768)

In [37]:
# Validation Loop
# NOTE: LOGITS TO MAX LOGIT FUNCTION MIGHT CHANGE DUE TO SPECIFIC NATURE OF VISION TRANSFORMER ALGORITHM

model.eval()
with torch.no_grad():

    correct_count = 0
    total_count = 0

    for val_images, val_labels in val_loader1k:
        val_images, val_labels = val_images.to(device), val_labels.to(device)

        # Logits
        val_outputs = model(val_images)

        # Let the index of the highest logit be the predicted class 
        _, val_predicted = torch.max(val_outputs.data, 1)

        # Update counts from this batch's values
        total_count += val_labels.size(0)
        correct_count += (val_predicted == val_labels).sum().item()

    # Print accuracy score
    print(f'Accuracy of the model on the validation images: {100 * correct_count / total_count}%')

NotImplementedError: Module [VisionTransformer] is missing the required "forward" function