# Serial Execution of Vision Transformer on 1 GPU

## Run the code cells below 


## What this code does
- Loads the data with a PyTorch `DataLoader`
- Trains the Vision Transformer on 1 GPU
- Trains the Vision Transformer on 2 GPUs using `torch.nn.DataParallel`
- Both configurations were tested on P100 GPUs
- The DataParallel run was done as a learning exercise only; it was not used for the analysis

In [1]:
import os
import time
import pandas as pd
from PIL import Image
import torch
import torch.nn as nn
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import ViTFeatureExtractor
from sklearn.metrics import accuracy_score


# ---- Training hyperparameters ----
NUM_EPOCHS = 2
BATCH_SIZE = 62
IMG_SIZE = 224  # side length (pixels) every image is resized to before it enters the ViT
SUBSET_SIZE = 12000  # NOTE(review): not referenced by any visible cell — confirm before relying on it

# ---- Dataset locations (all relative to DATASET_DIR) ----
DATASET_DIR = "dataset"
CSV_PATH, IMAGE_DIR = (
    os.path.join(DATASET_DIR, leaf) for leaf in ("train.csv", "train_data")
)


# Load the annotation table, tidy its header, and shuffle the rows once with a
# fixed seed so that the positional train/val/test split is reproducible.
df = (
    pd.read_csv(CSV_PATH)
    .rename(columns=str.strip)                      # strip stray whitespace from column names
    .drop(columns=['Unnamed: 0'], errors='ignore')  # drop the saved index column when present
    .sample(frac=1.0, random_state=42)              # full-frame shuffle
    .reset_index(drop=True)
)

# The pretrained checkpoint's feature extractor supplies the per-channel
# mean/std used to normalise the input images.
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')
_normalize = transforms.Normalize(
    mean=feature_extractor.image_mean,
    std=feature_extractor.image_std,
)
transform = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.ToTensor(),
    _normalize,
])

# Custom Dataset
class ImageDataset(Dataset):
    """Map-style dataset yielding ``(image, label)`` pairs described by a DataFrame.

    The frame must provide a ``file_name`` column and a ``label`` column.
    Only the basename of ``file_name`` is used; it is resolved against
    ``img_dir``. ``label`` must be convertible to ``int``.

    Args:
        dataframe: Table with one row per sample.
        img_dir: Directory that contains the image files.
        transform: Optional callable applied to the loaded RGB ``PIL`` image.
    """

    def __init__(self, dataframe, img_dir, transform=None):
        self.df = dataframe
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        """Return the number of rows (samples) in the underlying frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load the sample at *position* ``idx`` and return ``(image, label)``."""
        # Positional lookup: the index handed to __getitem__ is a position in
        # 0..len-1, so this works even when the frame's index labels are not
        # 0..len-1 (e.g. a slice that was not reset).
        row = self.df.iloc[idx]
        img_path = os.path.join(self.img_dir, os.path.basename(row['file_name']))

        # Close the file handle deterministically; convert() returns a new,
        # fully loaded image that no longer depends on the open file.
        with Image.open(img_path) as handle:
            image = handle.convert('RGB')
        label = int(row['label'])

        if self.transform is not None:
            image = self.transform(image)

        return image, label



# Positional split of the shuffled frame: 40k train + 5k validation + 5k test.
# The three parts are consecutive, non-overlapping row ranges.
TRAIN_SIZE = 40000
VAL_SIZE = 5000
TEST_SIZE = 5000
NUM_WORKERS = 2  # worker processes per DataLoader

_val_start = TRAIN_SIZE
_test_start = _val_start + VAL_SIZE
_test_end = _test_start + TEST_SIZE

train_df = df.iloc[:_val_start].reset_index(drop=True)
val_df = df.iloc[_val_start:_test_start].reset_index(drop=True)
test_df = df.iloc[_test_start:_test_end].reset_index(drop=True)

# Create datasets
train_dataset = ImageDataset(train_df, IMAGE_DIR, transform)
val_dataset = ImageDataset(val_df, IMAGE_DIR, transform)
test_dataset = ImageDataset(test_df, IMAGE_DIR, transform)

# Create loaders (only the training loader reshuffles every epoch)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS)





In [2]:
def train(model, train_loader, criterion, optimizer, device):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: Module whose forward output exposes a ``.logits`` attribute.
        train_loader: Iterable of ``(images, labels)`` tensor batches.
        criterion: Loss callable invoked as ``criterion(logits, labels)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        device: Device the batches are moved to before the forward pass.

    Returns:
        The sum of the batch losses divided by the number of batches that
        were actually processed, or ``0.0`` when the loader yields no batch.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images).logits
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        num_batches += 1

    # Count batches instead of calling len(): this avoids a ZeroDivisionError
    # on an empty loader and also supports iterables that have no __len__.
    if num_batches == 0:
        return 0.0
    return total_loss / num_batches

def evaluate(model, test_loader, device):
    """Return the accuracy of ``model`` over every batch in ``test_loader``.

    The model is put in eval mode and run without gradient tracking. The
    predicted class of each sample is the argmax over dim 1 of the model's
    ``.logits``; predictions and labels are gathered on the CPU and scored
    with ``accuracy_score``.
    """
    model.eval()
    predicted, expected = [], []
    with torch.no_grad():
        for batch_images, batch_labels in test_loader:
            logits = model(batch_images.to(device)).logits
            predicted.extend(logits.argmax(dim=1).cpu().numpy())
            expected.extend(batch_labels.numpy())
    return accuracy_score(expected, predicted)


In [3]:


import torch
import time
from transformers import ViTForImageClassification, ViTFeatureExtractor

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# torch.cuda memory statistics can only be queried for a CUDA device, so they
# are skipped when the run falls back to the CPU.
track_cuda_memory = device.type == "cuda"

# Pretrained ViT with its classification head replaced by a 2-class head;
# ignore_mismatched_sizes lets the differently shaped head be re-initialised.
model = ViTForImageClassification.from_pretrained(
    'google/vit-base-patch16-224',
    num_labels=2,
    ignore_mismatched_sizes=True
).to(device)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

start_time = time.time()

for epoch in range(NUM_EPOCHS):
    print(f"\nEpoch {epoch + 1} started")

    # Restart peak tracking so the peaks printed below cover this epoch only.
    if track_cuda_memory:
        torch.cuda.reset_peak_memory_stats(device)

    train_loss = train(model, train_loader, criterion, optimizer, device)
    acc = evaluate(model, test_loader, device)

    print(f"[GPU 1] Epoch {epoch + 1}, Loss: {train_loss:.4f}, Accuracy: {acc:.4f}")

    if track_cuda_memory:
        # All figures are converted from bytes to MB.
        mem_allocated = torch.cuda.memory_allocated(device) / (1024 ** 2)
        mem_reserved = torch.cuda.memory_reserved(device) / (1024 ** 2)
        peak_mem_allocated = torch.cuda.max_memory_allocated(device) / (1024 ** 2)
        peak_mem_reserved = torch.cuda.max_memory_reserved(device) / (1024 ** 2)

        print(f"[GPU 1] Memory Allocated: {mem_allocated:.2f} MB")
        print(f"[GPU 1] Memory Reserved: {mem_reserved:.2f} MB")
        print(f"[GPU 1] Peak Memory Allocated: {peak_mem_allocated:.2f} MB")
        print(f"[GPU 1] Peak Memory Reserved: {peak_mem_reserved:.2f} MB")

end_time = time.time()
print(f"\nTotal training time on 1 GPU: {(end_time - start_time):.2f} seconds")



Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1 started


KeyboardInterrupt: 

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns



def evaluate_with_metrics(model, loader, device):
    """Evaluate ``model`` on ``loader`` and return ``(accuracy, confusion_matrix, report)``.

    Predictions are the argmax over dim 1 of the model's ``.logits``, computed
    in eval mode without gradient tracking. The text report labels the two
    classes "Real" and "AI", in that order.
    """
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for batch_images, batch_labels in loader:
            logits = model(batch_images.to(device)).logits
            y_pred.extend(logits.argmax(dim=1).cpu().numpy())
            y_true.extend(batch_labels.numpy())

    accuracy = accuracy_score(y_true, y_pred)
    matrix = confusion_matrix(y_true, y_pred)
    summary = classification_report(y_true, y_pred, target_names=["Real", "AI"])
    return accuracy, matrix, summary


def plot_confusion_matrix(cm, title="Confusion Matrix"):
    """Draw ``cm`` as an annotated blue heatmap with "Real"/"AI" tick labels and show it."""
    class_names = ["Real", "AI"]

    plt.figure(figsize=(6, 5))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",  # cells are rendered as integers
        cmap="Blues",
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.xlabel("Predicted Labels")
    plt.ylabel("True Labels")
    plt.title(title)
    plt.tight_layout()
    plt.show()


In [None]:
# Score the current model on the validation loader, print every metric, then
# plot the confusion matrix.
val_acc, val_cm, val_report = evaluate_with_metrics(model, val_loader, device)

for _heading, _value in (
    ("\n[GPU 1] Validation Accuracy:", val_acc),
    ("[GPU 1] Confusion Matrix:\n", val_cm),
    ("[GPU 1] Classification Report:\n", val_report),
):
    print(_heading, _value)


plot_confusion_matrix(val_cm, title="[GPU 1] Validation Confusion Matrix")

In [None]:
import torch
import time
from transformers import ViTForImageClassification

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# torch.cuda memory statistics can only be queried for a CUDA device, so they
# are skipped when the run falls back to the CPU.
track_cuda_memory = device.type == "cuda"

# Fresh pretrained ViT with a 2-class head, so this run does not continue
# from the weights trained in an earlier cell.
model = ViTForImageClassification.from_pretrained(
    'google/vit-base-patch16-224',
    num_labels=2,
    ignore_mismatched_sizes=True
).to(device)

# Use DataParallel for multi-GPU if available; with a single (or no) GPU the
# model is left unwrapped and the loop below runs on `device` alone.
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs")
    model = torch.nn.DataParallel(model)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

print("Starting 2-GPU Training")
start_time = time.time()

for epoch in range(NUM_EPOCHS):
    print(f"\nEpoch {epoch + 1} started")

    # Restart peak tracking so the peaks printed below cover this epoch only.
    if track_cuda_memory:
        torch.cuda.reset_peak_memory_stats(device)

    train_loss = train(model, train_loader, criterion, optimizer, device)
    acc = evaluate(model, test_loader, device)

    print(f"[GPU 2] Epoch {epoch + 1}, Loss: {train_loss:.4f}, Accuracy: {acc:.4f}")

    if track_cuda_memory:
        # The statistics are queried for `device` (cuda:0) only; memory used
        # on any other GPU is not included. Figures are converted to MB.
        mem_allocated = torch.cuda.memory_allocated(device) / (1024 ** 2)
        mem_reserved = torch.cuda.memory_reserved(device) / (1024 ** 2)
        peak_mem_allocated = torch.cuda.max_memory_allocated(device) / (1024 ** 2)
        peak_mem_reserved = torch.cuda.max_memory_reserved(device) / (1024 ** 2)

        print(f"[GPU 2] Memory Allocated: {mem_allocated:.2f} MB")
        print(f"[GPU 2] Memory Reserved: {mem_reserved:.2f} MB")
        print(f"[GPU 2] Peak Memory Allocated: {peak_mem_allocated:.2f} MB")
        print(f"[GPU 2] Peak Memory Reserved: {peak_mem_reserved:.2f} MB")

end_time = time.time()
print(f"\nTotal training time on 2 GPUs: {(end_time - start_time):.2f} seconds")



In [None]:
# Multi-GPU run: train a fresh model under DataParallel, then report the
# validation metrics. The whole cell is skipped unless more than one CUDA
# device is visible.
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs")
    model = ViTForImageClassification.from_pretrained(
        'google/vit-base-patch16-224',
        num_labels=2,
        ignore_mismatched_sizes=True
    ).to(device)
    model = torch.nn.DataParallel(model)

    # Build the loss here as well so the cell does not rely on a `criterion`
    # left behind by an earlier training cell.
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

    start_time = time.time()
    for epoch in range(NUM_EPOCHS):
        train_loss = train(model, train_loader, criterion, optimizer, device)
        acc = evaluate(model, test_loader, device)
        print(f"[GPU 2] Epoch {epoch+1}, Loss: {train_loss:.4f}, Accuracy: {acc:.4f}")
    end_time = time.time()
    print(f"Total training time on 2 GPUs: {(end_time - start_time):.2f} seconds")

    # Evaluate on validation set
    val_acc, val_cm, val_report = evaluate_with_metrics(model, val_loader, device)
    print("\n[GPU 2] Validation Accuracy:", val_acc)
    print("[GPU 2] Confusion Matrix:\n", val_cm)
    print("[GPU 2] Classification Report:\n", val_report)

    # Title uses the same "[GPU 2]" tag as the rest of this cell's output.
    plot_confusion_matrix(val_cm, title="[GPU 2] Validation Confusion Matrix")

