In [1]:
%%writefile ddp_resnet_train.py
 
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torchvision import transforms
from torchvision.models import resnet18, ResNet18_Weights
from torch.utils.data import Dataset, DataLoader, Subset
from torch.utils.data.distributed import DistributedSampler
import pandas as pd
from PIL import Image
import time
from datetime import datetime
import random
 
# --- Dataset ---
class TrainDataset(Dataset):
    def __init__(self, csv_file, transform=None, root_dir='dataset'):
        self.df = pd.read_csv(csv_file)
        self.transform = transform
        self.root_dir = root_dir
 
    def __len__(self):
        return len(self.df)
 
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        img_path = os.path.join(self.root_dir, row['file_name'])
        label = int(row['label'])
        image = Image.open(img_path).convert('RGB')
        if self.transform:
            image = self.transform(image)
        return image, label
 
# --- DDP Training Function ---
def main():
    dist.init_process_group("nccl")
    local_rank = int(os.environ["LOCAL_RANK"])
    torch.cuda.set_device(local_rank)
    device = torch.device("cuda", local_rank)
 
    # Transforms
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406],
                             [0.229, 0.224, 0.225])
    ])
 
    # # Load full dataset
    # dataset = TrainDataset("dataset/train.csv", transform=transform)
    # total_len = len(full_dataset)
    # half_len = total_len // 2
    # subset_indices = random.sample(range(total_len), half_len)
    # sampler = DistributedSampler(dataset)
    # dataloader = DataLoader(dataset, batch_size=64, sampler=sampler, num_workers=2)

    # Load full dataset
    full_dataset = TrainDataset("dataset/train.csv", transform=transform)
    
    # Use only 10% of the dataset
    total_len = len(full_dataset)
    ten_percent_len = int(0.1 * total_len)
    subset_indices = random.sample(range(total_len), ten_percent_len)
    
    # Create the subset
    subset_dataset = Subset(full_dataset, subset_indices)
    
    # Create DistributedSampler on subset
    sampler = DistributedSampler(subset_dataset)
    
    # Dataloader on subset
    dataloader = DataLoader(subset_dataset, batch_size=64, sampler=sampler, num_workers=2)
 
    # Model
    model = resnet18(weights=ResNet18_Weights.DEFAULT)
    model.fc = nn.Linear(model.fc.in_features, 2)
    model.to(device)
    model = DDP(model, device_ids=[local_rank])
 
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-4)
 
    # --- Start Benchmarking ---
    if local_rank == 0:
        torch.cuda.reset_peak_memory_stats()
        start = time.time()
 
    for epoch in range(2): 
        model.train()
        sampler.set_epoch(epoch)
        total_loss, correct, total = 0, 0, 0
 
        for images, labels in dataloader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)
 
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
 
            total_loss += loss.item() * labels.size(0)
            correct += (outputs.argmax(1) == labels).sum().item()
            total += labels.size(0)
 
        if local_rank == 0:
            acc = correct / total
            print(f"[GPU {local_rank}] Epoch {epoch+1} | Loss: {total_loss/total:.4f} | Acc: {acc:.4f}")
 
    # --- End Benchmarking ---
    if local_rank == 0:
        end = time.time()
        duration = end - start
        mem = torch.cuda.max_memory_allocated() / 1e9
        print("\n--- Benchmark (DDP) ---")
        print(f"Time: {duration:.2f} s")
        print(f"Accuracy: {acc:.4f}")
        print(f"Max GPU Memory: {mem:.2f} GB")
 
        with open("benchmark_log.txt", "a") as f:
            f.write(f"[{datetime.now()}] DDP (No AMP) ({dist.get_world_size()} GPUs)\n")
            f.write(f"Time: {duration:.2f}s | Acc: {acc:.4f} | Mem: {mem:.2f} GB\n\n")
 
    if local_rank == 0:
        torch.save(model.module.state_dict(), "ddp_model.pth")
        print("Model saved to ddp_model.pth")
 
if __name__ == "__main__":
    main()

Overwriting ddp_resnet_train.py


In [2]:
!torchrun --nproc-per-node=1 ddp_resnet_train.py


[GPU 0] Epoch 1 | Loss: 0.1369 | Acc: 0.9442
[GPU 0] Epoch 2 | Loss: 0.0208 | Acc: 0.9939

--- Benchmark (DDP) ---
Time: 192.53 s
Accuracy: 0.9939
Max GPU Memory: 1.74 GB
Model saved to ddp_model.pth


In [3]:
!torchrun --nproc-per-node=2 ddp_resnet_train.py

W0415 20:06:35.555000 3112862 torch/distributed/run.py:793] 
W0415 20:06:35.555000 3112862 torch/distributed/run.py:793] *****************************************
W0415 20:06:35.555000 3112862 torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
W0415 20:06:35.555000 3112862 torch/distributed/run.py:793] *****************************************
[GPU 0] Epoch 1 | Loss: 0.1766 | Acc: 0.9177
[GPU 0] Epoch 2 | Loss: 0.0399 | Acc: 0.9862

--- Benchmark (DDP) ---
Time: 173.00 s
Accuracy: 0.9862
Max GPU Memory: 1.74 GB
Model saved to ddp_model.pth


In [4]:
!torchrun --nproc-per-node=3 ddp_resnet_train.py

W0415 20:09:52.767000 3113892 torch/distributed/run.py:793] 
W0415 20:09:52.767000 3113892 torch/distributed/run.py:793] *****************************************
W0415 20:09:52.767000 3113892 torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
W0415 20:09:52.767000 3113892 torch/distributed/run.py:793] *****************************************
[GPU 0] Epoch 1 | Loss: 0.1900 | Acc: 0.9197
[GPU 0] Epoch 2 | Loss: 0.0403 | Acc: 0.9865

--- Benchmark (DDP) ---
Time: 174.27 s
Accuracy: 0.9865
Max GPU Memory: 1.74 GB
Model saved to ddp_model.pth


In [5]:
!torchrun --nproc-per-node=4 ddp_resnet_train.py

W0415 20:19:55.130000 3116545 torch/distributed/run.py:793] 
W0415 20:19:55.130000 3116545 torch/distributed/run.py:793] *****************************************
W0415 20:19:55.130000 3116545 torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
W0415 20:19:55.130000 3116545 torch/distributed/run.py:793] *****************************************
[GPU 0] Epoch 1 | Loss: 0.2483 | Acc: 0.8784
[GPU 0] Epoch 2 | Loss: 0.0572 | Acc: 0.9770

--- Benchmark (DDP) ---
Time: 179.13 s
Accuracy: 0.9770
Max GPU Memory: 1.74 GB
Model saved to ddp_model.pth
