In [1]:
import torch

# Set PyTorch to use GPU 1
device = torch.device("cuda:1")
torch.cuda.set_device(device)

# Allocate a tensor to initialize GPU 1
x = torch.rand(1000, 1000, device=device)

# Verify the GPU being used
print(f"Using GPU: {torch.cuda.get_device_name(device.index)}")
print(f"Memory Allocated on GPU 1: {torch.cuda.memory_allocated(device.index) / 1024 ** 2:.2f} MB")
print(f"Memory Reserved on GPU 1: {torch.cuda.memory_reserved(device.index) / 1024 ** 2:.2f} MB")

Using GPU: Tesla V100-DGXS-32GB
Memory Allocated on GPU 1: 3.81 MB
Memory Reserved on GPU 1: 20.00 MB


In [1]:
import json
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from PIL import Image
from torchvision.transforms import Compose, ToTensor, Normalize, RandomHorizontalFlip, RandomResizedCrop, ColorJitter, RandomRotation
from transformers import XLMRobertaTokenizer, XLMRobertaModel
from timm import create_model

# Check for GPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Define Image Transformations with more augmentation
image_transform = Compose([
    RandomHorizontalFlip(p=0.5),
    RandomResizedCrop(224, scale=(0.8, 1.0)),
    RandomRotation(20),
    ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3, hue=0.1),
    ToTensor(),
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Standard ImageNet normalization
])

# Load XLM-RoBERTa tokenizer
xlmr_tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")

# Dataset class
class MultimodalDataset(Dataset):
    def __init__(self, data, image_transform, tokenizer):
        self.data = data
        self.image_transform = image_transform
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        record = self.data[idx]
        image_path = record["img"]
        image = Image.open(image_path).convert("RGB")
        image = self.image_transform(image)

        text = record["text"]
        text_tokens = self.tokenizer(
            text,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=128
        )

        label = torch.tensor(record["label"], dtype=torch.float)
        return image, text_tokens["input_ids"].squeeze(0), text_tokens["attention_mask"].squeeze(0), label

# Improved CLIP-Like Fusion Module with ReLU
class ImprovedCLIPFusion(nn.Module):
    def __init__(self, image_dim, text_dim, fusion_dim):
        super(ImprovedCLIPFusion, self).__init__()
        self.image_proj = nn.Linear(image_dim, fusion_dim)
        self.text_proj = nn.Linear(text_dim, fusion_dim)
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(0.5)
        self.fc1 = nn.Linear(fusion_dim, fusion_dim)
        self.fc2 = nn.Linear(fusion_dim, fusion_dim)
        self.fc3 = nn.Linear(fusion_dim, 1)

    def forward(self, image_features, text_features):
        image_proj = self.activation(self.image_proj(image_features))
        text_proj = self.activation(self.text_proj(text_features))
        fused_features = self.activation(image_proj + text_proj)
        fused_features = self.dropout(fused_features)
        fused_features = self.activation(self.fc1(fused_features))
        fused_features = self.activation(self.fc2(fused_features))
        output = self.fc3(fused_features)
        return output

# Define the Complete Multimodal Model with Swin-Large
class MultimodalClassifier(nn.Module):
    def __init__(self, swin_model_name, xlmr_model_name, fusion_dim):
        super(MultimodalClassifier, self).__init__()
        self.swin_model = create_model(swin_model_name, pretrained=True, num_classes=0)  # Swin-Large
        self.xlmr_model = XLMRobertaModel.from_pretrained(xlmr_model_name)
        self.clip_fusion = ImprovedCLIPFusion(
            image_dim=self.swin_model.num_features,
            text_dim=self.xlmr_model.config.hidden_size,
            fusion_dim=fusion_dim
        )

    def forward(self, images, texts, attention_masks):
        image_features = self.swin_model(images)
        text_features = self.xlmr_model(input_ids=texts, attention_mask=attention_masks).last_hidden_state[:, 0, :]  # Use [CLS] token
        output = self.clip_fusion(image_features, text_features)
        return output

# Function to load datasets
def load_dataset(file_path, is_jsonl=False):
    data = []
    with open(file_path, "r") as f:
        if is_jsonl:
            for line in f:
                try:
                    record = json.loads(line.strip())
                    data.append(record)
                except json.JSONDecodeError:
                    continue
        else:
            try:
                data = json.load(f)
            except json.JSONDecodeError:
                print(f"Error decoding JSON file: {file_path}")
    return data

# Training and Fine-Tuning Function
def train_and_fine_tune_model(model, train_loader, fine_tune_loader, criterion, optimizer, scheduler, device, train_epochs, fine_tune_epochs, save_path=None):
    scaler = torch.amp.GradScaler()
    model.train()

    for epoch in range(train_epochs + fine_tune_epochs):
        epoch_loss = 0.0

        # Gradual unfreezing of pre-trained models
        if epoch < 5:
            for param in model.swin_model.parameters():
                param.requires_grad = False
            for param in model.xlmr_model.parameters():
                param.requires_grad = False
        else:
            for param in model.swin_model.parameters():
                param.requires_grad = True
            for param in model.xlmr_model.parameters():
                param.requires_grad = True

        loader = train_loader if epoch < train_epochs else fine_tune_loader
        phase = "Training" if epoch < train_epochs else "Fine-Tuning"

        for images, texts, attention_masks, labels in loader:
            images, texts, attention_masks, labels = (
                images.to(device),
                texts.to(device),
                attention_masks.to(device),
                labels.to(device),
            )
            optimizer.zero_grad()

            with torch.amp.autocast(device_type='cuda'):
                outputs = model(images, texts, attention_masks).squeeze()
                loss = criterion(outputs, labels)

            scaler.scale(loss).backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Gradient clipping
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()

            epoch_loss += loss.item()

        print(f"{phase} Epoch [{epoch + 1}/{train_epochs + fine_tune_epochs}], Loss: {epoch_loss / len(loader):.4f}")

    if save_path:
        save_model(model, save_path)

# Save and Load Functions
def save_model(model, path):
    torch.save(model.state_dict(), path)
    print(f"Model saved to {path}")

def load_model(model, path):
    if os.path.exists(path):
        model.load_state_dict(torch.load(path))
        model.eval()
        print(f"Model loaded from {path}")
    else:
        raise FileNotFoundError(f"Model file not found at {path}")

# Evaluation Function
def evaluate_model(model, dataloader, device):
    model.eval()
    y_true, y_pred = [], []

    with torch.no_grad():
        for images, texts, attention_masks, labels in dataloader:
            images, texts, attention_masks, labels = (
                images.to(device),
                texts.to(device),
                attention_masks.to(device),
                labels.to(device),
            )
            outputs = model(images, texts, attention_masks).squeeze()
            predictions = (torch.sigmoid(outputs) > 0.5).float()
            y_true.extend(labels.cpu().numpy())
            y_pred.extend(predictions.cpu().numpy())

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    print(f"Accuracy: {accuracy * 100:.2f}%")
    print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1-Score: {f1:.2f}")
    return accuracy, precision, recall, f1

# Paths
train_file = "/home/kalyanreddy/Datasets/English/Train.jsonl"
fine_tune_file = "/home/kalyanreddy/Datasets/Finetune.json"
test_file = "/home/kalyanreddy/Datasets/MultiTest.json"
model_save_path = "multimodal_clip_model_swin_large.pth"

# DataLoader parameters
batch_size = 64

# Load datasets
train_data = load_dataset(train_file, is_jsonl=True)
fine_tune_data = load_dataset(fine_tune_file, is_jsonl=False)
test_data = load_dataset(test_file, is_jsonl=False)

train_loader = DataLoader(MultimodalDataset(train_data, image_transform, xlmr_tokenizer), batch_size=batch_size, shuffle=True)
fine_tune_loader = DataLoader(MultimodalDataset(fine_tune_data, image_transform, xlmr_tokenizer), batch_size=batch_size, shuffle=True)
test_loader = DataLoader(MultimodalDataset(test_data, image_transform, xlmr_tokenizer), batch_size=batch_size, shuffle=False)

# Initialize model with Swin-Large
multimodal_model = MultimodalClassifier(
    swin_model_name="swin_large_patch4_window7_224",  # Swin-Large
    xlmr_model_name="xlm-roberta-base",
    fusion_dim=1024  # Increased fusion dimension
).to(device)

# Loss, Optimizer, and Scheduler
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(
    [{'params': multimodal_model.clip_fusion.parameters(), 'lr': 1e-4},
     {'params': multimodal_model.swin_model.parameters(), 'lr': 1e-5},
     {'params': multimodal_model.xlmr_model.parameters(), 'lr': 1e-5}],
    weight_decay=1e-2
)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10)




  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Train and Fine-Tune Model
train_and_fine_tune_model(
    multimodal_model,
    train_loader,
    fine_tune_loader,
    criterion,
    optimizer,
    scheduler,
    device,
    train_epochs=25,
    fine_tune_epochs=15,
    save_path=model_save_path
)

Training Epoch [1/40], Loss: 0.6580
Training Epoch [2/40], Loss: 0.6530
Training Epoch [3/40], Loss: 0.6446
Training Epoch [4/40], Loss: 0.6427
Training Epoch [5/40], Loss: 0.6334
Training Epoch [6/40], Loss: 0.6304
Training Epoch [7/40], Loss: 0.6277
Training Epoch [8/40], Loss: 0.6230
Training Epoch [9/40], Loss: 0.6186
Training Epoch [10/40], Loss: 0.6190
Training Epoch [11/40], Loss: 0.6056
Training Epoch [12/40], Loss: 0.5672
Training Epoch [13/40], Loss: 0.5672
Training Epoch [14/40], Loss: 0.5427
Training Epoch [15/40], Loss: 0.5217
Training Epoch [16/40], Loss: 0.5023
Training Epoch [17/40], Loss: 0.4885
Training Epoch [18/40], Loss: 0.4855
Training Epoch [19/40], Loss: 0.4581
Training Epoch [20/40], Loss: 0.4402
Training Epoch [21/40], Loss: 0.4104
Training Epoch [22/40], Loss: 0.3841
Training Epoch [23/40], Loss: 0.3605
Training Epoch [24/40], Loss: 0.3401
Training Epoch [25/40], Loss: 0.3441
Fine-Tuning Epoch [26/40], Loss: 0.9293
Fine-Tuning Epoch [27/40], Loss: 0.7176
Fine

In [3]:

# Evaluate Model
evaluate_model(multimodal_model, test_loader, device)


Accuracy: 71.99%
Precision: 0.70, Recall: 0.77, F1-Score: 0.73


(0.7199197860962567,
 0.7017114914425427,
 0.7663551401869159,
 0.7326100829610721)