In [None]:
# Mount Google Drive at /content/drive; this cell requires the Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
import zipfile
import os

# Where the archive lives on Drive, and where its contents are unpacked locally
zip_path = "/content/drive/MyDrive/images.zip"  # Update with correct file name
extract_path = "/content/images"  # Destination folder

# exist_ok=True keeps this cell re-runnable once the folder already exists
os.makedirs(extract_path, exist_ok=True)

# Unpack every member of the archive into the destination folder
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_path)

print("Extraction Complete! Files are in:", extract_path)


In [None]:
!pip install transformers timm torch torchvision pandas pillow

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torchvision import models
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import pandas as pd
import os
import numpy as np


In [None]:
import os
import pandas as pd
from PIL import Image

def clean_dataset(csv_file_path, img_dir_path):
    """
    Drop rows whose caption is empty/invalid or whose image cannot be decoded.

    The CSV at ``csv_file_path`` is read, filtered, and then OVERWRITTEN in
    place with the surviving rows (written without the pandas index).

    Args:
        csv_file_path (str): Path to the CSV file containing image labels. It
            must provide the columns ``image_name`` and ``text_corrected``.
        img_dir_path (str): Path to the directory containing the images; each
            ``image_name`` value is joined onto it.

    Returns:
        None. Progress and the number of removed rows are printed.
    """
    df = pd.read_csv(csv_file_path)

    indices_to_remove = []
    for index, row in df.iterrows():
        img_name = os.path.join(img_dir_path, str(row['image_name']))
        caption = row['text_corrected']

        # A missing caption is read by pandas as NaN (a float), so the
        # isinstance check rejects it together with blank strings.
        if not isinstance(caption, str) or not caption.strip():
            print(f"Removing entry with empty or invalid caption: {img_name}")
            indices_to_remove.append(index)
            continue

        # Fully decode the image to prove it is readable. The context manager
        # guarantees the underlying file handle is released for every row.
        try:
            with Image.open(img_name) as img:
                img.convert("RGB")
        except OSError:
            # OSError covers FileNotFoundError and IOError (an alias of OSError).
            print(f"Removing entry with unreadable image: {img_name}")
            indices_to_remove.append(index)
            # Optional: If you want to delete the image file itself:
            # os.remove(img_name)

    # Remove problematic entries from DataFrame:
    cleaned_df = df.drop(indices_to_remove)

    # Overwrite original CSV file with cleaned data:
    cleaned_df.to_csv(csv_file_path, index=False)

    print(f"Removed {len(indices_to_remove)} entries from the dataset.")

In [None]:
# Change these paths to match your dataset location
# NOTE: clean_dataset() overwrites the file at CSV_FILE_PATH with the filtered rows,
# so keep a backup copy of the original labels file.
CSV_FILE_PATH = "/content/labels.csv"  # Replace with actual path
IMG_DIR_PATH = "/content/images"  # Replace with actual path


In [None]:
# Text side: tokenizer that matches the pretrained text encoder (BERT/RoBERTa)
TEXT_MODEL = "bert-base-uncased"  # Change to "roberta-base" if using RoBERTa
tokenizer = AutoTokenizer.from_pretrained(TEXT_MODEL)

# Image side: resize to 224x224, convert to a tensor, then normalise each channel
image_transform = transforms.Compose(
    [
        transforms.Resize(size=(224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)


In [None]:
class MemeDataset(Dataset):
    """Dataset pairing each meme image with its caption and four labels.

    CSV columns are read by position: column 0 is the image file name,
    column 2 the caption ("text_corrected"), and columns 3-6 the humor,
    sarcasm, offense and motivation labels.

    Each item is a tuple ``(input_ids, attention_mask, image, labels)``:
      * ``input_ids`` / ``attention_mask``: 1-D tensors of length ``max_length``
        produced by the tokenizer,
      * ``image``: the RGB image after ``transform`` (the PIL image itself when
        ``transform`` is None),
      * ``labels``: float32 tensor of 4 values; a label string that is not in
        ``label_mapping`` becomes ``-1``.

    A sample with a missing/blank caption or an unreadable image is replaced by
    an all-zero placeholder whose four labels are ``-1``, so consumers can
    recognise it and mask it out.

    Args:
        csv_file (str): Path to the labels CSV.
        img_dir (str): Directory containing the image files.
        tokenizer: Callable tokenizer returning "input_ids" and "attention_mask".
        transform (callable, optional): Applied to the RGB PIL image.
        max_length (int): Token sequence length used for padding/truncation.
    """

    def __init__(self, csv_file, img_dir, tokenizer, transform=None, max_length=128):
        self.data = pd.read_csv(csv_file)
        self.img_dir = img_dir
        self.tokenizer = tokenizer
        self.transform = transform
        self.max_length = max_length

        # Label Mapping (string label -> ordinal value, shared by all four tasks)
        self.label_mapping = {
            "not_funny": 0, "funny": 1, "very_funny": 2, "hilarious": 3,
            "not_sarcastic": 0, "general": 1, "twisted_meaning": 2, "very_twisted": 3,
            "not_offensive": 0, "slight": 1, "very_offensive": 2, "hateful_offensive": 3,
            "not_motivational": 0, "motivational": 1
        }

    def __len__(self):
        return len(self.data)

    def _placeholder_sample(self):
        """Return the all-zero stand-in used for samples that cannot be loaded."""
        return (
            torch.zeros(self.max_length, dtype=torch.long),
            torch.zeros(self.max_length, dtype=torch.long),
            torch.zeros(3, 224, 224, dtype=torch.float32),
            torch.tensor([-1, -1, -1, -1], dtype=torch.float32),
        )

    def __getitem__(self, idx):
        img_name = os.path.join(self.img_dir, str(self.data.iloc[idx, 0]))
        try:
            # Validate the caption first so a row that will be skipped anyway
            # does not pay for decoding its image.
            caption = self.data.iloc[idx, 2]  # "text_corrected"
            if pd.isna(caption):
                # A missing cell is NaN; str(NaN) would be the text "nan" and
                # would slip past the emptiness check below.
                caption = ""
            elif not isinstance(caption, str):
                caption = str(caption)  # Convert to string if necessary

            if not caption.strip():
                print(f"Skipping empty caption for image: {img_name}")
                raise FileNotFoundError  # Routed to the placeholder branch below

            # Load Image; the context manager releases the file handle.
            with Image.open(img_name) as img:
                image = img.convert("RGB")
            if self.transform:
                image = self.transform(image)

            # Process Text
            encoding = self.tokenizer(caption, padding="max_length", truncation=True, max_length=self.max_length, return_tensors="pt")
            text_input_ids = encoding["input_ids"].squeeze(0)
            text_attention_mask = encoding["attention_mask"].squeeze(0)

            # Map Labels (columns 3-6); -1 marks a label missing from the mapping
            label_values = [
                self.label_mapping.get(str(self.data.iloc[idx, column]), -1)
                for column in (3, 4, 5, 6)
            ]
            labels = torch.tensor(label_values, dtype=torch.float32)

            return text_input_ids, text_attention_mask, image, labels

        except OSError:
            # OSError includes FileNotFoundError and the errors PIL raises for
            # unreadable image files, matching what clean_dataset() filters out.
            print(f"File not found or empty caption: {img_name}, skipping...")
            return self._placeholder_sample()

In [None]:
# Create Dataset & DataLoader
BATCH_SIZE = 16  # Tune to fit GPU memory

# clean_dataset rewrites the CSV in place, so the dataset below only sees valid rows
clean_dataset(CSV_FILE_PATH, IMG_DIR_PATH)
dataset = MemeDataset(CSV_FILE_PATH, IMG_DIR_PATH, tokenizer, transform=image_transform)
# The loader must exist before the sanity check below and the training cell use it
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

# Check a sample batch
for text_input_ids, text_attention_mask, images, labels in dataloader:
    print("Text input shape:", text_input_ids.shape)  # (batch_size, max_length)
    print("Image shape:", images.shape)  # (batch_size, 3, 224, 224)
    print("Labels shape:", labels.shape)  # (batch_size, 4)
    break


In [None]:
# Load Pretrained Text Transformer (BERT or RoBERTa)
text_encoder = AutoModel.from_pretrained(TEXT_MODEL)

# Load Pretrained Image Model (ResNet50 or ViT)
IMAGE_MODEL = "resnet50"  # Change to "vit_base_patch16_224" for ViT
if IMAGE_MODEL == "resnet50":
    # `pretrained=True` is deprecated in torchvision; IMAGENET1K_V1 is the
    # weight set that flag used to load.
    image_encoder = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
    image_encoder.fc = nn.Identity()  # Remove classification head
elif IMAGE_MODEL == "vit_base_patch16_224":
    import timm  # Only needed for the ViT option
    image_encoder = timm.create_model("vit_base_patch16_224", pretrained=True)
    image_encoder.head = nn.Identity()  # Remove classification head
else:
    # Fail here instead of leaving image_encoder undefined for later cells
    raise ValueError(f"Unsupported IMAGE_MODEL: {IMAGE_MODEL!r}")


In [None]:
class MultimodalModel(nn.Module):
    """Late-fusion regressor over a text encoder and an image encoder.

    The text feature is the first-token vector of the text encoder's
    ``last_hidden_state``; the image feature is the image encoder's output
    flattened to ``(batch, features)``. Both are concatenated, passed through
    a linear layer + ReLU, and mapped to 4 outputs
    (humor, sarcasm, offense, motivation).

    Args:
        text_model: Module called as ``text_model(input_ids=..., attention_mask=...)``
            and returning an object with ``last_hidden_state``.
        image_model: Module mapping an image batch to per-sample features.
        fusion_dim (int): Width of the fused hidden layer.
        text_embedding_dim (int, optional): Text feature size. Defaults to
            ``text_model.config.hidden_size`` when available, otherwise 768.
        image_embedding_dim (int, optional): Image feature size. Defaults to
            2048 when the global ``IMAGE_MODEL`` is "resnet50", otherwise 768.
    """

    def __init__(self, text_model, image_model, fusion_dim=512,
                 text_embedding_dim=None, image_embedding_dim=None):
        super(MultimodalModel, self).__init__()
        self.text_encoder = text_model
        self.image_encoder = image_model

        if text_embedding_dim is None:
            text_config = getattr(text_model, "config", None)
            text_embedding_dim = getattr(text_config, "hidden_size", 768)
        if image_embedding_dim is None:
            # Falls back to the notebook-level IMAGE_MODEL selection
            image_embedding_dim = 2048 if IMAGE_MODEL == "resnet50" else 768

        # Fusion layer
        self.fusion = nn.Linear(text_embedding_dim + image_embedding_dim, fusion_dim)
        self.relu = nn.ReLU()

        # Regression head (Predicts Humor, Sarcasm, Offense, Motivation)
        self.regression_head = nn.Linear(fusion_dim, 4)

    def forward(self, text_input_ids, text_attention_mask, image_tensor):
        """Return a ``(batch, 4)`` tensor of predicted scores."""
        # Text encoding: keep the first-token vector of every sequence
        text_features = self.text_encoder(input_ids=text_input_ids, attention_mask=text_attention_mask).last_hidden_state[:, 0, :]

        # Image encoding: flatten everything except the batch dimension.
        # A bare squeeze() would also drop the batch dimension when the batch
        # holds a single sample, which breaks the concatenation below.
        image_features = self.image_encoder(image_tensor).flatten(start_dim=1)

        # Fusion
        fused_features = torch.cat((text_features, image_features), dim=1)
        fused_output = self.relu(self.fusion(fused_features))

        # Regression head
        output = self.regression_head(fused_output)  # Outputs 4 values
        return output


In [None]:
# Run on the GPU when one is available, otherwise on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Wrap both encoders in the fusion model and move it to the chosen device
model = MultimodalModel(text_model=text_encoder, image_model=image_encoder)
model = model.to(device)

# Mean-squared-error objective, optimised with Adam over all model parameters
criterion = nn.MSELoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-4)


In [None]:
from sklearn.metrics import f1_score, accuracy_score

epochs = 5

# Output order of the regression head and the largest valid label per category
CATEGORY_NAMES = ("Humor", "Sarcasm", "Offense", "Motivation")
CATEGORY_MAX = np.array([3, 3, 3, 1])

for epoch in range(epochs):
    total_loss = 0.0
    num_batches = 0
    all_preds = []
    all_labels = []

    model.train()

    for text_input_ids, text_attention_mask, images, labels in dataloader:
        text_input_ids, text_attention_mask, images, labels = (
            text_input_ids.to(device), text_attention_mask.to(device), images.to(device), labels.to(device)
        )

        # -1 is the dataset's sentinel for "label unavailable" (unmapped label
        # or placeholder sample); it must not be regressed as a real target.
        valid = labels >= 0
        if not valid.any():
            continue

        optimizer.zero_grad()
        preds = model(text_input_ids, text_attention_mask, images)
        loss = criterion(preds[valid], labels[valid])
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

        # Store predictions & true labels for metric calculation
        all_preds.append(preds.detach().cpu().numpy())
        all_labels.append(labels.cpu().numpy())

    if num_batches == 0:
        # np.vstack cannot stack an empty list; nothing was trained this epoch
        print(f"Epoch [{epoch+1}/{epochs}] - no batches with valid labels, skipping metrics")
        print("-" * 80)
        continue

    # Convert to numpy arrays
    all_preds = np.vstack(all_preds)  # Shape: (num_samples, 4)
    all_labels = np.vstack(all_labels).astype(int)  # Shape: (num_samples, 4)

    # Convert Regression Outputs to Discrete Labels: round, then clip into each
    # category's valid range so stray values do not become extra classes in macro-F1
    all_preds = np.clip(np.round(all_preds), 0, CATEGORY_MAX).astype(int)

    # Compute Metrics for Each Category on the samples whose label is known
    f1_scores = []
    accuracies = []
    for column in range(len(CATEGORY_NAMES)):
        known = all_labels[:, column] >= 0
        if not known.any():
            f1_scores.append(float("nan"))
            accuracies.append(float("nan"))
            continue
        y_true = all_labels[known, column]
        y_pred = all_preds[known, column]
        f1_scores.append(f1_score(y_true, y_pred, average="macro"))
        accuracies.append(accuracy_score(y_true, y_pred))

    avg_f1 = sum(f1_scores) / len(f1_scores)
    avg_acc = sum(accuracies) / len(accuracies)

    # Print Metrics
    f1_text = ", ".join(f"{name}: {score:.4f}" for name, score in zip(CATEGORY_NAMES, f1_scores))
    acc_text = ", ".join(f"{name}: {score:.4f}" for name, score in zip(CATEGORY_NAMES, accuracies))
    print(f"Epoch [{epoch+1}/{epochs}] - Loss: {total_loss / num_batches:.4f}")
    print(f"  F1 Scores  - {f1_text}, Avg: {avg_f1:.4f}")
    print(f"  Accuracy   - {acc_text}, Avg: {avg_acc:.4f}")
    print("-" * 80)
