In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from datasets import load_dataset
from PIL import Image
from transformers import (
    CLIPProcessor, CLIPModel, CLIPVisionModel,
    AutoModel, AutoProcessor, AutoTokenizer,
    AutoModelForMaskedLM
)
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, WeightedRandomSampler
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
import numpy as np
from collections import Counter, defaultdict
import os
import json
from datetime import datetime
import pandas as pd


In [None]:
# Opening invocation printed before the pipeline starts; it defines nothing
# that later cells use.
print("Bismillah")

In [None]:
class CLIPArabic(nn.Module):
    """Dual-encoder wrapper that maps images and text into one shared space.

    Each encoder is followed by a linear projection to ``proj_dim``. The
    projected vectors are L2-normalised and returned together with a learnable
    temperature, which is stored as a log-scale and exponentiated on the way
    out.

    Args:
        image_encoder: Module called as ``image_encoder(pixel_values=...)``.
            Its feature width is read from ``embed_dim`` or, failing that,
            from ``config.hidden_size``.
        text_encoder: Module called as
            ``text_encoder(input_ids=..., attention_mask=...)`` whose output
            exposes ``last_hidden_state``; its width is read from
            ``config.hidden_size``.
        proj_dim: Width of the shared embedding space.
        freeze_encoders: When true, every encoder parameter gets
            ``requires_grad = False`` so that only the two projections and
            the temperature stay trainable.

    Raises:
        AttributeError: If either encoder's feature width cannot be inferred.
    """

    def __init__(self, image_encoder, text_encoder, proj_dim=512, freeze_encoders=True):
        super().__init__()
        self.img_enc = image_encoder
        self.txt_enc = text_encoder

        # Image width is resolved first so that its error wins when both fail.
        image_width = self._image_feature_dim(self.img_enc)
        text_width = self._text_feature_dim(self.txt_enc)

        # Linear heads into the shared space plus the log-temperature.
        self.img_proj = nn.Linear(image_width, proj_dim)
        self.txt_proj = nn.Linear(text_width, proj_dim)
        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1/0.07))

        if freeze_encoders:
            for encoder in (self.img_enc, self.txt_enc):
                for weight in encoder.parameters():
                    weight.requires_grad = False

    @staticmethod
    def _image_feature_dim(encoder):
        """Return the image encoder's output width, or raise AttributeError."""
        if hasattr(encoder, 'embed_dim'):
            return encoder.embed_dim
        if hasattr(encoder, 'config') and hasattr(encoder.config, 'hidden_size'):
            return encoder.config.hidden_size
        raise AttributeError(
            "Unable to infer image encoder output dimension; please specify explicitly or ensure encoder has 'embed_dim' or 'config.hidden_size'."
        )

    @staticmethod
    def _text_feature_dim(encoder):
        """Return the text encoder's hidden size, or raise AttributeError."""
        if hasattr(encoder, 'config') and hasattr(encoder.config, 'hidden_size'):
            return encoder.config.hidden_size
        raise AttributeError(
            "Unable to infer text encoder hidden size; ensure encoder has 'config.hidden_size'."
        )

    @staticmethod
    def _pooled_image_features(encoder_output):
        """Reduce whatever the image encoder returned to one vector per image."""
        if isinstance(encoder_output, torch.Tensor):
            return encoder_output
        pooled = getattr(encoder_output, 'pooler_output', None)
        if pooled is not None:
            return pooled
        # No usable pooled output: fall back to the first token's hidden state.
        return encoder_output.last_hidden_state[:, 0]

    def forward(self, pixel_values, input_ids, attention_mask):
        """Embed a batch of images and texts.

        Returns:
            A ``(img_emb, txt_emb, scale)`` tuple: the two L2-normalised
            embedding batches and ``exp(logit_scale)``.
        """
        image_features = self._pooled_image_features(
            self.img_enc(pixel_values=pixel_values)
        )

        # Text side always takes the first token of the last hidden state.
        text_output = self.txt_enc(input_ids=input_ids, attention_mask=attention_mask)
        text_features = text_output.last_hidden_state[:, 0]

        img_emb = F.normalize(self.img_proj(image_features), dim=-1)
        txt_emb = F.normalize(self.txt_proj(text_features), dim=-1)

        return img_emb, txt_emb, self.logit_scale.exp()

In [None]:
class Prop2HateMemeDataset(Dataset):
    """
    PyTorch Dataset over one JSONL split of the Prop2Hate-Meme data.

    Every item pairs a preprocessed image tensor with the tokenized meme text.
    Records are read through the fields ``img_path``, ``text``, ``id`` and,
    in labelled mode, ``label``.
    """
    def __init__(self, jsonl_path, image_dir, clip_processor, text_tokenizer, max_len=128,
                 is_labels=False):
        """
        Args:
            jsonl_path (str): Path to the .jsonl file (e.g., 'train.jsonl').
            image_dir (str): Directory that directly contains the image files.
                Only the base name of each record's ``img_path`` is appended
                to it.
            clip_processor: Image processor, called as
                ``clip_processor(images=..., return_tensors="pt")``.
            text_tokenizer: Tokenizer used for the meme text.
            max_len (int): Sequence length that text is padded/truncated to.
            is_labels (bool): If True, items carry a ``label`` tensor and no
                ``id``; if False (default), items carry ``id`` and no label.
        """
        self.data = load_dataset('json', data_files=jsonl_path)['train']
        self.image_dir = image_dir
        self.clip_processor = clip_processor
        self.text_tokenizer = text_tokenizer
        self.max_len = max_len
        self.is_labels = is_labels

        # Keep the raw label column around only in labelled mode.
        if self.is_labels and 'label' in self.data.column_names:
            self.labels = self.data['label']
        else:
            self.labels = None

    def __len__(self):
        """Number of records in the split."""
        return len(self.data)

    def __getitem__(self, idx):
        """
        Build the model inputs for record ``idx``.

        Returns:
            dict: ``pixel_values``, ``input_ids`` and ``attention_mask``
            tensors (batch dimension removed), plus ``id`` when
            ``is_labels`` is False or ``label`` (a ``torch.long`` tensor)
            when it is True.
        """
        item = self.data[idx]

        # --- Load and process image ---
        # Records may store the path with a leading './' or a folder prefix;
        # only the file name is kept and resolved against image_dir.
        image_path = os.path.join(self.image_dir, os.path.basename(item['img_path']))
        try:
            # The context manager closes the underlying file as soon as the
            # pixels have been copied into the RGB image.
            with Image.open(image_path) as raw_image:
                image = raw_image.convert("RGB")
            image_processed = self.clip_processor(images=image, return_tensors="pt")
            pixel_values = image_processed['pixel_values'].squeeze(0)  # Remove batch dim
        except FileNotFoundError:
            print(f"Warning: Image not found at {image_path}. Using a dummy image.")
            # Stand-in for a missing image so that batching still works.
            # NOTE(review): 3x224x224 presumably matches the processor's
            # output size - confirm if a different processor is used.
            pixel_values = torch.zeros((3, 224, 224))

        # --- Load and process text ---
        text_tokenized = self.text_tokenizer(
            item['text'],
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        input_ids = text_tokenized['input_ids'].squeeze(0)  # Remove batch dim
        attention_mask = text_tokenized['attention_mask'].squeeze(0)  # Remove batch dim
        id_it = item['id']

        if not self.is_labels:
            return {
                'id': id_it,
                'pixel_values': pixel_values,
                'input_ids': input_ids,
                'attention_mask': attention_mask
            }

        # Labels must already be numeric: they are converted directly to a
        # long tensor.
        label = torch.tensor(item['label'], dtype=torch.long)
        return {
            'pixel_values': pixel_values,
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label': label
        }

In [None]:
# --- Data locations, relative to the notebook's working directory ---
IMG_DATASET_DIR = "./images"
TRAIN_FILE = "arabic_hateful_meme_train.jsonl"
DEV_FILE = "arabic_hateful_meme_dev.jsonl"
TEST_FILE = "arabic_hateful_meme_test.jsonl"
TEST_NO_LABELS = "task3_test_without_label.jsonl"

# --- Preprocessors: image processor and text tokenizer ---
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
MARBERT_CHECKPOINT = "UBC-NLP/MARBERT"

clip_processor = AutoProcessor.from_pretrained(CLIP_CHECKPOINT)
marbert_tokenizer = AutoTokenizer.from_pretrained(MARBERT_CHECKPOINT)

def contrastive_loss(img_emb, txt_emb, logit_scale):
    """Symmetric cross-entropy loss over an image/text similarity matrix.

    Row ``i`` of ``img_emb`` and row ``i`` of ``txt_emb`` are treated as the
    matching pair; every other row in the batch acts as a negative.

    Args:
        img_emb: ``(B, D)`` image embeddings.
        txt_emb: ``(B, D)`` text embeddings.
        logit_scale: Multiplier applied to the image embeddings before the
            similarity product (a scalar or scalar tensor).

    Returns:
        The mean of the image-to-text and text-to-image cross-entropy losses.
    """
    # Scale first, then multiply: same evaluation order as `s * a @ b`.
    similarity = (logit_scale * img_emb) @ txt_emb.t()  # (B, B)
    targets = torch.arange(similarity.size(0), device=similarity.device)

    image_to_text = F.cross_entropy(similarity, targets)
    text_to_image = F.cross_entropy(similarity.t(), targets)
    return (image_to_text + text_to_image) / 2


def _make_split(jsonl_path):
    """Build the dataset for one JSONL split with the shared image dir and preprocessors."""
    return Prop2HateMemeDataset(
        jsonl_path=jsonl_path,
        image_dir=IMG_DATASET_DIR,
        clip_processor=clip_processor,
        text_tokenizer=marbert_tokenizer
    )


# All four splits use the default is_labels=False, so every item carries an
# `id` and no label.
train_dataset = _make_split(TRAIN_FILE)
dev_dataset = _make_split(DEV_FILE)
test_dataset = _make_split(TEST_FILE)
test_noLabels_dataset = _make_split(TEST_NO_LABELS)

# ConcatDataset is not among the names imported from torch.utils.data in the
# imports cell, so it is referenced through the package to avoid a NameError.
combined_dataset = torch.utils.data.ConcatDataset(
    [train_dataset, dev_dataset, test_dataset, test_noLabels_dataset]
)
full_data_loader = DataLoader(combined_dataset, batch_size=32, shuffle=True)

In [None]:
# Pick the compute device first; the pretrained towers are loaded afterwards.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pretrained vision tower and Arabic text encoder.
clip_vit = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
marabert = AutoModel.from_pretrained("UBC-NLP/MARBERT")

# Wrap both encoders with 512-d projection heads and move the result to the device.
model = CLIPArabic(image_encoder=clip_vit, text_encoder=marabert, proj_dim=512)
model = model.to(device)

In [None]:
def train(
    model: nn.Module,
    data_loader: DataLoader,
    epochs: int = 10,
    lr: float = 5e-5,
    weight_decay: float = 1e-2,
    device: str = 'cuda'
) -> nn.Module:
    """Train ``model`` with the contrastive loss and return it.

    Only parameters with ``requires_grad=True`` are handed to the optimizer.
    The learning rate follows a cosine schedule stepped once per epoch, and
    the mean batch loss is printed after every epoch.

    Args:
        model: Called as ``model(pixel_values, input_ids, attention_mask)``
            and expected to return ``(img_emb, txt_emb, logit_scale)``.
        data_loader: Iterable of batches, each a mapping with
            ``pixel_values``, ``input_ids`` and ``attention_mask`` tensors.
            It is iterated once per epoch and does not need ``__len__``.
        epochs: Number of passes over ``data_loader``.
        lr: Initial AdamW learning rate.
        weight_decay: AdamW weight decay.
        device: Requested device. It is used only when CUDA is available;
            otherwise training runs on the CPU.

    Returns:
        The model, moved to the training device.

    Raises:
        ValueError: If an epoch yields no batches.
    """
    # Setup device
    device = torch.device(device if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    # NOTE(review): train() also switches any frozen sub-modules to training
    # mode (e.g. dropout stays active inside them) - confirm this is intended.
    model.train()

    # Optimizer & scheduler (trainable parameters only)
    optimizer = torch.optim.AdamW(
        [p for p in model.parameters() if p.requires_grad],
        lr=lr,
        weight_decay=weight_decay
    )
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)

    for epoch in range(1, epochs + 1):
        total_loss = 0.0
        num_batches = 0
        for batch in data_loader:
            pixel_values = batch['pixel_values'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # Forward pass
            img_emb, txt_emb, logit_scale = model(pixel_values, input_ids, attention_mask)
            loss = contrastive_loss(img_emb, txt_emb, logit_scale)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        # Counting batches (instead of len(data_loader)) supports loaders
        # without __len__ and turns an empty epoch into an explicit error
        # rather than a ZeroDivisionError.
        if num_batches == 0:
            raise ValueError("data_loader yielded no batches; nothing to train on.")

        # Step scheduler once per epoch
        scheduler.step()

        avg_loss = total_loss / num_batches
        print(f"Epoch {epoch}/{epochs} - Loss: {avg_loss:.4f}")

    return model

In [None]:
# Run contrastive training over the combined loader for 80 epochs.
# `train` returns the module it was given (after moving it to the device), so
# `trained_model` is expected to refer to the same network as `model`.
trained_model = train(model, full_data_loader, epochs=80, lr=5e-5, weight_decay=1e-2, device='cuda')

In [None]:
# --- Persist the trained weights ---
MODEL_OUT_PATH = "/kaggle/working/cust_clip.pt"
torch.save(model.state_dict(), MODEL_OUT_PATH)
print(f"Saved final model state_dict to {MODEL_OUT_PATH}")


def _json_safe_id(id_val):
    """Return a plain Python scalar for an id taken from a collated batch.

    When ids are numeric, batching delivers them as tensor elements, which
    json.dump cannot use as dictionary keys.
    """
    return id_val.item() if isinstance(id_val, torch.Tensor) else id_val


# --- Export one image/text embedding pair per sample id ---
model_clip = trained_model.to(device)
model_clip.eval()

embeddings_dict = {}

# Inference only: no gradients are needed for the whole pass.
with torch.no_grad():
    for batch in full_data_loader:
        # Move data to device
        pixel_values = batch["pixel_values"].to(device)
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        ids = batch["id"]  # IDs for this batch (list, or tensor for numeric ids)

        img_embs, txt_embs, _ = model_clip(
            pixel_values=pixel_values,
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        # Nested Python lists so the result is JSON-serialisable.
        img_rows = img_embs.cpu().tolist()
        txt_rows = txt_embs.cpu().tolist()

        for id_val, img_row, txt_row in zip(ids, img_rows, txt_rows):
            embeddings_dict[_json_safe_id(id_val)] = {
                "image_embedding": img_row,
                "text_embedding": txt_row
            }

# Save to JSON
with open("embeddings.json", "w") as f:
    json.dump(embeddings_dict, f, indent=4)