In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from datasets import load_dataset
from PIL import Image
from transformers import (
    CLIPProcessor, CLIPModel, CLIPVisionModel,
    AutoModel, AutoProcessor, AutoTokenizer,
    AutoModelForMaskedLM
)
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, WeightedRandomSampler
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
import numpy as np
from collections import Counter, defaultdict
import os
import json
from datetime import datetime
import pandas as pd

In [None]:
class CNNFusionLayer(nn.Module):
    """Fuse an image token sequence with a text token sequence using 1D convolutions.

    Each modality goes through its own Conv1d -> BatchNorm1d (no affine
    parameters) -> ReLU branch. The image branch is then linearly resampled to
    the text sequence length, concatenated with the text branch along the
    channel axis, and mixed by a kernel-size-1 convolution.

    Args:
        input_dim: Channel (embedding) size of both input sequences.
        output_dim: Channel size produced for both outputs.
        kernel_size: Width of the per-modality convolutions; padding is
            ``kernel_size // 2``.
        dropout: Dropout probability applied to both outputs.
    """

    def __init__(self, input_dim, output_dim, kernel_size=3, dropout=0.2):
        super().__init__()
        self.input_dim, self.output_dim = input_dim, output_dim

        # Per-modality convolutions share the same geometry.
        conv_kwargs = dict(
            kernel_size=kernel_size,
            padding=kernel_size // 2,
            padding_mode='zeros',
        )
        self.img_conv = nn.Conv1d(input_dim, output_dim, **conv_kwargs)
        self.txt_conv = nn.Conv1d(input_dim, output_dim, **conv_kwargs)

        # Pointwise convolution that mixes the concatenated modalities.
        self.cross_conv = nn.Conv1d(2 * output_dim, output_dim, kernel_size=1)

        # Batch norms without learnable scale/shift.
        self.img_norm = nn.BatchNorm1d(output_dim, affine=False)
        self.txt_norm = nn.BatchNorm1d(output_dim, affine=False)
        self.cross_norm = nn.BatchNorm1d(output_dim, affine=False)

        self.dropout = nn.Dropout(dropout)
        self.activation = nn.ReLU()

    def _encode(self, conv, norm, tokens):
        """Run one modality branch on [B, L, D] tokens; returns [B, D_out, L']."""
        return self.activation(norm(conv(tokens.transpose(1, 2))))

    def forward(self, img_embeddings, txt_embeddings, txt_mask=None):
        """Return ``(img_out, txt_out)``.

        ``img_out`` is the convolved image sequence; ``txt_out`` is the fused
        sequence, which has the text sequence length. Both are channel-last and
        passed through dropout. When ``txt_mask`` is given ([B, L_txt]), masked
        text positions are zeroed before the fusion convolution.
        """
        img_feat = self._encode(self.img_conv, self.img_norm, img_embeddings)
        txt_feat = self._encode(self.txt_conv, self.txt_norm, txt_embeddings)

        # Resample the image features so both modalities share the text length.
        img_resampled = F.interpolate(
            img_feat,
            size=txt_feat.size(2),
            mode='linear',
            align_corners=False,
        )

        if txt_mask is not None:
            txt_feat = txt_feat * txt_mask.unsqueeze(1).float()  # [B, 1, L_txt] broadcast

        stacked = torch.cat([img_resampled, txt_feat], dim=1)
        fused = self.activation(self.cross_norm(self.cross_conv(stacked)))

        return self.dropout(img_feat.transpose(1, 2)), self.dropout(fused.transpose(1, 2))

class CNNMultiModalModel(nn.Module):
    """Image + text classifier that fuses CLIP vision and MARBERT token features
    through a stack of ``CNNFusionLayer`` blocks.

    Args:
        num_labels: Number of output classes.
        freeze_backbones: When True, both backbones are frozen except for their
            last two encoder layers, which stay trainable.
        layer_dims: Output channel size of each fusion layer, in order.
        kernel_sizes: Convolution kernel size of each fusion layer; must provide
            at least one entry per element of ``layer_dims``.
        pool_type: Stored on the instance. NOTE: it is not consulted anywhere in
            this class; ``pool_features`` always uses mean pooling.

    Raises:
        ValueError: If ``layer_dims`` is empty, if ``kernel_sizes`` is shorter
            than ``layer_dims``, or if the two backbones report different
            hidden sizes (the first fusion layer uses one input size for both).
    """
    def __init__(self, num_labels=2, freeze_backbones=True,
                 layer_dims=(512, 256),  # Reduced layers
                 kernel_sizes=(3, 3),     # Smaller kernels
                 pool_type='avg'):
        super().__init__()

        # Validate the cheap arguments before loading any pretrained weights.
        layer_dims = list(layer_dims)
        kernel_sizes = list(kernel_sizes)
        if not layer_dims:
            raise ValueError("layer_dims must contain at least one entry")
        if len(kernel_sizes) < len(layer_dims):
            raise ValueError(
                f"kernel_sizes has {len(kernel_sizes)} entries but layer_dims "
                f"needs {len(layer_dims)}"
            )

        # Load pretrained models
        self.vision_model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
        self.text_model = AutoModel.from_pretrained("UBC-NLP/MARBERT")

        # The first fusion layer convolves both modalities with the same
        # input_dim, so the backbones must agree on their hidden size.
        vision_dim = self.vision_model.config.hidden_size
        text_dim = self.text_model.config.hidden_size
        if vision_dim != text_dim:
            raise ValueError(
                f"vision hidden size ({vision_dim}) and text hidden size "
                f"({text_dim}) must match"
            )

        # Selective unfreezing (last 2 layers)
        if freeze_backbones:
            # Freeze all layers first
            for param in self.vision_model.parameters():
                param.requires_grad = False
            for param in self.text_model.parameters():
                param.requires_grad = False

            # Unfreeze last 2 vision layers
            for layer in self.vision_model.vision_model.encoder.layers[-2:]:
                for param in layer.parameters():
                    param.requires_grad = True

            # Unfreeze last 2 text layers
            for layer in self.text_model.encoder.layer[-2:]:
                for param in layer.parameters():
                    param.requires_grad = True

        # Backbone output regularization
        self.vision_dropout = nn.Dropout(0.3)
        self.text_dropout = nn.Dropout(0.3)

        # Fusion stack: each layer consumes the previous layer's output size,
        # and dropout grows by 0.15 per layer starting at 0.2.
        self.fusion_layers = nn.ModuleList()
        for i in range(len(layer_dims)):
            input_dim = vision_dim if i == 0 else layer_dims[i-1]
            layer = CNNFusionLayer(
                input_dim=input_dim,
                output_dim=layer_dims[i],
                kernel_size=kernel_sizes[i],
                dropout=0.2 + i * 0.15  # Increasing dropout
            )
            self.fusion_layers.append(layer)

        # Final pooling
        self.pool_type = pool_type
        final_dim = layer_dims[-1]

        # Classifier over the concatenated [image, text] pooled vectors
        self.classifier = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(final_dim * 2, final_dim),
            nn.GELU(),
            nn.LayerNorm(final_dim),
            nn.Dropout(0.4),
            nn.Linear(final_dim, num_labels)
        )

    def pool_features(self, img_features, txt_features, txt_mask=None):
        """Mean-pool both sequences over the sequence axis (dim 1).

        Args:
            img_features: Image sequence features, pooled without a mask.
            txt_features: Text sequence features.
            txt_mask: Optional mask over text positions; when given, only
                positions with a non-zero mask contribute to the text mean.

        Returns:
            Tuple ``(img_pooled, txt_pooled)``.
        """
        # Image pooling (no mask needed)
        img_pooled = img_features.mean(dim=1)

        # Text pooling with mask
        if txt_mask is not None:
            mask_expanded = txt_mask.unsqueeze(-1).float()
            txt_sum = (txt_features * mask_expanded).sum(dim=1)
            # clamp guards against division by zero for a fully masked row
            txt_pooled = txt_sum / mask_expanded.sum(dim=1).clamp(min=1e-7)
        else:
            txt_pooled = txt_features.mean(dim=1)

        return img_pooled, txt_pooled

    def forward(self, pixel_values, input_ids, attention_mask):
        """Return classification logits for a batch of image/text pairs."""
        # Extract backbone features with regularization
        vision_outputs = self.vision_model(pixel_values=pixel_values)
        img_embeddings = self.vision_dropout(vision_outputs.last_hidden_state)

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        txt_embeddings = self.text_dropout(text_outputs.last_hidden_state)

        # Progressive fusion
        for layer in self.fusion_layers:
            img_embeddings, txt_embeddings = layer(
                img_embeddings,
                txt_embeddings,
                attention_mask
            )

        # Final pooling
        img_pooled, txt_pooled = self.pool_features(
            img_embeddings,
            txt_embeddings,
            attention_mask
        )

        # Classification
        fused = torch.cat([img_pooled, txt_pooled], dim=-1)
        return self.classifier(fused)

In [None]:
class CrossAttentionFusion(nn.Module):
    """
    Cross-attention fusion with pooling over token-level attended outputs.
    Supports 'mean', 'max', or self-attention pooling.

    Image tokens attend over text tokens and text tokens attend over image
    tokens; each attended sequence is pooled to one vector and the two vectors
    are concatenated.

    Args:
        hidden_dim: Embedding size shared by both modalities.
        num_heads: Number of heads in the two cross-attention modules.
        pool_mode: One of 'mean', 'max' or 'attention'.
    """
    def __init__(self, hidden_dim, num_heads, pool_mode='max'):
        super().__init__()
        assert pool_mode in ['mean', 'max', 'attention'], "pool_mode must be 'mean', 'max', or 'attention'"
        self.pool_mode = pool_mode

        # NOTE: despite its name, this module is called with image tokens as
        # queries and text tokens as keys/values (and vice versa for the next).
        self.text_to_image_attention = nn.MultiheadAttention(
            embed_dim=hidden_dim, num_heads=num_heads, batch_first=True
        )
        self.image_to_text_attention = nn.MultiheadAttention(
            embed_dim=hidden_dim, num_heads=num_heads, batch_first=True
        )

        if pool_mode == 'attention':
            # learnable pooling vector for image and text
            self.img_pool_query = nn.Parameter(torch.randn(1, 1, hidden_dim))
            self.txt_pool_query = nn.Parameter(torch.randn(1, 1, hidden_dim))
            # use dedicated single-head attention modules for pooling
            self.img_pool_attn = nn.MultiheadAttention(
                embed_dim=hidden_dim, num_heads=1, batch_first=True
            )
            self.txt_pool_attn = nn.MultiheadAttention(
                embed_dim=hidden_dim, num_heads=1, batch_first=True
            )

    def forward(self, image_embeddings, text_embeddings, text_attention_mask=None):
        """Return the fused feature of shape [B, 2 * hidden_dim].

        Args:
            image_embeddings: [B, L_img, D] image token embeddings.
            text_embeddings: [B, L_txt, D] text token embeddings.
            text_attention_mask: Optional [B, L_txt] mask, 0 at padded text
                positions. Padded positions are ignored both as attention keys
                and when pooling the text sequence. Every row is expected to
                keep at least one unmasked token.
        """
        # True where the text token is padding.
        text_padding_mask = (
            (text_attention_mask == 0) if text_attention_mask is not None else None
        )

        # Cross-attention
        attended_image, _ = self.text_to_image_attention(
            query=image_embeddings,
            key=text_embeddings,
            value=text_embeddings,
            key_padding_mask=text_padding_mask
        )
        attended_text, _ = self.image_to_text_attention(
            query=text_embeddings,
            key=image_embeddings,
            value=image_embeddings
        )

        # Pooling. attended_text still has one row per text position, padding
        # included, so the text side must be pooled over real tokens only.
        if self.pool_mode == 'mean':
            image_feat = attended_image.mean(dim=1)  # [B, D]
            if text_padding_mask is None:
                text_feat = attended_text.mean(dim=1)  # [B, D]
            else:
                keep = (~text_padding_mask).unsqueeze(-1).to(attended_text.dtype)  # [B, L_txt, 1]
                text_feat = (attended_text * keep).sum(dim=1) / keep.sum(dim=1).clamp(min=1.0)

        elif self.pool_mode == 'max':
            image_feat, _ = attended_image.max(dim=1)  # [B, D]
            if text_padding_mask is None:
                text_feat, _ = attended_text.max(dim=1)  # [B, D]
            else:
                # -inf can never win the max while a real token exists
                real_tokens_only = attended_text.masked_fill(
                    text_padding_mask.unsqueeze(-1), float('-inf')
                )
                text_feat, _ = real_tokens_only.max(dim=1)  # [B, D]

        else:  # attention pooling
            B = image_embeddings.size(0)
            img_q = self.img_pool_query.expand(B, -1, -1)  # [B,1,D]
            txt_q = self.txt_pool_query.expand(B, -1, -1)  # [B,1,D]

            # attend to get pooled vector using module-level attentions
            pooled_img, _ = self.img_pool_attn(
                query=img_q, key=attended_image, value=attended_image
            )  # [B,1,D]
            pooled_txt, _ = self.txt_pool_attn(
                query=txt_q, key=attended_text, value=attended_text,
                key_padding_mask=text_padding_mask
            )  # [B,1,D]

            image_feat = pooled_img[:, 0, :]  # [B, D]
            text_feat  = pooled_txt[:, 0, :]  # [B, D]

        # Concatenate pooled features
        fused = torch.cat([image_feat, text_feat], dim=-1)  # [B, 2D]
        return fused


class AdvancedFusionModel(nn.Module):
    """CLIP vision + MARBERT text classifier fused with ``CrossAttentionFusion``.

    Args:
        num_labels: Number of output classes.
        freeze_backbones: When True, every parameter of both backbones is frozen.
        pool: Pooling mode forwarded to ``CrossAttentionFusion``.
    """

    def __init__(self, num_labels=2, freeze_backbones=True, pool='max'):
        super().__init__()
        self.vision_model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
        self.text_model = AutoModel.from_pretrained("UBC-NLP/MARBERT")

        if freeze_backbones:
            for backbone in (self.vision_model, self.text_model):
                for weight in backbone.parameters():
                    weight.requires_grad = False

        # The fusion block uses one hidden size for both modalities.
        hidden_dim = self.vision_model.config.hidden_size
        assert hidden_dim == self.text_model.config.hidden_size

        self.fusion = CrossAttentionFusion(hidden_dim=hidden_dim, num_heads=4, pool_mode=pool)

        # The fusion output concatenates one pooled vector per modality.
        self.proj = nn.Sequential(
            nn.Dropout(0.4),
            nn.Linear(2 * hidden_dim, 512),
        )
        self.classification_head = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(512, num_labels),
        )

    def forward(self, pixel_values, input_ids, attention_mask, extract_features=False):
        """Return logits, or the 512-d projected features when ``extract_features`` is True."""
        image_tokens = self.vision_model(pixel_values=pixel_values).last_hidden_state
        text_tokens = self.text_model(
            input_ids=input_ids, attention_mask=attention_mask
        ).last_hidden_state

        proj_features = self.proj(self.fusion(image_tokens, text_tokens, attention_mask))
        logits = self.classification_head(proj_features)

        return proj_features if extract_features else logits

In [None]:
class Prop2HateMemeDataset(Dataset):
    """
    Custom PyTorch Dataset for the Prop2Hate-Meme dataset.
    """
    def __init__(self, jsonl_path, image_dir, clip_processor, text_tokenizer, max_len=128,
                 is_labels=True):
        """
        Args:
            jsonl_path (str): Path to the .jsonl file (e.g., 'train.jsonl').
            image_dir (str): Directory that contains the image files.
            clip_processor: The processor for CLIP images.
            text_tokenizer: The tokenizer for MARBERT text.
            max_len (int): Maximum sequence length for tokenization.
            is_labels (bool): When True, items carry a 'label' tensor; when
                False, items carry the record 'id' instead (inference mode).
        """
        self.data = load_dataset('json', data_files=jsonl_path)['train']
        self.image_dir = image_dir
        self.clip_processor = clip_processor
        self.text_tokenizer = text_tokenizer
        self.max_len = max_len
        self.is_labels = is_labels

        # Labels are read as stored in the file (no string-to-int mapping here).
        if self.is_labels and 'label' in self.data.column_names:
            self.labels = self.data['label']
        else:
            self.labels = None

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return one sample as a dict of tensors.

        Keys are 'pixel_values', 'input_ids', 'attention_mask' plus 'label'
        (labelled mode) or 'id' (unlabelled mode).
        """
        item = self.data[idx]

        # --- Load and Process Image ---
        # Keep only the file name: any directory part of img_path (e.g. a
        # leading './') is dropped and the file is looked up in image_dir.
        img_path = os.path.basename(item['img_path'])
        image_path = os.path.join(self.image_dir, img_path)
        try:
            # The context manager closes the file handle as soon as the pixel
            # data has been copied into the converted image.
            with Image.open(image_path) as raw_image:
                image = raw_image.convert("RGB")
            # Process image using CLIP's processor
            image_processed = self.clip_processor(images=image, return_tensors="pt")
            pixel_values = image_processed['pixel_values'].squeeze(0) # Remove batch dim
        except FileNotFoundError:
            print(f"Warning: Image not found at {image_path}. Using a dummy image.")
            # Provide a dummy tensor if an image is missing
            # (assumes the processor emits 3x224x224 inputs; confirm if the processor changes)
            pixel_values = torch.zeros((3, 224, 224))

        # --- Load and Process Text ---
        text = item['text']
        # Tokenize text using MARBERT's tokenizer
        text_tokenized = self.text_tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        input_ids = text_tokenized['input_ids'].squeeze(0) # Remove batch dim
        attention_mask = text_tokenized['attention_mask'].squeeze(0) # Remove batch dim
        id_it = item['id']

        if not self.is_labels:
            return {
                'id': id_it,
                'pixel_values': pixel_values,
                'input_ids': input_ids,
                'attention_mask': attention_mask
            }
        else:
            label = torch.tensor(item['label'], dtype=torch.long)
            return {
                'pixel_values': pixel_values,
                'input_ids': input_ids,
                'attention_mask': attention_mask,
                'label': label
            }

In [None]:
# --- Paths and training configuration ---
DATASET_DIR = "/kaggle/input/qcri-final-folder/QCRI_ARABIC_FOLDER/images"
TRAIN_FILE = "/kaggle/input/qcri-final-folder/QCRI_ARABIC_FOLDER/new_train.jsonl"
TEST_FILE = "/kaggle/input/qcri-final-folder/QCRI_ARABIC_FOLDER/new_test.jsonl"
BATCH_SIZE = 32
LEARNING_RATE = 2e-5
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
USE_FOCAL_LOSS = True  # Switch to Focal Loss for class imbalance

# --- Initialize Model Components ---
clip_processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
marbert_tokenizer = AutoTokenizer.from_pretrained("UBC-NLP/MARBERT")

# --- Training dataset & loader ---
# NOTE: no WeightedRandomSampler is created here; the training loader uses
# plain shuffling.
train_dataset = Prop2HateMemeDataset(
    jsonl_path=TRAIN_FILE,
    image_dir=DATASET_DIR,
    clip_processor=clip_processor,
    text_tokenizer=marbert_tokenizer
)

# Number of training samples per label value
label_counts = Counter(train_dataset.labels)

train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    pin_memory=True  # Faster data transfer to GPU
)
print(len(train_dataset))

# --- Validation dataset & loader (built from TEST_FILE) ---
val_dataset = Prop2HateMemeDataset(
    jsonl_path=TEST_FILE,
    image_dir=DATASET_DIR,
    clip_processor=clip_processor,
    text_tokenizer=marbert_tokenizer
)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
print(len(val_dataset))

# Focal Loss for class imbalance (handles hard examples)
class FocalLoss(nn.Module):
    """Multi-class focal loss computed from per-sample cross-entropy.

    Args:
        alpha: Optional tensor of per-class weights, indexed by target class.
        gamma: Focusing exponent applied to ``(1 - p_true)``.
    """

    def __init__(self, alpha=None, gamma=2.0):
        super().__init__()
        self.gamma = gamma
        self.alpha = alpha

    def forward(self, inputs, targets):
        """Return the mean focal loss for ``inputs`` (logits) and ``targets`` (class ids)."""
        per_sample_ce = F.cross_entropy(inputs, targets, reduction='none')
        # exp(-CE) recovers the probability assigned to the true class
        prob_true = torch.exp(-per_sample_ce)
        focal_terms = (1 - prob_true) ** self.gamma * per_sample_ce

        if self.alpha is None:
            return focal_terms.mean()
        return (self.alpha[targets] * focal_terms).mean()

# Class-aware loss weighting
# Inverse-frequency weight for label 0 and label 1: N / (2 * count[label]).
loss_weights = torch.tensor(
    [len(train_dataset) / (2 * label_counts[class_id]) for class_id in (0, 1)],
    dtype=torch.float32,
).to(DEVICE)

# Both loss variants receive the same per-class weights.
if USE_FOCAL_LOSS:
    criterion = FocalLoss(alpha=loss_weights)
else:
    criterion = nn.CrossEntropyLoss(weight=loss_weights)

In [None]:
# from torch.utils.data import DataLoader, Subset

# # Assuming your original datasets are called train_dataset and val_dataset
# train_subset = Subset(train_dataset, list(range(100)))
# val_subset = Subset(val_dataset, list(range(20)))

# train_loader = DataLoader(train_subset,batch_size=BATCH_SIZE,
#     shuffle=True,
#     pin_memory=True)  # Faster data transfer to GPU)
                         
# val_loader = DataLoader(val_subset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
import json
from sklearn.metrics import classification_report

def train_model(
    model,
    train_loader,
    dev_loader,
    criterion,
    device,
    learning_rate=5e-5,
    num_epochs=50,
    early_stop_patience=15,
    weight_decay=1e-5,
    clip_value=1.0,
    model_save_path='best_model.pth',
    metrics_file_path='training_metrics.json'
):
    """Train ``model`` and keep the weights with the best validation macro-F1.

    Each epoch runs one pass over ``train_loader`` (AdamW, gradient-norm
    clipping), then scores ``dev_loader`` with macro-F1. The learning rate is
    halved by ``ReduceLROnPlateau`` after 2 epochs without F1 improvement.
    Whenever F1 improves, the state dict is written to ``model_save_path`` and
    a classification report is recorded. Metrics are written to
    ``metrics_file_path`` as JSON after every epoch.

    Args:
        model: Module called as ``model(pixel_values, input_ids, attention_mask)``
            and returning class logits.
        train_loader: Iterable of dict batches with keys 'pixel_values',
            'input_ids', 'attention_mask' and 'label'.
        dev_loader: Validation batches with the same keys.
        criterion: Callable ``criterion(logits, labels)`` returning a scalar loss.
        device: Device the model and batches are moved to.
        learning_rate: AdamW learning rate.
        num_epochs: Maximum number of epochs.
        early_stop_patience: Stop after this many consecutive epochs without
            a new best F1.
        weight_decay: AdamW weight decay.
        clip_value: Maximum gradient norm.
        model_save_path: Where the best state dict is saved.
        metrics_file_path: Where the metrics JSON is saved.

    Returns:
        Tuple ``(model, best_f1)``. The model carries the best checkpoint
        written during this call; if no epoch ran, it is returned unchanged.
    """
    # Initialize metrics dictionary
    metrics = {
        'epochs': [],
        'train_losses': [],
        'val_f1_macro': [],
        'best_f1': 0,
        'best_epoch': 0,
        'classification_reports': {}
    }

    def _write_metrics():
        with open(metrics_file_path, 'w') as f:
            json.dump(metrics, f, indent=4)

    def _model_inputs(batch):
        return (
            batch['pixel_values'].to(device, non_blocking=True),
            batch['input_ids'].to(device, non_blocking=True),
            batch['attention_mask'].to(device, non_blocking=True),
        )

    # Move model to target device
    model = model.to(device)

    # Initialize optimizer and scheduler.
    # The scheduler's `verbose` flag is deprecated, so learning-rate changes
    # are reported manually after each scheduler step instead.
    optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    scheduler = ReduceLROnPlateau(
        optimizer,
        mode='max',
        factor=0.5,
        patience=2
    )

    best_f1 = 0
    epochs_no_improve = 0
    # Tracks whether THIS call wrote model_save_path, so a stale file from an
    # earlier run is never loaded by mistake.
    checkpoint_saved = False

    for epoch in range(num_epochs):
        # Training phase
        model.train()
        total_loss = 0

        for i, batch in enumerate(train_loader):
            pixel_values, input_ids, attention_mask = _model_inputs(batch)
            labels = batch['label'].to(device)

            # Forward pass and loss calculation
            optimizer.zero_grad()
            logits = model(pixel_values, input_ids, attention_mask)
            loss = criterion(logits, labels)

            # Backpropagation with gradient clipping
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip_value)
            optimizer.step()

            total_loss += loss.item()
            if (i + 1) % 50 == 0:
                print(f"Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}")

        # Validation phase
        model.eval()
        all_preds, all_labels = [], []

        with torch.no_grad():
            for batch in dev_loader:
                pixel_values, input_ids, attention_mask = _model_inputs(batch)

                logits = model(pixel_values, input_ids, attention_mask)
                preds = torch.argmax(logits, dim=1)
                all_preds.extend(preds.cpu().tolist())
                all_labels.extend(batch['label'].cpu().tolist())

        # Calculate metrics
        f1 = f1_score(all_labels, all_preds, average='macro')
        avg_train_loss = total_loss / max(len(train_loader), 1)

        # Record metrics
        metrics['epochs'].append(epoch + 1)
        metrics['train_losses'].append(avg_train_loss)
        metrics['val_f1_macro'].append(f1)

        print(f"\nEpoch {epoch+1} | Train Loss: {avg_train_loss:.4f} | Val F1: {f1:.4f}")

        # Update learning rate and report any reduction
        lrs_before = [group['lr'] for group in optimizer.param_groups]
        scheduler.step(f1)
        lrs_after = [group['lr'] for group in optimizer.param_groups]
        if lrs_after != lrs_before:
            print(f"Reducing learning rate: {lrs_before} -> {lrs_after}")

        # The first epoch is always checkpointed so that a checkpoint from this
        # run exists even when F1 never rises above 0.
        stop_early = False
        if f1 > best_f1 or not checkpoint_saved:
            best_f1 = f1
            metrics['best_f1'] = best_f1
            metrics['best_epoch'] = epoch + 1
            epochs_no_improve = 0

            # Save model
            torch.save(model.state_dict(), model_save_path)
            checkpoint_saved = True

            # Record the classification report of the new best epoch
            report = classification_report(
                all_labels,
                all_preds,
                output_dict=True
            )
            metrics['classification_reports'][f'epoch_{epoch+1}'] = report
        else:
            epochs_no_improve += 1
            stop_early = epochs_no_improve == early_stop_patience

        # Save metrics after each epoch (in case of interruption)
        _write_metrics()

        if stop_early:
            print(f"Early stopping triggered at epoch {epoch+1}")
            break

    if checkpoint_saved:
        # Load best model weights
        print("Training complete. Loading best model for final evaluation.")
        model.load_state_dict(torch.load(model_save_path, map_location=device))
    else:
        print("Training complete. No checkpoint was written in this run; returning the model unchanged.")

    return model, best_f1

In [None]:
import torch
import pandas as pd

def generate_predictions(model, test_loader, output_path="prediction.csv", device='cuda'):
    """
    Generate predictions using a trained model and save them to a CSV file.

    The model and every batch are moved to ``device``. If a CUDA device is
    requested but CUDA is not available, inference falls back to the CPU.

    Args:
        model: Trained PyTorch model returning two-class logits when called as
            ``model(pixel_values=..., input_ids=..., attention_mask=...)``
        test_loader: Iterable of dict batches with keys 'id', 'pixel_values',
            'input_ids' and 'attention_mask'
        output_path (str): Path to save predictions CSV (default: "prediction.csv")
        device (str): Device to use for inference ('cuda' or 'cpu')
    """
    if str(device).startswith('cuda') and not torch.cuda.is_available():
        print("CUDA requested but not available; running inference on CPU.")
        device = 'cpu'

    # Keep the model and its inputs on the same device.
    model = model.to(device)
    model.eval()
    predictions = []
    ids = []

    with torch.no_grad():
        for batch in test_loader:
            pixel_values = batch['pixel_values'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # Get model outputs (logits)
            logits = model(
                pixel_values=pixel_values,
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            # One logit per class: the predicted class is the arg-max
            # (index 1 is mapped to 'hate' below).
            preds = torch.argmax(logits, dim=1)
            predictions.extend(preds.cpu().tolist())

            # Handle IDs (convert to list if tensor)
            batch_ids = batch['id']
            if isinstance(batch_ids, torch.Tensor):
                batch_ids = batch_ids.tolist()
            ids.extend(batch_ids)

    # Convert numerical predictions to labels
    label_map = {0: 'not-hate', 1: 'hate'}
    pred_labels = [label_map[p] for p in predictions]

    # Save to CSV
    df = pd.DataFrame({
        'id': ids,
        'prediction': pred_labels
    })
    df.to_csv(output_path, index=False)
    print(f"Predictions saved to {output_path}")



In [None]:
import pandas as pd
from sklearn.metrics import (classification_report, roc_auc_score, 
                             f1_score, precision_score, recall_score, 
                             accuracy_score, confusion_matrix)
import numpy as np

def evaluate_classification_performance(true_labels_path, predictions_path):
    """
    Evaluate classification performance by comparing true labels with predictions.

    Rows are compared positionally when both files list the same ids in the
    same order; otherwise the two files are aligned with an inner join on 'id'.
    Labels may be given as 'hate' / 'not-hate' strings or as 1 / 0; each column
    is converted independently. Prints a classification report, accuracy,
    macro F1 / precision / recall and the confusion matrix.

    Args:
        true_labels_path (str): Path to CSV file containing true labels (columns: id, testing_label)
        predictions_path (str): Path to CSV file containing predictions (columns: id, prediction)

    Raises:
        ValueError: If the files share no ids, or a label column contains a
            string other than 'hate' / 'not-hate'.
    """
    # Load the data
    test_df = pd.read_csv(true_labels_path)
    pred_df = pd.read_csv(predictions_path)

    # Use the rows as-is only when ids agree in both length and order
    if test_df['id'].equals(pred_df['id']):
        y_true = test_df['testing_label']
        y_pred = pred_df['prediction']
    else:
        print("Warning: IDs don't match between files! Aligning rows on 'id'; unmatched rows are ignored.")
        merged = pd.merge(
            test_df[['id', 'testing_label']],
            pred_df[['id', 'prediction']],
            on='id',
            how='inner'
        )
        if merged.empty:
            raise ValueError("No common ids between the true-label file and the prediction file.")
        y_true = merged['testing_label']
        y_pred = merged['prediction']

    # For binary classification, convert to numerical if needed
    label_map = {'hate': 1, 'not-hate': 0}

    def _to_numeric(labels, source):
        # Numeric columns are taken as already encoded (0 / 1).
        if pd.api.types.is_numeric_dtype(labels):
            return labels
        mapped = labels.map(label_map)
        if mapped.isna().any():
            unknown = sorted(set(labels[mapped.isna()].astype(str)))
            raise ValueError(f"Unknown label(s) in {source}: {unknown}")
        return mapped.astype(int)

    y_true_num = _to_numeric(y_true, 'true labels')
    y_pred_num = _to_numeric(y_pred, 'predictions')

    # Calculate metrics.
    # The report is built from the numeric labels with an explicit label order:
    # with raw strings sklearn sorts labels alphabetically ('hate' first), which
    # would attach the target names to the wrong rows.
    print("Classification Report:")
    print(classification_report(
        y_true_num,
        y_pred_num,
        labels=[0, 1],
        target_names=['not-hate', 'hate']
    ))

    print("\nAdditional Metrics:")
    print(f"Accuracy: {accuracy_score(y_true_num, y_pred_num):.4f}")
    print(f"F1 Macro: {f1_score(y_true_num, y_pred_num, average='macro'):.4f}")
    print(f"Precision Macro: {precision_score(y_true_num, y_pred_num, average='macro'):.4f}")
    print(f"Recall Macro: {recall_score(y_true_num, y_pred_num, average='macro'):.4f}")

    # Confusion matrix (rows: true not-hate / hate, columns: predicted)
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_true_num, y_pred_num, labels=[0, 1]))


In [None]:
# Experiment 1: cross-attention fusion model with both backbones fully frozen,
# placed on DEVICE.
model = AdvancedFusionModel(num_labels=2, freeze_backbones=True).to(DEVICE)

In [None]:
# Train the model currently bound to `model`; the best checkpoint and the
# metrics are written to the cross-attention file names given below.
# learning_rate is not passed, so train_model's own default is used
# (the LEARNING_RATE constant is not forwarded here).
trained_model, best_f1 = train_model(
    model=model,
    train_loader=train_loader,
    dev_loader=val_loader,
    criterion=criterion,
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    num_epochs=30,
    early_stop_patience=10,
    model_save_path='best_cross_attention_model.pth',
    metrics_file_path='training_metrics_cross_attention.json'
)

In [None]:
# Experiment 2: CNN fusion model. This rebinds `model`, replacing the model
# created in an earlier cell. It is not moved to a device here; train_model
# moves the model to its `device` argument.
model = CNNMultiModalModel(
        layer_dims=[512, 256],
        kernel_sizes=[3, 3],
        pool_type='avg'
    )

In [None]:
# Train the model currently bound to `model`; this overwrites `trained_model`
# and `best_f1` from any earlier training cell. The checkpoint and the metrics
# go to the CNN-fusion file names given below.
trained_model, best_f1 = train_model(
    model=model,
    train_loader=train_loader,
    dev_loader=val_loader,
    criterion=criterion,
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    num_epochs=30,
    early_stop_patience=10,
    model_save_path='best_cnn_fusion_model.pth',
    metrics_file_path='training_metrics_cnn_fusion.json'
)

In [None]:
# test_dataset = Prop2HateMemeDataset(
#     jsonl_path="/kaggle/input/qcri-final-folder/QCRI_ARABIC_FOLDER/task3_test_without_label.jsonl",
#     image_dir=DATASET_DIR,
#     clip_processor=clip_processor,
#     text_tokenizer=marbert_tokenizer,
#     is_labels=False
# )

# test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [None]:
# generate_predictions(trained_model, test_loader, "post_predictions.csv")

In [None]:
# # Example usage:
# evaluate_classification_performance(
#     '/kaggle/input/qcri-final-folder/QCRI_ARABIC_FOLDER/task3_test_gold.txt',
#     '/kaggle/working/post_predictions.csv'
# )