In [1]:
import numpy as np
import pandas as pd

import os


In [2]:
from datasets import load_dataset
from PIL import Image
from transformers import (
    CLIPProcessor, CLIPModel, CLIPVisionModel,
    AutoModel, AutoProcessor, AutoTokenizer,
    AutoModelForMaskedLM
)
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, WeightedRandomSampler
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
import numpy as np
from collections import Counter, defaultdict
import os
import json
from datetime import datetime
import pandas as pd


2025-08-14 11:29:14.338731: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755170954.644789     111 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755170954.727996     111 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
import random
import torch
import numpy as np

# Set seeds for reproducibility
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(461)

In [4]:
def worker_init_fn(worker_id):
    np.random.seed(461 + worker_id)

g = torch.Generator()
g.manual_seed(461)

<torch._C.Generator at 0x7dc543e2e570>

In [5]:
class PrecomputedEmbeddingsDataset(Dataset):
    def __init__(self, jsonl_path, embeddings_path, is_test=False):
        """
        Args:
            jsonl_path (str): Path to the original .jsonl file
            embeddings_path (str): Path to JSON file with precomputed embeddings
            is_test (bool): If True, skips loading labels (for test data)
        """
        self.data = load_dataset('json', data_files=jsonl_path)['train']
        with open(embeddings_path, 'r') as f:
            self.embeddings = json.load(f)
        
        self.is_test = is_test
        if not self.is_test:
            self.labels = self.data['label']
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        item = self.data[idx]
        id_val = str(item['id'])  # JSON keys are strings
        
        if id_val not in self.embeddings:
            raise KeyError(f"Embeddings for ID {id_val} not found")
        
        emb_data = self.embeddings[id_val]
        
        output = {
            'image_embedding': torch.tensor(emb_data['image_embedding']),
            'text_embedding': torch.tensor(emb_data['text_embedding']),
        }
        
        if not self.is_test:
            output['label'] = torch.tensor(item['label'], dtype=torch.long)
        if self.is_test:
            output['id'] = item['id']
        
        return output

In [6]:
class PrecomputedEmbeddingsClassifier(nn.Module):
    def __init__(self, 
                 image_embedding_dim, 
                 text_embedding_dim,
                 fusion_type='gate',
                 projection_dim=256, 
                 dropout_rate=0.2,
                 num_classes=2):
        super().__init__()
        self.fusion_type = fusion_type
        
        # Fusion configuration
        if fusion_type == 'concat':
            classifier_input_dim = image_embedding_dim + text_embedding_dim
        else:
            classifier_input_dim = projection_dim
        
        # Projection layers for non-concat methods
        if fusion_type in ['add', 'mul', 'gate']:
            self.image_proj = nn.Linear(image_embedding_dim, projection_dim)
            self.text_proj = nn.Linear(text_embedding_dim, projection_dim)
            self.norm = nn.LayerNorm(projection_dim)
        
        # Gated fusion components
        if fusion_type == 'gate':
            self.image_gate = nn.Linear(image_embedding_dim, projection_dim)
            self.text_gate = nn.Linear(text_embedding_dim, projection_dim)
            self.sigmoid = nn.Sigmoid()
        
        # Classification head
        self.classifier = nn.Sequential(
            nn.Linear(classifier_input_dim, projection_dim),
            nn.ReLU(inplace=True),
            nn.Dropout(p=dropout_rate),
            nn.Linear(projection_dim, num_classes)
        )

    def forward(self, image_embedding, text_embedding):
        # Apply fusion technique
        if self.fusion_type == 'concat':
            combined = torch.cat((image_embedding, text_embedding), dim=-1)
        
        elif self.fusion_type == 'add':
            img_proj = F.relu(self.image_proj(image_embedding))
            txt_proj = F.relu(self.text_proj(text_embedding))
            combined = img_proj + txt_proj
            combined = self.norm(combined)
        
        elif self.fusion_type == 'mul':
            img_proj = F.relu(self.image_proj(image_embedding))
            txt_proj = F.relu(self.text_proj(text_embedding))
            combined = img_proj * txt_proj
            combined = self.norm(combined)
        
        elif self.fusion_type == 'gate':
            img_proj = F.relu(self.image_proj(image_embedding))
            txt_proj = F.relu(self.text_proj(text_embedding))
            
            # Compute gating weights
            gate_img = self.sigmoid(self.image_gate(image_embedding))
            gate_txt = self.sigmoid(self.text_gate(text_embedding))
            
            # Apply gating mechanism
            combined = gate_img * img_proj + gate_txt * txt_proj
            combined = self.norm(combined)
        
        # Final classification
        logits = self.classifier(combined)
        return logits
    

In [7]:
import json
from sklearn.metrics import classification_report

def train_model(
    model,
    train_loader,
    dev_loader,
    criterion,
    device,
    learning_rate=5e-5,
    num_epochs=100,
    early_stop_patience=100,
    weight_decay=1e-5,
    clip_value=1.0,
    model_save_path='best_model.pth',
    metrics_file_path='training_metrics.json',
    shedular_step=True
):
    # Initialize metrics dictionary
    metrics = {
        'epochs': [],
        'train_losses': [],
        'val_f1_macro': [],
        'best_f1': 0,
        'best_epoch': 0,
        'classification_reports': {}
    }
    
    model = model.to(device)
    optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=2, verbose=True)
    
    best_f1 = 0
    epochs_no_improve = 0

    for epoch in range(num_epochs):
        # Training
        model.train()
        total_loss = 0
        
        for i, batch in enumerate(train_loader):
            img_emb = batch['image_embedding'].to(device, non_blocking=True)
            txt_emb = batch['text_embedding'].to(device, non_blocking=True)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            logits = model(img_emb, txt_emb)
            loss = criterion(logits, labels)
            
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip_value)
            optimizer.step()

            total_loss += loss.item()
            if (i + 1) % 50 == 0:
                print(f"Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}")

        # Validation
        model.eval()
        all_preds, all_labels = [], []
        
        with torch.no_grad():
            for batch in dev_loader:
                img_emb = batch['image_embedding'].to(device, non_blocking=True)
                txt_emb = batch['text_embedding'].to(device, non_blocking=True)
                labels = batch['label'].to(device)
                
                logits = model(img_emb, txt_emb)
                preds = torch.argmax(logits, dim=1)
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())
        
        # Calculate metrics
        f1 = f1_score(all_labels, all_preds, average='macro')
        avg_train_loss = total_loss / len(train_loader)
        
        # Record metrics
        metrics['epochs'].append(epoch + 1)
        metrics['train_losses'].append(avg_train_loss)
        metrics['val_f1_macro'].append(f1)
        
        print(f"\nEpoch {epoch+1} | Train Loss: {avg_train_loss:.4f} | Val F1: {f1:.4f}")
        
        if shedular_step:
            scheduler.step(f1)
        
        # Check for new best score
        if f1 > best_f1:
            best_f1 = f1
            metrics['best_f1'] = best_f1
            metrics['best_epoch'] = epoch + 1
            epochs_no_improve = 0
            
            # Save model
            torch.save(model.state_dict(), model_save_path)
            
            # Generate and save classification report
            report = classification_report(
                all_labels, 
                all_preds,
                output_dict=True
            )
            metrics['classification_reports'][f'epoch_{epoch+1}'] = report
            
            # Save metrics to file
            with open(metrics_file_path, 'w') as f:
                json.dump(metrics, f, indent=4)
                
        else:
            epochs_no_improve += 1
            if epochs_no_improve == early_stop_patience:
                print(f"Early stopping triggered at epoch {epoch+1}")
                # Save metrics before early stopping
                with open(metrics_file_path, 'w') as f:
                    json.dump(metrics, f, indent=4)
                break
        
        # Save metrics after each epoch (in case of interruption)
        with open(metrics_file_path, 'w') as f:
            json.dump(metrics, f, indent=4)

    print("Training complete. Loading best model.")
    model.load_state_dict(torch.load(model_save_path))
    
    return model, best_f1

In [8]:
import torch
import pandas as pd

def generate_predictions(model, test_loader, output_path="prediction.csv", device='cuda'):
    """
    Generate predictions using a trained model and save them to a CSV file.
    
    Args:
        model: Trained PyTorch model
        test_loader: DataLoader for test data
        output_path (str): Path to save predictions CSV (default: "prediction.csv")
        device (str): Device to use for inference ('cuda' or 'cpu')
    """
    model.eval()
    predictions = []
    ids = []

    with torch.no_grad():
        for batch in test_loader:
            # Move batch data to device
            img_emb = batch['image_embedding'].to(device, non_blocking=True)
            txt_emb = batch['text_embedding'].to(device, non_blocking=True)
                    
            # Get model predictions
            logits = model(img_emb, txt_emb)
            preds = torch.argmax(logits, dim=1)

            # Store predictions and IDs
            predictions.extend(preds.cpu().numpy())
            
            # Handle IDs (convert to list if tensor)
            batch_ids = batch['id']
            if isinstance(batch_ids, torch.Tensor):
                batch_ids = batch_ids.tolist()
            ids.extend(batch_ids)

    # Convert numerical predictions to labels
    label_map = {0: 'not-hate', 1: 'hate'}
    pred_labels = [label_map[p] for p in predictions]

    # Save to CSV
    df = pd.DataFrame({
        'id': ids,
        'prediction': pred_labels
    })
    df.to_csv(output_path, index=False)
    print(f"Predictions saved to {output_path}")


In [9]:
import pandas as pd
from sklearn.metrics import (classification_report, roc_auc_score, 
                             f1_score, precision_score, recall_score, 
                             accuracy_score, confusion_matrix)
import json

def evaluate_classification_performance(true_labels_path, predictions_path, output_file_path='classification_metrics.json'):
    """
    Evaluate classification performance by comparing true labels with predictions and save results to a file.
    
    Args:
        true_labels_path (str): Path to CSV file containing true labels (columns: id, test_label)
        predictions_path (str): Path to CSV file containing predictions (columns: id, prediction)
        output_file_path (str): Path to save the evaluation metrics (default: 'classification_metrics.json')
    """
    # Load the data
    test_df = pd.read_csv(true_labels_path)
    pred_df = pd.read_csv(predictions_path)

    # Initialize results dictionary
    results = {
        'classification_report': {},
        'additional_metrics': {},
        'confusion_matrix': []
    }

    # Verify the IDs match (just in case)
    if not (test_df['id'] == pred_df['id']).all():
        print("Warning: IDs don't match between files! Results may be invalid.")
        # Alternative approach if IDs don't match:
        merged = pd.merge(test_df, pred_df, on='id', how='inner')
        y_true = merged['testing_label']
        y_pred = merged['prediction']
    else:
        y_true = test_df['testing_label']
        y_pred = pred_df['prediction']

    # For binary classification, convert to numerical if needed
    label_map = {'hate': 1, 'not-hate': 0}
    if y_true.dtype == 'object':
        y_true_num = y_true.map(label_map)
        y_pred_num = y_pred.map(label_map)
    else:
        y_true_num = y_true
        y_pred_num = y_pred

    # Calculate and store classification report
    clf_report = classification_report(y_true, y_pred, target_names=['not-hate', 'hate'], output_dict=True)
    results['classification_report'] = clf_report
    
    # Calculate and store additional metrics
    results['additional_metrics'] = {
        'accuracy': accuracy_score(y_true_num, y_pred_num),
        'f1_macro': f1_score(y_true_num, y_pred_num, average='macro'),
        'precision_macro': precision_score(y_true_num, y_pred_num, average='macro'),
        'recall_macro': recall_score(y_true_num, y_pred_num, average='macro')
    }

    # Calculate and store confusion matrix
    results['confusion_matrix'] = confusion_matrix(y_true_num, y_pred_num).tolist()

    # Print results to console
    print("Classification Report:")
    print(classification_report(y_true, y_pred, target_names=['not-hate', 'hate']))

    print("\nAdditional Metrics:")
    print(f"Accuracy: {results['additional_metrics']['accuracy']:.4f}")
    print(f"F1 Macro: {results['additional_metrics']['f1_macro']:.4f}")
    print(f"Precision Macro: {results['additional_metrics']['precision_macro']:.4f}")
    print(f"Recall Macro: {results['additional_metrics']['recall_macro']:.4f}")

    print("\nConfusion Matrix:")
    print(results['confusion_matrix'])

    # Save results to file
    with open(output_file_path, 'w') as f:
        json.dump(results, f, indent=4)
    
    print(f"\nAll metrics saved to {output_file_path}")

In [10]:
DEVICE='cuda'

train_dataset = PrecomputedEmbeddingsDataset(
    jsonl_path="/kaggle/input/qcri-final-folder/QCRI_ARABIC_FOLDER/new_train.jsonl",
    embeddings_path="/kaggle/input/qcri-final-folder/QCRI_ARABIC_FOLDER/embeddings.json"
)

val_dataset = PrecomputedEmbeddingsDataset(
    jsonl_path="/kaggle/input/qcri-final-folder/QCRI_ARABIC_FOLDER/new_test.jsonl",
    embeddings_path="/kaggle/input/qcri-final-folder/QCRI_ARABIC_FOLDER/embeddings.json"
)

label_counts = Counter(train_dataset.labels)

sample = train_dataset[0]


# Create data loaders
train_loader = DataLoader(
    train_dataset, 
    batch_size=32, 
    shuffle=True,
    generator=g,
    worker_init_fn=worker_init_fn
)

val_loader = DataLoader(
    val_dataset, 
    batch_size=32, 
    shuffle=False,
    worker_init_fn=worker_init_fn
)


USE_FOCAL_LOSS=True

# Focal Loss for class imbalance (handles hard examples)
class FocalLoss(nn.Module):
    def __init__(self, alpha=None, gamma=2.0):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-ce_loss)
        loss = (1 - pt) ** self.gamma * ce_loss
        if self.alpha is not None:
            loss = self.alpha[targets] * loss
        return loss.mean()

# Class-aware loss weighting
loss_weights = torch.tensor([
    len(train_dataset) / (2 * label_counts[0]),
    len(train_dataset) / (2 * label_counts[1])
], dtype=torch.float32).to(DEVICE)

criterion = FocalLoss(alpha=loss_weights) if USE_FOCAL_LOSS \
            else nn.CrossEntropyLoss(weight=loss_weights)

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [12]:
model = PrecomputedEmbeddingsClassifier(
    image_embedding_dim=sample['image_embedding'].shape[0],
    text_embedding_dim=sample['text_embedding'].shape[0],
    fusion_type='gate', 
    projection_dim=256,
    num_classes=2
)

In [None]:
# Train model
trained_model, best_f1 = train_model(
    model=model,
    train_loader=train_loader,
    dev_loader=val_loader,
    criterion=criterion,
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    num_epochs=30,
    learning_rate=5e-5,
    early_stop_patience=10,
    model_save_path='best_model_custom_emb_fusion.pth',
    metrics_file_path='training_metrics_custom_emb_fusion.json'
)

In [40]:
test_dataset = PrecomputedEmbeddingsDataset(
    jsonl_path="/kaggle/input/qcri-final-folder/QCRI_ARABIC_FOLDER/task3_test_without_label.jsonl",
    embeddings_path="/kaggle/input/qcri-final-folder/QCRI_ARABIC_FOLDER/embeddings.json",
    is_test=True
)

test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [50]:
generate_predictions(trained_model, test_loader, "official_predictions.csv")

Predictions saved to official_predictions.csv


In [54]:
# Example usage:
evaluate_classification_performance(
    '/kaggle/input/qcri-final-folder/QCRI_ARABIC_FOLDER/task3_test_gold.txt',
    '/kaggle/working/official_predictions.csv',
    output_file_path='classification_metrics.json_custom_emb_fusion'
)

Classification Report:
              precision    recall  f1-score   support

    not-hate       0.76      0.54      0.63       200
        hate       0.74      0.89      0.81       300

    accuracy                           0.75       500
   macro avg       0.75      0.71      0.72       500
weighted avg       0.75      0.75      0.74       500


Additional Metrics:
Accuracy: 0.7480
F1 Macro: 0.7200
Precision Macro: 0.7518
Recall Macro: 0.7133

Confusion Matrix:
[[266, 34], [92, 108]]

All metrics saved to classification_metrics.json_custom_emb_fusion
