In [None]:

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import (
    RobertaTokenizer, RobertaForSequenceClassification,
    AutoTokenizer, AutoModel, AutoConfig,
    TrainingArguments, Trainer, EarlyStoppingCallback,
    get_linear_schedule_with_warmup
)
from datasets import load_dataset, Dataset as HFDataset
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
import logging
import os
import json
import warnings
from typing import Dict, List, Tuple, Optional
from datetime import datetime
import joblib
import random
from collections import Counter
import gc

# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation warnings are hidden too.
warnings.filterwarnings('ignore')
# Configure the root logger to emit INFO-level records and above.
logging.basicConfig(level=logging.INFO)

# Set device
# Prefer CUDA when it is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Create output directories
# exist_ok=True keeps this cell safe to re-run.
os.makedirs("./random_seed_analysis_results", exist_ok=True)
os.makedirs("./trained_models_seeds", exist_ok=True)

print("✅ Libraries imported and setup complete!")

In [None]:
# Cell 2: Utility Functions for Memory Management
def set_random_seed(seed: int):
    """Seed the Python, NumPy and PyTorch generators with ``seed``.

    When CUDA is available the CUDA generators are seeded as well and cuDNN
    is put into deterministic mode with benchmarking disabled.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    if not torch.cuda.is_available():
        return

    # Seed the current device first, then every visible device.
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

def clear_memory():
    """Empty the CUDA caching allocator (when CUDA is present), then run the garbage collector."""
    cuda_present = torch.cuda.is_available()
    if cuda_present:
        torch.cuda.empty_cache()
    gc.collect()

def print_memory_usage():
    """Print allocated and reserved CUDA memory in GiB; does nothing without CUDA."""
    if not torch.cuda.is_available():
        return

    bytes_per_gib = 1024**3
    allocated = torch.cuda.memory_allocated() / bytes_per_gib
    cached = torch.cuda.memory_reserved() / bytes_per_gib
    print(f"GPU Memory - Allocated: {allocated:.2f} GB, Cached: {cached:.2f} GB")

print("✅ Utility functions defined!")

: 

In [None]:
# Cell 3: Data Loading and Preprocessing Functions
def load_external_datasets() -> Tuple[Dict, Dict]:
    """Fetch the SST-2 and GoEmotions ("simplified") datasets.

    Returns:
        ``(sentiment_data, emotion_data)``; each is a dict holding the
        ``'train'`` and ``'validation'`` splits of its dataset.

    Raises:
        Whatever ``load_dataset`` raises; the failure is printed and then
        re-raised unchanged.
    """
    print("Loading external datasets...")

    def _train_and_validation(splits) -> Dict:
        # Keep only the two splits used by this notebook.
        return {
            'train': splits['train'],
            'validation': splits['validation'],
        }

    # SST-2 supplies the sentiment task.
    try:
        sentiment_data = _train_and_validation(load_dataset("sst2"))
        print(f"✅ SST-2 dataset loaded: {len(sentiment_data['train'])} train samples")
    except Exception as err:
        print(f"❌ Could not load SST-2: {err}")
        raise

    # GoEmotions supplies the emotion task.
    try:
        emotion_data = _train_and_validation(load_dataset("go_emotions", "simplified"))
        print(f"✅ GoEmotions dataset loaded: {len(emotion_data['train'])} train samples")
    except Exception as err:
        print(f"❌ Could not load GoEmotions: {err}")
        raise

    return sentiment_data, emotion_data

def prepare_reddit_evaluation_data(reddit_data_path: str) -> Dict:
    """Read the annotated Reddit CSV and encode its labels for evaluation.

    The CSV must provide ``text_content``, ``sentiment`` and ``emotion``
    columns. One ``LabelEncoder`` per task is fitted on the labels found in
    the file itself.

    Returns:
        Dict with the raw texts, the textual labels, the integer-encoded
        labels and both fitted encoders.
    """
    print(f"Loading Reddit evaluation data from {reddit_data_path}...")

    frame = pd.read_csv(reddit_data_path)

    # Materialise each label column once and reuse it below.
    sentiment_text = frame['sentiment'].tolist()
    emotion_text = frame['emotion'].tolist()

    # Encoders are fitted on this file's labels.
    sentiment_encoder = LabelEncoder()
    emotion_encoder = LabelEncoder()
    sentiment_encoder.fit(sentiment_text)
    emotion_encoder.fit(emotion_text)

    reddit_data = {
        'texts': frame['text_content'].tolist(),
        'sentiment_labels_text': sentiment_text,
        'emotion_labels_text': emotion_text,
        'sentiment_labels': sentiment_encoder.transform(sentiment_text),
        'emotion_labels': emotion_encoder.transform(emotion_text),
        'sentiment_encoder': sentiment_encoder,
        'emotion_encoder': emotion_encoder,
    }

    print(f"✅ Reddit data prepared: {len(reddit_data['texts'])} samples")
    print(f"   Sentiment classes: {list(sentiment_encoder.classes_)}")
    print(f"   Emotion classes: {list(emotion_encoder.classes_)}")

    return reddit_data

print("✅ Data loading functions defined!")

In [None]:
# Cell 4: Dataset Classes
class RobertaDataset(Dataset):
    """Map-style dataset for single-label classification.

    Every item is tokenized on access, padded/truncated to ``max_length``,
    and returned as flat ``input_ids`` / ``attention_mask`` tensors plus a
    ``labels`` tensor of dtype ``torch.long``.
    """

    def __init__(self, texts: List[str], labels: List[int], tokenizer, max_length: int = 512):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        raw_text = str(self.texts[idx])  # coerce non-string entries
        target = self.labels[idx]

        tokenized = self.tokenizer(
            raw_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        # The tokenizer output is flattened so each item is one-dimensional.
        item = {key: tokenized[key].flatten() for key in ('input_ids', 'attention_mask')}
        item['labels'] = torch.tensor(target, dtype=torch.long)
        return item

class MultiTaskDataset(Dataset):
    """Map-style dataset that yields one text with a sentiment and an emotion label.

    Every item is tokenized on access, padded/truncated to ``max_length``,
    and returned as flat ``input_ids`` / ``attention_mask`` tensors plus
    ``sentiment_labels`` and ``emotion_labels`` tensors of dtype ``torch.long``.
    """

    def __init__(self, texts: List[str], sentiment_labels: List[int],
                 emotion_labels: List[int], tokenizer, max_length: int = 128):
        self.texts = texts
        self.sentiment_labels, self.emotion_labels = sentiment_labels, emotion_labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        raw_text = str(self.texts[idx])  # coerce non-string entries
        sentiment_target = self.sentiment_labels[idx]
        emotion_target = self.emotion_labels[idx]

        tokenized = self.tokenizer(
            raw_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        # The tokenizer output is flattened so each item is one-dimensional.
        item = {key: tokenized[key].flatten() for key in ('input_ids', 'attention_mask')}
        item['sentiment_labels'] = torch.tensor(sentiment_target, dtype=torch.long)
        item['emotion_labels'] = torch.tensor(emotion_target, dtype=torch.long)
        return item

print("✅ Dataset classes defined!")

In [None]:
# Cell 5: Multitask Model Architecture (Same as your stad-valid.ipynb)
class MultiTaskTransformer(nn.Module):
    """Shared transformer encoder with separate sentiment and emotion heads.

    The encoder output passes through three multi-head self-attention
    blocks: one shared and one per task. For each block the first-token
    vector of the residual + layer-normed + dropped-out result is taken as
    its pooled feature. Each task head classifies the concatenation of the
    shared pooled vector and its own task pooled vector.
    """

    def __init__(
        self,
        model_name: str = "microsoft/deberta-base",
        sentiment_num_classes: int = 3,
        emotion_num_classes: int = 6,
        hidden_dropout_prob: float = 0.1,
        attention_dropout_prob: float = 0.1,
        classifier_dropout: float = 0.1
    ):
        super().__init__()

        self.model_name = model_name
        self.sentiment_num_classes = sentiment_num_classes
        self.emotion_num_classes = emotion_num_classes

        # Override the pretrained config's dropout before loading the encoder.
        encoder_config = AutoConfig.from_pretrained(model_name)
        encoder_config.hidden_dropout_prob = hidden_dropout_prob
        encoder_config.attention_probs_dropout_prob = attention_dropout_prob

        self.shared_encoder = AutoModel.from_pretrained(
            model_name,
            config=encoder_config,
            ignore_mismatched_sizes=True
        )
        hidden_size = self.shared_encoder.config.hidden_size

        def build_attention() -> nn.MultiheadAttention:
            return nn.MultiheadAttention(
                embed_dim=hidden_size,
                num_heads=8,
                dropout=attention_dropout_prob,
                batch_first=True
            )

        def build_head(num_classes: int) -> nn.Sequential:
            # Input is [shared_pooled ; task_pooled], hence 2 * hidden_size.
            return nn.Sequential(
                nn.Linear(hidden_size * 2, hidden_size),
                nn.ReLU(),
                nn.Dropout(classifier_dropout),
                nn.Linear(hidden_size, num_classes)
            )

        # The submodules below are created in a fixed order on purpose:
        # parameter initialisation consumes the global RNG, so reordering
        # would change the weights obtained for a given seed.
        self.sentiment_attention = build_attention()
        self.emotion_attention = build_attention()
        self.shared_attention = build_attention()

        self.sentiment_norm = nn.LayerNorm(hidden_size)
        self.emotion_norm = nn.LayerNorm(hidden_size)
        self.shared_norm = nn.LayerNorm(hidden_size)

        self.sentiment_dropout = nn.Dropout(classifier_dropout)
        self.emotion_dropout = nn.Dropout(classifier_dropout)
        self.shared_dropout = nn.Dropout(classifier_dropout)

        self.sentiment_classifier = build_head(sentiment_num_classes)
        self.emotion_classifier = build_head(emotion_num_classes)

        self._init_weights()

    def _init_weights(self):
        """Xavier-initialise the Linear weights of both heads and zero their biases."""
        heads = (self.sentiment_classifier, self.emotion_classifier)
        linear_layers = [
            layer for head in heads for layer in head if isinstance(layer, nn.Linear)
        ]
        for linear in linear_layers:
            nn.init.xavier_uniform_(linear.weight)
            nn.init.zeros_(linear.bias)

    @staticmethod
    def _attend_and_pool(attention, norm, dropout, sequence_output, padding_mask):
        """Self-attend, add the residual, normalise, apply dropout, return the first token."""
        attended, _ = attention(
            sequence_output, sequence_output, sequence_output,
            key_padding_mask=padding_mask
        )
        attended = norm(attended + sequence_output)
        attended = dropout(attended)
        return attended[:, 0, :]

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor,
                task: Optional[str] = None) -> Dict[str, torch.Tensor]:
        """Run the encoder and the requested task head(s).

        Args:
            input_ids: token ids passed to the shared encoder.
            attention_mask: mask passed to the encoder; its boolean inverse
                is used as ``key_padding_mask`` for the attention blocks.
            task: ``"sentiment"``, ``"emotion"`` or ``None`` for both.

        Returns:
            Dict with ``"sentiment_logits"`` and/or ``"emotion_logits"``.
        """
        sequence_output = self.shared_encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True
        ).last_hidden_state

        # True marks padded positions that attention must ignore.
        padding_mask = ~attention_mask.bool()

        shared_pooled = self._attend_and_pool(
            self.shared_attention, self.shared_norm, self.shared_dropout,
            sequence_output, padding_mask
        )

        branches = (
            ("sentiment", "sentiment_logits", self.sentiment_attention,
             self.sentiment_norm, self.sentiment_dropout, self.sentiment_classifier),
            ("emotion", "emotion_logits", self.emotion_attention,
             self.emotion_norm, self.emotion_dropout, self.emotion_classifier),
        )

        outputs = {}
        for branch_name, output_key, attention, norm, dropout, classifier in branches:
            if not (task is None or task == branch_name):
                continue
            task_pooled = self._attend_and_pool(
                attention, norm, dropout, sequence_output, padding_mask
            )
            features = torch.cat([shared_pooled, task_pooled], dim=-1)
            outputs[output_key] = classifier(features)

        return outputs

print("✅ Multitask model architecture defined!")

In [None]:
# Cell 6: Training Functions for Single-Task Models
def train_roberta_single_task(
    task_type: str,  # 'sentiment' or 'emotion'
    best_params: Dict,
    seed: int,
    sentiment_data: Dict = None,
    emotion_data: Dict = None,
    max_samples: int = 10000
) -> Tuple[nn.Module, LabelEncoder]:
    """Fine-tune ``roberta-base`` on one task and save it under ``./trained_models_seeds``.

    Args:
        task_type: ``'sentiment'`` trains on SST-2; any other value is
            treated as ``'emotion'`` and trains on GoEmotions.
        best_params: hyper-parameters; reads ``dropout_rate``, ``num_epochs``,
            ``batch_size``, ``learning_rate``, ``weight_decay`` and
            ``warmup_ratio``.
        seed: random seed. It is applied globally and also forwarded to
            ``TrainingArguments`` so the Trainer does not fall back to its
            own default seed.
        sentiment_data: dict with a ``'train'`` split (``sentence`` / ``label``).
        emotion_data: dict with a ``'train'`` split (``text`` / ``labels``).
        max_samples: number of leading training rows to consider.

    Returns:
        ``(model, label_encoder)``. ``label_encoder.classes_`` lists the class
        names in the order of the model's output indices.

    Raises:
        ValueError: if the dataset for ``task_type`` is missing or no usable
            training example remains after filtering.
    """
    print(f"🚀 Training RoBERTa {task_type} model with seed {seed}")
    set_random_seed(seed)
    clear_memory()

    # Pick the dataset and column names for the requested task.
    if task_type == 'sentiment':
        raw_data, text_col, label_col = sentiment_data, 'sentence', 'label'
    else:  # emotion
        raw_data, text_col, label_col = emotion_data, 'text', 'labels'
    if raw_data is None:
        raise ValueError(f"No dataset supplied for task_type={task_type!r}")

    raw_texts = raw_data['train'][text_col][:max_samples]
    raw_labels = raw_data['train'][label_col][:max_samples]

    # Texts and labels are appended together so they can never go out of step.
    train_texts, train_labels = [], []

    if task_type == 'sentiment':
        label_names = ['Negative', 'Neutral', 'Positive']
        # NOTE(review): SST-2 has no neutral class. "Neutral" is synthesised by
        # relabelling ~10% of the positive examples at random; this is label
        # noise rather than real neutral data and is kept as originally designed.
        for text, label in zip(raw_texts, raw_labels):
            if label == 0:  # Negative
                mapped = 0
            elif label == 1:  # Positive (or synthetic Neutral)
                mapped = 1 if np.random.random() < 0.1 else 2
            else:
                continue  # unlabeled / unexpected value: drop text and label together
            train_texts.append(text)
            train_labels.append(mapped)

        # Guarantee that the Neutral class is present. The relabelled count is
        # capped so small samples neither crash nor get overwritten wholesale.
        if train_labels and 1 not in train_labels:
            n_neutral = min(50, max(1, len(train_labels) // 10))
            for idx in np.random.choice(len(train_labels), size=n_neutral, replace=False):
                train_labels[idx] = 1
    else:  # emotion
        label_names = ['Anger', 'Fear', 'Joy', 'No Emotion', 'Sadness', 'Surprise']
        # GoEmotions class id -> index into label_names. The first six GoEmotions
        # ids are admiration..caring, so they must not be used as-is.
        goemotions_to_target = {
            2: 0,    # anger    -> Anger
            14: 1,   # fear     -> Fear
            17: 2,   # joy      -> Joy
            27: 3,   # neutral  -> No Emotion
            25: 4,   # sadness  -> Sadness
            26: 5,   # surprise -> Surprise
        }
        for text, label in zip(raw_texts, raw_labels):
            # Multi-label rows are reduced to their first label.
            if isinstance(label, list):
                if not label:
                    continue
                label = label[0]
            target = goemotions_to_target.get(label)
            if target is None:
                continue
            train_texts.append(text)
            train_labels.append(target)

    if not train_texts:
        raise ValueError(f"No usable training examples for task_type={task_type!r}")

    # The encoder is not fitted; classes_ is assigned so index -> name is fixed.
    label_encoder = LabelEncoder()
    label_encoder.classes_ = np.array(label_names)

    # Initialize tokenizer and model
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    model = RobertaForSequenceClassification.from_pretrained(
        'roberta-base',
        num_labels=len(label_encoder.classes_),
        hidden_dropout_prob=best_params['dropout_rate'],
        attention_probs_dropout_prob=best_params['dropout_rate']
    )

    # Create dataset
    dataset = RobertaDataset(train_texts, train_labels, tokenizer, max_length=512)

    # Training arguments
    output_dir = f"./trained_models_seeds/roberta_{task_type}_seed_{seed}"
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=best_params['num_epochs'],
        per_device_train_batch_size=best_params['batch_size'],
        learning_rate=best_params['learning_rate'],
        weight_decay=best_params['weight_decay'],
        warmup_ratio=best_params['warmup_ratio'],
        logging_steps=100,
        save_strategy="no",  # Don't save during training
        dataloader_num_workers=0,
        report_to="none",
        # Without this the Trainer uses its default seed for every run, which
        # defeats the purpose of a per-seed analysis.
        seed=seed
    )

    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        tokenizer=tokenizer
    )

    # Train
    trainer.train()

    # Save model, tokenizer and encoder side by side
    os.makedirs(output_dir, exist_ok=True)
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    joblib.dump(label_encoder, os.path.join(output_dir, f'{task_type}_encoder.pkl'))

    print(f"✅ RoBERTa {task_type} model trained and saved with seed {seed}")
    clear_memory()

    return model, label_encoder

print("✅ Single-task training functions defined!")

In [None]:
# Cell 7: Training Functions for Multitask Models
def prepare_multitask_data(sentiment_data: Dict, emotion_data: Dict, max_samples: int = 5000):
    """Build one training set that covers both tasks.

    SST-2 sentences only have a sentiment label and GoEmotions texts only
    have an emotion label, so every returned text carries a real label for
    its own task and ``-100`` for the other one. ``-100`` is the default
    ``ignore_index`` of ``nn.CrossEntropyLoss``.

    Args:
        sentiment_data: dict with a ``'train'`` split (``sentence`` / ``label``).
        emotion_data: dict with a ``'train'`` split (``text`` / ``labels``).
        max_samples: upper bound on the examples taken per task.

    Returns:
        Dict with ``texts``, ``sentiment_labels``, ``emotion_labels`` (three
        lists of equal length: sentiment examples first, then the same number
        of emotion examples) plus ``sentiment_encoder`` and ``emotion_encoder``
        whose ``classes_`` give the class names by index.
    """
    ignore_index = -100

    # --- Sentiment (SST-2): 0 -> Negative(0), 1 -> Positive(2) ----------------
    # NOTE(review): SST-2 has no neutral class. "Neutral" is synthesised by
    # relabelling ~10% of the positive examples at random; kept as designed.
    sentiment_texts, sentiment_labels = [], []
    raw_sentences = sentiment_data['train']['sentence'][:max_samples]
    raw_sentiment = sentiment_data['train']['label'][:max_samples]
    for text, label in zip(raw_sentences, raw_sentiment):
        if label == 0:  # Negative
            mapped = 0
        elif label == 1:  # Positive (or synthetic Neutral)
            mapped = 1 if np.random.random() < 0.1 else 2
        else:
            continue  # drop text and label together so they stay aligned
        sentiment_texts.append(text)
        sentiment_labels.append(mapped)

    # --- Emotion (GoEmotions) --------------------------------------------------
    # GoEmotions class id -> target index. The first six GoEmotions ids are
    # admiration..caring, so they must not be used as-is.
    goemotions_to_target = {
        2: 0,    # anger    -> Anger
        14: 1,   # fear     -> Fear
        17: 2,   # joy      -> Joy
        27: 3,   # neutral  -> No Emotion
        25: 4,   # sadness  -> Sadness
        26: 5,   # surprise -> Surprise
    }
    emotion_texts, emotion_labels = [], []
    for text, label in zip(emotion_data['train']['text'], emotion_data['train']['labels']):
        if len(emotion_texts) >= max_samples:
            break
        # Multi-label rows are reduced to their first label.
        if isinstance(label, list):
            if not label:
                continue
            label = label[0]
        target = goemotions_to_target.get(label)
        if target is None:
            continue
        emotion_texts.append(text)
        emotion_labels.append(target)

    # Use the same number of examples per task to keep the tasks balanced.
    per_task = min(len(sentiment_texts), len(emotion_texts))
    sentiment_texts = sentiment_texts[:per_task]
    sentiment_labels = sentiment_labels[:per_task]
    emotion_texts = emotion_texts[:per_task]
    emotion_labels = emotion_labels[:per_task]

    # Guarantee that the Neutral class is present. The relabelled count is
    # capped so small samples neither crash nor get overwritten wholesale.
    if sentiment_labels and 1 not in sentiment_labels:
        n_neutral = min(50, max(1, len(sentiment_labels) // 10))
        for idx in np.random.choice(len(sentiment_labels), size=n_neutral, replace=False):
            sentiment_labels[idx] = 1

    combined_texts = list(sentiment_texts) + list(emotion_texts)
    combined_sentiment_labels = sentiment_labels + [ignore_index] * per_task
    combined_emotion_labels = [ignore_index] * per_task + emotion_labels

    # The encoders are not fitted; classes_ is assigned so index -> name is fixed.
    sentiment_encoder = LabelEncoder()
    emotion_encoder = LabelEncoder()
    sentiment_encoder.classes_ = np.array(['Negative', 'Neutral', 'Positive'])
    emotion_encoder.classes_ = np.array(['Anger', 'Fear', 'Joy', 'No Emotion', 'Sadness', 'Surprise'])

    return {
        'texts': combined_texts,
        'sentiment_labels': combined_sentiment_labels,
        'emotion_labels': combined_emotion_labels,
        'sentiment_encoder': sentiment_encoder,
        'emotion_encoder': emotion_encoder
    }


def _balanced_class_weights(labels, num_classes: int, ignore_index: int = -100) -> np.ndarray:
    """Return one 'balanced' weight per class, skipping ``ignore_index`` entries.

    Classes that do not occur keep weight 1.0, so the result always has
    ``num_classes`` entries as ``nn.CrossEntropyLoss`` requires.
    """
    valid = np.asarray([label for label in labels if label != ignore_index], dtype=np.int64)
    weights = np.ones(num_classes, dtype=np.float32)
    if valid.size:
        present = np.unique(valid)
        weights[present] = compute_class_weight('balanced', classes=present, y=valid)
    return weights


def _masked_task_loss(criterion, logits: torch.Tensor, labels: torch.Tensor,
                      ignore_index: int = -100) -> torch.Tensor:
    """Apply ``criterion`` to the rows whose label is not ``ignore_index``.

    When a batch holds no example for the task, a zero that is still attached
    to the graph is returned, so ``backward()`` works and the mean over an
    empty selection (NaN) is never computed.
    """
    valid = labels != ignore_index
    if not bool(valid.any()):
        return logits.sum() * 0.0
    return criterion(logits[valid], labels[valid])


def train_multitask_model(
    model_name: str,  # "microsoft/deberta-base" or "vinai/bertweet-base"
    best_params: Dict,
    seed: int,
    sentiment_data: Dict,
    emotion_data: Dict,
    max_samples: int = 2000
) -> Tuple[nn.Module, "LabelEncoder", "LabelEncoder"]:
    """Train a ``MultiTaskTransformer`` on SST-2 + GoEmotions and save it.

    Each batch contributes ``alpha * sentiment_loss + (1 - alpha) * emotion_loss``,
    where each task loss is computed only on the examples that carry a label
    for that task.

    Args:
        model_name: Hugging Face model id of the shared encoder.
        best_params: hyper-parameters; reads ``hidden_dropout_prob``,
            ``classifier_dropout``, ``max_length``, ``batch_size``,
            ``learning_rate``, ``weight_decay``, ``num_epochs``,
            ``warmup_ratio`` and ``alpha``. ``attention_dropout_prob`` is
            optional and defaults to ``hidden_dropout_prob``.
        seed: random seed applied before data preparation and model creation.
        sentiment_data: dict with a ``'train'`` split of SST-2.
        emotion_data: dict with a ``'train'`` split of GoEmotions.
        max_samples: upper bound on the examples taken per task.

    Returns:
        ``(model, sentiment_encoder, emotion_encoder)``.

    Raises:
        ValueError: if no training example could be prepared.

    Side effects:
        Writes ``pytorch_model.bin``, ``config.json``, the tokenizer files and
        both encoders to ``./trained_models_seeds/<type>_multitask_seed_<seed>``.
    """
    model_type = "deberta" if "deberta" in model_name else "bertweet"
    print(f"🚀 Training {model_type} multitask model with seed {seed}")
    set_random_seed(seed)
    clear_memory()

    # Prepare multitask data
    data = prepare_multitask_data(sentiment_data, emotion_data, max_samples)
    if not data['texts']:
        raise ValueError("No usable training examples for multitask training")

    num_sentiment_classes = len(data['sentiment_encoder'].classes_)
    num_emotion_classes = len(data['emotion_encoder'].classes_)

    # Initialize tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Initialize model
    model = MultiTaskTransformer(
        model_name=model_name,
        sentiment_num_classes=num_sentiment_classes,
        emotion_num_classes=num_emotion_classes,
        hidden_dropout_prob=best_params['hidden_dropout_prob'],
        attention_dropout_prob=best_params.get(
            'attention_dropout_prob', best_params['hidden_dropout_prob']
        ),
        classifier_dropout=best_params['classifier_dropout']
    ).to(device)

    # Create dataset and loader
    dataset = MultiTaskDataset(
        texts=data['texts'],
        sentiment_labels=data['sentiment_labels'],
        emotion_labels=data['emotion_labels'],
        tokenizer=tokenizer,
        max_length=best_params['max_length']
    )
    dataloader = DataLoader(dataset, batch_size=best_params['batch_size'], shuffle=True)

    # Optimizer and linear warmup/decay schedule over all optimisation steps
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=best_params['learning_rate'],
        weight_decay=best_params['weight_decay']
    )
    total_steps = len(dataloader) * best_params['num_epochs']
    warmup_steps = int(total_steps * best_params['warmup_ratio'])
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps
    )

    # Class weights computed on the real labels of each task only
    sentiment_weights = torch.tensor(
        _balanced_class_weights(data['sentiment_labels'], num_sentiment_classes),
        dtype=torch.float32, device=device
    )
    emotion_weights = torch.tensor(
        _balanced_class_weights(data['emotion_labels'], num_emotion_classes),
        dtype=torch.float32, device=device
    )

    # Loss functions
    sentiment_criterion = nn.CrossEntropyLoss(weight=sentiment_weights)
    emotion_criterion = nn.CrossEntropyLoss(weight=emotion_weights)

    alpha = best_params['alpha']

    print(f"Starting training for {best_params['num_epochs']} epochs...")

    model.train()
    for epoch in range(best_params['num_epochs']):
        total_loss = 0.0
        for batch in dataloader:
            optimizer.zero_grad()

            # Move batch to device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            sentiment_labels = batch['sentiment_labels'].to(device)
            emotion_labels = batch['emotion_labels'].to(device)

            # Forward pass through both heads
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            # Each head is scored only on the examples labelled for its task
            sentiment_loss = _masked_task_loss(
                sentiment_criterion, outputs['sentiment_logits'], sentiment_labels
            )
            emotion_loss = _masked_task_loss(
                emotion_criterion, outputs['emotion_logits'], emotion_labels
            )

            # Combined loss
            total_loss_batch = alpha * sentiment_loss + (1 - alpha) * emotion_loss
            total_loss += total_loss_batch.item()

            # Backward pass
            total_loss_batch.backward()
            optimizer.step()
            scheduler.step()

        avg_loss = total_loss / len(dataloader)
        print(f"Epoch {epoch + 1}/{best_params['num_epochs']}, Average Loss: {avg_loss:.4f}")

    # Save model
    output_dir = f"./trained_models_seeds/{model_type}_multitask_seed_{seed}"
    os.makedirs(output_dir, exist_ok=True)

    # Save model state dict
    torch.save(model.state_dict(), os.path.join(output_dir, "pytorch_model.bin"))

    # Save config
    config = {
        "model_name": model_name,
        "sentiment_num_classes": num_sentiment_classes,
        "emotion_num_classes": num_emotion_classes,
        "model_type": "MultiTaskTransformer"
    }
    with open(os.path.join(output_dir, "config.json"), 'w') as f:
        json.dump(config, f, indent=2)

    # Save tokenizer and encoders
    tokenizer.save_pretrained(output_dir)
    joblib.dump(data['sentiment_encoder'], os.path.join(output_dir, 'sentiment_encoder.pkl'))
    joblib.dump(data['emotion_encoder'], os.path.join(output_dir, 'emotion_encoder.pkl'))

    print(f"✅ {model_type} multitask model trained and saved with seed {seed}")
    clear_memory()

    return model, data['sentiment_encoder'], data['emotion_encoder']

print("✅ Multitask training functions defined!")

In [None]:
# Cell 8: Evaluation Functions
def evaluate_single_task_model(model, tokenizer, label_encoder, reddit_data: Dict, task_type: str,
                               batch_size: int = 16, max_length: int = 512) -> Dict:
    """Evaluate a single-task sequence classifier on the Reddit data.

    Args:
        model: classifier whose forward output exposes ``.logits``.
        tokenizer: tokenizer matching ``model``.
        label_encoder: encoder whose ``classes_`` length bounds the valid
            prediction ids.
        reddit_data: dict with ``'texts'`` and ``'<task_type>_labels'``.
        task_type: ``'sentiment'`` or ``'emotion'``; selects the label key.
        batch_size: number of texts per forward pass.
        max_length: padding/truncation length passed to the tokenizer.

    Returns:
        Dict with ``accuracy``, ``macro_f1``, ``predictions``,
        ``confidences`` and ``true_labels``.
    """
    model.eval()
    model.to(device)

    texts = reddit_data['texts']
    true_labels = reddit_data[f'{task_type}_labels']
    num_classes = len(label_encoder.classes_)

    predictions = []
    confidences = []

    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            # Coerce to str (as the training datasets do) so that non-string
            # cells such as NaN from the CSV do not break the tokenizer.
            batch_texts = [str(text) for text in texts[start:start + batch_size]]

            inputs = tokenizer(
                batch_texts,
                return_tensors="pt",
                truncation=True,
                padding="max_length",
                max_length=max_length
            )
            inputs = {k: v.to(device) for k, v in inputs.items()}

            logits = model(**inputs).logits
            probs = F.softmax(logits, dim=-1)
            preds = torch.argmax(logits, dim=-1)

            # Confidence is the probability of the predicted class. It is read
            # before clamping and moved to the host once per batch.
            batch_confidences = probs.gather(1, preds.unsqueeze(1)).squeeze(1)
            confidences.extend(batch_confidences.tolist())

            # Ids outside the encoder's range fall back to class 0.
            predictions.extend(
                pred_id if pred_id < num_classes else 0 for pred_id in preds.tolist()
            )

    # Calculate metrics
    accuracy = accuracy_score(true_labels, predictions)
    macro_f1 = f1_score(true_labels, predictions, average='macro', zero_division=0)

    return {
        'accuracy': accuracy,
        'macro_f1': macro_f1,
        'predictions': predictions,
        'confidences': confidences,
        'true_labels': true_labels
    }

def evaluate_multitask_model(model, tokenizer, sentiment_encoder, emotion_encoder,
                           reddit_data: Dict, max_length: int = 128,
                           batch_size: int = 8) -> Dict:
    """Evaluate a multitask model on the Reddit data for both tasks.

    Args:
        model: model whose forward returns a dict with ``'sentiment_logits'``
            and ``'emotion_logits'``.
        tokenizer: tokenizer matching ``model``.
        sentiment_encoder: encoder whose ``classes_`` length bounds the valid
            sentiment ids.
        emotion_encoder: encoder whose ``classes_`` length bounds the valid
            emotion ids.
        reddit_data: dict with ``'texts'``, ``'sentiment_labels'`` and
            ``'emotion_labels'``.
        max_length: padding/truncation length passed to the tokenizer.
        batch_size: number of texts per forward pass.

    Returns:
        Dict with per-task ``accuracy`` / ``macro_f1`` / ``predictions`` /
        ``confidences`` under ``'sentiment'`` and ``'emotion'``, plus
        ``combined_accuracy`` and ``combined_f1`` (the mean of the two tasks).
    """
    model.eval()
    model.to(device)

    texts = reddit_data['texts']
    true_sentiment_labels = reddit_data['sentiment_labels']
    true_emotion_labels = reddit_data['emotion_labels']

    # Per task: number of valid classes, collected predictions and confidences.
    collected = {
        'sentiment': (len(sentiment_encoder.classes_), [], []),
        'emotion': (len(emotion_encoder.classes_), [], []),
    }

    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            # Coerce to str (as the training datasets do) so that non-string
            # cells such as NaN from the CSV do not break the tokenizer.
            batch_texts = [str(text) for text in texts[start:start + batch_size]]

            inputs = tokenizer(
                batch_texts,
                return_tensors="pt",
                truncation=True,
                padding="max_length",
                max_length=max_length
            )
            inputs = {k: v.to(device) for k, v in inputs.items()}

            outputs = model(input_ids=inputs['input_ids'],
                            attention_mask=inputs['attention_mask'])

            for task_name, (num_classes, task_preds, task_confs) in collected.items():
                logits = outputs[f'{task_name}_logits']
                probs = F.softmax(logits, dim=-1)
                preds = torch.argmax(logits, dim=-1)

                # Confidence is the probability of the predicted class. It is
                # read before clamping and moved to the host once per batch.
                task_confs.extend(probs.gather(1, preds.unsqueeze(1)).squeeze(1).tolist())

                # Ids outside the encoder's range fall back to class 0.
                task_preds.extend(
                    pred_id if pred_id < num_classes else 0 for pred_id in preds.tolist()
                )

    _, sentiment_predictions, sentiment_confidences = collected['sentiment']
    _, emotion_predictions, emotion_confidences = collected['emotion']

    # Calculate metrics
    sentiment_accuracy = accuracy_score(true_sentiment_labels, sentiment_predictions)
    sentiment_f1 = f1_score(true_sentiment_labels, sentiment_predictions, average='macro', zero_division=0)

    emotion_accuracy = accuracy_score(true_emotion_labels, emotion_predictions)
    emotion_f1 = f1_score(true_emotion_labels, emotion_predictions, average='macro', zero_division=0)

    return {
        'sentiment': {
            'accuracy': sentiment_accuracy,
            'macro_f1': sentiment_f1,
            'predictions': sentiment_predictions,
            'confidences': sentiment_confidences
        },
        'emotion': {
            'accuracy': emotion_accuracy,
            'macro_f1': emotion_f1,
            'predictions': emotion_predictions,
            'confidences': emotion_confidences
        },
        'combined_accuracy': (sentiment_accuracy + emotion_accuracy) / 2,
        'combined_f1': (sentiment_f1 + emotion_f1) / 2
    }

print("✅ Evaluation functions defined!")

In [None]:
# Cell 9: Main Random Seed Analysis Function
def _evaluate_single_task_seed(
    task: str,
    params: Dict,
    seed: int,
    sentiment_data,
    emotion_data,
    reddit_data,
    max_training_samples: int,
) -> Dict:
    """Train one single-task RoBERTa model for `seed` and evaluate it on the Reddit data."""
    model, encoder = train_roberta_single_task(
        task, params, seed,
        sentiment_data, emotion_data, max_training_samples
    )

    # The tokenizer is loaded from the per-seed model directory
    # (presumably written by the training function -- confirm).
    tokenizer = RobertaTokenizer.from_pretrained(
        f"./trained_models_seeds/roberta_{task}_seed_{seed}"
    )

    results = evaluate_single_task_model(model, tokenizer, encoder, reddit_data, task)
    print(f"   Accuracy: {results['accuracy']:.4f}, Macro F1: {results['macro_f1']:.4f}")

    # Drop the references first so clear_memory() can actually release them.
    del model, tokenizer
    clear_memory()
    return results


def _evaluate_multitask_seed(
    base_model: str,
    model_key: str,
    params: Dict,
    seed: int,
    sentiment_data,
    emotion_data,
    reddit_data,
    max_training_samples: int,
) -> Dict:
    """Train one multitask model for `seed` and evaluate both heads on the Reddit data."""
    model, sent_enc, emot_enc = train_multitask_model(
        base_model, params, seed,
        sentiment_data, emotion_data, max_training_samples
    )

    # The tokenizer is loaded from the per-seed model directory
    # (presumably written by the training function -- confirm).
    tokenizer = AutoTokenizer.from_pretrained(
        f"./trained_models_seeds/{model_key}_seed_{seed}"
    )

    results = evaluate_multitask_model(
        model, tokenizer, sent_enc, emot_enc, reddit_data,
        params['max_length']
    )
    print(f"   Sentiment - Accuracy: {results['sentiment']['accuracy']:.4f}, F1: {results['sentiment']['macro_f1']:.4f}")
    print(f"   Emotion - Accuracy: {results['emotion']['accuracy']:.4f}, F1: {results['emotion']['macro_f1']:.4f}")

    # Drop the references first so clear_memory() can actually release them.
    del model, tokenizer
    clear_memory()
    return results


def run_random_seed_analysis(
    reddit_data_path: str = "annotated_reddit_posts.csv",
    seeds: Optional[List[int]] = None,
    max_training_samples: int = 5000
):
    """Run complete random seed analysis

    For every seed, four models are trained and then evaluated on the Reddit
    data, in this order: RoBERTa sentiment, RoBERTa emotion, DeBERTa multitask,
    BERTweet multitask. The per-seed results are then aggregated with
    `analyze_seed_stability` and written to disk with `save_results`.

    Args:
        reddit_data_path: Path handed to `prepare_reddit_evaluation_data`.
        seeds: Random seeds to run. `None` (the default) means
            `[42, 123, 456, 789, 999]`. Any iterable of ints is accepted and
            copied, so the caller's object is never shared or mutated.
        max_training_samples: Forwarded unchanged to the training functions.

    Returns:
        Tuple `(all_results, stability_analysis)`: `all_results` maps
        seed -> model key -> evaluation result dict, and `stability_analysis`
        is whatever `analyze_seed_stability` returns for it.

    Raises:
        ValueError: If `seeds` is empty. This is checked before any dataset is
            loaded so a bad call fails immediately instead of after the
            expensive setup.
    """
    # Avoid a mutable default argument; materialise the caller's iterable once.
    seeds = [42, 123, 456, 789, 999] if seeds is None else list(seeds)
    if not seeds:
        raise ValueError("seeds must contain at least one random seed")

    print("🎲 STARTING RANDOM SEED ANALYSIS")
    print("=" * 60)
    print(f"Seeds to test: {seeds}")
    print(f"Max training samples per dataset: {max_training_samples}")

    # Load external datasets
    print("\n📂 Loading external datasets...")
    sentiment_data, emotion_data = load_external_datasets()

    # Load Reddit evaluation data
    print("\n📂 Loading Reddit evaluation data...")
    reddit_data = prepare_reddit_evaluation_data(reddit_data_path)

    # Define best parameters for each model
    best_params = {
        'roberta_sentiment': {
            'learning_rate': 1.289795048085554e-05,
            'batch_size': 16,
            'dropout_rate': 0.2218455076693483,
            'num_epochs': 3,
            'warmup_ratio': 0.15263495397682356,
            'weight_decay': 0.13764422318448438
        },
        'roberta_emotion': {
            'learning_rate': 0.00018843871051154592,
            'batch_size': 32,
            'dropout_rate': 0.15859725997693935,
            'num_epochs': 7,
            'warmup_ratio': 0.16826426457074994,
            'weight_decay': 0.21559177659152684
        },
        'deberta_multitask': {
            'learning_rate': 2.858051065806938e-05,
            'batch_size': 2,
            'alpha': 0.5369658275448169,
            'hidden_dropout_prob': 0.061612603179999434,
            'classifier_dropout': 0.28226345557043153,
            'weight_decay': 0.017881888245041864,
            'warmup_ratio': 0.05975773894779193,
            'num_epochs': 5,
            'max_length': 128
        },
        'bertweet_multitask': {
            'learning_rate': 1.3352204399988585e-05,
            'batch_size': 4,
            'alpha': 0.4503170063321093,
            'hidden_dropout_prob': 0.1361646589006065,
            'classifier_dropout': 0.16618433315983214,
            'weight_decay': 0.06911326050717913,
            'warmup_ratio': 0.15511878702345422,
            'num_epochs': 5,
            'max_length': 128
        }
    }

    # Run tables: (progress badge, display label, ...). Order matters because
    # it fixes the training order within each seed.
    single_task_runs = [
        ("1️⃣", "RoBERTa Sentiment", "sentiment"),
        ("2️⃣", "RoBERTa Emotion", "emotion"),
    ]
    multitask_runs = [
        ("3️⃣", "DeBERTa Multitask", "deberta_multitask", "microsoft/deberta-base"),
        ("4️⃣", "BERTweet Multitask", "bertweet_multitask", "vinai/bertweet-base"),
    ]

    # Store results for each seed
    all_results = {}

    for seed in seeds:
        print(f"\n🌱 TRAINING AND EVALUATING WITH SEED {seed}")
        print("-" * 50)

        seed_results = {}

        for badge, label, task in single_task_runs:
            print(f"\n{badge} {label} (Seed {seed})")
            model_key = f"roberta_{task}"
            seed_results[model_key] = _evaluate_single_task_seed(
                task, best_params[model_key], seed,
                sentiment_data, emotion_data, reddit_data, max_training_samples
            )

        for badge, label, model_key, base_model in multitask_runs:
            print(f"\n{badge} {label} (Seed {seed})")
            seed_results[model_key] = _evaluate_multitask_seed(
                base_model, model_key, best_params[model_key], seed,
                sentiment_data, emotion_data, reddit_data, max_training_samples
            )

        all_results[seed] = seed_results

        print(f"\n✅ Completed evaluation for seed {seed}")

    # Analyze stability across seeds
    print(f"\n📊 ANALYZING STABILITY ACROSS SEEDS")
    print("=" * 60)

    stability_analysis = analyze_seed_stability(all_results, seeds)

    # Save results
    save_results(all_results, stability_analysis, seeds)

    return all_results, stability_analysis

def analyze_seed_stability(all_results: Dict, seeds: List[int]) -> Dict:
    """Analyze stability of models across different seeds

    Args:
        all_results: Mapping seed -> model key -> evaluation result. Single-task
            results expose 'accuracy' and 'macro_f1' at the top level; models
            whose key ends in '_multitask' expose them under result[task].
        seeds: Seeds to aggregate over, in the order their values should be
            listed. Seeds with no entry in `all_results`, or with no result for
            a given model, are skipped rather than raising.

    Returns:
        Dict keyed by "<model>_<task>" with 'accuracy_mean', 'accuracy_std',
        'f1_mean', 'f1_std' and the raw per-seed 'accuracy_values' /
        'f1_values'. Standard deviations use NumPy's default (population,
        ddof=0). Model/task pairs with no data at all are omitted.
    """

    stability_stats = {}

    # Define model-task combinations
    evaluations = [
        ('roberta_sentiment', 'sentiment'),
        ('roberta_emotion', 'emotion'),
        ('deberta_multitask', 'sentiment'),
        ('deberta_multitask', 'emotion'),
        ('bertweet_multitask', 'sentiment'),
        ('bertweet_multitask', 'emotion')
    ]

    for model_name, task in evaluations:
        print(f"\n🔍 {model_name.upper()} - {task.upper()}")

        is_multitask = model_name.endswith('_multitask')
        accuracies = []
        f1_scores = []

        for seed in seeds:
            # Use .get so a seed that never produced results is treated the
            # same way as a seed that is merely missing this model.
            seed_results = all_results.get(seed)
            if not seed_results or model_name not in seed_results:
                continue

            result = seed_results[model_name]
            # Multitask results nest the metrics one level down, per task.
            metrics = result[task] if is_multitask else result

            accuracies.append(metrics['accuracy'])
            f1_scores.append(metrics['macro_f1'])

        if not accuracies:
            continue

        acc_mean = np.mean(accuracies)
        acc_std = np.std(accuracies)
        f1_mean = np.mean(f1_scores)
        f1_std = np.std(f1_scores)

        stability_stats[f"{model_name}_{task}"] = {
            'accuracy_mean': acc_mean,
            'accuracy_std': acc_std,
            'f1_mean': f1_mean,
            'f1_std': f1_std,
            'accuracy_values': accuracies,
            'f1_values': f1_scores
        }

        print(f"   Accuracy: {acc_mean:.4f} ± {acc_std:.4f}")
        print(f"   Macro F1: {f1_mean:.4f} ± {f1_std:.4f}")

    return stability_stats

def _to_json_compatible(value):
    """Recursively turn NumPy scalars/arrays and tuples into plain Python objects for json.dump."""
    if isinstance(value, dict):
        return {key: _to_json_compatible(item) for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        return [_to_json_compatible(item) for item in value]
    if isinstance(value, np.ndarray):
        return value.tolist()
    if isinstance(value, np.generic):
        # .item() yields the matching native type (int stays int, float stays float).
        return value.item()
    return value


def save_results(all_results: Dict, stability_analysis: Dict, seeds: List[int]):
    """Save results to files

    Writes three timestamped files into ./random_seed_analysis_results/
    (created if missing): the raw per-seed results as JSON, the stability
    statistics as JSON, and a plain-text summary report.

    Args:
        all_results: Mapping seed -> model key -> result dict. Model entries
            that are not dicts are left out of the raw JSON.
        stability_analysis: Mapping "<model>_<task>" -> stats dict containing
            at least 'accuracy_mean', 'accuracy_std', 'f1_mean' and 'f1_std'.
        seeds: Seeds that were tested; only echoed into the summary report.

    NumPy values are converted at any nesting depth, so lists of NumPy integers
    or arrays serialise too. If no entry exists for a task, the
    "Best Performers" section reports "N/A (no results)" instead of failing.
    """

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    output_dir = "./random_seed_analysis_results"
    os.makedirs(output_dir, exist_ok=True)

    # Save raw results. The payload is built before the file is opened so a
    # conversion error cannot leave a truncated JSON file behind.
    results_file = f"{output_dir}/raw_results_{timestamp}.json"
    serializable_results = {
        str(seed): {
            model: _to_json_compatible(results)
            for model, results in seed_results.items()
            if isinstance(results, dict)
        }
        for seed, seed_results in all_results.items()
    }
    with open(results_file, 'w', encoding='utf-8') as f:
        json.dump(serializable_results, f, indent=2)

    # Save stability analysis
    stability_file = f"{output_dir}/stability_analysis_{timestamp}.json"
    serializable_stability = {
        key: _to_json_compatible(stats)
        for key, stats in stability_analysis.items()
    }
    with open(stability_file, 'w', encoding='utf-8') as f:
        json.dump(serializable_stability, f, indent=2)

    # Create summary report. Explicit UTF-8 because the report contains '±',
    # which a non-UTF-8 locale default may fail to encode.
    summary_file = f"{output_dir}/summary_report_{timestamp}.txt"
    with open(summary_file, 'w', encoding='utf-8') as f:
        f.write("RANDOM SEED ANALYSIS SUMMARY REPORT\n")
        f.write("=" * 50 + "\n\n")
        f.write(f"Seeds tested: {seeds}\n")
        f.write(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")

        f.write("STABILITY ANALYSIS (Mean ± Std)\n")
        f.write("-" * 40 + "\n")

        for key, stats in stability_analysis.items():
            model_task = key.replace('_', ' ').title()
            f.write(f"\n{model_task}:\n")
            f.write(f"  Accuracy: {stats['accuracy_mean']:.4f} ± {stats['accuracy_std']:.4f}\n")
            f.write(f"  Macro F1: {stats['f1_mean']:.4f} ± {stats['f1_std']:.4f}\n")

        f.write(f"\nBest Performers (by mean F1 score):\n")
        f.write("-" * 30 + "\n")

        # Find best performers; a task with no candidates is reported as N/A
        # because max() on an empty list would raise ValueError.
        for label, task in (("Sentiment", "sentiment"), ("Emotion", "emotion")):
            candidates = [k for k in stability_analysis.keys() if task in k]
            if not candidates:
                f.write(f"{label}: N/A (no results)\n")
                continue
            best = max(candidates, key=lambda x: stability_analysis[x]['f1_mean'])
            f.write(f"{label}: {best.replace('_', ' ').title()} ")
            f.write(f"(F1: {stability_analysis[best]['f1_mean']:.4f})\n")

    print(f"\n💾 Results saved:")
    print(f"   Raw results: {results_file}")
    print(f"   Stability analysis: {stability_file}")
    print(f"   Summary report: {summary_file}")

# Notebook status marker: printed once this cell has finished executing,
# i.e. after run_random_seed_analysis, analyze_seed_stability and
# save_results above have been defined.
print("✅ Random seed analysis function defined!")

In [None]:
# Cell 10: Run the Analysis
if __name__ == "__main__":
    # Launch the full multi-seed experiment. Both return values are bound at
    # module scope under the names all_results / stability_analysis.
    all_results, stability_analysis = run_random_seed_analysis(
        max_training_samples=5000,  # Adjust based on your compute resources
        seeds=[42, 123, 456, 789, 999],  # 5 different seeds
        reddit_data_path="annotated_reddit_posts.csv",
    )

    # Closing banner: three lines emitted through a single print call.
    print(
        "\n🎉 RANDOM SEED ANALYSIS COMPLETED!",
        "=" * 50,
        "Check the './random_seed_analysis_results/' directory for detailed results.",
        sep="\n",
    )