In [18]:
# Cell 1: Import Libraries and Setup
import torch
import torch.nn.functional as F
import pandas as pd
import numpy as np
import os
import json
import warnings
from typing import Dict, List, Tuple, Optional
from collections import Counter
import random

# ML Libraries
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel, AutoConfig
import joblib

# Silence every Python warning for the rest of the session
warnings.filterwarnings('ignore')

# Run on the GPU when PyTorch can see one, otherwise stay on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# Location of each saved model, keyed by the identifier used in the rest of the notebook
MODEL_PATHS = dict(
    roberta_sentiment='roberta_sentiment_model_optimized',
    roberta_emotion='roberta_emotion_model_optimized',
    deberta_multitask='deberta_optimized',
    bertweet_multitask='bertweet_model_ultra_light',
)

# CSV file with the annotated posts used for evaluation
DATA_PATH = 'annotated_reddit_posts.csv'

print("Libraries imported and paths configured")

Using device: cuda
Libraries imported and paths configured


In [27]:
import torch.nn as nn
from transformers import AutoModel, AutoConfig

# Multitask model architecture (must be defined before the predictor classes in the next cell, which load it)
class MultiTaskTransformer(nn.Module):
    """
    Multitask model for joint sentiment and emotion classification.

    Architecture:
    - Shared transformer encoder loaded through ``AutoModel`` (e.g. BERTweet, DeBERTa)
    - One shared and two task-specific multi-head attention layers applied to
      the encoder output, each followed by a residual connection, LayerNorm
      and dropout
    - One MLP classification head per task, fed with the concatenation of the
      shared and the task-specific first-token representations
    """

    # Values accepted for the ``task`` argument of ``forward`` (None selects both tasks)
    _VALID_TASKS = (None, "sentiment", "emotion")

    def __init__(
        self,
        model_name: str = "microsoft/deberta-base",
        sentiment_num_classes: int = 3,
        emotion_num_classes: int = 6,
        hidden_dropout_prob: float = 0.1,
        attention_dropout_prob: float = 0.1,
        classifier_dropout: float = 0.1,
        freeze_encoder: bool = False
    ):
        """
        Build the shared encoder, the attention layers and the two heads.

        Args:
            model_name: Identifier or path handed to ``AutoConfig`` / ``AutoModel``.
            sentiment_num_classes: Output size of the sentiment head.
            emotion_num_classes: Output size of the emotion head.
            hidden_dropout_prob: Written to ``config.hidden_dropout_prob`` of the encoder.
            attention_dropout_prob: Written to ``config.attention_probs_dropout_prob``
                of the encoder and used as dropout of the extra attention layers.
            classifier_dropout: Dropout applied after each attention block and
                inside both classification heads.
            freeze_encoder: When True, encoder parameters get ``requires_grad = False``.
        """
        super(MultiTaskTransformer, self).__init__()

        self.model_name = model_name
        self.sentiment_num_classes = sentiment_num_classes
        self.emotion_num_classes = emotion_num_classes

        # Load configuration and adjust dropout
        config = AutoConfig.from_pretrained(model_name)
        config.hidden_dropout_prob = hidden_dropout_prob
        config.attention_probs_dropout_prob = attention_dropout_prob

        # Shared transformer encoder
        self.shared_encoder = AutoModel.from_pretrained(
            model_name,
            config=config,
            ignore_mismatched_sizes=True
        )

        # Freeze encoder if specified
        if freeze_encoder:
            for param in self.shared_encoder.parameters():
                param.requires_grad = False

        hidden_size = self.shared_encoder.config.hidden_size

        # Task-specific attention layers.
        # NOTE: attribute names double as state_dict keys, so they must stay
        # stable for previously saved checkpoints to keep loading.
        self.sentiment_attention = nn.MultiheadAttention(
            embed_dim=hidden_size,
            num_heads=8,
            dropout=attention_dropout_prob,
            batch_first=True
        )

        self.emotion_attention = nn.MultiheadAttention(
            embed_dim=hidden_size,
            num_heads=8,
            dropout=attention_dropout_prob,
            batch_first=True
        )

        # Shared attention for common features
        self.shared_attention = nn.MultiheadAttention(
            embed_dim=hidden_size,
            num_heads=8,
            dropout=attention_dropout_prob,
            batch_first=True
        )

        # Layer normalization
        self.sentiment_norm = nn.LayerNorm(hidden_size)
        self.emotion_norm = nn.LayerNorm(hidden_size)
        self.shared_norm = nn.LayerNorm(hidden_size)

        # Dropout layers
        self.sentiment_dropout = nn.Dropout(classifier_dropout)
        self.emotion_dropout = nn.Dropout(classifier_dropout)
        self.shared_dropout = nn.Dropout(classifier_dropout)

        # Classification heads
        self.sentiment_classifier = nn.Sequential(
            nn.Linear(hidden_size * 2, hidden_size),  # *2 for shared + task-specific
            nn.ReLU(),
            nn.Dropout(classifier_dropout),
            nn.Linear(hidden_size, sentiment_num_classes)
        )

        self.emotion_classifier = nn.Sequential(
            nn.Linear(hidden_size * 2, hidden_size),  # *2 for shared + task-specific
            nn.ReLU(),
            nn.Dropout(classifier_dropout),
            nn.Linear(hidden_size, emotion_num_classes)
        )

        # Initialize weights
        self._init_weights()

    def _init_weights(self):
        """Xavier-initialize the Linear weights of both heads and zero their biases."""
        for module in [self.sentiment_classifier, self.emotion_classifier]:
            for layer in module:
                if isinstance(layer, nn.Linear):
                    nn.init.xavier_uniform_(layer.weight)
                    nn.init.zeros_(layer.bias)

    def _attend_and_pool(
        self,
        attention: nn.MultiheadAttention,
        norm: nn.LayerNorm,
        dropout: nn.Dropout,
        sequence_output: torch.Tensor,
        key_padding_mask: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Run one self-attention block and return its first-token vector.

        The block is self-attention over ``sequence_output``, a residual
        connection, LayerNorm and dropout, in that order. Dropout is applied
        to the whole sequence before pooling, exactly as the three branches
        of ``forward`` did before they were factored out.

        Returns:
            ``(pooled, attention_weights)`` where ``pooled`` is position 0 of
            the processed sequence.
        """
        attended, weights = attention(
            sequence_output, sequence_output, sequence_output,
            key_padding_mask=key_padding_mask
        )
        attended = dropout(norm(attended + sequence_output))
        return attended[:, 0, :], weights

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
        task: Optional[str] = None
    ) -> Dict[str, torch.Tensor]:
        """
        Forward pass

        Args:
            input_ids: Token IDs [batch_size, seq_len]
            attention_mask: Attention mask [batch_size, seq_len]
            task: Optional task specification ("sentiment", "emotion", or None for both)

        Returns:
            Dictionary containing, for each requested task, ``<task>_logits``
            and ``<task>_attention_weights``.

        Raises:
            ValueError: If ``task`` is not "sentiment", "emotion" or None.
        """
        # Fail fast: an unknown task used to yield an empty dict, which only
        # surfaced later as a KeyError in the caller.
        if task not in self._VALID_TASKS:
            raise ValueError(
                f"Unknown task {task!r}; expected 'sentiment', 'emotion' or None"
            )

        # Shared encoder
        encoder_outputs = self.shared_encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True
        )

        # Get sequence output [batch_size, seq_len, hidden_size]
        sequence_output = encoder_outputs.last_hidden_state

        # nn.MultiheadAttention expects True at positions to ignore, i.e. the
        # inverse of the tokenizer mask. Computed once and reused by every branch.
        key_padding_mask = ~attention_mask.bool()

        # Shared attention captures features common to both tasks; its
        # first-token vector is concatenated into every task head.
        shared_pooled, _ = self._attend_and_pool(
            self.shared_attention, self.shared_norm, self.shared_dropout,
            sequence_output, key_padding_mask
        )

        outputs = {}

        # Sentiment branch
        if task is None or task == "sentiment":
            sentiment_pooled, sentiment_weights = self._attend_and_pool(
                self.sentiment_attention, self.sentiment_norm, self.sentiment_dropout,
                sequence_output, key_padding_mask
            )

            # Combine shared and task-specific features
            sentiment_features = torch.cat([shared_pooled, sentiment_pooled], dim=-1)

            outputs["sentiment_logits"] = self.sentiment_classifier(sentiment_features)
            outputs["sentiment_attention_weights"] = sentiment_weights

        # Emotion branch
        if task is None or task == "emotion":
            emotion_pooled, emotion_weights = self._attend_and_pool(
                self.emotion_attention, self.emotion_norm, self.emotion_dropout,
                sequence_output, key_padding_mask
            )

            # Combine shared and task-specific features
            emotion_features = torch.cat([shared_pooled, emotion_pooled], dim=-1)

            outputs["emotion_logits"] = self.emotion_classifier(emotion_features)
            outputs["emotion_attention_weights"] = emotion_weights

        return outputs

    # Save / load helpers that mirror the Hugging Face directory layout
    def save_pretrained(self, save_directory: str):
        """
        Write ``pytorch_model.bin`` and ``config.json`` into ``save_directory``.

        The config only records the encoder name, both class counts and a
        ``model_type`` tag; dropout settings are not persisted.
        """
        os.makedirs(save_directory, exist_ok=True)

        # Save model state dict
        model_path = os.path.join(save_directory, "pytorch_model.bin")
        torch.save(self.state_dict(), model_path)

        # Save config
        config = {
            "model_name": self.model_name,
            "sentiment_num_classes": self.sentiment_num_classes,
            "emotion_num_classes": self.emotion_num_classes,
            "model_type": "MultiTaskTransformer"
        }
        config_path = os.path.join(save_directory, "config.json")
        with open(config_path, 'w') as f:
            json.dump(config, f, indent=2)

        print(f"Model saved to {save_directory}")

    @classmethod
    def from_pretrained(cls, model_path: str, **kwargs):
        """
        Rebuild a model from a directory written by ``save_pretrained``.

        Args:
            model_path: Directory containing ``config.json`` and ``pytorch_model.bin``.
            **kwargs: Extra constructor arguments (e.g. dropout values) forwarded
                to ``__init__``; they must not repeat the three keys read from
                the config.

        Returns:
            The model with the saved weights loaded. Weights are mapped to the
            CPU; moving the model to a device is left to the caller.
        """
        # Load config
        config_path = os.path.join(model_path, "config.json")
        with open(config_path, 'r') as f:
            config = json.load(f)

        # Create model instance
        model = cls(
            model_name=config["model_name"],
            sentiment_num_classes=config["sentiment_num_classes"],
            emotion_num_classes=config["emotion_num_classes"],
            **kwargs
        )

        # Load state dict
        # NOTE(review): torch.load unpickles the file; only load checkpoints
        # from trusted locations (consider weights_only=True where the
        # installed torch version supports it).
        model_file = os.path.join(model_path, "pytorch_model.bin")
        state_dict = torch.load(model_file, map_location='cpu')
        model.load_state_dict(state_dict)

        print(f"Model loaded from {model_path}")
        return model

# Encoder checkpoints that can back the multitask model, keyed by a short alias
MODEL_CONFIGS = {
    alias: {"name": checkpoint, "description": summary}
    for alias, checkpoint, summary in (
        ("bertweet", "vinai/bertweet-base", "BERTweet optimized for social media text"),
        ("deberta", "microsoft/deberta-base", "DeBERTa with enhanced attention mechanism"),
    )
}

print("✅ Multitask model architecture defined!")
print("Available models:", list(MODEL_CONFIGS))

✅ Multitask model architecture defined!
Available models: ['bertweet', 'deberta']


In [28]:
# Cell 2: Define Model Loading Functions
class RoBERTaSingleTaskPredictor:
    """
    Inference wrapper around a saved single-task sequence classifier.

    Loads the tokenizer and model from ``model_path``, puts the model in eval
    mode on the notebook-level ``device`` and maps predicted class ids to
    labels through a label encoder.
    """

    def __init__(self, model_path: str, task_type: str, max_length: int = 512):
        """
        Args:
            model_path: Directory holding the tokenizer, the model and,
                optionally, ``<task_type>_encoder.pkl``.
            task_type: 'sentiment' or 'emotion'; selects the encoder file name
                and, when that file is absent, the default class list.
            max_length: Length every input is truncated/padded to.
        """
        self.device = device
        self.task_type = task_type  # 'sentiment' or 'emotion'
        self.max_length = max_length

        print(f"📥 Loading RoBERTa {task_type} model from {model_path}")

        # Load tokenizer and model
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_path)
        self.model.to(self.device)
        self.model.eval()

        # Load label encoder if available
        encoder_path = os.path.join(model_path, f'{task_type}_encoder.pkl')
        if os.path.exists(encoder_path):
            # NOTE(review): joblib.load unpickles the file; only use encoder
            # files from trusted model directories.
            self.label_encoder = joblib.load(encoder_path)
        else:
            # Create default encoder.
            # NOTE(review): assumes the model was trained with these classes in
            # this order - confirm against the training run.
            self.label_encoder = LabelEncoder()
            if task_type == 'sentiment':
                self.label_encoder.classes_ = np.array(['Negative', 'Neutral', 'Positive'])
            else:  # emotion
                self.label_encoder.classes_ = np.array(['Anger', 'Fear', 'Joy', 'No Emotion', 'Sadness', 'Surprise'])

        print(f"✅ RoBERTa {task_type} model loaded successfully")
        print(f"   Classes: {list(self.label_encoder.classes_)}")

    def predict_batch(self, texts: List[str], batch_size: int = 16) -> List[Dict]:
        """
        Classify ``texts`` in mini-batches.

        Args:
            texts: Input strings; an empty list yields an empty result.
            batch_size: Number of texts tokenized and scored per forward pass.

        Returns:
            One dict per input, in input order, with keys ``text``,
            ``predicted_label``, ``confidence`` and ``class_id``.
            ``confidence`` is the softmax probability of the class that is
            actually returned.
        """
        results = []
        classes = self.label_encoder.classes_
        num_classes = len(classes)

        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start:start + batch_size]

            # Tokenize
            inputs = self.tokenizer(
                batch_texts,
                return_tensors="pt",
                truncation=True,
                padding="max_length",
                max_length=self.max_length
            )

            # Move to device
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            with torch.no_grad():
                logits = self.model(**inputs).logits
                # One host transfer per batch instead of one .item() per value
                batch_probs = F.softmax(logits, dim=-1).tolist()
                batch_ids = torch.argmax(logits, dim=-1).tolist()

            for text, pred_id, probs in zip(batch_texts, batch_ids, batch_probs):
                # The model may have more output units than the encoder has
                # classes; such predictions fall back to class 0.
                if pred_id >= num_classes:
                    pred_id = 0

                results.append({
                    'text': text,
                    'predicted_label': classes[pred_id],
                    # Read after the fallback so the confidence always belongs
                    # to the reported class (same rule as MultiTaskPredictor).
                    'confidence': probs[pred_id],
                    'class_id': pred_id
                })

        return results

class MultiTaskPredictor:
    """
    Inference wrapper around a saved ``MultiTaskTransformer``.

    Locates the checkpoint directory, loads tokenizer, model and both label
    encoders, and predicts sentiment and emotion for batches of texts.
    """

    def __init__(self, model_path: str, model_name: str, max_length: int = 128):
        """
        Args:
            model_path: Base directory of the saved model; see
                ``_find_model_directory`` for the sub-directories searched.
            model_name: Display name; also used to pick the fallback tokenizer
                (DeBERTa when the name contains "deberta", BERTweet otherwise).
            max_length: Length every input is truncated/padded to.
        """
        self.device = device
        self.model_name = model_name
        self.max_length = max_length

        print(f"📥 Loading {model_name} multitask model from {model_path}")

        # Find the actual model directory
        model_dir = self._find_model_directory(model_path)

        # Load tokenizer
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(model_dir)
        except Exception:
            # Checkpoint directory has no usable tokenizer files: fall back to
            # the tokenizer of the original base model. Only Exception is
            # caught so Ctrl-C / SystemExit still propagate.
            original_name = "microsoft/deberta-base" if "deberta" in model_name.lower() else "vinai/bertweet-base"
            self.tokenizer = AutoTokenizer.from_pretrained(original_name)

        # Load multitask model
        self.model = self._load_multitask_model(model_dir)
        self.model.to(self.device)
        self.model.eval()

        # Load label encoders
        self.sentiment_encoder, self.emotion_encoder = self._load_encoders(model_dir)

        print(f"✅ {model_name} multitask model loaded successfully")
        print(f"   Sentiment classes: {list(self.sentiment_encoder.classes_)}")
        print(f"   Emotion classes: {list(self.emotion_encoder.classes_)}")

    def _find_model_directory(self, base_path: str) -> str:
        """
        Return the first candidate directory that contains ``pytorch_model.bin``.

        Candidates are tried in order: ``base_path`` itself, then its
        ``best_model``, ``final_model``, ``checkpoint-epoch-1`` and
        ``checkpoint-epoch-2`` sub-directories.

        Raises:
            FileNotFoundError: If no candidate holds a weights file.
        """
        candidate_names = ('', 'best_model', 'final_model', 'checkpoint-epoch-1', 'checkpoint-epoch-2')

        for name in candidate_names:
            dir_path = os.path.join(base_path, name) if name else base_path
            if os.path.isfile(os.path.join(dir_path, 'pytorch_model.bin')):
                return dir_path

        raise FileNotFoundError(f"No valid model found in {base_path}")

    def _load_multitask_model(self, model_dir: str):
        """
        Load the multitask model from ``model_dir``.

        ``MultiTaskTransformer.from_pretrained`` is tried first. If it fails,
        the model is rebuilt manually, substituting defaults for config keys
        that are missing from ``config.json``.
        """
        try:
            # Try loading with from_pretrained (if it was saved properly)
            return MultiTaskTransformer.from_pretrained(model_dir)
        except Exception as exc:
            # Not a bare except: KeyboardInterrupt / SystemExit must not be
            # swallowed and silently turned into a second loading attempt.
            print(f"from_pretrained failed ({exc}); falling back to manual loading")

        # Manual loading
        with open(os.path.join(model_dir, 'config.json'), 'r') as f:
            config = json.load(f)

        model = MultiTaskTransformer(
            model_name=config.get("model_name", "microsoft/deberta-base"),
            sentiment_num_classes=config.get("sentiment_num_classes", 3),
            emotion_num_classes=config.get("emotion_num_classes", 6)
        )

        # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
        state_dict = torch.load(os.path.join(model_dir, 'pytorch_model.bin'), map_location='cpu')
        model.load_state_dict(state_dict)
        return model

    def _load_encoders(self, model_dir: str):
        """
        Load the sentiment and emotion label encoders.

        Each encoder is read from ``<task>_encoder.pkl`` when that file exists
        and otherwise replaced by a default encoder. The two files are handled
        independently, so a present encoder is never discarded because the
        other one is missing.

        Returns:
            ``(sentiment_encoder, emotion_encoder)``
        """
        def load_one(task: str, default_classes: List[str]):
            encoder_path = os.path.join(model_dir, f'{task}_encoder.pkl')
            if os.path.exists(encoder_path):
                # NOTE(review): joblib.load unpickles the file; trusted sources only.
                return joblib.load(encoder_path)
            # NOTE(review): default order is assumed to match training - confirm.
            encoder = LabelEncoder()
            encoder.classes_ = np.array(default_classes)
            return encoder

        sentiment_encoder = load_one('sentiment', ['Negative', 'Neutral', 'Positive'])
        emotion_encoder = load_one('emotion', ['Anger', 'Fear', 'Joy', 'No Emotion', 'Sadness', 'Surprise'])

        return sentiment_encoder, emotion_encoder

    @staticmethod
    def _decode_logits(logits: torch.Tensor, classes) -> List[Dict]:
        """
        Turn a batch of logits into ``{'label', 'confidence', 'class_id'}`` dicts.

        A predicted id outside ``classes`` falls back to class 0; the
        confidence is the softmax probability of the class that is reported.
        """
        batch_probs = F.softmax(logits, dim=-1).tolist()
        batch_ids = torch.argmax(logits, dim=-1).tolist()

        decoded = []
        for class_id, probs in zip(batch_ids, batch_probs):
            if class_id >= len(classes):
                class_id = 0
            decoded.append({
                'label': classes[class_id],
                'confidence': probs[class_id],
                'class_id': class_id
            })
        return decoded

    def predict_batch(self, texts: List[str], batch_size: int = 8) -> List[Dict]:
        """
        Predict both sentiment and emotion for a batch of texts.

        Returns:
            One dict per input, in input order, with keys ``text``,
            ``sentiment`` and ``emotion``; the latter two each hold ``label``,
            ``confidence`` and ``class_id``. An empty input yields ``[]``.
        """
        results = []

        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start:start + batch_size]

            # Tokenize
            inputs = self.tokenizer(
                batch_texts,
                return_tensors="pt",
                truncation=True,
                padding="max_length",
                max_length=self.max_length
            )

            # Only pass required inputs (the model's forward accepts no other tensors)
            model_inputs = {
                'input_ids': inputs['input_ids'].to(self.device),
                'attention_mask': inputs['attention_mask'].to(self.device)
            }

            with torch.no_grad():
                outputs = self.model(**model_inputs)
                sentiments = self._decode_logits(
                    outputs['sentiment_logits'], self.sentiment_encoder.classes_
                )
                emotions = self._decode_logits(
                    outputs['emotion_logits'], self.emotion_encoder.classes_
                )

            for text, sentiment, emotion in zip(batch_texts, sentiments, emotions):
                results.append({
                    'text': text,
                    'sentiment': sentiment,
                    'emotion': emotion
                })

        return results

# Status line confirming that both predictor classes of this cell were defined
print("Model predictor classes defined")

Model predictor classes defined


In [29]:
# Cell 3: Evaluation Functions
def calculate_metrics(true_labels: List[str], pred_labels: List[str], task_name: str) -> Dict:
    """
    Score one task's predictions against the gold labels.

    Returns:
        Dict with the task name, accuracy, macro-averaged F1 and the textual
        classification report. Undefined per-class scores count as 0
        (``zero_division=0``).
    """
    metrics = {'task': task_name}
    metrics['accuracy'] = accuracy_score(true_labels, pred_labels)
    metrics['macro_f1'] = f1_score(
        true_labels, pred_labels, average='macro', zero_division=0
    )
    metrics['classification_report'] = classification_report(
        true_labels, pred_labels, zero_division=0
    )
    return metrics

def evaluate_single_task_model(predictor, texts: List[str], true_labels: List[str],
                              task_name: str, model_name: str) -> Dict:
    """
    Run a single-task predictor over ``texts`` and score it.

    Returns:
        The dict produced by ``calculate_metrics`` extended with
        ``model_name``, ``model_type`` ('single_task') and the raw
        ``predictions`` returned by the predictor.
    """
    print(f"🔮 Evaluating {model_name} for {task_name}...")

    # Raw per-text prediction dicts, kept in the result for later inspection
    predictions = predictor.predict_batch(texts)

    metrics = calculate_metrics(
        true_labels,
        [item['predicted_label'] for item in predictions],
        task_name,
    )

    # Attach model info in the same key order as before
    metrics.update(
        model_name=model_name,
        model_type='single_task',
        predictions=predictions,
    )
    return metrics

def evaluate_multitask_model(predictor, texts: List[str], true_sentiments: List[str],
                            true_emotions: List[str], model_name: str) -> Dict:
    """
    Run a multitask predictor over ``texts`` and score both tasks.

    Returns:
        Dict with ``model_name``, ``model_type`` ('multitask'), the per-task
        metric dicts under ``sentiment`` and ``emotion``, the unweighted mean
        of both accuracies and of both macro-F1 scores, and the raw
        ``predictions``.
    """
    print(f"🔮 Evaluating {model_name} for both tasks...")

    predictions = predictor.predict_batch(texts)

    # Score each task from the label stored under its key in every prediction
    per_task = {}
    for task, gold_labels in (('sentiment', true_sentiments), ('emotion', true_emotions)):
        predicted_labels = [item[task]['label'] for item in predictions]
        per_task[task] = calculate_metrics(gold_labels, predicted_labels, task)

    sentiment_metrics = per_task['sentiment']
    emotion_metrics = per_task['emotion']

    summary = {'model_name': model_name, 'model_type': 'multitask'}
    summary['sentiment'] = sentiment_metrics
    summary['emotion'] = emotion_metrics
    summary['combined_accuracy'] = (sentiment_metrics['accuracy'] + emotion_metrics['accuracy']) / 2
    summary['combined_macro_f1'] = (sentiment_metrics['macro_f1'] + emotion_metrics['macro_f1']) / 2
    summary['predictions'] = predictions
    return summary

def set_random_seed(seed: int):
    """Seed the Python, NumPy and PyTorch (CPU and, when available, CUDA) RNGs."""
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    if not torch.cuda.is_available():
        return

    # Seed the current CUDA device, then every visible device
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

# Status line confirming that the evaluation helpers of this cell were defined
print("Evaluation functions defined")

Evaluation functions defined


In [30]:
# Cell 4: Load Data
def load_and_prepare_data(data_path: str) -> Tuple[List[str], List[str], List[str]]:
    """
    Read the annotated posts and return texts, sentiments and emotions.

    Rows with a missing value in any of the three required columns are
    dropped (and reported), because a missing label breaks the class summary
    and a missing text cannot be tokenized downstream.

    Args:
        data_path: CSV file with the columns ``text_content``, ``sentiment``
            and ``emotion``.

    Returns:
        ``(texts, sentiments, emotions)`` as three aligned lists.

    Raises:
        KeyError: If one of the required columns is absent.
    """
    print(f"Loading data from {data_path}")

    df = pd.read_csv(data_path)

    required_columns = ['text_content', 'sentiment', 'emotion']
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise KeyError(f"{data_path} is missing required column(s): {missing_columns}")

    # Keep only fully annotated rows so the three returned lists stay aligned
    complete = df.dropna(subset=required_columns)
    dropped = len(df) - len(complete)
    if dropped:
        print(f"⚠️ Dropped {dropped} rows with missing text or labels")

    # Force str so purely numeric posts are still valid tokenizer input
    texts = complete['text_content'].astype(str).tolist()
    sentiments = complete['sentiment'].tolist()
    emotions = complete['emotion'].tolist()

    print(f"✅ Data loaded: {len(texts)} samples")
    print(f"   Sentiment classes: {sorted(set(sentiments))}")
    print(f"   Emotion classes: {sorted(set(emotions))}")

    return texts, sentiments, emotions

# Load the annotated posts once; the three lists are reused by the evaluation cells below
texts, true_sentiments, true_emotions = load_and_prepare_data(DATA_PATH)

Loading data from annotated_reddit_posts.csv
✅ Data loaded: 95 samples
   Sentiment classes: ['Negative', 'Neutral', 'Positive']
   Emotion classes: ['Anger', 'Fear', 'Joy', 'No Emotion', 'Sadness', 'Surprise']


In [31]:
# Cell 5: Load All Models
def load_all_models() -> Dict:
    """
    Load all four predictors, tolerating individual failures.

    Returns:
        Dict keyed like ``MODEL_PATHS``; a value is the loaded predictor, or
        ``None`` when loading that model raised an exception (the error is
        printed, not re-raised).
    """
    # (result key, banner, name used in the failure message, deferred constructor).
    # Constructors are lambdas so every lookup and load happens inside the
    # try block of the loop below.
    loaders = [
        ('roberta_sentiment', "\n1️⃣ Loading RoBERTa Sentiment Model", "RoBERTa sentiment",
         lambda: RoBERTaSingleTaskPredictor(MODEL_PATHS['roberta_sentiment'], 'sentiment')),
        ('roberta_emotion', "\n2️⃣ Loading RoBERTa Emotion Model", "RoBERTa emotion",
         lambda: RoBERTaSingleTaskPredictor(MODEL_PATHS['roberta_emotion'], 'emotion')),
        ('deberta_multitask', "\n3️⃣ Loading DeBERTa Multitask Model", "DeBERTa multitask",
         lambda: MultiTaskPredictor(MODEL_PATHS['deberta_multitask'], 'DeBERTa')),
        ('bertweet_multitask', "\n4️⃣ Loading BERTweet Multitask Model", "BERTweet multitask",
         lambda: MultiTaskPredictor(MODEL_PATHS['bertweet_multitask'], 'BERTweet')),
    ]

    print("🚀 Loading All Models")
    print("=" * 50)

    models = {}
    for key, banner, failure_name, build in loaders:
        try:
            print(banner)
            models[key] = build()
        except Exception as e:
            print(f"❌ Failed to load {failure_name} model: {e}")
            models[key] = None

    # Summary
    loaded_models, failed_models = [], []
    for key, predictor in models.items():
        (failed_models if predictor is None else loaded_models).append(key)

    print(f"\n📊 Model Loading Summary:")
    print(f"   ✅ Loaded: {loaded_models}")
    if failed_models:
        print(f"   ❌ Failed: {failed_models}")

    return models

# Load all four predictors up front; an entry is None when its model failed to load
models = load_all_models()

🚀 Loading All Models

1️⃣ Loading RoBERTa Sentiment Model
📥 Loading RoBERTa sentiment model from roberta_sentiment_model_optimized
✅ RoBERTa sentiment model loaded successfully
   Classes: [np.str_('Negative'), np.str_('Neutral'), np.str_('Positive')]

2️⃣ Loading RoBERTa Emotion Model
📥 Loading RoBERTa emotion model from roberta_emotion_model_optimized
✅ RoBERTa emotion model loaded successfully
   Classes: [np.str_('Anger'), np.str_('Fear'), np.str_('Joy'), np.str_('No Emotion'), np.str_('Sadness'), np.str_('Surprise')]

3️⃣ Loading DeBERTa Multitask Model
📥 Loading DeBERTa multitask model from deberta_optimized
Model loaded from deberta_optimized
✅ DeBERTa multitask model loaded successfully
   Sentiment classes: [np.str_('Negative'), np.str_('Neutral'), np.str_('Positive')]
   Emotion classes: [np.str_('Anger'), np.str_('Fear'), np.str_('Joy'), np.str_('No Emotion'), np.str_('Sadness'), np.str_('Surprise')]

4️⃣ Loading BERTweet Multitask Model
📥 Loading BERTweet multitask model fr

In [32]:
# Cell 6: Single Seed Evaluation
def run_single_evaluation(models: Dict, texts: List[str], true_sentiments: List[str],
                         true_emotions: List[str], seed: int = 42) -> Dict:
    """
    Evaluate every loaded model once after seeding the RNGs with ``seed``.

    Models whose entry in ``models`` is ``None`` are skipped.

    Returns:
        ``{'seed': seed, 'models': {...}}`` where the inner dict maps each
        evaluated model key to its metrics.
    """
    set_random_seed(seed)

    print(f"\n🎯 Running Evaluation (Seed: {seed})")
    print("=" * 50)

    # Evaluation plan in reporting order: (model key, scorer bound to the right labels)
    plan = (
        ('roberta_sentiment', lambda predictor: evaluate_single_task_model(
            predictor, texts, true_sentiments, 'sentiment', 'RoBERTa-Sentiment')),
        ('roberta_emotion', lambda predictor: evaluate_single_task_model(
            predictor, texts, true_emotions, 'emotion', 'RoBERTa-Emotion')),
        ('deberta_multitask', lambda predictor: evaluate_multitask_model(
            predictor, texts, true_sentiments, true_emotions, 'DeBERTa-Multitask')),
        ('bertweet_multitask', lambda predictor: evaluate_multitask_model(
            predictor, texts, true_sentiments, true_emotions, 'BERTweet-Multitask')),
    )

    evaluated = {}
    for key, score in plan:
        predictor = models[key]
        if predictor is None:
            continue
        evaluated[key] = score(predictor)

    return {'seed': seed, 'models': evaluated}

# Evaluate every loaded model once with seed 42; the result is printed by the next cell
single_results = run_single_evaluation(models, texts, true_sentiments, true_emotions, seed=42)


🎯 Running Evaluation (Seed: 42)
🔮 Evaluating RoBERTa-Sentiment for sentiment...
🔮 Evaluating RoBERTa-Emotion for emotion...
🔮 Evaluating DeBERTa-Multitask for both tasks...
🔮 Evaluating BERTweet-Multitask for both tasks...


In [25]:
# Cell 7: Print Single Seed Results
def print_single_results(results: Dict):
    """
    Pretty-print the metrics of one evaluation run.

    Single-task entries show accuracy and macro F1 for their task; multitask
    entries show both tasks followed by the combined averages. Entries that
    are ``None`` are skipped.
    """
    def show_scores(scores: Dict):
        # Accuracy is shown both as a fraction and as a percentage
        accuracy = scores['accuracy']
        print(f"   Accuracy:    {accuracy:.4f} ({accuracy*100:.2f}%)")
        print(f"   Macro F1:    {scores['macro_f1']:.4f}")

    print("\n📈 SINGLE SEED EVALUATION RESULTS")
    print("=" * 60)

    for entry in results['models'].values():
        if entry is None:
            continue

        print(f"\n🤖 {entry['model_name']}")
        print("-" * 40)

        if entry['model_type'] == 'single_task':
            task = entry['task']
            print(f"📊 {task.upper()} CLASSIFICATION:")
            show_scores(entry)
            continue

        # Multitask entry: one section per task, then the combined averages
        print("📊 SENTIMENT CLASSIFICATION:")
        show_scores(entry['sentiment'])

        print("\n😊 EMOTION CLASSIFICATION:")
        show_scores(entry['emotion'])

        print("\n🏆 COMBINED PERFORMANCE:")
        print(f"   Avg Accuracy: {entry['combined_accuracy']:.4f}")
        print(f"   Avg Macro F1: {entry['combined_macro_f1']:.4f}")

# Print the metrics collected by the single-seed evaluation cell
print_single_results(single_results)


📈 SINGLE SEED EVALUATION RESULTS

🤖 RoBERTa-Sentiment
----------------------------------------
📊 SENTIMENT CLASSIFICATION:
   Accuracy:    0.5474 (54.74%)
   Macro F1:    0.2358

🤖 RoBERTa-Emotion
----------------------------------------
📊 EMOTION CLASSIFICATION:
   Accuracy:    0.3579 (35.79%)
   Macro F1:    0.1613


In [33]:
# Cell 8: Random Seed Analysis
def run_random_seed_analysis(models: Dict, texts: List[str], true_sentiments: List[str],
                            true_emotions: List[str], seeds: Optional[List[int]] = None) -> Dict:
    """
    Run the full evaluation once per seed and summarize the spread.

    Args:
        models: Predictors keyed by model name (``None`` entries are skipped
            by ``run_single_evaluation``).
        texts: Posts to classify.
        true_sentiments: Gold sentiment labels aligned with ``texts``.
        true_emotions: Gold emotion labels aligned with ``texts``.
        seeds: Seeds to evaluate with; defaults to ``[42, 123, 456, 789, 999]``.
            ``None`` is used as the default instead of a list literal so the
            default list is never shared between calls.

    Returns:
        Whatever ``analyze_seed_stability`` returns for the per-seed results.

    NOTE(review): the predictors defined in this notebook run in eval mode
    under ``torch.no_grad()``, so the seed presumably cannot change their
    predictions and the reported spread is expected to be zero - confirm
    whether this analysis is meant to cover inference only.
    """
    if seeds is None:
        seeds = [42, 123, 456, 789, 999]

    print(f"\n🎲 Random Seed Analysis")
    print("=" * 50)
    print(f"Testing seeds: {seeds}")

    all_results = []

    for seed in seeds:
        print(f"\n🔄 Running evaluation with seed {seed}...")
        seed_results = run_single_evaluation(models, texts, true_sentiments, true_emotions, seed)
        all_results.append(seed_results)

    return analyze_seed_stability(all_results)

def analyze_seed_stability(all_results: List[Dict]) -> Dict:
    """Summarise how each model's metrics vary across the per-seed evaluation runs.

    Args:
        all_results: One evaluation result per seed. Every entry must provide
            ``'seed'`` and ``'models'``; ``'models'`` maps a model name to either
            ``None`` (that run is ignored for the model) or a metrics dict.
            A metrics dict whose ``'model_type'`` is ``'single_task'`` carries
            ``'task'``, ``'accuracy'`` and ``'macro_f1'`` (any task other than
            ``'sentiment'`` is counted as emotion). Any other ``'model_type'``
            is treated as multitask and must carry ``'sentiment'`` and
            ``'emotion'`` sub-dicts, each with ``'accuracy'`` and ``'macro_f1'``.

    Returns:
        A dict with:
          * ``'all_results'``: the input list, unchanged;
          * ``'stability_analysis'``: model name -> task (``'sentiment'`` /
            ``'emotion'``) -> ``accuracy_mean``, ``accuracy_std``, ``f1_mean``,
            ``f1_std``. A task key is absent when no run reported that task.
            Models appear in the order they were first seen in ``all_results``;
          * ``'seeds_tested'``: the ``'seed'`` of every entry, in input order.
    """
    print(f"\n📊 Analyzing Stability Across Seeds")
    print("-" * 40)

    def _summarise(accs: List[float], f1s: List[float]) -> Dict:
        # np.std is used with its default ddof=0 (population standard deviation).
        return {
            'accuracy_mean': np.mean(accs),
            'accuracy_std': np.std(accs),
            'f1_mean': np.mean(f1s),
            'f1_std': np.std(f1s)
        }

    # Collect model names in first-seen order. A set was used here before, but
    # set iteration order for strings changes between interpreter runs (hash
    # randomisation), which shuffled the printed report and the result keys.
    model_names = list(dict.fromkeys(
        name for result in all_results for name in result['models']
    ))

    stability_analysis = {}

    for model_name in model_names:
        print(f"\n🤖 {model_name.upper()}")

        # Collect metrics across seeds
        sentiment_accs, sentiment_f1s = [], []
        emotion_accs, emotion_f1s = [], []

        for result in all_results:
            model_result = result['models'].get(model_name)
            if model_result is None:
                # Model absent from this run, or explicitly recorded as None.
                continue

            if model_result['model_type'] == 'single_task':
                if model_result['task'] == 'sentiment':
                    sentiment_accs.append(model_result['accuracy'])
                    sentiment_f1s.append(model_result['macro_f1'])
                else:  # emotion
                    emotion_accs.append(model_result['accuracy'])
                    emotion_f1s.append(model_result['macro_f1'])
            else:  # multitask
                sentiment_accs.append(model_result['sentiment']['accuracy'])
                sentiment_f1s.append(model_result['sentiment']['macro_f1'])
                emotion_accs.append(model_result['emotion']['accuracy'])
                emotion_f1s.append(model_result['emotion']['macro_f1'])

        # Calculate statistics once and reuse them for both storage and printing
        model_stability = {}

        if sentiment_accs:
            stats = _summarise(sentiment_accs, sentiment_f1s)
            model_stability['sentiment'] = stats
            print(f"   📊 Sentiment - Accuracy: {stats['accuracy_mean']:.4f} ± {stats['accuracy_std']:.4f}")
            print(f"                Macro F1:  {stats['f1_mean']:.4f} ± {stats['f1_std']:.4f}")

        if emotion_accs:
            stats = _summarise(emotion_accs, emotion_f1s)
            model_stability['emotion'] = stats
            print(f"   😊 Emotion   - Accuracy: {stats['accuracy_mean']:.4f} ± {stats['accuracy_std']:.4f}")
            print(f"                Macro F1:  {stats['f1_mean']:.4f} ± {stats['f1_std']:.4f}")

        stability_analysis[model_name] = model_stability

    return {
        'all_results': all_results,
        'stability_analysis': stability_analysis,
        'seeds_tested': [r['seed'] for r in all_results]
    }

# Run the random seed analysis with the function's default seed list.
# NOTE(review): the summary table recorded further down in this notebook reports a
# standard deviation of 0.0000 for every model/task row, i.e. changing the seed did
# not change these metrics in the recorded run - confirm whether that is expected.
seed_analysis = run_random_seed_analysis(models, texts, true_sentiments, true_emotions)


🎲 Random Seed Analysis
Testing seeds: [42, 123, 456, 789, 999]

🔄 Running evaluation with seed 42...

🎯 Running Evaluation (Seed: 42)
🔮 Evaluating RoBERTa-Sentiment for sentiment...
🔮 Evaluating RoBERTa-Emotion for emotion...
🔮 Evaluating DeBERTa-Multitask for both tasks...
🔮 Evaluating BERTweet-Multitask for both tasks...

🔄 Running evaluation with seed 123...

🎯 Running Evaluation (Seed: 123)
🔮 Evaluating RoBERTa-Sentiment for sentiment...
🔮 Evaluating RoBERTa-Emotion for emotion...
🔮 Evaluating DeBERTa-Multitask for both tasks...
🔮 Evaluating BERTweet-Multitask for both tasks...

🔄 Running evaluation with seed 456...

🎯 Running Evaluation (Seed: 456)
🔮 Evaluating RoBERTa-Sentiment for sentiment...
🔮 Evaluating RoBERTa-Emotion for emotion...
🔮 Evaluating DeBERTa-Multitask for both tasks...
🔮 Evaluating BERTweet-Multitask for both tasks...

🔄 Running evaluation with seed 789...

🎯 Running Evaluation (Seed: 789)
🔮 Evaluating RoBERTa-Sentiment for sentiment...
🔮 Evaluating RoBERTa-Emot

In [35]:
# Cell 9: Model Comparison and Summary
def create_model_comparison_table(stability_analysis: Dict) -> pd.DataFrame:
    """Flatten per-model stability statistics into one table row per (model, task).

    Args:
        stability_analysis: Model name -> task key (``'sentiment'`` and/or
            ``'emotion'``) -> dict with ``accuracy_mean``, ``accuracy_std``,
            ``f1_mean`` and ``f1_std``.

    Returns:
        A DataFrame with columns ``Model``, ``Task``, ``Accuracy (Mean)``,
        ``Accuracy (Std)``, ``Macro F1 (Mean)`` and ``Macro F1 (Std)``. Metric
        cells are strings formatted to four decimals. For each model the
        sentiment row (if any) precedes the emotion row (if any).
    """
    # (key in the stats dict, label shown in the 'Task' column), in output order.
    task_labels = (('sentiment', 'Sentiment'), ('emotion', 'Emotion'))

    table_rows = [
        {
            # 'roberta_sentiment' -> 'Roberta Sentiment'
            'Model': name.replace('_', ' ').title(),
            'Task': label,
            'Accuracy (Mean)': f"{per_task[key]['accuracy_mean']:.4f}",
            'Accuracy (Std)': f"{per_task[key]['accuracy_std']:.4f}",
            'Macro F1 (Mean)': f"{per_task[key]['f1_mean']:.4f}",
            'Macro F1 (Std)': f"{per_task[key]['f1_std']:.4f}"
        }
        for name, per_task in stability_analysis.items()
        for key, label in task_labels
        if key in per_task
    ]

    return pd.DataFrame(table_rows)

def print_final_summary(seed_analysis: Dict) -> None:
    """Print the model comparison table, fixed notes, and the best model per task.

    Args:
        seed_analysis: Must contain ``'stability_analysis'`` (model name -> task ->
            stats including ``'f1_mean'``; also handed to
            ``create_model_comparison_table``) and ``'seeds_tested'`` (a sized
            collection of the seeds that were evaluated).

    The best model for a task is the one with the highest ``f1_mean``; on a tie
    the model encountered first wins. Nothing is printed for a task that no
    model reports.
    """
    print(f"\n🏆 FINAL MODEL COMPARISON SUMMARY")
    print("=" * 70)

    # Create comparison table
    stability = seed_analysis['stability_analysis']
    comparison_df = create_model_comparison_table(stability)
    print(comparison_df.to_string(index=False))

    print(f"\n📝 Key Insights:")
    print(f"   • Evaluated {len(seed_analysis['seeds_tested'])} random seeds: {seed_analysis['seeds_tested']}")
    print(f"   • Lower standard deviation indicates more stable performance")
    print(f"   • Multitask models provide both sentiment and emotion predictions")
    print(f"   • Single-task models are specialized for one task")

    # Find best performers. The running best starts at -inf rather than 0 so that
    # a task whose models all score exactly 0.0 macro F1 still gets a winner
    # (with a 0 start and a strict '>' comparison such a task was silently omitted).
    best = {}
    for task in ('sentiment', 'emotion'):
        best_name, best_f1 = None, float('-inf')
        for model_name, stats in stability.items():
            if task in stats and stats[task]['f1_mean'] > best_f1:
                best_name, best_f1 = model_name, stats[task]['f1_mean']
        best[task] = (best_name, best_f1)

    print(f"\n🥇 Best Performers (by Macro F1):")
    sentiment_best, best_sent_f1 = best['sentiment']
    if sentiment_best is not None:
        print(f"   Sentiment: {sentiment_best.replace('_', ' ').title()} ({best_sent_f1:.4f})")
    emotion_best, best_emot_f1 = best['emotion']
    if emotion_best is not None:
        print(f"   Emotion:   {emotion_best.replace('_', ' ').title()} ({best_emot_f1:.4f})")

# Print the final summary for the seed analysis: comparison table, the fixed
# "Key Insights" notes, and the best model per task by mean macro F1.
print_final_summary(seed_analysis)


🏆 FINAL MODEL COMPARISON SUMMARY
             Model      Task Accuracy (Mean) Accuracy (Std) Macro F1 (Mean) Macro F1 (Std)
 Deberta Multitask Sentiment          0.5895         0.0000          0.3723         0.0000
 Deberta Multitask   Emotion          0.1579         0.0000          0.1352         0.0000
 Roberta Sentiment Sentiment          0.5474         0.0000          0.2358         0.0000
   Roberta Emotion   Emotion          0.3579         0.0000          0.1613         0.0000
Bertweet Multitask Sentiment          0.5474         0.0000          0.2358         0.0000
Bertweet Multitask   Emotion          0.1895         0.0000          0.0722         0.0000

📝 Key Insights:
   • Evaluated 5 random seeds: [42, 123, 456, 789, 999]
   • Lower standard deviation indicates more stable performance
   • Multitask models provide both sentiment and emotion predictions
   • Single-task models are specialized for one task

🥇 Best Performers (by Macro F1):
   Sentiment: Deberta Multitask (0.3

In [36]:
# Cell 11: Bootstrap Sampling Stability Test (Most Practical)
def run_bootstrap_stability_analysis(models: Dict, texts: List[str], 
                                   true_sentiments: List[str], true_emotions: List[str],
                                   n_bootstrap: int = 10, sample_ratio: float = 0.8) -> Dict:
    """
    Test model stability by evaluating on resampled subsets of the data.

    For each of ``n_bootstrap`` rounds, ``int(len(texts) * sample_ratio)`` indices
    are drawn with replacement, the texts and both label lists are gathered at
    those indices, and ``run_single_evaluation`` is called on that sample with
    ``seed=42``. Round ``i`` draws its indices from a generator seeded with
    ``i * 42``, so the samples are reproducible.

    Args:
        models: Model collection forwarded unchanged to ``run_single_evaluation``.
        texts: Input texts; its length defines the population being resampled.
        true_sentiments: Gold sentiment labels, indexed in parallel with ``texts``.
        true_emotions: Gold emotion labels, indexed in parallel with ``texts``.
        n_bootstrap: Number of resampling rounds.
        sample_ratio: Sample size as a fraction of ``len(texts)``.

    Returns:
        Whatever ``analyze_bootstrap_stability`` returns for the per-round results.
        Each round's result is additionally tagged with ``'bootstrap_id'`` (the
        round index) and ``'sample_indices'`` (the drawn indices as a list).

    Raises:
        ValueError: If the sample would be empty (``int(len(texts) * sample_ratio) < 1``).
    """
    print(f"\n🔄 Bootstrap Stability Analysis")
    print("=" * 50)
    print(f"Running {n_bootstrap} bootstrap samples (each with {sample_ratio*100:.0f}% of data)")

    bootstrap_results = []
    data_size = len(texts)
    sample_size = int(data_size * sample_ratio)

    # Fail early with a clear message instead of evaluating models on zero examples.
    if sample_size < 1:
        raise ValueError(
            f"Bootstrap sample would be empty: {data_size} texts with "
            f"sample_ratio={sample_ratio} gives a sample size of {sample_size}"
        )

    for i in range(n_bootstrap):
        print(f"\n📊 Bootstrap Sample {i+1}/{n_bootstrap}")

        # Create bootstrap sample. A local RandomState seeded with i * 42 yields the
        # same draws as np.random.seed(i * 42) followed by np.random.choice, but does
        # not overwrite the global NumPy RNG state used by the rest of the notebook.
        rng = np.random.RandomState(i * 42)
        indices = rng.choice(data_size, sample_size, replace=True)

        bootstrap_texts = [texts[idx] for idx in indices]
        bootstrap_sentiments = [true_sentiments[idx] for idx in indices]
        bootstrap_emotions = [true_emotions[idx] for idx in indices]

        # Evaluate models on this bootstrap sample
        bootstrap_result = run_single_evaluation(
            models, bootstrap_texts, bootstrap_sentiments, bootstrap_emotions, seed=42
        )
        bootstrap_result['bootstrap_id'] = i
        bootstrap_result['sample_indices'] = indices.tolist()
        bootstrap_results.append(bootstrap_result)

    return analyze_bootstrap_stability(bootstrap_results)

def analyze_bootstrap_stability(bootstrap_results: List[Dict]) -> Dict:
    """Summarise how each model's metrics vary across bootstrap samples.

    Args:
        bootstrap_results: One evaluation result per bootstrap sample. Every entry
            must provide ``'models'``, mapping a model name to either ``None``
            (that sample is ignored for the model) or a metrics dict. A metrics
            dict whose ``'model_type'`` is ``'single_task'`` carries ``'task'``,
            ``'accuracy'`` and ``'macro_f1'`` (any task other than
            ``'sentiment'`` is counted as emotion). Any other ``'model_type'``
            is treated as multitask and must carry ``'sentiment'`` and
            ``'emotion'`` sub-dicts, each with ``'accuracy'`` and ``'macro_f1'``.

    Returns:
        A dict with:
          * ``'bootstrap_results'``: the input list, unchanged;
          * ``'stability_analysis'``: model name -> task (``'sentiment'`` /
            ``'emotion'``) -> ``accuracy_mean``, ``accuracy_std``, ``f1_mean``,
            ``f1_std``. A task key is absent when no sample reported that task.
            Models appear in the order they were first seen in the input;
          * ``'n_bootstrap'``: the number of entries in ``bootstrap_results``.
    """
    print(f"\n📊 Analyzing Bootstrap Stability")
    print("-" * 40)

    def _summarise(accs: List[float], f1s: List[float]) -> Dict:
        # np.std is used with its default ddof=0 (population standard deviation).
        return {
            'accuracy_mean': np.mean(accs),
            'accuracy_std': np.std(accs),
            'f1_mean': np.mean(f1s),
            'f1_std': np.std(f1s)
        }

    # Collect model names in first-seen order. A set was used here before, but
    # set iteration order for strings changes between interpreter runs (hash
    # randomisation), which shuffled the printed report and the result keys.
    model_names = list(dict.fromkeys(
        name for result in bootstrap_results for name in result['models']
    ))

    stability_analysis = {}

    for model_name in model_names:
        print(f"\n🤖 {model_name.upper()}")

        # Collect metrics across bootstrap samples
        sentiment_accs, sentiment_f1s = [], []
        emotion_accs, emotion_f1s = [], []

        for result in bootstrap_results:
            model_result = result['models'].get(model_name)
            if model_result is None:
                # Model absent from this sample, or explicitly recorded as None.
                continue

            if model_result['model_type'] == 'single_task':
                if model_result['task'] == 'sentiment':
                    sentiment_accs.append(model_result['accuracy'])
                    sentiment_f1s.append(model_result['macro_f1'])
                else:  # emotion
                    emotion_accs.append(model_result['accuracy'])
                    emotion_f1s.append(model_result['macro_f1'])
            else:  # multitask
                sentiment_accs.append(model_result['sentiment']['accuracy'])
                sentiment_f1s.append(model_result['sentiment']['macro_f1'])
                emotion_accs.append(model_result['emotion']['accuracy'])
                emotion_f1s.append(model_result['emotion']['macro_f1'])

        # Calculate statistics once and reuse them for both storage and printing
        model_stability = {}

        if sentiment_accs:
            stats = _summarise(sentiment_accs, sentiment_f1s)
            model_stability['sentiment'] = stats
            print(f"   📊 Sentiment - Accuracy: {stats['accuracy_mean']:.4f} ± {stats['accuracy_std']:.4f}")
            print(f"                Macro F1:  {stats['f1_mean']:.4f} ± {stats['f1_std']:.4f}")

        if emotion_accs:
            stats = _summarise(emotion_accs, emotion_f1s)
            model_stability['emotion'] = stats
            print(f"   😊 Emotion   - Accuracy: {stats['accuracy_mean']:.4f} ± {stats['accuracy_std']:.4f}")
            print(f"                Macro F1:  {stats['f1_mean']:.4f} ± {stats['f1_std']:.4f}")

        stability_analysis[model_name] = model_stability

    return {
        'bootstrap_results': bootstrap_results,
        'stability_analysis': stability_analysis,
        'n_bootstrap': len(bootstrap_results)
    }

# Run the bootstrap analysis with the function's defaults (10 samples, each 80% of
# the data). Each sample is a different resample of the evaluation set, which is
# intended to produce a non-zero spread in the metrics.
bootstrap_analysis = run_bootstrap_stability_analysis(models, texts, true_sentiments, true_emotions)


🔄 Bootstrap Stability Analysis
Running 10 bootstrap samples (each with 80% of data)

📊 Bootstrap Sample 1/10

🎯 Running Evaluation (Seed: 42)
🔮 Evaluating RoBERTa-Sentiment for sentiment...
🔮 Evaluating RoBERTa-Emotion for emotion...
🔮 Evaluating DeBERTa-Multitask for both tasks...
🔮 Evaluating BERTweet-Multitask for both tasks...

📊 Bootstrap Sample 2/10

🎯 Running Evaluation (Seed: 42)
🔮 Evaluating RoBERTa-Sentiment for sentiment...
🔮 Evaluating RoBERTa-Emotion for emotion...
🔮 Evaluating DeBERTa-Multitask for both tasks...
🔮 Evaluating BERTweet-Multitask for both tasks...

📊 Bootstrap Sample 3/10

🎯 Running Evaluation (Seed: 42)
🔮 Evaluating RoBERTa-Sentiment for sentiment...
🔮 Evaluating RoBERTa-Emotion for emotion...
🔮 Evaluating DeBERTa-Multitask for both tasks...
🔮 Evaluating BERTweet-Multitask for both tasks...

📊 Bootstrap Sample 4/10

🎯 Running Evaluation (Seed: 42)
🔮 Evaluating RoBERTa-Sentiment for sentiment...
🔮 Evaluating RoBERTa-Emotion for emotion...
🔮 Evaluating DeBER