In [75]:
# Cell 1: Setup and Imports for RoBERTa Pipeline
import os
import json
import time
import random
import numpy as np
import pandas as pd
from typing import Dict, List, Tuple, Optional
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Deep Learning
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import LinearLR

# Transformers
from transformers import (
    AutoTokenizer, 
    AutoModel,
    get_linear_schedule_with_warmup
)

# ML utilities
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
import joblib

# Hyperparameter tuning
import optuna

# Dataset loading
from datasets import load_dataset

# Set random seeds for reproducibility
def set_random_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_random_seed(42)

# GPU setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

print("All imports completed and GPU configured")

Using device: cuda
GPU: NVIDIA GeForce RTX 4060
GPU Memory: 8.0 GB
All imports completed and GPU configured


In [76]:
# Before training, add cleanup code
import shutil
import os
import gc
import torch
import time

# Clean up output directory if it exists
output_dir = "./initial_distilroberta_sentiment_model"
if os.path.exists(output_dir):
    shutil.rmtree(output_dir)
os.makedirs(output_dir)

# Ensure CUDA cache is cleared
if torch.cuda.is_available():
    torch.cuda.empty_cache()
gc.collect()

2972

In [77]:
# Cell 2: Configuration Classes for distilroBERTa
from dataclasses import dataclass

@dataclass
class TrainingConfig:
    model_name: str = "distilroberta-base"
    max_length: int = 128
    batch_size: int = 8  # Standardize to match BERTweet
    learning_rate: float = 2e-5
    num_epochs: int = 3
    warmup_ratio: float = 0.1
    weight_decay: float = 0.01
    max_grad_norm: float = 1.0
    hidden_dropout_prob: float = 0.1
    attention_dropout_prob: float = 0.1
    classifier_dropout: float = 0.1
    output_dir: str = "./distilroberta_model_output"
    alpha: float = 0.5  # For multitask loss weighting
    task_type: str = "multitask"  # "sentiment", "emotion", or "multitask"

class distilroBERTaModelConfig:
    def __init__(self):
        self.sentiment_classes = ['Negative', 'Neutral', 'Positive']
        self.emotion_classes = ['Anger', 'Fear', 'Joy', 'No Emotion', 'Sadness', 'Surprise']
        self.sentiment_num_classes = len(self.sentiment_classes)
        self.emotion_num_classes = len(self.emotion_classes)

roberta_model_config = distilroBERTaModelConfig()
print("Configuration classes defined")

Configuration classes defined


In [78]:
class distilroBERTaModelConfig:
    def __init__(self):
        self.sentiment_classes = ['Negative', 'Neutral', 'Positive']
        self.emotion_classes = ['Anger', 'Fear', 'Joy', 'No Emotion', 'Sadness', 'Surprise']
        self.sentiment_num_classes = len(self.sentiment_classes)
        self.emotion_num_classes = len(self.emotion_classes)

roberta_model_config = distilroBERTaModelConfig()
print("Configuration classes defined")

Configuration classes defined


In [79]:
# Cell 3: Dataset Classes for distilroBERTa
class distilroBERTaSingleTaskDataset:
    def __init__(
        self,
        texts: List[str],
        labels: List[int],
        tokenizer,
        max_length: int = 128
    ):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        
        assert len(texts) == len(labels), "Texts and labels must have same length"
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        
        # RoBERTa tokenization
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long),
            'text': text
        }

class distilroBERTaMultiTaskDataset(Dataset):
    
    def __init__(
        self,
        texts: List[str],
        sentiment_labels: List[int],
        emotion_labels: List[int],
        tokenizer,
        max_length: int = 128
    ):
        self.texts = texts
        self.sentiment_labels = sentiment_labels
        self.emotion_labels = emotion_labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        
        assert len(texts) == len(sentiment_labels) == len(emotion_labels), \
            "All inputs must have same length"
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        sentiment_label = self.sentiment_labels[idx]
        emotion_label = self.emotion_labels[idx]
        
        # RoBERTa tokenization
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'sentiment_labels': torch.tensor(sentiment_label, dtype=torch.long),
            'emotion_labels': torch.tensor(emotion_label, dtype=torch.long),
            'text': text
        }

print("distilroBERTa Dataset classes defined")

distilroBERTa Dataset classes defined


In [80]:
# Cell 4: Model Architectures
class distilroBERTaSingleTaskTransformer(nn.Module):
    
    def __init__(
        self,
        model_name: str = "distilroberta-base",
        num_classes: int = 3,
        hidden_dropout_prob: float = 0.1,
        attention_dropout_prob: float = 0.1,
        classifier_dropout: float = 0.1
    ):
        super().__init__()
        self.num_classes = num_classes
        
        # Load RoBERTa model
        self.roberta = AutoModel.from_pretrained(
            model_name,
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_dropout_prob
        )
        
        # Classification head
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(self.roberta.config.hidden_size, num_classes)
    
    def forward(self, input_ids, attention_mask):
        # Get RoBERTa outputs
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        
        # Use [CLS] token representation
        pooled_output = outputs.last_hidden_state[:, 0]  # [CLS] token
        pooled_output = self.dropout(pooled_output)
        
        # Classification
        logits = self.classifier(pooled_output)
        
        return {'logits': logits}

class distilroBERTaMultiTaskTransformer(nn.Module):
    
    def __init__(
        self,
        model_name: str = "distilroberta-base",
        sentiment_num_classes: int = 3,
        emotion_num_classes: int = 6,
        hidden_dropout_prob: float = 0.1,
        attention_dropout_prob: float = 0.1,
        classifier_dropout: float = 0.1
    ):
        super().__init__()
        self.sentiment_num_classes = sentiment_num_classes
        self.emotion_num_classes = emotion_num_classes
        
        # Shared RoBERTa encoder
        self.roberta = AutoModel.from_pretrained(
            model_name,
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_dropout_prob
        )
        
        # Task-specific heads
        self.dropout = nn.Dropout(classifier_dropout)
        
        # Sentiment classification head
        self.sentiment_classifier = nn.Linear(
            self.roberta.config.hidden_size, 
            sentiment_num_classes
        )
        
        # Emotion classification head
        self.emotion_classifier = nn.Linear(
            self.roberta.config.hidden_size, 
            emotion_num_classes
        )
    
    def forward(self, input_ids, attention_mask):
        # Get shared RoBERTa representations
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        
        # Use [CLS] token representation
        pooled_output = outputs.last_hidden_state[:, 0]  # [CLS] token
        pooled_output = self.dropout(pooled_output)
        
        # Task-specific predictions
        sentiment_logits = self.sentiment_classifier(pooled_output)
        emotion_logits = self.emotion_classifier(pooled_output)
        
        return {
            'sentiment_logits': sentiment_logits,
            'emotion_logits': emotion_logits
        }

print("distilroBERTa Model architectures defined")

distilroBERTa Model architectures defined


In [81]:
# Cell 5: Data Loading and Processing Functions for RoBERTa
def aggressive_memory_cleanup():
    import gc
    gc.collect()
    torch.cuda.empty_cache()

def load_and_process_datasets_roberta():
    """Load and process datasets for RoBERTa training"""
    print("📥 Loading external datasets for RoBERTa...")
    
    # Load SST-2 for sentiment
    try:
        sst2_dataset = load_dataset("sst2")
        print(f"✅ SST-2 dataset loaded: {len(sst2_dataset['train'])} train samples")
    except Exception as e:
        print(f"❌ Error loading SST-2: {e}")
        raise
    
    # Load GoEmotions for emotion
    try:
        emotions_dataset = load_dataset("go_emotions", "simplified")
        print(f"✅ GoEmotions dataset loaded: {len(emotions_dataset['train'])} train samples")
    except Exception as e:
        print(f"❌ Error loading GoEmotions: {e}")
        raise
    
    # Process sentiment data
    sentiment_data = process_sentiment_data_roberta(sst2_dataset)
    
    # Process emotion data  
    emotion_data = process_emotion_data_roberta(emotions_dataset)
    
    return sentiment_data, emotion_data

def process_sentiment_data_roberta(sst2_dataset, max_samples=10000):
    """Process SST-2 dataset for RoBERTa sentiment classification"""
    
    print("🔄 Processing sentiment data for RoBERTa...")
    
    # Extract texts and labels
    train_texts = sst2_dataset['train']['sentence'][:max_samples]
    train_labels = sst2_dataset['train']['label'][:max_samples]
    
    # Map SST-2 labels to 3 classes: 0->Negative, 1->Positive
    # Add some neutral examples by random assignment
    expanded_labels = []
    expanded_texts = []
    
    for text, label in zip(train_texts, train_labels):
        if label == 0:  # Negative
            expanded_labels.append(0)
            expanded_texts.append(text)
        elif label == 1:  # Positive
            # Sometimes assign as positive, sometimes as neutral
            if np.random.random() < 0.15:  # 15% chance to be neutral
                expanded_labels.append(1)  # Neutral
            else:
                expanded_labels.append(2)  # Positive
            expanded_texts.append(text)
    
    # Ensure we have all 3 classes
    if 1 not in expanded_labels:
        # Force some examples to be neutral
        neutral_indices = np.random.choice(len(expanded_labels), size=100, replace=False)
        for idx in neutral_indices:
            expanded_labels[idx] = 1
    
    # Create train/val/test splits
    train_texts, temp_texts, train_labels, temp_labels = train_test_split(
        expanded_texts, expanded_labels, test_size=0.3, random_state=42, stratify=expanded_labels
    )
    
    val_texts, test_texts, val_labels, test_labels = train_test_split(
        temp_texts, temp_labels, test_size=0.5, random_state=42, stratify=temp_labels
    )
    
    sentiment_data = {
        'train': {'texts': train_texts, 'labels': train_labels},
        'val': {'texts': val_texts, 'labels': val_labels},
        'test': {'texts': test_texts, 'labels': test_labels}
    }
    
    print(f"✅ RoBERTa Sentiment data processed:")
    print(f"  Train: {len(train_texts)} samples")
    print(f"  Val: {len(val_texts)} samples")
    print(f"  Test: {len(test_texts)} samples")
    
    return sentiment_data

def process_emotion_data_roberta(emotion_dataset, max_samples=10000):
    """Process GoEmotion dataset for RoBERTa emotion classification"""
    
    print("🔄 Processing emotion data for RoBERTa...")
    
    # Filter to first 6 emotions only
    def filter_emotions(example):
        if isinstance(example['labels'], list):
            return example['labels'] and example['labels'][0] in range(6)
        else:
            return example['labels'] in range(6)
    
    filtered_train = emotion_dataset['train'].filter(filter_emotions)
    filtered_val = emotion_dataset['validation'].filter(filter_emotions)
    
    # Extract texts and labels
    train_texts = filtered_train['text'][:max_samples]
    train_labels_raw = filtered_train['labels'][:max_samples]
    
    # Handle multi-label to single-label conversion
    train_labels = []
    for label in train_labels_raw:
        if isinstance(label, list):
            train_labels.append(label[0] if label else 0)
        else:
            train_labels.append(label)
    
    # Create train/val/test splits
    train_texts, temp_texts, train_labels, temp_labels = train_test_split(
        train_texts, train_labels, test_size=0.3, random_state=42, stratify=train_labels
    )
    
    val_texts, test_texts, val_labels, test_labels = train_test_split(
        temp_texts, temp_labels, test_size=0.5, random_state=42, stratify=temp_labels
    )
    
    emotion_data = {
        'train': {'texts': train_texts, 'labels': train_labels},
        'val': {'texts': val_texts, 'labels': val_labels},
        'test': {'texts': test_texts, 'labels': test_labels}
    }
    
    print(f"✅ RoBERTa Emotion data processed:")
    print(f"  Train: {len(train_texts)} samples")
    print(f"  Val: {len(val_texts)} samples")
    print(f"  Test: {len(test_texts)} samples")
    
    return emotion_data

def create_multitask_data_roberta(sentiment_data, emotion_data):
    """Create combined dataset for multi-task learning with RoBERTa"""
    
    print("🔄 Creating multi-task dataset for RoBERTa...")
    
    # Take minimum length to balance datasets
    min_train_len = min(len(sentiment_data['train']['texts']), len(emotion_data['train']['texts']))
    min_val_len = min(len(sentiment_data['val']['texts']), len(emotion_data['val']['texts']))
    min_test_len = min(len(sentiment_data['test']['texts']), len(emotion_data['test']['texts']))
    
    multitask_data = {
        'train': {
            'texts': sentiment_data['train']['texts'][:min_train_len],
            'sentiment_labels': sentiment_data['train']['labels'][:min_train_len],
            'emotion_labels': emotion_data['train']['labels'][:min_train_len]
        },
        'val': {
            'texts': sentiment_data['val']['texts'][:min_val_len],
            'sentiment_labels': sentiment_data['val']['labels'][:min_val_len],
            'emotion_labels': emotion_data['val']['labels'][:min_val_len]
        },
        'test': {
            'texts': sentiment_data['test']['texts'][:min_test_len],
            'sentiment_labels': sentiment_data['test']['labels'][:min_test_len],
            'emotion_labels': emotion_data['test']['labels'][:min_test_len]
        }
    }
    
    print(f"✅ RoBERTa Multi-task data created:")
    print(f"  Train: {len(multitask_data['train']['texts'])} samples")
    print(f"  Val: {len(multitask_data['val']['texts'])} samples")
    print(f"  Test: {len(multitask_data['test']['texts'])} samples")
    
    return multitask_data

print("✅ RoBERTa data processing functions defined!")

✅ RoBERTa data processing functions defined!


In [82]:
# Cell 6: distilroBERTa Training Classes
class distilroBERTaSingleTaskTrainer:
    
    def __init__(self, config: TrainingConfig, num_classes: int):
        self.config = config
        self.num_classes = num_classes
        self.device = device
        
        # Initialize distilroBERTa tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(config.model_name)
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        
        # Initialize distilroBERTa model
        self.model = distilroBERTaSingleTaskTransformer(
            model_name=config.model_name,
            num_classes=num_classes,
            hidden_dropout_prob=config.hidden_dropout_prob,
            attention_dropout_prob=config.attention_dropout_prob,
            classifier_dropout=config.classifier_dropout
        ).to(self.device)
        
        # Loss function
        self.loss_fn = nn.CrossEntropyLoss()
        
        # Initialize tracking
        self.training_history = {
            'train_loss': [],
            'train_accuracy': [],
            'val_loss': [],
            'val_accuracy': [],
            'val_f1_macro': []
        }
    
    def create_data_loaders(self, data_splits: Dict):
        
        # Create datasets
        train_dataset = distilroBERTaSingleTaskDataset(
            texts=data_splits['train']['texts'],
            labels=data_splits['train']['labels'],
            tokenizer=self.tokenizer,
            max_length=self.config.max_length
        )
        
        val_dataset = distilroBERTaSingleTaskDataset(
            texts=data_splits['val']['texts'],
            labels=data_splits['val']['labels'],
            tokenizer=self.tokenizer,
            max_length=self.config.max_length
        )
        
        # Create data loaders
        self.train_loader = DataLoader(
            train_dataset,
            batch_size=self.config.batch_size,
            shuffle=True,
            num_workers=0,
            pin_memory=True
        )
        
        self.val_loader = DataLoader(
            val_dataset,
            batch_size=self.config.batch_size,
            shuffle=False,
            num_workers=0,
            pin_memory=True
        )
        
        # Setup optimizer and scheduler
        total_steps = len(self.train_loader) * self.config.num_epochs
        
        self.optimizer = AdamW(
            self.model.parameters(),
            lr=self.config.learning_rate,
            weight_decay=self.config.weight_decay
        )
        
        self.scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=int(total_steps * self.config.warmup_ratio),
            num_training_steps=total_steps
        )
    
    def train_epoch(self):
        self.model.train()
        
        total_loss = 0.0
        correct_predictions = 0
        total_predictions = 0
        
        for batch in self.train_loader:
            # Move to device
            input_ids = batch['input_ids'].to(self.device)
            attention_mask = batch['attention_mask'].to(self.device)
            labels = batch['labels'].to(self.device)
            
            # Forward pass
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
            loss = self.loss_fn(outputs['logits'], labels)
            
            # Backward pass
            self.optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config.max_grad_norm)
            self.optimizer.step()
            self.scheduler.step()
            
            # Track metrics
            total_loss += loss.item()
            predictions = torch.argmax(outputs['logits'], dim=-1)
            correct_predictions += (predictions == labels).sum().item()
            total_predictions += labels.size(0)
        
        avg_loss = total_loss / len(self.train_loader)
        accuracy = correct_predictions / total_predictions
        
        return avg_loss, accuracy
    
    def evaluate(self):
        self.model.eval()
        
        total_loss = 0.0
        all_predictions = []
        all_labels = []
        
        with torch.no_grad():
            for batch in self.val_loader:
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['labels'].to(self.device)
                
                outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
                loss = self.loss_fn(outputs['logits'], labels)
                
                total_loss += loss.item()
                predictions = torch.argmax(outputs['logits'], dim=-1)
                
                all_predictions.extend(predictions.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())
        
        avg_loss = total_loss / len(self.val_loader)
        accuracy = accuracy_score(all_labels, all_predictions)
        f1_macro = f1_score(all_labels, all_predictions, average='macro', zero_division=0)
        
        return avg_loss, accuracy, f1_macro
    
    def train(self, data_splits: Dict):
        print(f"Starting distilroBERTa single-task training ({self.config.task_type})...")
        
        # Setup data loaders
        self.create_data_loaders(data_splits)
        
        best_f1 = 0.0
        
        for epoch in range(self.config.num_epochs):
            print(f"\n📍 Epoch {epoch + 1}/{self.config.num_epochs}")
            
            # Train
            train_loss, train_accuracy = self.train_epoch()
            
            # Evaluate
            val_loss, val_accuracy, val_f1_macro = self.evaluate()
            
            # Track metrics
            self.training_history['train_loss'].append(train_loss)
            self.training_history['train_accuracy'].append(train_accuracy)
            self.training_history['val_loss'].append(val_loss)
            self.training_history['val_accuracy'].append(val_accuracy)
            self.training_history['val_f1_macro'].append(val_f1_macro)
            
            # Print results
            print(f"  Train Loss: {train_loss:.4f}, Train Acc: {train_accuracy:.4f}")
            print(f"  Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.4f}, Val F1: {val_f1_macro:.4f}")
            
            # Save best model
            if val_f1_macro > best_f1:
                best_f1 = val_f1_macro
                self.save_model(is_best=True)
        
        print(f"\ndistilroBERTa training completed! Best F1: {best_f1:.4f}")
        return self.training_history
    
    def save_model(self, is_best=False):
        suffix = "_best" if is_best else ""
        model_dir = os.path.join(self.config.output_dir, f"model{suffix}")
        
        os.makedirs(model_dir, exist_ok=True)
        
        # Save model
        self.model.roberta.save_pretrained(model_dir)
        self.tokenizer.save_pretrained(model_dir)
        
        # Save custom components
        torch.save({
            'classifier_state_dict': self.model.classifier.state_dict(),
            'num_classes': self.num_classes,
            'config': self.config
        }, os.path.join(model_dir, 'custom_components.pt'))
        
        if is_best:
            print(f"Best distilroBERTa model saved to {model_dir}")

class distilroBERTaMultiTaskTrainer:
    
    def __init__(self, config: TrainingConfig):
        self.config = config
        self.device = device
        
        # Initialize RoBERTa tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(config.model_name)
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        
        # Initialize RoBERTa multi-task model
        self.model = distilroBERTaMultiTaskTransformer(
            model_name=config.model_name,
            sentiment_num_classes=roberta_model_config.sentiment_num_classes,
            emotion_num_classes=roberta_model_config.emotion_num_classes,
            hidden_dropout_prob=config.hidden_dropout_prob,
            attention_dropout_prob=config.attention_dropout_prob,
            classifier_dropout=config.classifier_dropout
        ).to(self.device)
        
        # Loss function
        self.loss_fn = nn.CrossEntropyLoss()
        
        # Initialize tracking
        self.training_history = {
            'train_loss': [],
            'train_sentiment_accuracy': [],
            'train_emotion_accuracy': [],
            'val_loss': [],
            'val_sentiment_accuracy': [],
            'val_emotion_accuracy': [],
            'val_sentiment_f1_macro': [],
            'val_emotion_f1_macro': []
        }
    
    def create_data_loaders(self, data_splits: Dict):
        """Create data loaders for RoBERTa multi-task training"""
        
        # Create datasets
        train_dataset = distilroBERTaMultiTaskDataset(
            texts=data_splits['train']['texts'],
            sentiment_labels=data_splits['train']['sentiment_labels'],
            emotion_labels=data_splits['train']['emotion_labels'],
            tokenizer=self.tokenizer,
            max_length=self.config.max_length
        )
        
        val_dataset = distilroBERTaMultiTaskDataset(
            texts=data_splits['val']['texts'],
            sentiment_labels=data_splits['val']['sentiment_labels'],
            emotion_labels=data_splits['val']['emotion_labels'],
            tokenizer=self.tokenizer,
            max_length=self.config.max_length
        )
        
        # Create data loaders
        self.train_loader = DataLoader(
            train_dataset,
            batch_size=self.config.batch_size,
            shuffle=True,
            num_workers=0,
            pin_memory=True
        )
        
        self.val_loader = DataLoader(
            val_dataset,
            batch_size=self.config.batch_size,
            shuffle=False,
            num_workers=0,
            pin_memory=True
        )
        
        # Setup optimizer and scheduler
        total_steps = len(self.train_loader) * self.config.num_epochs
        
        self.optimizer = AdamW(
            self.model.parameters(),
            lr=self.config.learning_rate,
            weight_decay=self.config.weight_decay
        )
        
        self.scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=int(total_steps * self.config.warmup_ratio),
            num_training_steps=total_steps
        )
    
    def train_epoch(self):
        self.model.train()
        
        total_loss = 0.0
        sentiment_correct = 0
        emotion_correct = 0
        total_predictions = 0
        
        for batch in self.train_loader:
            # Move to device
            input_ids = batch['input_ids'].to(self.device)
            attention_mask = batch['attention_mask'].to(self.device)
            sentiment_labels = batch['sentiment_labels'].to(self.device)
            emotion_labels = batch['emotion_labels'].to(self.device)
            
            # Forward pass
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
            
            # Calculate losses
            sentiment_loss = self.loss_fn(outputs['sentiment_logits'], sentiment_labels)
            emotion_loss = self.loss_fn(outputs['emotion_logits'], emotion_labels)
            
            # Combined loss with alpha weighting
            loss = self.config.alpha * sentiment_loss + (1 - self.config.alpha) * emotion_loss
            
            # Backward pass
            self.optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config.max_grad_norm)
            self.optimizer.step()
            self.scheduler.step()
            
            # Track metrics
            total_loss += loss.item()
            
            sentiment_preds = torch.argmax(outputs['sentiment_logits'], dim=-1)
            emotion_preds = torch.argmax(outputs['emotion_logits'], dim=-1)
            
            sentiment_correct += (sentiment_preds == sentiment_labels).sum().item()
            emotion_correct += (emotion_preds == emotion_labels).sum().item()
            total_predictions += sentiment_labels.size(0)
        
        avg_loss = total_loss / len(self.train_loader)
        sentiment_accuracy = sentiment_correct / total_predictions
        emotion_accuracy = emotion_correct / total_predictions
        
        return avg_loss, sentiment_accuracy, emotion_accuracy
    
    def evaluate(self):
        self.model.eval()
        
        total_loss = 0.0
        sentiment_predictions = []
        emotion_predictions = []
        sentiment_labels = []
        emotion_labels = []
        
        with torch.no_grad():
            for batch in self.val_loader:
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                sentiment_true = batch['sentiment_labels'].to(self.device)
                emotion_true = batch['emotion_labels'].to(self.device)
                
                outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
                
                # Calculate combined loss
                sentiment_loss = self.loss_fn(outputs['sentiment_logits'], sentiment_true)
                emotion_loss = self.loss_fn(outputs['emotion_logits'], emotion_true)
                loss = self.config.alpha * sentiment_loss + (1 - self.config.alpha) * emotion_loss
                
                total_loss += loss.item()
                
                sentiment_preds = torch.argmax(outputs['sentiment_logits'], dim=-1)
                emotion_preds = torch.argmax(outputs['emotion_logits'], dim=-1)
                
                sentiment_predictions.extend(sentiment_preds.cpu().numpy())
                emotion_predictions.extend(emotion_preds.cpu().numpy())
                sentiment_labels.extend(sentiment_true.cpu().numpy())
                emotion_labels.extend(emotion_true.cpu().numpy())
        
        avg_loss = total_loss / len(self.val_loader)
        
        # Calculate metrics
        sentiment_accuracy = accuracy_score(sentiment_labels, sentiment_predictions)
        emotion_accuracy = accuracy_score(emotion_labels, emotion_predictions)
        sentiment_f1_macro = f1_score(sentiment_labels, sentiment_predictions, average='macro', zero_division=0)
        emotion_f1_macro = f1_score(emotion_labels, emotion_predictions, average='macro', zero_division=0)
        
        return avg_loss, sentiment_accuracy, emotion_accuracy, sentiment_f1_macro, emotion_f1_macro
    
    def train(self, data_splits: Dict):
        print(f"🚀 Starting distilroBERTa multi-task training...")
        
        # Setup data loaders
        self.create_data_loaders(data_splits)
        
        best_combined_f1 = 0.0
        
        for epoch in range(self.config.num_epochs):
            print(f"\n📍 Epoch {epoch + 1}/{self.config.num_epochs}")
            
            # Train
            train_loss, train_sent_acc, train_emo_acc = self.train_epoch()
            
            # Evaluate
            val_loss, val_sent_acc, val_emo_acc, val_sent_f1, val_emo_f1 = self.evaluate()
            
            # Track metrics
            self.training_history['train_loss'].append(train_loss)
            self.training_history['train_sentiment_accuracy'].append(train_sent_acc)
            self.training_history['train_emotion_accuracy'].append(train_emo_acc)
            self.training_history['val_loss'].append(val_loss)
            self.training_history['val_sentiment_accuracy'].append(val_sent_acc)
            self.training_history['val_emotion_accuracy'].append(val_emo_acc)
            self.training_history['val_sentiment_f1_macro'].append(val_sent_f1)
            self.training_history['val_emotion_f1_macro'].append(val_emo_f1)
            
            # Print results
            print(f"  Train Loss: {train_loss:.4f}")
            print(f"  Train Sentiment Acc: {train_sent_acc:.4f}, Train Emotion Acc: {train_emo_acc:.4f}")
            print(f"  Val Loss: {val_loss:.4f}")
            print(f"  Val Sentiment Acc: {val_sent_acc:.4f}, F1: {val_sent_f1:.4f}")
            print(f"  Val Emotion Acc: {val_emo_acc:.4f}, F1: {val_emo_f1:.4f}")
            
            # Save best model
            combined_f1 = (val_sent_f1 + val_emo_f1) / 2
            if combined_f1 > best_combined_f1:
                best_combined_f1 = combined_f1
                self.save_model(is_best=True)
        
        print(f"\ndistilroBERTa training completed! Best Combined F1: {best_combined_f1:.4f}")
        return self.training_history
    
    def save_model(self, is_best=False):
        suffix = "_best" if is_best else ""
        model_dir = os.path.join(self.config.output_dir, f"model{suffix}")
        
        os.makedirs(model_dir, exist_ok=True)
        
        # Save model
        self.model.roberta.save_pretrained(model_dir)
        self.tokenizer.save_pretrained(model_dir)
        
        # Save custom components
        torch.save({
            'sentiment_classifier_state_dict': self.model.sentiment_classifier.state_dict(),
            'emotion_classifier_state_dict': self.model.emotion_classifier.state_dict(),
            'sentiment_num_classes': self.model.sentiment_num_classes,
            'emotion_num_classes': self.model.emotion_num_classes,
            'config': self.config
        }, os.path.join(model_dir, 'custom_components.pt'))
        
        if is_best:
            print(f"Best distilroBERTa model saved to {model_dir}")

print("distilroBERTa Training classes defined!")

distilroBERTa Training classes defined!


In [83]:
# Cell 7: Evaluation Functions for distilroBERTa
def evaluate_distilroberta_model(model_path: str, model_type: str, test_data: Dict, model_name: str):
    # Initialize tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    # Initialize the appropriate model architecture
    if model_type == "multitask":
        model = distilroBERTaMultiTaskTransformer(
            model_name=model_name,
            sentiment_num_classes=roberta_model_config.sentiment_num_classes,
            emotion_num_classes=roberta_model_config.emotion_num_classes
        )
    else:
        num_classes = (roberta_model_config.sentiment_num_classes 
                      if model_type == "sentiment" 
                      else roberta_model_config.emotion_num_classes)
        model = distilroBERTaSingleTaskTransformer(
            model_name=model_name,
            num_classes=num_classes
        )
    
    # Load the saved state dict
    custom_components = torch.load(os.path.join(model_path, 'custom_components.pt'))
    
    if model_type == "multitask":
        model.sentiment_classifier.load_state_dict(custom_components['sentiment_classifier_state_dict'])
        model.emotion_classifier.load_state_dict(custom_components['emotion_classifier_state_dict'])
    else:
        model.classifier.load_state_dict(custom_components['classifier_state_dict'])
    
    # Load the base model weights
    base_model = AutoModel.from_pretrained(model_path)
    model.roberta = base_model
    
    # Make sure model is on the correct device
    model = model.to(device)
    model.eval()
    
    # Rest of the evaluation code remains the same...
    # Create dataset and dataloader
    if model_type == "multitask":
        dataset = distilroBERTaMultiTaskDataset(
            texts=test_data['texts'],
            sentiment_labels=test_data['sentiment_labels'],
            emotion_labels=test_data['emotion_labels'],
            tokenizer=tokenizer,
            max_length=128
        )
    else:
        dataset = distilroBERTaSingleTaskDataset(
            texts=test_data['texts'],
            labels=test_data['labels'],
            tokenizer=tokenizer,
            max_length=128
        )
    
    dataloader = DataLoader(
        dataset,
        batch_size=32,
        shuffle=False,
        num_workers=0,
        pin_memory=True
    )
    
    all_predictions = []
    all_labels = []
    
    with torch.no_grad():
        for batch in dataloader:
            # Move everything to the same device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            
            if model_type == "multitask":
                sentiment_labels = batch['sentiment_labels'].to(device)
                emotion_labels = batch['emotion_labels'].to(device)
                
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                sentiment_preds = torch.argmax(outputs['sentiment_logits'], dim=-1)
                emotion_preds = torch.argmax(outputs['emotion_logits'], dim=-1)
                
                # Move predictions back to CPU for sklearn metrics
                all_predictions.extend([
                    sentiment_preds.cpu().numpy(),
                    emotion_preds.cpu().numpy()
                ])
                all_labels.extend([
                    sentiment_labels.cpu().numpy(),
                    emotion_labels.cpu().numpy()
                ])
            else:
                labels = batch['labels'].to(device)
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                predictions = torch.argmax(outputs['logits'], dim=-1)
                
                # Move predictions back to CPU for sklearn metrics
                all_predictions.extend(predictions.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())
    
    # Calculate metrics
    if model_type == "multitask":
        sentiment_accuracy = accuracy_score(all_labels[0], all_predictions[0])
        sentiment_f1 = f1_score(all_labels[0], all_predictions[0], average='macro')
        emotion_accuracy = accuracy_score(all_labels[1], all_predictions[1])
        emotion_f1 = f1_score(all_labels[1], all_predictions[1], average='macro')
        
        return {
            'sentiment_accuracy': sentiment_accuracy,
            'sentiment_f1_macro': sentiment_f1,
            'emotion_accuracy': emotion_accuracy,
            'emotion_f1_macro': emotion_f1,
            'combined_accuracy': (sentiment_accuracy + emotion_accuracy) / 2,
            'combined_f1_macro': (sentiment_f1 + emotion_f1) / 2
        }
    else:
        return {
            'accuracy': accuracy_score(all_labels, all_predictions),
            'f1_macro': f1_score(all_labels, all_predictions, average='macro')
        }
    
print("distilroBERTa evaluation functions defined")

distilroBERTa evaluation functions defined


In [84]:
# Cell 8: Ultra-Fast Hyperparameter Tuning Classes for distilroBERTa 
def create_tuning_subset(data_splits, subset_ratio=0.01):  # Even smaller: 1%
    print(f"🔪 Creating {subset_ratio*100:.0f}% subset for hyperparameter tuning...")
    
    def sample_split(split_data, ratio):
        n_samples = int(len(split_data['texts']) * ratio)
        if n_samples < 20:  # Minimum 20 samples
            n_samples = min(20, len(split_data['texts']))
        indices = np.random.choice(len(split_data['texts']), n_samples, replace=False)
        
        return {
            'texts': [split_data['texts'][i] for i in indices],
            'labels': [split_data['labels'][i] for i in indices]
        }
    
    val_key = 'val' if 'val' in data_splits else ('validation' if 'validation' in data_splits else 'test')
    
    tuning_data = {
        'train': sample_split(data_splits['train'], subset_ratio),
        'val': sample_split(data_splits[val_key], subset_ratio),
        'test': sample_split(data_splits['test'], subset_ratio) if 'test' in data_splits else sample_split(data_splits[val_key], subset_ratio)
    }
    
    print(f"📊 Tuning subset created:")
    print(f"  Train: {len(tuning_data['train']['texts'])} samples")
    print(f"  Val: {len(tuning_data['val']['texts'])} samples")
    
    return tuning_data

def create_multitask_tuning_subset(data_splits, subset_ratio=0.01):
    print(f"🔪 Creating {subset_ratio*100:.0f}% multitask subset for hyperparameter tuning...")
    
    def sample_multitask_split(split_data, ratio):
        n_samples = int(len(split_data['texts']) * ratio)
        if n_samples < 20:
            n_samples = min(20, len(split_data['texts']))
        indices = np.random.choice(len(split_data['texts']), n_samples, replace=False)
        
        return {
            'texts': [split_data['texts'][i] for i in indices],
            'sentiment_labels': [split_data['sentiment_labels'][i] for i in indices],
            'emotion_labels': [split_data['emotion_labels'][i] for i in indices]
        }
    
    val_key = 'val' if 'val' in data_splits else ('validation' if 'validation' in data_splits else 'test')
    
    tuning_data = {
        'train': sample_multitask_split(data_splits['train'], subset_ratio),
        'val': sample_multitask_split(data_splits[val_key], subset_ratio),
        'test': sample_multitask_split(data_splits['test'], subset_ratio) if 'test' in data_splits else sample_multitask_split(data_splits[val_key], subset_ratio)
    }
    
    print(f"Multitask tuning subset created:")
    print(f"  Train: {len(tuning_data['train']['texts'])} samples")
    print(f"  Val: {len(tuning_data['val']['texts'])} samples")
    
    return tuning_data

In [85]:
# Cell 8: Hyperparameter Tuning Classes for distilroBERTa
class distilroBERTaHyperparameterTuner:
    
    def __init__(
        self,
        model_type: str,  # "sentiment", "emotion", "multitask"
        data_splits: Dict,
        n_trials: int = 15,
        model_name: str = "distilroberta-base"
    ):
        self.model_type = model_type
        self.data_splits = data_splits
        self.n_trials = n_trials
        self.model_name = model_name
        
        print(f"🔍 distilroBERTa hyperparameter tuner initialized for {model_type}")
        print(f"🚀 Using Random Search for optimization")
    
    def objective(self, trial):
        """Optuna objective function for distilroBERTa"""
        
        # Sample hyperparameters
        learning_rate = trial.suggest_float('learning_rate', 2e-5, 1e-4, log=True)
        batch_size = trial.suggest_categorical('batch_size', [4, 8, 16])
        num_epochs = trial.suggest_int('num_epochs', 3, 6)
        warmup_ratio = trial.suggest_float('warmup_ratio', 0.05, 0.2)
        weight_decay = trial.suggest_float('weight_decay', 0.001, 0.1)
        hidden_dropout = trial.suggest_float('hidden_dropout_prob', 0.1, 0.3)
        classifier_dropout = trial.suggest_float('classifier_dropout', 0.1, 0.4)
        max_length = trial.suggest_categorical('max_length', [128, 256])
        
        # Multi-task specific parameter
        alpha = trial.suggest_float('alpha', 0.3, 0.7) if self.model_type == "multitask" else 0.5
        
        # Create config
        config = TrainingConfig(
            model_name=self.model_name,
            learning_rate=learning_rate,
            batch_size=batch_size,
            num_epochs=num_epochs,
            warmup_ratio=warmup_ratio,
            weight_decay=weight_decay,
            hidden_dropout_prob=hidden_dropout,
            classifier_dropout=classifier_dropout,
            max_length=max_length,
            alpha=alpha,
            task_type=self.model_type,
            output_dir=f"./distilroberta_trial_{trial.number}"
        )
        
        try:
            # Clear memory
            aggressive_memory_cleanup()
            
            # Train model
            if self.model_type == "multitask":
                trainer = distilroBERTaMultiTaskTrainer(config)
                history = trainer.train(self.data_splits)
                
                # Return combined F1 score
                best_sentiment_f1 = max(history['val_sentiment_f1_macro'])
                best_emotion_f1 = max(history['val_emotion_f1_macro'])
                combined_f1 = (best_sentiment_f1 + best_emotion_f1) / 2
                
                print(f"Trial {trial.number}: Combined F1 = {combined_f1:.4f}")
                return combined_f1
                
            else:
                # Single task training
                if self.model_type == "sentiment":
                    num_classes = roberta_model_config.sentiment_num_classes
                else:  # emotion
                    num_classes = roberta_model_config.emotion_num_classes
                
                trainer = distilroBERTaSingleTaskTrainer(config, num_classes)
                history = trainer.train(self.data_splits)
                
                # Return best F1 score
                best_f1 = max(history['val_f1_macro'])
                print(f"Trial {trial.number}: F1 = {best_f1:.4f}")
                return best_f1
                
        except Exception as e:
            print(f"Trial {trial.number} failed: {e}")
            return 0.0
        
        finally:
            # Clean up
            aggressive_memory_cleanup()
    
    def tune(self):
        """Run hyperparameter optimization"""
        
        # Create study with Random Search
        study = optuna.create_study(
            direction='maximize',
            sampler=optuna.samplers.RandomSampler(seed=42)
        )
        
        print(f"\n🔍 Starting hyperparameter optimization for {self.model_type}...")
        print(f"🎯 Random Search: {self.n_trials} trials")
        print("=" * 60)
        
        # Run optimization
        study.optimize(self.objective, n_trials=self.n_trials)
        
        # Print results
        print(f"\n🏆 Optimization completed for {self.model_type}!")
        print(f"Best trial: {study.best_trial.number}")
        print(f"Best F1 score: {study.best_value:.4f}")
        print(f"Best parameters:")
        for key, value in study.best_params.items():
            print(f"  {key}: {value}")
        
        return study

print("✅ distilroBERTa Hyperparameter Tuning classes defined!")

✅ distilroBERTa Hyperparameter Tuning classes defined!


In [86]:
# Cell 8B: Ultra-Fast Trainers for Speed
class UltraFastdistilroBERTaSingleTaskTrainer:
    
    def __init__(self, config: TrainingConfig, num_classes: int):
        self.config = config
        self.num_classes = num_classes
        self.device = device
        
        # Initialize tokenizer (reuse if possible)
        if not hasattr(self, '_tokenizer_cache'):
            UltraFastdistilroBERTaSingleTaskTrainer._tokenizer_cache = AutoTokenizer.from_pretrained(config.model_name)
            if UltraFastdistilroBERTaSingleTaskTrainer._tokenizer_cache.pad_token is None:
                UltraFastdistilroBERTaSingleTaskTrainer._tokenizer_cache.pad_token = UltraFastdistilroBERTaSingleTaskTrainer._tokenizer_cache.eos_token
        
        self.tokenizer = UltraFastdistilroBERTaSingleTaskTrainer._tokenizer_cache
        
        # Initialize model
        self.model = distilroBERTaSingleTaskTransformer(
            model_name=config.model_name,
            num_classes=num_classes,
            hidden_dropout_prob=config.hidden_dropout_prob,
            attention_dropout_prob=config.attention_dropout_prob,
            classifier_dropout=config.classifier_dropout
        ).to(self.device)
        
        self.loss_fn = nn.CrossEntropyLoss()
        self.training_history = {
            'train_loss': [], 'train_accuracy': [], 'val_loss': [], 'val_accuracy': [], 'val_f1_macro': []
        }
    
    def create_data_loaders(self, data_splits: Dict):
        train_dataset = distilroBERTaSingleTaskDataset(
            texts=data_splits['train']['texts'],
            labels=data_splits['train']['labels'],
            tokenizer=self.tokenizer,
            max_length=self.config.max_length
        )
        
        val_dataset = distilroBERTaSingleTaskDataset(
            texts=data_splits['val']['texts'],
            labels=data_splits['val']['labels'],
            tokenizer=self.tokenizer,
            max_length=self.config.max_length
        )
        
        # Speed-optimized data loaders
        self.train_loader = DataLoader(
            train_dataset,
            batch_size=self.config.batch_size,
            shuffle=True,
            num_workers=0,  # No multiprocessing for speed
            pin_memory=False  # Disable pin_memory
        )
        
        self.val_loader = DataLoader(
            val_dataset,
            batch_size=self.config.batch_size,
            shuffle=False,
            num_workers=0,
            pin_memory=False
        )
        
        # Simple optimizer setup
        self.optimizer = AdamW(
            self.model.parameters(),
            lr=self.config.learning_rate,
            weight_decay=self.config.weight_decay
        )
    
    def train_epoch(self):
        self.model.train()
        
        total_loss = 0.0
        correct_predictions = 0
        total_predictions = 0
        
        for batch_idx, batch in enumerate(self.train_loader):
            input_ids = batch['input_ids'].to(self.device, non_blocking=True)
            attention_mask = batch['attention_mask'].to(self.device, non_blocking=True)
            labels = batch['labels'].to(self.device, non_blocking=True)
            
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
            loss = self.loss_fn(outputs['logits'], labels)
            
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            
            # Track metrics
            total_loss += loss.item()
            predictions = torch.argmax(outputs['logits'], dim=-1)
            correct_predictions += (predictions == labels).sum().item()
            total_predictions += labels.size(0)
            
            # Print progress for very small datasets
            if batch_idx % max(1, len(self.train_loader) // 4) == 0:
                print(f"    Batch {batch_idx + 1}/{len(self.train_loader)}")
        
        return total_loss / len(self.train_loader), correct_predictions / total_predictions
    
    def evaluate(self):
        self.model.eval()
        
        total_loss = 0.0
        all_predictions = []
        all_labels = []
        
        with torch.no_grad():
            for batch in self.val_loader:
                input_ids = batch['input_ids'].to(self.device, non_blocking=True)
                attention_mask = batch['attention_mask'].to(self.device, non_blocking=True)
                labels = batch['labels'].to(self.device, non_blocking=True)
                
                outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
                loss = self.loss_fn(outputs['logits'], labels)
                
                total_loss += loss.item()
                predictions = torch.argmax(outputs['logits'], dim=-1)
                
                all_predictions.extend(predictions.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())
        
        accuracy = accuracy_score(all_labels, all_predictions)
        f1_macro = f1_score(all_labels, all_predictions, average='macro', zero_division=0)
        
        return total_loss / len(self.val_loader), accuracy, f1_macro
    
    def train(self, data_splits: Dict):
        print(f"🚀 Starting ultra-fast distilroBERTa training ({self.config.task_type})...")
        
        self.create_data_loaders(data_splits)
        
        best_f1 = 0.0
        
        for epoch in range(self.config.num_epochs):
            print(f"  📍 Epoch {epoch + 1}/{self.config.num_epochs}")
            
            train_loss, train_accuracy = self.train_epoch()
            val_loss, val_accuracy, val_f1_macro = self.evaluate()
            
            self.training_history['train_loss'].append(train_loss)
            self.training_history['train_accuracy'].append(train_accuracy)
            self.training_history['val_loss'].append(val_loss)
            self.training_history['val_accuracy'].append(val_accuracy)
            self.training_history['val_f1_macro'].append(val_f1_macro)
            
            print(f"    Loss: {train_loss:.4f}, Acc: {train_accuracy:.4f}, Val F1: {val_f1_macro:.4f}")
            
            if val_f1_macro > best_f1:
                best_f1 = val_f1_macro
        
        print(f"✅ Training completed! Best F1: {best_f1:.4f}")
        return self.training_history

class UltraFastRoBERTaMultiTaskTrainer:
    # Similar structure but for multitask...
    pass  # Implement if needed

print("Ultra-fast trainers defined")

Ultra-fast trainers defined


In [87]:
# Cell 9: Data Loading and Initial Setup for distilroBERTa
print("🚀 STARTING distilroBERTa TRAINING PIPELINE")
print("=" * 80)

# Clear memory before starting
aggressive_memory_cleanup()

# Load and process datasets for distilroBERTa
print("\n1️⃣ Loading and processing datasets for distilroBERTa...")
sentiment_data, emotion_data = load_and_process_datasets_roberta()
multitask_data = create_multitask_data_roberta(sentiment_data, emotion_data)

# Model configurations
model_name = "distilroberta-base"
n_trials = 15  # Number of hyperparameter tuning trials

print("Data loading completed!")
print(f"Sentiment data: {len(sentiment_data['train']['texts'])} train samples")
print(f"Emotion data: {len(emotion_data['train']['texts'])} train samples")
print(f"Multitask data: {len(multitask_data['train']['texts'])} train samples")
print(f"Model: {model_name}")
print(f"Hyperparameter trials per model: {n_trials}")

🚀 STARTING distilroBERTa TRAINING PIPELINE

1️⃣ Loading and processing datasets for distilroBERTa...
📥 Loading external datasets for RoBERTa...
✅ SST-2 dataset loaded: 67349 train samples
✅ GoEmotions dataset loaded: 43410 train samples
🔄 Processing sentiment data for RoBERTa...
✅ RoBERTa Sentiment data processed:
  Train: 7000 samples
  Val: 1500 samples
  Test: 1500 samples
🔄 Processing emotion data for RoBERTa...
✅ RoBERTa Emotion data processed:
  Train: 7000 samples
  Val: 1500 samples
  Test: 1500 samples
🔄 Creating multi-task dataset for RoBERTa...
✅ RoBERTa Multi-task data created:
  Train: 7000 samples
  Val: 1500 samples
  Test: 1500 samples
Data loading completed!
Sentiment data: 7000 train samples
Emotion data: 7000 train samples
Multitask data: 7000 train samples
Model: distilroberta-base
Hyperparameter trials per model: 15


In [88]:
# Cell 9: Initial Sentiment Model Training
print("\n" + "="*80)
print("📍 PHASE 1: INITIAL DISTILROBERTA TRAINING - SENTIMENT MODEL")
print("="*80)

# Initialize results dictionary
all_results = {}

# Default configuration for sentiment
default_config_sentiment = TrainingConfig(
    model_name=model_name,
    batch_size=8,
    learning_rate=2e-5,
    num_epochs=3,
    max_length=128,
    task_type="sentiment",
    output_dir="./initial_distilroberta_sentiment_model"
)

print("\n2️⃣ Training Initial distilroBERTa Sentiment Model...")
print("="*60)

# Train initial sentiment model
initial_sentiment_trainer = distilroBERTaSingleTaskTrainer(
    config=default_config_sentiment,
    num_classes=roberta_model_config.sentiment_num_classes
)
initial_sentiment_history = initial_sentiment_trainer.train(sentiment_data)

# Evaluate initial sentiment model
initial_sentiment_results = evaluate_distilroberta_model(
    model_path="./initial_distilroberta_sentiment_model/model_best",
    model_type="sentiment",
    test_data=sentiment_data['test'],
    model_name=model_name
)

# Store results
all_results['initial_sentiment'] = initial_sentiment_results

print(f"\n✅ Initial Sentiment Model Results:")
print(f"  Accuracy: {initial_sentiment_results['accuracy']:.4f}")
print(f"  F1 Macro: {initial_sentiment_results['f1_macro']:.4f}")

# Clean up memory
aggressive_memory_cleanup()


📍 PHASE 1: INITIAL DISTILROBERTA TRAINING - SENTIMENT MODEL

2️⃣ Training Initial distilroBERTa Sentiment Model...
Starting distilroBERTa single-task training (sentiment)...

📍 Epoch 1/3
  Train Loss: 0.6889, Train Acc: 0.7306
  Val Loss: 0.5309, Val Acc: 0.8127, Val F1: 0.5669
Best distilroBERTa model saved to ./initial_distilroberta_sentiment_model\model_best

📍 Epoch 2/3
  Train Loss: 0.4799, Train Acc: 0.8426
  Val Loss: 0.5704, Val Acc: 0.8273, Val F1: 0.5772
Best distilroBERTa model saved to ./initial_distilroberta_sentiment_model\model_best

📍 Epoch 3/3
  Train Loss: 0.4000, Train Acc: 0.8766
  Val Loss: 0.6107, Val Acc: 0.8327, Val F1: 0.5811
Best distilroBERTa model saved to ./initial_distilroberta_sentiment_model\model_best

distilroBERTa training completed! Best F1: 0.5811

✅ Initial Sentiment Model Results:
  Accuracy: 0.8333
  F1 Macro: 0.5812


In [89]:
# Cell 10: Initial Emotion Model Training
print("\n" + "="*80)
print("📍 PHASE 1: INITIAL DISTILROBERTA TRAINING - EMOTION MODEL")
print("="*80)

# Default configuration for emotion
default_config_emotion = TrainingConfig(
    model_name=model_name,
    batch_size=8,
    learning_rate=2e-5,
    num_epochs=3,
    max_length=128,
    task_type="emotion",
    output_dir="./initial_distilroberta_emotion_model"
)

print("\n3️⃣ Training Initial distilroBERTa Emotion Model...")
print("="*60)

# Train initial emotion model
initial_emotion_trainer = distilroBERTaSingleTaskTrainer(
    config=default_config_emotion,
    num_classes=roberta_model_config.emotion_num_classes
)
initial_emotion_history = initial_emotion_trainer.train(emotion_data)

# Evaluate initial emotion model
initial_emotion_results = evaluate_distilroberta_model(
    model_path="./initial_distilroberta_emotion_model/model_best",
    model_type="emotion",
    test_data=emotion_data['test'],
    model_name=model_name
)
all_results['initial_emotion'] = initial_emotion_results

print(f"\n✅ Initial Emotion Model Results:")
print(f"  Accuracy: {initial_emotion_results['accuracy']:.4f}")
print(f"  F1 Macro: {initial_emotion_results['f1_macro']:.4f}")

# Clean up memory
aggressive_memory_cleanup()


📍 PHASE 1: INITIAL DISTILROBERTA TRAINING - EMOTION MODEL

3️⃣ Training Initial distilroBERTa Emotion Model...
Starting distilroBERTa single-task training (emotion)...

📍 Epoch 1/3
  Train Loss: 1.0058, Train Acc: 0.6143
  Val Loss: 0.7677, Val Acc: 0.7180, Val F1: 0.6652
Best distilroBERTa model saved to ./initial_distilroberta_emotion_model\model_best

📍 Epoch 2/3
  Train Loss: 0.6032, Train Acc: 0.7853
  Val Loss: 0.7826, Val Acc: 0.7413, Val F1: 0.7043
Best distilroBERTa model saved to ./initial_distilroberta_emotion_model\model_best

📍 Epoch 3/3
  Train Loss: 0.4370, Train Acc: 0.8466
  Val Loss: 0.8387, Val Acc: 0.7493, Val F1: 0.7207
Best distilroBERTa model saved to ./initial_distilroberta_emotion_model\model_best

distilroBERTa training completed! Best F1: 0.7207

✅ Initial Emotion Model Results:
  Accuracy: 0.7667
  F1 Macro: 0.7329


In [90]:
# Cell 11: Initial Multitask Model Training
print("\n" + "="*80)
print("📍 PHASE 1: INITIAL DISTILROBERTA TRAINING - MULTITASK MODEL")
print("="*80)

# Default configuration for multitask
default_config_multitask = TrainingConfig(
    model_name=model_name,
    batch_size=8,
    learning_rate=2e-5,
    num_epochs=3,
    max_length=128,
    alpha=0.5,
    task_type="multitask",
    output_dir="./initial_distilroberta_multitask_model"
)

print("\n4️⃣ Training Initial distilroBERTa Multi-task Model...")
print("="*60)

# Train initial multitask model
initial_multitask_trainer = distilroBERTaMultiTaskTrainer(config=default_config_multitask)
initial_multitask_history = initial_multitask_trainer.train(multitask_data)

# Evaluate initial multitask model
initial_multitask_results = evaluate_distilroberta_model(
    model_path="./initial_distilroberta_multitask_model/model_best",
    model_type="multitask",
    test_data=multitask_data['test'],
    model_name=model_name
)
all_results['initial_multitask'] = initial_multitask_results

print(f"\n✅ Initial Multitask Model Results:")
print(f"  Sentiment - Accuracy: {initial_multitask_results['sentiment_accuracy']:.4f}, F1: {initial_multitask_results['sentiment_f1_macro']:.4f}")
print(f"  Emotion - Accuracy: {initial_multitask_results['emotion_accuracy']:.4f}, F1: {initial_multitask_results['emotion_f1_macro']:.4f}")
print(f"  Combined - Accuracy: {initial_multitask_results['combined_accuracy']:.4f}, F1: {initial_multitask_results['combined_f1_macro']:.4f}")

# Clean up memory
aggressive_memory_cleanup()


📍 PHASE 1: INITIAL DISTILROBERTA TRAINING - MULTITASK MODEL

4️⃣ Training Initial distilroBERTa Multi-task Model...
🚀 Starting distilroBERTa multi-task training...

📍 Epoch 1/3
  Train Loss: 1.1999
  Train Sentiment Acc: 0.7303, Train Emotion Acc: 0.2776
  Val Loss: 1.1355
  Val Sentiment Acc: 0.7907, F1: 0.5520
  Val Emotion Acc: 0.2807, F1: 0.1070
Best distilroBERTa model saved to ./initial_distilroberta_multitask_model\model_best

📍 Epoch 2/3
  Train Loss: 1.0823
  Train Sentiment Acc: 0.8460, Train Emotion Acc: 0.2900
  Val Loss: 1.1108
  Val Sentiment Acc: 0.8227, F1: 0.5735
  Val Emotion Acc: 0.2993, F1: 0.0812

📍 Epoch 3/3
  Train Loss: 1.0223
  Train Sentiment Acc: 0.8791, Train Emotion Acc: 0.3040
  Val Loss: 1.1299
  Val Sentiment Acc: 0.8260, F1: 0.5762
  Val Emotion Acc: 0.3007, F1: 0.1022
Best distilroBERTa model saved to ./initial_distilroberta_multitask_model\model_best

distilroBERTa training completed! Best Combined F1: 0.3392

✅ Initial Multitask Model Results:
  Sen

In [91]:
# Cell 13: Initial Results Summary
print("\n" + "="*80)
print("📍 INITIAL ROBERTA RESULTS SUMMARY")
print("="*80)

print(f"\n📊 INITIAL ROBERTA MODEL PERFORMANCE:")
print(f"  Sentiment Model:")
print(f"    Accuracy: {initial_sentiment_results['accuracy']:.4f}")
print(f"    F1 Macro: {initial_sentiment_results['f1_macro']:.4f}")

print(f"\n  Emotion Model:")
print(f"    Accuracy: {initial_emotion_results['accuracy']:.4f}")
print(f"    F1 Macro: {initial_emotion_results['f1_macro']:.4f}")

print(f"\n  Multitask Model:")
print(f"    Sentiment - Accuracy: {initial_multitask_results['sentiment_accuracy']:.4f}, F1: {initial_multitask_results['sentiment_f1_macro']:.4f}")
print(f"    Emotion - Accuracy: {initial_multitask_results['emotion_accuracy']:.4f}, F1: {initial_multitask_results['emotion_f1_macro']:.4f}")
print(f"    Combined - Accuracy: {initial_multitask_results['combined_accuracy']:.4f}, F1: {initial_multitask_results['combined_f1_macro']:.4f}")

# Store results for later comparison
all_results = {
    'initial_sentiment': initial_sentiment_results,
    'initial_emotion': initial_emotion_results,
    'initial_multitask': initial_multitask_results
}

print(f"\n💡 These are RoBERTa baseline results. Now proceeding to hyperparameter tuning!")


📍 INITIAL ROBERTA RESULTS SUMMARY

📊 INITIAL ROBERTA MODEL PERFORMANCE:
  Sentiment Model:
    Accuracy: 0.8333
    F1 Macro: 0.5812

  Emotion Model:
    Accuracy: 0.7667
    F1 Macro: 0.7329

  Multitask Model:
    Sentiment - Accuracy: 0.7812, F1: 0.5376
    Emotion - Accuracy: 0.3750, F1: 0.0930
    Combined - Accuracy: 0.5781, F1: 0.3153

💡 These are RoBERTa baseline results. Now proceeding to hyperparameter tuning!


In [None]:
# Cell 12: Hyperparameter Tuning - Sentiment
print("\n" + "="*80)
print("📍 PHASE 2: HYPERPARAMETER TUNING - SENTIMENT")
print("="*80)

print("\n6️⃣ Hyperparameter Tuning for distilroBERTa Sentiment Model...")
print("="*60)

# Create tuner for sentiment
sentiment_tuner = distilroBERTaHyperparameterTuner(
    model_type="sentiment",
    data_splits=sentiment_data,
    n_trials=15,
    model_name=model_name
)
sentiment_study = sentiment_tuner.tune()

[I 2025-07-22 20:38:22,272] A new study created in memory with name: no-name-e818d2b4-0b02-4659-a212-615b6ac1d60a



📍 PHASE 2: HYPERPARAMETER TUNING - SENTIMENT

6️⃣ Hyperparameter Tuning for distilroBERTa Sentiment Model...
🔍 distilroBERTa hyperparameter tuner initialized for sentiment
🚀 Using Random Search for optimization

🔍 Starting hyperparameter optimization for sentiment...
🎯 Random Search: 15 trials
Starting distilroBERTa single-task training (sentiment)...

📍 Epoch 1/3
  Train Loss: 0.8141, Train Acc: 0.6893
  Val Loss: 0.7534, Val Acc: 0.7900, Val F1: 0.5503
Best distilroBERTa model saved to ./distilroberta_trial_0\model_best

📍 Epoch 2/3
  Train Loss: 0.7254, Train Acc: 0.7973
  Val Loss: 0.8328, Val Acc: 0.8060, Val F1: 0.5622
Best distilroBERTa model saved to ./distilroberta_trial_0\model_best

📍 Epoch 3/3
  Train Loss: 0.6750, Train Acc: 0.8306
  Val Loss: 0.9426, Val Acc: 0.8107, Val F1: 0.5654


[I 2025-07-22 20:46:21,158] Trial 0 finished with value: 0.5654105603692791 and parameters: {'learning_rate': 3.65445235521325e-05, 'batch_size': 4, 'num_epochs': 3, 'warmup_ratio': 0.0733991780504304, 'weight_decay': 0.006750277604651747, 'hidden_dropout_prob': 0.273235229154987, 'classifier_dropout': 0.2803345035229627, 'max_length': 128}. Best is trial 0 with value: 0.5654105603692791.


Best distilroBERTa model saved to ./distilroberta_trial_0\model_best

distilroBERTa training completed! Best F1: 0.5654
Trial 0: F1 = 0.5654
Starting distilroBERTa single-task training (sentiment)...

📍 Epoch 1/3
  Train Loss: 0.8653, Train Acc: 0.6643
  Val Loss: 0.6716, Val Acc: 0.7633, Val F1: 0.5324
Best distilroBERTa model saved to ./distilroberta_trial_1\model_best

📍 Epoch 2/3
  Train Loss: 0.7422, Train Acc: 0.7796
  Val Loss: 0.8249, Val Acc: 0.7767, Val F1: 0.5422
Best distilroBERTa model saved to ./distilroberta_trial_1\model_best

📍 Epoch 3/3
  Train Loss: 0.5975, Train Acc: 0.8407
  Val Loss: 0.8167, Val Acc: 0.8073, Val F1: 0.5631


[I 2025-07-22 21:02:50,917] Trial 1 finished with value: 0.5631400335536304 and parameters: {'learning_rate': 9.527257190058904e-05, 'batch_size': 4, 'num_epochs': 3, 'warmup_ratio': 0.09563633644393066, 'weight_decay': 0.05295088673159155, 'hidden_dropout_prob': 0.18638900372842315, 'classifier_dropout': 0.1873687420594126, 'max_length': 128}. Best is trial 0 with value: 0.5654105603692791.


Best distilroBERTa model saved to ./distilroberta_trial_1\model_best

distilroBERTa training completed! Best F1: 0.5631
Trial 1: F1 = 0.5631
Starting distilroBERTa single-task training (sentiment)...

📍 Epoch 1/3
  Train Loss: 0.7057, Train Acc: 0.7103
  Val Loss: 0.5383, Val Acc: 0.7987, Val F1: 0.5578
Best distilroBERTa model saved to ./distilroberta_trial_2\model_best

📍 Epoch 2/3
  Train Loss: 0.4646, Train Acc: 0.8406
  Val Loss: 0.4998, Val Acc: 0.8187, Val F1: 0.5709
Best distilroBERTa model saved to ./distilroberta_trial_2\model_best

📍 Epoch 3/3
  Train Loss: 0.3676, Train Acc: 0.8771
  Val Loss: 0.5827, Val Acc: 0.8200, Val F1: 0.5721


[I 2025-07-22 21:07:00,292] Trial 2 finished with value: 0.5721370158820985 and parameters: {'learning_rate': 3.2005921956585e-05, 'batch_size': 16, 'num_epochs': 3, 'warmup_ratio': 0.12713516576204176, 'weight_decay': 0.05964904231734221, 'hidden_dropout_prob': 0.10929008254399955, 'classifier_dropout': 0.28226345557043153, 'max_length': 128}. Best is trial 2 with value: 0.5721370158820985.


Best distilroBERTa model saved to ./distilroberta_trial_2\model_best

distilroBERTa training completed! Best F1: 0.5721
Trial 2: F1 = 0.5721
Starting distilroBERTa single-task training (sentiment)...

📍 Epoch 1/3
  Train Loss: 0.8474, Train Acc: 0.6983
  Val Loss: 0.8214, Val Acc: 0.7627, Val F1: 0.5315
Best distilroBERTa model saved to ./distilroberta_trial_3\model_best

📍 Epoch 2/3
  Train Loss: 0.7227, Train Acc: 0.7986
  Val Loss: 0.7043, Val Acc: 0.7940, Val F1: 0.5537
Best distilroBERTa model saved to ./distilroberta_trial_3\model_best

📍 Epoch 3/3
  Train Loss: 0.5448, Train Acc: 0.8627
  Val Loss: 0.8870, Val Acc: 0.8027, Val F1: 0.5597


[I 2025-07-22 21:13:37,185] Trial 3 finished with value: 0.5596709257543712 and parameters: {'learning_rate': 9.210273435223547e-05, 'batch_size': 4, 'num_epochs': 3, 'warmup_ratio': 0.15263495397682356, 'weight_decay': 0.04457509688022053, 'hidden_dropout_prob': 0.12440764696895577, 'classifier_dropout': 0.2485530730333811, 'max_length': 256}. Best is trial 2 with value: 0.5721370158820985.


Best distilroBERTa model saved to ./distilroberta_trial_3\model_best

distilroBERTa training completed! Best F1: 0.5597
Trial 3: F1 = 0.5597
Starting distilroBERTa single-task training (sentiment)...

📍 Epoch 1/5
  Train Loss: 0.8570, Train Acc: 0.6784
  Val Loss: 0.8430, Val Acc: 0.7673, Val F1: 0.5335
Best distilroBERTa model saved to ./distilroberta_trial_4\model_best

📍 Epoch 2/5
  Train Loss: 0.7765, Train Acc: 0.7929
  Val Loss: 0.8037, Val Acc: 0.8160, Val F1: 0.5686
Best distilroBERTa model saved to ./distilroberta_trial_4\model_best

📍 Epoch 3/5
  Train Loss: 0.6889, Train Acc: 0.8307
  Val Loss: 0.9276, Val Acc: 0.8140, Val F1: 0.5682

📍 Epoch 4/5
  Train Loss: 0.6222, Train Acc: 0.8570
  Val Loss: 1.0354, Val Acc: 0.8167, Val F1: 0.5698
Best distilroBERTa model saved to ./distilroberta_trial_4\model_best

📍 Epoch 5/5
  Train Loss: 0.5697, Train Acc: 0.8673
  Val Loss: 0.9957, Val Acc: 0.8193, Val F1: 0.5714


[I 2025-07-22 21:23:37,294] Trial 4 finished with value: 0.5714219534007071 and parameters: {'learning_rate': 3.0332586204364574e-05, 'batch_size': 4, 'num_epochs': 5, 'warmup_ratio': 0.07772816832882906, 'weight_decay': 0.0969888781486913, 'hidden_dropout_prob': 0.2550265646722229, 'classifier_dropout': 0.38184968246925677, 'max_length': 128}. Best is trial 2 with value: 0.5721370158820985.


Best distilroBERTa model saved to ./distilroberta_trial_4\model_best

distilroBERTa training completed! Best F1: 0.5714
Trial 4: F1 = 0.5714
Starting distilroBERTa single-task training (sentiment)...

📍 Epoch 1/4
  Train Loss: 0.8249, Train Acc: 0.6574
  Val Loss: 0.6533, Val Acc: 0.7580, Val F1: 0.5289
Best distilroBERTa model saved to ./distilroberta_trial_5\model_best

📍 Epoch 2/4
  Train Loss: 0.6238, Train Acc: 0.7794
  Val Loss: 0.6078, Val Acc: 0.7887, Val F1: 0.5490
Best distilroBERTa model saved to ./distilroberta_trial_5\model_best

📍 Epoch 3/4
  Train Loss: 0.5112, Train Acc: 0.8299
  Val Loss: 0.6730, Val Acc: 0.7987, Val F1: 0.5573
Best distilroBERTa model saved to ./distilroberta_trial_5\model_best

📍 Epoch 4/4
  Train Loss: 0.4555, Train Acc: 0.8589
  Val Loss: 0.7548, Val Acc: 0.8067, Val F1: 0.5627
Best distilroBERTa model saved to ./distilroberta_trial_5\model_best

distilroBERTa training completed! Best F1: 0.5627
Trial 5: F1 = 0.5627


[I 2025-07-22 22:06:07,586] Trial 5 finished with value: 0.5626823551920409 and parameters: {'learning_rate': 8.818453591655191e-05, 'batch_size': 8, 'num_epochs': 4, 'warmup_ratio': 0.10830159345342232, 'weight_decay': 0.0278635541456157, 'hidden_dropout_prob': 0.26574750183038587, 'classifier_dropout': 0.2070259980080768, 'max_length': 256}. Best is trial 2 with value: 0.5721370158820985.


Starting distilroBERTa single-task training (sentiment)...

📍 Epoch 1/6
  Train Loss: 0.8409, Train Acc: 0.6341
  Val Loss: 0.5422, Val Acc: 0.8093, Val F1: 0.5643
Best distilroBERTa model saved to ./distilroberta_trial_6\model_best

📍 Epoch 2/6
  Train Loss: 0.6012, Train Acc: 0.7794
  Val Loss: 0.5183, Val Acc: 0.8100, Val F1: 0.5650
Best distilroBERTa model saved to ./distilroberta_trial_6\model_best

📍 Epoch 3/6
  Train Loss: 0.5223, Train Acc: 0.8140
  Val Loss: 0.5563, Val Acc: 0.8027, Val F1: 0.5606

📍 Epoch 4/6
  Train Loss: 0.4759, Train Acc: 0.8349
  Val Loss: 0.5554, Val Acc: 0.8213, Val F1: 0.5729
Best distilroBERTa model saved to ./distilroberta_trial_6\model_best

📍 Epoch 5/6


In [None]:
# Cell 13: Hyperparameter Tuning - Emotion
print("\n" + "="*80)
print("📍 PHASE 2: HYPERPARAMETER TUNING - EMOTION")
print("="*80)

print("\n7️⃣ Hyperparameter Tuning for distilroBERTa Emotion Model...")
print("="*60)

# Create tuner for emotion
emotion_tuner = distilroBERTaHyperparameterTuner(
    model_type="emotion",
    data_splits=emotion_data,
    n_trials=15,
    model_name=model_name
)
emotion_study = emotion_tuner.tune()

In [None]:
# Cell 14: Hyperparameter Tuning - Multitask
print("\n" + "="*80)
print("📍 PHASE 2: HYPERPARAMETER TUNING - MULTITASK")
print("="*80)

print("\n8️⃣ Hyperparameter Tuning for distilroBERTa Multi-task Model...")
print("="*60)

# Create tuner for multitask
multitask_tuner = distilroBERTaHyperparameterTuner(
    model_type="multitask",
    data_splits=multitask_data,
    n_trials=15,
    model_name=model_name
)
multitask_study = multitask_tuner.tune()

In [None]:
# Cell 15: Final Training - Sentiment with Best Parameters
print("\n" + "="*80)
print("📍 PHASE 3: FINAL TRAINING - OPTIMIZED SENTIMENT MODEL")
print("="*80)

print("\n9️⃣ Training Final distilroBERTa Sentiment Model with Best Parameters...")
print("="*60)

# Get best parameters from sentiment tuning
best_sentiment_params = sentiment_study.best_params
print(f"🎯 Using best hyperparameters:")
for key, value in best_sentiment_params.items():
    print(f"  {key}: {value}")

# Create optimized config for final training
final_sentiment_config = TrainingConfig(
    model_name=model_name,
    learning_rate=best_sentiment_params['learning_rate'],
    batch_size=best_sentiment_params['batch_size'],
    num_epochs=5,  # Increase epochs for final training
    warmup_ratio=best_sentiment_params['warmup_ratio'],
    weight_decay=best_sentiment_params['weight_decay'],
    hidden_dropout_prob=best_sentiment_params['hidden_dropout_prob'],
    classifier_dropout=best_sentiment_params['classifier_dropout'],
    max_length=best_sentiment_params.get('max_length', 128),
    task_type="sentiment",
    output_dir="./final_distilroberta_sentiment_model"
)

# Train final sentiment model
final_sentiment_trainer = distilroBERTaSingleTaskTrainer(
    config=final_sentiment_config,
    num_classes=roberta_model_config.sentiment_num_classes
)
final_sentiment_history = final_sentiment_trainer.train(sentiment_data)

# Evaluate final sentiment model
final_sentiment_results = evaluate_distilroberta_model(
    model_path="./final_distilroberta_sentiment_model/model_best",
    model_type="sentiment",
    test_data=sentiment_data['test'],
    model_name=model_name
)
all_results['final_sentiment'] = final_sentiment_results

print(f"\n✅ Final Sentiment Model Results:")
print(f"  Accuracy: {final_sentiment_results['accuracy']:.4f}")
print(f"  F1 Macro: {final_sentiment_results['f1_macro']:.4f}")

# Clean up memory
aggressive_memory_cleanup()

In [None]:
# Cell 16: Final Emotion Model Training with Best Parameters
print("\n" + "="*80)
print("📍 PHASE 3: FINAL TRAINING - OPTIMIZED EMOTION MODEL")
print("="*80)

print("\n🔟 Training Final distilroBERTa Emotion Model with Best Parameters...")
print("="*60)

# Get best parameters from emotion tuning
best_emotion_params = emotion_study.best_params
print(f"🎯 Using best hyperparameters:")
for key, value in best_emotion_params.items():
    print(f"  {key}: {value}")

# Create optimized config for final training
final_emotion_config = TrainingConfig(
    model_name=model_name,
    learning_rate=best_emotion_params['learning_rate'],
    batch_size=best_emotion_params['batch_size'],
    num_epochs=5,  # Increase epochs for final training
    warmup_ratio=best_emotion_params['warmup_ratio'],
    weight_decay=best_emotion_params['weight_decay'],
    hidden_dropout_prob=best_emotion_params['hidden_dropout_prob'],
    classifier_dropout=best_emotion_params['classifier_dropout'],
    max_length=best_emotion_params.get('max_length', 128),
    task_type="emotion",
    output_dir="./final_distilroberta_emotion_model"
)

print(f"\n🚀 Training final emotion model:")
print(f"  Dataset: Full emotion data ({len(emotion_data['train']['texts'])} train samples)")
print(f"  Epochs: {final_emotion_config.num_epochs}")
print(f"  Batch size: {final_emotion_config.batch_size}")
print(f"  Learning rate: {final_emotion_config.learning_rate:.2e}")

# Train final emotion model
final_emotion_trainer = distilroBERTaSingleTaskTrainer(
    config=final_emotion_config,
    num_classes=roberta_model_config.emotion_num_classes
)
final_emotion_history = final_emotion_trainer.train(emotion_data)

# Evaluate final emotion model
final_emotion_results = evaluate_distilroberta_model(
    model_path="./final_distilroberta_emotion_model/model_best",
    model_type="emotion",
    test_data=emotion_data['test'],
    model_name=model_name
)
all_results['final_emotion'] = final_emotion_results

print(f"\n✅ Final Emotion Model Results:")
print(f"  Accuracy: {final_emotion_results['accuracy']:.4f}")
print(f"  F1 Macro: {final_emotion_results['f1_macro']:.4f}")

# Clean up memory
aggressive_memory_cleanup()

In [None]:
# Cell 17: Final Multitask Model Training with Best Parameters
print("\n" + "="*80)
print("📍 PHASE 3: FINAL TRAINING - OPTIMIZED MULTITASK MODEL")
print("="*80)

print("\n1️⃣1️⃣ Training Final distilroBERTa Multi-task Model with Best Parameters...")
print("="*60)

# Get best parameters from multitask tuning
best_multitask_params = multitask_study.best_params
print(f"🎯 Using best hyperparameters:")
for key, value in best_multitask_params.items():
    print(f"  {key}: {value}")

# Create optimized config for final training
final_multitask_config = TrainingConfig(
    model_name=model_name,
    learning_rate=best_multitask_params['learning_rate'],
    batch_size=best_multitask_params['batch_size'],
    num_epochs=5,  # Increase epochs for final training
    warmup_ratio=best_multitask_params['warmup_ratio'],
    weight_decay=best_multitask_params['weight_decay'],
    hidden_dropout_prob=best_multitask_params['hidden_dropout_prob'],
    classifier_dropout=best_multitask_params['classifier_dropout'],
    max_length=best_multitask_params.get('max_length', 128),
    alpha=best_multitask_params['alpha'],  # Multitask-specific parameter
    task_type="multitask",
    output_dir="./final_distilroberta_multitask_model"
)

print(f"\n🚀 Training final multitask model:")
print(f"  Dataset: Full multitask data ({len(multitask_data['train']['texts'])} train samples)")
print(f"  Epochs: {final_multitask_config.num_epochs}")
print(f"  Batch size: {final_multitask_config.batch_size}")
print(f"  Learning rate: {final_multitask_config.learning_rate:.2e}")
print(f"  Alpha (loss weighting): {final_multitask_config.alpha:.3f}")

# Train final multitask model
final_multitask_trainer = distilroBERTaMultiTaskTrainer(config=final_multitask_config)
final_multitask_history = final_multitask_trainer.train(multitask_data)

# Evaluate final multitask model
final_multitask_results = evaluate_distilroberta_model(
    model_path="./final_distilroberta_multitask_model/model_best",
    model_type="multitask",
    test_data=multitask_data['test'],
    model_name=model_name
)
all_results['final_multitask'] = final_multitask_results

print(f"\n✅ Final Multitask Model Results:")
print(f"  Sentiment - Accuracy: {final_multitask_results['sentiment_accuracy']:.4f}, F1: {final_multitask_results['sentiment_f1_macro']:.4f}")
print(f"  Emotion - Accuracy: {final_multitask_results['emotion_accuracy']:.4f}, F1: {final_emotion_results['f1_macro']:.4f}")
print(f"  Combined - Accuracy: {final_multitask_results['combined_accuracy']:.4f}, F1: {final_multitask_results['combined_f1_macro']:.4f}")

# Clean up memory
aggressive_memory_cleanup()

In [None]:
# Cell 18: Final Results Summary
print("\n" + "="*80)
print("FINAL RESULTS SUMMARY")
print("="*80)

print(f"\nMODEL PERFORMANCE COMPARISON:")

print(f"\n1️⃣ Sentiment Task:")
print(f"  Initial Model:")
print(f"    Accuracy: {all_results['initial_sentiment']['accuracy']:.4f}")
print(f"    F1 Macro: {all_results['initial_sentiment']['f1_macro']:.4f}")
print(f"  Optimized Model:")
print(f"    Accuracy: {all_results['final_sentiment']['accuracy']:.4f}")
print(f"    F1 Macro: {all_results['final_sentiment']['f1_macro']:.4f}")
print(f"  Improvement:")
print(f"    Accuracy: {(all_results['final_sentiment']['accuracy'] - all_results['initial_sentiment']['accuracy'])*100:.2f}%")
print(f"    F1 Macro: {(all_results['final_sentiment']['f1_macro'] - all_results['initial_sentiment']['f1_macro'])*100:.2f}%")

print(f"\n2️⃣ Emotion Task:")
print(f"  Initial Model:")
print(f"    Accuracy: {all_results['initial_emotion']['accuracy']:.4f}")
print(f"    F1 Macro: {all_results['initial_emotion']['f1_macro']:.4f}")
print(f"  Optimized Model:")
print(f"    Accuracy: {all_results['final_emotion']['accuracy']:.4f}")
print(f"    F1 Macro: {all_results['final_emotion']['f1_macro']:.4f}")
print(f"  Improvement:")
print(f"    Accuracy: {(all_results['final_emotion']['accuracy'] - all_results['initial_emotion']['accuracy'])*100:.2f}%")
print(f"    F1 Macro: {(all_results['final_emotion']['f1_macro'] - all_results['initial_emotion']['f1_macro'])*100:.2f}%")

print(f"\n3️⃣ Multi-task Model:")
print(f"  Initial Model:")
print(f"    Sentiment - Accuracy: {all_results['initial_multitask']['sentiment_accuracy']:.4f}, F1: {all_results['initial_multitask']['sentiment_f1_macro']:.4f}")
print(f"    Emotion - Accuracy: {all_results['initial_multitask']['emotion_accuracy']:.4f}, F1: {all_results['initial_multitask']['emotion_f1_macro']:.4f}")
print(f"  Optimized Model:")
print(f"    Sentiment - Accuracy: {all_results['final_multitask']['sentiment_accuracy']:.4f}, F1: {all_results['final_multitask']['sentiment_f1_macro']:.4f}")
print(f"    Emotion - Accuracy: {all_results['final_multitask']['emotion_accuracy']:.4f}, F1: {all_results['final_multitask']['emotion_f1_macro']:.4f}")
print(f"  Improvement:")
print(f"    Sentiment - Accuracy: {(all_results['final_multitask']['sentiment_accuracy'] - all_results['initial_multitask']['sentiment_accuracy'])*100:.2f}%")
print(f"    Emotion - Accuracy: {(all_results['final_multitask']['emotion_accuracy'] - all_results['initial_multitask']['emotion_accuracy'])*100:.2f}%")

print("\nTraining pipeline completed!")