# 🚀 EmoWOZ Final Implementation: Days 5-7
## M4 DialoGPT + Comprehensive Evaluation + Final Package

**Project**: One-Turn-Ahead Frustration Forecasting in Task-Oriented Dialogs  
**Current Status**: M3 RoBERTa-GRU BREAKTHROUGH (Macro-F1: 0.7408, Latency: 11.57ms) ✅  
**Next Goals**: 
- **Day 5**: M4 DialoGPT (target: beat M3's 0.7408)
- **Day 6**: Cross-model evaluation and benchmarking
- **Day 7**: Final documentation and package

---

## 📋 Implementation Plan

### **Day 5: M4 DialoGPT Fine-tuned Model**
- DialoGPT-small fine-tuning
- Longer context (5 turns, max_length=1024)
- Last token representation
- Target: Macro-F1 > 0.7408

### **Day 6: Comprehensive Evaluation**
- eval.py script for all models (M1-M4)
- Statistical significance testing
- Cross-model latency benchmarking
- Error analysis

### **Day 7: Final Package**
- Complete documentation
- Model comparison report
- Reproducibility guide
- Benchmark package


In [14]:
# Setup and imports for the M4 model and the Day 5-7 evaluation work
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer, RobertaModel, RobertaTokenizer, BertModel, BertTokenizer
import json
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, f1_score, roc_auc_score, accuracy_score
from scipy import stats
from tqdm import tqdm
import time
import os
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
warnings.filterwarnings('ignore')

# Pick the compute device once and report what is available.
cuda_ready = torch.cuda.is_available()
device = torch.device('cuda') if cuda_ready else torch.device('cpu')
print(f"Using device: {device}")
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    # Total memory is reported by the driver in bytes; show it in GiB.
    gpu_memory_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {gpu_memory_gb:.1f} GB")


Using device: cuda
CUDA available: True
GPU: NVIDIA H100 PCIe
GPU Memory: 79.1 GB


# 📊 DAY 5: M4 DialoGPT Implementation

## 🎯 M4 Objectives
- **Target**: Beat M3's Macro-F1 (0.7408)
- **Innovation**: Longer context (5 turns vs 3 turns)
- **Architecture**: DialoGPT-small + classification head
- **Context**: max_length=1024 (vs M3's 512)
- **Representation**: Last token (vs pooler output)


In [15]:
# Hyper-parameters for the M4 DialoGPT run (key order is the print order).
M4_CONFIG = dict(
    model_name='microsoft/DialoGPT-small',
    max_length=1024,          # longer context than M3 (512)
    context_window=5,         # more turns than M3 (3)
    dropout=0.1,
    batch_size=8,             # smaller because sequences are longer
    learning_rate=1e-5,       # lower LR for fine-tuning
    epochs=5,                 # more epochs for convergence
    weight_decay=0.01,
    class_weight_ratio=13.7,
    patience=3,
)

print("M4 DialoGPT Configuration:")
for setting in M4_CONFIG:
    print(f"  {setting}: {M4_CONFIG[setting]}")

# Side-by-side summary of what changed relative to the M3 run.
m3_differences = (
    f"  Context Window: {M4_CONFIG['context_window']} vs 3 turns",
    f"  Max Length: {M4_CONFIG['max_length']} vs 512 tokens",
    f"  Batch Size: {M4_CONFIG['batch_size']} vs 16 (memory optimization)",
    f"  Learning Rate: {M4_CONFIG['learning_rate']} vs 2e-5",
)
print("\n📊 M4 vs M3 Differences:")
print("\n".join(m3_differences))


M4 DialoGPT Configuration:
  model_name: microsoft/DialoGPT-small
  max_length: 1024
  context_window: 5
  dropout: 0.1
  batch_size: 8
  learning_rate: 1e-05
  epochs: 5
  weight_decay: 0.01
  class_weight_ratio: 13.7
  patience: 3

📊 M4 vs M3 Differences:
  Context Window: 5 vs 3 turns
  Max Length: 1024 vs 512 tokens
  Batch Size: 8 vs 16 (memory optimization)
  Learning Rate: 1e-05 vs 2e-5


In [17]:
# M4 DialoGPT Model Architecture
class DialoGPTClassifier(nn.Module):
    """Binary classifier on top of the DialoGPT transformer stack.

    The token sequence is encoded by the ``transformer`` sub-module of a
    pretrained ``GPT2LMHeadModel``. The hidden state of the last attended
    token of each sequence goes through dropout and a single linear unit,
    giving one raw (pre-sigmoid) logit per sequence.

    Args:
        config: Mapping with ``'model_name'`` (checkpoint handed to
            ``from_pretrained``) and ``'dropout'`` (dropout probability).
            The optional key ``'freeze_lm_head'`` (default ``True``) controls
            whether the language-modelling head is frozen.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config

        # DialoGPT for dialogue understanding. The LM-head wrapper is kept so
        # parameters stay registered under 'gpt.transformer' / 'gpt.lm_head'.
        self.gpt = GPT2LMHeadModel.from_pretrained(config['model_name'])
        self.hidden_size = self.gpt.config.hidden_size

        # forward() never calls the LM head, so its weights are frozen.
        # NOTE(review): GPT-2 checkpoints normally share lm_head.weight with
        # the input token-embedding matrix; when they do, this loop freezes
        # the token embeddings as well. Pass freeze_lm_head=False in the
        # config to keep them trainable - confirm which is intended.
        if config.get('freeze_lm_head', True):
            for param in self.gpt.lm_head.parameters():
                param.requires_grad = False

        # Classification head: hidden vector -> single logit.
        self.dropout = nn.Dropout(config['dropout'])
        self.classifier = nn.Linear(self.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        """Return one logit per sequence.

        Args:
            input_ids: Token ids, indexed as (batch, sequence).
            attention_mask: Same layout as ``input_ids``; non-zero marks
                attended tokens. Any numeric or boolean dtype is accepted.

        Returns:
            Tensor of shape (batch, 1) with raw logits.
        """
        outputs = self.gpt.transformer(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state

        # Position of the last attended token = (number of attended tokens) - 1.
        # This matches right-padded batches; the cast to long lets float or
        # bool masks be used as an index.
        last_token_indices = attention_mask.sum(dim=1).long() - 1
        last_token_indices = last_token_indices.to(hidden_states.device)

        # One vectorised gather instead of a Python loop over the batch.
        batch_indices = torch.arange(hidden_states.shape[0], device=hidden_states.device)
        last_hidden = hidden_states[batch_indices, last_token_indices]

        # Classification
        return self.classifier(self.dropout(last_hidden))

# Announce the classifier definition and list what distinguishes M4.
print("M4 DialoGPT Classifier architecture defined")
print("Key innovations:")
for innovation in (
    "Uses DialoGPT transformer (dialogue-specific pre-training)",
    "Last token representation (captures full dialogue context)",
    "Longer context window (5 turns vs M3's 3)",
    "Larger sequence length (1024 vs M3's 512)",
):
    print(f"  - {innovation}")


M4 DialoGPT Classifier architecture defined
Key innovations:
  - Uses DialoGPT transformer (dialogue-specific pre-training)
  - Last token representation (captures full dialogue context)
  - Longer context window (5 turns vs M3's 3)
  - Larger sequence length (1024 vs M3's 512)


In [18]:
# M4 Dataset class for longer context
class EmoWOZDialoGPTDataset(Dataset):
    """JSONL-backed dataset that renders dialogue context as DialoGPT input.

    Every non-blank line of the file is parsed as one JSON object; lines that
    are not valid JSON are counted and skipped. Each item is read through the
    keys ``'context'`` (a string with ``[USER]`` / ``[SYSTEM]`` speaker tags),
    ``'label'`` and, optionally, ``'text'`` (the current turn).

    Args:
        data_path: Path of the JSONL file.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')``; its
            result must provide ``'input_ids'`` and ``'attention_mask'``.
        config: Mapping with ``'max_length'`` and ``'context_window'``.
    """

    def __init__(self, data_path, tokenizer, config):
        self.tokenizer = tokenizer
        self.max_length = config['max_length']
        self.context_window = config['context_window']

        # Load data, tolerating (but counting) malformed lines.
        self.data = []
        skipped_lines = 0

        # Explicit UTF-8 so loading does not depend on the platform default.
        with open(data_path, 'r', encoding='utf-8') as f:
            for raw_line in f:
                line = raw_line.strip()
                if not line:
                    continue
                try:
                    self.data.append(json.loads(line))
                except json.JSONDecodeError:
                    skipped_lines += 1

        print(f"M4 Dataset: Loaded {len(self.data)} samples from {data_path}")
        if skipped_lines > 0:
            print(f"Skipped {skipped_lines} invalid lines")

    def __len__(self):
        return len(self.data)

    def parse_context_string(self, context_str):
        """Split a tagged context string into speaker turns.

        Args:
            context_str: Text in which each turn is introduced by ``[USER]``
                or ``[SYSTEM]``.

        Returns:
            List of ``{'speaker': ..., 'text': ...}`` dicts in order of
            appearance. Turns whose text is empty after stripping are
            dropped, and any text before the first tag is ignored.
        """
        import re

        # With a capturing group, re.split yields
        # [prefix, speaker_1, text_1, speaker_2, text_2, ...].
        pieces = re.split(r'\[(USER|SYSTEM)\]', context_str)
        turns = []
        for speaker, raw_text in zip(pieces[1::2], pieces[2::2]):
            text = raw_text.strip()
            if text:
                turns.append({'speaker': speaker, 'text': text})
        return turns

    def __getitem__(self, idx):
        """Return the tokenized dialogue and float label for sample ``idx``.

        Returns:
            Dict with ``'input_ids'`` and ``'attention_mask'`` (the tokenizer
            outputs with their leading dimension squeezed away) and
            ``'label'`` (a float tensor built from the item's ``'label'``).
        """
        item = self.data[idx]
        context_turns = self.parse_context_string(item['context'])

        # Keep only the most recent turns. A window of 0 (or less) keeps no
        # context; slicing with [-0:] would silently keep every turn.
        if self.context_window > 0:
            context_turns = context_turns[-self.context_window:]
        else:
            context_turns = []

        # Render each turn as "User: ..." / "System: ...".
        utterances = [
            f"{'User' if turn['speaker'] == 'USER' else 'System'}: {turn['text']}"
            for turn in context_turns
        ]

        # The current turn, when present, is appended as a user utterance.
        current_text = item.get('text', '')
        if current_text:
            utterances.append(f"User: {current_text}")

        dialogue_text = " ".join(utterances).strip()

        # NOTE(review): truncation follows the tokenizer's own truncation
        # side; if that is the right side, an over-long dialogue loses its
        # most recent turn first - confirm this is acceptable.
        encoded = self.tokenizer(
            dialogue_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        label = torch.tensor(item['label'], dtype=torch.float)

        return {
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'label': label
        }

# Announce the dataset definition together with the settings it will use.
print("M4 DialoGPT Dataset class defined")
print("Key features:")
dataset_features = [
    f"Longer context window: {M4_CONFIG['context_window']} turns",
    f"Larger max length: {M4_CONFIG['max_length']} tokens",
    "Dialogue-style formatting for DialoGPT",
    "User/System speaker formatting",
]
for feature in dataset_features:
    print(f"  - {feature}")


M4 DialoGPT Dataset class defined
Key features:
  - Longer context window: 5 turns
  - Larger max length: 1024 tokens
  - Dialogue-style formatting for DialoGPT
  - User/System speaker formatting


In [19]:
# Step 2: M4 Data Loading & Training Setup

# Tokenizer that matches the DialoGPT checkpoint.
print("Loading DialoGPT tokenizer...")
tokenizer = GPT2Tokenizer.from_pretrained(M4_CONFIG['model_name'])

# Fixed-length padding needs a pad token; fall back to EOS when none is set.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print(f"Tokenizer loaded: {M4_CONFIG['model_name']}")
print(f"Vocab size: {len(tokenizer)}")
print(f"Pad token: {tokenizer.pad_token}")

# One dataset per split, built in train / val / test order.
train_dataset, val_dataset, test_dataset = (
    EmoWOZDialoGPTDataset(f'../data/{split}.jsonl', tokenizer, M4_CONFIG)
    for split in ('train', 'val', 'test')
)

# Only the training loader shuffles; evaluation order stays fixed.
m4_batch_size = M4_CONFIG['batch_size']
train_loader = DataLoader(train_dataset, batch_size=m4_batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=m4_batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=m4_batch_size, shuffle=False)

print("\n📊 M4 Data Loading Complete:")
print(f"Train batches: {len(train_loader)} (batch_size={m4_batch_size})")
print(f"Validation batches: {len(val_loader)}")
print(f"Test batches: {len(test_loader)}")
print(f"Total samples: {len(train_dataset)} train, {len(val_dataset)} val, {len(test_dataset)} test")


Using pad_token, but it is not set yet.


Loading DialoGPT tokenizer...
Tokenizer loaded: microsoft/DialoGPT-small
Vocab size: 50257
Pad token: <|endoftext|>
M4 Dataset: Loaded 25738 samples from ../data/train.jsonl
Skipped 1 invalid lines
M4 Dataset: Loaded 7409 samples from ../data/val.jsonl
M4 Dataset: Loaded 7534 samples from ../data/test.jsonl

📊 M4 Data Loading Complete:
Train batches: 3218 (batch_size=8)
Validation batches: 927
Test batches: 942
Total samples: 25738 train, 7409 val, 7534 test


In [20]:
# Initialize M4 Model and Training Components
print("Initializing M4 DialoGPT model...")

# Initialize model
model = DialoGPTClassifier(M4_CONFIG).to(device)

# Weighted binary cross-entropy on raw logits; the positive class is scaled
# by class_weight_ratio.
pos_weight = torch.tensor(M4_CONFIG['class_weight_ratio']).to(device)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

# Optimizer (lower learning rate for fine-tuning)
optimizer = optim.AdamW(model.parameters(), lr=M4_CONFIG['learning_rate'], weight_decay=M4_CONFIG['weight_decay'])

# Model information
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
frozen_params = total_params - trainable_params

# When the LM head shares its weight tensor with the input embeddings,
# freezing the head freezes the embeddings too; label the count accordingly
# instead of always calling it "LM head".
lm_head_is_tied = model.gpt.lm_head.weight is model.gpt.transformer.wte.weight
frozen_label = "LM head, shared with token embeddings" if lm_head_is_tied else "LM head"

print("\n🏗️ M4 Model Information:")
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
print(f"Frozen parameters: {frozen_params:,} ({frozen_label})")
# Size estimate assumes 4 bytes per parameter.
print(f"Model size: {total_params * 4 / 1024**2:.1f} MB")
print(f"Hidden size: {model.hidden_size}")

# Compare with M3
print("\n📊 M4 vs M3 Model Comparison:")
print(f"M4 Parameters: {total_params:,}")
print("M3 Parameters: ~125M (RoBERTa + GRU)")
print(f"M4 Context: {M4_CONFIG['context_window']} turns, {M4_CONFIG['max_length']} tokens")
print("M3 Context: 3 turns, 512 tokens")


Initializing M4 DialoGPT model...



🏗️ M4 Model Information:
Total parameters: 124,440,577
Trainable parameters: 85,843,201
Frozen parameters: 38,597,376 (LM head)
Model size: 474.7 MB
Hidden size: 768

📊 M4 vs M3 Model Comparison:
M4 Parameters: 124,440,577
M3 Parameters: ~125M (RoBERTa + GRU)
M4 Context: 5 turns, 1024 tokens
M3 Context: 3 turns, 512 tokens


In [10]:
# M4 Training and Evaluation Functions
def train_epoch_m4(model, train_loader, criterion, optimizer, device):
    """Run one optimisation pass over ``train_loader``.

    Each batch is a mapping with ``'input_ids'``, ``'attention_mask'`` and
    ``'label'``; all three are moved to ``device`` before the forward pass.

    Returns:
        The sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    batch_losses = []

    progress = tqdm(train_loader, desc="M4 Training")
    for batch in progress:
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        targets = batch['label'].to(device)

        optimizer.zero_grad()

        # The model emits a trailing singleton dimension; drop it so the
        # logits line up with the label vector.
        logits = model(ids, mask).squeeze(-1)
        batch_loss = criterion(logits, targets)

        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())

    return sum(batch_losses) / len(train_loader)

def evaluate_m4(model, eval_loader, criterion, device, threshold=0.5):
    """Evaluate the M4 model on ``eval_loader`` without gradient tracking.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns logits with a trailing singleton dimension.
        eval_loader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` entries.
        criterion: Loss applied to ``(logits, labels)``.
        device: Device the batch tensors are moved to.
        threshold: Probability above which a sample is predicted positive.
            Defaults to 0.5, the value that was previously hard-coded.

    Returns:
        Tuple ``(avg_loss, macro_f1, auc, accuracy, labels, probabilities,
        binary_predictions)``; the last three are NumPy arrays in loader
        order.
    """
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(eval_loader, desc="M4 Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask).squeeze(-1)
            total_loss += criterion(outputs, labels).item()

            # Sigmoid turns the logits into probabilities of the positive class.
            all_preds.extend(torch.sigmoid(outputs).cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Calculate metrics
    avg_loss = total_loss / len(eval_loader)
    probabilities = np.array(all_preds)
    preds_binary = (probabilities > threshold).astype(int)

    macro_f1 = f1_score(all_labels, preds_binary, average='macro')
    # AUC is computed from the probabilities, so it does not depend on threshold.
    auc = roc_auc_score(all_labels, all_preds)
    accuracy = accuracy_score(all_labels, preds_binary)

    return avg_loss, macro_f1, auc, accuracy, np.array(all_labels), probabilities, preds_binary

# Announce that the training / evaluation helpers are available.
print("M4 training and evaluation functions defined")
print("Key features:")
for feature in (
    "Optimized for DialoGPT longer sequences",
    "Handles last token representation",
    "Memory-efficient batch processing",
):
    print(f"  - {feature}")


M4 training and evaluation functions defined
Key features:
  - Optimized for DialoGPT longer sequences
  - Handles last token representation
  - Memory-efficient batch processing


In [12]:
# M4 Training Loop
# The M3 baseline and the checkpoint location are each defined once here and
# reused below, instead of being repeated as literals.
m3_target = 0.7408
m4_checkpoint_dir = '../checkpoints/M4_dialogpt'
m4_checkpoint_path = f'{m4_checkpoint_dir}/best_model.pt'

print("🚀 Starting M4 DialoGPT Training...")
print("=" * 60)
print(f"Target: Beat M3's Macro-F1 of {m3_target}")
print(f"Configuration: {M4_CONFIG['epochs']} epochs, batch_size={M4_CONFIG['batch_size']}, lr={M4_CONFIG['learning_rate']}")
print("=" * 60)

best_macro_f1 = 0
patience_counter = 0
m4_training_history = []

start_time = time.time()

for epoch in range(M4_CONFIG['epochs']):
    print(f"\n🔄 Epoch {epoch + 1}/{M4_CONFIG['epochs']}")

    # Training
    train_loss = train_epoch_m4(model, train_loader, criterion, optimizer, device)

    # Validation
    val_loss, val_macro_f1, val_auc, val_accuracy, _, _, _ = evaluate_m4(model, val_loader, criterion, device)

    # Log results
    print(f"Train Loss: {train_loss:.4f}")
    print(f"Val Loss: {val_loss:.4f}")
    print(f"Val Macro-F1: {val_macro_f1:.4f}")
    print(f"Val Accuracy: {val_accuracy:.4f}")
    print(f"Val AUC: {val_auc:.4f}")

    # Save training history
    m4_training_history.append({
        'epoch': epoch + 1,
        'train_loss': train_loss,
        'val_loss': val_loss,
        'val_macro_f1': val_macro_f1,
        'val_accuracy': val_accuracy,
        'val_auc': val_auc
    })

    # Model selection is driven by validation Macro-F1: a strictly better
    # score saves a checkpoint and resets the patience counter.
    if val_macro_f1 > best_macro_f1:
        best_macro_f1 = val_macro_f1
        patience_counter = 0
        # Save best model
        os.makedirs(m4_checkpoint_dir, exist_ok=True)
        torch.save(model.state_dict(), m4_checkpoint_path)
        print(f"✅ New best M4 model saved! Macro-F1: {best_macro_f1:.4f}")

        # Check if we beat M3
        if val_macro_f1 > m3_target:
            print(f"🎉 M4 BEATS M3! {val_macro_f1:.4f} > {m3_target}")
    else:
        patience_counter += 1
        print(f"⏸️ No improvement. Patience: {patience_counter}/{M4_CONFIG['patience']}")

    # Early stopping
    if patience_counter >= M4_CONFIG['patience']:
        print(f"⏹️ Early stopping triggered after {epoch + 1} epochs")
        break

training_time = time.time() - start_time
print(f"\n🎯 M4 Training completed in {training_time:.1f} seconds ({training_time/60:.1f} minutes)")
print(f"Best validation Macro-F1: {best_macro_f1:.4f}")

# Compare with M3
if best_macro_f1 > m3_target:
    improvement = best_macro_f1 - m3_target
    print(f"🏆 SUCCESS: M4 beats M3 by +{improvement:.4f} Macro-F1 ({improvement/m3_target*100:.2f}%)")
else:
    deficit = m3_target - best_macro_f1
    print(f"📊 M4 Result: {best_macro_f1:.4f} vs M3's {m3_target} (deficit: -{deficit:.4f})")


🚀 Starting M4 DialoGPT Training...
Target: Beat M3's Macro-F1 of 0.7408
Configuration: 5 epochs, batch_size=8, lr=1e-05

🔄 Epoch 1/5


M4 Training: 100%|██████████| 3218/3218 [15:04<00:00,  3.56it/s]
M4 Evaluating: 100%|██████████| 927/927 [01:35<00:00,  9.67it/s]


Train Loss: 1.6143
Val Loss: 0.7718
Val Macro-F1: 0.6841
Val Accuracy: 0.8757
Val AUC: 0.8807
✅ New best M4 model saved! Macro-F1: 0.6841

🔄 Epoch 2/5


M4 Training: 100%|██████████| 3218/3218 [15:05<00:00,  3.55it/s]
M4 Evaluating: 100%|██████████| 927/927 [01:35<00:00,  9.67it/s]


Train Loss: 0.9114
Val Loss: 0.7063
Val Macro-F1: 0.7011
Val Accuracy: 0.8807
Val AUC: 0.8996
✅ New best M4 model saved! Macro-F1: 0.7011

🔄 Epoch 3/5


M4 Training:  94%|█████████▍| 3020/3218 [14:09<00:55,  3.55it/s]

In [None]:
# M4 Final Test Evaluation & Latency Benchmarking

# Load best M4 model
print("Loading best M4 model for final evaluation...")
# map_location keeps the load working when the checkpoint was written on a
# different device than the one in use now.
model.load_state_dict(torch.load('../checkpoints/M4_dialogpt/best_model.pt', map_location=device, weights_only=True))
print("✅ Best M4 model loaded")

# Final test evaluation
print("\n📊 M4 FINAL TEST EVALUATION")
print("=" * 50)

test_loss, test_macro_f1, test_auc, test_accuracy, test_labels, test_probs, test_preds = evaluate_m4(model, test_loader, criterion, device)

print(f"Test Loss: {test_loss:.4f}")
print(f"Test Macro-F1: {test_macro_f1:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test AUC: {test_auc:.4f}")

# Detailed classification report
print("\nDetailed Classification Report:")
target_names = ['Not Frustrated', 'Will Be Frustrated']
print(classification_report(test_labels, test_preds, target_names=target_names, digits=4))

# M4 Latency Benchmarking
print("\n⚡ M4 LATENCY BENCHMARKING")
print("=" * 40)

model.eval()
latencies = []

# CUDA work is queued asynchronously: without draining the device before each
# timestamp, the host clock can stop before the forward pass has finished.
sync_cuda = device.type == 'cuda'

# Warm-up
print("Warming up M4 model...")
with torch.no_grad():
    for i, batch in enumerate(test_loader):
        if i >= 3:  # Fewer warm-up batches due to longer sequences
            break
        _ = model(batch['input_ids'].to(device), batch['attention_mask'].to(device))
if sync_cuda:
    torch.cuda.synchronize()

print("Measuring M4 latency...")

# Measure single-sample latency on up to 4 samples from each of the first
# 100 test batches.
with torch.no_grad():
    for i, batch in enumerate(tqdm(test_loader, desc="M4 Latency test")):
        if i >= 100:  # Sample 100 batches for latency measurement
            break

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        for j in range(min(4, input_ids.shape[0])):  # Test 4 samples per batch
            single_input = input_ids[j:j+1]
            single_mask = attention_mask[j:j+1]

            if sync_cuda:
                torch.cuda.synchronize()  # nothing pending when the clock starts
            start_time = time.perf_counter()
            _ = model(single_input, single_mask)
            if sync_cuda:
                torch.cuda.synchronize()  # forward pass fully finished
            end_time = time.perf_counter()

            latencies.append((end_time - start_time) * 1000)  # Convert to milliseconds

# Calculate latency statistics
latencies = np.array(latencies)
avg_latency = np.mean(latencies)
median_latency = np.median(latencies)
p95_latency = np.percentile(latencies, 95)
p99_latency = np.percentile(latencies, 99)

print(f"Average Latency: {avg_latency:.2f}ms")
print(f"Median Latency: {median_latency:.2f}ms")
print(f"95th Percentile: {p95_latency:.2f}ms")
print(f"99th Percentile: {p99_latency:.2f}ms")
print(f"Throughput: {1000/avg_latency:.1f} samples/sec")

# Check latency target
latency_target = 15.0  # ms
if avg_latency <= latency_target:
    print(f"✅ M4 LATENCY TARGET MET: {avg_latency:.2f}ms ≤ {latency_target}ms")
else:
    print(f"❌ M4 LATENCY TARGET MISSED: {avg_latency:.2f}ms > {latency_target}ms")
    print(f"   Expected due to longer context ({M4_CONFIG['context_window']} turns, {M4_CONFIG['max_length']} tokens)")


In [None]:
# Resume point: use the saved best checkpoint when one exists on disk.
import os

m4_checkpoint_path = '../checkpoints/M4_dialogpt/best_model.pt'
if not os.path.exists(m4_checkpoint_path):
    # Nothing was saved; the in-memory weights are used as they are.
    print("❌ No M4 checkpoint found. Training was interrupted.")
    print("Proceeding with current model state for evaluation...")
else:
    print("✅ M4 checkpoint found! Loading best model for evaluation...")
    saved_weights = torch.load(m4_checkpoint_path, weights_only=True)
    model.load_state_dict(saved_weights)

    # Re-score on the validation split to confirm what was loaded.
    val_loss, val_macro_f1, val_auc, val_accuracy, _, _, _ = evaluate_m4(model, val_loader, criterion, device)
    print("Loaded M4 model validation metrics:")
    print(f"  Validation Macro-F1: {val_macro_f1:.4f}")
    print(f"  Validation Accuracy: {val_accuracy:.4f}")
    print(f"  Validation AUC: {val_auc:.4f}")

    # Compare against the M3 validation Macro-F1.
    m3_target = 0.7408
    if val_macro_f1 > m3_target:
        improvement = val_macro_f1 - m3_target
        print(f"🎉 M4 BEATS M3! {val_macro_f1:.4f} > {m3_target} (+{improvement:.4f})")
    else:
        deficit = m3_target - val_macro_f1
        print(f"📊 M4 vs M3: {val_macro_f1:.4f} vs {m3_target} (-{deficit:.4f})")


In [None]:
# M4 Final Test Evaluation (results are written to JSON at the end)

print("📊 M4 FINAL TEST EVALUATION")
print("=" * 50)

# Score the current model weights on the held-out test split.
(test_loss, test_macro_f1, test_auc, test_accuracy,
 test_labels, test_probs, test_preds) = evaluate_m4(model, test_loader, criterion, device)

for metric_name, metric_value in (
    ("Loss", test_loss),
    ("Macro-F1", test_macro_f1),
    ("Accuracy", test_accuracy),
    ("AUC", test_auc),
):
    print(f"Test {metric_name}: {metric_value:.4f}")

# Per-class precision / recall / F1.
print("\nDetailed Classification Report:")
target_names = ['Not Frustrated', 'Will Be Frustrated']
report_text = classification_report(test_labels, test_preds, target_names=target_names, digits=4)
print(report_text)

# Collect the headline numbers as plain floats so they serialise to JSON.
m4_results = {
    'model': 'M4_DialoGPT',
    'test_loss': float(test_loss),
    'test_macro_f1': float(test_macro_f1),
    'test_accuracy': float(test_accuracy),
    'test_auc': float(test_auc),
    'config': M4_CONFIG
}

os.makedirs('../results', exist_ok=True)
with open('../results/M4_dialogpt_results.json', 'w') as f:
    f.write(json.dumps(m4_results, indent=2))

print("\n✅ M4 results saved to ../results/M4_dialogpt_results.json")


In [None]:
# M4 Latency Benchmarking
print("⚡ M4 LATENCY BENCHMARKING")
print("=" * 40)

model.eval()
latencies = []

# CUDA work is queued asynchronously: without draining the device before each
# timestamp, the host clock can stop before the forward pass has finished.
sync_cuda = device.type == 'cuda'

# Warm-up (fewer batches due to longer sequences)
print("Warming up M4 model...")
with torch.no_grad():
    for i, batch in enumerate(test_loader):
        if i >= 3:
            break
        _ = model(batch['input_ids'].to(device), batch['attention_mask'].to(device))
if sync_cuda:
    torch.cuda.synchronize()

print("Measuring M4 latency...")

# Measure single-sample latency on up to 4 samples from each of the first
# 100 test batches.
with torch.no_grad():
    for i, batch in enumerate(tqdm(test_loader, desc="M4 Latency test")):
        if i >= 100:  # Sample 100 batches for latency measurement
            break

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        for j in range(min(4, input_ids.shape[0])):  # Test 4 samples per batch
            single_input = input_ids[j:j+1]
            single_mask = attention_mask[j:j+1]

            if sync_cuda:
                torch.cuda.synchronize()  # nothing pending when the clock starts
            start_time = time.perf_counter()
            _ = model(single_input, single_mask)
            if sync_cuda:
                torch.cuda.synchronize()  # forward pass fully finished
            end_time = time.perf_counter()

            latencies.append((end_time - start_time) * 1000)  # Convert to milliseconds

# Calculate latency statistics
latencies = np.array(latencies)
avg_latency = np.mean(latencies)
median_latency = np.median(latencies)
p95_latency = np.percentile(latencies, 95)
p99_latency = np.percentile(latencies, 99)

print(f"Average Latency: {avg_latency:.2f}ms")
print(f"Median Latency: {median_latency:.2f}ms")
print(f"95th Percentile: {p95_latency:.2f}ms")
print(f"99th Percentile: {p99_latency:.2f}ms")
print(f"Throughput: {1000/avg_latency:.1f} samples/sec")

# Check latency target
latency_target = 15.0  # ms
if avg_latency <= latency_target:
    print(f"✅ M4 LATENCY TARGET MET: {avg_latency:.2f}ms ≤ {latency_target}ms")
else:
    print(f"❌ M4 LATENCY TARGET MISSED: {avg_latency:.2f}ms > {latency_target}ms")
    print(f"   Expected due to longer context ({M4_CONFIG['context_window']} turns, {M4_CONFIG['max_length']} tokens)")

# Add the latency figures to the results dict created by the test-evaluation
# cell and rewrite the JSON file with the combined content.
m4_results['avg_latency_ms'] = float(avg_latency)
m4_results['median_latency_ms'] = float(median_latency)
m4_results['p95_latency_ms'] = float(p95_latency)
m4_results['p99_latency_ms'] = float(p99_latency)
m4_results['throughput_samples_per_sec'] = float(1000/avg_latency)

with open('../results/M4_dialogpt_results.json', 'w') as f:
    json.dump(m4_results, f, indent=2)

print("\n✅ M4 complete results updated with latency metrics")
