In [1]:
# NEURAL NETWORKS FOR PROSODIC EVENT DETECTION
# Complete pipeline: 1D CNN, LSTM, Bidirectional LSTM, and Transformer models
# for detecting prosodic prominence and boundaries in speech

import numpy as np
import pandas as pd
import pickle
import time
import warnings
warnings.filterwarnings('ignore')

# Deep Learning
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

# ML utilities
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Banner marking the start of this cell's output in the notebook log.
print("Neural Networks for Prosodic Event Detection")
print("=" * 60)

# 1. DATA LOADING AND PREPARATION

class ProsodyDataset(Dataset):
    """
    PyTorch Dataset for prosodic event detection.

    Cuts every file into fixed-length windows of frames taken with 50% overlap,
    keeping the frame-level prominence and boundary labels of each window.
    No padding is performed: a file shorter than ``sequence_length`` yields no
    windows, and trailing frames that do not fill a whole window are dropped.
    """

    def __init__(self, features, prominence_labels, boundary_labels, sequence_length=100):
        """
        Args:
            features: List of feature arrays for each file
            prominence_labels: List of prominence label arrays
            boundary_labels: List of boundary label arrays
            sequence_length: Fixed sequence length for batching (must be >= 1)

        Raises:
            ValueError: If ``sequence_length`` is smaller than 1.
        """
        if sequence_length < 1:
            raise ValueError(f"sequence_length must be >= 1, got {sequence_length}")

        self.sequence_length = sequence_length
        self.sequences = []
        self.prominence_targets = []
        self.boundary_targets = []

        # Create fixed-length sequences from variable-length files
        for file_features, prom_labels, bound_labels in zip(features, prominence_labels, boundary_labels):
            file_sequences = self._create_sequences(file_features, prom_labels, bound_labels)
            self.sequences.extend(file_sequences['features'])
            self.prominence_targets.extend(file_sequences['prominence'])
            self.boundary_targets.extend(file_sequences['boundary'])

        print(f"📊 Created {len(self.sequences)} sequences of length {sequence_length}")

    def _create_sequences(self, features, prom_labels, bound_labels):
        """Create overlapping windows from a single file.

        Returns:
            Dict with keys 'features', 'prominence' and 'boundary', each a list
            holding one slice of length ``self.sequence_length`` per window.
        """
        seq_features = []
        seq_prominence = []
        seq_boundary = []

        # 50% overlap. Clamp to 1 so that sequence_length == 1 does not produce
        # a zero step, which range() rejects with a ValueError.
        hop_size = max(1, self.sequence_length // 2)

        # Last start index that still leaves room for a complete window.
        last_start = len(features) - self.sequence_length

        for start in range(0, last_start + 1, hop_size):
            end_idx = start + self.sequence_length

            seq_features.append(features[start:end_idx])
            seq_prominence.append(prom_labels[start:end_idx])
            seq_boundary.append(bound_labels[start:end_idx])

        return {
            'features': seq_features,
            'prominence': seq_prominence,
            'boundary': seq_boundary
        }

    def __len__(self):
        """Number of windows across all files."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return window ``idx`` as a dict of tensors (float features, long labels)."""
        return {
            'features': torch.FloatTensor(self.sequences[idx]),
            'prominence': torch.LongTensor(self.prominence_targets[idx]),
            'boundary': torch.LongTensor(self.boundary_targets[idx])
        }

def load_and_prepare_data(data_path="autorpt_processed_subset.pkl"):
    """Load the processed corpus, split it by file and standardize the features.

    Files are split in their stored order: the first 70% for training, the next
    15% for validation and the remainder for testing. The ``StandardScaler`` is
    fitted on the training files only and then applied to every split, so that
    validation/test statistics do not leak into the training features.

    Args:
        data_path: Path to a pickle file holding a dict with a
            ``'processed_data'`` list. Each entry must provide a ``'features'``
            array that ``np.vstack`` / ``StandardScaler`` can consume
            (presumably shaped (frames, n_features) — TODO confirm).

    Returns:
        Tuple ``(splits, scaler)``. ``splits`` maps 'train' / 'val' / 'test' to
        lists of per-file dicts whose 'features' entry has been replaced by the
        scaled array; ``scaler`` is the fitted ``StandardScaler``.

    Raises:
        ValueError: If the training split is empty, so no scaler can be fitted.
    """
    print("📂 Loading processed data...")

    # NOTE(security): pickle.load can execute arbitrary code; only open files
    # produced by a trusted preprocessing run.
    with open(data_path, 'rb') as f:
        data = pickle.load(f)

    processed_data = data['processed_data']
    print(f"✅ Loaded {len(processed_data)} files")

    # Separate by splits (same as classical ML); counts depend on the corpus size
    n_files = len(processed_data)
    train_files = int(0.7 * n_files)
    val_files = int(0.15 * n_files)

    # Extract features and labels by split
    splits = {
        'train': processed_data[:train_files],
        'val': processed_data[train_files:train_files + val_files],
        'test': processed_data[train_files + val_files:]
    }

    print(f"📊 Split sizes: Train={len(splits['train'])}, Val={len(splits['val'])}, Test={len(splits['test'])}")

    if not splits['train']:
        raise ValueError("Training split is empty; cannot fit the feature scaler")

    # Fit the scaler on training data only to avoid leaking val/test statistics
    print("🔧 Scaling features...")
    train_features = np.vstack([d['features'] for d in splits['train']])
    scaler = StandardScaler()
    scaler.fit(train_features)

    # Apply the training-set scaling to each split (entries are updated in place)
    for split_name in ['train', 'val', 'test']:
        for file_data in splits[split_name]:
            file_data['features'] = scaler.transform(file_data['features'])

    return splits, scaler

Neural Networks for Prosodic Event Detection


In [2]:
# NEURAL NETWORKS DEBUG & FIX
# =====================================================
# Let's diagnose and fix the F1=0.000 issue

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import torch.nn.functional as F
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
import pickle

# Banner marking the start of the diagnostics cell's output.
print("🔍 DIAGNOSING NEURAL NETWORK ISSUES")
print("=" * 50)

# =====================================================
# 1. DIAGNOSTIC FUNCTIONS
# =====================================================

def diagnose_class_distribution(splits):
    """Print frame-level positive rates of both label types for every split.

    Args:
        splits: Dict with 'train', 'val' and 'test' keys, each a list of
            per-file dicts carrying 'prominence_labels' and 'boundary_labels'.
    """
    print("\n📊 CLASS DISTRIBUTION ANALYSIS")
    print("-" * 40)

    def _pooled(file_entries, key):
        # Concatenate one label type across all files of a split.
        return np.hstack([entry[key] for entry in file_entries])

    for name in ('train', 'val', 'test'):
        prominence = _pooled(splits[name], 'prominence_labels')
        boundary = _pooled(splits[name], 'boundary_labels')
        n_frames = len(prominence)

        print(f"{name.upper():5s}: {n_frames:,} frames")
        print(f"  Prominence: {prominence.sum():,} / {n_frames:,} = {prominence.mean():.1%}")
        print(f"  Boundary:   {boundary.sum():,} / {len(boundary):,} = {boundary.mean():.1%}")
        print()

def diagnose_sequence_labels(seq_datasets, prom_threshold=0.5, bound_threshold=0.5, n_samples=10):
    """Show how frame-level labels collapse into one label per sequence.

    A sequence is labelled positive when the fraction of positive frames is
    strictly greater than the corresponding threshold. The defaults (0.5)
    reproduce the majority-vote rule this diagnostic was written to inspect.

    Args:
        seq_datasets: Dict with a 'train' entry that supports ``len()`` and
            integer indexing, returning dicts with 'prominence' and 'boundary'
            tensors of frame-level labels.
        prom_threshold: Fraction of positive frames above which a sequence
            counts as prominent.
        bound_threshold: Fraction of positive frames above which a sequence
            counts as containing a boundary.
        n_samples: How many leading sequences to list individually.

    Returns:
        Tuple ``(all_prom_labels, all_bound_labels)`` of per-sequence 0/1 lists
        for the training dataset.
    """
    print("\n🔍 SEQUENCE LABEL ANALYSIS")
    print("-" * 40)

    train_dataset = seq_datasets['train']

    sample_sequences = []
    all_prom_labels = []
    all_bound_labels = []

    # Single pass: every sequence is fetched once, the first few are also kept
    # for the detailed listing.
    for i in range(len(train_dataset)):
        sample = train_dataset[i]
        prom_seq = sample['prominence'].numpy()
        bound_seq = sample['boundary'].numpy()

        prom_label = int(prom_seq.mean() > prom_threshold)
        bound_label = int(bound_seq.mean() > bound_threshold)

        all_prom_labels.append(prom_label)
        all_bound_labels.append(bound_label)

        if i < n_samples:
            sample_sequences.append({
                'prom_frames': int(prom_seq.sum()),
                'bound_frames': int(bound_seq.sum()),
                # Report the real window length instead of assuming 50 frames
                'n_frames': len(prom_seq),
                'prom_label': prom_label,
                'bound_label': bound_label
            })

    print(f"Sample sequences (first {n_samples}):")
    for i, seq in enumerate(sample_sequences):
        print(f"  Seq {i}: Prom {seq['prom_frames']:2d}/{seq['n_frames']} → {seq['prom_label']} | "
              f"Bound {seq['bound_frames']:2d}/{seq['n_frames']} → {seq['bound_label']}")

    # Overall sequence-level distribution
    prom_pos = sum(all_prom_labels)
    bound_pos = sum(all_bound_labels)
    total = len(all_prom_labels)

    # An empty dataset must not crash the diagnostic with a division by zero
    prom_pct = 100 * prom_pos / total if total else 0.0
    bound_pct = 100 * bound_pos / total if total else 0.0

    print(f"\n📊 SEQUENCE-LEVEL DISTRIBUTION:")
    print(f"  Total sequences: {total:,}")
    print(f"  Positive prominence sequences: {prom_pos:,} ({prom_pct:.1f}%)")
    print(f"  Positive boundary sequences: {bound_pos:,} ({bound_pct:.1f}%)")

    return all_prom_labels, all_bound_labels

# =====================================================
# 2. SIMPLE BASELINE MODEL
# =====================================================

class SimpleBaseline(nn.Module):
    """Minimal sanity-check model: mean-pool over time, then one small MLP per task."""

    def __init__(self, input_features=16):
        super().__init__()
        # Two independent classifier heads; no sequence modelling at all.
        self.prominence_head = self._build_head(input_features)
        self.boundary_head = self._build_head(input_features)

    @staticmethod
    def _build_head(in_dim):
        """Return a Linear → ReLU → Dropout → Linear stack with 2 output logits."""
        hidden = nn.Linear(in_dim, 32)
        logits = nn.Linear(32, 2)
        return nn.Sequential(hidden, nn.ReLU(), nn.Dropout(0.3), logits)

    def forward(self, x):
        """Average ``x`` over dim 1 and return ``(prominence_logits, boundary_logits)``."""
        pooled = x.mean(dim=1)  # (batch, features)
        return self.prominence_head(pooled), self.boundary_head(pooled)

# =====================================================
# 3. FOCAL LOSS FOR EXTREME IMBALANCE
# =====================================================

class FocalLoss(nn.Module):
    """Focal loss on top of cross-entropy: ``alpha_t * (1 - p_t)**gamma * CE``.

    ``(1 - p_t)**gamma`` down-weights examples the model already classifies
    confidently. ``alpha`` can be given in two forms:

    * a scalar (the original behaviour): every sample is scaled by the same
      factor, which changes the loss magnitude but does NOT rebalance classes;
    * a sequence/tensor with one weight per class: each sample is scaled by the
      weight of its target class, which is what actually counteracts imbalance.

    Args:
        alpha: Scalar scale factor or per-class weights indexed by class id.
        gamma: Focusing exponent; 0 reduces the modulating factor to 1.
    """

    def __init__(self, alpha=0.25, gamma=2.0):
        super().__init__()
        if isinstance(alpha, (int, float)):
            self.alpha = alpha
        else:
            alpha_tensor = torch.as_tensor(alpha, dtype=torch.float32).detach().clone()
            if alpha_tensor.dim() == 0:
                # 0-dim input (e.g. a numpy scalar) is still a uniform scale
                self.alpha = float(alpha_tensor)
            else:
                # Buffer so the weights follow the module across .to(device)
                self.register_buffer('alpha', alpha_tensor)
        self.gamma = gamma

    def forward(self, inputs, targets):
        """Return the mean focal loss for logits ``inputs`` and class-index ``targets``."""
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-ce_loss)  # probability assigned to the true class

        if torch.is_tensor(self.alpha):
            # Per-class weighting: look up the weight of each sample's target
            alpha_t = self.alpha.to(inputs.device)[targets]
        else:
            alpha_t = self.alpha

        focal_loss = alpha_t * (1 - pt) ** self.gamma * ce_loss
        return focal_loss.mean()

# =====================================================
# 4. IMPROVED TRAINER
# =====================================================

class ImprovedTrainer:
    """Trainer for sequence-level prominence/boundary classification under class imbalance.

    Frame-level label tensors from the loader are reduced to one label per
    sequence: a sequence is positive when its fraction of positive frames is
    strictly greater than the task's threshold. The model is expected to
    return ``(prominence_logits, boundary_logits)``.

    Args:
        model: Module mapping a feature batch to the two logit tensors.
        device: Device the model and batches are moved to.
        use_focal_loss: Use ``FocalLoss`` for both tasks when True, otherwise
            class-weighted ``nn.CrossEntropyLoss``.
        prom_threshold: Positive-frame fraction above which a sequence is
            labelled prominent.
        bound_threshold: Positive-frame fraction above which a sequence is
            labelled as containing a boundary.
    """

    def __init__(self, model, device='cpu', use_focal_loss=True,
                 prom_threshold=0.3, bound_threshold=0.2):
        self.model = model.to(device)
        self.device = device
        self.prom_threshold = prom_threshold
        self.bound_threshold = bound_threshold

        # Use focal loss or weighted CrossEntropy
        if use_focal_loss:
            self.prominence_criterion = FocalLoss(alpha=0.25, gamma=2.0)
            self.boundary_criterion = FocalLoss(alpha=0.1, gamma=3.0)  # More aggressive for boundaries
        else:
            # Moderate weights favouring the positive class
            prom_weight = torch.FloatTensor([1.0, 5.0]).to(device)
            bound_weight = torch.FloatTensor([1.0, 10.0]).to(device)

            self.prominence_criterion = nn.CrossEntropyLoss(weight=prom_weight)
            self.boundary_criterion = nn.CrossEntropyLoss(weight=bound_weight)

        self.optimizer = optim.AdamW(model.parameters(), lr=0.0005, weight_decay=1e-3)

        # One dict of metrics per completed epoch, appended by train()
        self.history = []

    def _sequence_labels(self, prom_targets, bound_targets):
        """Collapse (batch, frames) label tensors into per-sequence 0/1 labels."""
        prom_seq_labels = (prom_targets.float().mean(dim=1) > self.prom_threshold).long()
        bound_seq_labels = (bound_targets.float().mean(dim=1) > self.bound_threshold).long()
        return prom_seq_labels, bound_seq_labels

    def _forward_batch(self, batch):
        """Move a batch to the device and return (logits, sequence labels) per task."""
        features = batch['features'].to(self.device)
        prom_targets = batch['prominence'].to(self.device)
        bound_targets = batch['boundary'].to(self.device)

        prom_seq_labels, bound_seq_labels = self._sequence_labels(prom_targets, bound_targets)
        prom_outputs, bound_outputs = self.model(features)
        return prom_outputs, bound_outputs, prom_seq_labels, bound_seq_labels

    def train_epoch(self, train_loader):
        """Run one optimisation pass over ``train_loader``.

        Returns:
            ``(mean_loss, prominence_f1, boundary_f1, prominence_cm, boundary_cm)``
            where both confusion matrices are always 2x2 (rows = true class).
        """
        self.model.train()
        total_loss = 0
        all_prom_preds, all_prom_targets = [], []
        all_bound_preds, all_bound_targets = [], []

        for batch in train_loader:
            self.optimizer.zero_grad()

            prom_outputs, bound_outputs, prom_seq_labels, bound_seq_labels = self._forward_batch(batch)

            prom_loss = self.prominence_criterion(prom_outputs, prom_seq_labels)
            bound_loss = self.boundary_criterion(bound_outputs, bound_seq_labels)
            total_loss_batch = prom_loss + bound_loss

            total_loss_batch.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=0.5)
            self.optimizer.step()

            total_loss += total_loss_batch.item()

            all_prom_preds.extend(torch.argmax(prom_outputs, dim=1).cpu().numpy())
            all_prom_targets.extend(prom_seq_labels.cpu().numpy())
            all_bound_preds.extend(torch.argmax(bound_outputs, dim=1).cpu().numpy())
            all_bound_targets.extend(bound_seq_labels.cpu().numpy())

        train_prom_f1 = f1_score(all_prom_targets, all_prom_preds, zero_division=0)
        train_bound_f1 = f1_score(all_bound_targets, all_bound_preds, zero_division=0)

        # labels=[0, 1] forces a 2x2 matrix even when only one class shows up in
        # an epoch; without it the TN/FP/FN/TP indexing in train() raises.
        prom_cm = confusion_matrix(all_prom_targets, all_prom_preds, labels=[0, 1])
        bound_cm = confusion_matrix(all_bound_targets, all_bound_preds, labels=[0, 1])

        return total_loss / len(train_loader), train_prom_f1, train_bound_f1, prom_cm, bound_cm

    def evaluate(self, val_loader):
        """Score the model on ``val_loader`` without gradient tracking.

        Returns:
            ``(prominence_f1, boundary_f1)`` using the same sequence-label
            thresholds as training.
        """
        self.model.eval()
        all_prom_preds, all_prom_targets = [], []
        all_bound_preds, all_bound_targets = [], []

        with torch.no_grad():
            for batch in val_loader:
                prom_outputs, bound_outputs, prom_seq_labels, bound_seq_labels = self._forward_batch(batch)

                all_prom_preds.extend(torch.argmax(prom_outputs, dim=1).cpu().numpy())
                all_prom_targets.extend(prom_seq_labels.cpu().numpy())
                all_bound_preds.extend(torch.argmax(bound_outputs, dim=1).cpu().numpy())
                all_bound_targets.extend(bound_seq_labels.cpu().numpy())

        val_prom_f1 = f1_score(all_prom_targets, all_prom_preds, zero_division=0)
        val_bound_f1 = f1_score(all_bound_targets, all_bound_preds, zero_division=0)

        return val_prom_f1, val_bound_f1

    def train(self, train_loader, val_loader, epochs=15):
        """Train for ``epochs`` epochs, printing train/val F1 after each one.

        Each epoch's metrics are also appended to ``self.history``.
        """
        print(f"🚀 Training with improved setup for {epochs} epochs...")

        for epoch in range(epochs):
            # Train
            train_loss, train_prom_f1, train_bound_f1, prom_cm, bound_cm = self.train_epoch(train_loader)

            # Validate
            val_prom_f1, val_bound_f1 = self.evaluate(val_loader)

            self.history.append({
                'epoch': epoch + 1,
                'train_loss': train_loss,
                'train_prom_f1': train_prom_f1,
                'val_prom_f1': val_prom_f1,
                'train_bound_f1': train_bound_f1,
                'val_bound_f1': val_bound_f1,
            })

            print(f"Epoch {epoch+1:2d}/{epochs} | "
                  f"Loss: {train_loss:.4f} | "
                  f"Prom F1: {train_prom_f1:.3f}/{val_prom_f1:.3f} | "
                  f"Bound F1: {train_bound_f1:.3f}/{val_bound_f1:.3f}")

            # Print (training) confusion matrices every 5 epochs
            if (epoch + 1) % 5 == 0:
                print(f"  Prom CM: TN={prom_cm[0,0]}, FP={prom_cm[0,1]}, FN={prom_cm[1,0]}, TP={prom_cm[1,1]}")
                print(f"  Bound CM: TN={bound_cm[0,0]}, FP={bound_cm[0,1]}, FN={bound_cm[1,0]}, TP={bound_cm[1,1]}")

# =====================================================
# 5. MAIN DEBUGGING EXPERIMENT
# =====================================================

def run_debug_experiment():
    """Diagnose the F1=0 problem end to end on the processed subset.

    Loads ``autorpt_processed_subset.pkl``, splits the files 70/15/15 in their
    stored order, prints frame-level and sequence-level class statistics, and
    trains ``SimpleBaseline`` with ``ImprovedTrainer`` (focal loss) for 15
    epochs on 50-frame windows with 50% overlap.

    Raises:
        ValueError: If the training split produces no windows.
    """
    print("🔍 RUNNING DEBUG EXPERIMENT")
    print("=" * 50)

    # NOTE(security): pickle.load can execute arbitrary code; only open files
    # produced by a trusted preprocessing run.
    with open("autorpt_processed_subset.pkl", 'rb') as f:
        data = pickle.load(f)

    processed_data = data['processed_data']

    # Same splits as before
    n_files = len(processed_data)
    train_files = int(0.7 * n_files)
    val_files = int(0.15 * n_files)

    splits = {
        'train': processed_data[:train_files],
        'val': processed_data[train_files:train_files + val_files],
        'test': processed_data[train_files + val_files:]
    }

    # Diagnose class distribution
    diagnose_class_distribution(splits)

    # NOTE(review): features are used exactly as stored in the pickle here; no
    # StandardScaler is applied in this experiment — confirm that is intended.

    class SimpleProsodyDataset:
        """Map-style dataset of fixed-length windows with frame-level labels."""

        def __init__(self, splits_data, sequence_length=50):
            self.sequences = []
            self.prominence_targets = []
            self.boundary_targets = []

            # 50% overlap; clamped so a tiny window never yields a zero step
            hop_size = max(1, sequence_length // 2)

            for file_data in splits_data:
                features = file_data['features']
                prom_labels = file_data['prominence_labels']
                bound_labels = file_data['boundary_labels']

                # Files shorter than one window contribute nothing
                for i in range(0, len(features) - sequence_length + 1, hop_size):
                    end_idx = i + sequence_length

                    self.sequences.append(features[i:end_idx])
                    self.prominence_targets.append(prom_labels[i:end_idx])
                    self.boundary_targets.append(bound_labels[i:end_idx])

        def __len__(self):
            return len(self.sequences)

        def __getitem__(self, idx):
            return {
                'features': torch.FloatTensor(self.sequences[idx]),
                'prominence': torch.LongTensor(self.prominence_targets[idx]),
                'boundary': torch.LongTensor(self.boundary_targets[idx])
            }

    # Create datasets
    train_dataset = SimpleProsodyDataset(splits['train'])
    val_dataset = SimpleProsodyDataset(splits['val'])

    if len(train_dataset) == 0:
        raise ValueError("Training split produced no sequences; files may be shorter than the window")

    # Diagnose sequence labels (prints its report; the returned lists are not needed here)
    seq_datasets = {'train': train_dataset, 'val': val_dataset}
    diagnose_sequence_labels(seq_datasets)

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

    # Test simple baseline
    print(f"\n🧪 TESTING SIMPLE BASELINE MODEL")
    print("-" * 40)

    # Take the feature dimension from the data rather than hard-coding it
    input_features = np.asarray(train_dataset.sequences[0]).shape[-1]
    model = SimpleBaseline(input_features=input_features)
    trainer = ImprovedTrainer(model, device='cpu', use_focal_loss=True)

    # Train
    trainer.train(train_loader, val_loader, epochs=15)

    print(f"\n✅ Debug experiment complete!")
    print(f"💡 Key insight: Lower thresholds + Focal loss should help with extreme imbalance")

# =====================================================
# RUN DEBUG
# =====================================================

# Run the debug experiment only when this code executes as the main module.
if __name__ == "__main__":
    run_debug_experiment()

🔍 DIAGNOSING NEURAL NETWORK ISSUES
🔍 RUNNING DEBUG EXPERIMENT

📊 CLASS DISTRIBUTION ANALYSIS
----------------------------------------
TRAIN: 296,833 frames
  Prominence: 51,730 / 296,833 = 17.4%
  Boundary:   16,393 / 296,833 = 5.5%

VAL  : 58,803 frames
  Prominence: 10,609 / 58,803 = 18.0%
  Boundary:   3,028 / 58,803 = 5.1%

TEST : 64,409 frames
  Prominence: 11,202 / 64,409 = 17.4%
  Boundary:   3,168 / 64,409 = 4.9%


🔍 SEQUENCE LABEL ANALYSIS
----------------------------------------
Sample sequences (first 10):
  Seq 0: Prom  0/50 → 0 | Bound  0/50 → 0
  Seq 1: Prom  0/50 → 0 | Bound  0/50 → 0
  Seq 2: Prom  0/50 → 0 | Bound  0/50 → 0
  Seq 3: Prom 10/50 → 0 | Bound  0/50 → 0
  Seq 4: Prom 13/50 → 0 | Bound  0/50 → 0
  Seq 5: Prom 12/50 → 0 | Bound  0/50 → 0
  Seq 6: Prom  9/50 → 0 | Bound  8/50 → 0
  Seq 7: Prom  0/50 → 0 | Bound 11/50 → 0
  Seq 8: Prom 11/50 → 0 | Bound  3/50 → 0
  Seq 9: Prom 11/50 → 0 | Bound  0/50 → 0

📊 SEQUENCE-LEVEL DISTRIBUTION:
  Total sequences: 11,736

In [3]:
# FINAL NEURAL PROSODY DETECTION - PRODUCTION READY
# =====================================================
# Optimized neural networks for prosodic event detection with proper class balance handling

import numpy as np
import pandas as pd
import pickle
import time
import warnings
warnings.filterwarnings('ignore')

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Banner marking the start of the final-experiment cell's output.
print("🚀 FINAL NEURAL PROSODY DETECTION")
print("=" * 60)

# =====================================================
# 1. OPTIMIZED DATASET CLASS
# =====================================================

class OptimizedProsodyDataset(Dataset):
    """
    Dataset of overlapping fixed-length windows with one label per window.

    Each window gets a binary prominence label and a binary boundary label,
    set to 1 when the fraction of positive frames inside the window reaches
    the corresponding threshold. Windows start every ``sequence_length // 3``
    frames; files shorter than ``sequence_length`` contribute no windows.
    """

    def __init__(self, splits_data, sequence_length=50,
                 prominence_threshold=0.15, boundary_threshold=0.10):
        """
        Args:
            splits_data: Iterable of per-file dicts with 'features',
                'prominence_labels' and 'boundary_labels' arrays.
            sequence_length: Number of frames per window (must be >= 1).
            prominence_threshold: Fraction (0-1) of positive frames needed for
                a positive prominence window (inclusive).
            boundary_threshold: Fraction (0-1) of positive frames needed for a
                positive boundary window (inclusive).

        Raises:
            ValueError: If ``sequence_length`` is smaller than 1.
        """
        if sequence_length < 1:
            raise ValueError(f"sequence_length must be >= 1, got {sequence_length}")

        self.sequence_length = sequence_length
        self.prom_threshold = prominence_threshold
        self.bound_threshold = boundary_threshold

        self.sequences = []
        self.prominence_targets = []
        self.boundary_targets = []

        print(f"  📏 Creating sequences with length {sequence_length}")
        print(f"  🎯 Prominence threshold: {prominence_threshold:.1%}")
        print(f"  🎯 Boundary threshold: {boundary_threshold:.1%}")

        # More overlap for better coverage. Clamped to 1 because
        # sequence_length < 3 would otherwise give a zero step for range().
        hop_size = max(1, sequence_length // 3)

        for file_data in splits_data:
            features = file_data['features']
            prom_labels = file_data['prominence_labels']
            bound_labels = file_data['boundary_labels']

            for i in range(0, len(features) - sequence_length + 1, hop_size):
                end_idx = i + sequence_length

                seq_features = features[i:end_idx]
                seq_prom = prom_labels[i:end_idx]
                seq_bound = bound_labels[i:end_idx]

                # Window label = 1 when enough of its frames are positive
                prom_seq_label = 1 if seq_prom.mean() >= prominence_threshold else 0
                bound_seq_label = 1 if seq_bound.mean() >= boundary_threshold else 0

                self.sequences.append(seq_features)
                self.prominence_targets.append(prom_seq_label)
                self.boundary_targets.append(bound_seq_label)

        # Report final distribution; an empty dataset must not divide by zero
        total = len(self.sequences)
        prom_pos = sum(self.prominence_targets)
        bound_pos = sum(self.boundary_targets)
        prom_pct = 100 * prom_pos / total if total else 0.0
        bound_pct = 100 * bound_pos / total if total else 0.0

        print(f"  📊 Created {total:,} sequences")
        print(f"  📊 Positive prominence: {prom_pos:,} ({prom_pct:.1f}%)")
        print(f"  📊 Positive boundary: {bound_pos:,} ({bound_pct:.1f}%)")

    def __len__(self):
        """Number of windows across all files."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return window ``idx``: float features plus shape-(1,) long labels."""
        return {
            'features': torch.FloatTensor(self.sequences[idx]),
            'prominence': torch.LongTensor([self.prominence_targets[idx]]),
            'boundary': torch.LongTensor([self.boundary_targets[idx]])
        }

# =====================================================
# 2. PRODUCTION NEURAL ARCHITECTURES
# =====================================================

class Production_CNN(nn.Module):
    """
    1D convolutional classifier with a shared trunk and two task heads.

    Three Conv1d/BatchNorm/ReLU stages (the first two followed by max-pooling
    and dropout) end in global average pooling; the pooled vector feeds one
    2-logit head for prominence and one for boundaries.
    """

    def __init__(self, input_features=16, dropout=0.4):
        super().__init__()

        # Layers are collected in a flat list so their positions inside the
        # Sequential stay 0..13.
        trunk = []
        trunk += self._conv_stage(input_features, 32, kernel=7)
        trunk += [nn.MaxPool1d(2), nn.Dropout(dropout)]
        trunk += self._conv_stage(32, 64, kernel=5)
        trunk += [nn.MaxPool1d(2), nn.Dropout(dropout)]
        trunk += self._conv_stage(64, 128, kernel=3)
        trunk += [nn.AdaptiveAvgPool1d(1)]  # Global average pooling
        self.feature_extractor = nn.Sequential(*trunk)

        # Task-specific heads
        self.prominence_head = self._classifier_head(dropout)
        self.boundary_head = self._classifier_head(dropout)

    @staticmethod
    def _conv_stage(in_channels, out_channels, kernel):
        """Conv1d (length-preserving padding) + BatchNorm1d + ReLU as a list."""
        return [
            nn.Conv1d(in_channels, out_channels, kernel_size=kernel, padding=kernel // 2),
            nn.BatchNorm1d(out_channels),
            nn.ReLU(),
        ]

    @staticmethod
    def _classifier_head(dropout):
        """128 → 64 → 2 MLP with ReLU and dropout in between."""
        return nn.Sequential(
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(64, 2),
        )

    def forward(self, x):
        """Map (batch, seq_len, features) input to ``(prominence_logits, boundary_logits)``."""
        # Conv1d expects channels first: (batch, features, seq_len)
        channels_first = x.transpose(1, 2)

        # Pooled trunk output is (batch, 128, 1); drop the length axis
        pooled = self.feature_extractor(channels_first).flatten(start_dim=1)

        return self.prominence_head(pooled), self.boundary_head(pooled)

class Production_RNN(nn.Module):
    """
    Bidirectional GRU with attention pooling and two task heads.

    The GRU output at every time step is scored by a small MLP; the
    softmax-normalised scores weight the time steps into one vector per
    sequence, which feeds a 2-logit head for prominence and one for boundaries.

    Args:
        input_features: Number of features per frame.
        hidden_size: GRU hidden size per direction.
        dropout: Dropout probability for the GRU outputs and the heads.
    """

    def __init__(self, input_features=16, hidden_size=64, dropout=0.4):
        super().__init__()

        # Bidirectional GRU (faster than LSTM). nn.GRU's own `dropout` argument
        # only acts between stacked layers, so on this single-layer GRU it did
        # nothing except emit a warning. Dropout is applied to the GRU outputs
        # explicitly instead (active in training mode only).
        self.rnn = nn.GRU(
            input_features, hidden_size,
            batch_first=True, bidirectional=True
        )
        self.rnn_dropout = nn.Dropout(dropout)

        rnn_output_size = hidden_size * 2  # Bidirectional

        # Attention mechanism: one scalar score per time step
        self.attention = nn.Sequential(
            nn.Linear(rnn_output_size, 32),
            nn.Tanh(),
            nn.Linear(32, 1)
        )

        # Task-specific heads
        self.prominence_head = nn.Sequential(
            nn.Linear(rnn_output_size, 64),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(64, 2)
        )

        self.boundary_head = nn.Sequential(
            nn.Linear(rnn_output_size, 64),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(64, 2)
        )

    def forward(self, x):
        """Map (batch, seq_len, features) input to ``(prominence_logits, boundary_logits)``."""
        # RNN forward pass
        rnn_out, _ = self.rnn(x)  # (batch, seq_len, hidden*2)
        rnn_out = self.rnn_dropout(rnn_out)

        # Attention weights, normalised over the time axis
        attention_weights = self.attention(rnn_out)  # (batch, seq_len, 1)
        attention_weights = F.softmax(attention_weights, dim=1)

        # Weighted sum over time
        attended_features = torch.sum(rnn_out * attention_weights, dim=1)  # (batch, hidden*2)

        # Task predictions
        prom_out = self.prominence_head(attended_features)
        bound_out = self.boundary_head(attended_features)

        return prom_out, bound_out

# =====================================================
# 3. PRODUCTION TRAINER
# =====================================================

class ProductionTrainer:
    """
    Trainer for models that return ``(prominence_logits, boundary_logits)``.

    Optimises the sum of two unweighted cross-entropy losses with AdamW and a
    cosine learning-rate schedule, tracks per-epoch metrics in ``history``,
    and restores the weights of the best validation epoch after training.

    Args:
        model: Module mapping a feature batch to the two logit tensors.
        device: Device the model and batches are moved to.
        checkpoint_path: File the best weights are saved to. Give each model
            its own path so successive runs do not overwrite each other.
    """

    def __init__(self, model, device='cpu', checkpoint_path='best_prosody_model.pth'):
        self.model = model.to(device)
        self.device = device
        self.checkpoint_path = checkpoint_path

        # Plain (unweighted) cross-entropy for both tasks
        self.prominence_criterion = nn.CrossEntropyLoss()
        self.boundary_criterion = nn.CrossEntropyLoss()

        # Optimizer with schedule
        self.optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)
        self.scheduler = optim.lr_scheduler.CosineAnnealingLR(self.optimizer, T_max=50)

        self.history = {
            'train_loss': [], 'val_loss': [],
            'train_prom_f1': [], 'val_prom_f1': [],
            'train_bound_f1': [], 'val_bound_f1': []
        }

    def _unpack_batch(self, batch):
        """Move a batch to the device and flatten the label tensors to shape (batch,)."""
        features = batch['features'].to(self.device)
        # reshape(-1) instead of squeeze(): squeeze() turns a final batch of
        # size 1 into a 0-dim tensor, which the loss and metric code reject.
        prom_targets = batch['prominence'].reshape(-1).to(self.device)
        bound_targets = batch['boundary'].reshape(-1).to(self.device)
        return features, prom_targets, bound_targets

    def _run_epoch(self, loader, training):
        """Run one pass over ``loader``; update weights only when ``training`` is True.

        Returns:
            ``(mean_loss, prominence_f1, boundary_f1)``.
        """
        self.model.train(training)
        total_loss = 0
        all_prom_preds, all_prom_targets = [], []
        all_bound_preds, all_bound_targets = [], []

        with torch.set_grad_enabled(training):
            for batch in loader:
                features, prom_targets, bound_targets = self._unpack_batch(batch)

                if training:
                    self.optimizer.zero_grad()

                prom_outputs, bound_outputs = self.model(features)

                prom_loss = self.prominence_criterion(prom_outputs, prom_targets)
                bound_loss = self.boundary_criterion(bound_outputs, bound_targets)
                batch_loss = prom_loss + bound_loss

                if training:
                    batch_loss.backward()
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
                    self.optimizer.step()

                total_loss += batch_loss.item()

                all_prom_preds.extend(torch.argmax(prom_outputs, dim=1).cpu().numpy())
                all_prom_targets.extend(prom_targets.cpu().numpy())
                all_bound_preds.extend(torch.argmax(bound_outputs, dim=1).cpu().numpy())
                all_bound_targets.extend(bound_targets.cpu().numpy())

        prom_f1 = f1_score(all_prom_targets, all_prom_preds, zero_division=0)
        bound_f1 = f1_score(all_bound_targets, all_bound_preds, zero_division=0)

        return total_loss / len(loader), prom_f1, bound_f1

    def train_epoch(self, train_loader):
        """Train for one epoch; returns ``(mean_loss, prominence_f1, boundary_f1)``."""
        return self._run_epoch(train_loader, training=True)

    def evaluate(self, val_loader):
        """Evaluate without gradients; returns ``(mean_loss, prominence_f1, boundary_f1)``."""
        return self._run_epoch(val_loader, training=False)

    def train(self, train_loader, val_loader, epochs=30, early_stopping_patience=8):
        """Train with early stopping on the mean of the two validation F1 scores.

        Args:
            train_loader: Loader yielding training batches.
            val_loader: Loader yielding validation batches.
            epochs: Maximum number of epochs.
            early_stopping_patience: Stop after this many consecutive epochs
                without a new best validation F1.
        """
        print(f"🚀 Training for {epochs} epochs...")

        best_val_f1 = 0
        best_state = None  # in-memory copy of the best weights from THIS run
        epochs_without_improvement = 0

        for epoch in range(epochs):
            start_time = time.time()

            # Train
            train_loss, train_prom_f1, train_bound_f1 = self.train_epoch(train_loader)

            # Validate
            val_loss, val_prom_f1, val_bound_f1 = self.evaluate(val_loader)

            # Scheduler step
            self.scheduler.step()

            # Save history
            self.history['train_loss'].append(train_loss)
            self.history['val_loss'].append(val_loss)
            self.history['train_prom_f1'].append(train_prom_f1)
            self.history['val_prom_f1'].append(val_prom_f1)
            self.history['train_bound_f1'].append(train_bound_f1)
            self.history['val_bound_f1'].append(val_bound_f1)

            # Calculate average F1
            val_avg_f1 = (val_prom_f1 + val_bound_f1) / 2

            # Print progress
            epoch_time = time.time() - start_time
            print(f"Epoch {epoch+1:2d}/{epochs} | "
                  f"Time: {epoch_time:.1f}s | "
                  f"Loss: {train_loss:.4f}/{val_loss:.4f} | "
                  f"Prom F1: {train_prom_f1:.3f}/{val_prom_f1:.3f} | "
                  f"Bound F1: {train_bound_f1:.3f}/{val_bound_f1:.3f}")

            # Save best model
            if val_avg_f1 > best_val_f1:
                best_val_f1 = val_avg_f1
                epochs_without_improvement = 0
                # state_dict() tensors alias the live parameters, so clone them
                best_state = {
                    name: tensor.detach().clone()
                    for name, tensor in self.model.state_dict().items()
                }
                torch.save(self.model.state_dict(), self.checkpoint_path)
            else:
                epochs_without_improvement += 1

            # Early stopping
            if epochs_without_improvement >= early_stopping_patience:
                print(f"💤 Early stopping at epoch {epoch+1}")
                break

        # Restore the best weights from this run. Reading the checkpoint file
        # back could silently pick up a stale file left by an earlier run.
        if best_state is not None:
            self.model.load_state_dict(best_state)
            print(f"✅ Training complete! Best validation F1: {best_val_f1:.3f}")
        else:
            print(f"⚠️ Using final model state (no improvement found)")

# =====================================================
# 4. MAIN EXPERIMENT
# =====================================================

def run_production_experiment(data_path="autorpt_processed_subset.pkl",
                              epochs=25,
                              batch_size=64,
                              prominence_threshold=0.15,
                              boundary_threshold=0.10,
                              input_features=16):
    """
    Run the final production experiment.

    Loads the pickled corpus, splits its files 70% / 15% / remainder into
    train / val / test in stored order (no shuffling), builds one
    OptimizedProsodyDataset per split, then trains and test-evaluates
    Production_CNN and Production_RNN, printing a comparison table.

    Args:
        data_path: Path of the pickle file; its top-level object must expose
            a 'processed_data' entry that supports len() and slicing.
        epochs: Maximum number of epochs passed to ProductionTrainer.train.
        batch_size: Batch size used for all three DataLoaders.
        prominence_threshold: Forwarded to OptimizedProsodyDataset.
        boundary_threshold: Forwarded to OptimizedProsodyDataset.
        input_features: Forwarded to both model constructors.

    Returns:
        dict mapping model name to a dict with the keys 'test_prom_f1',
        'test_bound_f1', 'test_loss' and 'history' (the trainer's history
        object, stored by reference).

    Raises:
        ValueError: If the corpus is too small to yield any training file.
    """
    print("🚀 PRODUCTION NEURAL NETWORK EXPERIMENT")
    print("=" * 50)
    
    # Load data
    # SECURITY: pickle.load can execute arbitrary code while unpickling;
    # only point data_path at files you created or otherwise trust.
    print("📂 Loading data...")
    with open(data_path, 'rb') as f:
        data = pickle.load(f)
    
    processed_data = data['processed_data']
    
    # Same splits as classical ML
    n_files = len(processed_data)
    train_files = int(0.7 * n_files)
    val_files = int(0.15 * n_files)
    
    splits = {
        'train': processed_data[:train_files],
        'val': processed_data[train_files:train_files + val_files],
        'test': processed_data[train_files + val_files:]
    }
    
    print(f"📊 Files: Train={len(splits['train'])}, Val={len(splits['val'])}, Test={len(splits['test'])}")
    
    # Fail early with a clear message: int(0.7 * n_files) is 0 for fewer
    # than two files, and a shuffling DataLoader cannot sample from nothing.
    if train_files == 0:
        raise ValueError(
            f"Need at least 2 files to build a training split, got {n_files}"
        )
    
    # Create optimized datasets (identical thresholds for every split)
    print("\n📦 Creating optimized datasets...")
    dataset_kwargs = {
        'prominence_threshold': prominence_threshold,
        'boundary_threshold': boundary_threshold,
    }
    train_dataset = OptimizedProsodyDataset(splits['train'], **dataset_kwargs)
    val_dataset = OptimizedProsodyDataset(splits['val'], **dataset_kwargs)
    test_dataset = OptimizedProsodyDataset(splits['test'], **dataset_kwargs)
    
    # Create data loaders (only the training data is shuffled)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    
    # Device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"🔧 Using device: {device}")
    
    # Models to test
    models = {
        'Production_CNN': Production_CNN(input_features=input_features),
        'Production_RNN': Production_RNN(input_features=input_features)
    }
    
    results = {}
    
    # Train each model
    # NOTE(review): the train() method visible earlier in this file
    # checkpoints to the fixed path 'best_prosody_model.pth'; if that is
    # ProductionTrainer.train, every model in this loop shares that file —
    # confirm a stale checkpoint cannot be reloaded by mistake.
    for model_name, model in models.items():
        print(f"\n🚀 Training {model_name}...")
        print("-" * 40)
        
        trainer = ProductionTrainer(model, device=device)
        trainer.train(train_loader, val_loader, epochs=epochs)
        
        # Test evaluation
        test_loss, test_prom_f1, test_bound_f1 = trainer.evaluate(test_loader)
        
        results[model_name] = {
            'test_prom_f1': test_prom_f1,
            'test_bound_f1': test_bound_f1,
            'test_loss': test_loss,
            'history': trainer.history
        }
        
        print(f"🎯 {model_name} Final Test Results:")
        print(f"  Prominence F1: {test_prom_f1:.3f}")
        print(f"  Boundary F1: {test_bound_f1:.3f}")
    
    # Final comparison
    print(f"\n📊 FINAL NEURAL NETWORK RESULTS")
    print("=" * 50)
    print(f"{'Model':<15} | {'Prom F1':<8} | {'Bound F1':<8} | {'Avg F1':<8}")
    print("-" * 50)
    
    for model_name, result in results.items():
        # Unweighted mean of the two task F1 scores
        avg_f1 = (result['test_prom_f1'] + result['test_bound_f1']) / 2
        print(f"{model_name:<15} | {result['test_prom_f1']:<8.3f} | {result['test_bound_f1']:<8.3f} | {avg_f1:<8.3f}")
    
    return results

# =====================================================
# 5. COMPARISON PLOTTING
# =====================================================

def plot_neural_vs_classical():
    """
    Print the classical ML baseline scores used for comparison.

    Despite its name, this function draws no figure: it writes a banner and
    then one line per baseline (prominence F1, boundary F1 and their
    unweighted mean) to stdout. The scores are hard-coded below. Returns None.
    """
    # Classical ML results (from your previous experiments)
    classical_results = {
        'Logistic Regression': {'prom_f1': 0.475, 'bound_f1': 0.143},
        'Random Forest': {'prom_f1': 0.486, 'bound_f1': 0.170},
        'Naive Bayes': {'prom_f1': 0.450, 'bound_f1': 0.010}
    }

    # Banner, rule and section title are emitted in this fixed order
    header_lines = (
        "\n📊 NEURAL vs CLASSICAL COMPARISON",
        "=" * 40,
        "Classical ML Baselines:",
    )
    for header in header_lines:
        print(header)

    for baseline, scores in classical_results.items():
        prom_score = scores['prom_f1']
        bound_score = scores['bound_f1']
        mean_score = (prom_score + bound_score) / 2
        print(f"  {baseline:<18}: Prom={prom_score:.3f}, Bound={bound_score:.3f}, Avg={mean_score:.3f}")

# =====================================================
# RUN PRODUCTION EXPERIMENT
# =====================================================

if __name__ == "__main__":
    # Entry point: train and evaluate the neural models, keeping the
    # per-model results in the module-level name `results`, then print
    # the hard-coded classical baselines for side-by-side reading.
    results = run_production_experiment()
    plot_neural_vs_classical()

    # Closing messages: one print call, newline-separated, same output
    print(
        "\n🎉 Production neural network experiment complete!",
        "💡 These models should significantly outperform classical ML baselines!",
        sep="\n",
    )

🚀 FINAL NEURAL PROSODY DETECTION
🚀 PRODUCTION NEURAL NETWORK EXPERIMENT
📂 Loading data...
📊 Files: Train=99, Val=21, Test=22

📦 Creating optimized datasets...
  📏 Creating sequences with length 50
  🎯 Prominence threshold: 15.0%
  🎯 Boundary threshold: 10.0%
  📊 Created 18,284 sequences
  📊 Positive prominence: 12,308 (67.3%)
  📊 Positive boundary: 4,726 (25.8%)
  📏 Creating sequences with length 50
  🎯 Prominence threshold: 15.0%
  🎯 Boundary threshold: 10.0%
  📊 Created 3,622 sequences
  📊 Positive prominence: 2,480 (68.5%)
  📊 Positive boundary: 855 (23.6%)
  📏 Creating sequences with length 50
  🎯 Prominence threshold: 15.0%
  🎯 Boundary threshold: 10.0%
  📊 Created 3,970 sequences
  📊 Positive prominence: 2,653 (66.8%)
  📊 Positive boundary: 919 (23.1%)
🔧 Using device: cpu

🚀 Training Production_CNN...
----------------------------------------
🚀 Training for 25 epochs...
Epoch  1/25 | Time: 3.6s | Loss: 1.0422/1.0572 | Prom F1: 0.838/0.796 | Bound F1: 0.242/0.478
Epoch  2/25 | Time