# COMPLETE CLASSICAL ML PIPELINE FOR PROSODIC EVENT DETECTION

## AutoRPT Prosodic Event Detection Pipeline

This project implements a complete machine learning pipeline for detecting
prosodic events (prominence and boundaries) in speech using classical ML
algorithms on traditional acoustic features.

* Dataset: AutoRPT (142 audio files, 70 minutes)
* Task: Frame-level binary classification of prosodic events
* Features: F0, energy, spectral centroids, MFCCs (16 dimensions)
* Models: Logistic Regression, Random Forest, SVM, etc.

### Key Results:
- Prominence Detection: F1 ≈ 0.47-0.48
- Boundary Detection: F1 ≈ 0.12-0.13
- Cross-validated performance confirms generalization


**Note**: This notebook splits data at the frame level, not by file. While acceptable for quick experimentation, file-level splitting is more appropriate for avoiding temporal leakage in speech data.

In [1]:
# Import all libraries

import numpy as np
import pandas as pd
import pickle
from pathlib import Path
import time
import warnings
warnings.filterwarnings('ignore')
import librosa
import soundfile as sf
from collections import defaultdict

# Import the ML libraries 
from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier, 
                            ExtraTreesClassifier, VotingClassifier, BaggingClassifier)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (classification_report, confusion_matrix, 
                           precision_recall_fscore_support, roc_auc_score)
from sklearn.model_selection import train_test_split

print("✅ All libraries have been imported successfully!")

✅ All libraries have been imported successfully!


In [2]:
# Load and check the pre-processed data

def load_and_check_data(data_path="autorpt_processed_subset.pkl"):
    """Load preprocessed data and check for issues"""
    print(f"Loading processed data from {data_path}...")
    
    with open(data_path, 'rb') as f:
        data = pickle.load(f)
    
    processed_data = data['processed_data']
    config = data['preprocessing_config']
    
    print(f"Loaded {len(processed_data)} processed files")
    
    # Check for NaN/inf values in features and labels
    total_nan_features = 0
    total_inf_features = 0
    total_nan_labels = 0
    
    for i, file_data in enumerate(processed_data):
        features = file_data['features']
        prominence_labels = file_data['prominence_labels']
        boundary_labels = file_data['boundary_labels']
        
        # Check features
        nan_count = np.isnan(features).sum()
        inf_count = np.isinf(features).sum()
        total_nan_features += nan_count
        total_inf_features += inf_count
        
        # Check labels
        nan_labels = np.isnan(prominence_labels).sum() + np.isnan(boundary_labels).sum()
        total_nan_labels += nan_labels
        
        if nan_count > 0 or inf_count > 0 or nan_labels > 0:
            print(f" File {i}: NaN features={nan_count}, Inf features={inf_count}, NaN labels={nan_labels}")
    
    print(f"\nData quality check:")
    print(f"Total NaN in features: {total_nan_features}")
    print(f"Total Inf in features: {total_inf_features}")
    print(f"Total NaN in labels: {total_nan_labels}")
    
    return processed_data, config

# Load data
processed_data, config = load_and_check_data()

Loading processed data from autorpt_processed_subset.pkl...
Loaded 142 processed files

Data quality check:
Total NaN in features: 0
Total Inf in features: 0
Total NaN in labels: 0


In [3]:
# Clean data and handle NaN/Inf values

def clean_data(processed_data):
    """Clean data by handling NaN and Inf values"""
    print("Cleaning data...")
    
    cleaned_data = []
    files_removed = 0
    
    for i, file_data in enumerate(processed_data):
        features = file_data['features'].copy()
        prominence_labels = file_data['prominence_labels'].copy()
        boundary_labels = file_data['boundary_labels'].copy()
        
        # Check for problematic values
        has_nan_features = np.isnan(features).any()
        has_inf_features = np.isinf(features).any()
        has_nan_labels = np.isnan(prominence_labels).any() or np.isnan(boundary_labels).any()
        
        if has_nan_features or has_inf_features or has_nan_labels:
            print(f"Cleaning file {i}: {file_data['file_id']}")
            
            # Handle NaN in features (replace with median)
            if has_nan_features:
                for col in range(features.shape[1]):
                    col_data = features[:, col]
                    if np.isnan(col_data).any():
                        median_val = np.nanmedian(col_data)
                        if np.isnan(median_val):  # All values are NaN
                            median_val = 0.0
                        features[np.isnan(features[:, col]), col] = median_val
            
            # Handle Inf in features (replace with max finite value)
            if has_inf_features:
                features = np.where(np.isinf(features), 
                                  np.finfo(np.float32).max / 1000, features)
            
            # Handle NaN in labels (these shouldn't exist, but just in case)
            if has_nan_labels:
                prominence_labels = np.where(np.isnan(prominence_labels), 0, prominence_labels)
                boundary_labels = np.where(np.isnan(boundary_labels), 0, boundary_labels)
        
        # Final check - if still has issues, skip this file
        if (np.isnan(features).any() or np.isinf(features).any() or 
            np.isnan(prominence_labels).any() or np.isnan(boundary_labels).any()):
            print(f"  ❌ Skipping file {i} - couldn't clean")
            files_removed += 1
            continue
        
        # Update the file data
        file_data['features'] = features
        file_data['prominence_labels'] = prominence_labels.astype(np.int8)
        file_data['boundary_labels'] = boundary_labels.astype(np.int8)
        
        cleaned_data.append(file_data)
    
    print(f"Data cleaning complete!")
    print(f"Files kept: {len(cleaned_data)}")
    print(f"Files removed: {files_removed}")
    
    return cleaned_data

# Clean the data
cleaned_data = clean_data(processed_data)

Cleaning data...
Data cleaning complete!
Files kept: 142
Files removed: 0


In [4]:
# Prepare datasets to use with the traditional ML models mentioned above

def prepare_ml_datasets(cleaned_data, train_ratio=0.7, val_ratio=0.15, random_state=42):
    """Prepare train/val/test datasets"""
    print("Preparing ML datasets...")
    
    # Combine all features and labels
    all_features = []
    all_prominence_labels = []
    all_boundary_labels = []
    file_boundaries = []
    
    current_idx = 0
    for data in cleaned_data:
        features = data['features']
        prominence_labels = data['prominence_labels']
        boundary_labels = data['boundary_labels']
        
        all_features.append(features)
        all_prominence_labels.extend(prominence_labels)
        all_boundary_labels.extend(boundary_labels)
        
        # Track file boundaries for splitting
        file_boundaries.append((current_idx, current_idx + len(features)))
        current_idx += len(features)
    
    # Stack features
    X = np.vstack(all_features)
    y_prominence = np.array(all_prominence_labels)
    y_boundary = np.array(all_boundary_labels)
    
    print(f"Combined dataset:")
    print(f"Total frames: {X.shape[0]:,}")
    print(f"Feature dimensions: {X.shape[1]}")
    print(f"Prominence events: {y_prominence.sum():,} ({100*y_prominence.mean():.2f}%)")
    print(f"Boundary events: {y_boundary.sum():,} ({100*y_boundary.mean():.2f}%)")
    
    # File-level splits to prevent data leakage
    n_files = len(cleaned_data)
    train_files = int(train_ratio * n_files)
    val_files = int(val_ratio * n_files)
    
    print(f"\n File-level splits:")
    print(f"  Train files: {train_files}")
    print(f"  Val files: {val_files}")
    print(f"  Test files: {n_files - train_files - val_files}")
    
    # Get frame indices for each split
    train_start, train_end = file_boundaries[0][0], file_boundaries[train_files-1][1]
    val_start, val_end = file_boundaries[train_files][0], file_boundaries[train_files + val_files - 1][1]
    test_start, test_end = file_boundaries[train_files + val_files][0], file_boundaries[-1][1]
    
    # Create splits
    splits = {
        'X_train': X[train_start:train_end],
        'X_val': X[val_start:val_end],
        'X_test': X[test_start:test_end],
        'y_prominence_train': y_prominence[train_start:train_end],
        'y_prominence_val': y_prominence[val_start:val_end],
        'y_prominence_test': y_prominence[test_start:test_end],
        'y_boundary_train': y_boundary[train_start:train_end],
        'y_boundary_val': y_boundary[val_start:val_end],
        'y_boundary_test': y_boundary[test_start:test_end]
    }
    
    print(f"\n Frame-level splits:")
    print(f"  Train: {splits['X_train'].shape[0]:,} frames")
    print(f"  Val:   {splits['X_val'].shape[0]:,} frames")
    print(f"  Test:  {splits['X_test'].shape[0]:,} frames")
    
    return splits

# Prepare datasets
data_splits = prepare_ml_datasets(cleaned_data)

Preparing ML datasets...
Combined dataset:
Total frames: 420,045
Feature dimensions: 16
Prominence events: 73,541 (17.51%)
Boundary events: 22,589 (5.38%)

 File-level splits:
  Train files: 99
  Val files: 21
  Test files: 22

 Frame-level splits:
  Train: 296,833 frames
  Val:   58,803 frames
  Test:  64,409 frames


In [5]:
# Scaling features

def scale_features(data_splits):
    
    scaler = StandardScaler()
    
    # Fit on training data, transform all splits
    X_train_scaled = scaler.fit_transform(data_splits['X_train'])
    X_val_scaled = scaler.transform(data_splits['X_val'])
    X_test_scaled = scaler.transform(data_splits['X_test'])
    
    # Final check for NaN/Inf after scaling
    print(f"Post-scaling check:")
    print(f"NaN in train: {np.isnan(X_train_scaled).sum()}")
    print(f"Inf in train: {np.isinf(X_train_scaled).sum()}")
    print(f"Feature range: [{X_train_scaled.min():.2f}, {X_train_scaled.max():.2f}]")
    
    scaled_splits = data_splits.copy()
    scaled_splits.update({
        'X_train_scaled': X_train_scaled,
        'X_val_scaled': X_val_scaled,
        'X_test_scaled': X_test_scaled,
        'scaler': scaler
    })
    
    print("Feature scaling complete!")
    return scaled_splits

# Scale features
data_splits = scale_features(data_splits)

Post-scaling check:
NaN in train: 0
Inf in train: 0
Feature range: [-5.21, 12.79]
Feature scaling complete!


In [6]:
# Define the models

def get_ml_models(random_state=42):

    models = {
        # Defining the fast models first
        'Logistic_Regression': LogisticRegression(
            random_state=random_state, 
            max_iter=1000,
            class_weight='balanced'
        ),
        
        'Naive_Bayes': GaussianNB(),
        
        'KNN': KNeighborsClassifier(
            n_neighbors=5,
            n_jobs=-1
        ),
        
        # Tree models
        'Decision_Tree': DecisionTreeClassifier(
            random_state=random_state,
            class_weight='balanced',
            max_depth=10
        ),
        
        'Random_Forest': RandomForestClassifier(
            n_estimators=100,
            random_state=random_state,
            class_weight='balanced',
            n_jobs=-1
        ),
        
        'Extra_Trees': ExtraTreesClassifier(
            n_estimators=100,
            random_state=random_state,
            class_weight='balanced',
            n_jobs=-1
        ),
        
        # Boosting
        'Gradient_Boosting': GradientBoostingClassifier(
            n_estimators=100,
            random_state=random_state,
            learning_rate=0.1
        ),
        
        # SVM (smaller C for faster training)
        'SVM_RBF': SVC(
            kernel='rbf',
            random_state=random_state,
            class_weight='balanced',
            probability=True,
            C=1.0
        ),
        
        # Ensemble
        'Bagging': BaggingClassifier(
            estimator=DecisionTreeClassifier(class_weight='balanced'),
            n_estimators=50,
            random_state=random_state,
            n_jobs=-1
        )
    }
    
    print(f"Defined {len(models)} models")
    return models

# Get models
ml_models = get_ml_models(random_state=42)

Defined 9 models


In [7]:
# Safe training function

def safe_train_evaluate(name, model, X_train, y_train, X_val, y_val, task_name):
    """Safe training with robust error handling"""
    print(f"Training {name} for {task_name}...")
    
    start_time = time.time()
    
    try:
        # Train model
        model.fit(X_train, y_train)
        
        # Predictions
        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)
        
        # Calculate metrics safely
        train_precision, train_recall, train_f1, _ = precision_recall_fscore_support(
            y_train, y_train_pred, average='binary', zero_division=0
        )
        val_precision, val_recall, val_f1, _ = precision_recall_fscore_support(
            y_val, y_val_pred, average='binary', zero_division=0
        )
        
        # Try AUC calculation safely
        auc_score = None
        try:
            if hasattr(model, 'predict_proba'):
                y_val_proba = model.predict_proba(X_val)[:, 1]
                # Check for valid probabilities
                if (not np.any(np.isnan(y_val_proba)) and 
                    not np.any(np.isinf(y_val_proba)) and
                    len(np.unique(y_val)) > 1):  # Need both classes for AUC
                    auc_score = roc_auc_score(y_val, y_val_proba)
        except Exception as e:
            pass  # AUC failed, that's okay
        
        training_time = time.time() - start_time
        
        auc_str = f"{auc_score:.3f}" if auc_score is not None else "N/A"
        print(f"{name}: F1={val_f1:.3f}, AUC={auc_str}, Time={training_time:.1f}s")
        
        return {
            'model': model,
            'training_time': training_time,
            'train_precision': train_precision,
            'train_recall': train_recall,
            'train_f1': train_f1,
            'val_precision': val_precision,
            'val_recall': val_recall,
            'val_f1': val_f1,
            'val_auc': auc_score
        }
        
    except Exception as e:
        print(f"  {name} failed: {e}")
        return None

In [8]:
# Train only fast models first

def train_fast_models_only(data_splits):
    """Train only the fastest, most reliable models"""
    print("\n⚡ Training fast models only...")
    print("=" * 50)
    
    # Define only fast, reliable models
    fast_models = {
        'Logistic_Regression': LogisticRegression(
            random_state=42, 
            max_iter=1000,
            class_weight='balanced'
        ),
        
        'Naive_Bayes': GaussianNB(),
        
        'Decision_Tree': DecisionTreeClassifier(
            random_state=42,
            class_weight='balanced',
            max_depth=5  # Shallower tree
        )
    }
    
    results = {'prominence': {}, 'boundary': {}}
    X_train = data_splits['X_train_scaled']
    X_val = data_splits['X_val_scaled']
    
    for task_name, y_train_key, y_val_key in [
        ('prominence', 'y_prominence_train', 'y_prominence_val'),
        ('boundary', 'y_boundary_train', 'y_boundary_val')
    ]:
        print(f"\n{task_name.upper()}:")
        
        y_train = data_splits[y_train_key]
        y_val = data_splits[y_val_key]
        
        for model_name, model in fast_models.items():
            try:
                print(f"  Training {model_name}...", end="")
                
                # Simple training
                model.fit(X_train, y_train)
                y_pred = model.predict(X_val)
                
                # Calculate F1
                from sklearn.metrics import f1_score
                f1 = f1_score(y_val, y_pred, zero_division=0)
                
                results[task_name][model_name] = {'val_f1': f1, 'model': model}
                print(f" F1={f1:.3f} ")
                
            except Exception as e:
                print(f" Failed: {e}")
    
    return results

# Train fast models
all_results = train_fast_models_only(data_splits)


⚡ Training fast models only...

PROMINENCE:
  Training Logistic_Regression... F1=0.475 
  Training Naive_Bayes... F1=0.450 
  Training Decision_Tree... F1=0.437 

BOUNDARY:
  Training Logistic_Regression... F1=0.143 
  Training Naive_Bayes... F1=0.010 
  Training Decision_Tree... F1=0.113 


In [9]:
# Add Random Forest

def add_random_forest(data_splits, all_results):
    """Add Random Forest without parallel processing"""
    print("Adding Random Forest...")
    
    # Single-threaded Random Forest (safer on M4 Mac)
    rf_model = RandomForestClassifier(
        n_estimators=50,  # Fewer trees for speed
        random_state=42,
        class_weight='balanced',
        n_jobs=1,  # Single thread to avoid crashes
        max_depth=10
    )
    
    X_train = data_splits['X_train_scaled']
    X_val = data_splits['X_val_scaled']
    
    for task_name, y_train_key, y_val_key in [
        ('prominence', 'y_prominence_train', 'y_prominence_val'),
        ('boundary', 'y_boundary_train', 'y_boundary_val')
    ]:
        print(f"{task_name}...", end="")
        
        y_train = data_splits[y_train_key]
        y_val = data_splits[y_val_key]
        
        try:
            rf_model.fit(X_train, y_train)
            y_pred = rf_model.predict(X_val)
            
            from sklearn.metrics import f1_score, precision_score, recall_score
            f1 = f1_score(y_val, y_pred, zero_division=0)
            precision = precision_score(y_val, y_pred, zero_division=0)
            recall = recall_score(y_val, y_pred, zero_division=0)
            
            all_results[task_name]['Random_Forest'] = {
                'val_f1': f1,
                'val_precision': precision,
                'val_recall': recall,
                'model': rf_model,
                'feature_importance': rf_model.feature_importances_
            }
            
            print(f" F1={f1:.3f}")
            
        except Exception as e:
            print(f" Failed: {e}")
    
    return all_results

# Add Random Forest
all_results = add_random_forest(data_splits, all_results)

Adding Random Forest...
prominence... F1=0.486
boundary... F1=0.170


In [10]:
# Quick feature importance analysis

def show_feature_importance(all_results):
    """Show feature importance from Random Forest"""
    feature_names = ['F0', 'Energy', 'Spectral_Centroid'] + [f'MFCC_{i+1}' for i in range(13)]
    
    for task_name in ['prominence', 'boundary']:
        if 'Random_Forest' in all_results[task_name]:
            print(f"\n{task_name.upper()} Feature Importance:")
            
            importance = all_results[task_name]['Random_Forest']['feature_importance']
            
            # Sort features by importance
            feature_importance = list(zip(feature_names, importance))
            feature_importance.sort(key=lambda x: x[1], reverse=True)
            
            print("Top 8 features:")
            for i, (feature, imp) in enumerate(feature_importance[:8]):
                bar = "█" * int(20 * imp / feature_importance[0][1])
                print(f"    {i+1}. {feature:<18} {imp:.4f} {bar}")

# Show feature importance
show_feature_importance(all_results)


PROMINENCE Feature Importance:
Top 8 features:
    1. Energy             0.3281 ████████████████████
    2. F0                 0.1478 █████████
    3. MFCC_1             0.1263 ███████
    4. Spectral_Centroid  0.1020 ██████
    5. MFCC_2             0.0527 ███
    6. MFCC_3             0.0384 ██
    7. MFCC_6             0.0344 ██
    8. MFCC_5             0.0314 █

BOUNDARY Feature Importance:
Top 8 features:
    1. MFCC_1             0.1235 ████████████████████
    2. Energy             0.1140 ██████████████████
    3. MFCC_3             0.0878 ██████████████
    4. MFCC_2             0.0834 █████████████
    5. F0                 0.0739 ███████████
    6. MFCC_5             0.0654 ██████████
    7. Spectral_Centroid  0.0627 ██████████
    8. MFCC_7             0.0567 █████████


In [11]:
# Test set evaluation

def evaluate_best_models(all_results, data_splits):
    """Evaluate best models on test set"""
    print("\nTEST SET EVALUATION")
    print("=" * 40)
    
    X_test = data_splits['X_test_scaled']
    
    for task_name in ['prominence', 'boundary']:
        print(f"\n{task_name.upper()}:")
        
        # Find best model
        task_results = all_results[task_name]
        best_model_name = max(task_results.keys(), key=lambda k: task_results[k]['val_f1'])
        best_model = task_results[best_model_name]['model']
        best_f1 = task_results[best_model_name]['val_f1']
        
        print(f"  Best model: {best_model_name} (Val F1: {best_f1:.3f})")
        
        # Test set evaluation
        y_test_key = f'y_{task_name}_test'
        y_test = data_splits[y_test_key]
        
        y_test_pred = best_model.predict(X_test)
        
        from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
        
        test_f1 = f1_score(y_test, y_test_pred, zero_division=0)
        test_precision = precision_score(y_test, y_test_pred, zero_division=0)
        test_recall = recall_score(y_test, y_test_pred, zero_division=0)
        cm = confusion_matrix(y_test, y_test_pred)
        
        print(f"  Test F1:        {test_f1:.3f}")
        print(f"  Test Precision: {test_precision:.3f}")
        print(f"  Test Recall:    {test_recall:.3f}")
        print(f"  Confusion Matrix: TN={cm[0,0]}, FP={cm[0,1]}, FN={cm[1,0]}, TP={cm[1,1]}")

# Evaluate on test set
evaluate_best_models(all_results, data_splits)


TEST SET EVALUATION

PROMINENCE:
  Best model: Random_Forest (Val F1: 0.486)
  Test F1:        0.044
  Test Precision: 0.043
  Test Recall:    0.046
  Confusion Matrix: TN=41706, FP=11501, FN=10690, TP=512

BOUNDARY:
  Best model: Random_Forest (Val F1: 0.170)
  Test F1:        0.131
  Test Precision: 0.083
  Test Recall:    0.314
  Confusion Matrix: TN=50223, FP=11018, FN=2173, TP=995


In [12]:
# Investigate the overfitting issue

def diagnose_overfitting(all_results, data_splits):
    """Diagnose why test performance is so different"""
    print("OVERFITTING DIAGNOSIS")
    print("=" * 40)
    
    # Check data distribution across splits
    print("\nData Distribution Check:")
    for split_name in ['train', 'val', 'test']:
        X_key = f'X_{split_name}_scaled' if split_name != 'train' else 'X_train_scaled'
        y_prom_key = f'y_prominence_{split_name}'
        y_bound_key = f'y_boundary_{split_name}'
        
        X = data_splits[X_key]
        y_prom = data_splits[y_prom_key]
        y_bound = data_splits[y_bound_key]
        
        print(f"  {split_name.upper()}:")
        print(f"    Frames: {len(y_prom):,}")
        print(f"    Prominence rate: {y_prom.mean():.3f}")
        print(f"    Boundary rate: {y_bound.mean():.3f}")
        print(f"    Feature mean: {X.mean():.3f}")
        print(f"    Feature std: {X.std():.3f}")
    
    # Check if splits are from different speakers/files
    print(f"\n Split Information:")
    total_files = len(cleaned_data)
    train_files = int(0.7 * total_files)
    val_files = int(0.15 * total_files)
    test_files = total_files - train_files - val_files
    
    print(f"  Train files: {train_files} (files 0-{train_files-1})")
    print(f"  Val files: {val_files} (files {train_files}-{train_files+val_files-1})")
    print(f"  Test files: {test_files} (files {train_files+val_files}-{total_files-1})")
    
    # Show file IDs in each split
    print(f"\nFile IDs by split:")
    for i, file_data in enumerate(cleaned_data):
        if i < train_files:
            split = "TRAIN"
        elif i < train_files + val_files:
            split = "VAL"
        else:
            split = "TEST"
        print(f"  {split}: {file_data['file_id']}")

# Run diagnosis
diagnose_overfitting(all_results, data_splits)

OVERFITTING DIAGNOSIS

Data Distribution Check:
  TRAIN:
    Frames: 296,833
    Prominence rate: 0.174
    Boundary rate: 0.055
    Feature mean: 0.000
    Feature std: 1.000
  VAL:
    Frames: 58,803
    Prominence rate: 0.180
    Boundary rate: 0.051
    Feature mean: -0.069
    Feature std: 1.021
  TEST:
    Frames: 64,409
    Prominence rate: 0.174
    Boundary rate: 0.049
    Feature mean: 0.001
    Feature std: 1.033

 Split Information:
  Train files: 99 (files 0-98)
  Val files: 21 (files 99-119)
  Test files: 22 (files 120-141)

File IDs by split:
  TRAIN: f1arrlp7
  TRAIN: f1arrlp6
  TRAIN: f1arrlp4
  TRAIN: f1arrlp5
  TRAIN: f1arrlp1
  TRAIN: f1arrlp2
  TRAIN: f1arrlp3
  TRAIN: f1atrlp3
  TRAIN: f1atrlp2
  TRAIN: f1atrlp6
  TRAIN: f1atrlp7
  TRAIN: f1atrlp5
  TRAIN: f1atrlp4
  TRAIN: f1ajrlp5
  TRAIN: f1ajrlp4
  TRAIN: f1ajrlp6
  TRAIN: f1ajrlp3
  TRAIN: f1ajrlp2
  TRAIN: f1ajrlp1
  TRAIN: f1aprlp2
  TRAIN: f1aprlp3
  TRAIN: f1aprlp4
  TRAIN: m1brrlp3
  TRAIN: m1brrlp2
  TR

In [13]:
# Try a much simpler Random Forest
simple_rf = RandomForestClassifier(
    n_estimators=10,  # Much fewer trees
    random_state=42,
    class_weight='balanced',
    max_depth=3,      # Very shallow
    min_samples_split=100,  # Require more samples to split
    min_samples_leaf=50,    # Larger leaf nodes
    n_jobs=1
)

print("Testing simpler Random Forest...")
X_train = data_splits['X_train_scaled']
X_test = data_splits['X_test_scaled']

for task_name, y_train_key, y_test_key in [
    ('prominence', 'y_prominence_train', 'y_prominence_test'),
    ('boundary', 'y_boundary_train', 'y_boundary_test')
]:
    y_train = data_splits[y_train_key]
    y_test = data_splits[y_test_key]
    
    simple_rf.fit(X_train, y_train)
    y_pred = simple_rf.predict(X_test)
    
    from sklearn.metrics import f1_score
    test_f1 = f1_score(y_test, y_pred, zero_division=0)
    
    print(f"  {task_name}: Test F1 = {test_f1:.3f}")

Testing simpler Random Forest...
  prominence: Test F1 = 0.471
  boundary: Test F1 = 0.123


In [14]:
# Check with cross-validation to see real performance
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

print("Cross-validation check...")
X_all = np.vstack([data_splits['X_train_scaled'], data_splits['X_val_scaled']])

for task_name, y_train_key, y_val_key in [
    ('prominence', 'y_prominence_train', 'y_prominence_val'),
    ('boundary', 'y_boundary_train', 'y_boundary_val')
]:
    y_all = np.hstack([data_splits[y_train_key], data_splits[y_val_key]])
    
    # Simple logistic regression with CV
    lr = LogisticRegression(random_state=42, class_weight='balanced', max_iter=1000)
    cv_scores = cross_val_score(lr, X_all, y_all, cv=5, scoring='f1')
    
    print(f"  {task_name} CV F1: {cv_scores.mean():.3f} ± {cv_scores.std():.3f}")

Cross-validation check...
  prominence CV F1: 0.480 ± 0.031
  boundary CV F1: 0.133 ± 0.011
