# Baseline Model for Subword Unit Duration Prediction

This notebook implements and evaluates the baseline model for subword unit duration prediction.

In [1]:
import os
import sys
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Add project root to path to import project modules
sys.path.insert(0, os.path.abspath('..'))

from src.data_processing import load_data, extract_phone_data, calculate_speaking_rate, preprocess_data, calculate_phone_statistics
from src.features import extract_features
from src.models import BaselineModel
from src.evaluations import evaluate_model, plot_predictions, plot_error_distribution
from config import Config

# Set plot style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)


## 1. Load and Preprocess Data

In [2]:
# Define data directories
native_dir = '../data/american_english'
nonnative_dir = '../data/other_english'

# Load the data
native_data = load_data(native_dir)
nonnative_data = load_data(nonnative_dir)

print(f"Loaded {len(native_data)} native speaker utterances")
print(f"Loaded {len(nonnative_data)} non-native speaker utterances")

# Initialize configuration
config = Config()
config.native_only = True  # Use only native data for training
config.smooth_factor = 1.0  # Laplace smoothing factor


File ../data/american_english\american_english\anonymous-20080425-atw_a0028\result.json has non-dict 'result': <class 'str'>
File ../data/american_english\american_english\anonymous-20080702-oqm_a0387\result.json has non-dict 'result': <class 'str'>
File ../data/american_english\american_english\anonymous-20090917-vwy_b0305\result.json has non-dict 'result': <class 'str'>
File ../data/american_english\american_english\anonymous-20100515-nfo_a0217\result.json has non-dict 'result': <class 'str'>
File ../data/american_english\american_english\anonymous-20100515-nfo_a0224\result.json has non-dict 'result': <class 'str'>
File ../data/american_english\american_english\anonymous-20100820-ebd_b0103\result.json has non-dict 'result': <class 'str'>
File ../data/american_english\american_english\anonymous-20101011-qrd_a0385\result.json has non-dict 'result': <class 'str'>
File ../data/american_english\american_english\anonymous-20111004-fxb_b0014\result.json has non-dict 'result': <class 'str'>


Loaded 28417 native speaker utterances
Loaded 4736 non-native speaker utterances


In [3]:
# Preprocess data
train_data, val_data, test_data = preprocess_data(
    native_data, 
    nonnative_data,
    native_only=config.native_only,
    config=config
)

print(f"Training data: {len(train_data)} phones")
print(f"Validation data: {len(val_data)} phones")
print(f"Test data: {len(test_data)} phones")

Training data: 825020 phones
Validation data: 176790 phones
Test data: 378609 phones


## 2. Extract Features

In [4]:
# Extract features for training, validation, and test sets
X_train, y_train = extract_features(train_data, config)
X_val, y_val = extract_features(val_data, config)
X_test, y_test = extract_features(test_data, config)

print(f"Training features: {X_train.shape}")
print(f"Validation features: {X_val.shape}")
print(f"Test features: {X_test.shape}")

Training features: (825020, 363)
Validation features: (176790, 308)
Test features: (378609, 353)


## 3. Baseline Model Implementation

In [5]:
# Train the baseline model
baseline_model = BaselineModel(smooth_factor=config.smooth_factor)
baseline_model.train(X_train, y_train)

<src.models.BaselineModel at 0x21bcf736f90>

In [6]:
# Get phone statistics from the model
phone_stats = pd.DataFrame([
    {
        'phone': phone,
        'mean': stats['mean'],
        'std': stats['std'],
        'count': stats['count']
    }
    for phone, stats in baseline_model.phone_stats.items()
])

# Sort by count (frequency)
phone_stats = phone_stats.sort_values('count', ascending=False)

# Display top 20 phones by frequency
phone_stats.head(20)

Unnamed: 0,phone,mean,std,count
2,362.0,0.089334,0.126411,825018.0
0,2.0,1.259668,0.089404,1.0
1,6.0,0.149668,0.089404,1.0


## 4. Model Evaluation

In [7]:
# Evaluate on validation set
val_metrics = evaluate_model(baseline_model, X_val, y_val)
print("Validation Metrics:")
print(f"MAE: {val_metrics['mae']:.4f}")
print(f"RMSE: {val_metrics['rmse']:.4f}")
print(f"Correlation: {val_metrics['correlation']:.4f}")

Validation Metrics:
MAE: 0.0546
RMSE: 0.1227
Correlation: nan


  corr, _ = pearsonr(y_true, y_pred)
