# 04 - Machine Learning Model Training
## Year Prediction from Audio Features

### Models to Train:
1. Linear Regression (baseline)
2. Ridge Regression
3. Lasso Regression
4. Random Forest Regressor
5. Gradient Boosting (XGBoost, LightGBM)
6. Support Vector Regression (SVR)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import time
import json

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import xgboost as xgb
import lightgbm as lgb

import joblib
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")

Libraries imported successfully!


## 1. Load Data

In [2]:
X_train = np.load('data/splits/X_train.npy')
X_val = np.load('data/splits/X_val.npy')
X_test = np.load('data/splits/X_test.npy')

y_train = np.load('data/splits/y_train.npy')
y_val = np.load('data/splits/y_val.npy')
y_test = np.load('data/splits/y_test.npy')

print(f"Full features - Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")
print(f"Target range: {y_train.min()} - {y_train.max()}")

Full features - Train: (394074, 90), Val: (69543, 90), Test: (51514, 90)
Target range: 1922 - 2011


In [3]:
X_train_full = np.vstack([X_train, X_val])
y_train_full = np.concatenate([y_train, y_val])

print(f"Combined training set: {X_train_full.shape}")

Combined training set: (463617, 90)


## 2. Helper Functions

In [4]:
def evaluate_model(model, X_train, y_train, X_val, y_val, X_test, y_test):
    """
    Evaluate model on train, validation, and test sets.
    Returns a dictionary of metrics.
    """
    results = {}
    
    for name, X, y in [('train', X_train, y_train), ('val', X_val, y_val), ('test', X_test, y_test)]:
        y_pred = model.predict(X)
        results[f'{name}_mse'] = mean_squared_error(y, y_pred)
        results[f'{name}_rmse'] = np.sqrt(results[f'{name}_mse'])
        results[f'{name}_mae'] = mean_absolute_error(y, y_pred)
        results[f'{name}_r2'] = r2_score(y, y_pred)
    
    return results

def print_results(results, model_name):
    """Print formatted results."""
    print(f"\n{'='*60}")
    print(f"{model_name} Results")
    print(f"{'='*60}")
    print(f"{'Set':<10} {'MSE':<12} {'RMSE':<10} {'MAE':<10} {'R²':<10}")
    print("-" * 60)
    for set_name in ['train', 'val', 'test']:
        print(f"{set_name.capitalize():<10} {results[f'{set_name}_mse']:<12.4f} {results[f'{set_name}_rmse']:<10.4f} {results[f'{set_name}_mae']:<10.4f} {results[f'{set_name}_r2']:<10.4f}")

In [5]:
all_results = {}

## 3. Baseline Model - Linear Regression

In [6]:
print("Training Linear Regression...")
start_time = time.time()

lr = LinearRegression()
lr.fit(X_train, y_train)

train_time = time.time() - start_time
results_lr = evaluate_model(lr, X_train, y_train, X_val, y_val, X_test, y_test)
results_lr['train_time'] = train_time

print_results(results_lr, "Linear Regression")
print(f"Training time: {train_time:.2f}s")

all_results['Linear Regression'] = results_lr
joblib.dump(lr, 'models/ml/linear_regression.joblib')

Training Linear Regression...

Linear Regression Results
Set        MSE          RMSE       MAE        R²        
------------------------------------------------------------
Train      90.6327      9.5201     6.7767     0.2430    
Val        90.3554      9.5055     6.7759     0.2470    
Test       88.0707      9.3846     6.7168     0.2477    
Training time: 2.06s


['models/ml/linear_regression.joblib']

## 4. Ridge Regression

In [7]:
print("Training Ridge Regression...")
start_time = time.time()

alphas = [0.01, 0.1, 1.0, 10.0, 100.0]
best_ridge = None
best_val_rmse = float('inf')

for alpha in alphas:
    ridge = Ridge(alpha=alpha)
    ridge.fit(X_train, y_train)
    val_pred = ridge.predict(X_val)
    val_rmse = np.sqrt(mean_squared_error(y_val, val_pred))
    
    if val_rmse < best_val_rmse:
        best_val_rmse = val_rmse
        best_ridge = ridge
        best_alpha = alpha

train_time = time.time() - start_time
results_ridge = evaluate_model(best_ridge, X_train, y_train, X_val, y_val, X_test, y_test)
results_ridge['train_time'] = train_time
results_ridge['best_alpha'] = best_alpha

print_results(results_ridge, "Ridge Regression")
print(f"Best alpha: {best_alpha}")
print(f"Training time: {train_time:.2f}s")

all_results['Ridge Regression'] = results_ridge
joblib.dump(best_ridge, 'models/ml/ridge_regression.joblib')

Training Ridge Regression...

Ridge Regression Results
Set        MSE          RMSE       MAE        R²        
------------------------------------------------------------
Train      90.6327      9.5201     6.7767     0.2430    
Val        90.3554      9.5055     6.7759     0.2470    
Test       88.0707      9.3846     6.7168     0.2477    
Best alpha: 0.01
Training time: 2.52s


['models/ml/ridge_regression.joblib']

## 5. Lasso Regression

In [8]:
print("Training Lasso Regression...")
start_time = time.time()

alphas = [0.001, 0.01, 0.1, 1.0]
best_lasso = None
best_val_rmse = float('inf')

for alpha in alphas:
    lasso = Lasso(alpha=alpha, max_iter=10000)
    lasso.fit(X_train, y_train)
    val_pred = lasso.predict(X_val)
    val_rmse = np.sqrt(mean_squared_error(y_val, val_pred))
    
    if val_rmse < best_val_rmse:
        best_val_rmse = val_rmse
        best_lasso = lasso
        best_alpha = alpha

train_time = time.time() - start_time
results_lasso = evaluate_model(best_lasso, X_train, y_train, X_val, y_val, X_test, y_test)
results_lasso['train_time'] = train_time
results_lasso['best_alpha'] = best_alpha

print_results(results_lasso, "Lasso Regression")
print(f"Best alpha: {best_alpha}")
print(f"Non-zero coefficients: {np.sum(best_lasso.coef_ != 0)}/{len(best_lasso.coef_)}")
print(f"Training time: {train_time:.2f}s")

all_results['Lasso Regression'] = results_lasso
joblib.dump(best_lasso, 'models/ml/lasso_regression.joblib')

Training Lasso Regression...

Lasso Regression Results
Set        MSE          RMSE       MAE        R²        
------------------------------------------------------------
Train      90.6535      9.5212     6.7753     0.2429    
Val        90.3532      9.5054     6.7741     0.2470    
Test       88.1139      9.3869     6.7168     0.2473    
Best alpha: 0.01
Non-zero coefficients: 85/90
Training time: 10.32s


['models/ml/lasso_regression.joblib']

## 6. Random Forest Regressor

In [9]:
print("Training Random Forest...")
start_time = time.time()

rf = RandomForestRegressor(
    n_estimators=200,
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='sqrt',
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train, y_train)

train_time = time.time() - start_time
results_rf = evaluate_model(rf, X_train, y_train, X_val, y_val, X_test, y_test)
results_rf['train_time'] = train_time

print_results(results_rf, "Random Forest")
print(f"Training time: {train_time:.2f}s")

all_results['Random Forest'] = results_rf
joblib.dump(rf, 'models/ml/random_forest.joblib')

Training Random Forest...

Random Forest Results
Set        MSE          RMSE       MAE        R²        
------------------------------------------------------------
Train      33.9222      5.8243     4.1442     0.7167    
Val        84.9271      9.2156     6.6443     0.2922    
Test       83.3969      9.1322     6.6023     0.2876    
Training time: 162.74s


['models/ml/random_forest.joblib']

## 7. XGBoost

In [10]:
print("Training XGBoost...")
start_time = time.time()

xgb_model = xgb.XGBRegressor(
    n_estimators=500,
    max_depth=8,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=3,
    reg_alpha=0.1,
    reg_lambda=1.0,
    random_state=42,
    n_jobs=-1,
    early_stopping_rounds=50
)

xgb_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=False
)

train_time = time.time() - start_time
results_xgb = evaluate_model(xgb_model, X_train, y_train, X_val, y_val, X_test, y_test)
results_xgb['train_time'] = train_time
results_xgb['best_iteration'] = xgb_model.best_iteration

print_results(results_xgb, "XGBoost")
print(f"Best iteration: {xgb_model.best_iteration}")
print(f"Training time: {train_time:.2f}s")

all_results['XGBoost'] = results_xgb
xgb_model.save_model('models/ml/xgboost.json')

Training XGBoost...



XGBoost Results
Set        MSE          RMSE       MAE        R²        
------------------------------------------------------------
Train      41.8692      6.4706     4.5995     0.6503    
Val        75.7336      8.7025     6.1021     0.3688    
Test       74.5410      8.6337     6.0898     0.3633    
Best iteration: 499
Training time: 48.06s


## 8. LightGBM

In [11]:
print("Training LightGBM...")
start_time = time.time()

lgb_model = lgb.LGBMRegressor(
    n_estimators=500,
    max_depth=10,
    learning_rate=0.1,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=1.0,
    random_state=42,
    n_jobs=-1,
    verbosity=-1
)

lgb_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)]
)

train_time = time.time() - start_time
results_lgb = evaluate_model(lgb_model, X_train, y_train, X_val, y_val, X_test, y_test)
results_lgb['train_time'] = train_time
results_lgb['best_iteration'] = lgb_model.best_iteration_

print_results(results_lgb, "LightGBM")
print(f"Best iteration: {lgb_model.best_iteration_}")
print(f"Training time: {train_time:.2f}s")

all_results['LightGBM'] = results_lgb
joblib.dump(lgb_model, 'models/ml/lightgbm.joblib')

Training LightGBM...

LightGBM Results
Set        MSE          RMSE       MAE        R²        
------------------------------------------------------------
Train      67.6788      8.2267     5.8343     0.4347    
Val        77.7675      8.8186     6.1782     0.3519    
Test       76.8188      8.7646     6.1584     0.3438    
Best iteration: 500
Training time: 16.41s


['models/ml/lightgbm.joblib']

## 9. Model Comparison

In [13]:
comparison_data = []
for model_name, results in all_results.items():
    comparison_data.append({
        'Model': model_name,
        'Train RMSE': results['train_rmse'],
        'Val RMSE': results['val_rmse'],
        'Test RMSE': results['test_rmse'],
        'Test MAE': results['test_mae'],
        'Test R²': results['test_r2'],
        'Train Time (s)': results['train_time']
    })

comparison_df = pd.DataFrame(comparison_data).sort_values('Test RMSE')
print("\nModel Comparison (sorted by Test RMSE):")
print(comparison_df.to_string(index=False))


Model Comparison (sorted by Test RMSE):
            Model  Train RMSE  Val RMSE  Test RMSE  Test MAE  Test R²  Train Time (s)
          XGBoost    6.470639  8.702507   8.633715  6.089814 0.363259       48.060873
         LightGBM    8.226712  8.818586   8.764633  6.158441 0.343802       16.409223
    Random Forest    5.824279  9.215589   9.132188  6.602323 0.287611      162.736211
Linear Regression    9.520119  9.505543   9.384602  6.716830 0.247686        2.060175
 Ridge Regression    9.520119  9.505543   9.384602  6.716830 0.247686        2.516867
 Lasso Regression    9.521212  9.505431   9.386899  6.716760 0.247318       10.323794


In [14]:
fig = px.bar(
    comparison_df,
    x='Model',
    y=['Train RMSE', 'Val RMSE', 'Test RMSE'],
    barmode='group',
    title='Model Comparison - RMSE',
    labels={'value': 'RMSE (years)', 'variable': 'Dataset'}
)
fig.update_layout(template='plotly_white')
fig.write_html('reports/figures/20_ml_model_comparison_rmse.html')
fig.show()

In [15]:
fig = px.bar(
    comparison_df,
    x='Model',
    y='Test R²',
    title='Model Comparison - R² Score',
    color='Test R²',
    color_continuous_scale='Viridis'
)
fig.update_layout(template='plotly_white')
fig.write_html('reports/figures/21_ml_model_comparison_r2.html')
fig.show()

In [16]:
fig = px.bar(
    comparison_df,
    x='Model',
    y='Train Time (s)',
    title='Model Comparison - Training Time',
    color='Train Time (s)',
    color_continuous_scale='Reds'
)
fig.update_layout(template='plotly_white')
fig.write_html('reports/figures/22_ml_training_time.html')
fig.show()

## 11. Best Model Predictions

In [17]:
best_model_name = comparison_df.iloc[0]['Model']
print(f"Best ML Model: {best_model_name}")

if best_model_name == 'XGBoost':
    best_model = xgb.XGBRegressor()
    best_model.load_model('models/ml/xgboost.json')
elif best_model_name == 'LightGBM':
    best_model = joblib.load('models/ml/lightgbm.joblib')
elif best_model_name == 'Random Forest':
    best_model = joblib.load('models/ml/random_forest.joblib')
elif best_model_name == 'Gradient Boosting':
    best_model = joblib.load('models/ml/gradient_boosting.joblib')
else:
    best_model = joblib.load(f'models/ml/{best_model_name.lower().replace(" ", "_")}.joblib')

Best ML Model: XGBoost


In [18]:
y_test_pred = best_model.predict(X_test)

sample_size = 5000
sample_idx = np.random.choice(len(y_test), sample_size, replace=False)

fig = px.scatter(
    x=y_test[sample_idx],
    y=y_test_pred[sample_idx],
    title=f'{best_model_name} - Predicted vs Actual Year',
    labels={'x': 'Actual Year', 'y': 'Predicted Year'},
    opacity=0.5
)

min_val = min(y_test.min(), y_test_pred.min())
max_val = max(y_test.max(), y_test_pred.max())
fig.add_trace(go.Scatter(
    x=[min_val, max_val],
    y=[min_val, max_val],
    mode='lines',
    name='Perfect Prediction',
    line=dict(color='red', dash='dash')
))

fig.update_layout(template='plotly_white')
fig.write_html('reports/figures/23_best_ml_predictions.html')
fig.show()

In [19]:
residuals = y_test - y_test_pred

fig = px.histogram(
    x=residuals,
    nbins=100,
    title=f'{best_model_name} - Residual Distribution',
    labels={'x': 'Residual (Actual - Predicted)', 'y': 'Frequency'}
)
fig.add_vline(x=0, line_dash='dash', line_color='red')
fig.update_layout(template='plotly_white')
fig.write_html('reports/figures/24_best_ml_residuals.html')
fig.show()

print(f"\nResidual Statistics:")
print(f"Mean: {residuals.mean():.4f}")
print(f"Std: {residuals.std():.4f}")
print(f"Min: {residuals.min():.4f}")
print(f"Max: {residuals.max():.4f}")


Residual Statistics:
Mean: 0.1291
Std: 8.6328
Min: -65.5057
Max: 33.2781


## 12. Save Results

In [20]:
comparison_df.to_csv('reports/metrics/04_ml_model_comparison.csv', index=False)
print("Model comparison saved to reports/metrics/04_ml_model_comparison.csv")

Model comparison saved to reports/metrics/04_ml_model_comparison.csv


In [21]:
best_model_info = {
    'model_name': best_model_name,
    'test_rmse': float(comparison_df.iloc[0]['Test RMSE']),
    'test_mae': float(comparison_df.iloc[0]['Test MAE']),
    'test_r2': float(comparison_df.iloc[0]['Test R²'])
}

with open('reports/metrics/best_ml_model.json', 'w') as f:
    json.dump(best_model_info, f, indent=2)

print(f"\nBest ML Model: {best_model_name}")
print(f"Test RMSE: {best_model_info['test_rmse']:.4f} years")
print(f"Test MAE: {best_model_info['test_mae']:.4f} years")
print(f"Test R²: {best_model_info['test_r2']:.4f}")


Best ML Model: XGBoost
Test RMSE: 8.6337 years
Test MAE: 6.0898 years
Test R²: 0.3633
