# 06 - Model Evaluation and Comparison
## Year Prediction from Audio Features

### Objectives
1. Load all trained models
2. Compare ML and DL models
3. Analyze model performance in detail
4. Create ensemble predictions
5. Generate final evaluation metrics

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import json

import joblib
import xgboost as xgb
import tensorflow as tf
from tensorflow import keras

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")

Libraries imported successfully!


## 1. Load Data and Models

In [4]:
# Read the feature matrices and target vectors for each split from data/splits.
X_train, X_val, X_test = map(np.load, (
    'data/splits/X_train.npy',
    'data/splits/X_val.npy',
    'data/splits/X_test.npy',
))
y_train, y_val, y_test = map(np.load, (
    'data/splits/y_train.npy',
    'data/splits/y_val.npy',
    'data/splits/y_test.npy',
))

# Quick sanity check on what will be evaluated.
test_shape = X_test.shape
print(f"Test set: {test_shape}")
print(f"Target range: {y_test.min()} - {y_test.max()}")

Test set: (51514, 90)
Target range: 1924 - 2010


In [5]:
def _load_xgboost_regressor(model_path):
    """Build an empty XGBRegressor and restore its state from `model_path`."""
    regressor = xgb.XGBRegressor()
    regressor.load_model(model_path)
    return regressor


# One row per classical model: (key in ml_models, label used in messages, file, loader).
# The key is what later cells use to look up predictions; the label only affects
# the printed text (e.g. key 'Ridge' is reported as "Ridge Regression").
# NOTE: joblib.load unpickles arbitrary objects - only point it at trusted files.
ml_model_specs = [
    ('Linear Regression', 'Linear Regression', 'models/ml/linear_regression.joblib', joblib.load),
    ('Ridge', 'Ridge Regression', 'models/ml/ridge_regression.joblib', joblib.load),
    ('Lasso', 'Lasso Regression', 'models/ml/lasso_regression.joblib', joblib.load),
    ('Random Forest', 'Random Forest', 'models/ml/random_forest.joblib', joblib.load),
    ('XGBoost', 'XGBoost', 'models/ml/xgboost.json', _load_xgboost_regressor),
    ('LightGBM', 'LightGBM', 'models/ml/lightgbm.joblib', joblib.load),
    ('Gradient Boosting', 'Gradient Boosting', 'models/ml/gradient_boosting.joblib', joblib.load),
]

ml_models = {}

for model_key, model_label, model_path, model_loader in ml_model_specs:
    try:
        ml_models[model_key] = model_loader(model_path)
    except FileNotFoundError:
        # Missing artifacts are expected when a model was never trained; keep going.
        print(f"{model_label} not found")
    except Exception as load_error:
        # Anything else (version mismatch, corrupt file, missing dependency, ...) is
        # still non-fatal, but report the real cause instead of claiming "not found".
        print(f"{model_label} could not be loaded ({type(load_error).__name__}: {load_error})")
    else:
        print(f"Loaded: {model_label}")

print(f"\nTotal ML models loaded: {len(ml_models)}")

Loaded: Linear Regression
Loaded: Ridge Regression
Loaded: Lasso Regression
Loaded: Random Forest
Loaded: XGBoost
Loaded: LightGBM
Gradient Boosting not found

Total ML models loaded: 6


In [6]:
# Saved Keras models to evaluate: display name -> file path.
dl_model_paths = {
    'Simple MLP': 'models/dl/simple_mlp_final.keras',
    'Deep NN': 'models/dl/deep_nn_final.keras',
    'Wide and Deep': 'models/dl/wide_deep_final.keras',
    'ResNet': 'models/dl/resnet_final.keras',
}

dl_models = {}

for dl_name, dl_path in dl_model_paths.items():
    try:
        dl_models[dl_name] = keras.models.load_model(dl_path)
    except FileNotFoundError:
        # A missing file just means this architecture is skipped in the comparison.
        print(f"{dl_name} not found")
    except Exception as load_error:
        # Still best-effort, but surface the actual reason (incompatible Keras
        # version, custom layers, corrupt archive, ...) rather than hiding it.
        print(f"{dl_name} could not be loaded ({type(load_error).__name__}: {load_error})")
    else:
        print(f"Loaded: {dl_name}")

print(f"\nTotal DL models loaded: {len(dl_models)}")

Loaded: Simple MLP
Loaded: Deep NN
Loaded: Wide and Deep
Loaded: ResNet

Total DL models loaded: 4


## 2. Generate Predictions

In [7]:
# Test-set predictions for every loaded model, keyed by model name.
predictions = {}

# Classical estimators: predict() output is stored as returned.
for ml_name, estimator in ml_models.items():
    ml_output = estimator.predict(X_test)
    predictions[ml_name] = ml_output
    print(f"Generated predictions: {ml_name}")

# Keras models: silence the progress bar and flatten the output to 1-D.
for dl_name, network in dl_models.items():
    dl_output = network.predict(X_test, verbose=0)
    predictions[dl_name] = dl_output.flatten()
    print(f"Generated predictions: {dl_name}")

n_prediction_sets = len(predictions)
print(f"\nTotal predictions: {n_prediction_sets}")

Generated predictions: Linear Regression
Generated predictions: Ridge
Generated predictions: Lasso
Generated predictions: Random Forest
Generated predictions: XGBoost
Generated predictions: LightGBM
Generated predictions: Simple MLP
Generated predictions: Deep NN
Generated predictions: Wide and Deep
Generated predictions: ResNet

Total predictions: 10


## 3. Calculate Metrics

In [8]:
# One record per model with its test-set error metrics.
results = []

for model_name, test_pred in predictions.items():
    squared_error = mean_squared_error(y_test, test_pred)
    record = {
        'Model': model_name,
        # Anything that is not in the classical-model registry came from dl_models.
        'Type': 'ML' if model_name in ml_models else 'DL',
        'MSE': squared_error,
        'RMSE': np.sqrt(squared_error),
        'MAE': mean_absolute_error(y_test, test_pred),
        'R²': r2_score(y_test, test_pred),
    }
    results.append(record)

# Best (lowest RMSE) model first.
results_df = pd.DataFrame(results)
results_df = results_df.sort_values('RMSE')

print("\nAll Models - Test Set Metrics (sorted by RMSE):")
print(results_df.to_string(index=False))


All Models - Test Set Metrics (sorted by RMSE):
            Model Type        MSE      RMSE      MAE       R²
          XGBoost   ML  74.541039  8.633715 6.089814 0.363259
       Simple MLP   DL  75.797401  8.706170 6.048491 0.352527
         LightGBM   ML  76.818789  8.764633 6.158441 0.343802
           ResNet   DL  78.201385  8.843155 6.198777 0.331992
    Random Forest   ML  83.396859  9.132188 6.602323 0.287611
Linear Regression   ML  88.070745  9.384602 6.716830 0.247686
            Ridge   ML  88.070746  9.384602 6.716830 0.247686
            Lasso   ML  88.113879  9.386899 6.716760 0.247318
          Deep NN   DL 106.637131 10.326526 7.348140 0.089089
    Wide and Deep   DL 107.956291 10.390202 8.498646 0.077821


## 4. Visualize Comparison

In [9]:
# Bar chart of test RMSE per model, coloured by model family.
rmse_bar_options = dict(
    x='Model',
    y='RMSE',
    color='Type',
    title='All Models Comparison - RMSE',
    labels={'RMSE': 'RMSE (years)'},
    color_discrete_map={'ML': '#636EFA', 'DL': '#EF553B'},
)
fig = px.bar(results_df, **rmse_bar_options)
fig.update_layout(xaxis_tickangle=-45, template='plotly_white')

# Persist an interactive copy, then render inline.
fig.write_html('reports/figures/33_all_models_rmse.html')
fig.show()

In [10]:
# Bar chart of test R² per model, coloured by model family.
r2_bar_options = dict(
    x='Model',
    y='R²',
    color='Type',
    title='All Models Comparison - R² Score',
    color_discrete_map={'ML': '#636EFA', 'DL': '#EF553B'},
)
fig = px.bar(results_df, **r2_bar_options)
fig.update_layout(xaxis_tickangle=-45, template='plotly_white')

# Persist an interactive copy, then render inline.
fig.write_html('reports/figures/34_all_models_r2.html')
fig.show()

In [11]:
def _minmax_score(column, higher_is_better):
    """Rescale a metric column to [0, 1] so that 1 always marks the best model.

    Error metrics (lower is better) are inverted after scaling. When every model
    has the same value the span is zero; return 1.0 for all of them instead of
    dividing by zero and plotting NaN radii.
    """
    lowest, highest = column.min(), column.max()
    span = highest - lowest
    if not span > 0:  # zero span (all tied) or NaN span (no rows)
        return pd.Series(1.0, index=column.index)
    scaled = (column - lowest) / span
    return scaled if higher_is_better else 1 - scaled


# The five best models by RMSE (results_df is already sorted ascending).
top_models = results_df.head(5)['Model'].tolist()

# Normalise once against ALL models (not per loop iteration), indexed by model name.
metrics_by_model = results_df.set_index('Model')
radar_scores = pd.DataFrame({
    'RMSE (inverted)': _minmax_score(metrics_by_model['RMSE'], higher_is_better=False),
    'MAE (inverted)': _minmax_score(metrics_by_model['MAE'], higher_is_better=False),
    'R²': _minmax_score(metrics_by_model['R²'], higher_is_better=True),
})
radar_axes = list(radar_scores.columns)

fig = go.Figure()

for model in top_models:
    model_scores = radar_scores.loc[model].tolist()
    fig.add_trace(go.Scatterpolar(
        # Repeat the first point so the polygon closes.
        r=model_scores + model_scores[:1],
        theta=radar_axes + radar_axes[:1],
        fill='toself',
        name=model
    ))

fig.update_layout(
    polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
    title='Top 5 Models - Radar Chart',
    template='plotly_white'
)
fig.write_html('reports/figures/35_top_models_radar.html')
fig.show()

## 5. Detailed Analysis of Best Model

In [12]:
# results_df is sorted by RMSE, so its first row is the best-scoring model.
best_row = results_df.iloc[0]
best_model_name = best_row['Model']
best_predictions = predictions[best_model_name]

print(f"Best Model: {best_model_name}")
print("Test RMSE: {:.4f} years".format(best_row['RMSE']))
print("Test MAE: {:.4f} years".format(best_row['MAE']))
print("Test R²: {:.4f}".format(best_row['R²']))

Best Model: XGBoost
Test RMSE: 8.6337 years
Test MAE: 6.0898 years
Test R²: 0.3633


In [13]:
# Plot a random subset so the interactive figure stays responsive.
# - Cap the sample at the test-set size: choice(..., replace=False) raises if asked
#   for more points than exist.
# - Use a dedicated seeded generator so Restart & Run All reproduces the same figure
#   (sample_idx is reused by the residual plot in the next cell).
sample_size = min(5000, len(y_test))
scatter_rng = np.random.default_rng(42)
sample_idx = scatter_rng.choice(len(y_test), size=sample_size, replace=False)

fig = px.scatter(
    x=y_test[sample_idx],
    y=best_predictions[sample_idx],
    title=f'Best Model ({best_model_name}) - Predicted vs Actual',
    labels={'x': 'Actual Year', 'y': 'Predicted Year'},
    opacity=0.5
)

# Diagonal reference line: points on it are perfect predictions. The range covers
# the full test set, not only the sampled points.
min_val = min(y_test.min(), best_predictions.min())
max_val = max(y_test.max(), best_predictions.max())
fig.add_trace(go.Scatter(
    x=[min_val, max_val],
    y=[min_val, max_val],
    mode='lines',
    name='Perfect',
    line=dict(color='red', dash='dash')
))

fig.update_layout(template='plotly_white')
fig.write_html('reports/figures/36_best_model_scatter.html')
fig.show()

In [14]:
# Signed error of the best model: positive means the model predicted too early a year.
residuals = y_test - best_predictions

fig = make_subplots(rows=1, cols=2, subplot_titles=['Residual Distribution', 'Residuals vs Predicted'])

# Left panel: histogram over the full test set.
residual_histogram = go.Histogram(x=residuals, nbinsx=100, name='Residuals')
fig.add_trace(residual_histogram, row=1, col=1)

# Right panel: residuals against predictions, restricted to the rows in sample_idx.
residual_scatter = go.Scatter(
    x=best_predictions[sample_idx],
    y=residuals[sample_idx],
    mode='markers',
    name='Residuals',
    opacity=0.5,
)
fig.add_trace(residual_scatter, row=1, col=2)
fig.add_hline(y=0, line_dash='dash', line_color='red', row=1, col=2)

fig.update_layout(
    title=f'{best_model_name} - Residual Analysis',
    template='plotly_white',
    showlegend=False,
)
fig.write_html('reports/figures/37_residual_analysis.html')
fig.show()

# Summary statistics; skewness and kurtosis come from pandas, mean and std from numpy.
residual_series = pd.Series(residuals)
residual_stats = {
    'Mean': residuals.mean(),
    'Std': residuals.std(),
    'Skewness': residual_series.skew(),
    'Kurtosis': residual_series.kurtosis(),
}
print("\nResidual Statistics:")
for stat_label, stat_value in residual_stats.items():
    print(f"{stat_label}: {stat_value:.4f}")


Residual Statistics:
Mean: 0.1291
Std: 8.6328
Skewness: -1.1703
Kurtosis: 3.4857


In [15]:
# Per-sample absolute error of the best model alongside the true year.
error_by_year = pd.DataFrame({
    'year': y_test,
    'prediction': best_predictions,
    'error': np.abs(residuals)
})

# Aggregate per actual year: mean absolute error, its spread, and the sample count.
yearly_performance = (
    error_by_year
    .groupby('year')['error']
    .agg(mae='mean', std='std', count='count')
    .reset_index()
)

# Marker size encodes how many test samples each year has.
mae_scatter_options = dict(
    x='year',
    y='mae',
    size='count',
    title='Mean Absolute Error by Year',
    labels={'year': 'Actual Year', 'mae': 'MAE (years)', 'count': 'Sample Count'},
)
fig = px.scatter(yearly_performance, **mae_scatter_options)
fig.update_layout(template='plotly_white')
fig.write_html('reports/figures/38_mae_by_year.html')
fig.show()

## 6. Ensemble Model

In [16]:
# Choose ensemble members on the VALIDATION split. Picking them by test-set RMSE
# would leak test information into model selection and make the ensemble's
# reported test metrics optimistically biased.
val_rmse_all = {}
for candidate, estimator in ml_models.items():
    val_rmse_all[candidate] = np.sqrt(mean_squared_error(y_val, estimator.predict(X_val)))
for candidate, network in dl_models.items():
    # Keras output is flattened to 1-D, matching how test predictions were generated.
    val_rmse_all[candidate] = np.sqrt(
        mean_squared_error(y_val, network.predict(X_val, verbose=0).flatten())
    )

# Three lowest validation RMSEs (fewer if fewer models were loaded).
top_3_models = sorted(val_rmse_all, key=val_rmse_all.get)[:3]
print(f"Top 3 models for ensemble: {top_3_models}")

# Unweighted mean of the selected models' test predictions.
ensemble_pred = np.mean([predictions[m] for m in top_3_models], axis=0)

# The test set is used only here, for the final unbiased evaluation.
ensemble_rmse = np.sqrt(mean_squared_error(y_test, ensemble_pred))
ensemble_mae = mean_absolute_error(y_test, ensemble_pred)
ensemble_r2 = r2_score(y_test, ensemble_pred)

print(f"\nEnsemble (Simple Average) Results:")
print(f"RMSE: {ensemble_rmse:.4f} years")
print(f"MAE: {ensemble_mae:.4f} years")
print(f"R²: {ensemble_r2:.4f}")

Top 3 models for ensemble: ['XGBoost', 'Simple MLP', 'LightGBM']

Ensemble (Simple Average) Results:
RMSE: 8.5263 years
MAE: 5.9742 years
R²: 0.3790


In [17]:
# Validation-set predictions for each ensemble member (Keras outputs flattened to 1-D).
val_predictions = {
    member: (
        ml_models[member].predict(X_val)
        if member in ml_models
        else dl_models[member].predict(X_val, verbose=0).flatten()
    )
    for member in top_3_models
}

# Validation RMSE per member.
val_rmse = {
    member: np.sqrt(mean_squared_error(y_val, member_pred))
    for member, member_pred in val_predictions.items()
}

# Inverse-RMSE weighting: lower validation error -> larger weight; weights sum to 1.
total_inv_rmse = sum(1/member_rmse for member_rmse in val_rmse.values())
weights = {
    member: (1/member_rmse)/total_inv_rmse
    for member, member_rmse in val_rmse.items()
}

print("Ensemble weights:")
for member, member_weight in weights.items():
    print(f"  {member}: {member_weight:.4f}")

# Weighted sum of the members' test-set predictions.
weighted_terms = [predictions[member] * weights[member] for member in top_3_models]
weighted_ensemble_pred = np.sum(weighted_terms, axis=0)

weighted_rmse = np.sqrt(mean_squared_error(y_test, weighted_ensemble_pred))
weighted_mae = mean_absolute_error(y_test, weighted_ensemble_pred)
weighted_r2 = r2_score(y_test, weighted_ensemble_pred)

print("\nWeighted Ensemble Results:")
print(f"RMSE: {weighted_rmse:.4f} years")
print(f"MAE: {weighted_mae:.4f} years")
print(f"R²: {weighted_r2:.4f}")

Ensemble weights:
  XGBoost: 0.3358
  Simple MLP: 0.3328
  LightGBM: 0.3314

Weighted Ensemble Results:
RMSE: 8.5261 years
MAE: 5.9741 years
R²: 0.3790


In [18]:
# Rows for the two ensembles, in the same column layout as results_df.
# MSE is recovered by squaring the RMSE computed earlier.
ensemble_results = [
    {
        'Model': ensemble_label,
        'Type': 'Ensemble',
        'MSE': rmse_value**2,
        'RMSE': rmse_value,
        'MAE': mae_value,
        'R²': r2_value,
    }
    for ensemble_label, rmse_value, mae_value, r2_value in (
        ('Ensemble (Average)', ensemble_rmse, ensemble_mae, ensemble_r2),
        ('Ensemble (Weighted)', weighted_rmse, weighted_mae, weighted_r2),
    )
]

# Append to the single-model results and re-rank everything by RMSE.
ensemble_df = pd.DataFrame(ensemble_results)
final_results_df = pd.concat([results_df, ensemble_df]).sort_values('RMSE')

print("\nFinal Results with Ensembles:")
print(final_results_df.to_string(index=False))


Final Results with Ensembles:
              Model     Type        MSE      RMSE      MAE       R²
Ensemble (Weighted) Ensemble  72.693874  8.526070 5.974105 0.379038
 Ensemble (Average) Ensemble  72.698598  8.526347 5.974249 0.378997
            XGBoost       ML  74.541039  8.633715 6.089814 0.363259
         Simple MLP       DL  75.797401  8.706170 6.048491 0.352527
           LightGBM       ML  76.818789  8.764633 6.158441 0.343802
             ResNet       DL  78.201385  8.843155 6.198777 0.331992
      Random Forest       ML  83.396859  9.132188 6.602323 0.287611
  Linear Regression       ML  88.070745  9.384602 6.716830 0.247686
              Ridge       ML  88.070746  9.384602 6.716830 0.247686
              Lasso       ML  88.113879  9.386899 6.716760 0.247318
            Deep NN       DL 106.637131 10.326526 7.348140 0.089089
      Wide and Deep       DL 107.956291 10.390202 8.498646 0.077821


## 7. Save Results

In [19]:
# Write the ranked comparison table (without the DataFrame index) to disk.
comparison_csv = 'reports/metrics/06_final_model_comparison.csv'
final_results_df.to_csv(comparison_csv, index=False)
print(f"Final results saved to {comparison_csv}")

Final results saved to reports/metrics/06_final_model_comparison.csv


In [20]:
# Store the weighted ensemble's members, weights and test metrics as JSON.
ensemble_config = dict(
    models=top_3_models,
    weights=weights,
    weighted_rmse=weighted_rmse,
    weighted_mae=weighted_mae,
    weighted_r2=weighted_r2,
)

with open('models/ensemble_config.json', 'w') as config_file:
    json.dump(ensemble_config, config_file, indent=2)

print("Ensemble configuration saved")

Ensemble configuration saved


In [21]:
# Side-by-side table of the true year and the three headline prediction vectors.
prediction_columns = {
    'actual': y_test,
    'best_model': best_predictions,
    'ensemble_avg': ensemble_pred,
    'ensemble_weighted': weighted_ensemble_pred,
}
predictions_df = pd.DataFrame(prediction_columns)

predictions_df.to_csv('reports/test_predictions.csv', index=False)
print("Test predictions saved")

Test predictions saved


## 8. Final Summary

In [22]:
# RMSE bar chart for all single models plus the two ensembles.
final_bar_options = dict(
    x='Model',
    y='RMSE',
    color='Type',
    title='Final Model Comparison - RMSE (Including Ensembles)',
    labels={'RMSE': 'RMSE (years)'},
    color_discrete_map={'ML': '#636EFA', 'DL': '#EF553B', 'Ensemble': '#00CC96'},
)
fig = px.bar(final_results_df, **final_bar_options)
fig.update_layout(xaxis_tickangle=-45, template='plotly_white')

# Persist an interactive copy, then render inline.
fig.write_html('reports/figures/39_final_comparison.html')
fig.show()

In [23]:
# Plain-text report: dataset facts, the five best rows, then the overall winner.
banner = "=" * 70

print("\n" + banner)
print("FINAL MODEL EVALUATION SUMMARY")
print(banner)
print("\nDataset: Year Prediction MSD")
print(f"Test samples: {len(y_test)}")
print(f"Target range: {y_test.min()} - {y_test.max()}")

# Fixed-width table: name left-aligned in 25 chars, metrics right-aligned in 10.
print("\n{:<25} {:>10} {:>10} {:>10}".format('Model', 'RMSE', 'MAE', 'R²'))
print("-" * 55)
for _, top_row in final_results_df.head(5).iterrows():
    print("{:<25} {:>10.4f} {:>10.4f} {:>10.4f}".format(
        top_row['Model'], top_row['RMSE'], top_row['MAE'], top_row['R²']
    ))

# final_results_df is sorted by RMSE, so the first row is the overall best.
best_overall = final_results_df.iloc[0]
print(f"\nBest Model: {best_overall['Model']}")
print(f"  - RMSE: {best_overall['RMSE']:.4f} years")
print(f"  - MAE: {best_overall['MAE']:.4f} years")
print(f"  - R²: {best_overall['R²']:.4f}")
print(banner)


FINAL MODEL EVALUATION SUMMARY

Dataset: Year Prediction MSD
Test samples: 51514
Target range: 1924 - 2010

Model                           RMSE        MAE         R²
-------------------------------------------------------
Ensemble (Weighted)           8.5261     5.9741     0.3790
Ensemble (Average)            8.5263     5.9742     0.3790
XGBoost                       8.6337     6.0898     0.3633
Simple MLP                    8.7062     6.0485     0.3525
LightGBM                      8.7646     6.1584     0.3438

Best Model: Ensemble (Weighted)
  - RMSE: 8.5261 years
  - MAE: 5.9741 years
  - R²: 0.3790
