# 07 - Final Model and Predictions
## Year Prediction from Audio Features

### Objectives
1. Load the trained models, the scaler, and the ensemble configuration
2. Create prediction pipeline
3. Generate final predictions
4. Create submission file
5. Summary and interpretation

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import json

import joblib
import xgboost as xgb
import lightgbm as lgb
import tensorflow as tf
from tensorflow import keras

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import warnings
# Suppress every warning category for the remainder of the kernel session.
# NOTE(review): this filter is global, so warnings raised by any later cell
# are hidden as well; narrow it if a specific warning is the real target.
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")

## 1. Load Models and Scaler

In [2]:
# Restore the scaler object that was persisted with joblib.
scaler = joblib.load('models/ml/standard_scaler.joblib')
print("Scaler loaded")

# The file holds one feature name per line; strip the newline and any
# surrounding whitespace from each entry.
with open('data/processed/feature_names.txt', 'r') as f:
    feature_names = list(map(str.strip, f))
print(f"Feature names loaded: {len(feature_names)} features")

Scaler loaded
Feature names loaded: 90 features


In [3]:

def _load_xgboost():
    """Create an empty XGBRegressor and populate it from the saved JSON model."""
    regressor = xgb.XGBRegressor()
    regressor.load_model('models/ml/xgboost.json')
    return regressor


# Display name -> zero-argument loader. The dict order is the order in which
# models are attempted and reported, and the insertion order of `models`.
# NOTE: joblib files are pickles; only load artifacts produced by this project.
MODEL_LOADERS = {
    'XGBoost': _load_xgboost,
    'LightGBM': lambda: joblib.load('models/ml/lightgbm.joblib'),
    'Random Forest': lambda: joblib.load('models/ml/random_forest.joblib'),
    'Deep NN': lambda: keras.models.load_model('models/dl/deep_nn_final.keras'),
}

# Best-effort loading: a model that fails to load is reported and skipped so
# the remaining models can still be used.
models = {}
for model_label, load_model in MODEL_LOADERS.items():
    try:
        models[model_label] = load_model()
    except Exception as e:
        print(f"{model_label} not found: {e}")
    else:
        print(f"Loaded: {model_label}")

print(f"\nTotal models loaded: {len(models)}")

Loaded: XGBoost
Loaded: LightGBM
Loaded: Random Forest
Loaded: Deep NN

Total models loaded: 4


In [4]:
# Load the optional ensemble definition (member names and per-model weights).
# Only failures that mean "no usable configuration" are handled here: a
# missing/unreadable file (OSError), malformed or undecodable content
# (ValueError, which includes json.JSONDecodeError), or a document without the
# expected structure (KeyError / TypeError). Anything else, such as
# KeyboardInterrupt, propagates instead of being hidden by a bare `except:`.
try:
    with open('models/ensemble_config.json', 'r') as f:
        ensemble_config = json.load(f)
    print("Ensemble configuration loaded")
    print(f"Ensemble models: {ensemble_config['models']}")
    print(f"Weights: {ensemble_config['weights']}")
except (OSError, ValueError, KeyError, TypeError) as e:
    # Downstream code treats None as "no configured weights".
    ensemble_config = None
    print(f"No ensemble configuration found: {e}")

Ensemble configuration loaded
Ensemble models: ['XGBoost', 'Simple MLP', 'LightGBM']
Weights: {'XGBoost': 0.3357882624377155, 'Simple MLP': 0.33284345481940336, 'LightGBM': 0.3313682827428813}


## 2. Create Prediction Pipeline

In [5]:
class YearPredictor:
    """
    Predict a release year from audio (timbre) features.

    Wraps a scaler and a dict of already-loaded models. Inputs are passed
    through ``scaler.transform`` before being handed to any model, so callers
    must supply features in the scaler's input space.

    Args:
        scaler: Object exposing ``transform(X)``.
        models: Mapping of display name -> model exposing ``predict``.
        ensemble_weights: Optional mapping of model name -> weight used by the
            weighted ensemble. A model in ``models`` that has no entry here
            receives the default weight ``1 / len(models)``; entries naming
            models that are not in ``models`` are ignored.
    """

    def __init__(self, scaler, models, ensemble_weights=None):
        self.scaler = scaler
        self.models = models
        self.ensemble_weights = ensemble_weights
        # 12 timbre averages + 78 timbre covariances = 90 input features.
        # 'year' is the prediction target, not an input, so it is not listed.
        self.feature_names = (
            [f'timbre_avg_{i}' for i in range(1, 13)]
            + [f'timbre_cov_{i}' for i in range(1, 79)]
        )

    @staticmethod
    def _is_keras_model(model):
        """Return True when the model's type string mentions keras/tensorflow."""
        type_repr = str(type(model))
        return 'keras' in type_repr or 'tensorflow' in type_repr

    def _predict_scaled(self, model, X_scaled):
        """Run one model on already-scaled features and return its predictions.

        Raises:
            TypeError: If the model has no ``predict`` method.
        """
        if not hasattr(model, 'predict'):
            raise TypeError(
                f"Model of type {type(model).__name__} has no predict() method"
            )
        if self._is_keras_model(model):
            # Keras returns shape (n, 1) and prints a progress bar by default.
            return model.predict(X_scaled, verbose=0).flatten()
        return model.predict(X_scaled)

    def preprocess(self, X):
        """Apply preprocessing (scaling). DataFrames are converted to arrays."""
        if isinstance(X, pd.DataFrame):
            X = X.values
        return self.scaler.transform(X)

    def predict_single_model(self, X, model_name):
        """Get predictions from a single model.

        Raises:
            ValueError: If ``model_name`` is not a loaded model.
            TypeError: If the selected model has no ``predict`` method.
        """
        if model_name not in self.models:
            raise ValueError(f"Model {model_name} not found")

        X_scaled = self.preprocess(X)
        return self._predict_scaled(self.models[model_name], X_scaled)

    def predict_ensemble(self, X, method='weighted'):
        """Get ensemble predictions combined over every model in ``self.models``.

        Args:
            X: Unscaled feature matrix (array-like or DataFrame).
            method: ``'weighted'`` uses ``ensemble_weights`` (normalized to sum
                to 1) when they are set; ``'average'``, any other value, or
                missing weights fall back to the plain mean.

        Raises:
            ValueError: If no models are available.
        """
        if not self.models:
            raise ValueError("No models available for prediction")

        X_scaled = self.preprocess(X)
        predictions = [
            self._predict_scaled(model, X_scaled)
            for model in self.models.values()
        ]

        if method == 'weighted' and self.ensemble_weights:
            default_weight = 1 / len(self.models)
            weights = np.array(
                [self.ensemble_weights.get(name, default_weight) for name in self.models],
                dtype=float,
            )
            return np.average(predictions, axis=0, weights=weights / weights.sum())

        return np.mean(predictions, axis=0)

    def predict(self, X, use_ensemble=True):
        """Main prediction method.

        Uses the ensemble when requested and more than one model is loaded;
        otherwise predicts with the first model in ``self.models``.

        Raises:
            ValueError: If no models are available.
        """
        if not self.models:
            raise ValueError("No models available for prediction")

        if use_ensemble and len(self.models) > 1:
            return self.predict_ensemble(X)

        first_model_name = next(iter(self.models))
        return self.predict_single_model(X, first_model_name)

# Hand the configured per-model weights to the predictor when a configuration
# was loaded; otherwise it receives None.
if ensemble_config:
    weights = ensemble_config['weights']
else:
    weights = None

predictor = YearPredictor(scaler=scaler, models=models, ensemble_weights=weights)
print("Predictor initialized")

Predictor initialized


## 3. Test Predictions

In [6]:
# Load the saved test split (targets first, then features).
y_test = np.load('data/splits/y_test.npy')
X_test = np.load('data/splits/X_test.npy')

# The predictor applies scaler.transform itself, so the features are mapped
# back through the scaler's inverse before being handed to it.
# NOTE(review): presumably X_test was saved already scaled - confirm.
X_test_original = scaler.inverse_transform(X_test)

print(f"Test set size: {len(X_test)}")

Test set size: 51514


In [7]:
# Score every loaded model on the test split, one report line per model.
print("Testing individual models...")
for model_name in models:
    pred = predictor.predict_single_model(X_test_original, model_name)
    mse = mean_squared_error(y_test, pred)
    rmse, mae, r2 = (
        np.sqrt(mse),
        mean_absolute_error(y_test, pred),
        r2_score(y_test, pred),
    )
    print(f"{model_name}: RMSE={rmse:.4f}, MAE={mae:.4f}, R²={r2:.4f}")

Testing individual models...
XGBoost: RMSE=8.6337, MAE=6.0898, R²=0.3633
LightGBM: RMSE=8.7646, MAE=6.1584, R²=0.3438
Random Forest: RMSE=9.1322, MAE=6.6023, R²=0.2876
Deep NN: RMSE=10.3265, MAE=7.3481, R²=0.0891


In [8]:
# Score the combined prediction; `ensemble_pred` is reused by the later cells.
print("\nTesting ensemble...")
ensemble_pred = predictor.predict(X_test_original, use_ensemble=True)

mse = mean_squared_error(y_test, ensemble_pred)
rmse, mae, r2 = (
    np.sqrt(mse),
    mean_absolute_error(y_test, ensemble_pred),
    r2_score(y_test, ensemble_pred),
)
print(f"Ensemble: RMSE={rmse:.4f}, MAE={mae:.4f}, R²={r2:.4f}")


Testing ensemble...
Ensemble: RMSE=8.7983, MAE=6.2358, R²=0.3388


## 4. Example Predictions

In [9]:
# Draw the example rows from a seeded generator so the table is identical on
# every Restart & Run All, and never ask for more rows than the test set has.
EXAMPLE_SEED = 42
example_rng = np.random.default_rng(EXAMPLE_SEED)
n_examples = min(10, len(y_test))
sample_indices = example_rng.choice(len(y_test), n_examples, replace=False)

# Error is actual minus predicted, so a negative value means the model
# predicted a later year than the true one.
example_df = pd.DataFrame({
    'Actual Year': y_test[sample_indices].astype(int),
    'Predicted Year': ensemble_pred[sample_indices].round().astype(int),
    'Error': (y_test[sample_indices] - ensemble_pred[sample_indices]).round(2)
})

print("\nExample Predictions:")
print(example_df.to_string(index=False))


Example Predictions:
 Actual Year  Predicted Year  Error
        2002            1993   9.27
        1997            1999  -1.71
        2005            2005  -0.39
        1999            1999  -0.04
        2005            1999   5.82
        1994            1996  -2.10
        1978            1990 -12.43
        2004            1999   5.38
        1977            1991 -13.56
        2002            2000   1.63


## 5. Generate Final Submission

In [10]:
# One row per test sample: a running id, the true year, the prediction rounded
# to a whole year, and the unrounded prediction.
rounded_years = ensemble_pred.round().astype(int)
submission_df = pd.DataFrame({'id': range(len(y_test))}).assign(
    actual_year=y_test,
    predicted_year=rounded_years,
    predicted_year_raw=ensemble_pred,
)

submission_df.to_csv('reports/submission.csv', index=False)
print("Submission saved to reports/submission.csv")
print(f"Total predictions: {len(submission_df)}")

Submission saved to reports/submission.csv
Total predictions: 51514


## 6. Final Visualizations

In [11]:
# Side-by-side histograms: actual years on the left, predictions on the right.
fig = make_subplots(rows=1, cols=2, subplot_titles=['Actual Years', 'Predicted Years'])

histogram_panels = [('Actual', y_test), ('Predicted', ensemble_pred)]
for panel_col, (trace_name, values) in enumerate(histogram_panels, start=1):
    fig.add_trace(go.Histogram(x=values, name=trace_name, nbinsx=50), row=1, col=panel_col)

fig.update_layout(title='Distribution of Actual vs Predicted Years', template='plotly_white')
fig.write_html('reports/figures/40_final_distribution.html')
fig.show()

In [12]:
# Absolute error per sample, bucketed by the decade of the true year.
errors = np.abs(y_test - ensemble_pred)
decades = (y_test // 10) * 10

# Per-decade mean, standard deviation and sample count of the absolute error.
error_by_decade = (
    pd.DataFrame({'decade': decades, 'error': errors})
    .groupby('decade')['error']
    .agg(mae='mean', std='std', count='count')
    .reset_index()
)

fig = px.bar(
    error_by_decade,
    x='decade',
    y='mae',
    error_y='std',
    title='Mean Absolute Error by Decade',
    labels={'decade': 'Decade', 'mae': 'MAE (years)'},
)
fig.update_layout(template='plotly_white')
fig.write_html('reports/figures/41_error_by_decade.html')
fig.show()

In [13]:
# Floor both the true years and the rounded predictions to their decade.
actual_decades = (y_test // 10) * 10
predicted_decades = (ensemble_pred.round() // 10) * 10

# Each row sums to 1: the share of an actual decade assigned to each
# predicted decade.
confusion = pd.crosstab(
    index=actual_decades,
    columns=predicted_decades,
    rownames=['Actual Decade'],
    colnames=['Predicted Decade'],
    normalize='index',
)

fig = px.imshow(
    confusion,
    labels={'color': 'Proportion'},
    title='Decade Prediction Confusion Matrix (Row-normalized)',
    color_continuous_scale='Blues',
)
fig.update_layout(template='plotly_white')
fig.write_html('reports/figures/42_decade_confusion.html')
fig.show()

## 7. Model Interpretation

In [15]:
# Feature importance is only reported when the XGBoost model was loaded.
if 'XGBoost' in models:
    xgb_model = models['XGBoost']
    importance = xgb_model.feature_importances_

    # Pair each feature name with its importance, most important first.
    importance_table = pd.DataFrame({'feature': feature_names, 'importance': importance})
    importance_df = importance_table.sort_values(by='importance', ascending=False)

    top_20 = importance_df.head(20)
    fig = px.bar(
        top_20,
        x='importance',
        y='feature',
        orientation='h',
        title='XGBoost Feature Importance (Top 20)',
        labels={'importance': 'Importance', 'feature': 'Feature'},
    )
    # 'total ascending' orders the categories so the largest bar is on top.
    fig.update_layout(template='plotly_white', yaxis={'categoryorder': 'total ascending'})
    fig.write_html('reports/figures/43_xgb_importance.html')
    fig.show()

    top_10 = importance_df.head(10)
    print("\nTop 10 Most Important Features:")
    print(top_10.to_string(index=False))


Top 10 Most Important Features:
      feature  importance
 timbre_avg_1    0.074808
 timbre_avg_3    0.037673
 timbre_cov_2    0.033273
 timbre_cov_8    0.026922
 timbre_avg_2    0.025898
timbre_cov_45    0.023344
 timbre_avg_6    0.023221
timbre_cov_51    0.018388
timbre_cov_24    0.017521
timbre_cov_11    0.017099


## 8. Final Summary Report

In [16]:
# Recompute the ensemble metrics from `ensemble_pred` instead of reading the
# notebook-level `rmse` / `mae` / `r2` names: the per-model evaluation loop
# assigns those same names, so their value depends on which cell ran last.
ensemble_rmse = np.sqrt(mean_squared_error(y_test, ensemble_pred))
ensemble_mae = mean_absolute_error(y_test, ensemble_pred)
ensemble_r2 = r2_score(y_test, ensemble_pred)

final_summary = {
    'Dataset': 'Year Prediction MSD',
    # NOTE(review): hard-coded; the recorded split sizes (394074 + 69543 +
    # 51514) sum to 515131, not 515345 - confirm which figure is intended.
    'Total Samples': 515345,
    'Training Samples': len(np.load('data/splits/y_train.npy')),
    'Validation Samples': len(np.load('data/splits/y_val.npy')),
    'Test Samples': len(y_test),
    'Features': 90,
    'Target': 'Release Year (1922-2011)',
    # NOTE(review): hard-coded; in the recorded run XGBoost alone scored a
    # lower test RMSE (8.6337) than the ensemble (8.7983) - confirm the label.
    'Best Model': 'Ensemble',
    'Test RMSE': f'{ensemble_rmse:.4f} years',
    'Test MAE': f'{ensemble_mae:.4f} years',
    'Test R²': f'{ensemble_r2:.4f}',
    'Models in Ensemble': list(models.keys())
}

separator = "=" * 70
print("\n" + separator)
print("FINAL PROJECT SUMMARY")
print(separator)
for key, value in final_summary.items():
    print(f"{key}: {value}")
print(separator)


FINAL PROJECT SUMMARY
Dataset: Year Prediction MSD
Total Samples: 515345
Training Samples: 394074
Validation Samples: 69543
Test Samples: 51514
Features: 90
Target: Release Year (1922-2011)
Best Model: Ensemble
Test RMSE: 8.7983 years
Test MAE: 6.2358 years
Test R²: 0.3388
Models in Ensemble: ['XGBoost', 'LightGBM', 'Random Forest', 'Deep NN']


In [17]:
# numpy arrays cannot be JSON-encoded, so any array value is stored as its
# string form; every other value is written unchanged.
summary_json = {}
for k, v in final_summary.items():
    summary_json[k] = str(v) if isinstance(v, np.ndarray) else v

with open('reports/final_summary.json', 'w') as f:
    json.dump(summary_json, f, indent=2)

print("Final summary saved to reports/final_summary.json")

Final summary saved to reports/final_summary.json
