## Final Model Comparison

Summarise MLflow runs and evaluate the current best model on the 2024 validation set.

In [1]:
# Setup
from pathlib import Path
import json

import numpy as np
import pandas as pd
import mlflow
from mlflow.tracking import MlflowClient
from IPython.display import display
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from movie_lists import normalize_domestic_titles
from model_utils import (
    prepare_features,
    compute_ranking_metrics,
    get_top10_predictions,
)

# Display options: thousands separators with two decimals for floats in
# DataFrames, and no scientific notation when printing NumPy arrays.
pd.options.display.float_format = lambda x: f'{x:,.2f}'
np.set_printoptions(suppress=True)

# Point MLflow at the local file-based tracking directory.
# The candidates are relative to the notebook's working directory, so the
# lookup covers the notebook being launched either from a sub-folder of the
# project (first entry) or from the folder that contains `mlruns` (second).
_mlruns_candidates = [
    Path('../mlruns'),  # One level above the current directory (most likely)
    Path('mlruns'),     # Current directory
]
# `is_dir()` rather than `exists()`: a stray regular file called `mlruns`
# is not a usable tracking store.
tracking_path = next(
    (candidate.resolve() for candidate in _mlruns_candidates if candidate.is_dir()),
    None,
)
if tracking_path is None:
    raise FileNotFoundError('Unable to locate local mlruns directory.')
print(f"Found MLflow tracking directory: {tracking_path}")

# MLflow expects a URI; `as_uri()` needs the absolute path produced by resolve().
TRACKING_URI = tracking_path.as_uri()
mlflow.set_tracking_uri(TRACKING_URI)

Found MLflow tracking directory: /Users/jasmineplows/Documents/California/Projects/box_office/mlruns


In [2]:
# Load feature dataset
# Candidates are tried in order; the first file that exists on disk is used.
candidate_paths = [
    Path('../data/dataset_domestic_processed_modeling.csv'),
    Path('../data/dataset_domestic_processed.csv'),
]
for data_path in candidate_paths:
    if data_path.exists():
        break
else:
    # Loop finished without a hit: no candidate file is present.
    data_path = None
if data_path is None:
    raise FileNotFoundError('Could not find a processed dataset in ../data/')

# Read the CSV and pass it through the project's title normalisation helper.
df_raw = normalize_domestic_titles(pd.read_csv(data_path))

# prepare_features returns the modelling frame, the feature column names and
# the target column name (unpacked in that order).
df, feature_cols, target = prepare_features(
    df_raw,
    target='revenue_domestic',
    verbose=False,
)

print('📁 Loaded dataset: {} ({} rows)'.format(data_path, len(df)))
print('   Feature columns: {}'.format(len(feature_cols)))
print('   Target column: {}'.format(target))

# Optional scope file describing which studios / languages the dataset covers.
scope_path = Path('../data/dataset_scope.json')
if not scope_path.exists():
    scope_config = {}
else:
    scope_config = json.loads(scope_path.read_text())
    studio_scope = scope_config.get('studio_note', 'all studios')
    language_scope = scope_config.get('language_note', 'all languages')
    print('   Scope: studios={}, languages={}'.format(studio_scope, language_scope))


No title corrections needed
📁 Loaded dataset: ../data/dataset_domestic_processed_modeling.csv (1462 rows)
   Feature columns: 69
   Target column: revenue_domestic
   Scope: studios=all studios, languages=all languages


In [3]:
# Fetch MLflow runs
# Produces two names used by later cells:
#   mlflow_results - one row per FINISHED run with tidy column names
#   best_run       - the row with the lowest RMSE, or None if none can be chosen
experiment_name = 'box_office_modeling'
client = MlflowClient()
mlflow_results = pd.DataFrame()
best_run = None

# Try to get the specific experiment first
experiment = mlflow.get_experiment_by_name(experiment_name)

if experiment is None:
    print(f"⚠️ MLflow experiment '{experiment_name}' not found.")
    # Fall back to any experiment that has at least one finished run
    try:
        all_experiments = client.search_experiments()
        experiments_with_runs = []

        for exp in all_experiments:
            # max_results=1: we only need to know whether a finished run exists
            runs = mlflow.search_runs(
                experiment_ids=[exp.experiment_id],
                filter_string="attributes.status = 'FINISHED'",
                max_results=1
            )
            if not runs.empty:
                experiments_with_runs.append(exp)

        if experiments_with_runs:
            # Use the most recently updated experiment with runs; a missing or
            # None last_update_time is treated as 0 so max() never compares None.
            experiment = max(experiments_with_runs,
                             key=lambda x: getattr(x, 'last_update_time', 0) or 0)
            print(f"   Using experiment '{experiment.name}' instead (has finished runs).")
        else:
            print("   No experiments with finished runs found.")
            print("   💡 Run notebook 4 (modeling) first to generate MLflow runs.")
            experiment = None

    except Exception as e:
        # Best effort: report the problem and continue without an experiment.
        print(f"   Error searching experiments: {e}")
        experiment = None

if experiment is not None:
    runs_df = mlflow.search_runs(
        experiment_ids=[experiment.experiment_id],
        filter_string="attributes.status = 'FINISHED'",
        order_by=['metrics.rmse ASC'],
    )
    if runs_df.empty:
        print('⚠️ No finished MLflow runs found in this experiment.')
        print('   💡 Run notebook 4 (modeling) first to generate MLflow runs.')
    else:
        # Metric names as logged by the modelling notebook (without the
        # 'metrics.' prefix that search_runs adds).
        numeric_cols = [
            'rmse', 'mae', 'mape', 'r2', 'recall_at_10',
            'precision_at_10', 'ndcg_at_10', 'spearman_corr', 'kendall_corr'
        ]
        rename_map = {
            'tags.mlflow.runName': 'run_name',
            'params.model': 'model',
            'params.strategy': 'strategy',
            'params.data_scope_studios': 'data_scope_studios',
            'params.data_scope_language': 'data_scope_language',
        }
        rename_map.update({'metrics.' + name: name for name in numeric_cols})

        # Only keep columns that were actually logged; run_id / artifact_uri are
        # needed later to load the model.
        keep_cols = [col for col in rename_map if col in runs_df.columns] + ['run_id', 'artifact_uri']
        mlflow_results = runs_df[keep_cols].rename(columns=rename_map).copy()

        for col in numeric_cols:
            if col in mlflow_results.columns:
                mlflow_results[col] = pd.to_numeric(mlflow_results[col], errors='coerce')

        display_cols = ['run_name', 'model', 'strategy', 'rmse', 'mae', 'mape', 'r2',
                        'recall_at_10', 'precision_at_10', 'ndcg_at_10']
        available_display = [c for c in display_cols if c in mlflow_results.columns]

        # Per-column display templates; missing values render as 'n/a'.
        # `formatted` is a string-typed copy for display only - the numeric
        # values stay untouched in `mlflow_results`.
        column_templates = {
            'rmse': '${:,.0f}',
            'mae': '${:,.0f}',
            'mape': '{:.1f}%',
            'recall_at_10': '{:.2%}',
            'precision_at_10': '{:.2%}',
            'ndcg_at_10': '{:.3f}',
        }
        formatted = mlflow_results.copy()
        for col, template in column_templates.items():
            if col in formatted.columns:
                formatted[col] = formatted[col].map(
                    lambda x, template=template: template.format(x) if pd.notna(x) else 'n/a'
                )

        print(f'🏁 Top MLflow runs from experiment "{experiment.name}" (sorted by RMSE):')
        display(formatted[available_display].head(10))

        # Pick the best run only when at least one run has a usable RMSE.
        # Without this guard a missing 'rmse' column raises KeyError, and an
        # all-NaN column would crown an arbitrary run as "best".
        if 'rmse' in mlflow_results.columns and mlflow_results['rmse'].notna().any():
            # sort_values puts NaN last, so row 0 is the lowest finite RMSE.
            best_run = mlflow_results.sort_values('rmse').iloc[0]
            print('🥇 Best run: {} (RMSE ${:,.0f})'.format(best_run.get('run_name'), best_run.get('rmse')))
        else:
            print('⚠️ No finished run has an RMSE metric; unable to select a best run.')
else:
    print('❌ No MLflow experiments found with finished runs.')
    print('   💡 Run notebook 4 (modeling) first to generate MLflow runs.')

🏁 Top MLflow runs from experiment "box_office_modeling" (sorted by RMSE):


Unnamed: 0,run_name,model,rmse,mae,mape,r2,recall_at_10,precision_at_10,ndcg_at_10
0,LightGBM | No Pandemic Era,LGBMRegressor,"$76,372,525","$30,964,249",162.7%,0.45,70.00%,70.00%,0.762
1,LightGBM | No Pandemic Era,LGBMRegressor,"$76,372,525","$30,964,249",162.7%,0.45,70.00%,70.00%,0.762
2,LightGBM | All Eras (2015-2023),LGBMRegressor,"$76,411,707","$30,049,196",159.2%,0.45,70.00%,70.00%,0.655
3,LightGBM | All Eras (2015-2023),LGBMRegressor,"$76,411,707","$30,049,196",159.2%,0.45,70.00%,70.00%,0.655
4,LightGBM | All Eras (2015-2023),LGBMRegressor,"$76,828,355","$32,286,430",165.7%,0.46,70.00%,70.00%,0.739
5,LightGBM | All Eras (2015-2023),LGBMRegressor,"$76,828,355","$32,286,430",165.7%,0.46,70.00%,70.00%,0.739
6,LightGBM | All Eras (2015-2023),LGBMRegressor,"$76,828,355","$32,286,430",165.7%,0.46,70.00%,,0.739
7,LightGBM | All Eras (2015-2023),LGBMRegressor,"$76,828,355","$32,286,430",165.7%,0.46,,,
8,LightGBM | All Eras (2015-2023),LGBMRegressor,"$76,828,355","$32,286,430",165.7%,0.46,,,
9,DecisionTree | Weighted (30% pandemic era),DecisionTreeRegressor,"$77,668,814","$32,192,350",202.0%,0.43,20.00%,20.00%,0.586


🥇 Best run: LightGBM | No Pandemic Era (RMSE $76,372,525)


### Evaluate best MLflow model on 2024 validation set

In [4]:
# Load the best run's model and score it on rows with release_year == 2024.
# NOTE(review): in the saved output the metrics printed by this cell are
# identical to the MLflow-logged metrics of the selected run, which suggests
# the runs were ranked on this same 2024 data - confirm. If so, these numbers
# are not an independent hold-out estimate.
best_model = None
if best_run is None:
    print('❌ No MLflow run available to evaluate.')
else:
    run_id = best_run.get('run_id')
    model_uri = 'runs:/{}/model'.format(run_id)

    # Prefer the native sklearn flavor; fall back to the generic pyfunc flavor.
    # Both failures are recorded so the warning below is actually reachable
    # (previously a failing fallback raised straight out of the cell).
    load_errors = []
    try:
        best_model = mlflow.sklearn.load_model(model_uri)
    except Exception as sklearn_exc:
        load_errors.append('sklearn: {}'.format(sklearn_exc))
        try:
            best_model = mlflow.pyfunc.load_model(model_uri)
        except Exception as pyfunc_exc:
            load_errors.append('pyfunc: {}'.format(pyfunc_exc))

    if best_model is None:
        print('⚠️ Unable to load model artifacts for run {}'.format(run_id))
        for message in load_errors:
            print('   {}'.format(message))
    else:
        val_2024 = df[df['release_year'] == 2024].copy()
        val_2024 = val_2024.dropna(subset=[target])
        if val_2024.empty:
            print('⚠️ No 2024 rows available for validation.')
        else:
            X_val = val_2024[feature_cols]
            actual = val_2024[target]

            # Flatten to 1-D so the result can be assigned as a column even if
            # the loaded model returns a 2-D array or a one-column frame.
            preds_log = np.asarray(best_model.predict(X_val), dtype=float).ravel()
            # Presumably the model was trained on log1p(revenue), hence expm1
            # to get back to dollars - TODO confirm against the modelling notebook.
            preds = np.expm1(preds_log)
            val_2024['predicted_revenue'] = preds

            rmse = float(np.sqrt(mean_squared_error(actual, preds)))
            mae = float(mean_absolute_error(actual, preds))
            # MAPE is undefined where the actual value is 0; exclude those rows
            # instead of letting a division by zero turn the mean into inf.
            abs_pct_error = ((actual - preds) / actual).abs()
            mape = float(abs_pct_error[actual != 0].mean() * 100)
            r2 = float(r2_score(actual, preds))
            ranking_metrics = compute_ranking_metrics(
                val_2024, preds, target_col=target, title_col='title', k=10
            )

            print('🎯 2024 validation metrics:')
            print('   RMSE: ${:,.0f}'.format(rmse))
            print('   MAE:  ${:,.0f}'.format(mae))
            print('   MAPE: {:.1f}%'.format(mape))
            print('   R²:   {:.3f}'.format(r2))
            # Ranking metrics are printed only when the helper returned them.
            ranking_lines = (
                ('recall_at_10', '   Recall@10: {:.2%}'),
                ('precision_at_10', '   Precision@10: {:.2%}'),
                ('ndcg_at_10', '   NDCG@10: {:.3f}'),
            )
            for metric_key, line_template in ranking_lines:
                if metric_key in ranking_metrics:
                    print(line_template.format(ranking_metrics[metric_key]))

            top10_2024 = get_top10_predictions(best_model, df, 2024, feature_cols)
            if top10_2024 is not None:
                # String-formatted copy for display; top10_2024 keeps raw values.
                formatted = top10_2024.copy()
                if 'predicted_revenue' in formatted.columns:
                    formatted['predicted_revenue'] = formatted['predicted_revenue'].map(lambda x: f'${x:,.0f}')
                if 'actual_revenue' in formatted.columns:
                    formatted['actual_revenue'] = formatted['actual_revenue'].map(lambda x: f'${x:,.0f}')
                if 'prediction_error_pct' in formatted.columns:
                    formatted['prediction_error_pct'] = formatted['prediction_error_pct'].map(lambda x: f'{x:+.1f}%')
                print('Top 10 predicted vs actual for 2024 ({}):'.format(best_run.get('run_name')))
                # Show only the columns the helper actually returned, matching
                # the optional handling above instead of raising KeyError.
                table_cols = ['title', 'predicted_revenue', 'actual_revenue', 'prediction_error_pct']
                display(formatted[[c for c in table_cols if c in formatted.columns]])

                # How many of the model's top 10 are in the true top 10 by revenue.
                actual_top10 = val_2024.nlargest(10, target)['title'].tolist()
                predicted_top10 = top10_2024['title'].tolist()
                overlap = len(set(actual_top10) & set(predicted_top10))
                print('Overlap with actual 2024 top 10: {}/10'.format(overlap))


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

🎯 2024 validation metrics:
   RMSE: $76,372,525
   MAE:  $30,964,249
   MAPE: 162.7%
   R²:   0.449
   Recall@10: 70.00%
   Precision@10: 70.00%
   NDCG@10: 0.762
Top 10 predicted vs actual for 2024 (LightGBM | No Pandemic Era):


Unnamed: 0,title,predicted_revenue,actual_revenue,prediction_error_pct
0,Mufasa: The Lion King,"$302,616,525","$254,567,693",-18.9%
1,Deadpool & Wolverine,"$235,356,888","$636,745,858",+63.0%
2,Moana 2,"$215,698,430","$460,405,297",+53.2%
3,Twisters,"$195,459,321","$267,762,265",+27.0%
4,Venom: The Last Dance,"$174,895,145","$139,755,882",-25.1%
5,Sonic the Hedgehog 3,"$169,848,233","$236,115,100",+28.1%
6,Despicable Me 4,"$159,471,229","$361,004,205",+55.8%
7,Kingdom of the Planet of the Apes,"$144,463,389","$171,130,165",+15.6%
8,Inside Out 2,"$124,054,858","$652,980,194",+81.0%
9,Godzilla x Kong: The New Empire,"$108,316,683","$196,350,016",+44.8%


Overlap with actual 2024 top 10: 7/10
