## Final Model Comparison

Summarise MLflow runs and evaluate the current best model on the 2024 validation set.

In [1]:
# Setup
from pathlib import Path
import json

import numpy as np
import pandas as pd
import mlflow
from mlflow.tracking import MlflowClient
from IPython.display import display
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from movie_lists import normalize_domestic_titles
from model_utils import (
    prepare_features,
    compute_ranking_metrics,
    get_top10_predictions,
)

# Display options: pandas floats get thousands separators and two decimals;
# numpy arrays are printed without scientific notation.
pd.options.display.float_format = lambda x: f'{x:,.2f}'
np.set_printoptions(suppress=True)

# Point MLflow at the local file-based tracking directory.
# Candidates are relative to the notebook's working directory and are tried in
# order; the first one that is an actual directory wins.
_mlruns_candidates = [
    Path('../mlruns'),  # Main project directory (most likely)
    Path('mlruns'),     # Current directory
]
tracking_path = None
for candidate in _mlruns_candidates:
    # is_dir() rather than exists(): a stray file called "mlruns" is not a
    # usable tracking store.
    if candidate.is_dir():
        tracking_path = candidate.resolve()
        print(f"Found MLflow tracking directory: {tracking_path}")
        break
if tracking_path is None:
    raise FileNotFoundError('Unable to locate local mlruns directory.')
# as_uri() requires an absolute path, which resolve() above guarantees.
TRACKING_URI = tracking_path.as_uri()
mlflow.set_tracking_uri(TRACKING_URI)

Found MLflow tracking directory: /Users/jasmineplows/Documents/California/Projects/box_office/mlruns


In [2]:

from pathlib import Path

# Directory holding the processed datasets, relative to the notebook's cwd.
DATA_DIR = Path('../data')
# Fallback datasets, in order of preference; the first one present on disk is
# used when no scope-specific subset can be resolved.
DEFAULT_DATASET_CANDIDATES = [
    DATA_DIR / 'dataset_domestic_processed_modeling.csv',
    DATA_DIR / 'dataset_domestic_processed.csv',
]

def _first_existing(paths):
    for candidate in paths:
        if candidate.exists():
            return candidate
    return None

# First default dataset found on disk, or None when neither candidate exists.
DEFAULT_DATASET_PATH = _first_existing(DEFAULT_DATASET_CANDIDATES)

# Scope-specific dataset files, keyed by (language scope, studio scope, start year).
# The start year in the key matches the first year embedded in the file name.
SUBSET_PATHS = {
    ('english_only', 'all_studios', 2010): DATA_DIR / 'dataset_domestic_processed_english_2010_2026.csv',
    ('english_only', 'all_studios', 2015): DATA_DIR / 'dataset_domestic_processed_english_2015_2026.csv',
    ('all_languages', 'major_only', 2010): DATA_DIR / 'dataset_domestic_processed_major_2010_2026.csv',
    ('all_languages', 'major_only', 2015): DATA_DIR / 'dataset_domestic_processed_major_2015_2026.csv',
}

def infer_year_floor(params):
    """Infer the first training year from a run's parameter mapping.

    The keys ``train_year_min``, ``train_start_year`` and
    ``training_year_start`` are checked in that order; the first value that
    can be read as an integer is returned.

    Args:
        params: Mapping of parameter names to values (MLflow stores them as
            strings). May be ``None`` or empty.

    Returns:
        int: The inferred start year, or ``2015`` when no key yields a
        usable value.
    """
    default_floor = 2015
    if not params:
        return default_floor
    for key in ('train_year_min', 'train_start_year', 'training_year_start'):
        value = params.get(key)
        if value is None:
            continue
        try:
            return int(value)
        except (TypeError, ValueError):
            # TypeError: value is not a scalar (e.g. a list);
            # ValueError: not a plain integer literal - try a float form next.
            pass
        try:
            # Accept float-formatted years such as '2015.0'.
            return int(float(value))
        except (TypeError, ValueError, OverflowError):
            # Unparseable, NaN or infinite: fall through to the next key.
            continue
    return default_floor

def resolve_dataset_path(studio_scope, language_scope, params=None):
    """Pick the dataset file that best matches a run's data scope.

    Candidate keys into ``SUBSET_PATHS`` are built in priority order
    (English subsets, then major-studio subsets, then the literal scope
    combination) and the first candidate whose file exists is returned.

    Args:
        studio_scope: Studio scope label (e.g. ``'all_studios'``,
            ``'major_only'``). Anything that is not a non-empty string
            (``None``, NaN, ...) is treated as ``'all_studios'``.
        language_scope: Language scope label (e.g. ``'english_only'``).
            Anything that is not a non-empty string is treated as
            ``'all_languages'``.
        params: Optional run-parameter mapping passed to
            ``infer_year_floor`` to choose the preferred start year.

    Returns:
        The path of the first matching subset file that exists, otherwise
        ``DEFAULT_DATASET_PATH``.

    Raises:
        FileNotFoundError: If no subset file exists and there is no default
            dataset either.
    """
    def _scope_or_default(value, default):
        # Missing values read from a pandas row are float NaN, which is truthy:
        # `(value or default).lower()` would raise AttributeError on them.
        if isinstance(value, str) and value.strip():
            return value.strip().lower()
        return default

    lang_scope = _scope_or_default(language_scope, 'all_languages')
    studio_scope = _scope_or_default(studio_scope, 'all_studios')

    year_floor = infer_year_floor(params or {})
    # Prefer the file whose start year matches the training window.
    year_pref = [2015, 2010] if year_floor and year_floor > 2010 else [2010, 2015]

    candidates = []

    if lang_scope.startswith('english'):
        # NOTE(review): this branch always tries 2010 before 2015 and ignores
        # year_pref, unlike the major_only branch below - confirm whether that
        # is intentional before changing which dataset gets selected.
        candidates.extend(('english_only', 'all_studios', yr) for yr in [2010, 2015])

    if studio_scope == 'major_only':
        candidates.extend(('all_languages', 'major_only', yr) for yr in year_pref)

    # Literal scope combination, with unknown labels mapped to the broadest scope.
    generic_lang = lang_scope if lang_scope in ('english_only', 'all_languages') else 'all_languages'
    generic_studio = studio_scope if studio_scope in ('all_studios', 'major_only') else 'all_studios'
    candidates.extend((generic_lang, generic_studio, yr) for yr in year_pref)

    # dict.fromkeys removes duplicates while keeping first-seen order.
    for key in dict.fromkeys(candidates):
        path = SUBSET_PATHS.get(key)
        if path and path.exists():
            return path

    if DEFAULT_DATASET_PATH is not None:
        return DEFAULT_DATASET_PATH
    raise FileNotFoundError('No suitable dataset found for the requested scope.')

def load_prepared_dataset(dataset_path):
    """Load a dataset CSV and return it ready for modelling.

    The CSV is read with pandas, passed through ``normalize_domestic_titles``
    and then through ``prepare_features`` with ``revenue_domestic`` as the
    target. A three-line summary is printed.

    Args:
        dataset_path: Location of the CSV file to load.

    Returns:
        tuple: ``(prepared frame, feature column list, target name)`` exactly
        as returned by ``prepare_features``.

    Raises:
        FileNotFoundError: If ``dataset_path`` is ``None``.
    """
    if dataset_path is None:
        raise FileNotFoundError('Dataset path not provided.')

    normalized = normalize_domestic_titles(pd.read_csv(dataset_path))
    prepared_frame, prepared_columns, target_name = prepare_features(
        normalized,
        target='revenue_domestic',
        verbose=False,
    )

    summary_lines = (
        '📁 Loaded dataset: {} ({} rows)'.format(dataset_path, len(prepared_frame)),
        '   Feature columns: {}'.format(len(prepared_columns)),
        '   Target column: {}'.format(target_name),
    )
    for line in summary_lines:
        print(line)
    return prepared_frame, prepared_columns, target_name

# Optional scope metadata stored next to the datasets; an absent file yields
# an empty config.
scope_path = DATA_DIR / 'dataset_scope.json'
scope_config = json.loads(scope_path.read_text()) if scope_path.exists() else {}

# Placeholders that the evaluation cell overwrites once a dataset is loaded.
df, feature_cols, target = None, [], 'revenue_domestic'


In [3]:

# Fetch MLflow runs
# Initialise every name the following steps read, so they are defined even
# when no experiment or run is found.
experiment_name = 'box_office_modeling'
client = MlflowClient()
mlflow_results = pd.DataFrame()
best_run = None
best_run_params = {}
best_run_dataset_path = None

# Try to get the specific experiment first
experiment = mlflow.get_experiment_by_name(experiment_name)

if experiment is None:
    print(f"⚠️ MLflow experiment '{experiment_name}' not found.")
    # Fallback: look for any experiment that has at least one FINISHED run.
    try:
        all_experiments = client.search_experiments()
        experiments_with_runs = []

        for exp in all_experiments:
            # max_results=1 - only existence matters here, not the runs themselves.
            runs = mlflow.search_runs(
                experiment_ids=[exp.experiment_id],
                filter_string="attributes.status = 'FINISHED'",
                max_results=1
            )
            if not runs.empty:
                experiments_with_runs.append(exp)

        if experiments_with_runs:
            # Use the most recently updated experiment with runs; a missing or
            # None last_update_time is ranked as 0.
            experiment = max(
                experiments_with_runs,
                key=lambda x: getattr(x, 'last_update_time', 0) or 0
            )
            print(f"   Using experiment '{experiment.name}' instead (has finished runs).")
        else:
            print("   No experiments with finished runs found.")
            print("   💡 Run notebook 4 (modeling) first to generate MLflow runs.")
            experiment = None

    except Exception as e:
        # Best effort: any failure during the fallback search is reported and
        # treated as "no experiment available".
        print(f"   Error searching experiments: {e}")
        experiment = None

# Summarise the finished runs of the chosen experiment, display them as a
# table, and pick the top-ranked run as `best_run` for later evaluation.
if experiment is not None:
    runs_df = mlflow.search_runs(
        experiment_ids=[experiment.experiment_id],
        filter_string="attributes.status = 'FINISHED'",
        order_by=['metrics.rmse ASC'],
    )
    if runs_df.empty:
        print('⚠️ No finished MLflow runs found in this experiment.')
        print('   💡 Run notebook 4 (modeling) first to generate MLflow runs.')
    else:
        # Map the prefixed search_runs column names to short names.
        rename_map = {
            'tags.mlflow.runName': 'run_name',
            'params.model': 'model',
            'params.strategy': 'strategy',
            'params.data_scope_studios': 'data_scope_studios',
            'params.data_scope_language': 'data_scope_language',
            'tags.dataset.studio_scope': 'tag_studio_scope',
            'tags.dataset.language_scope': 'tag_language_scope',
            'metrics.rmse': 'rmse',
            'metrics.mae': 'mae',
            'metrics.mape': 'mape',
            'metrics.r2': 'r2',
            'metrics.recall_at_10': 'recall_at_10',
            'metrics.precision_at_10': 'precision_at_10',
            'metrics.ndcg_at_10': 'ndcg_at_10',
            'metrics.spearman_corr': 'spearman_corr',
            'metrics.kendall_corr': 'kendall_corr',
        }
        # Keep only the mapped columns that are present, plus the run bookkeeping
        # columns (these three are selected unconditionally).
        keep_cols = [col for col in rename_map if col in runs_df.columns] + ['run_id', 'artifact_uri', 'start_time']
        mlflow_results = runs_df[keep_cols].rename(columns=rename_map).copy()

        numeric_cols = [
            'rmse', 'mae', 'mape', 'r2', 'recall_at_10',
            'precision_at_10', 'ndcg_at_10', 'spearman_corr', 'kendall_corr'
        ]
        if 'start_time' in runs_df.columns:
            # errors='coerce' turns unparseable timestamps into NaT instead of raising.
            mlflow_results['start_time'] = pd.to_datetime(runs_df['start_time'], unit='ms', errors='coerce')
        for col in numeric_cols:
            if col in mlflow_results.columns:
                # Non-numeric metric values become NaN rather than raising.
                mlflow_results[col] = pd.to_numeric(mlflow_results[col], errors='coerce')

        # Ranking priority: Recall@10 (when at least one run logged it), then
        # RMSE, then recency, then simply the first column.
        has_recall = (
            'recall_at_10' in mlflow_results.columns
            and mlflow_results['recall_at_10'].notna().any()
        )
        if has_recall:
            sort_cols = ['recall_at_10']
            sort_dirs = [False]
            if 'rmse' in mlflow_results.columns:
                sort_cols.append('rmse')
                sort_dirs.append(True)
            sort_label = 'Recall@10 (highest first, RMSE tie-breaker)'
        elif 'rmse' in mlflow_results.columns:
            sort_cols = ['rmse']
            sort_dirs = [True]
            sort_label = 'RMSE (lowest first)'
        elif 'start_time' in mlflow_results.columns:
            sort_cols = ['start_time']
            sort_dirs = [False]
            sort_label = 'start time (most recent first)'
        else:
            first_col = mlflow_results.columns[0]
            sort_cols = [first_col]
            sort_dirs = [True]
            sort_label = f"{first_col} (ascending)"
        mlflow_results = mlflow_results.sort_values(sort_cols, ascending=sort_dirs).reset_index(drop=True)

        display_cols = [
            'run_name', 'model', 'strategy', 'data_scope_studios', 'data_scope_language',
            'tag_studio_scope', 'tag_language_scope', 'start_time', 'run_id', 'artifact_uri',
            'rmse', 'mae', 'mape', 'r2', 'recall_at_10', 'precision_at_10', 'ndcg_at_10'
        ]
        available_display = [c for c in display_cols if c in mlflow_results.columns]
        # Format a copy for display; mlflow_results keeps the numeric values.
        formatted = mlflow_results.copy()
        if 'rmse' in formatted.columns:
            formatted['rmse'] = formatted['rmse'].map(lambda x: f'${x:,.0f}' if pd.notna(x) else 'n/a')
        if 'mae' in formatted.columns:
            formatted['mae'] = formatted['mae'].map(lambda x: f'${x:,.0f}' if pd.notna(x) else 'n/a')
        if 'mape' in formatted.columns:
            formatted['mape'] = formatted['mape'].map(lambda x: f'{x:.1f}%' if pd.notna(x) else 'n/a')
        if 'recall_at_10' in formatted.columns:
            formatted['recall_at_10'] = formatted['recall_at_10'].map(lambda x: f'{x:.2%}' if pd.notna(x) else 'n/a')
        if 'precision_at_10' in formatted.columns:
            formatted['precision_at_10'] = formatted['precision_at_10'].map(lambda x: f'{x:.2%}' if pd.notna(x) else 'n/a')
        if 'ndcg_at_10' in formatted.columns:
            formatted['ndcg_at_10'] = formatted['ndcg_at_10'].map(lambda x: f'{x:.3f}' if pd.notna(x) else 'n/a')

        print(f'🏁 Top MLflow runs from experiment "{experiment.name}" (sorted by {sort_label}):')
        # NOTE(review): head(100000) effectively shows every run; consider a
        # smaller, named limit.
        display(formatted[available_display].head(100000))
        if not mlflow_results.empty:
            # The first row after sorting is the best run under the chosen priority.
            best_run = mlflow_results.iloc[0]
            best_run_id = best_run.get('run_id')
            best_run_params = {}
            if best_run_id:
                try:
                    # Full parameter set of the run, used to infer the training start year.
                    run_info = client.get_run(best_run_id)
                    best_run_params = run_info.data.params
                except Exception as exc:
                    print(f"   ⚠️ Unable to fetch parameters for run {best_run_id}: {exc}")
            try:
                best_run_dataset_path = resolve_dataset_path(
                    best_run.get('data_scope_studios'),
                    best_run.get('data_scope_language'),
                    best_run_params,
                )
                print(f"   Dataset selected for evaluation: {best_run_dataset_path}")
            except Exception as exc:
                # Any failure falls back to the default dataset (which may be None).
                print(f"   ⚠️ Unable to resolve dataset path: {exc}")
                best_run_dataset_path = DEFAULT_DATASET_PATH
            scope_text = ''
            if 'data_scope_studios' in best_run and 'data_scope_language' in best_run:
                scope_text = ' (studios: {}, language: {})'.format(
                    best_run.get('data_scope_studios', 'unknown'),
                    best_run.get('data_scope_language', 'unknown')
                )

            # Build the one-line metric summary; RMSE is always listed (as n/a
            # when missing), every other metric only when it has a value.
            metrics_fragments = []
            rmse_val = best_run.get('rmse')
            metrics_fragments.append(
                f"RMSE ${rmse_val:,.0f}" if rmse_val is not None and pd.notna(rmse_val) else 'RMSE n/a'
            )
            r2_val = best_run.get('r2')
            if r2_val is not None and pd.notna(r2_val):
                metrics_fragments.append(f'R² {r2_val:.3f}')
            ndcg_val = best_run.get('ndcg_at_10')
            if ndcg_val is not None and pd.notna(ndcg_val):
                metrics_fragments.append(f'NDCG@10 {ndcg_val:.3f}')
            recall_val = best_run.get('recall_at_10')
            if recall_val is not None and pd.notna(recall_val):
                metrics_fragments.append(f'Recall@10 {recall_val:.2%}')
            precision_val = best_run.get('precision_at_10')
            if precision_val is not None and pd.notna(precision_val):
                metrics_fragments.append(f'Precision@10 {precision_val:.2%}')
            spearman_val = best_run.get('spearman_corr')
            kendall_val = best_run.get('kendall_corr')
            corr_bits = []
            if spearman_val is not None and pd.notna(spearman_val):
                corr_bits.append(f'Spearman {spearman_val:.3f}')
            if kendall_val is not None and pd.notna(kendall_val):
                corr_bits.append(f'Kendall {kendall_val:.3f}')
            if corr_bits:
                metrics_fragments.append(', '.join(corr_bits))
            metrics_summary = ', '.join(metrics_fragments) if metrics_fragments else 'metrics unavailable'

            priority_label = 'Recall@10 priority' if has_recall else 'RMSE priority'
            print(
                f"🏅 Best MLflow Run ({priority_label}): {best_run.get('run_name')}{scope_text} ({metrics_summary})"
            )
else:
    print('❌ No MLflow experiments found with finished runs.')
    print('   💡 Run notebook 4 (modeling) first to generate MLflow runs.')


🏁 Top MLflow runs from experiment "box_office_modeling" (sorted by Recall@10 (highest first, RMSE tie-breaker)):


Unnamed: 0,run_name,model,data_scope_studios,data_scope_language,tag_studio_scope,tag_language_scope,start_time,run_id,artifact_uri,rmse,mae,mape,r2,recall_at_10,precision_at_10,ndcg_at_10
0,LightGBM | No Pandemic Era,LGBMRegressor,all_studios,english_only,all_studios,english_only,2025-09-26 22:38:16.011000+00:00,478ec5d928274413934b6c6074ca630d,file:///Users/jasmineplows/Documents/Californi...,"$76,372,525","$30,964,249",162.7%,0.45,70.00%,70.00%,0.762
1,LightGBM | No Pandemic Era,LGBMRegressor,all_studios,english_only,all_studios,english_only,2025-09-26 21:28:55.035000+00:00,526deb7cd73f43e7896e55630f96e897,file:///Users/jasmineplows/Documents/Californi...,"$76,372,525","$30,964,249",162.7%,0.45,70.00%,70.00%,0.762
2,LightGBM | All Eras (2015-2023),LGBMRegressor,all_studios,english_only,all_studios,english_only,2025-09-26 22:37:51.700000+00:00,0921575a6260434f86e6153d348cb510,file:///Users/jasmineplows/Documents/Californi...,"$76,411,707","$30,049,196",159.2%,0.45,70.00%,70.00%,0.655
3,LightGBM | All Eras (2015-2023),LGBMRegressor,all_studios,english_only,all_studios,english_only,2025-09-26 21:28:41.817000+00:00,6002c8cfd5534c1c8e971592bccc8d85,file:///Users/jasmineplows/Documents/Californi...,"$76,411,707","$30,049,196",159.2%,0.45,70.00%,70.00%,0.655
4,LightGBM | All Eras (2015-2023),LGBMRegressor,all_studios,english_only,all_studios,english_only,2025-09-25 23:27:10.941000+00:00,3bf448f430c04162b302647c27228868,file:///Users/jasmineplows/Documents/Californi...,"$76,828,355","$32,286,430",165.7%,0.46,70.00%,70.00%,0.739
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160,RandomForest | Weighted (30% pandemic era),RandomForestRegressor,all_studios,english_only,all_studios,english_only,2025-09-25 03:43:27.296000+00:00,21473982337347bb8179cee032e92537,file:///Users/jasmineplows/Documents/Californi...,"$83,924,413","$35,092,872",171.4%,0.36,,,
161,RandomForest | Weighted (30% pandemic era),RandomForestRegressor,all_studios,english_only,all_studios,english_only,2025-09-25 02:06:36.810000+00:00,0edb3cf774164926a15fbc6b21ba0ca3,file:///Users/jasmineplows/Documents/Californi...,"$83,924,413","$35,092,872",171.4%,0.36,,,
162,RandomForest | All Eras (2015-2023),RandomForestRegressor,,,,,2025-09-25 01:37:40.050000+00:00,64bc27f67442434aae4e93715c3c28e7,file:///Users/jasmineplows/Documents/Californi...,"$84,672,554","$35,156,886",171.1%,0.35,,,
163,RandomForest | All Eras (2015-2023),RandomForestRegressor,all_studios,english_only,all_studios,english_only,2025-09-25 02:00:39.546000+00:00,66edadbbfc594844a3dd6d18c8809a93,file:///Users/jasmineplows/Documents/Californi...,"$84,672,554","$35,156,886",171.1%,0.35,,,


   Dataset selected for evaluation: ../data/dataset_domestic_processed_english_2010_2026.csv
🏅 Best MLflow Run (Recall@10 priority): LightGBM | No Pandemic Era (studios: all_studios, language: english_only) (RMSE $76,372,525, R² 0.449, NDCG@10 0.762, Recall@10 70.00%, Precision@10 70.00%, Spearman 0.686, Kendall 0.497)


### Evaluate best MLflow model on 2024 validation set

In [4]:

# Evaluate the best MLflow run on the 2024 rows of its dataset: load the data
# and the logged model, predict, report regression and ranking metrics, and
# compare the predicted top 10 with the actual top 10.
best_model = None
if best_run is None:
    print('❌ No MLflow run available to evaluate.')
else:
    dataset_path = best_run_dataset_path or DEFAULT_DATASET_PATH
    try:
        df, feature_cols, target = load_prepared_dataset(dataset_path)
    except Exception as exc:
        print(f'⚠️ Unable to load dataset {dataset_path}: {exc}')
        df, feature_cols, target = None, [], 'revenue_domestic'

    if df is None or not feature_cols:
        print('⚠️ Dataset not available for evaluation; aborting validation step.')
    else:
        run_id = best_run.get('run_id')
        model_uri = f'runs:/{run_id}/model'
        # Try the sklearn flavour first, then the generic pyfunc flavour. Both
        # failures are collected so the "unable to load" branch below is
        # actually reachable instead of the second loader raising out of the cell.
        load_errors = []
        try:
            best_model = mlflow.sklearn.load_model(model_uri)
        except Exception as sklearn_exc:
            load_errors.append(f'sklearn flavour: {sklearn_exc}')
            try:
                best_model = mlflow.pyfunc.load_model(model_uri)
            except Exception as pyfunc_exc:
                load_errors.append(f'pyfunc flavour: {pyfunc_exc}')
                best_model = None

        if best_model is None:
            print(f'⚠️ Unable to load model artifacts for run {run_id}')
            for detail in load_errors:
                print(f'   {detail}')
        else:
            val_2024 = df[df['release_year'] == 2024].copy()
            val_2024 = val_2024.dropna(subset=[target])
            if val_2024.empty:
                print('⚠️ No 2024 rows available for validation.')
            else:
                X_val = val_2024[feature_cols]
                preds_log = best_model.predict(X_val)
                # Predictions are mapped back with expm1, i.e. the model output is
                # treated as log1p(revenue). ravel() guarantees a 1-D array even
                # if the loader returns a single-column 2-D result.
                preds = np.asarray(np.expm1(preds_log)).ravel()
                val_2024['predicted_revenue'] = preds

                actual = val_2024[target]
                rmse = float(np.sqrt(mean_squared_error(actual, preds)))
                mae = float(mean_absolute_error(actual, preds))
                # MAPE is undefined for zero actuals (division by zero gives inf),
                # so those rows are left out of the average.
                pct_error = ((actual - preds).abs() / actual.abs())[actual != 0]
                mape = float(pct_error.mean() * 100) if not pct_error.empty else float('nan')
                r2 = float(r2_score(actual, preds))
                ranking_metrics = compute_ranking_metrics(
                    val_2024, preds, target_col=target, title_col='title', k=10
                )

                print('🎯 2024 validation metrics:')
                print('   RMSE: ${:,.0f}'.format(rmse))
                print('   MAE:  ${:,.0f}'.format(mae))
                print('   MAPE: {:.1f}%'.format(mape))
                print('   R²:   {:.3f}'.format(r2))
                if 'recall_at_10' in ranking_metrics:
                    print('   Recall@10: {:.2%}'.format(ranking_metrics['recall_at_10']))
                if 'precision_at_10' in ranking_metrics:
                    print('   Precision@10: {:.2%}'.format(ranking_metrics['precision_at_10']))
                if 'ndcg_at_10' in ranking_metrics:
                    print('   NDCG@10: {:.3f}'.format(ranking_metrics['ndcg_at_10']))

                top10_2024 = get_top10_predictions(best_model, df, 2024, feature_cols)
                if top10_2024 is not None:
                    formatted = top10_2024.copy()
                    if 'predicted_revenue' in formatted.columns:
                        formatted['predicted_revenue'] = formatted['predicted_revenue'].map(lambda x: f'${x:,.0f}')
                    if 'actual_revenue' in formatted.columns:
                        formatted['actual_revenue'] = formatted['actual_revenue'].map(lambda x: f'${x:,.0f}')
                    if 'prediction_error_pct' in formatted.columns:
                        formatted['prediction_error_pct'] = formatted['prediction_error_pct'].map(lambda x: f'{x:+.1f}%')
                    print('Top 10 predicted vs actual for 2024 ({}):'.format(best_run.get('run_name')))
                    # The formatting above treats these columns as optional, so
                    # only select the ones that are present.
                    top10_cols = [
                        col
                        for col in ('title', 'predicted_revenue', 'actual_revenue', 'prediction_error_pct')
                        if col in formatted.columns
                    ]
                    display(formatted[top10_cols])

                    actual_top10 = val_2024.nlargest(10, target)['title'].tolist()
                    predicted_top10 = top10_2024['title'].tolist()
                    overlap = len(set(actual_top10) & set(predicted_top10))
                    print('Overlap with actual 2024 top 10: {}/10'.format(overlap))


No title corrections needed
📁 Loaded dataset: ../data/dataset_domestic_processed_english_2010_2026.csv (1964 rows)
   Feature columns: 69
   Target column: revenue_domestic


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

🎯 2024 validation metrics:
   RMSE: $82,817,626
   MAE:  $35,342,153
   MAPE: 158.0%
   R²:   0.419
   Recall@10: 80.00%
   Precision@10: 80.00%
   NDCG@10: 0.777
Top 10 predicted vs actual for 2024 (LightGBM | No Pandemic Era):


Unnamed: 0,title,predicted_revenue,actual_revenue,prediction_error_pct
0,Mufasa: The Lion King,"$303,440,649","$254,567,693",-19.2%
1,Deadpool & Wolverine,"$226,011,526","$636,745,858",+64.5%
2,Moana 2,"$216,285,847","$460,405,297",+53.0%
3,Twisters,"$195,459,321","$267,762,265",+27.0%
4,Venom: The Last Dance,"$168,858,620","$139,755,882",-20.8%
5,Despicable Me 4,"$158,215,942","$361,004,205",+56.2%
6,Sonic the Hedgehog 3,"$155,091,508","$236,115,100",+34.3%
7,Kingdom of the Planet of the Apes,"$140,139,917","$171,130,165",+18.1%
8,Inside Out 2,"$115,005,454","$652,980,194",+82.4%
9,Beetlejuice Beetlejuice,"$111,673,353","$294,100,435",+62.0%


Overlap with actual 2024 top 10: 8/10
