### 2025 Predictions

In [9]:
# Setup imports and MLflow tracking
from pathlib import Path
import json

import numpy as np
import pandas as pd
import mlflow
from mlflow.tracking import MlflowClient

from movie_lists import normalize_domestic_titles
from model_utils import (
    prepare_features,
    compute_ranking_metrics,
    get_top10_predictions,
)

# Display tweaks: pandas floats get thousands separators and two decimals,
# and NumPy arrays print without scientific notation.
pd.options.display.float_format = '{:,.2f}'.format
np.set_printoptions(suppress=True)

# Places the MLflow file store may live relative to the working directory,
# listed in priority order.
TRACKING_DIR_CANDIDATES = [Path('../mlruns'), Path('mlruns')]

# Use the first candidate that exists; stop the notebook when none does.
tracking_path = None
for candidate in TRACKING_DIR_CANDIDATES:
    if not candidate.exists():
        continue
    tracking_path = candidate.resolve()
    break
else:
    raise FileNotFoundError('Unable to locate mlruns tracking directory')

# Hand MLflow the file:// URI form of the resolved directory.
mlflow.set_tracking_uri(tracking_path.as_uri())


In [10]:
# Helper utilities for dataset resolution and loading
# Directory holding the processed CSV datasets (relative path, so it is
# resolved against the notebook's working directory).
DATA_DIR = Path('../data')
# Fallback datasets in priority order; the first one that exists becomes
# DEFAULT_DATASET_PATH below.
DEFAULT_DATASET_CANDIDATES = [
    DATA_DIR / 'dataset_domestic_processed_modeling.csv',
    DATA_DIR / 'dataset_domestic_processed.csv',
]

def _first_existing(paths):
    for candidate in paths:
        if candidate.exists():
            return candidate
    return None

# First default candidate that exists on disk, or None when neither does.
DEFAULT_DATASET_PATH = _first_existing(DEFAULT_DATASET_CANDIDATES)

# Scope-specific dataset files keyed by (language scope, studio scope, year).
# NOTE(review): the year presumably marks the first release year covered by
# the file (file names end in `<year>_2026`) — confirm against the data export.
SUBSET_PATHS = {
    ('english_only', 'all_studios', 2010): DATA_DIR / 'dataset_domestic_processed_english_2010_2026.csv',
    ('english_only', 'all_studios', 2015): DATA_DIR / 'dataset_domestic_processed_english_2015_2026.csv',
    ('all_languages', 'major_only', 2010): DATA_DIR / 'dataset_domestic_processed_major_2010_2026.csv',
    ('all_languages', 'major_only', 2015): DATA_DIR / 'dataset_domestic_processed_major_2015_2026.csv',
}

def infer_year_floor(params):
    """Extract the first training year from a mapping of run parameters.

    The keys ``train_year_min``, ``train_start_year`` and
    ``training_year_start`` are tried in that order; the first one whose value
    can be read as an integer wins.

    Args:
        params: Mapping-like object with a ``get`` method (for example a dict
            of MLflow params, whose values are typically strings). ``None``,
            empty, or non-mapping inputs are accepted.

    Returns:
        int: The parsed year, or ``2015`` when no usable value is found.
    """
    default_floor = 2015
    if not params or not hasattr(params, 'get'):
        return default_floor

    for key in ('train_year_min', 'train_start_year', 'training_year_start'):
        value = params.get(key)
        if value is None:
            continue
        try:
            return int(value)
        except (TypeError, ValueError):
            # TypeError covers non-scalar values (e.g. a list); ValueError
            # covers strings that are not plain integers.
            pass
        try:
            # A float logged as a parameter is stored as e.g. '2015.0', which
            # int() rejects; go through float() so such values still count.
            return int(float(value))
        except (TypeError, ValueError, OverflowError):
            # Unparseable, NaN or infinite: try the next key.
            continue
    return default_floor

def resolve_dataset_path(studio_scope, language_scope, params=None):
    """Pick the dataset CSV that best matches a run's studio/language scope.

    Candidate keys into ``SUBSET_PATHS`` are built in priority order and the
    first one whose file exists on disk is returned. When no subset file is
    available the function falls back to ``DEFAULT_DATASET_PATH``.

    Args:
        studio_scope: Studio scope string such as ``'major_only'`` or
            ``'all_studios'``. Anything that is not a non-empty string
            (``None``, NaN from a pandas row, ...) is treated as
            ``'all_studios'``.
        language_scope: Language scope string; values starting with
            ``'english'`` select the English subsets. Anything that is not a
            non-empty string is treated as ``'all_languages'``.
        params: Optional mapping of run parameters, forwarded to
            ``infer_year_floor`` to decide which year variant to prefer.

    Returns:
        pathlib.Path: Path of the chosen dataset file.

    Raises:
        FileNotFoundError: If no subset file exists and there is no default
            dataset either.
    """
    def _normalize(scope, fallback):
        # Only non-empty strings carry a scope. NaN is truthy but has no
        # .lower(), so `(scope or fallback).lower()` would raise on it.
        return scope.lower() if isinstance(scope, str) and scope else fallback

    lang_scope = _normalize(language_scope, 'all_languages')
    studio_scope = _normalize(studio_scope, 'all_studios')
    params = params or {}

    # Prefer the 2015 files when training started after 2010, else the 2010 files.
    year_floor = infer_year_floor(params)
    year_pref = [2015, 2010] if year_floor and year_floor > 2010 else [2010, 2015]

    candidates = []
    if lang_scope.startswith('english'):
        # NOTE(review): this branch always tries 2010 before 2015 and ignores
        # year_pref, unlike the branches below — confirm this is intentional.
        candidates.extend(('english_only', 'all_studios', yr) for yr in [2010, 2015])
    if studio_scope == 'major_only':
        candidates.extend(('all_languages', 'major_only', yr) for yr in year_pref)

    # Generic fallback: the requested scopes, with unknown values mapped to
    # the broadest setting.
    generic_lang = lang_scope if lang_scope in ('english_only', 'all_languages') else 'all_languages'
    generic_studio = studio_scope if studio_scope in ('all_studios', 'major_only') else 'all_studios'
    candidates.extend((generic_lang, generic_studio, yr) for yr in year_pref)

    # dict.fromkeys drops duplicate keys while keeping first-seen order.
    for key in dict.fromkeys(candidates):
        path = SUBSET_PATHS.get(key)
        if path is not None and path.exists():
            return path

    if DEFAULT_DATASET_PATH is not None:
        return DEFAULT_DATASET_PATH
    raise FileNotFoundError('No suitable dataset found for the requested scope.')

def load_prepared_dataset(dataset_path):
    """Read a dataset CSV, normalise its titles and build modelling features.

    Args:
        dataset_path: Location of the CSV file to read.

    Returns:
        tuple: The three values produced by ``prepare_features`` for the
        ``revenue_domestic`` target — the prepared frame, the feature column
        names and the target column name.
    """
    titles_normalized = normalize_domestic_titles(pd.read_csv(dataset_path))
    prepared = prepare_features(
        titles_normalized,
        target='revenue_domestic',
        verbose=False,
    )
    df_prepared, feature_cols_prepared, target_name = prepared

    # Short load summary for the notebook output.
    summary = (
        f'Loaded dataset: {dataset_path} ({len(df_prepared)} rows)',
        f'   Feature columns: {len(feature_cols_prepared)}',
        f'   Target column: {target_name}',
    )
    for line in summary:
        print(line)
    return df_prepared, feature_cols_prepared, target_name


In [11]:
# Locate the best MLflow run (Recall@10 priority)
#
# Outputs of this cell, all reset here so a re-run starts clean:
#   best_run              - row (pandas Series) of the winning run, or None
#   best_run_params       - parameters fetched for that run ({} on failure)
#   best_run_dataset_path - dataset file matching the run's scope, or None
#   mlflow_results        - renamed/sorted table of finished runs
experiment_name = 'box_office_modeling'
client = MlflowClient()
best_run = None
best_run_params = {}
best_run_dataset_path = None
mlflow_results = pd.DataFrame()

experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    # Named experiment is missing: fall back to any experiment that has at
    # least one FINISHED run (max_results=1 is enough to test for that).
    experiments = client.search_experiments()
    finished = []
    for exp in experiments:
        runs = mlflow.search_runs(
            [exp.experiment_id],
            filter_string="attributes.status = 'FINISHED'",
            max_results=1,
        )
        if not runs.empty:
            finished.append(exp)
    if finished:
        # Take the experiment with the largest last_update_time; a missing or
        # None timestamp counts as 0.
        experiment = max(finished, key=lambda exp: getattr(exp, 'last_update_time', 0) or 0)
        print(f"Default experiment missing; using '{experiment.name}' instead")

if experiment is not None:
    # Initial order is RMSE ascending; the table is re-sorted below once the
    # metric columns have been coerced to numbers.
    runs_df = mlflow.search_runs(
        [experiment.experiment_id],
        filter_string="attributes.status = 'FINISHED'",
        order_by=['metrics.rmse ASC'],
    )
    if not runs_df.empty:
        # MLflow column name -> short name used in the rest of the notebook.
        rename_map = {
            'tags.mlflow.runName': 'run_name',
            'params.model': 'model',
            'params.strategy': 'strategy',
            'params.data_scope_studios': 'data_scope_studios',
            'params.data_scope_language': 'data_scope_language',
            'metrics.rmse': 'rmse',
            'metrics.recall_at_10': 'recall_at_10',
            'metrics.precision_at_10': 'precision_at_10',
            'metrics.ndcg_at_10': 'ndcg_at_10',
            'metrics.r2': 'r2',
            'metrics.mae': 'mae',
            'metrics.mape': 'mape',
            'metrics.spearman_corr': 'spearman_corr',
            'metrics.kendall_corr': 'kendall_corr',
        }
        # Only keep the mapped columns that are actually present, plus run_id.
        keep_cols = [c for c in rename_map if c in runs_df.columns] + ['run_id']
        mlflow_results = runs_df[keep_cols].rename(columns=rename_map)
        # Force the metric columns to numeric; unparseable values become NaN.
        # (spearman_corr and kendall_corr are not coerced here.)
        for col in ('rmse', 'recall_at_10', 'precision_at_10', 'ndcg_at_10', 'r2', 'mae', 'mape'):
            if col in mlflow_results.columns:
                mlflow_results[col] = pd.to_numeric(mlflow_results[col], errors='coerce')
        # Rank by Recall@10 (highest first) with RMSE (lowest first) as the
        # tie-breaker; fall back to RMSE alone when no recall values exist.
        if 'recall_at_10' in mlflow_results.columns and mlflow_results['recall_at_10'].notna().any():
            mlflow_results = mlflow_results.sort_values(['recall_at_10', 'rmse'], ascending=[False, True])
        elif 'rmse' in mlflow_results.columns:
            mlflow_results = mlflow_results.sort_values('rmse')
        mlflow_results = mlflow_results.reset_index(drop=True)
        if not mlflow_results.empty:
            # After sorting, the first row is the best run.
            best_run = mlflow_results.iloc[0]
            run_id = best_run['run_id']
            # Fetch the run's full parameter set; failure is reported and
            # leaves best_run_params empty rather than stopping the cell.
            try:
                run_info = client.get_run(run_id)
                best_run_params = run_info.data.params
            except Exception as exc:
                print(f"Unable to fetch parameters for run {run_id}: {exc}")
                best_run_params = {}
            # Map the run's studio/language scope to a dataset file; on any
            # error fall back to the default dataset (which may be None).
            try:
                best_run_dataset_path = resolve_dataset_path(
                    best_run.get('data_scope_studios'),
                    best_run.get('data_scope_language'),
                    best_run_params,
                )
                print(f"Dataset selected: {best_run_dataset_path}")
            except Exception as exc:
                print(f"Unable to resolve dataset path: {exc}")
                best_run_dataset_path = DEFAULT_DATASET_PATH
            # NaN is used as the default so the format specs still work when
            # a metric column is absent.
            recall_text = f"Recall@10 {best_run.get('recall_at_10', float('nan')):.2%}"
            rmse_text = f"RMSE ${best_run.get('rmse', float('nan')):,.0f}"
            print(f"Best run: {best_run.get('run_name')} ({recall_text}, {rmse_text})")
    else:
        print('No finished MLflow runs found in the experiment.')
else:
    print('No MLflow experiment with finished runs is available.')


Dataset selected: ../data/dataset_domestic_processed_english_2010_2026.csv
Best run: LightGBM | No Pandemic Era (Recall@10 70.00%, RMSE $76,372,525)


In [12]:
# Evaluate the best model on 2025 data
#
# Requires `best_run` and `best_run_dataset_path` from the run-selection cell.
# Steps: load the matching dataset, keep the 2025 rows, load the run's model,
# predict, then report error metrics, ranking metrics and top-10 overlap.
if best_run is None or best_run_dataset_path is None:
    print('Cannot evaluate because no best MLflow run was identified.')
else:
    df_prepared, feature_cols, target = load_prepared_dataset(best_run_dataset_path)
    test_2025 = df_prepared[df_prepared['release_year'] == 2025].copy()
    if test_2025.empty:
        print('No 2025 rows available in the selected dataset.')
    else:
        run_id = best_run['run_id']
        model_uri = f'runs:/{run_id}/model'

        # Try the sklearn flavor first, then the generic pyfunc flavor. A
        # failure of both leaves best_model as None so the message below is
        # actually reachable (load_model raises instead of returning None).
        best_model = None
        load_failures = []
        try:
            best_model = mlflow.sklearn.load_model(model_uri)
        except Exception as sklearn_exc:
            load_failures.append(f'sklearn flavor: {sklearn_exc}')
            try:
                best_model = mlflow.pyfunc.load_model(model_uri)
            except Exception as pyfunc_exc:
                load_failures.append(f'pyfunc flavor: {pyfunc_exc}')

        if best_model is None:
            print(f"Unable to load model artifacts for run {run_id}")
            for failure in load_failures:
                print(f'   {failure}')
        else:
            X_test = test_2025[feature_cols]
            preds_log = best_model.predict(X_test)
            # NOTE(review): expm1 implies the model predicts log1p(revenue) —
            # confirm against the training code.
            preds = np.expm1(preds_log)
            test_2025['predicted_revenue'] = preds

            # Computed once and reused for the error metrics and the overlap report.
            actual_available = target in test_2025.columns and test_2025[target].notna().any()

            if actual_available:
                rmse = float(np.sqrt(np.mean((test_2025[target] - preds) ** 2)))
                mae = float(np.mean(np.abs(test_2025[target] - preds)))
                with np.errstate(divide='ignore', invalid='ignore'):
                    pct_err = np.abs((test_2025[target] - preds) / test_2025[target])
                # A zero actual gives an infinite percentage error, which
                # nanmean would not skip; treat it like a missing value.
                pct_err = np.where(np.isfinite(pct_err), pct_err, np.nan)
                mape = float(np.nanmean(pct_err) * 100)
                if test_2025[target].nunique() > 1:
                    ss_res = np.sum((test_2025[target] - preds) ** 2)
                    ss_tot = np.sum((test_2025[target] - test_2025[target].mean()) ** 2)
                    r2 = float(1 - ss_res / ss_tot) if ss_tot > 0 else float('nan')
                else:
                    # R^2 is undefined when every actual value is identical.
                    r2 = float('nan')
            else:
                rmse = mae = mape = r2 = float('nan')

            ranking_metrics = compute_ranking_metrics(
                test_2025,
                preds,
                target_col=target,
                title_col='title',
                k=10,
            )

            print('2025 evaluation snapshot:')
            if not np.isnan(rmse):
                print(f'   RMSE: ${rmse:,.0f}')
                print(f'   MAE:  ${mae:,.0f}')
                print(f'   MAPE: {mape:.1f}%')
                print(f'   R^2:  {r2:.3f}')
            else:
                print('   Actual revenue not available for full error metrics.')
            if ranking_metrics:
                if 'recall_at_10' in ranking_metrics:
                    print(f"   Recall@10: {ranking_metrics['recall_at_10']:.2%}")
                if 'precision_at_10' in ranking_metrics:
                    print(f"   Precision@10: {ranking_metrics['precision_at_10']:.2%}")
                if 'ndcg_at_10' in ranking_metrics:
                    print(f"   NDCG@10: {ranking_metrics['ndcg_at_10']:.3f}")

            top10_2025 = get_top10_predictions(best_model, df_prepared, 2025, feature_cols)
            if top10_2025 is not None:
                display(top10_2025)
                predicted_top10 = top10_2025['title'].tolist()
                if actual_available:
                    actual_top10 = test_2025.dropna(subset=[target]).nlargest(10, target)['title'].tolist()
                    overlap = len(set(actual_top10) & set(predicted_top10))
                    recall_at_10 = overlap / max(len(actual_top10), 1)
                    false_negatives = sorted(set(actual_top10) - set(predicted_top10))
                    false_positives = sorted(set(predicted_top10) - set(actual_top10))
                    print(f'Recall@10: {recall_at_10:.2%}')
                    if false_negatives:
                        print('Missed (actual top 10 not predicted):', false_negatives)
                    if false_positives:
                        print('False positives (predicted but not actual):', false_positives)
                    # Use the same denominator as recall, in case fewer than
                    # 10 titles have actual revenue.
                    print(f'Overlap with actual 2025 top 10: {overlap}/{len(actual_top10)}')
                else:
                    print('Actual 2025 revenues unavailable, skipping overlap metrics.')
            else:
                print('Unable to compute top 10 predictions for 2025.')


No title corrections needed
Loaded dataset: ../data/dataset_domestic_processed_english_2010_2026.csv (2274 rows)
   Feature columns: 69
   Target column: revenue_domestic


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025 evaluation snapshot:
   RMSE: $61,093,390
   MAE:  $26,890,577
   MAPE: 201.8%
   R^2:  0.467
   Recall@10: 80.00%
   Precision@10: 80.00%
   NDCG@10: 0.775


Unnamed: 0,title,predicted_revenue,actual_revenue,prediction_error_pct,is_marvel,is_dc,is_star_wars,is_superhero,is_sequel,is_live_action_remake,is_major_studio,is_disney,is_english,is_origin_usa,is_origin_uk_ie,is_origin_canada,is_origin_us_uk_ca
0,Thunderbolts*,208063242.67,190274328.0,-9.35,1,0,0,1,1,0,1,1,1,1,0,0,1
1,Superman,205426824.17,354136363.0,41.99,0,1,0,1,1,0,1,0,1,1,0,0,1
2,Captain America: Brave New World,165328915.58,200500001.0,17.54,1,0,0,1,1,0,1,1,1,1,0,0,1
3,Snow White,131548514.74,87203963.0,-50.85,0,0,0,0,0,1,1,1,1,1,0,0,1
4,Jurassic World Rebirth,128622373.82,339640400.0,62.13,0,0,0,0,0,0,1,0,1,1,0,0,1
5,How to Train Your Dragon,125106196.13,262958100.0,52.42,0,0,0,0,0,1,1,0,1,1,0,0,1
6,Lilo & Stitch,110828797.27,423773167.0,73.85,0,0,0,0,0,1,1,1,1,1,0,0,1
7,28 Years Later,109665075.75,70446897.0,-55.67,0,0,0,0,1,0,1,0,1,1,1,0,1
8,Mission: Impossible - The Final Reckoning,92921633.7,197413515.0,52.93,0,0,0,0,1,0,1,0,1,1,0,0,1
9,A Minecraft Movie,92083069.56,423949195.0,78.28,0,0,0,0,0,0,1,0,1,1,0,0,1


Recall@10: 80.00%
Missed (actual top 10 not predicted): ['Sinners', 'The Conjuring: Last Rites']
False positives (predicted but not actual): ['28 Years Later', 'Snow White']
Overlap with actual 2025 top 10: 8/10
