## Final Model Comparison

Summarise MLflow runs and evaluate the current best model on the 2024 validation set.

In [11]:
# =============================================================================
# DATASET CONFIGURATION - Easy switching between dataset subsets
# =============================================================================

import sys
from pathlib import Path
sys.path.insert(0, str(Path('../').resolve()))

from dataset_config import (
    DEFAULT_CONFIG, get_dataset_config, get_dataset_path, get_config_summary,
    use_full_dataset, use_english_only, use_major_studios, use_english_major
)

# =============================================================================
# CHOOSE YOUR DATASET SCOPE - Uncomment one line to switch
# =============================================================================
# Exactly one CURRENT_CONFIG assignment should be active at a time.
# Later cells pass CURRENT_CONFIG to get_dataset_path() / get_dataset_config()
# when locating and loading the dataset, so re-run those cells after a change.

# CURRENT_CONFIG = use_full_dataset()                    # All studios, all languages (2010-2026)
# CURRENT_CONFIG = use_english_only(2010)                # English only (2010-2026)
CURRENT_CONFIG = use_english_only(2015)                # English only (2015-2026)
# CURRENT_CONFIG = use_major_studios(2010)               # Major studios only (2010-2026)
# CURRENT_CONFIG = use_major_studios(2015)               # Major studios only (2015-2026)
# CURRENT_CONFIG = use_english_major(2010)               # English + Major studios (2010-2026)
# CURRENT_CONFIG = use_english_major(2015)               # English + Major studios (2015-2026)

# CURRENT_CONFIG = DEFAULT_CONFIG  # Use default (full dataset)

# Echo the active configuration so the chosen scope is recorded in the cell output.
print("🎯 DATASET CONFIGURATION:")
print("=" * 50)
print(get_config_summary(CURRENT_CONFIG))
print("\n💡 To change scope, uncomment one of the CURRENT_CONFIG lines above and re-run this cell")

🎯 DATASET CONFIGURATION:
📊 Dataset Configuration: English Only
   Description: English movies only
   Year range: 2015-2026
   Training file: dataset_domestic_processed_english_2015_2026.csv
   Full file: dataset_domestic_processed_english_2015_2026.csv


💡 To change scope, uncomment one of the CURRENT_CONFIG lines above and re-run this cell


In [12]:
# Setup
from pathlib import Path
import json

import numpy as np
import pandas as pd
import mlflow
from mlflow.tracking import MlflowClient
from IPython.display import display
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from movie_lists import normalize_domestic_titles
from model_utils import (
    prepare_features,
    compute_ranking_metrics,
    get_top10_predictions,
)

# Display preferences: thousands separators with two decimals for pandas floats,
# and no scientific notation for numpy arrays.
pd.options.display.float_format = lambda x: f'{x:,.2f}'
np.set_printoptions(suppress=True)

# Point MLflow at the local file-based tracking directory.
# The candidates are relative to the kernel's working directory, so discovery
# works when the kernel starts either one level below the directory that holds
# `mlruns` or inside that directory itself. (The original list also contained
# Path('..') / 'mlruns', which is the same path as Path('../mlruns').)
_mlruns_candidates = [
    Path('../mlruns'),  # parent of the current directory
    Path('mlruns'),     # current directory
]
tracking_path = next(
    (candidate.resolve() for candidate in _mlruns_candidates if candidate.exists()),
    None,
)
if tracking_path is None:
    # List the absolute locations that were probed so a wrong cwd is easy to spot.
    _searched = ', '.join(str(candidate.resolve()) for candidate in _mlruns_candidates)
    raise FileNotFoundError(f'Unable to locate local mlruns directory. Searched: {_searched}')
print(f"Found MLflow tracking directory: {tracking_path}")

# MLflow expects a URI; as_uri() needs the absolute path produced by resolve().
TRACKING_URI = tracking_path.as_uri()
mlflow.set_tracking_uri(TRACKING_URI)

Found MLflow tracking directory: /Users/jasmineplows/Documents/California/Projects/box_office/mlruns


In [13]:
# Dataset loading with configuration system
def load_dataset(training=False):
    """Read the CSV selected by ``CURRENT_CONFIG`` and return it as a DataFrame.

    The path comes from ``get_dataset_path(training=..., config=CURRENT_CONFIG)``.
    When the configured scope is ``'english_major'`` and the file has an
    ``is_major_studio`` column, only rows where that column equals 1 are kept.
    Titles are then passed through ``normalize_domestic_titles``.

    A short summary (row count, year range, rows per period) is printed as a
    side effect; the per-period lines appear only if ``release_year`` exists.

    Args:
        training: Forwarded to ``get_dataset_path`` to choose between the
            training file and the full file.

    Returns:
        The loaded, optionally filtered, title-normalised DataFrame.
    """
    import pandas as pd
    from movie_lists import normalize_domestic_titles

    csv_path = get_dataset_path(training=training, config=CURRENT_CONFIG)
    settings = get_dataset_config(CURRENT_CONFIG)

    print(f"📁 Loading dataset: {csv_path}")
    movies = pd.read_csv(csv_path)

    # The english_major scope is derived by filtering on the studio flag.
    if settings['scope'] == 'english_major' and 'is_major_studio' in movies.columns:
        rows_before = len(movies)
        movies = movies[movies['is_major_studio'] == 1].copy()
        print(f"   Filtered to major studios: {len(movies):,} movies (removed {rows_before - len(movies):,})")

    movies = normalize_domestic_titles(movies)
    print(f"   ✅ Loaded {len(movies):,} movies")

    if 'release_year' not in movies.columns:
        return movies

    years = movies['release_year']
    print(f"   Year range: {years.min()}-{years.max()}")

    # Count every period up front, then report; the later periods are optional.
    training_rows = len(movies[years <= 2023])
    optional_periods = [
        ("   Testing (2024): {:,} movies", len(movies[years == 2024])),
        ("   Evaluation (2025): {:,} movies", len(movies[years == 2025])),
        ("   Prediction (2026): {:,} movies", len(movies[years == 2026])),
    ]

    print(f"   Training (≤2023): {training_rows:,} movies")
    for template, row_count in optional_periods:
        if row_count > 0:
            print(template.format(row_count))

    return movies

# Legacy dataset path resolution functions (keep for compatibility)
from pathlib import Path

# Data directory, relative to the notebook's working directory.
DATA_DIR = Path('..') / 'data'

# Legacy dataset files in preference order.
DEFAULT_DATASET_CANDIDATES = [
    DATA_DIR / filename
    for filename in (
        'dataset_domestic_processed.csv',           # FULL dataset with 2024 data (for validation)
        'dataset_domestic_processed_modeling.csv',  # Training-only dataset (fallback)
    )
]

def _first_existing(paths):
    """Return the first entry of ``paths`` whose ``exists()`` is true, else ``None``.

    Entries are probed in iteration order and probing stops at the first hit.
    """
    return next((path for path in paths if path.exists()), None)

# First legacy dataset file found on disk; None when no candidate exists.
DEFAULT_DATASET_PATH = _first_existing(DEFAULT_DATASET_CANDIDATES)

# (language scope, studio scope, start year) -> pre-filtered subset CSV.
SUBSET_PATHS = {
    scope_key: DATA_DIR / filename
    for scope_key, filename in (
        (('english_only', 'all_studios', 2010), 'dataset_domestic_processed_english_2010_2026.csv'),
        (('english_only', 'all_studios', 2015), 'dataset_domestic_processed_english_2015_2026.csv'),
        (('all_languages', 'major_only', 2010), 'dataset_domestic_processed_major_2010_2026.csv'),
        (('all_languages', 'major_only', 2015), 'dataset_domestic_processed_major_2015_2026.csv'),
    )
}

def infer_year_floor(params):
    """Return the first usable training start year found in ``params``.

    The keys ``train_year_min``, ``train_start_year`` and
    ``training_year_start`` are checked in that order. A value is usable when
    it converts to an integer, either directly (``2015``, ``'2015'``) or via
    float (``'2015.0'``). Unusable values (free text, NaN, infinities,
    containers) are skipped rather than raising.

    Args:
        params: Mapping of run parameters; may be ``None`` or empty.

    Returns:
        The inferred start year as ``int``, or 2015 when nothing usable is found.
    """
    if not params:
        return 2015
    for key in ('train_year_min', 'train_start_year', 'training_year_start'):
        value = params.get(key)
        if value is None:
            continue
        try:
            return int(value)
        except (TypeError, ValueError):
            pass
        # Logged parameters are strings, so a year may arrive as '2015.0'.
        try:
            return int(float(value))
        except (TypeError, ValueError, OverflowError):
            continue
    return 2015

def resolve_dataset_path(studio_scope, language_scope, params=None):
    """Return the dataset path to use when evaluating a run.

    The configuration system (``get_dataset_path`` with ``CURRENT_CONFIG``) is
    tried first. If it raises, the legacy lookup runs: candidate keys into
    ``SUBSET_PATHS`` are built from the scopes and the preferred start year,
    and the first candidate whose file exists is returned. If no subset file
    exists, ``DEFAULT_DATASET_PATH`` is used.

    Args:
        studio_scope: Studio scope recorded for the run (e.g. ``'major_only'``).
            Missing or non-string values are treated as ``'all_studios'``.
        language_scope: Language scope recorded for the run (e.g.
            ``'english_only'``). Missing or non-string values are treated as
            ``'all_languages'``.
        params: Optional run parameters, passed to ``infer_year_floor`` to
            choose whether the 2015 or the 2010 subset is preferred.

    Returns:
        Path of the selected dataset file.

    Raises:
        FileNotFoundError: When neither a subset file nor a default dataset exists.
    """
    try:
        # Preferred route: the configuration system.
        return get_dataset_path(training=False, config=CURRENT_CONFIG)
    except Exception as exc:
        # `except Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate; the notice makes the silent fallback visible.
        print(f"   ⚠️ Configuration-based dataset lookup failed ({exc}); using legacy lookup.")

    def _scope_text(value, default):
        # Values read from a pandas row can be NaN or None; only real, non-empty
        # strings are meaningful here.
        return value.lower() if isinstance(value, str) and value else default

    lang_scope = _scope_text(language_scope, 'all_languages')
    studios = _scope_text(studio_scope, 'all_studios')

    year_floor = infer_year_floor(params or {})
    year_pref = [2015, 2010] if year_floor and year_floor > 2010 else [2010, 2015]

    candidates = []
    if lang_scope.startswith('english'):
        # Honour the preferred start year here too, as the major-studio branch does.
        candidates.extend(('english_only', 'all_studios', yr) for yr in year_pref)
    if studios == 'major_only':
        candidates.extend(('all_languages', 'major_only', yr) for yr in year_pref)

    # Generic candidates, with unknown scope names mapped to the broadest scope.
    lang_key = lang_scope if lang_scope in ('english_only', 'all_languages') else 'all_languages'
    studio_key = studios if studios in ('all_studios', 'major_only') else 'all_studios'
    candidates.extend((lang_key, studio_key, yr) for yr in year_pref)

    # dict.fromkeys removes duplicates while keeping the first-seen order.
    for key in dict.fromkeys(candidates):
        path = SUBSET_PATHS.get(key)
        if path is not None and path.exists():
            return path

    if DEFAULT_DATASET_PATH is not None:
        return DEFAULT_DATASET_PATH
    raise FileNotFoundError('No suitable dataset found for the requested scope.')

def load_prepared_dataset(dataset_path):
    """Load the configured dataset and run it through ``prepare_features``.

    Note: ``dataset_path`` is accepted for backward compatibility but is never
    read; the data always comes from ``load_dataset(training=False)``, i.e.
    from the file selected by the active configuration.

    Args:
        dataset_path: Unused (see note above).

    Returns:
        The 3-tuple produced by ``prepare_features``, unpacked here as
        ``(prepared DataFrame, feature column list, target name)``.

    Raises:
        ValueError: If ``prepare_features`` does not return a 3-tuple.
    """
    raw_movies = load_dataset(training=False)

    # Imported here, after the load, exactly as in the original call order.
    from model_utils import prepare_features

    prepared = prepare_features(raw_movies, target='revenue_domestic', verbose=False)

    # Diagnostic output describing the shape of the helper's return value.
    sized = hasattr(prepared, '__len__')
    print(f"DEBUG: prepare_features returned {type(prepared)} with length {len(prepared) if sized else 'N/A'}")

    # Guard the unpacking below against a changed return signature.
    if not (isinstance(prepared, tuple) and len(prepared) == 3):
        raise ValueError(f"prepare_features should return 3 values, got {type(prepared)} with {len(prepared) if sized else 'unknown'} values")

    df_prepared, feature_cols_prepared, target_name = prepared

    print(f'Prepared dataset: {len(df_prepared)} rows, {len(feature_cols_prepared)} features')
    print(f'Target column: {target_name}')

    return df_prepared, feature_cols_prepared, target_name

# Optional scope metadata stored next to the datasets; empty dict when absent.
scope_path = DATA_DIR / 'dataset_scope.json'
scope_config = json.loads(scope_path.read_text()) if scope_path.exists() else {}

# Placeholders that the evaluation cell overwrites once a dataset is loaded.
df, feature_cols, target = None, [], 'revenue_domestic'

In [14]:
# Fetch MLflow runs: initialise the state that the rest of the notebook reads.
experiment_name = 'box_office_modeling'
client = MlflowClient()
mlflow_results = pd.DataFrame()
best_run = None
best_run_params = {}
best_run_dataset_path = None


def _has_finished_runs(candidate):
    """Return True when the given experiment has at least one FINISHED run."""
    finished = mlflow.search_runs(
        experiment_ids=[candidate.experiment_id],
        filter_string="attributes.status = 'FINISHED'",
        max_results=1
    )
    return not finished.empty


# Prefer the named experiment; otherwise fall back to any experiment with finished runs.
experiment = mlflow.get_experiment_by_name(experiment_name)

if experiment is None:
    print(f"⚠️ MLflow experiment '{experiment_name}' not found.")
    try:
        experiments_with_runs = [
            exp for exp in client.search_experiments() if _has_finished_runs(exp)
        ]
        if not experiments_with_runs:
            print("   No experiments with finished runs found.")
            print("   💡 Run notebook 4 (modeling) first to generate MLflow runs.")
            experiment = None
        else:
            # Most recently updated experiment wins; a missing or None timestamp counts as 0.
            experiment = max(
                experiments_with_runs,
                key=lambda exp: getattr(exp, 'last_update_time', 0) or 0
            )
            print(f"   Using experiment '{experiment.name}' instead (has finished runs).")
    except Exception as e:
        print(f"   Error searching experiments: {e}")
        experiment = None

def _infer_dataset_start_year(run_row, cutoff=pd.Timestamp('2025-09-28 22:00:00+00:00')):
    """Guess the dataset start year for a run that did not log one.

    Heuristic carried over from the original cell: english_only runs are taken
    to have used the 2015 configuration and all_languages runs the 2010 one.
    For any other scope the run's start time decides (after ``cutoff`` -> 2015,
    otherwise 2010); a run with no start time defaults to 2015.
    """
    language_scope = run_row.get('data_scope_language', '')
    if language_scope == 'english_only':
        return 2015
    if language_scope == 'all_languages':
        return 2010
    started = run_row.get('start_time')
    if pd.notna(started):
        return 2015 if started > cutoff else 2010
    return 2015


if experiment is not None:
    runs_df = mlflow.search_runs(
        experiment_ids=[experiment.experiment_id],
        filter_string="attributes.status = 'FINISHED'",
        order_by=['metrics.rmse ASC'],
    )
    if runs_df.empty:
        print('⚠️ No finished MLflow runs found in this experiment.')
        print('   💡 Run notebook 4 (modeling) first to generate MLflow runs.')
    else:
        # MLflow column -> short name used throughout the rest of the notebook.
        rename_map = {
            'tags.mlflow.runName': 'run_name',
            'params.model': 'model',
            'params.strategy': 'strategy',
            'params.data_scope_studios': 'data_scope_studios',
            'params.data_scope_language': 'data_scope_language',
            'params.dataset_start_year': 'dataset_start_year',  # only present if the run logged it
            'tags.dataset.studio_scope': 'tag_studio_scope',
            'tags.dataset.language_scope': 'tag_language_scope',
            'tags.dataset.start_year': 'tag_start_year',  # only present if the run tagged it
            'metrics.rmse': 'rmse',
            'metrics.mae': 'mae',
            'metrics.mape': 'mape',
            'metrics.r2': 'r2',
            'metrics.recall_at_10': 'recall_at_10',
            'metrics.precision_at_10': 'precision_at_10',
            'metrics.ndcg_at_10': 'ndcg_at_10',
            'metrics.spearman_corr': 'spearman_corr',
            'metrics.kendall_corr': 'kendall_corr',
        }
        keep_cols = [col for col in rename_map if col in runs_df.columns] + ['run_id', 'artifact_uri', 'start_time']
        mlflow_results = runs_df[keep_cols].rename(columns=rename_map).copy()

        # Normalise start times to tz-aware UTC so they compare cleanly with the
        # tz-aware cutoff used by the start-year heuristic.
        mlflow_results['start_time'] = pd.to_datetime(
            mlflow_results['start_time'], unit='ms', errors='coerce', utc=True
        )

        # Only infer a start year when the runs did not log one themselves.
        if 'dataset_start_year' not in mlflow_results.columns:
            mlflow_results['dataset_start_year'] = mlflow_results.apply(
                _infer_dataset_start_year, axis=1
            )

        numeric_cols = [
            'rmse', 'mae', 'mape', 'r2', 'recall_at_10',
            'precision_at_10', 'ndcg_at_10', 'spearman_corr', 'kendall_corr'
        ]
        for col in numeric_cols:
            if col in mlflow_results.columns:
                mlflow_results[col] = pd.to_numeric(mlflow_results[col], errors='coerce')

        # Ranking priority: Recall@10 (RMSE as tie-breaker), then RMSE, then recency.
        has_recall = (
            'recall_at_10' in mlflow_results.columns
            and mlflow_results['recall_at_10'].notna().any()
        )
        if has_recall:
            sort_cols = ['recall_at_10']
            sort_dirs = [False]
            if 'rmse' in mlflow_results.columns:
                sort_cols.append('rmse')
                sort_dirs.append(True)
            sort_label = 'Recall@10 (highest first, RMSE tie-breaker)'
        elif 'rmse' in mlflow_results.columns:
            sort_cols = ['rmse']
            sort_dirs = [True]
            sort_label = 'RMSE (lowest first)'
        else:
            # start_time is always present because it is part of keep_cols.
            sort_cols = ['start_time']
            sort_dirs = [False]
            sort_label = 'start time (most recent first)'
        mlflow_results = mlflow_results.sort_values(sort_cols, ascending=sort_dirs).reset_index(drop=True)

        display_cols = [
            'run_name', 'model', 'strategy', 'data_scope_studios', 'data_scope_language', 'dataset_start_year',
            'tag_studio_scope', 'tag_language_scope', 'start_time', 'run_id', 'artifact_uri',
            'rmse', 'mae', 'mape', 'r2', 'recall_at_10', 'precision_at_10', 'ndcg_at_10'
        ]
        available_display = [c for c in display_cols if c in mlflow_results.columns]

        # Human-readable copy for display; mlflow_results itself stays numeric.
        metric_formatters = {
            'rmse': lambda x: f'${x:,.0f}',
            'mae': lambda x: f'${x:,.0f}',
            'mape': lambda x: f'{x:.1f}%',
            'recall_at_10': lambda x: f'{x:.2%}',
            'precision_at_10': lambda x: f'{x:.2%}',
            'ndcg_at_10': lambda x: f'{x:.3f}',
        }
        formatted = mlflow_results.copy()
        for col, fmt in metric_formatters.items():
            if col in formatted.columns:
                # fmt is bound as a default argument so each column keeps its own formatter.
                formatted[col] = formatted[col].map(lambda x, fmt=fmt: fmt(x) if pd.notna(x) else 'n/a')

        print(f'🏁 Top MLflow runs from experiment "{experiment.name}" (sorted by {sort_label}):')
        display(formatted[available_display])

        # The first row after sorting is the best run under the chosen priority.
        best_run = mlflow_results.iloc[0]
        best_run_id = best_run.get('run_id')
        best_run_params = {}
        if best_run_id:
            try:
                run_info = client.get_run(best_run_id)
                best_run_params = run_info.data.params
            except Exception as exc:
                print(f"   ⚠️ Unable to fetch parameters for run {best_run_id}: {exc}")
        try:
            best_run_dataset_path = resolve_dataset_path(
                best_run.get('data_scope_studios'),
                best_run.get('data_scope_language'),
                best_run_params,
            )
            print(f"   Dataset selected for evaluation: {best_run_dataset_path}")
        except Exception as exc:
            print(f"   ⚠️ Unable to resolve dataset path: {exc}")
            best_run_dataset_path = DEFAULT_DATASET_PATH

        # Scope description, shown only when both scope parameters were logged.
        dataset_start_year = best_run.get('dataset_start_year', 'unknown')
        scope_text = ''
        if 'data_scope_studios' in best_run and 'data_scope_language' in best_run:
            scope_text = ' (studios: {}, language: {}, start_year: {})'.format(
                best_run.get('data_scope_studios', 'unknown'),
                best_run.get('data_scope_language', 'unknown'),
                dataset_start_year
            )

        # RMSE is always reported (as 'n/a' when missing); other metrics only when present.
        rmse_val = best_run.get('rmse')
        metrics_fragments = [
            f"RMSE ${rmse_val:,.0f}" if rmse_val is not None and pd.notna(rmse_val) else 'RMSE n/a'
        ]
        optional_metrics = (
            ('r2', 'R² {:.3f}'),
            ('ndcg_at_10', 'NDCG@10 {:.3f}'),
            ('recall_at_10', 'Recall@10 {:.2%}'),
            ('precision_at_10', 'Precision@10 {:.2%}'),
            ('spearman_corr', 'Spearman {:.3f}'),
            ('kendall_corr', 'Kendall {:.3f}'),
        )
        for metric_name, template in optional_metrics:
            metric_val = best_run.get(metric_name)
            if metric_val is not None and pd.notna(metric_val):
                metrics_fragments.append(template.format(metric_val))
        metrics_summary = ', '.join(metrics_fragments)

        priority_label = 'Recall@10 priority' if has_recall else 'RMSE priority'
        print(
            f"🏅 Best MLflow Run ({priority_label}): {best_run.get('run_name')}{scope_text} ({metrics_summary})"
        )
else:
    print('❌ No MLflow experiments found with finished runs.')
    print('   💡 Run notebook 4 (modeling) first to generate MLflow runs.')

🏁 Top MLflow runs from experiment "box_office_modeling" (sorted by Recall@10 (highest first, RMSE tie-breaker)):


Unnamed: 0,run_name,model,data_scope_studios,data_scope_language,dataset_start_year,tag_studio_scope,tag_language_scope,start_time,run_id,artifact_uri,rmse,mae,mape,r2,recall_at_10,precision_at_10,ndcg_at_10
0,LightGBM | No Pandemic Era,LGBMRegressor,all_studios,english_only,2015,all_studios,english_only,2025-09-29 02:24:29.175000+00:00,83a974e5cf674ff69df450f264d2128d,file:///Users/jasmineplows/Documents/Californi...,"$78,937,117","$34,568,826",173.8%,0.47,80.00%,80.00%,0.781
1,LightGBM | All Eras (2015-2023),LGBMRegressor,all_studios,all_languages,2010,all_studios,all_languages,2025-09-28 21:47:14.585000+00:00,7f20d22f3b41481fae3c6774048e11a2,file:///Users/jasmineplows/Documents/Californi...,"$78,369,088","$32,117,803",161.0%,0.41,60.00%,60.00%,0.608
2,LightGBM | Weighted (30% pandemic era),LGBMRegressor,all_studios,all_languages,2010,all_studios,all_languages,2025-09-28 21:47:59.237000+00:00,b3e8715cc62c4a5fa6e7c49c6c935143,file:///Users/jasmineplows/Documents/Californi...,"$79,063,391","$32,400,177",160.4%,0.4,60.00%,60.00%,0.61
3,LightGBM | No Pandemic Era,LGBMRegressor,all_studios,all_languages,2010,all_studios,all_languages,2025-09-28 21:47:38.135000+00:00,f1d61149d1a847c4a73792183256ec8a,file:///Users/jasmineplows/Documents/Californi...,"$80,834,650","$32,764,627",178.9%,0.38,60.00%,60.00%,0.57
4,XGBoost | All Eras (2015-2023),XGBRegressor,all_studios,all_languages,2010,all_studios,all_languages,2025-09-28 21:42:57.834000+00:00,62851019916843648f76902d28e4e2af,file:///Users/jasmineplows/Documents/Californi...,"$81,196,732","$32,608,159",177.2%,0.37,60.00%,60.00%,0.597
5,LightGBM | All Eras (2010-2023),LGBMRegressor,all_studios,english_only,2015,all_studios,english_only,2025-09-29 02:24:04.239000+00:00,5b686a3742e747fb8b5a20fe54e88bb7,file:///Users/jasmineplows/Documents/Californi...,"$85,427,938","$36,721,230",169.6%,0.38,60.00%,60.00%,0.595
6,LightGBM | Weighted (30% pandemic era),LGBMRegressor,all_studios,english_only,2015,all_studios,english_only,2025-09-29 02:24:56.781000+00:00,79d861b268fc4da1b78394c19a463956,file:///Users/jasmineplows/Documents/Californi...,"$87,508,685","$36,966,412",160.8%,0.35,60.00%,60.00%,0.596
7,XGBoost | No Pandemic Era,XGBRegressor,all_studios,english_only,2015,all_studios,english_only,2025-09-29 02:22:01.070000+00:00,11776b4388074d88a1d5d9a1cb0258de,file:///Users/jasmineplows/Documents/Californi...,"$90,848,920","$39,362,826",215.4%,0.3,60.00%,60.00%,0.624
8,XGBoost | Weighted (30% pandemic era),XGBRegressor,all_studios,all_languages,2010,all_studios,all_languages,2025-09-28 21:46:48.153000+00:00,509c413c249f436cbf311ae206ef99c9,file:///Users/jasmineplows/Documents/Californi...,"$81,085,006","$32,607,847",188.2%,0.37,50.00%,50.00%,0.574
9,DecisionTree | No Pandemic Era,DecisionTreeRegressor,all_studios,english_only,2015,all_studios,english_only,2025-09-29 02:08:54.407000+00:00,71b3e6ac24a847b69628e3c82c1be0ee,file:///Users/jasmineplows/Documents/Californi...,"$83,154,195","$38,459,133",242.0%,0.41,50.00%,50.00%,0.659


   Dataset selected for evaluation: ../data/dataset_domestic_processed_english_2015_2026.csv
🏅 Best MLflow Run (Recall@10 priority): LightGBM | No Pandemic Era (studios: all_studios, language: english_only, start_year: 2015) (RMSE $78,937,117, R² 0.472, NDCG@10 0.781, Recall@10 80.00%, Precision@10 80.00%, Spearman 0.705, Kendall 0.517)


### Evaluate best MLflow model on 2024 validation set

In [15]:
# Evaluate the best MLflow run on the 2024 rows of the configured dataset.
best_model = None
if best_run is None:
    print('❌ No MLflow run available to evaluate.')
else:
    dataset_path = best_run_dataset_path or DEFAULT_DATASET_PATH
    try:
        print(f"🔍 Attempting to load dataset: {dataset_path}")
        df, feature_cols, target = load_prepared_dataset(dataset_path)
        print(f"✅ Successfully loaded dataset with {len(df)} rows and {len(feature_cols)} features")
    except Exception as exc:
        print(f'⚠️ Unable to load dataset {dataset_path}: {exc}')
        import traceback
        traceback.print_exc()
        df, feature_cols, target = None, [], 'revenue_domestic'

    if df is None or not feature_cols:
        print('⚠️ Dataset not available for evaluation; aborting validation step.')
    else:
        run_id = best_run.get('run_id')
        model_uri = f'runs:/{run_id}/model'
        # Try the sklearn flavor first, then the generic pyfunc flavor. Both
        # failures are caught so the "unable to load" branch below is reachable
        # instead of the cell crashing on the fallback.
        try:
            best_model = mlflow.sklearn.load_model(model_uri)
        except Exception as sklearn_exc:
            try:
                best_model = mlflow.pyfunc.load_model(model_uri)
            except Exception as pyfunc_exc:
                best_model = None
                print(f'   sklearn flavor failed: {sklearn_exc}')
                print(f'   pyfunc flavor failed: {pyfunc_exc}')

        if best_model is None:
            print(f'⚠️ Unable to load model artifacts for run {run_id}')
        else:
            val_2024 = df[df['release_year'] == 2024].dropna(subset=[target]).copy()
            if val_2024.empty:
                print('⚠️ No 2024 rows available for validation.')
            else:
                X_val = val_2024[feature_cols]
                preds_log = best_model.predict(X_val)
                # Flatten to a 1-D float array so a Series or column-vector
                # result cannot misalign with val_2024's index.
                # expm1 inverts a log1p target transform; presumably the model
                # was trained on log1p(revenue) — confirm in the modeling notebook.
                preds = np.expm1(np.asarray(preds_log, dtype=float).ravel())
                val_2024['predicted_revenue'] = preds

                actual = val_2024[target].to_numpy(dtype=float)
                rmse = float(np.sqrt(mean_squared_error(actual, preds)))
                mae = float(mean_absolute_error(actual, preds))
                # MAPE is undefined for zero actuals; skip them rather than
                # letting a division by zero turn the metric into inf.
                nonzero = actual != 0
                if nonzero.any():
                    mape = float(np.mean(np.abs((actual[nonzero] - preds[nonzero]) / actual[nonzero])) * 100)
                else:
                    mape = float('nan')
                r2 = float(r2_score(actual, preds))
                ranking_metrics = compute_ranking_metrics(
                    val_2024, preds, target_col=target, title_col='title', k=10
                )

                print('🎯 2024 validation metrics:')
                print('   RMSE: ${:,.0f}'.format(rmse))
                print('   MAE:  ${:,.0f}'.format(mae))
                print('   MAPE: {:.1f}%'.format(mape))
                print('   R²:   {:.3f}'.format(r2))
                if 'recall_at_10' in ranking_metrics:
                    print('   Recall@10: {:.2%}'.format(ranking_metrics['recall_at_10']))
                if 'precision_at_10' in ranking_metrics:
                    print('   Precision@10: {:.2%}'.format(ranking_metrics['precision_at_10']))
                if 'ndcg_at_10' in ranking_metrics:
                    print('   NDCG@10: {:.3f}'.format(ranking_metrics['ndcg_at_10']))

                top10_2024 = get_top10_predictions(best_model, df, 2024, feature_cols)
                if top10_2024 is not None:
                    formatted = top10_2024.copy()
                    if 'predicted_revenue' in formatted.columns:
                        formatted['predicted_revenue'] = formatted['predicted_revenue'].map(lambda x: f'${x:,.0f}')
                    if 'actual_revenue' in formatted.columns:
                        formatted['actual_revenue'] = formatted['actual_revenue'].map(lambda x: f'${x:,.0f}')
                    if 'prediction_error_pct' in formatted.columns:
                        formatted['prediction_error_pct'] = formatted['prediction_error_pct'].map(lambda x: f'{x:+.1f}%')
                    print('Top 10 predicted vs actual for 2024 ({}):'.format(best_run.get('run_name')))
                    # Show only the columns that exist, matching the guards above.
                    table_cols = [
                        col
                        for col in ('title', 'predicted_revenue', 'actual_revenue', 'prediction_error_pct')
                        if col in formatted.columns
                    ]
                    display(formatted[table_cols])

                    # How many predicted top-10 titles are in the actual top 10 by revenue.
                    actual_top10 = val_2024.nlargest(10, target)['title'].tolist()
                    predicted_top10 = top10_2024['title'].tolist()
                    overlap = len(set(actual_top10) & set(predicted_top10))
                    print('Overlap with actual 2024 top 10: {}/10'.format(overlap))

🔍 Attempting to load dataset: ../data/dataset_domestic_processed_english_2015_2026.csv
📁 Loading dataset: ../data/dataset_domestic_processed_english_2015_2026.csv
No title corrections needed
   ✅ Loaded 1,307 movies
   Year range: 2015-2026
   Training (≤2023): 1,003 movies
   Testing (2024): 145 movies
   Evaluation (2025): 114 movies
   Prediction (2026): 45 movies
DEBUG: prepare_features returned <class 'tuple'> with length 3
Prepared dataset: 1307 rows, 68 features
Target column: revenue_domestic
✅ Successfully loaded dataset with 1307 rows and 68 features


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

🎯 2024 validation metrics:
   RMSE: $78,937,117
   MAE:  $34,568,826
   MAPE: 173.8%
   R²:   0.472
   Recall@10: 80.00%
   Precision@10: 80.00%
   NDCG@10: 0.781
Top 10 predicted vs actual for 2024 (LightGBM | No Pandemic Era):


Unnamed: 0,title,predicted_revenue,actual_revenue,prediction_error_pct
0,Mufasa: The Lion King,"$289,174,214","$254,567,693",-13.6%
1,Deadpool & Wolverine,"$260,891,834","$636,745,858",+59.0%
2,Moana 2,"$208,554,082","$460,405,297",+54.7%
3,Twisters,"$206,372,602","$267,762,265",+22.9%
4,Despicable Me 4,"$189,054,748","$361,004,205",+47.6%
5,Sonic the Hedgehog 3,"$187,846,573","$236,115,100",+20.4%
6,Venom: The Last Dance,"$172,494,917","$139,755,882",-23.4%
7,Kingdom of the Planet of the Apes,"$171,166,472","$171,130,165",-0.0%
8,Inside Out 2,"$156,672,840","$652,980,194",+76.0%
9,Beetlejuice Beetlejuice,"$115,139,679","$294,100,435",+60.9%


Overlap with actual 2024 top 10: 8/10
