<a href="https://colab.research.google.com/github/juairiahaquemahi/ML-Projects/blob/main/82_Juairia_Haque_Project_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**AVATAR 2025 MOVIE BOX-OFFICE OPENING PREDICTION**

In [None]:
import time
import warnings
import re
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
warnings.filterwarnings("ignore")
!pip install xgboost pandas scikit-learn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import xgboost as xgb




LOAD DATA

In [None]:
df = pd.read_csv("boxoffice-dataset.csv")
df.head(5)

Unnamed: 0,tmdb_id,title,release_date,imdb_id,budget,revenue_tmdb,genres,runtime,country,mpaa_rating,...,worldwide_gross,distributor_bom,opening_weekend_revenue,opening_date_bom,mpaa_rating_bom,runtime_bom,genres_bom,percent_of_total,opening_week_revenue,bom_url
0,1084242,Zootopia 2,11/26/2025,tt26443597,150000000,1137000000,Animation|Comedy|Adventure|Family|Mystery,107,US,PG,...,1136670000.0,Walt Disney Studios Motion PicturesSee full co...,100262540.0,"November 26, 2025\n (14 markets)",PG,1 hr 48 min,Action\n \n Adventure\n \n ...,38.72,140367556.0,https://www.boxofficemojo.com/title/tt26443597/
1,798645,The Running Man,11/11/2025,tt14107334,110000000,66263567,Action|Thriller|Science Fiction,133,US,R,...,68405730.0,Paramount Pictures InternationalSee full compa...,16495564.0,"November 5, 2025\n (Belgium)",R,2 hr 13 min,Action\n \n Adventure\n \n ...,43.86,23093789.0,https://www.boxofficemojo.com/title/tt14107334/
2,812583,Wake Up Dead Man: A Knives Out Mystery,11/26/2025,tt14364480,210000000,4000000,Thriller|Mystery|Drama,145,US,PG-13,...,1600000.0,NetflixSee full company information,600000.0,"November 26, 2025\n (Domestic)",PG-13,2 hr 24 min,Comedy\n \n Crime\n \n Dra...,37.5,840000.0,https://www.boxofficemojo.com/title/tt14364480/
3,533533,TRON: Ares,10/8/2025,tt6604188,220000000,142073994,Science Fiction|Adventure|Action,119,US,PG-13,...,142250000.0,Walt Disney Studios Motion PicturesSee full co...,33241433.0,"October 8, 2025\n (EMEA, APAC)",PG-13,1 hr 59 min,Action\n \n Adventure\n \n ...,45.44,46538006.0,https://www.boxofficemojo.com/title/tt6604188/
4,1228246,Five Nights at Freddy's 2,12/3/2025,tt30274401,36000000,173790015,Horror|Thriller,104,US,PG-13,...,173790000.0,Universal Pictures International (UPI)See full...,64007430.0,"December 3, 2025\n (EMEA, APAC)",PG-13,1 hr 44 min,Horror\n \n Mystery\n \n T...,67.04,89610402.0,https://www.boxofficemojo.com/title/tt30274401/


DATA PREPROCESSING

In [None]:
# Date features
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')
df['release_month'] = df['release_date'].dt.month
df['release_day'] = df['release_date'].dt.day
df['release_day_of_week'] = df['release_date'].dt.dayofweek
df['release_quarter'] = df['release_date'].dt.quarter

target_col = 'opening_week_revenue'

# Drop leakage columns
leakage_columns = [
        'revenue_tmdb',
        'domestic_total_gross',
        'international_gross',
        'worldwide_gross',
        'opening_weekend_revenue',
        'percent_of_total'
    ]

for col in leakage_columns:
    if col in df.columns:
        df = df.drop(columns=[col])

df = df[df[target_col].notna()]

# Drop duplicate movies
df = df.drop_duplicates(subset=['tmdb_id'], keep='first')

# Impute numeric features
numeric_features = ['budget', 'runtime', 'popularity', 'vote_average']
for col in numeric_features:
    if col in df.columns:
        df[col] = df[col].fillna(df[col].median())

# Binary feature
if 'is_franchise' in df.columns:
    df['is_franchise'] = df['is_franchise'].fillna(df['is_franchise'].mode()[0])

df.shape

(3838, 29)

ENCODE CATEGORICAL FEATURES

In [None]:
categorical_cols = ["director", "genres"]

encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    encoders[col] = le



In [None]:
def encode_categorical_features(df, target_encodings=None, target='opening_week_revenue'):

    df = df.copy()
    if target_encodings is None:
        target_encodings = {}

    if 'mpaa_rating' in df.columns:
        mpaa_order = {'G': 1, 'PG': 2, 'PG-13': 3, 'R': 4, 'NC-17': 5, 'NR': 0}
        df['mpaa_rating_encoded'] = df['mpaa_rating'].map(mpaa_order).fillna(0)

    if 'country' in df.columns:
        if 'country' not in target_encodings and target in df.columns:
            country_means = df.groupby('country')[target].mean()
            target_encodings['country'] = country_means

        if 'country' in target_encodings:
            df['country_encoded'] = df['country'].map(target_encodings['country'])
            df['country_encoded'] = df['country_encoded'].fillna(target_encodings['country'].mean())

    if 'distributor_bom' in df.columns:
        if 'distributor' not in target_encodings and target in df.columns:
            distributor_means = df.groupby('distributor_bom')[target].mean()
            target_encodings['distributor'] = distributor_means

        if 'distributor' in target_encodings:
            df['distributor_encoded'] = df['distributor_bom'].map(target_encodings['distributor'])
            df['distributor_encoded'] = df['distributor_encoded'].fillna(target_encodings['distributor'].mean())

    # Drop original text and ID cols
    text_cols = [
        'title', 'genres', 'actors', 'director', 'production_companies',
        'release_date', 'mpaa_rating', 'country', 'distributor_bom',
        'tmdb_id', 'imdb_id', 'distributor_tmdb', 'opening_date_bom',
        'mpaa_rating_bom', 'runtime_bom', 'genres_bom', 'bom_url', 'release_year'
    ]

    cols_to_drop = [col for col in text_cols if col in df.columns]
    df = df.drop(columns=cols_to_drop)

    return df, target_encodings


FEATURE ENGINEERING

In [None]:
# Holiday Indicators
df['is_summer'] = df['release_month'].isin([5, 6, 7, 8]).astype(int)
df['is_holiday_season'] = df['release_month'].isin([11, 12]).astype(int)
df['is_weekend_release'] = df['release_day_of_week'].isin([4, 5]).astype(int)


# GENRE
if 'genres' in df.columns:
    top_genres = [
        'Action', 'Adventure', 'Comedy', 'Drama', 'Thriller',
        'Horror', 'Science Fiction', 'Fantasy', 'Animation'
    ]

    for genre in top_genres:
        col_name = f'genre_{genre.lower().replace(" ", "_")}'
        df[col_name] = df['genres'].apply(
            lambda x: 1 if isinstance(x, str) and genre in x else 0
        )

    df['genre_count'] = df['genres'].apply(
        lambda x: len(x.split('|')) if isinstance(x, str) and x else 1
    )

In [None]:
# MAJOR STUDIO PRODUCTION COMPANIES
if 'production_companies' in df.columns:
    major_studios = [
        'Warner Bros', 'Universal', 'Disney', 'Paramount',
        'Sony', '20th Century', 'Columbia', 'Marvel', 'Lucasfilm'
    ]

    df['is_major_studio'] = df['production_companies'].apply(
        lambda x: 1 if isinstance(x, str) and any(studio in x for studio in major_studios) else 0
    )

DIRECTOR STRENGTH: Past success of directors can help predict box-office collection.

In [None]:
director_avg = df.groupby("director")["opening_week_revenue"].mean()
df["Director_Past_Avg_Gross"] = df["director"].map(director_avg)
df["Director_Past_Avg_Gross"] = df["Director_Past_Avg_Gross"].fillna(df["opening_week_revenue"].median())

CATEGORISE BUDGET TO DETERMINE BOX OFFICE SUCCESS

In [None]:
def categorize_budget(budget):
    if budget >= 100000000:
        return 'Blockbuster'
    elif budget >= 50000000:
        return 'High Budget'
    elif budget >= 10000000:
        return 'Medium Budget'
    else:
        return 'Low Budget'

df['budget_category'] = df['budget'].apply(categorize_budget)
df['is_blockbuster'] = (df['budget'] >= 100000000).astype(int)

**LOG TRANSFORMATIONS**

for budget, popularity, and runtime, as well as interaction features combining budget with genre and seasonal indicators

In [None]:
df = df.copy()

if 'budget' in df.columns:
    df['log_budget'] = np.log10(df['budget'] + 1)

if 'popularity' in df.columns:
    df['log_popularity'] = np.log10(df['popularity'] + 1)

if 'runtime' in df.columns:
    df['log_runtime'] = np.log10(df['runtime'] + 1)

if 'budget' in df.columns:
    if 'genre_action' in df.columns:
        df['budget_action_ratio'] = df['budget'] * df['genre_action']

    if 'genre_science_fiction' in df.columns:
        df['budget_scifi_ratio'] = df['budget'] * df['genre_science_fiction']

    if 'is_summer' in df.columns:
        df['budget_summer_boost'] = df['budget'] * df['is_summer']

    if 'is_holiday_season' in df.columns:
        df['budget_holiday_boost'] = df['budget'] * df['is_holiday_season']

In [None]:
# FRANCHISE × BUDGET
if 'is_franchise' in df.columns and 'budget' in df.columns:
    df['franchise_budget'] = df['is_franchise'] * df['budget']

# GENRE COMBINATIONS
if 'genre_action' in df.columns and 'genre_science_fiction' in df.columns:
    df['action_scifi'] = df['genre_action'] * df['genre_science_fiction']

if 'genre_action' in df.columns and 'genre_adventure' in df.columns:
    df['action_adventure'] = df['genre_action'] * df['genre_adventure']

### Implement Advanced Features

Add polynomial features (budget and runtime squared) and ratio features (budget per genre, budget per minute, popularity per rating) to the DataFrame.

In [None]:
df = df.copy()

# Polynomial features
if 'budget' in df.columns:
    df['budget_squared'] = df['budget']**2

if 'runtime' in df.columns:
    df['runtime_squared'] = df['runtime']**2

# Ratio features
if 'budget' in df.columns and 'genre_count' in df.columns:
    df['budget_per_genre'] = df['budget'] / df['genre_count']

if 'budget' in df.columns and 'runtime' in df.columns:
    # Handle division by zero for runtime
    df['budget_per_minute'] = df['budget'] / df['runtime'].replace(0, np.nan)
    df['budget_per_minute'] = df['budget_per_minute'].replace([np.inf, -np.inf], np.nan)
    df['budget_per_minute'] = df['budget_per_minute'].fillna(df['budget_per_minute'].median())

if 'popularity' in df.columns and 'vote_average' in df.columns:
    # Handle division by zero for vote_average
    df['popularity_per_rating'] = df['popularity'] / df['vote_average'].replace(0, np.nan)
    df['popularity_per_rating'] = df['popularity_per_rating'].replace([np.inf, -np.inf], np.nan)
    df['popularity_per_rating'] = df['popularity_per_rating'].fillna(df['popularity_per_rating'].median())

print("Polynomial and ratio features added successfully.")

Polynomial and ratio features added successfully.


### TRAIN-TEST-SPLIT DATA

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

# Define target column
target_col = 'opening_week_revenue'

# Separate target variable
y = df[target_col]

# Identify columns to drop from X (original text/ID columns, and the target itself)
columns_to_drop_from_X = [
    target_col,
    'tmdb_id', 'imdb_id', 'title', 'actors', 'production_companies',
    'country', 'mpaa_rating', 'distributor_tmdb', 'release_date',
    'distributor_bom', 'opening_date_bom', 'mpaa_rating_bom',
    'runtime_bom', 'genres_bom', 'bom_url', 'release_year', 'budget_category'
]

# Ensure only existing columns are dropped
columns_to_drop_from_X = [col for col in columns_to_drop_from_X if col in df.columns]

# Create feature set X
X = df.drop(columns=columns_to_drop_from_X)

# Select only numeric columns for X to ensure compatibility with models and handle any non-numeric columns that might have remained
numeric_cols = X.select_dtypes(include=np.number).columns.tolist()
X = X[numeric_cols]

# Handle any remaining NaNs in X by imputing with the median (safety net after previous imputations)
X = X.fillna(X.median())

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Feature set X and target variable y defined and data split.")
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of y_test: {y_test.shape}")

Feature set X and target variable y defined and data split.
Shape of X_train: (3070, 43)
Shape of X_test: (768, 43)
Shape of y_train: (3070,)
Shape of y_test: (768,)


## Initialize and Train XGBoost Model


Budget does not scale linearly. Besides, holiday releases have threshold effects. Also, franchise + director interactions affect box office collection. Considering these, I made the choice of using XGBoost instead of LR models or simple neural networks like MLPClassifier to avoid blockbluster under-prediction.

In [None]:
from xgboost import XGBRegressor

# Instantiate the XGBRegressor model
xgb_model = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    learning_rate=0.1,
    random_state=42
)

# Train the model
xgb_model.fit(X_train, y_train)

print("XGBoost Regressor model initialized and trained successfully.")

XGBoost Regressor model initialized and trained successfully.


## Hyperparameter Tuning with Cross-Validation and Early Stopping

Utilize GridSearchCV with K-Fold cross-validation to find the optimal hyperparameters for the XGBoost model.

In [None]:
from sklearn.model_selection import GridSearchCV, train_test_split

# Create a temporary validation set from your X_train and y_train data
X_train_sub, X_val_sub, y_train_sub, y_val_sub = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Define a parameter grid, adjusting n_estimators for explicit tuning
param_grid = {
    'n_estimators': [100, 200, 500], # Adjusted n_estimators to be explicitly tuned
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# Initialize an XGBRegressor model
xgb_model_grid = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=1000, # Keep a large number, as the best n_estimators will be selected from param_grid
    random_state=42,
    eval_metric='mae' # eval_metric remains in constructor for consistent metric tracking
)

# Instantiate GridSearchCV
grid_search = GridSearchCV(
    estimator=xgb_model_grid,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    verbose=1
)

# Define empty fit_params, as early stopping via eval_set and early_stopping_rounds
# is causing a TypeError with GridSearchCV when cv > 1.
# N_estimators will now be explicitly tuned instead of relying on early stopping.
fit_params = {
    "verbose": False
}

# Fit GridSearchCV to your X_train_sub and y_train_sub data
grid_search.fit(X_train_sub, y_train_sub, **fit_params)

# Print the best hyperparameters found by GridSearchCV
print("Best hyperparameters found: ", grid_search.best_params_)

# Print the best cross-validation score (MAE) achieved by GridSearchCV
# Note: neg_mean_absolute_error is used for scoring, so we take the negative to get MAE
print("Best cross-validation MAE: ", -grid_search.best_score_)

# Store the best trained model (estimator) found by GridSearchCV
best_xgb_model = grid_search.best_estimator_

print("GridSearchCV completed. Best model stored in 'best_xgb_model'.")

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Best hyperparameters found:  {'colsample_bytree': 1.0, 'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 500, 'subsample': 0.8}
Best cross-validation MAE:  9016489.331011731
GridSearchCV completed. Best model stored in 'best_xgb_model'.


## Evaluate Model Performance

using regression metrics such as Mean Absolute Error (MAE), Mean Squared Error (MSE), and R-squared (R2).


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Make predictions on the test set using the best model
y_pred = best_xgb_model.predict(X_test)

# Ensure predictions are non-negative for RMSLE calculation if using it
y_pred[y_pred < 0] = 0

# Calculate evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Calculate RMSLE - handling potential log(0) if predictions or actuals are 0
def rmsle(y_true, y_pred):
    return np.sqrt(mean_squared_error(np.log1p(y_true), np.log1p(y_pred)))

rmsle_score = rmsle(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae:,.2f}")
print(f"Mean Squared Error (MSE): {mse:,.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:,.2f}")
print(f"Root Mean Squared Logarithmic Error (RMSLE): {rmsle_score:,.4f}")
print(f"R-squared (R2): {r2:.4f}")

Mean Absolute Error (MAE): 9,786,028.29
Mean Squared Error (MSE): 388,448,835,895,728.31
Root Mean Squared Error (RMSE): 19,709,105.41
Root Mean Squared Logarithmic Error (RMSLE): 1.8123
R-squared (R2): 0.7373



### Model Evaluation Interpretation

*   The model's predictions for opening week revenue deviate by approximately \$9.79 million from actual values.
*   The average magnitude of errors, weighted towards larger discrepancies, is approximately \$19.71 million.
*   The Root Mean Squared Logarithmic Error of 1.8123 suggests relative errors.
*   The model demonstrates 73.73% variance in opening week revenue (R-squared).

## Prepare Avatar Movie Data for Prediction


In [None]:
avatar_details = {
    'tmdb_id': 1000001, # Placeholder ID
    'title': 'Avatar 4',
    'release_date': '12/19/2025', # December 19, 2025
    'imdb_id': 'tt0000000',
    'budget': 350000000, # Example high budget
    'revenue_tmdb': np.nan, # Not available pre-release
    'genres': 'Science Fiction|Action|Adventure|Fantasy',
    'runtime': 195, # Example runtime
    'country': 'US',
    'mpaa_rating': 'PG-13',
    'popularity': 150.0, # Example popularity
    'is_franchise': 1, # Part of Avatar franchise
    'director': 'James Cameron',
    'actors': 'Sam Worthington|Zoe Saldana|Sigourney Weaver',
    'vote_average': 8.0, # Example rating
    'production_companies': '20th Century Studios|Lightstorm Entertainment',
    'distributor_tmdb': 'Walt Disney Studios Motion Pictures',
    'release_year': 2025,
    'domestic_total_gross': np.nan,
    'international_gross': np.nan,
    'worldwide_gross': np.nan,
    'distributor_bom': 'Walt Disney Studios Motion Pictures',
    'opening_weekend_revenue': np.nan,
    'opening_date_bom': 'December 19, 2025',
    'mpaa_rating_bom': 'PG-13',
    'runtime_bom': '3 hr 15 min',
    'genres_bom': 'Science Fiction\n    \n        Action\n    \n        Adventure\n    \n        Fantasy',
    'percent_of_total': np.nan,
    'opening_week_revenue': np.nan, # Target variable, will be predicted
    'bom_url': 'https://www.boxofficemojo.com/title/tt1000001/' # Placeholder URL
}

avatar_df = pd.DataFrame([avatar_details])
avatar_df['release_date'] = pd.to_datetime(avatar_df['release_date'], errors='coerce')

print("Avatar movie details DataFrame created.")
print(avatar_df.head())

Avatar movie details DataFrame created.
   tmdb_id     title release_date    imdb_id     budget  revenue_tmdb  \
0  1000001  Avatar 4   2025-12-19  tt0000000  350000000           NaN   

                                     genres  runtime country mpaa_rating  ...  \
0  Science Fiction|Action|Adventure|Fantasy      195      US       PG-13  ...   

   worldwide_gross                      distributor_bom  \
0              NaN  Walt Disney Studios Motion Pictures   

  opening_weekend_revenue   opening_date_bom  mpaa_rating_bom  runtime_bom  \
0                     NaN  December 19, 2025            PG-13  3 hr 15 min   

                                          genres_bom  percent_of_total  \
0  Science Fiction\n    \n        Action\n    \n ...               NaN   

   opening_week_revenue                                         bom_url  
0                   NaN  https://www.boxofficemojo.com/title/tt1000001/  

[1 rows x 30 columns]


In [None]:
avatar_df['release_month'] = avatar_df['release_date'].dt.month
avatar_df['release_day'] = avatar_df['release_date'].dt.day
avatar_df['release_day_of_week'] = avatar_df['release_date'].dt.dayofweek
avatar_df['release_quarter'] = avatar_df['release_date'].dt.quarter

print("Date features extracted for Avatar movie.")
print(avatar_df[['release_month', 'release_day', 'release_day_of_week', 'release_quarter']].head())

Date features extracted for Avatar movie.
   release_month  release_day  release_day_of_week  release_quarter
0             12           19                    4                4


In [None]:
numeric_features_to_impute = ['budget', 'runtime', 'popularity', 'vote_average']

for col in numeric_features_to_impute:
    if col in avatar_df.columns and avatar_df[col].isnull().any():
        median_value = df[col].median()
        avatar_df[col] = avatar_df[col].fillna(median_value)

print("Missing numeric features in Avatar movie DataFrame imputed with medians from training data.")
print(avatar_df[numeric_features_to_impute].head())

Missing numeric features in Avatar movie DataFrame imputed with medians from training data.
      budget  runtime  popularity  vote_average
0  350000000      195       150.0           8.0


In [None]:
if 'is_franchise' in avatar_df.columns and avatar_df['is_franchise'].isnull().any():
    mode_value = df['is_franchise'].mode()[0]
    avatar_df['is_franchise'] = avatar_df['is_franchise'].fillna(mode_value)

print("Missing 'is_franchise' values in Avatar movie DataFrame imputed with mode from training data.")
print(avatar_df[['is_franchise']].head())

Missing 'is_franchise' values in Avatar movie DataFrame imputed with mode from training data.
   is_franchise
0             1


In [None]:
for col in categorical_cols:
    if col in avatar_df.columns:
        le = encoders[col]
        # Get the mode of the encoded column from the training data for unknown categories
        mode_encoded_value = df[col].mode()[0]

        # Use a mapping approach to handle unknown categories
        # First, create a dictionary mapping classes to their encoded values
        mapping = {cls: i for i, cls in enumerate(le.classes_)}

        # Apply the mapping. For values not in the mapping, use fillna with the mode.
        # The .astype(str) is important because LabelEncoder fits on string representations
        # and the avatar_df[col] might be an object type holding a string.
        avatar_df[col] = avatar_df[col].astype(str).map(mapping).fillna(mode_encoded_value).astype(int)

print("Categorical features 'director' and 'genres' encoded for Avatar movie.")
print(avatar_df[categorical_cols].head())

Categorical features 'director' and 'genres' encoded for Avatar movie.
   director  genres
0       764     602


In [None]:
if 'mpaa_rating' in avatar_df.columns:
    mpaa_order = {'G': 1, 'PG': 2, 'PG-13': 3, 'R': 4, 'NC-17': 5, 'NR': 0}
    avatar_df['mpaa_rating_encoded'] = avatar_df['mpaa_rating'].map(mpaa_order).fillna(0)

if 'country' in avatar_df.columns:
    # Calculate target encoding from the training data (df)
    country_means = df.groupby('country')['opening_week_revenue'].mean()
    overall_country_mean = df['opening_week_revenue'].mean()

    avatar_df['country_encoded'] = avatar_df['country'].map(country_means)
    avatar_df['country_encoded'] = avatar_df['country_encoded'].fillna(overall_country_mean)

if 'distributor_bom' in avatar_df.columns:
    # Calculate target encoding from the training data (df)
    distributor_means = df.groupby('distributor_bom')['opening_week_revenue'].mean()
    overall_distributor_mean = df['opening_week_revenue'].mean()

    avatar_df['distributor_encoded'] = avatar_df['distributor_bom'].map(distributor_means)
    avatar_df['distributor_encoded'] = avatar_df['distributor_encoded'].fillna(overall_distributor_mean)

print("Categorical features mpaa_rating, country, and distributor_bom encoded for Avatar movie.")
print(avatar_df[['mpaa_rating_encoded', 'country_encoded', 'distributor_encoded']].head())

Categorical features mpaa_rating, country, and distributor_bom encoded for Avatar movie.
   mpaa_rating_encoded  country_encoded  distributor_encoded
0                    3     3.084895e+07         2.225944e+07


In [None]:
text_cols_to_drop = [
    'title', 'genres', 'actors', 'director', 'production_companies',
    'release_date', 'mpaa_rating', 'country', 'distributor_bom',
    'tmdb_id', 'imdb_id', 'distributor_tmdb', 'opening_date_bom',
    'mpaa_rating_bom', 'runtime_bom', 'genres_bom', 'bom_url', 'release_year'
]

cols_to_drop_from_avatar = [col for col in text_cols_to_drop if col in avatar_df.columns]
avatar_df = avatar_df.drop(columns=cols_to_drop_from_avatar)

print("Original text and ID columns dropped from Avatar movie DataFrame.")
print(avatar_df.head())

Original text and ID columns dropped from Avatar movie DataFrame.
      budget  revenue_tmdb  runtime  popularity  is_franchise  vote_average  \
0  350000000           NaN      195       150.0             1           8.0   

   domestic_total_gross  international_gross  worldwide_gross  \
0                   NaN                  NaN              NaN   

   opening_weekend_revenue  percent_of_total  opening_week_revenue  \
0                      NaN               NaN                   NaN   

   release_month  release_day  release_day_of_week  release_quarter  \
0             12           19                    4                4   

   mpaa_rating_encoded  country_encoded  distributor_encoded  
0                    3     3.084895e+07         2.225944e+07  


In [None]:
avatar_df['is_summer'] = avatar_df['release_month'].isin([5, 6, 7, 8]).astype(int)
avatar_df['is_holiday_season'] = avatar_df['release_month'].isin([11, 12]).astype(int)
avatar_df['is_weekend_release'] = avatar_df['release_day_of_week'].isin([4, 5]).astype(int)

print("Holiday indicators created for Avatar movie.")
print(avatar_df[['is_summer', 'is_holiday_season', 'is_weekend_release']].head())

Holiday indicators created for Avatar movie.
   is_summer  is_holiday_season  is_weekend_release
0          0                  1                   1


In [None]:
import pandas as pd

# Create a temporary 'genres' column from avatar_details for feature engineering.
# The original 'genres' column in avatar_df was processed and dropped in previous steps,
# but the string format is needed for these binary genre features.
avatar_df['temp_genres_str'] = avatar_details['genres']

top_genres = [
    'Action', 'Adventure', 'Comedy', 'Drama', 'Thriller',
    'Horror', 'Science Fiction', 'Fantasy', 'Animation'
]

for genre in top_genres:
    col_name = f'genre_{genre.lower().replace(" ", "_")}'
    avatar_df[col_name] = avatar_df['temp_genres_str'].apply(
        lambda x: 1 if isinstance(x, str) and genre in x else 0
    )

avatar_df['genre_count'] = avatar_df['temp_genres_str'].apply(
    lambda x: len(x.split('|')) if isinstance(x, str) and x else 1
)

# Drop the temporary genres string column after creating the features
avatar_df = avatar_df.drop(columns=['temp_genres_str'])

print("Genre indicators created for Avatar movie.")
# Print the created columns to verify
print(avatar_df[[f'genre_{g.lower().replace(" ", "_")}' for g in top_genres] + ['genre_count']].head())

Genre indicators created for Avatar movie.
   genre_action  genre_adventure  genre_comedy  genre_drama  genre_thriller  \
0             1                1             0            0               0   

   genre_horror  genre_science_fiction  genre_fantasy  genre_animation  \
0             0                      1              1                0   

   genre_count  
0            4  


In [None]:
import pandas as pd

# Temporarily re-add 'production_companies' from avatar_details for feature engineering
avatar_df['temp_production_companies_str'] = avatar_details['production_companies']

major_studios = [
    'Warner Bros', 'Universal', 'Disney', 'Paramount',
    'Sony', '20th Century', 'Columbia', 'Marvel', 'Lucasfilm'
]

avatar_df['is_major_studio'] = avatar_df['temp_production_companies_str'].apply(
    lambda x: 1 if isinstance(x, str) and any(studio in x for studio in major_studios) else 0
)

# Drop the temporary production_companies string column
avatar_df = avatar_df.drop(columns=['temp_production_companies_str'])

print("Major studio indicator created for Avatar movie.")
print(avatar_df[['is_major_studio']].head())

Major studio indicator created for Avatar movie.
   is_major_studio
0                1


In [None]:
director_avg = df.groupby("director")["opening_week_revenue"].mean()

# The 'director' column was encoded and then dropped. We need to re-introduce its encoded value.
# Get the original director name for Avatar 4 from avatar_details
original_director_name = avatar_details['director']

# Use the pre-fitted LabelEncoder to get its encoded value
le_director = encoders['director']
try:
    # Transform the director name using the fitted encoder
    encoded_director_value = le_director.transform([original_director_name])[0]
except ValueError:
    # If the director is not in the encoder's known classes, use the mode of encoded directors from df
    encoded_director_value = df['director'].mode()[0]

# Add this encoded director value back to avatar_df
avatar_df['director'] = encoded_director_value

# Now, map this encoded director to its average gross
avatar_df["Director_Past_Avg_Gross"] = avatar_df["director"].map(director_avg)

# Fill any NaN values (e.g., if a director's average couldn't be found or mapped, or if director was unknown)
avatar_df["Director_Past_Avg_Gross"] = avatar_df["Director_Past_Avg_Gross"].fillna(df["opening_week_revenue"].median())

print("Director_Past_Avg_Gross feature created for Avatar movie.")
print(avatar_df[['Director_Past_Avg_Gross']].head())

Director_Past_Avg_Gross feature created for Avatar movie.
   Director_Past_Avg_Gross
0              147787994.5


In [None]:
def categorize_budget(budget):
    if budget >= 100000000:
        return 'Blockbuster'
    elif budget >= 50000000:
        return 'High Budget'
    elif budget >= 10000000:
        return 'Medium Budget'
    else:
        return 'Low Budget'

avatar_df['budget_category'] = avatar_df['budget'].apply(categorize_budget)
avatar_df['is_blockbuster'] = (avatar_df['budget'] >= 100000000).astype(int)

print("Budget category features created for Avatar movie.")
print(avatar_df[['budget_category', 'is_blockbuster']].head())

Budget category features created for Avatar movie.
  budget_category  is_blockbuster
0     Blockbuster               1


In [None]:
avatar_df = avatar_df.copy()

if 'budget' in avatar_df.columns:
    avatar_df['log_budget'] = np.log10(avatar_df['budget'] + 1)

if 'popularity' in avatar_df.columns:
    avatar_df['log_popularity'] = np.log10(avatar_df['popularity'] + 1)

if 'runtime' in avatar_df.columns:
    avatar_df['log_runtime'] = np.log10(avatar_df['runtime'] + 1)

print("Log-transformed features created for Avatar movie.")
print(avatar_df[['log_budget', 'log_popularity', 'log_runtime']].head())

Log-transformed features created for Avatar movie.
   log_budget  log_popularity  log_runtime
0    8.544068        2.178977     2.292256


In [None]:
avatar_df = avatar_df.copy()

# 1. Add interaction features
if 'budget' in avatar_df.columns:
    if 'genre_action' in avatar_df.columns:
        avatar_df['budget_action_ratio'] = avatar_df['budget'] * avatar_df['genre_action']

    if 'genre_science_fiction' in avatar_df.columns:
        avatar_df['budget_scifi_ratio'] = avatar_df['budget'] * avatar_df['genre_science_fiction']

    if 'is_summer' in avatar_df.columns:
        avatar_df['budget_summer_boost'] = avatar_df['budget'] * avatar_df['is_summer']

    if 'is_holiday_season' in avatar_df.columns:
        avatar_df['budget_holiday_boost'] = avatar_df['budget'] * avatar_df['is_holiday_season']

# 2. Add franchise and genre combination features
if 'is_franchise' in avatar_df.columns and 'budget' in avatar_df.columns:
    avatar_df['franchise_budget'] = avatar_df['is_franchise'] * avatar_df['budget']

if 'genre_action' in avatar_df.columns and 'genre_science_fiction' in avatar_df.columns:
    avatar_df['action_scifi'] = avatar_df['genre_action'] * avatar_df['genre_science_fiction']

if 'genre_action' in avatar_df.columns and 'genre_adventure' in avatar_df.columns:
    avatar_df['action_adventure'] = avatar_df['genre_action'] * avatar_df['genre_adventure']

# 3. Add polynomial features
if 'budget' in avatar_df.columns:
    avatar_df['budget_squared'] = avatar_df['budget']**2

if 'runtime' in avatar_df.columns:
    avatar_df['runtime_squared'] = avatar_df['runtime']**2

# 4. Add ratio features
if 'budget' in avatar_df.columns and 'genre_count' in avatar_df.columns:
    # Calculate budget_per_genre
    avatar_df['budget_per_genre'] = avatar_df['budget'] / avatar_df['genre_count']

if 'budget' in avatar_df.columns and 'runtime' in avatar_df.columns:
    # Handle division by zero for runtime, imputing with median from training data 'df'
    df['budget_per_minute'] = df['budget'] / df['runtime'].replace(0, np.nan)
    df_median_budget_per_minute = df['budget_per_minute'].median()

    avatar_df['budget_per_minute'] = avatar_df['budget'] / avatar_df['runtime'].replace(0, np.nan)
    avatar_df['budget_per_minute'] = avatar_df['budget_per_minute'].replace([np.inf, -np.inf], np.nan)
    avatar_df['budget_per_minute'] = avatar_df['budget_per_minute'].fillna(df_median_budget_per_minute)

if 'popularity' in avatar_df.columns and 'vote_average' in avatar_df.columns:
    # Handle division by zero for vote_average, imputing with median from training data 'df'
    df['popularity_per_rating'] = df['popularity'] / df['vote_average'].replace(0, np.nan)
    df_median_popularity_per_rating = df['popularity_per_rating'].median()

    avatar_df['popularity_per_rating'] = avatar_df['popularity'] / avatar_df['vote_average'].replace(0, np.nan)
    avatar_df['popularity_per_rating'] = avatar_df['popularity_per_rating'].replace([np.inf, -np.inf], np.nan)
    avatar_df['popularity_per_rating'] = avatar_df['popularity_per_rating'].fillna(df_median_popularity_per_rating)

print("Interaction, polynomial, and ratio features added to Avatar movie DataFrame.")
print(avatar_df[['budget_action_ratio', 'budget_scifi_ratio', 'budget_summer_boost', 'budget_holiday_boost',
                 'franchise_budget', 'action_scifi', 'action_adventure', 'budget_squared', 'runtime_squared',
                 'budget_per_genre', 'budget_per_minute', 'popularity_per_rating']].head())

Interaction, polynomial, and ratio features added to Avatar movie DataFrame.
   budget_action_ratio  budget_scifi_ratio  budget_summer_boost  \
0            350000000           350000000                    0   

   budget_holiday_boost  franchise_budget  action_scifi  action_adventure  \
0             350000000         350000000             1                 1   

       budget_squared  runtime_squared  budget_per_genre  budget_per_minute  \
0  122500000000000000            38025        87500000.0       1.794872e+06   

   popularity_per_rating  
0                  18.75  


In [None]:
final_avatar_df = avatar_df.copy()

# Ensure all columns from X_train are present in final_avatar_df
missing_cols_in_avatar = set(X_train.columns) - set(final_avatar_df.columns)
for col in missing_cols_in_avatar:
    # Fill missing columns with 0 or a suitable default from training data if known
    final_avatar_df[col] = 0 # Default to 0, or median/mean of the column from X_train

# Drop columns from final_avatar_df that are not in X_train
extra_cols_in_avatar = set(final_avatar_df.columns) - set(X_train.columns)
final_avatar_df = final_avatar_df.drop(columns=list(extra_cols_in_avatar))

# Reorder columns to match X_train
final_avatar_df = final_avatar_df[X_train.columns]

print("Avatar DataFrame columns aligned with X_train.")
print(f"Shape of final_avatar_df: {final_avatar_df.shape}")
print("Columns in final_avatar_df match X_train columns:", list(final_avatar_df.columns) == list(X_train.columns))


Avatar DataFrame columns aligned with X_train.
Shape of final_avatar_df: (1, 43)
Columns in final_avatar_df match X_train columns: True


In [None]:
predicted_opening_week_revenue = best_xgb_model.predict(final_avatar_df)

print(f"Predicted Opening Week Revenue for Avatar 2025: ${predicted_opening_week_revenue[0]:,.2f}")

Predicted Opening Week Revenue for Avatar 2025: $201,666,272.00


**Predicted Opening Week Revenue for Avatar 2025: $201,666,272.00**


##