### **AVATAR**: **FIRE AND ASH - OPENING COLLECTION PREDICTION**

In [2]:
!pip install requests pytrends xgboost scikit-learn tqdm


Collecting pytrends
  Downloading pytrends-4.9.2-py3-none-any.whl.metadata (13 kB)
Downloading pytrends-4.9.2-py3-none-any.whl (15 kB)
Installing collected packages: pytrends
Successfully installed pytrends-4.9.2


In [3]:
import requests
import pandas as pd
import numpy as np
from tqdm import tqdm
from datetime import datetime
from pytrends.request import TrendReq
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
import time

## **SETUP**

In [4]:
TMDB_KEY = "613cb6fdb467ef300725744de1797cb4"
YOUTUBE_KEY = "YOUR_YT_KEY"


In [5]:
def get_tmdb_movies(pages=250):  # 250 × 20 = 5000 movies
    movies = []
    for page in tqdm(range(1, pages+1)):
        url = f"https://api.themoviedb.org/3/discover/movie"
        params = {
            "api_key": TMDB_KEY,
            "sort_by": "popularity.desc",
            "page": page,
            "primary_release_date.lte": "2024-12-31"
        }
        r = requests.get(url, params=params).json()
        movies.extend(r["results"])
    return movies


**FETCH MOVIES**

In [14]:
def movie_details(tmdb_id):
    # Fetch main movie details
    r_movie = requests.get(
        f"https://api.themoviedb.org/3/movie/{tmdb_id}",
        params={"api_key": TMDB_KEY}
    ).json()

    # Fetch movie credits
    r_credits = requests.get(
        f"https://api.themoviedb.org/3/movie/{tmdb_id}/credits",
        params={"api_key": TMDB_KEY}
    ).json()

    director = 'Unknown'
    for crew_member in r_credits.get('crew', []):
        if crew_member.get('job') == 'Director':
            director = crew_member.get('name')
            break

    return {
        "runtime": r_movie.get("runtime", 0),
        "budget": r_movie.get("budget", 0),
        "genres": ",".join([g["name"] for g in r_movie.get("genres", [])]),
        "franchise": 1 if r_movie.get("belongs_to_collection") else 0,
        "popularity": r_movie.get("popularity", 0),
        "vote_count": r_movie.get("vote_count", 0),
        "vote_average": r_movie.get("vote_average", 0),
        "revenue": r_movie.get("revenue", 0),
        "director": director
    }

In [7]:
import requests

API_KEY = "613cb6fdb467ef300725744de1797cb4"
url = "https://api.themoviedb.org/3/movie/550"
params = {"api_key": API_KEY}

r = requests.get(url, params=params)
print(r.status_code)
print(r.json()["title"])


200
Fight Club


In [8]:
import time
import pandas as pd
from tqdm import tqdm
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

BASE_URL = "https://api.themoviedb.org/3"
movies = []

# Configure requests session with retries
session = requests.Session()
retry = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)
session.mount('https://', adapter)

for page in tqdm(range(1, 501), desc="Fetching initial movie list"):  # ~10k movies
    try:
        r = session.get(
            f"{BASE_URL}/discover/movie",
            params={
                "api_key": API_KEY,
                "primary_release_date.gte": "2010-01-01",
                "primary_release_date.lte": "2025-12-31",
                "sort_by": "popularity.desc",
                "page": page
            }
        )
        r.raise_for_status() # Raise an exception for bad status codes (4xx or 5xx)
        movies.extend(r.json()["results"])
        time.sleep(0.1)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching page {page}: {e}")
        break

print(f"Initial list of {len(movies)} movies fetched.")

# Now, fetch additional details for each movie
for movie in tqdm(movies, desc="Fetching detailed movie info"):
    try:
        details = movie_details(movie["id"])
        movie.update(details) # Merge the details into the existing movie dictionary
        time.sleep(0.05) # Small delay for detailed API calls to avoid hitting rate limits
    except requests.exceptions.RequestException as e:
        print(f"Error fetching details for movie {movie['id']}: {e}")
        # Optionally, you can decide to skip this movie or re-try
        continue

len(movies) # This will still be the same length, but movies dictionaries are now enriched.

Fetching initial movie list: 100%|██████████| 500/500 [03:08<00:00,  2.65it/s]


Initial list of 9999 movies fetched.


Fetching detailed movie info:  96%|█████████▌| 9570/9999 [29:28<01:03,  6.80it/s]

Error fetching details for movie 522241: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


Fetching detailed movie info: 100%|██████████| 9999/9999 [30:47<00:00,  5.41it/s]


9999

In [9]:
import time
import pandas as pd
from tqdm import tqdm
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

BASE_URL = "https://api.themoviedb.org/3"
movies = []

# Configure requests session with retries
session = requests.Session()
retry = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)
session.mount('https://', adapter)

for page in tqdm(range(1, 501), desc="Fetching initial movie list"):  # ~10k movies
    try:
        r = session.get(
            f"{BASE_URL}/discover/movie",
            params={
                "api_key": API_KEY,
                "primary_release_date.gte": "2010-01-01",
                "primary_release_date.lte": "2025-12-31",
                "sort_by": "popularity.desc",
                "page": page
            }
        )
        r.raise_for_status() # Raise an exception for bad status codes (4xx or 5xx)
        movies.extend(r.json()["results"])
        time.sleep(0.1)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching page {page}: {e}")
        break

print(f"Initial list of {len(movies)} movies fetched.")

# Now, fetch additional details for each movie
for movie in tqdm(movies, desc="Fetching detailed movie info"):
    try:
        details = movie_details(movie["id"])
        movie.update(details) # Merge the details into the existing movie dictionary
        time.sleep(0.05) # Small delay for detailed API calls to avoid hitting rate limits
    except requests.exceptions.RequestException as e:
        print(f"Error fetching details for movie {movie['id']}: {e}")
        # Optionally, you can decide to skip this movie or re-try
        continue

len(movies) # This will still be the same length, but movies dictionaries are now enriched.

Fetching initial movie list: 100%|██████████| 500/500 [01:01<00:00,  8.07it/s]


Initial list of 9999 movies fetched.


Fetching detailed movie info:  13%|█▎        | 1304/9999 [01:59<11:13, 12.91it/s]

Error fetching details for movie 1029955: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


Fetching detailed movie info: 100%|██████████| 9999/9999 [15:03<00:00, 11.06it/s]


9999

In [10]:
movies = []

for page in range(1, 501):  # ~10k movies
    r = requests.get(
        f"{BASE_URL}/discover/movie",
        params={
            "api_key": API_KEY,
            "primary_release_date.gte": "2010-01-01",
            "primary_release_date.lte": "2025-12-31",
            "sort_by": "popularity.desc",
            "page": page
        }
    )
    if r.status_code != 200:
        break
    movies.extend(r.json()["results"])
    time.sleep(0.25)

**INCLUDING DIRECTOR'S STRENGTH**

In [20]:
import time
import pandas as pd
from tqdm import tqdm
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# BASE_URL and API_KEY are already defined in the notebook context
# BASE_URL = "https://api.themoviedb.org/3"
# API_KEY = "613cb6fdb467ef300725744de1797cb4"

full_movies = []

# Configure requests session with retries
session = requests.Session()
retry = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)
session.mount('https://', adapter)

# The initial 'movies' list is already populated from cell ZFFt2ZOOPKVO.
# Now, fetch additional details for each movie using the improved movie_details function
for m in tqdm(movies, desc="Fetching detailed movie info"):
    try:
        details = movie_details(m["id"])
        # Merge the basic movie info with the detailed info including director
        movie_with_details = m.copy()
        movie_with_details.update(details)
        full_movies.append(movie_with_details)
        time.sleep(0.05) # Small delay for detailed API calls to avoid hitting rate limits
    except requests.exceptions.RequestException as e:
        print(f"Error fetching details for movie {m['id']}: {e}")
        # Continue to the next movie if an error occurs
        continue

df = pd.DataFrame(full_movies)
df = df[df["revenue"] > 0]  # keep only movies with revenue
df["release_date"] = pd.to_datetime(df["release_date"], errors='coerce')
df = df[df["release_date"].dt.year >= 2010]


Fetching detailed movie info: 100%|██████████| 9999/9999 [36:31<00:00,  4.56it/s]


In [21]:
df['director'] = df['director']
director_strength = df.groupby('director')['revenue'].mean().reset_index().rename(columns={'revenue':'director_strength'})
df = df.merge(director_strength, on='director', how='left')

ASSUMING OPENING WEEK COLLECTION 30% OF TOTAL REVENUE

In [23]:
df["opening_week_domestic"] = df["revenue"] * 0.30


**MOVIE DATASET SINCE 2010**

In [24]:
df.to_csv("movies_since_2010_opening_week.csv", index=False)
df.shape


(2941, 22)

In [25]:
df.head(3)

Unnamed: 0,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,...,vote_average,vote_count,runtime,budget,genres,franchise,revenue,director,director_strength,opening_week_domestic
0,False,/aHj7d7wSLqrg5cjAcgHhiGr97Ih.jpg,"[28, 53, 878]",798645,en,The Running Man,"Desperate to save his sick daughter, working-c...",418.0257,/dKL78O9zxczVgjtNcQ9UkbYLzqX.jpg,2025-11-11,...,6.871,591,133,110000000,"Action,Thriller,Science Fiction",0,68391082,Edgar Wright,83216990.0,20517324.6
1,False,/5h2EsPKNDdB3MAtOk9MB9Ycg9Rz.jpg,"[16, 35, 12, 10751, 9648]",1084242,en,Zootopia 2,After cracking the biggest case in Zootopia's ...,382.4108,/oJ7g2CifqpStmoYQyaLQgEU32qO.jpg,2025-11-26,...,7.6,641,107,150000000,"Animation,Comedy,Adventure,Family,Mystery",1,1137444817,Jared Bush,1137445000.0,341233445.1
2,False,/yCatt8lmp3oRFEcOZF8KHhsiASQ.jpg,"[28, 10752]",1223601,fi,Sisu 2,Returning to the house where his family was br...,386.9594,/jNsttCWZyPtW66MjhUozBzVsRb7.jpg,2025-10-21,...,7.682,110,89,12200000,"Action,War",1,9169696,Jalmari Helander,11725450.0,2750908.8


**LOAD DATA**

In [26]:
import pandas as pd
import numpy as np

df = pd.read_csv("movies_since_2010_opening_week.csv")


**DROP COLUMNS FOR PREPROCESSING**

In [27]:
DROP_COLS = [
    "title", "revenue", "opening_week_domestic", "release_date"
]

**FEATURE SELECTION FOR TARGET PREDICTION**

In [28]:
FEATURES = [
    "budget",
    "popularity",
    "runtime",
    "vote_average",
    "vote_count"
]

X = df[FEATURES]
y = df["opening_week_domestic"]


**TRAIN-TEST-SPLIT**

In [29]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


**MODEL SELECTION & TRAINING**

In [30]:
!pip install xgboost




In [31]:
from xgboost import XGBRegressor

xgb = XGBRegressor(
    n_estimators=800,
    max_depth=6,
    learning_rate=0.03,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="reg:squarederror",
    random_state=42
)

xgb.fit(X_train, y_train)


In [32]:
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor


In [33]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

mlp = MLPRegressor(
    hidden_layer_sizes=(128, 64),
    activation="relu",
    solver="adam",
    max_iter=500,
    random_state=42
)

mlp.fit(X_train_scaled, y_train)




**MODEL EVALUATION**

In [34]:
from sklearn.metrics import mean_absolute_error, r2_score

xgb_preds = xgb.predict(X_test)
mlp_preds = mlp.predict(X_test_scaled)

print("XGBoost MAE:", mean_absolute_error(y_test, xgb_preds))
print("MLP MAE:", mean_absolute_error(y_test, mlp_preds))

print("XGBoost R²:", r2_score(y_test, xgb_preds))
print("MLP R²:", r2_score(y_test, mlp_preds))


XGBoost MAE: 20905700.43957018
MLP MAE: 32299547.105520137
XGBoost R²: 0.6700475889630177
MLP R²: -0.02593229675289077


**ENSEMBLE PREDICTION: XGB+MLP**

In [35]:
ensemble_preds = (xgb_preds + mlp_preds) / 2

print("Ensemble MAE:", mean_absolute_error(y_test, ensemble_preds))
print("Ensemble R²:", r2_score(y_test, ensemble_preds))


Ensemble MAE: 21330742.42382545
Ensemble R²: 0.500694238186019


PREDICTION FOR AVATAR BASED ON LOG BUDGET, HIGH FRANCHISE POPULARITY, ETC

In [36]:
avatar = pd.DataFrame([{
    "budget": 400_000_000,     # estimated
    "popularity": 500,         # extremely high franchise hype
    "runtime": 190,
    "vote_average": 0,         # unknown pre-release → set 0
    "vote_count": 0
}])


In [37]:
avatar_scaled = scaler.transform(avatar)


ESTIMATED PREDICTION

In [45]:
xgb_pred = xgb.predict(avatar)[0]
mlp_pred = mlp.predict(avatar_scaled)[0]

final_prediction = (xgb_pred + mlp_pred) / 2

int(final_prediction)


188833767