In [6]:
import os
import json
import time
import requests
import pandas as pd
from tqdm import tqdm

# =========================
# CONFIG
# =========================
# The API key is read from the environment only. Never commit a real key to the
# notebook; any key that was previously committed here should be rotated.
# "YOUR_KEY_HERE" is the placeholder that tmdb_get() rejects before sending a request.
TMDB_API_KEY = os.getenv("TMDB_API_KEY", "YOUR_KEY_HERE")
BASE_URL = "https://api.themoviedb.org/3"

START_YEAR = 2010      # first release year passed to /discover/movie (inclusive)
END_YEAR = 2025        # last release year passed to /discover/movie (inclusive)
PAGES_PER_YEAR = 30    # number of discover pages requested for each year
TOP_N_ACTORS = 5       # number of top-billed cast members flattened into each row

# NOTE(review): the file names say 2020 although START_YEAR is 2010. The names are
# kept unchanged so anything that already reads these files keeps working.
OUT_JSONL = "tmdb_2020_2025_enriched.jsonl"
OUT_CSV = "tmdb_2020_2025_enriched.csv"
CACHE_DIR = "person_cache"  # one JSON file per person id is stored here

os.makedirs(CACHE_DIR, exist_ok=True)

SLEEP = 0.12   # seconds to pause after every successful request
TIMEOUT = 30   # per-request timeout (seconds) handed to requests.get


# =========================
# TMDB REQUEST HELPER
# =========================
def tmdb_get(path, params=None):
    """Send a GET request to the TMDB API and return the decoded JSON body.

    Args:
        path: Endpoint path appended to BASE_URL, e.g. "/discover/movie".
        params: Optional mapping of query parameters. It is copied before the
            API key is added, so the caller's dict is never modified.

    Returns:
        The JSON-decoded response body.

    Raises:
        RuntimeError: If no API key is configured, or if all five attempts
            were answered with HTTP 429.
        requests.HTTPError: For any other unsuccessful status code.
    """
    if not TMDB_API_KEY or TMDB_API_KEY == "YOUR_KEY_HERE":
        raise RuntimeError("Set your TMDB_API_KEY")

    url = f"{BASE_URL}{path}"
    # Work on a copy: injecting the key into the caller's dict would leak it
    # into whatever the caller does with that dict afterwards.
    query = dict(params or {})
    query["api_key"] = TMDB_API_KEY

    for _ in range(5):
        r = requests.get(url, params=query, timeout=TIMEOUT)
        if r.status_code == 429:
            # Prefer the server-provided back-off; fall back to the fixed delay
            # when the header is absent or not a plain number of seconds.
            try:
                wait = float(r.headers.get("Retry-After", 1.5))
            except (TypeError, ValueError):
                wait = 1.5
            time.sleep(max(wait, 1.5))
            continue
        r.raise_for_status()
        time.sleep(SLEEP)  # pause after every successful request
        return r.json()

    raise RuntimeError(f"Failed request: {url}")


# =========================
# DISCOVER MOVIES
# =========================
def discover_movie_ids():
    """Collect movie ids from /discover/movie for every configured year.

    For each year from START_YEAR to END_YEAR (inclusive) up to PAGES_PER_YEAR
    pages are requested, sorted by descending popularity. Paging for a year
    stops early once a page comes back empty or the reported ``total_pages``
    has been reached, so no requests are spent past the end of the result set.

    Returns:
        A list of movie ids with duplicates removed, in first-seen order.
    """
    ids = []

    for year in range(START_YEAR, END_YEAR + 1):
        print(f"\nDiscovering year {year}...")
        for page in range(1, PAGES_PER_YEAR + 1):
            data = tmdb_get(
                "/discover/movie",
                {
                    "page": page,
                    "primary_release_date.gte": f"{year}-01-01",
                    "primary_release_date.lte": f"{year}-12-31",
                    "sort_by": "popularity.desc",
                    "include_adult": "false",
                },
            )
            results = data.get("results", [])
            if not results:
                # An empty page means the year's results are exhausted.
                break
            ids.extend(m["id"] for m in results if "id" in m)

            total_pages = data.get("total_pages")
            if isinstance(total_pages, int) and page >= total_pages:
                break

    # dict.fromkeys de-duplicates while keeping the first occurrence order.
    return list(dict.fromkeys(ids))


# =========================
# PERSON CACHE
# =========================
def get_person_cached(person_id):
    """Return the TMDB person record for ``person_id``, using an on-disk cache.

    The record is read from ``CACHE_DIR/<person_id>.json`` when that file
    exists and holds valid JSON. Otherwise it is fetched from
    ``/person/<person_id>`` and stored for later calls.

    Args:
        person_id: TMDB person id; used in the cache file name and the URL.

    Returns:
        The decoded person record.

    Raises:
        Whatever ``tmdb_get`` raises when the record has to be fetched.
    """
    cache_path = os.path.join(CACHE_DIR, f"{person_id}.json")

    if os.path.exists(cache_path):
        try:
            with open(cache_path, "r", encoding="utf-8") as f:
                return json.load(f)
        except ValueError:
            # Covers json.JSONDecodeError and UnicodeDecodeError: a truncated or
            # corrupt cache file (e.g. from an interrupted run) is refetched
            # instead of failing every movie that references this person.
            pass

    person = tmdb_get(f"/person/{person_id}", {})

    # Write to a temp file and rename it into place so an interruption cannot
    # leave a half-written cache entry behind.
    tmp_path = f"{cache_path}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(person, f)
    os.replace(tmp_path, cache_path)

    return person


# =========================
# FETCH MOVIE WITH CREDITS
# =========================
def fetch_movie(movie_id):
    """Fetch the detail record for one movie.

    The request asks TMDB to append the ``credits`` and ``keywords``
    sub-resources to the response.
    """
    endpoint = f"/movie/{movie_id}"
    query = {"append_to_response": "credits,keywords"}
    return tmdb_get(endpoint, query)


# =========================
# BUILD ROW
# =========================
def build_row(movie_json):
    """Flatten one movie payload into a single flat dict.

    The row holds, in this order: scalar movie fields, genre and keyword name
    lists, details of the first crew member whose job is "Director", details
    of the first TOP_N_ACTORS cast members by ``order``, and the mean and max
    popularity of those cast members. Person details come from
    ``get_person_cached``. Missing director or actor slots are filled with
    ``None`` (``0.0`` for popularity).
    """
    # ---- Movie Metadata ----
    row = {"movie_id": movie_json.get("id")}
    for field in (
        "title",
        "release_date",
        "runtime",
        "original_language",
        "popularity",
        "vote_average",
        "vote_count",
        "budget",
        "revenue",
        "status",
        "overview",
    ):
        row[field] = movie_json.get(field)

    row["genres"] = [genre["name"] for genre in movie_json.get("genres", [])]
    keyword_block = movie_json.get("keywords", {})
    row["keywords"] = [entry["name"] for entry in keyword_block.get("keywords", [])]

    # ---- Director ----
    crew_members = movie_json.get("credits", {}).get("crew", [])
    directing = [member for member in crew_members if member.get("job") == "Director"]

    if not directing:
        row.update(
            director_id=None,
            director_name=None,
            director_gender=None,
            director_popularity=0.0,
            director_department=None,
        )
    else:
        # Only the first listed director is recorded.
        lead = directing[0]
        profile = get_person_cached(lead["id"])
        row.update(
            director_id=lead["id"],
            director_name=lead.get("name"),
            director_gender=profile.get("gender"),
            director_popularity=profile.get("popularity"),
            director_department=profile.get("known_for_department"),
        )

    # ---- Top Cast ----
    billing = list(movie_json.get("credits", {}).get("cast", []))
    # Entries without an "order" sort after everything else.
    billing.sort(key=lambda member: member.get("order", 99999))
    leads = billing[:TOP_N_ACTORS]

    known_pops = []

    for slot in range(1, TOP_N_ACTORS + 1):
        prefix = f"actor{slot}"

        if slot > len(leads):
            # Fewer cast members than slots: emit placeholder columns.
            row[f"{prefix}_id"] = None
            row[f"{prefix}_name"] = None
            row[f"{prefix}_character"] = None
            row[f"{prefix}_gender"] = None
            row[f"{prefix}_popularity"] = 0.0
            row[f"{prefix}_department"] = None
            continue

        actor = leads[slot - 1]
        profile = get_person_cached(actor["id"])

        row[f"{prefix}_id"] = actor["id"]
        row[f"{prefix}_name"] = actor.get("name")
        row[f"{prefix}_character"] = actor.get("character")
        row[f"{prefix}_gender"] = profile.get("gender")
        row[f"{prefix}_popularity"] = profile.get("popularity")
        row[f"{prefix}_department"] = profile.get("known_for_department")

        # A missing popularity counts as 0.0 in the aggregates; an explicit
        # None is left out of them.
        pop = profile.get("popularity", 0.0)
        if pop is not None:
            known_pops.append(pop)

    # ---- Cast Aggregates ----
    if known_pops:
        row["cast_pop_mean"] = sum(known_pops) / len(known_pops)
        row["cast_pop_max"] = max(known_pops)
    else:
        row["cast_pop_mean"] = 0.0
        row["cast_pop_max"] = 0.0

    return row


# =========================
# RUN EXTRACTION
# =========================
# Step 1: collect the ids of every movie to enrich.
movie_ids = discover_movie_ids()
print(f"\nTotal movies discovered: {len(movie_ids)}")

rows = []
failed_ids = []  # ids of movies skipped because fetching or row-building raised

# Step 2: fetch each movie, flatten it, and stream the row to the JSONL file.
with open(OUT_JSONL, "w", encoding="utf-8") as f:
    for mid in tqdm(movie_ids, desc="Fetching movies"):
        try:
            mj = fetch_movie(mid)
            row = build_row(mj)
            # Write first, then append, so a row never ends up in the CSV
            # without also being in the JSONL file.
            f.write(json.dumps(row, ensure_ascii=False) + "\n")
            rows.append(row)
        except Exception as e:
            # Best-effort: one bad movie must not abort a long extraction,
            # but keep a record of what was dropped.
            failed_ids.append(mid)
            print(f"Error for movie {mid}: {e}")

# Step 3: persist the same rows as CSV.
df = pd.DataFrame(rows)
df.to_csv(OUT_CSV, index=False)

print("\nExtraction complete.")
print("Saved:", OUT_JSONL)
print("Saved:", OUT_CSV)
print("Shape:", df.shape)
if failed_ids:
    print(f"Skipped {len(failed_ids)} movies due to errors: {failed_ids}")

df.head()



Discovering year 2010...

Discovering year 2011...

Discovering year 2012...

Discovering year 2013...

Discovering year 2014...

Discovering year 2015...

Discovering year 2016...

Discovering year 2017...

Discovering year 2018...

Discovering year 2019...

Discovering year 2020...

Discovering year 2021...

Discovering year 2022...

Discovering year 2023...

Discovering year 2024...

Discovering year 2025...

Total movies discovered: 9548


Fetching movies: 100%|██████████| 9548/9548 [26:11<00:00,  6.08it/s]



Extraction complete.
Saved: tmdb_2020_2025_enriched.jsonl
Saved: tmdb_2020_2025_enriched.csv
Shape: (9548, 51)


Unnamed: 0,movie_id,title,release_date,runtime,original_language,popularity,vote_average,vote_count,budget,revenue,...,actor4_popularity,actor4_department,actor5_id,actor5_name,actor5_character,actor5_gender,actor5_popularity,actor5_department,cast_pop_mean,cast_pop_max
0,27205,Inception,2010-07-15,148,en,32.8952,8.37,38655,160000000,839030630,...,9.6156,Acting,27578.0,Elliot Page,Ariadne,3.0,4.8289,Acting,7.38976,12.2774
1,38757,Tangled,2010-11-24,100,en,19.876,7.61,12179,260000000,592461732,...,4.9563,Acting,22132.0,M.C. Gainey,Captain of the Guard (voice),2.0,2.1777,Acting,2.62706,4.9563
2,10138,Iron Man 2,2010-04-28,124,en,13.79,6.85,22057,200000000,623933331,...,17.8153,Acting,6807.0,Sam Rockwell,Justin Hammer,2.0,4.4974,Acting,8.101,17.8153
3,20352,Despicable Me,2010-07-08,95,en,18.4625,7.3,15829,69000000,543284256,...,0.7743,Acting,122851.0,Elsie Fisher,Agnes (voice),1.0,2.4183,Acting,3.36062,5.8392
4,12444,Harry Potter and the Deathly Hallows: Part 1,2010-11-17,146,en,13.8146,7.736,20083,250000000,954305868,...,3.1401,Acting,1283.0,Helena Bonham Carter,Bellatrix Lestrange,1.0,4.5939,Acting,5.72064,9.4552


In [2]:
# Display the DataFrame; the rendered table elides middle rows and columns with "...".
# NOTE(review): execution counts are out of order (this cell is In [2], the
# extraction cell is In [6]) — re-run top to bottom so this reflects the current df.
df

Unnamed: 0,movie_id,title,release_date,runtime,original_language,popularity,vote_average,vote_count,budget,revenue,...,actor4_popularity,actor4_department,actor5_id,actor5_name,actor5_character,actor5_gender,actor5_popularity,actor5_department,cast_pop_mean,cast_pop_max
0,27205,Inception,2010-07-15,148,en,32.8952,8.370,38655,160000000,839030630,...,9.6156,Acting,27578.0,Elliot Page,Ariadne,3.0,4.8289,Acting,7.38976,12.2774
1,38757,Tangled,2010-11-24,100,en,19.8760,7.610,12179,260000000,592461732,...,4.9563,Acting,22132.0,M.C. Gainey,Captain of the Guard (voice),2.0,2.1777,Acting,2.62706,4.9563
2,10138,Iron Man 2,2010-04-28,124,en,13.7900,6.850,22057,200000000,623933331,...,17.8153,Acting,6807.0,Sam Rockwell,Justin Hammer,2.0,4.4974,Acting,8.10100,17.8153
3,20352,Despicable Me,2010-07-08,95,en,18.4625,7.300,15829,69000000,543284256,...,0.7743,Acting,122851.0,Elsie Fisher,Agnes (voice),1.0,2.4183,Acting,3.36062,5.8392
4,12444,Harry Potter and the Deathly Hallows: Part 1,2010-11-17,146,en,13.8146,7.736,20083,250000000,954305868,...,3.1401,Acting,1283.0,Helena Bonham Carter,Bellatrix Lestrange,1.0,4.5939,Acting,5.72064,9.4552
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9285,732707,Vasiliy,2025-01-23,0,ru,4.4978,7.200,6,0,2558496,...,0.5058,Acting,235919.0,Yan Tsapnik,Sergey Ivanovich,2.0,0.9664,Acting,1.26738,2.4987
9286,1149953,Bunker,2025-03-07,0,en,4.4976,8.500,8,0,0,...,2.5786,Acting,112328.0,Christopher Backus,Soren,2.0,1.1103,Acting,3.16590,5.0480
9287,1297763,Batman Ninja vs. Yakuza League,2025-03-17,90,ja,4.4868,6.753,219,0,0,...,3.4360,Acting,992869.0,Ayane Sakura,Green Lantern (voice),1.0,4.1109,Acting,4.19348,5.0683
9288,1418736,Karmadonna,2025-09-10,118,sr,4.4819,7.000,1,0,0,...,0.3983,Acting,1392578.0,Branislav Jevtić,Duško,0.0,1.7842,Acting,0.95142,1.7842


In [3]:
# Print the per-column non-null counts and dtypes of df.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9290 entries, 0 to 9289
Data columns (total 51 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   movie_id             9290 non-null   int64  
 1   title                9290 non-null   object 
 2   release_date         9290 non-null   object 
 3   runtime              9290 non-null   int64  
 4   original_language    9290 non-null   object 
 5   popularity           9290 non-null   float64
 6   vote_average         9290 non-null   float64
 7   vote_count           9290 non-null   int64  
 8   budget               9290 non-null   int64  
 9   revenue              9290 non-null   int64  
 10  status               9290 non-null   object 
 11  overview             9290 non-null   object 
 12  genres               9290 non-null   object 
 13  keywords             9290 non-null   object 
 14  director_id          9218 non-null   float64
 15  director_name        9218 non-null   o

In [4]:
# (rows, columns) of df.
df.shape

(9290, 51)

In [5]:
# Make sure the target directory exists, then write without the RangeIndex
# (consistent with the OUT_CSV export) so re-reading the file does not yield
# an extra unnamed index column.
os.makedirs('data', exist_ok=True)
df.to_csv('data/movies_2010_2025.csv', index=False)