In [3]:
import os
import time
from datetime import datetime

import pandas as pd
import requests

# TMDB API key, read from the TMDB_API_KEY environment variable so the secret
# is not stored in (or shared along with) the notebook itself.
API_KEY = os.environ.get("TMDB_API_KEY", "")
if not API_KEY:
    print("Warning: TMDB_API_KEY is not set; TMDB requests will be sent without a key.")

def discover_movies(year, pages=10, timeout=30):
    """Return TMDB movie IDs whose primary release year is ``year``.

    Requests the ``/discover/movie`` result pages in order, starting at
    page 1, and collects the ``id`` of every entry in each page's
    ``results`` list.

    Args:
        year: Value sent as the ``primary_release_year`` filter.
        pages: Maximum number of result pages to request. Values above 500
            are clamped to 500 (the maximum this function documents).
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        list: Movie IDs in the order the API returned them. The list can be
        shorter than ``pages`` worth of results when the API reports fewer
        pages, returns an empty page, or a request fails (a message is
        printed and paging stops for this year).
    """
    url = "https://api.themoviedb.org/3/discover/movie"
    ids = []
    for page in range(1, min(pages, 500) + 1):
        params = {
            "api_key": API_KEY,
            "language": "en-US",
            "primary_release_year": year,
            "page": page
        }
        try:
            response = requests.get(url, params=params, timeout=timeout)
            response.raise_for_status()
            payload = response.json()
        except (requests.RequestException, ValueError) as e:
            # Report the failure instead of silently treating an error
            # payload as "no results"; keep whatever was gathered so far.
            print(f"Error discovering {year} page {page}: {e}")
            break

        results = payload.get("results") or []
        ids.extend(m["id"] for m in results)

        # Stop early once the API has nothing more to offer for this year.
        total_pages = payload.get("total_pages")
        if not results or (total_pages is not None and page >= total_pages):
            break
    return ids

In [4]:
def get_movie_full(movie_id, timeout=30):
    """Fetch one movie's details plus credits from TMDB as a flat record.

    Calls ``/movie/{movie_id}`` with ``append_to_response=credits`` and
    flattens the response into a single dict suitable for a DataFrame row.

    Args:
        movie_id: TMDB movie identifier interpolated into the request URL.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        dict: Keys ``id``, ``title``, ``date_string``, ``language``,
        ``genre`` (comma-joined genre IDs), ``duration``, ``rating``,
        ``rating_count``, ``cast1``..``cast3`` (first three cast names,
        ``None`` where unavailable), ``director`` (name of the first crew
        entry whose job is "Director", else ``None``), ``budget``,
        ``box_office`` and ``spoken_languages`` (comma-joined English names).

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    url = f"https://api.themoviedb.org/3/movie/{movie_id}"
    params = {
        "api_key": API_KEY,
        "language": "en-US",
        "append_to_response": "credits"
    }
    response = requests.get(url, params=params, timeout=timeout)
    # Surface the real HTTP error instead of failing later with an opaque
    # KeyError on the "credits" key that error payloads do not contain.
    response.raise_for_status()
    r = response.json()

    # Tolerate a missing or null credits section and incomplete entries.
    credits = r.get("credits") or {}
    crew = credits.get("crew") or []
    cast = credits.get("cast") or []

    # Extract director (first crew member with the "Director" job)
    director = next(
        (member.get("name") for member in crew if member.get("job") == "Director"),
        None,
    )

    # Extract top 3 cast, padded with None so cast1..cast3 always exist
    cast_names = [member.get("name") for member in cast[:3]]
    cast_names += [None] * (3 - len(cast_names))

    return {
        "id": r.get("id"),
        "title": r.get("title"),
        "date_string": r.get("release_date"),
        "language": r.get("original_language"),
        "genre": ",".join(str(g["id"]) for g in (r.get("genres") or []) if "id" in g),
        "duration": r.get("runtime"),
        "rating": r.get("vote_average"),
        "rating_count": r.get("vote_count"),
        "cast1": cast_names[0],
        "cast2": cast_names[1],
        "cast3": cast_names[2],
        "director": director,
        "budget": r.get("budget"),
        "box_office": r.get("revenue"),
        "spoken_languages": ",".join(
            lang["english_name"]
            for lang in (r.get("spoken_languages") or [])
            if lang.get("english_name")
        )
    }

In [5]:
def collect_all_years(start_year=1900, end_year=None, pages=10,
                      output_path="tmdb_all_movies.csv", delay=0.25):
    """Collect movie records for every year in a range and save them as CSV.

    For each year from ``start_year`` to ``end_year`` (inclusive) this looks
    up movie IDs with ``discover_movies`` and fetches each movie with
    ``get_movie_full``. Failures on a single year or movie are printed and
    skipped. The CSV is written even if the run is interrupted, so the
    records gathered up to that point are not lost.

    Args:
        start_year: First year to collect.
        end_year: Last year to collect; defaults to the current year.
        pages: Maximum number of discover pages requested per year.
        output_path: Destination of the CSV file.
        delay: Seconds to sleep after each movie request, to avoid hitting
            the rate limit.

    Returns:
        pandas.DataFrame: One row per successfully fetched movie.
    """
    if end_year is None:
        end_year = datetime.now().year

    all_movies = []   # records for all years, converted to a frame once
    seen_ids = set()  # avoids fetching an ID again if discovery repeats it

    try:
        for year in range(start_year, end_year + 1):
            print(f"Collecting {year} ...")
            try:
                ids = discover_movies(year, pages=pages)
            except Exception as e:
                # Same best-effort policy as for single movies: report the
                # failure and move on rather than aborting the whole crawl.
                print("Error discovering", year, e)
                continue

            for mid in ids:
                if mid in seen_ids:
                    continue
                seen_ids.add(mid)
                try:
                    all_movies.append(get_movie_full(mid))
                except Exception as e:
                    print("Error on", mid, e)
                time.sleep(delay)  # avoid hitting rate limit

            print(f"Finished {year} ({len(all_movies)} movies total so far)")
    finally:
        # Runs on normal completion and on interruption/error alike, so a
        # long crawl never ends without its results on disk.
        df = pd.DataFrame(all_movies)
        df.to_csv(output_path, index=False)
        print(f"Saved {len(df)} movies from {start_year}–{end_year}")
    return df

In [6]:
if __name__ == "__main__":
    # Keep the returned DataFrame so later cells can inspect it without
    # re-reading the CSV or repeating the crawl.
    movies_df = collect_all_years(start_year=1900, pages=10)

Collecting 1900 ...
Finished 1900 (200 movies total so far)
Collecting 1901 ...
Finished 1901 (400 movies total so far)
Collecting 1902 ...
Finished 1902 (600 movies total so far)
Collecting 1903 ...
Finished 1903 (800 movies total so far)
Collecting 1904 ...
Finished 1904 (1000 movies total so far)
Collecting 1905 ...
Finished 1905 (1200 movies total so far)
Collecting 1906 ...
Finished 1906 (1400 movies total so far)
Collecting 1907 ...
Finished 1907 (1600 movies total so far)
Collecting 1908 ...
Finished 1908 (1800 movies total so far)
Collecting 1909 ...
Finished 1909 (2000 movies total so far)
Collecting 1910 ...
Finished 1910 (2200 movies total so far)
Collecting 1911 ...
Finished 1911 (2400 movies total so far)
Collecting 1912 ...
Finished 1912 (2600 movies total so far)
Collecting 1913 ...
Finished 1913 (2800 movies total so far)
Collecting 1914 ...
Finished 1914 (3000 movies total so far)
Collecting 1915 ...
Finished 1915 (3200 movies total so far)
Collecting 1916 ...
Finished