In [1]:
import os
from dotenv import load_dotenv
import requests
import json
import pandas as pd
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

In [2]:
# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# TMDB API key; kept out of the notebook source and read from the environment.
TMDB_API_KEY_MOVIE = os.getenv("TMDB_API_KEY_MOVIE")
if not TMDB_API_KEY_MOVIE:
    # Fail fast: without a key no request in this notebook can be authenticated.
    raise RuntimeError(
        "TMDB_API_KEY_MOVIE is not set; define it in the environment or in a .env file."
    )

# JSON-lines file whose records carry the movie ids to fetch (one record per line).
INPUT_PATH = "../../data/id_list/movie_ids_12_31_2025.jsonl"
# JSON-lines file the fetched movie records are appended to.
OUTPUT_PATH = "../../data/movie/movie_12_31_2025.jsonl"
# Number of worker threads used for concurrent API requests.
MAX_WORKERS = 5
# Number of input records read (and fetched) per batch.
CHUNK_SIZE = 25

# Confirm the key was found without echoing the secret into the saved cell output.
print("TMDB_API_KEY_MOVIE loaded")

[REDACTED: API key removed from saved notebook output]


In [3]:
# Base endpoint for movie resources on the TMDB v3 API.
url_movie_detail = "https://api.themoviedb.org/3/movie"

# Ask the API for JSON responses.
headers_detail = {"Accept": "application/json"}

# Query-string parameters sent with every request built from the URL above.
params_details = dict(
    api_key=TMDB_API_KEY_MOVIE,
    language="en-US",
    append_to_response="videos,images",
)


In [4]:
def _extract_credit_records(credit_info, section, field_map):
    """Project every entry of ``credit_info[section]`` onto the keys in ``field_map``.

    ``field_map`` maps output key -> key in the API entry. Returns an empty list
    (and prints a notice) when the section is absent from the payload.
    """
    entries = credit_info.get(section)
    if entries is None:
        print(f"No {section} found")
        return []
    return [
        {out_key: entry.get(src_key) for out_key, src_key in field_map.items()}
        for entry in entries
    ]


def get_movie_data(movie_id, timeout=30):
    """Get movie detail, cast and crew data from the API by movie id.

    Two GET requests are issued: one for the movie itself and one for its
    ``/credits`` sub-resource. The detail payload is stored unmodified; cast and
    crew entries are reduced to a small set of fields.

    Args:
        movie_id: Id appended to ``url_movie_detail`` to address the movie.
        timeout: Seconds each HTTP request may wait before ``requests`` raises.
            Without a timeout a stalled connection would block the calling
            thread indefinitely.

    Returns:
        dict with the keys ``movie_id``, ``movie_detail`` (decoded JSON body of
        the detail request), ``casts_info`` and ``crews_info`` (lists of dicts).

    Raises:
        requests.RequestException: On connection failure or timeout.
        Any exception raised by ``Response.json()`` when a body is not valid JSON.
    """
    # Get response from API
    movie_resp = requests.get(
        f"{url_movie_detail}/{movie_id}",
        headers=headers_detail,
        params=params_details,
        timeout=timeout,
    )
    credit_resp = requests.get(
        f"{url_movie_detail}/{movie_id}/credits",
        headers=headers_detail,
        params=params_details,
        timeout=timeout,
    )
    # NOTE(review): HTTP status codes are not checked, so an error payload is
    # stored as ``movie_detail`` and shows up as "No cast/crew found" - confirm
    # whether such records should be skipped instead.
    credit_info = credit_resp.json()

    # Prepare cast data
    cast_list = _extract_credit_records(
        credit_info,
        "cast",
        {
            "person_id": "id",
            "known_for_department": "known_for_department",
            "cast_id": "cast_id",
            "character": "character",
            "credit_id": "credit_id",
        },
    )

    # Prepare crew data. A missing crew section yields an empty crew list and
    # must not discard the cast that was already collected.
    crew_list = _extract_credit_records(
        credit_info,
        "crew",
        {
            "person_id": "id",
            "known_for_department": "known_for_department",
            "department": "department",
            "job": "job",
        },
    )

    # Create final data
    return {
        "movie_id": movie_id,
        "movie_detail": movie_resp.json(),
        "casts_info": cast_list,
        "crews_info": crew_list,
    }

def count_lines(file_path):
    """Return how many lines the file at ``file_path`` contains."""
    line_total = 0
    # Binary mode: lines are split on b"\n" and no text decoding takes place.
    with open(file_path, "rb") as handle:
        for _line in handle:
            line_total += 1
    return line_total

In [5]:
# Count the input lines up front so the progress bar can show a total and an ETA.
total_lines = count_lines(INPUT_PATH)

# 0-based position of the current record in the input file (shown in the bar).
idx_line = 0

# Context managers release the chunked reader, the worker pool and the progress
# bar even when the run is interrupted (e.g. KeyboardInterrupt). The pool is
# created once and reused for every chunk instead of being rebuilt per chunk.
# NOTE: OUTPUT_PATH is opened in append mode and the reader always starts at the
# first input line, so re-running after an interruption appends records again.
with pd.read_json(INPUT_PATH, lines=True, chunksize=CHUNK_SIZE) as reader, \
        ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor, \
        tqdm(total=total_lines, desc="Sedding movie data", unit=" movie") as pbar:

    for chunk in reader:
        batch_list = chunk.to_dict(orient="records")
        movie_ids = [item["id"] for item in batch_list]
        results = []

        # Submit the whole chunk, then collect in submission order so that
        # output order and idx_line follow the order of the input file.
        futures = [executor.submit(get_movie_data, movie_id) for movie_id in movie_ids]

        for movie_id, future in zip(movie_ids, futures):
            try:
                movie_data = future.result()
                if movie_data:
                    results.append(movie_data)
            except Exception as e:
                # Name the failing movie so it can be retried later.
                pbar.write(f"Error in thread (movie_id={movie_id}): {e}")
            finally:
                pbar.set_postfix({"idx_in_file": idx_line}, refresh=False)
                pbar.update(1)
                idx_line += 1

        # Persist each chunk as soon as it is complete; closing the file per
        # chunk flushes it, so finished chunks survive a later interruption.
        with open(OUTPUT_PATH, "a", encoding="utf-8") as f:
            for res in results:
                f.write(json.dumps(res) + "\n")

print("Done!")

Sedding movie data:   0%|          | 1/1144617 [00:01<546:49:01,  1.72s/ movie, idx_in_file=0]

No cast found
No crew found


Sedding movie data:   0%|          | 4718/1144617 [21:57<87:51:54,  3.60 movie/s, idx_in_file=4717] 

No cast found
No crew found


Sedding movie data:   1%|          | 13031/1144617 [1:01:30<106:38:12,  2.95 movie/s, idx_in_file=13030]

KeyboardInterrupt: 