### Data Crawling

#### 1. Batch Processing Strategy
Since the ID list contains millions of entries, the crawling process is designed for **multi-day execution**. The dataset is partitioned into smaller batches (100,000 IDs per batch) to ensure systematic processing, maintain data integrity, and prevent data loss during unexpected interruptions.

#### 2. Runtime & Resource Management
When utilizing cloud environments like **Kaggle**, it is crucial to account for session time limits (12-hour window). Users should estimate the average processing time per batch and configure an optimal `RUN_LIMIT` to ensure the script terminates gracefully and saves progress before a session timeout occurs.

In [41]:
import os
from dotenv import load_dotenv
import requests
import json
import pandas as pd
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

In [None]:
# Load variables from a local .env file into the process environment.
load_dotenv()

# TMDB API key used for the person requests; None when the variable is unset.
TMDB_API_KEY_PERSON = os.getenv("TMDB_API_KEY_PERSON")

# Input list of person ids (JSON Lines) and the JSON Lines file results are
# appended to.
INPUT_PATH = "../../data/id_list/person_ids_12_31_2025.jsonl"
OUTPUT_PATH = "data/person/person_12_31_2025.jsonl"
# Number of worker threads issuing requests concurrently.
MAX_WORKERS = 3
# Number of input rows read, fetched and written per iteration.
CHUNK_SIZE = 25

# Report only whether the key was found: printing the key itself would store
# the secret in the saved notebook output.
print("TMDB_API_KEY_PERSON loaded:", bool(TMDB_API_KEY_PERSON))

In [None]:
# Base endpoint for person lookups; the person id is appended per request.
url_person_detail = "https://api.themoviedb.org/3/person"

# Request headers: ask for a JSON response body.
headers_detail = dict(Accept="application/json")

# Query-string parameters sent with every person-detail request.
params_details = dict(
    api_key=TMDB_API_KEY_PERSON,
    language="en-US",
    append_to_response="videos,images",
)


In [44]:
def get_person_data(person_id, timeout=10):
    """Fetch the TMDB detail record for a single person.

    Args:
        person_id: Identifier appended to ``url_person_detail``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block its worker thread
            indefinitely.

    Returns:
        A dict with two keys: ``"person_id"`` (the id that was requested) and
        ``"person_detail"`` (the decoded JSON payload, or ``None`` when the
        response carries no ``"id"`` field).

    Raises:
        requests.HTTPError: For any error status other than 404, so that
            transient failures (rate limiting, server errors, bad credentials)
            are reported to the caller instead of being stored as a missing
            person.
        requests.RequestException: On connection failures or timeouts.
    """
    person_resp = requests.get(
        f"{url_person_detail}/{person_id}",
        headers=headers_detail,
        params=params_details,
        timeout=timeout,
    )

    if person_resp.status_code == 404:
        # Presumably an id that no longer exists on TMDB; recorded as missing.
        payload = None
    else:
        person_resp.raise_for_status()
        # Decode the body once instead of re-parsing it for every access.
        payload = person_resp.json()

    if isinstance(payload, dict) and payload.get("id") is not None:
        person_detail = payload
    else:
        person_detail = None
        print("No person found")

    return {
        "person_id": person_id,
        "person_detail": person_detail,
    }

def count_lines(file_path):
    """Return the number of lines in the file at ``file_path``.

    The file is read in binary mode and streamed line by line, so no text
    decoding takes place and the whole file is never held in memory.
    """
    total = 0
    with open(file_path, mode="rb") as handle:
        for _line in handle:
            total += 1
    return total

In [45]:
# Crawl loop: read the id list in chunks, fetch each chunk's persons
# concurrently, and append the results to OUTPUT_PATH after every chunk so
# that an interruption loses at most the chunk in flight.

# Create the output directory up front; otherwise the first append would fail
# with FileNotFoundError after a chunk has already been fetched.
output_dir = os.path.dirname(OUTPUT_PATH)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

reader = pd.read_json(INPUT_PATH, lines=True, chunksize=CHUNK_SIZE)
total_lines = count_lines(INPUT_PATH)

idx_line = 0
# The context managers close the progress bar and shut the pool down even when
# the run is interrupted. The pool is created once rather than once per chunk.
with tqdm(total=total_lines, desc="Fetching person data", unit=" person") as pbar:
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        for chunk in reader:
            person_ids = chunk["id"].tolist()

            # Keep each id next to its future so failures can name the id.
            futures = [
                (person_id, executor.submit(get_person_data, person_id))
                for person_id in person_ids
            ]

            results = []
            for person_id, future in futures:
                try:
                    person_data = future.result()
                    if person_data:
                        results.append(person_data)
                except Exception as e:
                    # Best effort: report the failed id and keep crawling.
                    pbar.write(f"Error processing person {person_id}: {e}")
                finally:
                    pbar.set_postfix({"idx_in_file": idx_line}, refresh=False)
                    pbar.update(1)
                    idx_line += 1

            # Append per chunk so finished work is on disk before the next one.
            with open(OUTPUT_PATH, "a", encoding="utf-8") as f:
                f.writelines(
                    json.dumps(res, ensure_ascii=False) + "\n" for res in results
                )

print("Done!")

Sedding tv series data:   0%|          | 29/4448709 [00:21<905:13:53,  1.37 tv series/s, idx_in_file=28]
Sedding tv series data:   0%|          | 4019/4448709 [15:55<219:52:55,  5.61 tv series/s, idx_in_file=4018]

KeyboardInterrupt: 