## TMDb movie dataset builder

### By:
Author Name

### Date:
2024-MM-DD

### Description:

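Builds the raw movie dataset for the recommender from the TMDb API: it downloads recent movies from the discover endpoint, assigns each one a simulated system entry date, flags the ones that are currently popular, enriches every movie with its details and keywords, and saves the result to `data/01_raw/movies_dataset.parquet`.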

In [1]:
import os

# Project root used as the working directory, so the relative paths used later
# in the notebook (e.g. "data/01_raw/...") resolve correctly.
# Set the ML_MOVIE_RECOMMENDER_ROOT environment variable on machines where the
# repository lives elsewhere; the fallback is the original author's location,
# so existing runs behave exactly as before.
PROJECT_ROOT = os.environ.get(
    "ML_MOVIE_RECOMMENDER_ROOT",
    "/Users/agomezj/Desktop/Juan-G/ml-movie-recommender/",
)
os.chdir(PROJECT_ROOT)

In [2]:
pwd

'/Users/agomezj/Desktop/Juan-G/ml-movie-recommender'

## 1. Imports and authentication


In [3]:
import requests
import pandas as pd
import time
import numpy as np
from datetime import datetime, timedelta

# The TMDb API bearer token is read from the environment so that it never lives
# in the notebook. Before starting Jupyter:  export TMDB_BEARER_TOKEN="<token>"
# NOTE(review): a real token used to be hard-coded in this cell; treat it as
# leaked and rotate it in the TMDb account settings.
tmdb_token = os.environ.get("TMDB_BEARER_TOKEN", "").strip()
if not tmdb_token:
    print("Warning: TMDB_BEARER_TOKEN is not set; TMDb requests will be rejected.")

# Request headers shared by every TMDb call in this notebook.
headers = {
    "Authorization": f"Bearer {tmdb_token}",
    "accept": "application/json",
}

## 2. Get movies from a paginated endpoint

In [4]:
def get_movies(endpoint: str, pages: int, label: str = None) -> list:
    """
    Get movies from a paginated TMDb endpoint.

    Pages that do not answer with HTTP 200 are reported and skipped, so the
    result may contain fewer movies than ``pages`` full pages.

    Args:
        endpoint: API path (e.g., "/movie/popular")
        pages: number of pages to read (pages 1..pages are requested)
        label: optional label to tag the source; when truthy it is stored
            under the "source" key of every returned movie

    Returns:
        List of movie dicts taken from the "results" field of each page
    """
    url = f"https://api.themoviedb.org/3{endpoint}"
    movies = []
    for page in range(1, pages + 1):
        # Let requests build the query string: values are encoded properly and
        # an endpoint that already carries its own "?..." part keeps working.
        response = requests.get(
            url,
            headers=headers,
            params={"language": "es-ES", "page": page},
        )
        if response.status_code == 200:
            results = response.json().get("results", [])
            if label:
                for movie in results:
                    movie["source"] = label
            movies.extend(results)
        else:
            # Report the failure instead of dropping the page silently.
            print(f"Error on page {page}: {response.status_code}")
        # Small pause between requests to stay gentle with the API.
        time.sleep(0.2)
    return movies

## 3. Build base dataset with recent movies

In [5]:
def build_dataset_base(
    pages: int = 400, days_range: int = 60, seed: int = None
) -> pd.DataFrame:
    """
    Build a base dataset with recent and valid movies.
    Assign a system entry date to each movie.

    Movies are read from /discover/movie (newest release first, at least 10
    votes), tagged with source "exploratory" and de-duplicated by "id". Pages
    that do not answer with HTTP 200 are reported and skipped.

    Args:
        pages: number of pages to read
        days_range: number of days to spread entry dates; each movie gets
            today minus a random offset in [0, days_range)
        seed: optional seed for the random entry dates. When None (default)
            NumPy's global random state is used, as before.

    Returns:
        DataFrame with movies, "entry_date" and "was_ingested" (always False).
        If no movie could be fetched, an empty DataFrame with the columns
        "id", "source", "entry_date" and "was_ingested".

    Raises:
        ValueError: if days_range is not positive
    """
    # Fail before issuing any request rather than after downloading everything.
    if days_range <= 0:
        raise ValueError(f"days_range must be positive, got {days_range}")

    movies = []
    for page in range(1, pages + 1):
        url = (
            f"https://api.themoviedb.org/3/discover/movie"
            f"?sort_by=release_date.desc&vote_count.gte=10&page={page}"
        )
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            data = response.json().get("results", [])
            for movie in data:
                movie["source"] = "exploratory"
            movies.extend(data)
        else:
            print(f"Error on page {page}: {response.status_code}")
        # Same pacing as get_movies, to avoid hammering the API.
        time.sleep(0.2)

    if not movies:
        # Without rows there is no "id" column, so drop_duplicates would raise.
        return pd.DataFrame(columns=["id", "source", "entry_date", "was_ingested"])

    df = pd.DataFrame(movies).drop_duplicates(subset="id").reset_index(drop=True)

    # Assign a system entry date (spread across last N days)
    if seed is None:
        day_offsets = np.random.randint(0, days_range, size=len(df))
    else:
        day_offsets = np.random.default_rng(seed).integers(0, days_range, size=len(df))
    today = pd.Timestamp.today().normalize()
    df["entry_date"] = today - pd.to_timedelta(day_offsets, unit="D")
    df["was_ingested"] = False
    return df

## 4. Get popular movie IDs from TMDb

In [6]:
def get_popular_ids(pages: int = 10) -> set:
    """
    Collect the IDs of the movies listed by TMDb's /movie/popular endpoint.

    Args:
        pages: how many pages of the popular list to read

    Returns:
        Set with the "id" of every movie found on those pages
    """
    return {entry["id"] for entry in get_movies("/movie/popular", pages)}

## 5. Create the movie dataset

In [7]:
# Called with the defaults: 400 discover pages, entry dates spread over 60 days.
# This issues one HTTP request per page, so the cell is slow to run.
df_movies = build_dataset_base()

## 6. Get popular IDs and flag each movie

In [8]:
# IDs currently on TMDb's popular list (default: first 10 pages).
popular_ids = get_popular_ids()
# Boolean flag: True when the movie's id appears in the popular list.
df_movies["is_popular"] = df_movies["id"].isin(popular_ids)

## 7. Enrich with movie details

In [9]:
def enrich_movie_details(movie_id: int) -> dict:
    """
    Fetch /movie/{id} and keep a fixed selection of its fields.

    The returned dict always has the same keys: "id", the scalar fields
    (runtime, budget, revenue, status, original_language, tagline) and the
    name lists "genres" and "spoken_languages". When the request does not
    answer with HTTP 200, a message is printed and the scalar fields are None
    and the lists are empty.
    """
    scalar_fields = (
        "runtime",
        "budget",
        "revenue",
        "status",
        "original_language",
        "tagline",
    )
    name_list_fields = ("genres", "spoken_languages")

    response = requests.get(
        f"https://api.themoviedb.org/3/movie/{movie_id}?language=es-ES",
        headers=headers,
    )
    record = {"id": movie_id}

    # Failure path first: same keys, but empty placeholders.
    if response.status_code != 200:
        print(f"Error getting details for ID {movie_id}")
        for field in scalar_fields:
            record[field] = None
        for field in name_list_fields:
            record[field] = []
        return record

    payload = response.json()
    for field in scalar_fields:
        record[field] = payload.get(field)
    for field in name_list_fields:
        # Each entry is a dict; only its "name" is kept.
        record[field] = [item["name"] for item in payload.get(field, [])]
    return record

## 8. Apply detail enrichment to all movies

In [10]:
# One /movie/{id} request per movie; failed lookups still produce a row.
enriched_data = [enrich_movie_details(mid) for mid in df_movies["id"]]
df_enriched = pd.DataFrame(enriched_data)

# A column present in both frames would be duplicated by merge as "<name>_x" /
# "<name>_y" (this happens for "original_language" if the list results carry it,
# and for every detail column when this cell is re-run). Drop the older copy
# first so the detail values win and the cell stays idempotent.
overlapping_cols = [
    col for col in df_enriched.columns if col != "id" and col in df_movies.columns
]
df_movies = df_movies.drop(columns=overlapping_cols).merge(
    df_enriched, on="id", how="left"
)

## 9. Enrich with keywords for each movie

In [11]:
def get_keywords(movie_id: int) -> list:
    """
    Return the keyword names listed by /movie/{id}/keywords.

    When the request does not answer with HTTP 200, a message is printed and
    an empty list is returned.
    """
    response = requests.get(
        f"https://api.themoviedb.org/3/movie/{movie_id}/keywords",
        headers=headers,
    )

    # Guard clause: report the failure and hand back an empty list.
    if response.status_code != 200:
        print(f"Error getting keywords for ID {movie_id}")
        return []

    keyword_entries = response.json().get("keywords", [])
    names = []
    for entry in keyword_entries:
        names.append(entry["name"])
    return names

## 10. Apply keyword enrichment

In [12]:
# One /movie/{id}/keywords request per row; each cell holds a list of keyword
# names (an empty list when the lookup failed).
df_movies["keywords"] = df_movies["id"].apply(get_keywords)

In [17]:
# Paths are relative to the project root set at the top of the notebook.
# Create the target folder first so the write also works on a fresh checkout.
os.makedirs("data/01_raw", exist_ok=True)
df_movies.to_parquet("data/01_raw/movies_dataset.parquet", index=False)

# Prod: incremental ingestion (draft, the cells below are commented out)

## 11. Fetch and enrich new movies (dynamic ingestion)

In [None]:
# def fetch_new_movies(pages: int = 50, existing_ids: set = None) -> pd.DataFrame:
#     """
#     Get new movies not present in the current dataset.
#     Assign today's date as entry_date.
#     """
#     movies = []
#     for page in range(1, pages + 1):
#         url = (
#             f"https://api.themoviedb.org/3/discover/movie"
#             f"?sort_by=release_date.desc&vote_count.gte=10&page={page}"
#         )
#         r = requests.get(url, headers=headers)
#         if r.status_code == 200:
#             for movie in r.json().get("results", []):
#                 if existing_ids is None or movie["id"] not in existing_ids:
#                     movie["source"] = "exploratory"
#                     movie["entry_date"] = pd.Timestamp.today().normalize()
#                     movies.append(movie)
#         time.sleep(0.2)

#     return pd.DataFrame(movies).drop_duplicates(subset="id").reset_index(drop=True)

## 12. Enrich new movies with details and keywords

In [None]:
# def enrich_movies(df: pd.DataFrame) -> pd.DataFrame:
#     """
#     Add details and keywords to new movies.
#     """
#     details = [enrich_movie_details(mid) for mid in df["id"]]
#     df = df.merge(pd.DataFrame(details), on="id", how="left")
#     df["keywords"] = df["id"].apply(get_keywords)
#     return df

## 13. Simulate one ingestion run

In [None]:
# existing_ids = set(df_movies["id"])

# # Fetch and enrich only new movies
# df_new = fetch_new_movies(pages=50, existing_ids=existing_ids)

# if not df_new.empty:
#     df_new["is_popular"] = df_new["id"].isin(get_popular_ids())
#     df_new = enrich_movies(df_new)
#     df_movies = pd.concat([df_movies, df_new]).drop_duplicates(subset="id").reset_index(drop=True)