### Extract Local Film Data

In [None]:
from pathlib import Path
import csv
import sys

# --------------- Editable paths: adjust here --------------
# Folder whose immediate subfolders are inventoried (one subfolder per film).
root = Path(r"E:\VIDEOS\FILME")
# Destination of the generated report.
out_path = Path("filme_report.csv")
# ----------------------------------------------------------

# Lower-case file suffixes (leading dot included) that count as video files.
VIDEO_EXTS = {
    '.mp4',
    '.mkv',
    '.avi',
    '.mov',
    '.wmv',
    '.flv',
    '.webm',
    '.mpg',
    '.mpeg',
    '.m4v',
    '.3gp',
    '.ts',
    '.rmvb',
}


def human_readable_size(nbytes: int) -> str:
    """Format a byte count as a short human-readable string.

    Values below 1024 are shown as whole bytes (``"512 B"``); larger values
    are scaled by powers of 1024 and shown with one decimal
    (``"1.5 KB"``, ``"2.0 GB"``, ...). Anything of 1024 TB or more is
    expressed in PB.

    Args:
        nbytes: Size in bytes.

    Returns:
        The formatted size with its unit suffix.
    """
    if nbytes < 1024:
        return f"{nbytes} B"
    size = float(nbytes)
    for unit in ("KB", "MB", "GB", "TB"):
        size /= 1024.0
        if size < 1024.0:
            return f"{size:.1f} {unit}"
    # Here ``size`` is still expressed in TB; scale once more before
    # labelling it PB (otherwise 1 PB would be reported as "1024.0 PB").
    size /= 1024.0
    return f"{size:.1f} PB"


def get_folder_size(folder: Path) -> int:
    """Return the combined size in bytes of every file below ``folder``.

    The folder is walked recursively. Entries that are not regular files
    are ignored, and a file whose size cannot be read (``OSError``) simply
    contributes nothing to the total.
    """
    size_sum = 0
    for item in folder.rglob("*"):
        if not item.is_file():
            continue
        try:
            size_sum += item.stat().st_size
        except OSError:
            # Unreadable file: leave it out of the total.
            continue
    return size_sum


def examine_films(root: Path):
    """Build one report row per immediate subfolder of ``root``.

    For every subfolder (visited in case-insensitive name order) the row
    records: the files directly inside it, which of those are videos
    (suffix in ``VIDEO_EXTS``) with their sizes and formats, the size of a
    ``VIDEO_TS`` subfolder when one exists (the format is then reported as
    ``"DVD"``), and the names of its subfolders. Multi-valued cells are
    joined with ``"; "``.

    Args:
        root: Folder whose subfolders are inventoried.

    Returns:
        A tuple ``(rows, folders_with_subfolders)`` where ``rows`` is a list
        of dicts keyed by the report column names and
        ``folders_with_subfolders`` counts the inspected folders that
        contain at least one subfolder.

    Raises:
        FileNotFoundError: If ``root`` does not exist or is not a directory.
    """
    if not root.exists() or not root.is_dir():
        raise FileNotFoundError(f"La carpeta ra√≠z indicada no existe o no es un directorio: {root}")

    results = []
    folders_with_subfolders = 0

    for carpeta in sorted(root.iterdir(), key=lambda p: p.name.lower()):
        if not carpeta.is_dir():
            continue

        # List the folder once and split the entries into files and
        # subfolders, so both lists come from the same directory snapshot.
        archivos = []
        subcarpetas = []
        for child in carpeta.iterdir():
            if child.is_file():
                archivos.append(child)
            elif child.is_dir():
                subcarpetas.append(child)

        # Video files are recognised purely by their extension.
        video_files = [f for f in archivos if f.suffix.lower() in VIDEO_EXTS]

        # Per-video sizes (raw bytes and readable) and formats.
        sizes_bytes = []
        sizes_readable = []
        formatos = []
        for video in video_files:
            try:
                size = video.stat().st_size
            except OSError:
                # Size unavailable: report it as 0 rather than abort the row.
                size = 0
            sizes_bytes.append(str(size))
            sizes_readable.append(human_readable_size(size))
            formatos.append(video.suffix.lower().lstrip('.'))

        # A VIDEO_TS subfolder (any letter case) marks the folder as a DVD.
        videots_dir = next(
            (d for d in subcarpetas if d.name.lower() == "video_ts"), None
        )

        tam_videots_bytes = ""
        tam_videots_legible = ""
        if videots_dir is not None:
            formato_field = "DVD"
            size_videots = get_folder_size(videots_dir)
            tam_videots_bytes = str(size_videots)
            tam_videots_legible = human_readable_size(size_videots)
        else:
            # Distinct formats in alphabetical order ("" when there are none).
            formato_field = "; ".join(sorted(set(formatos)))

        if subcarpetas:
            folders_with_subfolders += 1

        results.append({
            "CARPETA": carpeta.name,
            "Numero de documentos": len(archivos),
            "Nombres de documentos": "; ".join(f.name for f in archivos),
            "N√∫mero de documentos de tipo video": len(video_files),
            "Nombre de documentos de tipo video": "; ".join(f.name for f in video_files),
            "Tama√±o (bytes)": "; ".join(sizes_bytes),
            "Tama√±o (legible)": "; ".join(sizes_readable),
            "Formato de video": formato_field,
            "Tama√±o VIDEO_TS (bytes)": tam_videots_bytes,
            "Tama√±o VIDEO_TS (legible)": tam_videots_legible,
            "Numero de subcarpetas": len(subcarpetas),
            "Nombre de subcarpetas": "; ".join(d.name for d in subcarpetas),
        })

    return results, folders_with_subfolders


def write_csv(rows, out_path: Path, delimiter="|"):
    """Write the report rows to ``out_path`` as UTF-8 delimited text.

    A header line with the fixed report columns is written first, followed
    by one line per dict in ``rows``; values are looked up by column name.

    Args:
        rows: Iterable of dicts keyed by the report column names.
        out_path: File to create or overwrite.
        delimiter: Field separator (``"|"`` by default).
    """
    columns = [
        "CARPETA",
        "Numero de documentos",
        "Nombres de documentos",
        "N√∫mero de documentos de tipo video",
        "Nombre de documentos de tipo video",
        "Tama√±o (bytes)",
        "Tama√±o (legible)",
        "Formato de video",
        "Tama√±o VIDEO_TS (bytes)",
        "Tama√±o VIDEO_TS (legible)",
        "Numero de subcarpetas",
        "Nombre de subcarpetas",
    ]
    # newline="" leaves line-ending handling to the csv module.
    with out_path.open("w", newline="", encoding="utf-8") as handle:
        report = csv.DictWriter(handle, fieldnames=columns, delimiter=delimiter)
        report.writeheader()
        for record in rows:
            report.writerow(record)


# Entry point of the first step: inventory the folders under `root`, write the
# report to `out_path`, and print where it was written plus how many folders
# contain subfolders.
if __name__ == "__main__":
    try:
        rows, folders_with_subfolders = examine_films(root)
        write_csv(rows, out_path)
        print(f"\n‚úÖ CSV generado en: {out_path.resolve()}")
        print(f"üìÅ Carpetas con subcarpetas: {folders_with_subfolders}")
    except Exception as e:
        # Any failure (missing root, unwritable output, ...) is reported as a
        # single message instead of a traceback.
        print("‚ùå Error:", e)

### Identify Local Films to Enrich Film Data

In [None]:
from pathlib import Path
import requests
import time
import csv
import re
from bs4 import BeautifulSoup
from urllib.parse import quote_plus

# ---------------- CONFIG ----------------
# Folder whose immediate subfolders are treated as one film each.
root = Path(r"E:\VIDEOS\FILME")
# Destination of the collected metadata.
out_csv = Path("filme_metadata.csv")
# Bare OMDb API key only. This value is sent as the `apikey` request parameter,
# so it must not be the full sample URL (".../?i=tt3896198&apikey=<key>").
# NOTE(review): a key committed in source is effectively public; consider
# loading it from the environment or a local, untracked file.
OMDB_API_KEY = "c53c64d6"
# Browser-like User-Agent string sent with every request.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/117.0 Safari/537.36"
)
# Pause in seconds applied after each HTTP request.
REQUEST_DELAY = 3.0
# ----------------------------------------

# Shared HTTP session: every request made through it carries the configured
# User-Agent header.
session = requests.Session()
session.headers["User-Agent"] = USER_AGENT


def safe_get(url, params=None, timeout=15):
    """Perform a rate-limited GET through the shared session.

    Args:
        url: Address to request.
        params: Optional query parameters passed to ``session.get``.
        timeout: Timeout in seconds passed to ``session.get``.

    Returns:
        The response when the request succeeded with a non-error HTTP
        status, otherwise ``None``.
    """
    try:
        response = session.get(url, params=params, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException:
        # Connection problems, timeouts, invalid URLs and HTTP error statuses
        # are all reported to the caller as "no response".
        return None
    finally:
        # Pause after every attempt, including failed ones, so a failing
        # site is not hit again immediately.
        time.sleep(REQUEST_DELAY)
    return response


def search_omdb(title, year=None):
    """Look a film up in OMDb and return its JSON record.

    An exact-title lookup (``t``, plus ``y`` when ``year`` is given) is
    tried first. If OMDb answers but reports no match, a title search
    (``s``) is run and the first hit is fetched by its IMDb id.

    Args:
        title: Film title to look for.
        year: Optional release year used to narrow the exact-title lookup.

    Returns:
        The OMDb record as a dict, or ``None`` when no API key is
        configured, a request fails, a response is not valid JSON, or
        nothing matches.
    """
    if not OMDB_API_KEY or OMDB_API_KEY == "TU_OMDB_API_KEY_AQUI":
        return None

    # NOTE(review): the key travels over plain HTTP here; switch to https
    # if the service accepts it — confirm against the OMDb documentation.
    params = {"apikey": OMDB_API_KEY, "t": title}
    if year:
        params["y"] = year
    data = _omdb_request(params)
    if data is None:
        return None
    if data.get("Response", "False") == "True":
        return data

    # Fallback: free-text search, then fetch the first hit by IMDb id.
    found = _omdb_request({"apikey": OMDB_API_KEY, "s": title})
    if found is None:
        return None
    hits = found.get("Search") if found.get("Response") == "True" else None
    if not hits:
        return None
    imdbid = hits[0].get("imdbID")
    if not imdbid:
        # Nothing to fetch by id; avoid a request that cannot succeed.
        return None
    detail = _omdb_request({"apikey": OMDB_API_KEY, "i": imdbid, "plot": "short"})
    if detail is not None and detail.get("Response") == "True":
        return detail
    return None


def _omdb_request(params):
    """Send one OMDb query; return the decoded JSON dict, or None on failure."""
    response = safe_get("http://www.omdbapi.com/", params=params)
    if not response:
        return None
    try:
        payload = response.json()
    except ValueError:
        # Body was not JSON (e.g. an HTML error page).
        return None
    return payload if isinstance(payload, dict) else None


def parse_awards_for_oscars(awards_text):
    """Extract Oscar nominations and wins from an awards summary string.

    The text is searched for ``"Nominated for <n> Oscar"`` and
    ``"Won <n> Oscar"``; because the patterns are unanchored prefixes they
    also cover the plural ``"Oscars"``.
    Example input: ``"Nominated for 1 Oscar. Another 3 wins & 5 nominations."``

    Args:
        awards_text: Awards summary, or an empty/``None`` value.

    Returns:
        A tuple ``(nominations, wins, edition)`` of strings. ``nominations``
        and ``wins`` hold the matched digits or ``""``. ``edition`` is the
        full input text whenever it mentions ``"Oscar"`` (there is no
        standard way to isolate the ceremony), otherwise ``""``.
    """
    if not awards_text:
        return "", "", ""

    nominated = re.search(r"Nominated for (\d+) Oscar", awards_text)
    won = re.search(r"Won (\d+) Oscar", awards_text)

    nominaciones = nominated.group(1) if nominated else ""
    ganados = won.group(1) if won else ""
    edicion = awards_text if "Oscar" in awards_text else ""
    return nominaciones, ganados, edicion


def search_filmaffinity(folder_name):
    """Scrape FilmAffinity for ``folder_name`` using the first search hit.

    The title search page is requested (the ``/en/`` address first, then
    the address without it), the first result link is followed, and the
    rating and synopsis are read from the film page.

    Returns:
        A tuple ``(fa_id, fa_rating, fa_synopsis)``. ``fa_id`` is the number
        from a ``film<digits>.html`` link, or the raw link when it has no
        such number. Parts that cannot be obtained are ``""``; any
        unexpected error yields three empty strings.

    NOTE: this depends on FilmAffinity's current HTML, which may change,
    and the site may block scraping.
    """
    nothing = ("", "", "")

    def first_match(page, selectors):
        # Try each CSS selector in turn and return the first element found.
        for css in selectors:
            element = page.select_one(css)
            if element is not None:
                return element
        return None

    try:
        encoded = quote_plus(folder_name)
        listing = safe_get(
            f"https://www.filmaffinity.com/en/search.php?stype=title&stext={encoded}"
        )
        if not listing:
            # Second attempt: same search without the /en/ prefix.
            listing = safe_get(
                f"https://www.filmaffinity.com/search.php?stype=title&stext={encoded}"
            )
            if not listing:
                return nothing

        results_page = BeautifulSoup(listing.text, "html.parser")
        # Specific result-card selectors first; the last one is a generic
        # fallback for any link whose address contains 'film'.
        hit = first_match(
            results_page,
            (
                "div.movie-card a[href]",
                "div.mc-title a[href]",
                "a.movie-link[href]",
                "a[href*='film']",
            ),
        )
        if hit is None:
            return nothing

        href = hit.get("href")
        # Links look like /en/film123456.html or /film123456.html.
        id_match = re.search(r"film(\d+)\.html", href)
        fa_id = href if id_match is None else id_match.group(1)

        if href.startswith("http"):
            film_url = href
        else:
            film_url = "https://www.filmaffinity.com" + href
        detail = safe_get(film_url)
        if not detail:
            return fa_id, "", ""

        film_page = BeautifulSoup(detail.text, "html.parser")

        rating_node = first_match(
            film_page,
            ('div.avg-rating', 'div.rating', 'span[itemprop="ratingValue"]'),
        )
        rating = "" if rating_node is None else rating_node.get_text(strip=True)

        synopsis_node = first_match(
            film_page,
            ("#synopsis", "div.synopsis", "div#movie-synopsis"),
        )
        sinopsis = "" if synopsis_node is None else synopsis_node.get_text(" ", strip=True)

        return fa_id, rating, sinopsis
    except Exception:
        return nothing


def search_rottentomatoes(folder_name):
    """Scrape Rotten Tomatoes for ``folder_name`` using the first search hit.

    The public search page (``https://www.rottentomatoes.com/search?search=...``)
    is requested, the first movie link is followed, and the Tomatometer
    percentage is read from the film page.

    Returns:
        A tuple ``(rt_id, tomatometer, "")`` of strings. ``rt_id`` is the
        link of the first hit and ``tomatometer`` the score digits without
        the ``%`` sign. The third element is always ``""``. Parts that
        cannot be obtained are ``""``; any unexpected error yields three
        empty strings.
    """
    try:
        q = quote_plus(folder_name)
        r = safe_get(f"https://www.rottentomatoes.com/search?search={q}")
        if not r:
            return "", "", ""
        soup = BeautifulSoup(r.text, "html.parser")
        # Results may be filled in by JavaScript; try the result-row element
        # first and then plain links to /m/<slug> or /movies/<slug>.
        link = (
            soup.select_one("search-page-media-row a")
            or soup.select_one("a[href^='/m/']")
            or soup.select_one("a[href^='/m/'], a[href^='/movies/']")
        )
        if not link:
            return "", "", ""
        href = link.get("href")
        rt_id = href
        film_url = href if href.startswith("http") else ("https://www.rottentomatoes.com" + href)
        r2 = safe_get(film_url)
        if not r2:
            return rt_id, "", ""
        s2 = BeautifulSoup(r2.text, "html.parser")

        # Looked up once; used both as a text source and for its attribute.
        score_board = s2.select_one("score-board")

        # First choice: a "<n>%" figure in the visible text of the score element.
        tm = ""
        score_el = s2.select_one(".mop-ratings-wrap__percentage") or score_board
        if score_el is not None:
            m = re.search(r"(\d+)%", score_el.get_text(" ", strip=True))
            if m:
                tm = m.group(1)

        # Second choice: the score carried as an attribute of <score-board>.
        # NOTE(review): the original comment mentioned a `data-meter`
        # attribute, but the code only ever read `tomatometerscore` — confirm
        # which attribute the page actually exposes.
        if not tm and score_board is not None:
            tm = score_board.get("tomatometerscore") or ""
        return rt_id, tm, ""
    except Exception:
        # Best-effort scraper: any parsing surprise means "nothing found".
        return "", "", ""


def search_popcornmeter(folder_name):
    """Best-effort lookup of ``folder_name`` on popcornmeter.com.

    The site's search page is requested, the first link containing
    ``/movie/`` is followed, and a score is read from that page.

    Returns:
        A tuple ``(link, score)``: the link of the first hit and the text of
        the score element. Parts that cannot be obtained are ``""``; any
        unexpected error yields two empty strings.
    """
    site = "https://www.popcornmeter.com"
    try:
        encoded = quote_plus(folder_name)
        listing = safe_get(f"{site}/search?search={encoded}")
        if not listing:
            return "", ""

        results_page = BeautifulSoup(listing.text, "html.parser")
        hit = results_page.select_one("a[href*='/movie/']")
        if not hit:
            return "", ""

        href = hit.get("href")
        if href.startswith("http"):
            movie_url = href
        else:
            movie_url = site + href
        detail = safe_get(movie_url)
        if not detail:
            return href, ""

        movie_page = BeautifulSoup(detail.text, "html.parser")
        # The score is expected in a .score element, or else .rating-value.
        score_node = movie_page.select_one(".score") or movie_page.select_one(".rating-value")
        if score_node:
            score = score_node.get_text(strip=True)
        else:
            score = ""
        return href, score
    except Exception:
        return "", ""


def collect_for_folder(folder: Path):
    """Gather the metadata row for one film folder.

    The folder's name is used as the search title. Sources are queried in
    this order: OMDb, FilmAffinity, Rotten Tomatoes, PopcornMeter. Every
    column starts as ``""`` and is only filled when a source returns a
    value for it.

    Args:
        folder: The film's folder; only its name is used.

    Returns:
        A dict with one entry per output column.
    """
    folder_name = folder.name

    columns = (
        "T√≠tulo original",
        "T√≠tulo en espa√±ol",
        "Direcci√≥n",
        "Gui√≥n",
        "Pa√≠s",
        "a√±o",
        "Duraci√≥n",
        "Lista de G√©neros",
        "Puntuaci√≥n filmaffinity",
        "Puntuaci√≥n IMDB",
        "Tomatometer",
        "Popcornmeter",
        "Lista de actores",
        "Nominaciones Oscar",
        "Oscar ganados",
        "Edici√≥n Oscar",
        "Sinopsis filmaffinity",
        "ID OMDb",
        "ID IMDB",
        "ID filmaffinity",
        "ID rottentomatoes",
        "Nombre de la carpeta",
    )
    row = dict.fromkeys(columns, "")
    row["Nombre de la carpeta"] = folder_name

    # 1) OMDb: copy the fields that map one-to-one onto output columns.
    omdb_fields = (
        ("Title", "T√≠tulo original"),
        ("Director", "Direcci√≥n"),
        ("Writer", "Gui√≥n"),
        ("Country", "Pa√≠s"),
        ("Year", "a√±o"),
        ("Runtime", "Duraci√≥n"),
        ("Genre", "Lista de G√©neros"),
        ("imdbRating", "Puntuaci√≥n IMDB"),
        ("Actors", "Lista de actores"),
        ("imdbID", "ID IMDB"),
    )
    record = search_omdb(folder_name)
    if record:
        for source_key, column in omdb_fields:
            row[column] = record.get(source_key, "")
        # The Spanish title is not taken from OMDb; the column stays empty.
        row["T√≠tulo en espa√±ol"] = ""
        # No separate OMDb identifier is read; the IMDb id is reused.
        row["ID OMDb"] = row["ID IMDB"]
        oscars = parse_awards_for_oscars(record.get("Awards", ""))
        row["Nominaciones Oscar"], row["Oscar ganados"], row["Edici√≥n Oscar"] = oscars

    # 2) FilmAffinity (scraped)
    fa_id, fa_rating, fa_synopsis = search_filmaffinity(folder_name)
    # 3) Rotten Tomatoes (third element of the result is not used)
    rt_id, tomatometer, _ = search_rottentomatoes(folder_name)
    # 4) PopcornMeter (only the score is kept)
    _pcm_link, pcm_score = search_popcornmeter(folder_name)

    # Only non-empty scraped values replace the "" defaults.
    scraped = (
        ("ID filmaffinity", fa_id),
        ("Puntuaci√≥n filmaffinity", fa_rating),
        ("Sinopsis filmaffinity", fa_synopsis),
        ("ID rottentomatoes", rt_id),
        ("Tomatometer", tomatometer),
        ("Popcornmeter", pcm_score),
    )
    for column, value in scraped:
        if value:
            row[column] = value

    return row


def main():
    """Collect metadata for every film folder under ``root`` into ``out_csv``.

    Subfolders of ``root`` are processed in case-insensitive name order.
    Each row is written to the ``|``-delimited CSV and flushed as soon as
    it is collected, so an interrupted run keeps the rows gathered so far.
    A folder whose collection raises is reported with a warning and
    skipped. If ``root`` is missing or not a directory, an error is printed
    and nothing is written.
    """
    if not root.exists() or not root.is_dir():
        print(f"[ERROR] La ruta root no existe: {root}")
        return

    headers = [
        "T√≠tulo original", "T√≠tulo en espa√±ol", "Direcci√≥n", "Gui√≥n", "Pa√≠s", "a√±o",
        "Duraci√≥n", "Lista de G√©neros", "Puntuaci√≥n filmaffinity", "Puntuaci√≥n IMDB",
        "Tomatometer", "Popcornmeter", "Lista de actores", "Nominaciones Oscar",
        "Oscar ganados", "Edici√≥n Oscar", "Sinopsis filmaffinity",
        "ID OMDb", "ID IMDB", "ID filmaffinity", "ID rottentomatoes", "Nombre de la carpeta"
    ]

    # List the folders before touching the output file, so a failure to read
    # `root` does not truncate an existing report.
    entries = sorted(root.iterdir(), key=lambda p: p.name.lower())

    with out_csv.open("w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=headers, delimiter="|")
        writer.writeheader()
        for entry in entries:
            if not entry.is_dir():
                continue
            try:
                print(f"Procesando: {entry.name}")
                row = collect_for_folder(entry)
            except Exception as e:
                print(f"[WARN] fallo procesando {entry.name}: {e}")
                continue
            # Each folder costs several rate-limited requests; persist its
            # row immediately rather than holding everything until the end.
            writer.writerow(row)
            f.flush()

    print(f"CSV generado: {out_csv.resolve()}")


# Start the metadata collection only when this code runs as the main module.
if __name__ == "__main__":
    main()
