Notes:

1. If you have not already done so, install selenium using `pip install selenium webdriver-manager`
2. If you have not already done so, install tqdm using `pip install tqdm`
3. Instead of storing bare strings and numbers, or generic objects, it is wiser to store typed objects; two good choices are:
    1. __[namedtuple](https://docs.python.org/3/library/collections.html#collections.namedtuple)__
    2. __[dataclass](https://docs.python.org/3/library/dataclasses.html)__

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from dataclasses import dataclass, field, asdict
from typing import Optional, List
import re
import time
from datetime import datetime

# Browser-like request headers; scrapeMovieCredits sends them with requests.get.
httpHeaders = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": "https://www.imdb.com/",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
}
# IMDb title search: feature films and TV movies released in 2024 whose
# country of origin is IE.
url = 'https://www.imdb.com/search/title/?title_type=tv_movie,feature&release_date=2024-01-01,2024-12-31&country_of_origin=IE'

@dataclass
class Director:
    """A director credit: the person's name and the URL of their IMDb page."""
    name: str
    url: str

@dataclass
class Thespian:
    """A cast credit: the person's name and the URL of their IMDb page."""
    name: str
    url: str

@dataclass
class MovieInfo:
    """One movie taken from an IMDb search-result list.

    ``title`` and ``url`` come from the list item; the rating fields are
    optional because an item may lack them. ``directors`` and ``thespians``
    start empty and are filled in later from the full-credits page.

    NOTE(review): tryParseMovieItem currently stores the raw span text
    (strings such as '6.5' or '\\xa0(2.2K)') in the three numeric fields;
    convert before doing arithmetic on them.
    """
    title: str
    url: str
    imdbRating: Optional[float] = None
    imdbVotes: Optional[int] = None
    metascore: Optional[int] = None
    # Forward references: these lists hold Director / Thespian instances
    # (not plain strings), as assigned by the credit-enrichment loop.
    directors: List["Director"] = field(default_factory=list)
    thespians: List["Thespian"] = field(default_factory=list)

# Generic logger
def logEvent(msg: str, level: str = "INFO", filePath: str = "scrapingLog.log") -> None:
    """Append ``msg`` to a log file, one timestamped entry per line of text.

    Args:
        msg: Text to log; surrounding whitespace is stripped and each
            remaining line becomes its own ``[timestamp] [level] line`` entry.
        level: Label written between brackets after the timestamp.
        filePath: File to append to (created if missing).

    A blank message writes nothing and does not touch the file.
    """
    lines = msg.strip().splitlines()
    if not lines:
        return
    timestamp = datetime.now().strftime("[%Y-%m-%d %H:%M:%S]")
    # Open once per call (not once per line) and pin UTF-8 so scraped names
    # and titles with non-ASCII characters cannot make the logger itself fail.
    with open(filePath, 'a', encoding='utf-8') as f:
        f.writelines(f"{timestamp} [{level}] {line}\n" for line in lines)

# Error logger shortcut
def logError(msg: str, filePath: str = "scrapingErrors.log") -> None:
    """Log ``msg`` through logEvent with level "ERROR", by default into the errors log file."""
    logEvent(msg, "ERROR", filePath)

# Universal try-catcher with controllable flow
def trierCatcher(keepGoing, traceMsg, task, *taskArgs, **taskKwargs):
    """Run ``task`` only while the pipeline is still healthy.

    Args:
        keepGoing: Flag from the previous step; when falsy the task is skipped.
        traceMsg: Text placed before the exception repr in the error log.
        task: Callable to invoke with ``taskArgs`` / ``taskKwargs``.

    Returns:
        ``(True, result)`` when the task ran without raising, otherwise
        ``(False, None)`` (skipped, or failed and logged via logError).
    """
    if keepGoing:
        try:
            outcome = task(*taskArgs, **taskKwargs)
        except Exception as err:
            logError(f"{traceMsg}\n{repr(err)}")
        else:
            return (True, outcome)
    return (False, None)

# Extract movie info from current loaded page
def tryParseMovieItem(item) -> Optional[MovieInfo]:
    """Build a MovieInfo from one search-result list item.

    Returns None when the item has no title heading or no link, or when
    anything raises while parsing (the error is logged).

    NOTE(review): rating, vote count and metascore are stored as the raw
    span text (or None when the span is absent), not converted to numbers.
    """
    def spanText(selector):
        # Text of the first match for `selector`, or None when nothing matches.
        node = item.select_one(selector)
        return node.text if node else None

    try:
        heading = item.select_one("div.dli-parent h3")
        if not heading:
            return None
        link = item.select_one("a")
        if not link:
            return None
        return MovieInfo(
            title=heading.text.strip(),
            # Drop the query string so only the canonical title path remains.
            url="https://www.imdb.com" + link['href'].split('?')[0],
            imdbRating=spanText("span.ipc-rating-star--rating"),
            imdbVotes=spanText("span.ipc-rating-star--voteCount"),
            metascore=spanText("span.metacritic-score-box"),
        )
    except Exception as e:
        logError(f"Error parsing a movie item: {repr(e)}")
        return None

def extractMoviesFromPage(pageSource: str) -> List[MovieInfo]:
    """Parse page HTML and return a MovieInfo for every result item that parses.

    Only direct ``li`` children of ``ul.ipc-metadata-list`` that carry
    exactly one attribute are considered; items tryParseMovieItem rejects
    are left out.
    """
    soup = BeautifulSoup(pageSource, 'html.parser')
    parsed = (
        tryParseMovieItem(li)
        for li in soup.select("ul.ipc-metadata-list > li")
        if len(li.attrs) == 1
    )
    return [movie for movie in parsed if movie]

def getBrowser(someURL):
    """Start a Chrome WebDriver, navigate to ``someURL`` and return the driver.

    The caller owns the returned driver and must call ``quit()`` on it.
    If the initial navigation fails, the browser is shut down here before
    the exception is re-raised, so no Chrome process is left behind.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(someURL)
    except Exception:
        driver.quit()
        raise
    return driver

def scrapeIMDbMoviesWithSlidingWindow(someURL: str) -> List[MovieInfo]:
    """Scrape every movie of an IMDb search listing by paging with the "more" button.

    Each pass parses the items currently in the DOM, removes already-seen
    list items from the page to keep it small, then scrolls to and clicks
    the "see more" button and waits for the next batch to appear. Every
    step goes through trierCatcher, so the first failing step (for example
    the button never becoming clickable) is logged and ends the loop.

    Args:
        someURL: Search-result URL to open in the browser.

    Returns:
        The MovieInfo objects collected up to the point where the loop ended.
    """
    movieList = []
    defaultBatchSize = 50
    pageBatchSize = 50
    sleepTimeSeconds = 0.5
    driverWaitTimeout = 10
    keepGoing = True
    nMoreButtonText = "ipc-see-more__button"
    buttonTextRetrievalJSCommand = "return arguments[0].innerText;"
    # NOTE(review): querySelectorAll("li") also matches nested <li> elements,
    # whereas extractMoviesFromPage only reads direct children ("ul > li");
    # confirm the pruning removes exactly the items that were just parsed.
    domPruningJSCommand = """
            const ul = document.querySelector("ul.ipc-metadata-list");
            const lis = ul.querySelectorAll("li");
            for (let i = 0; i < 50 && i < lis.length; i++) { lis[i].remove(); }
        """
    clicketyJSCommand = "arguments[0].click();"
    scrollJSCommand = "arguments[0].scrollIntoView({block: 'center'});"
    metadataList = "ipc-metadata-list-summary-item"
    pruningFailMsg = "JS movie LI cleanup failure"
    movieExtractionFailMsg = "Failed to extract movies from page"
    movieExtensionFailMsg = "Failed to append new movies"
    clickFailMsg = "Clickety failure"
    loadFailMsg = "New movie load wait failure"
    scrollFailMsg = "Scroll failure"
    batchSizeFailMsg = "Batch size update failure"
    sleepFailMsg = "Sleep failure"
    buttonFailMsg = "Button retrieval failure"
    buttonTextFailMsg = "Button text fetch failure"
    driver = getBrowser(someURL)

    # try/finally guarantees the browser is closed even if something escapes
    # the per-step error handling (e.g. KeyboardInterrupt).
    try:
        while keepGoing:
            # page_source is read inside the guarded call so a WebDriver
            # failure there is logged and stops the loop like any other step.
            keepGoing, newMovies = trierCatcher(keepGoing, movieExtractionFailMsg, lambda: extractMoviesFromPage(driver.page_source))
            keepGoing, _ = trierCatcher(keepGoing, movieExtensionFailMsg, movieList.extend, newMovies)
            keepGoing, _ = trierCatcher(keepGoing, pruningFailMsg, driver.execute_script, domPruningJSCommand)
            keepGoing, _ = trierCatcher(keepGoing, sleepFailMsg, time.sleep, sleepTimeSeconds)
            keepGoing, button = trierCatcher(keepGoing, buttonFailMsg, WebDriverWait(driver, driverWaitTimeout).until, EC.element_to_be_clickable((By.CLASS_NAME, nMoreButtonText)))
            keepGoing, buttonText = trierCatcher(keepGoing, buttonTextFailMsg, driver.execute_script, buttonTextRetrievalJSCommand, button)
            keepGoing, _ = trierCatcher(keepGoing, scrollFailMsg, driver.execute_script, scrollJSCommand, button)
            keepGoing, _ = trierCatcher(keepGoing, sleepFailMsg, time.sleep, sleepTimeSeconds)
            keepGoing, _ = trierCatcher(keepGoing, clickFailMsg, driver.execute_script, clicketyJSCommand, button)
            # The button label carries the size of the next batch; fall back
            # to the default when no number can be read from it.
            keepGoing, match = trierCatcher(keepGoing, batchSizeFailMsg, re.search, r"(\d+)", buttonText)
            pageBatchSize = int(match.group(1)) if keepGoing and match else defaultBatchSize
            keepGoing, _ = trierCatcher(keepGoing, loadFailMsg, WebDriverWait(driver, driverWaitTimeout).until, lambda d: len(d.find_elements(By.CLASS_NAME, metadataList)) >= pageBatchSize)
    finally:
        driver.quit()
    return movieList

# Run the listing scrape for the search URL defined above; `movies` is reused by the following cells.
movies = scrapeIMDbMoviesWithSlidingWindow(url)

In [None]:
def _appendCredits(soup, sectionTestId, creditType, out, liClass=None, limit=None):
    """Append creditType(name, url) to ``out`` for each named anchor in one full-credits sub-section."""
    section = soup.find("div", attrs={"data-testid": sectionTestId})
    if not section:
        return
    ul = section.find("ul")
    if not ul:
        return
    findArgs = {"recursive": False}
    if liClass is not None:
        findArgs["class_"] = liClass
    # A limit of None keeps every item; otherwise only the first `limit`.
    for li in ul.find_all("li", **findArgs)[:limit]:
        anchor = li.find("a", class_="name-credits--title-text-big")
        if anchor:
            out.append(creditType(
                name=anchor.text.strip(),
                # Drop the query string from the person link.
                url="https://www.imdb.com" + anchor["href"].split("?")[0],
            ))


def scrapeMovieCredits(movieURL: str, timeout: float = 10.0) -> tuple[List[Director], List[Thespian]]:
    """Fetch a movie's full-credits page and return its directors and leading cast.

    Args:
        movieURL: IMDb title URL; "fullcredits/" is appended directly, so it
            is expected to end with a slash.
        timeout: Seconds to wait for the HTTP response before giving up, so a
            stalled connection cannot hang the whole scrape.

    Returns:
        ``(directors, thespians)``. Cast is limited to the first 5 list
        items. Either list may be empty or partial: download and parsing
        failures are logged via logError rather than raised.
    """
    fullCreditsURL = movieURL + "fullcredits/"
    directors = []
    thespians = []
    try:
        response = requests.get(fullCreditsURL, headers=httpHeaders, timeout=timeout)
        response.raise_for_status()
    except Exception as e:
        logError(f"Failed to retrieve full credits page for {movieURL}\n{repr(e)}")
        return (directors, thespians)

    soup = BeautifulSoup(response.text, 'html.parser')

    # --- DIRECTORS ---
    try:
        _appendCredits(soup, "sub-section-director", Director, directors)
    except Exception as e:
        logError(f"Failed parsing directors for {movieURL}\n{repr(e)}")

    # --- CAST (limited to top 5) ---
    try:
        _appendCredits(soup, "sub-section-cast", Thespian, thespians,
                       liClass="full-credits-page-list-item", limit=5)
    except Exception as e:
        logError(f"Failed parsing cast for {movieURL}\n{repr(e)}")

    return (directors, thespians)

In [55]:
from tqdm import tqdm
import requests

# Attach the scraped directors and cast to every movie, with a progress bar.
for movie in tqdm(movies):
    movie.directors, movie.thespians = scrapeMovieCredits(movie.url)
    time.sleep(0.5)  # Respect IMDb: pause between consecutive requests

100%|██████████████████████████████| 98/98 [04:43<00:00,  2.89s/it]


In [56]:
# Number of movies collected by the listing scrape.
print(len(movies))

98


In [57]:
# Dump the dataclass reprs to inspect the scraped fields by eye.
print(movies)

[MovieInfo(title='1. Bring Them Down', url='https://www.imdb.com/title/tt14186876/', imdbRating='6.5', imdbVotes='\xa0(2.2K)', metascore='63', directors=[Director(name='Chris Andrews', url='https://www.imdb.com/name/nm1643216/'), Director(name='Chris Andrews', url='https://www.imdb.com/name/nm1643216/'), Director(name='Jonathan Hourigan', url='https://www.imdb.com/name/nm1509306/')], thespians=[Thespian(name='Chris Andrews', url='https://www.imdb.com/name/nm1643216/')]), MovieInfo(title='2. Oddity', url='https://www.imdb.com/title/tt26470109/', imdbRating='6.7', imdbVotes='\xa0(42K)', metascore='78', directors=[Director(name='Damian Mc Carthy', url='https://www.imdb.com/name/nm3374835/')], thespians=[Thespian(name='Damian Mc Carthy', url='https://www.imdb.com/name/nm3374835/')]), MovieInfo(title='3. Small Things Like These', url='https://www.imdb.com/title/tt27196021/', imdbRating='6.8', imdbVotes='\xa0(22K)', metascore='82', directors=[Director(name='Enda Walsh', url='https://www.imdb

In [58]:
import json, csv

# Generic entity saver/loader functions
def saveEntityListAsJSON(entities: List, filename: str):
    """Write dataclass instances to ``filename`` as a UTF-8 JSON array.

    Each entity is converted with ``asdict``; non-ASCII characters are
    written as-is and the output is indented by two spaces.
    """
    with open(filename, 'w', encoding='utf-8') as out:
        rows = list(map(asdict, entities))
        json.dump(rows, out, ensure_ascii=False, indent=2)

def saveEntityListAsCSV(entities: List, filename: str):
    """Write dataclass instances to ``filename`` as UTF-8 CSV with a header row.

    Column names come from the first entity's fields. Nothing is written
    (and no file is created) when ``entities`` is empty.
    """
    if entities:
        with open(filename, 'w', encoding='utf-8', newline='') as out:
            columns = asdict(entities[0]).keys()
            writer = csv.DictWriter(out, fieldnames=columns)
            writer.writeheader()
            writer.writerows(asdict(entity) for entity in entities)

In [59]:
# Persist the enriched movie list in both formats.
saveEntityListAsJSON(movies, 'movies.json')
saveEntityListAsCSV(movies, 'movies.csv')

In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from dataclasses import dataclass, field, asdict
from typing import Optional, List
import re
import time
from datetime import datetime
import requests
from tqdm import tqdm
import json
import csv

# Browser-like request headers; sent with the requests.get calls to IMDb
# (full-credits and awards pages).
httpHeaders = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": "https://www.imdb.com/",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
}
# Changed from ES (Spain) to US (United States)
# IMDb title search: feature films and TV movies released in 2024 with
# country of origin US, restricted by the moviemeter=1,1000 filter.
url = 'https://www.imdb.com/search/title/?title_type=tv_movie,feature&release_date=2024-01-01,2024-12-31&country_of_origin=US&moviemeter=1,1000'

@dataclass
class Director:
    """A director credit: the person's name and the URL of their IMDb page."""
    name: str
    url: str

@dataclass
class Thespian:
    """A cast credit: the person's name and the URL of their IMDb page."""
    name: str
    url: str

@dataclass
class MovieInfo:
    """One movie taken from an IMDb search-result list, plus enrichment fields.

    ``title`` and ``url`` come from the list item. The remaining fields are
    optional / start empty and are filled in by later steps (credits,
    box-office figures from The Numbers, Academy Award wins).

    NOTE(review): tryParseMovieItem stores the raw span text (strings) in
    imdbRating / imdbVotes / metascore; convert before doing arithmetic.
    """
    title: str
    url: str
    imdbRating: Optional[float] = None
    imdbVotes: Optional[int] = None
    metascore: Optional[int] = None
    # Money amounts are kept as the text scraped from the page.
    budget: Optional[str] = None
    domesticGross: Optional[str] = None
    worldwideGross: Optional[str] = None
    # Category names of the awards the movie won.
    academyAwards: List[str] = field(default_factory=list)
    directors: List[Director] = field(default_factory=list)
    thespians: List[Thespian] = field(default_factory=list)

# Generic logger
def logEvent(msg: str, level: str = "INFO", filePath: str = "scrapingLog.log") -> None:
    """Append ``msg`` to a log file, one timestamped entry per line of text.

    Args:
        msg: Text to log; surrounding whitespace is stripped and each
            remaining line becomes its own ``[timestamp] [level] line`` entry.
        level: Label written between brackets after the timestamp.
        filePath: File to append to (created if missing).

    A blank message writes nothing and does not touch the file.
    """
    lines = msg.strip().splitlines()
    if not lines:
        return
    timestamp = datetime.now().strftime("[%Y-%m-%d %H:%M:%S]")
    # Open once per call (not once per line) and pin UTF-8 so scraped names
    # and titles with non-ASCII characters cannot make the logger itself fail.
    with open(filePath, 'a', encoding='utf-8') as f:
        f.writelines(f"{timestamp} [{level}] {line}\n" for line in lines)

# Error logger shortcut
def logError(msg: str, filePath: str = "scrapingErrors.log") -> None:
    """Log ``msg`` through logEvent with level "ERROR", by default into the errors log file."""
    logEvent(msg, "ERROR", filePath)

# Universal try-catcher with controllable flow
def trierCatcher(keepGoing, traceMsg, task, *taskArgs, **taskKwargs):
    """Run ``task`` only while the pipeline is still healthy.

    Args:
        keepGoing: Flag from the previous step; when falsy the task is skipped.
        traceMsg: Text placed before the exception repr in the error log.
        task: Callable to invoke with ``taskArgs`` / ``taskKwargs``.

    Returns:
        ``(True, result)`` when the task ran without raising, otherwise
        ``(False, None)`` (skipped, or failed and logged via logError).
    """
    if keepGoing:
        try:
            outcome = task(*taskArgs, **taskKwargs)
        except Exception as err:
            logError(f"{traceMsg}\n{repr(err)}")
        else:
            return (True, outcome)
    return (False, None)

# Extract movie info from current loaded page
def tryParseMovieItem(item) -> Optional[MovieInfo]:
    """Build a MovieInfo from one search-result list item.

    Returns None when the item has no title heading or no link, or when
    anything raises while parsing (the error is logged).

    NOTE(review): rating, vote count and metascore are stored as the raw
    span text (or None when the span is absent), not converted to numbers.
    """
    def spanText(selector):
        # Text of the first match for `selector`, or None when nothing matches.
        node = item.select_one(selector)
        return node.text if node else None

    try:
        heading = item.select_one("div.dli-parent h3")
        if not heading:
            return None
        link = item.select_one("a")
        if not link:
            return None
        return MovieInfo(
            title=heading.text.strip(),
            # Drop the query string so only the canonical title path remains.
            url="https://www.imdb.com" + link['href'].split('?')[0],
            imdbRating=spanText("span.ipc-rating-star--rating"),
            imdbVotes=spanText("span.ipc-rating-star--voteCount"),
            metascore=spanText("span.metacritic-score-box"),
        )
    except Exception as e:
        logError(f"Error parsing a movie item: {repr(e)}")
        return None

def extractMoviesFromPage(pageSource: str) -> List[MovieInfo]:
    """Parse page HTML and return a MovieInfo for every result item that parses.

    Only direct ``li`` children of ``ul.ipc-metadata-list`` that carry
    exactly one attribute are considered; items tryParseMovieItem rejects
    are left out.
    """
    soup = BeautifulSoup(pageSource, 'html.parser')
    parsed = (
        tryParseMovieItem(li)
        for li in soup.select("ul.ipc-metadata-list > li")
        if len(li.attrs) == 1
    )
    return [movie for movie in parsed if movie]

def getBrowser(someURL):
    """Start Chrome with an en-US UI language, navigate to ``someURL`` and return the driver.

    The caller owns the returned driver and must call ``quit()`` on it.
    If the initial navigation fails, the browser is shut down here before
    the exception is re-raised, so no Chrome process is left behind.
    """
    options = Options()
    options.add_argument("--lang=en-US")
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(someURL)
    except Exception:
        driver.quit()
        raise
    return driver

def scrapeIMDbMoviesWithSlidingWindow(someURL: str) -> List[MovieInfo]:
    """Scrape every movie of an IMDb search listing by paging with the "more" button.

    Each pass parses the items currently in the DOM, removes already-seen
    list items from the page to keep it small, then scrolls to and clicks
    the "see more" button and waits for the next batch to appear. Every
    step goes through trierCatcher, so the first failing step (for example
    the button never becoming clickable) is logged and ends the loop.

    Args:
        someURL: Search-result URL to open in the browser.

    Returns:
        The MovieInfo objects collected up to the point where the loop ended.
    """
    movieList = []
    defaultBatchSize = 50
    pageBatchSize = 50
    sleepTimeSeconds = 0.5
    driverWaitTimeout = 10
    keepGoing = True
    nMoreButtonText = "ipc-see-more__button"
    buttonTextRetrievalJSCommand = "return arguments[0].innerText;"
    # NOTE(review): querySelectorAll("li") also matches nested <li> elements,
    # whereas extractMoviesFromPage only reads direct children ("ul > li");
    # confirm the pruning removes exactly the items that were just parsed.
    domPruningJSCommand = """
            const ul = document.querySelector("ul.ipc-metadata-list");
            const lis = ul.querySelectorAll("li");
            for (let i = 0; i < 50 && i < lis.length; i++) { lis[i].remove(); }
        """
    clicketyJSCommand = "arguments[0].click();"
    scrollJSCommand = "arguments[0].scrollIntoView({block: 'center'});"
    metadataList = "ipc-metadata-list-summary-item"
    pruningFailMsg = "JS movie LI cleanup failure"
    movieExtractionFailMsg = "Failed to extract movies from page"
    movieExtensionFailMsg = "Failed to append new movies"
    clickFailMsg = "Clickety failure"
    loadFailMsg = "New movie load wait failure"
    scrollFailMsg = "Scroll failure"
    batchSizeFailMsg = "Batch size update failure"
    sleepFailMsg = "Sleep failure"
    buttonFailMsg = "Button retrieval failure"
    buttonTextFailMsg = "Button text fetch failure"
    driver = getBrowser(someURL)

    # try/finally guarantees the browser is closed even if something escapes
    # the per-step error handling (e.g. KeyboardInterrupt).
    try:
        while keepGoing:
            # page_source is read inside the guarded call so a WebDriver
            # failure there is logged and stops the loop like any other step.
            keepGoing, newMovies = trierCatcher(keepGoing, movieExtractionFailMsg, lambda: extractMoviesFromPage(driver.page_source))
            keepGoing, _ = trierCatcher(keepGoing, movieExtensionFailMsg, movieList.extend, newMovies)
            keepGoing, _ = trierCatcher(keepGoing, pruningFailMsg, driver.execute_script, domPruningJSCommand)
            keepGoing, _ = trierCatcher(keepGoing, sleepFailMsg, time.sleep, sleepTimeSeconds)
            keepGoing, button = trierCatcher(keepGoing, buttonFailMsg, WebDriverWait(driver, driverWaitTimeout).until, EC.element_to_be_clickable((By.CLASS_NAME, nMoreButtonText)))
            keepGoing, buttonText = trierCatcher(keepGoing, buttonTextFailMsg, driver.execute_script, buttonTextRetrievalJSCommand, button)
            keepGoing, _ = trierCatcher(keepGoing, scrollFailMsg, driver.execute_script, scrollJSCommand, button)
            keepGoing, _ = trierCatcher(keepGoing, sleepFailMsg, time.sleep, sleepTimeSeconds)
            keepGoing, _ = trierCatcher(keepGoing, clickFailMsg, driver.execute_script, clicketyJSCommand, button)
            # The button label carries the size of the next batch; fall back
            # to the default when no number can be read from it.
            keepGoing, match = trierCatcher(keepGoing, batchSizeFailMsg, re.search, r"(\d+)", buttonText)
            pageBatchSize = int(match.group(1)) if keepGoing and match else defaultBatchSize
            keepGoing, _ = trierCatcher(keepGoing, loadFailMsg, WebDriverWait(driver, driverWaitTimeout).until, lambda d: len(d.find_elements(By.CLASS_NAME, metadataList)) >= pageBatchSize)
    finally:
        driver.quit()
    return movieList

def _appendCredits(soup, sectionTestId, creditType, out, liClass=None, limit=None):
    """Append creditType(name, url) to ``out`` for each named anchor in one full-credits sub-section."""
    section = soup.find("div", attrs={"data-testid": sectionTestId})
    if not section:
        return
    ul = section.find("ul")
    if not ul:
        return
    findArgs = {"recursive": False}
    if liClass is not None:
        findArgs["class_"] = liClass
    # A limit of None keeps every item; otherwise only the first `limit`.
    for li in ul.find_all("li", **findArgs)[:limit]:
        anchor = li.find("a", class_="name-credits--title-text-big")
        if anchor:
            out.append(creditType(
                name=anchor.text.strip(),
                # Drop the query string from the person link.
                url="https://www.imdb.com" + anchor["href"].split("?")[0],
            ))


def scrapeMovieCredits(movieURL: str, timeout: float = 10.0) -> tuple[List[Director], List[Thespian]]:
    """Fetch a movie's full-credits page and return its directors and leading cast.

    Args:
        movieURL: IMDb title URL; "fullcredits/" is appended directly, so it
            is expected to end with a slash.
        timeout: Seconds to wait for the HTTP response before giving up, so a
            stalled connection cannot hang the whole scrape.

    Returns:
        ``(directors, thespians)``. Cast is limited to the first 5 list
        items. Either list may be empty or partial: download and parsing
        failures are logged via logError rather than raised.
    """
    fullCreditsURL = movieURL + "fullcredits/"
    directors = []
    thespians = []
    try:
        response = requests.get(fullCreditsURL, headers=httpHeaders, timeout=timeout)
        response.raise_for_status()
    except Exception as e:
        logError(f"Failed to retrieve full credits page for {movieURL}\n{repr(e)}")
        return (directors, thespians)

    soup = BeautifulSoup(response.text, 'html.parser')

    # --- DIRECTORS ---
    try:
        _appendCredits(soup, "sub-section-director", Director, directors)
    except Exception as e:
        logError(f"Failed parsing directors for {movieURL}\n{repr(e)}")

    # --- CAST (limited to top 5) ---
    try:
        _appendCredits(soup, "sub-section-cast", Thespian, thespians,
                       liClass="full-credits-page-list-item", limit=5)
    except Exception as e:
        logError(f"Failed parsing cast for {movieURL}\n{repr(e)}")

    return (directors, thespians)

def cleanMovieTitle(title: str) -> str:
    """
    Strip list decorations from a scraped title.

    Two patterns are removed, in order: a leading ranking number such as
    "1. " or "25. ", and any four-digit year in parentheses such as
    "(2024)" together with the whitespace around it. The result is
    trimmed before being returned.
    """
    for pattern in (r'^\d+\.\s*', r'\s*\(\d{4}\)\s*'):
        title = re.sub(pattern, '', title)
    return title.strip()

import unicodedata

def remove_accents(text: str) -> str:
    """
    Return ``text`` without diacritics (e.g. à -> a, ñ -> n).

    The string is decomposed with NFD normalization and every character in
    Unicode category 'Mn' (nonspacing mark) is dropped.
    """
    decomposed = unicodedata.normalize('NFD', text)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

def formatMovieTitleForTheNumbers(title: str, year: str = "2024") -> str:
    """
    Build the URL slug used for a movie page on The Numbers.

    Steps: clean the scraped title (rank prefix / year), replace "&" with
    "and" and "#" with "Number", drop every character that is not a word
    character, whitespace, colon or hyphen, strip accents, then join the
    words with hyphens and append "-(year)".

    Only a *leading* article ("The", "A", "An") of the part before the first
    colon is moved, to the end of that part:
    "The Fall Guy" -> "Fall-Guy-The-(2024)". Articles elsewhere in the title
    stay where they are, and every colon is dropped from the slug.
    NOTE(review): the slug convention is inferred from The Numbers URLs, not
    from a published specification.

    Args:
        title: Movie title, possibly still carrying a rank prefix or year.
        year: Release year appended in parentheses.

    Returns:
        The hyphen-separated slug, e.g. "Fall-Guy-The-(2024)".
    """
    title = cleanMovieTitle(title)
    title = title.replace('&', 'and').replace('#', 'Number')
    title = re.sub(r'[^\w\s:-]', '', title)
    title = remove_accents(title)

    # Split at the first colon: main title vs. everything after it.
    main, _, subtitle = title.partition(':')
    words = main.split()
    if words and words[0].lower() in ('the', 'a', 'an'):
        # Move the leading article to the end of the main title.
        words.append(words.pop(0))
    # Any further colons in the subtitle only act as word separators.
    words += subtitle.replace(':', ' ').split()

    formatted_title = '-'.join(words) + f"-({year})"
    # Collapse runs of hyphens (e.g. from a title containing " - ").
    return re.sub(r'-+', '-', formatted_title)

from typing import Optional
import requests
from bs4 import BeautifulSoup
import re

def getBoxOfficeDataFromTheNumbers(movie_title: str, year: str = "2024", timeout: float = 10.0) -> tuple[Optional[str], Optional[str], Optional[str]]:
    """
    Extract budget and box office data from The Numbers.

    Args:
        movie_title: Title as scraped; it is converted to a The Numbers slug
            with formatMovieTitleForTheNumbers.
        year: Release year used in the slug.
        timeout: Seconds to wait for the HTTP response, so a stalled
            connection cannot hang the whole scrape.

    Returns:
        (budget, domestic_gross, worldwide_gross) as the text found on the
        page; each entry is None when it could not be found. Request and
        parsing errors are logged via logError, not raised.
    """
    budget = None
    domestic_gross = None
    worldwide_gross = None

    if not movie_title:
        return budget, domestic_gross, worldwide_gross

    formatted_title = formatMovieTitleForTheNumbers(movie_title, year)
    thenumbers_url = f"https://www.the-numbers.com/movie/{formatted_title}#tab=summary"

    try:
        logEvent(f"Accessing URL: {thenumbers_url}")
        response = requests.get(thenumbers_url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Locate the production budget inside the summary section.
        summary_div = soup.find("div", id="summary")
        if summary_div:
            budget_row = summary_div.find("b", string=re.compile(r"Production(\xa0| )Budget:"))
            if budget_row:
                td = budget_row.find_next("td")
                if td:
                    # Keep only the amount itself (e.g. "$6,000,000").
                    match = re.search(r"\$\d[\d,]*", td.text)
                    if match:
                        budget = match.group(0).strip()

        # Scan every table for the box-office rows; a later table that also
        # contains a matching row overrides an earlier one.
        tables = soup.find_all("table")
        for table in tables:
            # Domestic Box Office
            domestic_element = table.find(string=re.compile("Domestic Box Office", re.IGNORECASE))
            if domestic_element:
                row = domestic_element.find_parent("tr")
                if row:
                    data_cells = row.find_all("td", class_="data")
                    if data_cells:
                        domestic_gross = data_cells[0].text.strip()

            # Worldwide Box Office
            worldwide_element = table.find(string=re.compile("Worldwide Box Office", re.IGNORECASE))
            if worldwide_element:
                row = worldwide_element.find_parent("tr")
                if row:
                    data_cells = row.find_all("td", class_="data")
                    if data_cells:
                        worldwide_gross = data_cells[0].text.strip()

        logEvent(f"Retrieved data for {movie_title}: Budget={budget}, Domestic={domestic_gross}, Worldwide={worldwide_gross}")

    except Exception as e:
        logError(f"Failed to retrieve box office data from The Numbers for {movie_title}\n{repr(e)}")

    return budget, domestic_gross, worldwide_gross

def scrapeAcademyAwards(movieURL: str, timeout: float = 10.0) -> List[str]:
    """
    Collect the Academy Award categories a movie won, from its IMDb awards page.

    The Academy Awards section is located by its ``data-testid`` (containing
    "sub-section-ev0000003") or, failing that, by a span whose text contains
    "Academy Awards, USA". Only entries whose text contains "Winner" (or
    "Ganador") are kept.

    Args:
        movieURL: IMDb title URL; "awards/" is appended directly, so it is
            expected to end with a slash.
        timeout: Seconds to wait for the HTTP response, so a stalled
            connection cannot hang the whole scrape.

    Returns:
        The category names found (possibly empty). Download and parsing
        errors are logged via logError, not raised. When nothing is found,
        the page HTML is dumped to "imdb_awards_debug.html" for inspection.
    """
    awardsURL = movieURL + "awards/"
    awards = []

    try:
        response = requests.get(awardsURL, headers=httpHeaders, timeout=timeout)
        response.raise_for_status()
    except Exception as e:
        logError(f"Failed to retrieve awards page for {movieURL}\n{repr(e)}")
        return awards

    soup = BeautifulSoup(response.text, 'html.parser')

    try:
        # Look for the section whose data-testid contains "sub-section-ev0000003";
        # "ev0000003" appears to be IMDb's event code for the Academy Awards.
        academy_section = soup.find("div", attrs={"data-testid": lambda v: v and "sub-section-ev0000003" in v})

        if not academy_section:
            # Fallback: locate the section through its heading text.
            academy_spans = soup.find_all("span", string=lambda s: s and "Academy Awards, USA" in s)
            for span in academy_spans:
                section = span.find_parent("div", attrs={"data-testid": lambda v: v and "sub-section" in v})
                if section:
                    academy_section = section
                    break

        if academy_section:
            # Every entry of the awards list.
            award_items = academy_section.find_all("li", class_="ipc-metadata-list-summary-item")

            if not award_items:
                # Fallback: a more general selector for the same entries.
                award_items = academy_section.find_all("div", class_=lambda c: c and "ipc-metadata-list-summary-item" in c)

            for item in award_items:
                # Keep wins only (the page may be served in Spanish or English).
                if "Ganador" not in item.text and "Winner" not in item.text:
                    continue

                # Prefer the span carrying the dedicated category class.
                category_span = item.find("span", class_=lambda c: c and "awardCategoryName" in c)

                if category_span:
                    category = category_span.text.strip()
                    awards.append(category)
                else:
                    # Fallback: first inline-list item whose span text mentions "Best".
                    category_li = item.find("li", class_="ipc-inline-list__item")
                    if category_li:
                        category_span = category_li.find("span")
                        if category_span and "Best" in category_span.text:
                            category = category_span.text.strip()
                            awards.append(category)

        # Record the outcome for debugging.
        if awards:
            logEvent(f"Found {len(awards)} awards: {awards}")
        else:
            logEvent("No awards found using ultra specific method")

    except Exception as e:
        logError(f"Error in ultra specific scraper: {repr(e)}")

    # Nothing found: save the HTML for later analysis (best effort).
    if not awards:
        try:
            with open("imdb_awards_debug.html", "w", encoding="utf-8") as f:
                f.write(soup.prettify())
            logEvent("Saved HTML to imdb_awards_debug.html for debugging")
        except Exception as e:
            # Still best-effort, but the failure is recorded instead of being
            # silently discarded, and KeyboardInterrupt is no longer swallowed.
            logError(f"Failed to save awards debug HTML for {movieURL}\n{repr(e)}")

    return awards

# Main execution
def main():
    """Scrape the movie list, enrich each movie, and save the result as JSON and CSV.

    For every movie this fetches credits from IMDb, budget and grosses from
    The Numbers, and Academy Award wins from IMDb, pausing one second
    between movies.
    """
    # Step 1: listing scrape.
    print("Scraping movies from IMDb...")
    movies = scrapeIMDbMoviesWithSlidingWindow(url)

    # Step 2: per-movie enrichment.
    print("Enriching with credits and financial data, and academy awards...")
    for movie in tqdm(movies):
        movie.directors, movie.thespians = scrapeMovieCredits(movie.url)

        # Use a "(YYYY)" year embedded in the title when present, else 2024.
        yearMatch = re.search(r"\((\d{4})\)", movie.title)
        if yearMatch:
            releaseYear = yearMatch.group(1)
        else:
            releaseYear = "2024"

        movie.budget, movie.domesticGross, movie.worldwideGross = getBoxOfficeDataFromTheNumbers(movie.title, releaseYear)

        movie.academyAwards = scrapeAcademyAwards(movie.url)
        # Respect the website's rate limits
        time.sleep(1)

    # Step 3: persist.
    saveEntityListAsJSON(movies, 'movies.json')
    saveEntityListAsCSV(movies, 'movies.csv')
    print(f"Done! Scraped {len(movies)} movies with their financial data.")

# Generic entity saver/loader functions
def saveEntityListAsJSON(entities: List, filename: str):
    """Write dataclass instances to ``filename`` as a UTF-8 JSON array.

    Each entity is converted with ``asdict``; non-ASCII characters are
    written as-is and the output is indented by two spaces.
    """
    with open(filename, 'w', encoding='utf-8') as out:
        rows = list(map(asdict, entities))
        json.dump(rows, out, ensure_ascii=False, indent=2)

def saveEntityListAsCSV(entities: List, filename: str):
    """Write dataclass instances to ``filename`` as UTF-8 CSV with a header row.

    Column names come from the first entity's fields. Nothing is written
    (and no file is created) when ``entities`` is empty.
    """
    if entities:
        with open(filename, 'w', encoding='utf-8', newline='') as out:
            columns = asdict(entities[0]).keys()
            writer = csv.DictWriter(out, fieldnames=columns)
            writer.writeheader()
            writer.writerows(asdict(entity) for entity in entities)

# Run the full pipeline only when executed as a script / notebook cell, not on import.
if __name__ == "__main__":
    main()

Scraping movies from IMDb...
Enriching with credits and financial data, and academy awards...


100%|██████████| 60/60 [17:54<00:00, 17.90s/it]   

Done! Scraped 60 movies with their financial data.





In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from dataclasses import dataclass, field, asdict
from typing import Optional, List, Dict, Tuple, Literal
import re
import time
from datetime import datetime
import requests
from tqdm import tqdm
import json
import csv
import unicodedata

# Per-country constants: ISO-style code, display name, the award tracked for
# that country with its IMDb event id, and the IMDb search URL to scrape.
COUNTRIES = {
    "US": {
        "code": "US",
        "name": "United States",
        "award_name": "Academy Awards",
        "award_id": "ev0000003",
        "url": "https://www.imdb.com/search/title/?title_type=tv_movie,feature&release_date=2023-01-01,2024-12-31&country_of_origin=US&moviemeter=1,20000"
    },
    "GB": {
        "code": "GB",
        "name": "United Kingdom",
        "award_name": "BAFTA Film Awards",
        "award_id": "ev0000123",
        "url": "https://www.imdb.com/search/title/?title_type=tv_movie,feature&release_date=2023-01-01,2024-12-31&country_of_origin=GB&moviemeter=1,20000"
    },
    "ES": {
        "code": "ES",
        "name": "Spain",
        "award_name": "Goya Awards",
        "award_id": "ev0000211",
        "url": "https://www.imdb.com/es-es/search/title/?title_type=tv_movie,feature&release_date=2023-01-01,2024-12-31&country_of_origin=ES&moviemeter=1,20000"
    }
}

# Browser-like HTTP request headers (presumably sent with the requests.get
# calls to IMDb, as in the earlier cells — confirm against the callers).
httpHeaders = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": "https://www.imdb.com/",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
}

@dataclass
class Director:
    """A director credit: the person's name and a URL for that person."""
    name: str
    url: str

@dataclass
class Thespian:
    """A cast credit: the person's name and a URL for that person."""
    name: str
    url: str

@dataclass
class MovieInfo:
    """One scraped movie with its ratings, financial figures, awards and credits.

    Only ``title`` and ``url`` are required; every other field has a default
    (empty string, None, or an empty list) so it can be filled in later.
    """
    title: str
    url: str
    country: str = ""  # Country field (added)
    imdbRating: Optional[float] = None
    imdbVotes: Optional[int] = None
    metascore: Optional[int] = None
    budget: Optional[str] = None
    domesticGross: Optional[str] = None
    worldwideGross: Optional[str] = None
    awards: List[str] = field(default_factory=list)  # Renamed to be generic (not only Academy Awards)
    directors: List[Director] = field(default_factory=list)
    thespians: List[Thespian] = field(default_factory=list)

# Generic logger
def logEvent(msg: str, level: str = "INFO", filePath: str = "scrapingLog.log") -> None:
    """Append ``msg`` to a log file, one timestamped entry per line of text.

    Args:
        msg: Message to record. Surrounding whitespace is stripped and each
            remaining line is written as its own entry.
        level: Label written in square brackets after the timestamp.
        filePath: Log file to append to.

    The file is opened once per call and written as UTF-8 so that titles with
    non-ASCII characters cannot fail on platforms whose default encoding is
    narrower. A blank message writes nothing and does not create the file.
    """
    lines = msg.strip().splitlines()
    if not lines:
        return
    # One timestamp per call: all lines of a message share it.
    timestamp = datetime.now().strftime("[%Y-%m-%d %H:%M:%S]")
    with open(filePath, 'a', encoding='utf-8') as f:
        f.writelines(f"{timestamp} [{level}] {line}\n" for line in lines)

# Error logger shortcut
def logError(msg: str, filePath: str = "scrapingErrors.log") -> None:
    """Record ``msg`` through logEvent with the level fixed to "ERROR".

    Args:
        msg: Message to record.
        filePath: Destination log file; defaults to the dedicated error log.
    """
    logEvent(msg, "ERROR", filePath)

# Universal try-catcher with controllable flow
def trierCatcher(keepGoing, traceMsg, task, *taskArgs, **taskKwargs):
    """Run ``task`` only while the pipeline is still healthy.

    Args:
        keepGoing: When falsy, ``task`` is not called at all.
        traceMsg: Text logged (followed by the exception repr) if ``task`` raises.
        task: Callable to invoke with ``*taskArgs`` and ``**taskKwargs``.

    Returns:
        ``(True, result)`` when ``task`` ran without raising, otherwise
        ``(False, None)`` -- either because it was skipped or because it
        raised an ``Exception`` (which is logged via logError, not re-raised).
    """
    outcome = (False, None)
    if keepGoing:
        try:
            outcome = (True, task(*taskArgs, **taskKwargs))
        except Exception as err:
            logError(f"{traceMsg}\n{repr(err)}")
    return outcome

# Extract movie info from current loaded page
def tryParseMovieItem(item, country_code: str) -> Optional[MovieInfo]:
    """Build a MovieInfo from one result-list element, or return None.

    Args:
        item: Parsed list element (a BeautifulSoup tag) for a single movie.
        country_code: Stored unchanged in the ``country`` field.

    Returns:
        A MovieInfo carrying title, absolute IMDb URL (query string dropped)
        and the raw text of the rating, vote-count and metascore elements
        (None for each one that is absent). None is returned when the title
        heading or the first link is missing, or when parsing raises; in the
        latter case the error is logged.
    """
    scoreSelectors = (
        ("imdbRating", "span.ipc-rating-star--rating"),
        ("imdbVotes", "span.ipc-rating-star--voteCount"),
        ("metascore", "span.metacritic-score-box"),
    )
    try:
        heading = item.select_one("div.dli-parent h3")
        if not heading:
            return None
        movieTitle = heading.text.strip()

        link = item.select_one("a")
        if not link:
            return None
        # Keep only the path part of the href; tracking parameters are dropped.
        movieUrl = "https://www.imdb.com" + link['href'].split('?')[0]

        scores = {}
        for fieldName, selector in scoreSelectors:
            node = item.select_one(selector)
            scores[fieldName] = node.text if node else None

        return MovieInfo(title=movieTitle, url=movieUrl, country=country_code, **scores)
    except Exception as e:
        logError(f"Error parsing a movie item: {repr(e)}")
        return None

def extractMoviesFromPage(pageSource: str, country_code: str) -> List[MovieInfo]:
    """Parse every movie currently present in the result list of a page.

    Args:
        pageSource: HTML of the loaded search-results page.
        country_code: Passed through to each parsed MovieInfo.

    Returns:
        The successfully parsed movies, in document order. Only direct ``<li>``
        children of ``ul.ipc-metadata-list`` that carry exactly one attribute
        are considered; items that fail to parse are dropped.
    """
    soup = BeautifulSoup(pageSource, 'html.parser')
    parsed = (
        tryParseMovieItem(li, country_code)
        for li in soup.select("ul.ipc-metadata-list > li")
        if len(li.attrs) == 1
    )
    return [movie for movie in parsed if movie]

def getBrowser(someURL):
    """Start a Chrome WebDriver session and navigate it to ``someURL``.

    Args:
        someURL: Address to load in the new browser.

    Returns:
        The WebDriver instance; the caller is responsible for calling ``quit()``.

    Raises:
        Whatever the driver raises on start-up or navigation. If navigation
        fails, the freshly started browser is shut down before re-raising so
        that no orphaned Chrome process is left behind.
    """
    options = Options()
    # Request the English UI language from the browser.
    options.add_argument("--lang=en-US")
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(someURL)
    except Exception:
        driver.quit()
        raise
    return driver

def scrapeIMDbMoviesWithSlidingWindow(country_info: Dict) -> List[MovieInfo]:
    """Collect all movies of one country's IMDb search listing.

    The listing is paged by clicking the "see more" button. After each batch
    is parsed, the first 50 ``<li>`` elements are removed from the DOM so the
    next parse mostly sees new items (the "sliding window").

    Args:
        country_info: Mapping with at least ``"url"`` (listing to open) and
            ``"code"`` (stored on every movie).

    Returns:
        The movies gathered until the button is no longer available or a step
        fails. Step failures are logged through trierCatcher and end the loop;
        they do not raise.

    The browser is always closed, even when an unexpected exception or a
    keyboard interrupt escapes the loop.
    """
    # Local import: only this function needs the exception type.
    from selenium.common.exceptions import TimeoutException

    someURL = country_info["url"]
    country_code = country_info["code"]
    movieList = []
    defaultBatchSize = 50
    pageBatchSize = 50
    sleepTimeSeconds = 0.5
    driverWaitTimeout = 10
    keepGoing = True
    nMoreButtonText = "ipc-see-more__button"
    buttonTextRetrievalJSCommand = "return arguments[0].innerText;"
    # NOTE(review): querySelectorAll("li") also matches <li> elements nested
    # inside a movie item; if the markup has any, fewer than 50 movies are
    # pruned per round and duplicates could be parsed -- confirm against the page.
    domPruningJSCommand = """
            const ul = document.querySelector("ul.ipc-metadata-list");
            const lis = ul.querySelectorAll("li");
            for (let i = 0; i < 50 && i < lis.length; i++) { lis[i].remove(); }
        """
    clicketyJSCommand = "arguments[0].click();"
    scrollJSCommand = "arguments[0].scrollIntoView({block: 'center'});"
    metadataList = "ipc-metadata-list-summary-item"
    pruningFailMsg = "JS movie LI cleanup failure"
    movieExtractionFailMsg = "Failed to extract movies from page"
    movieExtensionFailMsg = "Failed to append new movies"
    clickFailMsg = "Clickety failure"
    loadFailMsg = "New movie load wait failure"
    scrollFailMsg = "Scroll failure"
    batchSizeFailMsg = "Batch size update failure"
    sleepFailMsg = "Sleep failure"
    buttonFailMsg = "Button retrieval failure"
    buttonTextFailMsg = "Button text fetch failure"
    driver = getBrowser(someURL)

    try:
        while keepGoing:
            # page_source is read inside the guarded call so a driver error is
            # logged like any other step failure instead of escaping the loop.
            keepGoing, newMovies = trierCatcher(
                keepGoing, movieExtractionFailMsg,
                lambda: extractMoviesFromPage(driver.page_source, country_code))
            keepGoing, _ = trierCatcher(keepGoing, movieExtensionFailMsg, movieList.extend, newMovies)
            keepGoing, _ = trierCatcher(keepGoing, pruningFailMsg, driver.execute_script, domPruningJSCommand)
            keepGoing, _ = trierCatcher(keepGoing, sleepFailMsg, time.sleep, sleepTimeSeconds)
            if not keepGoing:
                # A step already failed (and was logged); skip the button wait.
                break

            # Look for the "see more" button; its absence means the end of the list.
            try:
                button = WebDriverWait(driver, driverWaitTimeout).until(
                    EC.element_to_be_clickable((By.CLASS_NAME, nMoreButtonText))
                )
            except TimeoutException:
                logEvent(f"No more movies to load for {country_code}, ending scraping")
                break
            except Exception as e:
                # Anything other than a timeout is a real failure, not "no more pages".
                logError(f"{buttonFailMsg}\n{repr(e)}")
                break

            keepGoing, buttonText = trierCatcher(keepGoing, buttonTextFailMsg, driver.execute_script, buttonTextRetrievalJSCommand, button)
            keepGoing, _ = trierCatcher(keepGoing, scrollFailMsg, driver.execute_script, scrollJSCommand, button)
            keepGoing, _ = trierCatcher(keepGoing, sleepFailMsg, time.sleep, sleepTimeSeconds)
            keepGoing, _ = trierCatcher(keepGoing, clickFailMsg, driver.execute_script, clicketyJSCommand, button)
            # The button label carries the size of the next batch (first number in it).
            keepGoing, match = trierCatcher(keepGoing, batchSizeFailMsg, re.search, r"(\d+)", buttonText)
            pageBatchSize = int(match.group(1)) if keepGoing and match else defaultBatchSize
            # Wait until at least one batch worth of items is present again.
            keepGoing, _ = trierCatcher(
                keepGoing, loadFailMsg,
                WebDriverWait(driver, driverWaitTimeout).until,
                lambda d: len(d.find_elements(By.CLASS_NAME, metadataList)) >= pageBatchSize)
    finally:
        driver.quit()
    return movieList

def scrapeMovieCredits(movieURL: str, timeout: float = 15.0) -> tuple[List[Director], List[Thespian]]:
    """Fetch a movie's full-credits page and extract directors and top cast.

    Args:
        movieURL: Absolute IMDb title URL, with or without a trailing slash.
        timeout: Seconds to wait for the HTTP response. Without a timeout a
            stalled connection would block the whole scraping run.

    Returns:
        ``(directors, thespians)``. All directors listed in the director
        sub-section are returned; the cast is limited to the first five list
        items. Both lists are empty when the page cannot be retrieved; parse
        errors are logged and leave whatever was collected so far.
    """
    fullCreditsURL = movieURL.rstrip("/") + "/fullcredits/"
    directors = []
    thespians = []
    try:
        response = requests.get(fullCreditsURL, headers=httpHeaders, timeout=timeout)
        response.raise_for_status()
    except Exception as e:
        logError(f"Failed to retrieve full credits page for {movieURL}\n{repr(e)}")
        return (directors, thespians)

    soup = BeautifulSoup(response.text, 'html.parser')

    # --- DIRECTORS ---
    try:
        director_section = soup.find("div", attrs={"data-testid": "sub-section-director"})
        if director_section:
            ul = director_section.find("ul")
            if ul:
                for li in ul.find_all("li", recursive=False):
                    anchor = li.find("a", class_="name-credits--title-text-big")
                    if anchor:
                        name = anchor.text.strip()
                        # Drop the query string so the profile URL is canonical.
                        url = "https://www.imdb.com" + anchor["href"].split("?")[0]
                        directors.append(Director(name=name, url=url))
    except Exception as e:
        logError(f"Failed parsing directors for {movieURL}\n{repr(e)}")

    # --- CAST (limited to top 5) ---
    try:
        cast_section = soup.find("div", attrs={"data-testid": "sub-section-cast"})
        if cast_section:
            ul = cast_section.find("ul")
            if ul:
                cast_lis = ul.find_all("li", class_="full-credits-page-list-item", recursive=False)[:5]
                for li in cast_lis:
                    anchor = li.find("a", class_="name-credits--title-text-big")
                    if anchor:
                        name = anchor.text.strip()
                        url = "https://www.imdb.com" + anchor["href"].split("?")[0]
                        thespians.append(Thespian(name=name, url=url))
    except Exception as e:
        logError(f"Failed parsing cast for {movieURL}\n{repr(e)}")

    return (directors, thespians)

def cleanMovieTitle(title: str) -> str:
    """Strip list decoration from a scraped title.

    Two things are removed, in this order:
    1. a leading ranking prefix such as "1. " or "25. ";
    2. every four-digit year in parentheses, e.g. "(2024)", together with the
       whitespace around it.

    The result is returned with surrounding whitespace trimmed.
    """
    decorationPatterns = (
        r'^\d+\.\s*',          # ranking number prefix
        r'\s*\(\d{4}\)\s*',    # year in parentheses
    )
    for pattern in decorationPatterns:
        title = re.sub(pattern, '', title)
    return title.strip()

def remove_accents(text: str) -> str:
    """Return ``text`` without diacritics (e.g. "à" becomes "a", "ñ" becomes "n").

    The text is decomposed with NFD and every character whose Unicode category
    is "Mn" (non-spacing mark) is dropped; all other characters are kept.
    """
    decomposed = unicodedata.normalize('NFD', text)
    keep = filter(lambda ch: unicodedata.category(ch) != 'Mn', decomposed)
    return ''.join(keep)

def formatMovieTitleForTheNumbers(title: str, year: str = "2024") -> str:
    """Build the URL slug used to look a movie up on The Numbers.

    Steps: clean the scraped title, spell out "&" as "and" and "#" as
    "Number", drop every character that is not a word character, whitespace,
    colon or hyphen, remove accents, then join the words with hyphens and
    append "-(<year>)".

    Article handling: only an article that is the FIRST word of the title is
    moved -- to just after the main title (the part before the first colon).
    Articles elsewhere stay where they are, so "Kingdom of the Planet of the
    Apes" keeps both of its "the". Any further colons are treated as plain
    word separators and never reach the slug.

    Args:
        title: Title as scraped (may carry a ranking prefix and a year).
        year: Release year appended in parentheses.

    Returns:
        The slug, e.g. "Fall-Guy-The-(2024)" for "The Fall Guy".
    """
    leadingArticles = {'the', 'a', 'an', 'el', 'la', 'los', 'las', 'un', 'una', 'unos', 'unas'}

    title = cleanMovieTitle(title)
    title = title.replace('&', 'and')
    title = title.replace('#', 'Number')
    title = re.sub(r'[^\w\s:-]', '', title)
    title = remove_accents(title)

    # Split at the first colon only; without a colon the subtitle is empty.
    main, _, subtitle = title.partition(':')
    words = main.split()
    articles = []
    if words and words[0].lower() in leadingArticles:
        articles.append(words.pop(0))  # remove and store

    # Remaining colons (titles with several) become separators, not slug text.
    final_words = words + articles + subtitle.replace(':', ' ').split()

    formatted_title = '-'.join(final_words) + f"-({year})"
    # Collapse runs of hyphens, e.g. from a " - " inside the title.
    formatted_title = re.sub(r'-+', '-', formatted_title)
    return formatted_title

def getBoxOfficeDataFromTheNumbers(movie_title: str, country_code: str, year: str = "2024", timeout: float = 15.0) -> tuple[Optional[str], Optional[str], Optional[str]]:
    """Look up budget and box-office figures for a movie on The Numbers.

    Args:
        movie_title: Title as scraped; an empty title short-circuits the lookup.
        country_code: Used only in log messages.
        year: Release year used to build the page URL.
        timeout: Seconds to wait for the HTTP response. Without a timeout a
            stalled connection would block the whole scraping run.

    Returns:
        ``(budget, domestic_gross, worldwide_gross)`` as the text found on the
        page; each element is None when it could not be found. Any failure
        (including an HTTP error status) is logged and yields the values
        collected so far, never an exception.
    """
    budget = None
    domestic_gross = None
    worldwide_gross = None

    if not movie_title:
        return budget, domestic_gross, worldwide_gross

    formatted_title = formatMovieTitleForTheNumbers(movie_title, year)
    thenumbers_url = f"https://www.the-numbers.com/movie/{formatted_title}#tab=summary"

    try:
        logEvent(f"Accessing URL for {country_code} movie: {thenumbers_url}")
        response = requests.get(thenumbers_url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Locate the production budget inside the summary section.
        summary_div = soup.find("div", id="summary")
        if summary_div:
            budget_row = summary_div.find("b", string=re.compile(r"Production(\xa0| )Budget:"))
            if budget_row:
                td = budget_row.find_next("td")
                if td:
                    # Keep only the amount itself (e.g. "$6,000,000").
                    match = re.search(r"\$\d[\d,]*", td.text)
                    if match:
                        budget = match.group(0).strip()

        # Scan every table for the gross rows; a later table overrides an earlier one.
        tables = soup.find_all("table")
        for table in tables:
            # Domestic Box Office
            domestic_element = table.find(string=re.compile("Domestic Box Office", re.IGNORECASE))
            if domestic_element:
                row = domestic_element.find_parent("tr")
                if row:
                    data_cells = row.find_all("td", class_="data")
                    if data_cells:
                        domestic_gross = data_cells[0].text.strip()

            # Worldwide Box Office
            worldwide_element = table.find(string=re.compile("Worldwide Box Office", re.IGNORECASE))
            if worldwide_element:
                row = worldwide_element.find_parent("tr")
                if row:
                    data_cells = row.find_all("td", class_="data")
                    if data_cells:
                        worldwide_gross = data_cells[0].text.strip()

        logEvent(f"Retrieved data for {movie_title} ({country_code}): Budget={budget}, Domestic={domestic_gross}, Worldwide={worldwide_gross}")

    except Exception as e:
        logError(f"Failed to retrieve box office data from The Numbers for {movie_title} ({country_code})\n{repr(e)}")

    return budget, domestic_gross, worldwide_gross

def scrapeNationalAwards(movieURL: str, country_info: Dict, timeout: float = 15.0) -> List[str]:
    """Return the categories a movie WON at its country's national award.

    Args:
        movieURL: Absolute IMDb title URL, with or without a trailing slash.
        country_info: Mapping with ``"award_id"``, ``"award_name"`` and
            ``"code"``; it selects which award section of the page is read.
        timeout: Seconds to wait for the HTTP response. Without a timeout a
            stalled connection would block the whole scraping run.

    Returns:
        Category names of winning entries only (nominations are skipped).
        Empty when the page cannot be retrieved, the award section is absent,
        or parsing fails; failures are logged rather than raised.
    """
    awardsURL = movieURL.rstrip("/") + "/awards/"
    awards = []
    award_id = country_info["award_id"]
    award_name = country_info["award_name"]
    country_code = country_info["code"]

    try:
        response = requests.get(awardsURL, headers=httpHeaders, timeout=timeout)
        response.raise_for_status()
    except Exception as e:
        logError(f"Failed to retrieve awards page for {movieURL} ({country_code})\n{repr(e)}")
        return awards

    soup = BeautifulSoup(response.text, 'html.parser')

    try:
        # First choice: the section whose data-testid contains the award's event id.
        award_section = soup.find("div", attrs={"data-testid": lambda v: v and f"sub-section-{award_id}" in v})

        if not award_section:
            # Fallback: find the award by its display name and climb to its section.
            award_spans = soup.find_all("span", string=lambda s: s and award_name in s)
            for span in award_spans:
                section = span.find_parent("div", attrs={"data-testid": lambda v: v and "sub-section" in v})
                if section:
                    award_section = section
                    break

        if award_section:
            # Winner markers in the English and Spanish versions of the page.
            winner_terms = ["Winner", "Ganador", "Ganadora"]

            # Collect every entry of the award list.
            award_items = award_section.find_all("li", class_="ipc-metadata-list-summary-item")

            if not award_items:
                # Nothing matched the specific selector; retry with a looser one.
                award_items = award_section.find_all("div", class_=lambda c: c and "ipc-metadata-list-summary-item" in c)

            for item in award_items:
                # Skip entries that are not wins.
                is_winner = any(term in item.text for term in winner_terms)
                if not is_winner:
                    continue

                # Preferred source of the category name.
                category_span = item.find("span", class_=lambda c: c and "awardCategoryName" in c)

                if category_span:
                    category = category_span.text.strip()
                    awards.append(category)
                else:
                    # Fallback: first inline-list item, accepted only if its
                    # text looks like a category ("Best ..." / "Mejor ...").
                    category_li = item.find("li", class_="ipc-inline-list__item")
                    if category_li:
                        category_span = category_li.find("span")
                        if category_span and ("Best" in category_span.text or "Mejor" in category_span.text):
                            category = category_span.text.strip()
                            awards.append(category)

        # Log the outcome to help debugging.
        if awards:
            logEvent(f"Found {len(awards)} {award_name} awards for a {country_code} movie: {awards}")
        else:
            logEvent(f"No {award_name} awards found for a {country_code} movie")

    except Exception as e:
        logError(f"Error in award scraper for {country_code}: {repr(e)}")

    return awards

# Generic entity saver/loader functions
def saveEntityListAsJSON(entities: List, filename: str):
    """Write dataclass instances to ``filename`` as a UTF-8 JSON array.

    Each entity is converted with ``asdict``; output is indented by two spaces
    and non-ASCII characters are written as-is rather than escaped.
    """
    records = list(map(asdict, entities))
    serialized = json.dumps(records, ensure_ascii=False, indent=2)
    with open(filename, 'w', encoding='utf-8') as out:
        out.write(serialized)

def saveEntityListAsCSV(entities: List, filename: str):
    """Write dataclass instances to ``filename`` as UTF-8 CSV with a header row.

    Column names come from the fields of the first entity. An empty list
    writes nothing and does not create the file.
    """
    if entities:
        columns = list(asdict(entities[0]))
        with open(filename, 'w', encoding='utf-8', newline='') as out:
            table = csv.DictWriter(out, fieldnames=columns)
            table.writeheader()
            table.writerows(asdict(entity) for entity in entities)

def scrape_country_movies(country_info: Dict) -> List[MovieInfo]:
    """Scrape, enrich and persist the movies of a single country.

    Args:
        country_info: Country entry providing ``"code"``, ``"name"`` and
            ``"award_name"`` (plus whatever the individual scrapers read).

    Returns:
        The enriched movies. They are also written to
        ``movies_<code>.json`` and ``movies_<code>.csv`` (code lower-cased).
    """
    country_code, country_name = country_info["code"], country_info["name"]

    print(f"Scraping movies from {country_name}...")
    movies = scrapeIMDbMoviesWithSlidingWindow(country_info)

    print(f"Found {len(movies)} movies from {country_name}")
    print(f"Enriching with credits, financial data, and {country_info['award_name']}...")

    for movie in tqdm(movies):
        # Credits from the IMDb full-credits page.
        movie.directors, movie.thespians = scrapeMovieCredits(movie.url)

        # Financial data from The Numbers; the year is taken from the title
        # when it carries one in parentheses, otherwise 2024 is assumed.
        yearFound = re.search(r"\((\d{4})\)", movie.title)
        releaseYear = yearFound.group(1) if yearFound else "2024"
        movie.budget, movie.domesticGross, movie.worldwideGross = getBoxOfficeDataFromTheNumbers(
            movie.title, country_code, releaseYear)

        # Wins at this country's national award.
        movie.awards = scrapeNationalAwards(movie.url, country_info)

        # Pause between movies to respect the websites' rate limits.
        time.sleep(1)

    # Persist this country's results in both formats.
    outputStem = f'movies_{country_code.lower()}'
    saveEntityListAsJSON(movies, outputStem + '.json')
    saveEntityListAsCSV(movies, outputStem + '.csv')

    return movies

def main():
    """Scrape every configured country and save the combined result.

    Countries are processed in the order of ``COUNTRIES``, with a five-second
    pause between consecutive countries (none after the last). The combined
    list is written to ``movies_all_countries.json`` and ``.csv``.
    """
    all_movies = []
    lastPosition = len(COUNTRIES) - 1

    for position, country_info in enumerate(COUNTRIES.values()):
        all_movies.extend(scrape_country_movies(country_info))

        # Pause between countries to avoid overloading the servers.
        if position < lastPosition:
            print("Pausing between countries...")
            time.sleep(5)

    # Save the combined data for all countries.
    saveEntityListAsJSON(all_movies, 'movies_all_countries.json')
    saveEntityListAsCSV(all_movies, 'movies_all_countries.csv')
    print(f"Done! Scraped {len(all_movies)} movies from all countries with their data.")

# Run the full multi-country scrape only when executed directly, not on import.
if __name__ == "__main__":
    main()

Scraping movies from United States...
Found 809 movies from United States
Enriching with credits, financial data, and Academy Awards...


 23%|██▎       | 190/809 [20:49<1:02:57,  6.10s/it]