Notes:

1. If you have not already done so, install Selenium using `pip install selenium webdriver-manager`
2. If you have not already done so, install tqdm using `pip install tqdm`
3. Instead of storing bare strings and numbers, or generic objects, it is wiser to store typed objects; two good choices are:
    1. __[namedtuple](https://docs.python.org/3/library/collections.html#collections.namedtuple)__
    2. __[dataclass](https://docs.python.org/3/library/dataclasses.html)__

In [53]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from dataclasses import dataclass, field, asdict
from typing import Optional, List
import re
import time
from datetime import datetime

# IMDb title search used as the scraping entry point. The query string asks for
# title types tv_movie and feature, release dates 2024-01-01..2024-12-31, and
# country of origin ES.
url = 'https://www.imdb.com/search/title/?title_type=tv_movie,feature&release_date=2024-01-01,2024-12-31&country_of_origin=ES'

@dataclass
class Director:
    """A director credit: display name plus a link."""
    name: str  # name as shown in the credits table
    url: str   # absolute URL built from the credit's anchor href

@dataclass
class Thespian:
    """A cast-member credit: display name plus a link."""
    name: str  # name as shown in the cast table
    url: str   # absolute URL built from the credit's anchor href

@dataclass
class MovieInfo:
    """One title taken from the search results, plus its credits.

    Only ``title`` and ``url`` are required. The rating fields default to
    None when the corresponding element is absent, and the credit lists
    start empty so they can be filled in after construction.
    """
    title: str
    url: str
    imdbRating: Optional[float] = None
    imdbVotes: Optional[int] = None
    metascore: Optional[int] = None
    # The credit lists hold Director / Thespian records, not plain strings.
    # The names are quoted so the annotations are forward references and the
    # class does not depend on definition order.
    directors: List["Director"] = field(default_factory=list)
    thespians: List["Thespian"] = field(default_factory=list)

# Generic logger
def logEvent(msg: str, level: str = "INFO", filePath: str = "scrapingLog.log") -> None:
    """Append ``msg`` to a log file, one timestamped entry per line of text.

    Args:
        msg: Text to log. Surrounding whitespace is stripped and every
            remaining line becomes its own ``[timestamp] [level] line`` entry.
        level: Severity label written in square brackets.
        filePath: Log file to append to; created on first write.

    All lines of one call share the same timestamp. A blank message writes
    nothing and does not create the file.
    """
    lines = msg.strip().splitlines()
    if not lines:
        return
    timestamp = datetime.now().strftime("[%Y-%m-%d %H:%M:%S]")
    # Open once per call (not once per line) and pin the encoding so that
    # non-ASCII text logs the same way on every platform.
    with open(filePath, 'a', encoding='utf-8') as f:
        f.writelines(f"{timestamp} [{level}] {line}\n" for line in lines)

# Error logger shortcut
def logError(msg: str, filePath: str = "scrapingErrors.log") -> None:
    """Log ``msg`` through ``logEvent`` with level "ERROR" into ``filePath``."""
    logEvent(msg, "ERROR", filePath)

# Universal try-catcher with controllable flow
def trierCatcher(keepGoing, traceMsg, task, *taskArgs, **taskKwargs):
    """Run ``task`` only while the pipeline is still healthy.

    Returns a ``(succeeded, result)`` pair. When ``keepGoing`` is falsy the
    task is skipped and ``(False, None)`` is returned. When the task raises,
    ``traceMsg`` and the exception repr are passed to ``logError`` and
    ``(False, None)`` is returned. Otherwise the pair is ``(True, <return
    value of task>)``.
    """
    outcome = (False, None)
    if keepGoing:
        try:
            outcome = (True, task(*taskArgs, **taskKwargs))
        except Exception as err:
            logError(f"{traceMsg}\n{repr(err)}")
    return outcome

def _parseRating(text: str) -> Optional[float]:
    """Return the first decimal number found in ``text`` as a float, else None."""
    match = re.search(r"\d+(?:[.,]\d+)?", text)
    return float(match.group(0).replace(",", ".")) if match else None


def _parseVoteCount(text: str) -> Optional[int]:
    """Return the vote count found in ``text`` as an int, else None.

    Accepts plain counts with optional thousands separators ("1,234") and
    abbreviated counts with a K/M suffix ("(1.2K)").
    NOTE(review): the exact rendering of the vote-count span is assumed, not
    shown in this file -- verify against the live markup.
    """
    match = re.search(r"(\d[\d.,]*)\s*([KkMm])?", text)
    if not match:
        return None
    digits, suffix = match.groups()
    try:
        if suffix is None:
            # A plain count is a whole number, so any separator is a
            # thousands separator.
            return int(re.sub(r"[.,]", "", digits))
        scale = 1_000 if suffix.lower() == "k" else 1_000_000
        return round(float(digits.replace(",", ".")) * scale)
    except ValueError:
        # Malformed number: report "unknown" rather than dropping the movie.
        return None


def _parseMetascore(text: str) -> Optional[int]:
    """Return the first run of digits in ``text`` as an int, else None."""
    match = re.search(r"\d+", text)
    return int(match.group(0)) if match else None


# Extract movie info from current loaded page
def tryParseMovieItem(item) -> Optional[MovieInfo]:
    """Build a ``MovieInfo`` from one result-list element.

    Args:
        item: A parsed element exposing ``select_one`` (a BeautifulSoup tag
            as produced by ``extractMoviesFromPage``).

    Returns:
        The parsed movie, or None when the element has no title heading or
        no anchor, or when anything raises while parsing (the error is
        logged through ``logError``).

    The rating, vote count and metascore are converted to the numeric types
    declared on ``MovieInfo``. A missing or unparseable value becomes None.
    """
    try:
        titleBlock = item.select_one("div.dli-parent h3")
        if not titleBlock:
            return None
        title = titleBlock.text.strip()
        anchor = item.select_one("a")
        if not anchor:
            return None
        # Drop the tracking query string so the URL is the canonical title page.
        url = "https://www.imdb.com" + anchor['href'].split('?')[0]

        imdbRatingSpan = item.select_one("span.ipc-rating-star--rating")
        imdbVotesSpan = item.select_one("span.ipc-rating-star--voteCount")
        metascoreSpan = item.select_one("span.metacritic-score-box")

        imdbRating = _parseRating(imdbRatingSpan.text) if imdbRatingSpan else None
        imdbVotes = _parseVoteCount(imdbVotesSpan.text) if imdbVotesSpan else None
        metascore = _parseMetascore(metascoreSpan.text) if metascoreSpan else None

        return MovieInfo(title=title, url=url, imdbRating=imdbRating, imdbVotes=imdbVotes, metascore=metascore)
    except Exception as e:
        logError(f"Error parsing a movie item: {repr(e)}")
        return None

def extractMoviesFromPage(pageSource: str) -> List[MovieInfo]:
    """Parse ``pageSource`` and return every movie that could be extracted.

    Looks at the direct ``<li>`` children of ``ul.ipc-metadata-list``. Only
    elements carrying exactly one attribute are handed to
    ``tryParseMovieItem``, and elements it cannot parse are left out.
    """
    document = BeautifulSoup(pageSource, 'html.parser')
    parsed = (
        tryParseMovieItem(entry)
        for entry in document.select("ul.ipc-metadata-list > li")
        if len(entry.attrs) == 1
    )
    return [movie for movie in parsed if movie]

def getBrowser(someURL):
    """Start a Chrome WebDriver session and navigate it to ``someURL``.

    Returns the live driver; the caller is responsible for calling
    ``quit()`` on it. If navigation fails, the browser is shut down before
    the exception propagates so no browser process is left behind.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(someURL)
    except BaseException:
        # Also covers Ctrl-C during page load; the error is always re-raised.
        driver.quit()
        raise
    return driver

def scrapeIMDbMoviesWithSlidingWindow(someURL: str) -> List[MovieInfo]:
    """Scrape all movies from a paginated result page, batch by batch.

    Each loop iteration:
      1. parses the movies currently in the page and appends them to the result;
      2. removes up to the first 50 ``<li>`` elements of the result list from
         the DOM, so already-harvested items are not parsed again;
      3. waits for the "see more" button, reads its label, scrolls it into
         view and clicks it through JavaScript;
      4. takes the first integer in the button label as the size of the next
         batch (falling back to 50) and waits until at least that many list
         items are present.

    Every step goes through ``trierCatcher``; the first failing step logs its
    message, turns ``keepGoing`` off and ends the loop. Presumably the normal
    exit is the "see more" button no longer being found -- verify.

    Args:
        someURL: URL of the search-result page to open.

    Returns:
        The movies collected before the loop stopped.
    """
    movieList = []
    defaultBatchSize = 50
    pageBatchSize = 50
    sleepTimeSeconds = 0.5
    driverWaitTimeout = 10
    keepGoing = True
    nMoreButtonText = "ipc-see-more__button"
    buttonTextRetrievalJSCommand = "return arguments[0].innerText;"
    domPruningJSCommand = """
            const ul = document.querySelector("ul.ipc-metadata-list");
            const lis = ul.querySelectorAll("li");
            for (let i = 0; i < 50 && i < lis.length; i++) { lis[i].remove(); }
        """
    clicketyJSCommand = "arguments[0].click();"
    scrollJSCommand = "arguments[0].scrollIntoView({block: 'center'});"
    metadataList = "ipc-metadata-list-summary-item"
    pruningFailMsg = "JS movie LI cleanup failure"
    movieExtractionFailMsg = "Failed to extract movies from page"
    movieExtensionFailMsg = "Failed to append new movies"
    clickFailMsg = "Clickety failure"
    loadFailMsg = "New movie load wait failure"
    scrollFailMsg = "Scroll failure"
    batchSizeFailMsg = "Batch size update failure"
    sleepFailMsg = "Sleep failure"
    buttonFailMsg = "Button retrieval failure"
    buttonTextFailMsg = "Button text fetch failure"
    driver = getBrowser(someURL)

    # Expressions such as driver.page_source are evaluated before trierCatcher
    # is entered, so they can still raise here. The finally clause makes sure
    # the browser is closed on that path (and on Ctrl-C) too.
    try:
        while keepGoing:
            keepGoing, newMovies = trierCatcher(keepGoing, movieExtractionFailMsg, extractMoviesFromPage, driver.page_source)
            keepGoing, _ = trierCatcher(keepGoing, movieExtensionFailMsg, movieList.extend, newMovies)
            keepGoing, _ = trierCatcher(keepGoing, pruningFailMsg, driver.execute_script, domPruningJSCommand)
            keepGoing, _ = trierCatcher(keepGoing, sleepFailMsg, time.sleep, sleepTimeSeconds)
            keepGoing, button = trierCatcher(keepGoing, buttonFailMsg, WebDriverWait(driver, driverWaitTimeout).until, EC.element_to_be_clickable((By.CLASS_NAME, nMoreButtonText)))
            keepGoing, buttonText = trierCatcher(keepGoing, buttonTextFailMsg, driver.execute_script, buttonTextRetrievalJSCommand, button)
            keepGoing, _ = trierCatcher(keepGoing, scrollFailMsg, driver.execute_script, scrollJSCommand, button)
            keepGoing, _ = trierCatcher(keepGoing, sleepFailMsg, time.sleep, sleepTimeSeconds)
            keepGoing, _ = trierCatcher(keepGoing, clickFailMsg, driver.execute_script, clicketyJSCommand, button)
            keepGoing, match = trierCatcher(keepGoing, batchSizeFailMsg, re.search, r"(\d+)", buttonText)
            pageBatchSize = int(match.group(1)) if keepGoing and match else defaultBatchSize
            keepGoing, _ = trierCatcher(keepGoing, loadFailMsg, WebDriverWait(driver, driverWaitTimeout).until, lambda d: len(d.find_elements(By.CLASS_NAME, metadataList)) >= pageBatchSize)
    finally:
        driver.quit()
    return movieList

# Run the scrape: opens a Chrome session and collects every movie it can page through.
movies = scrapeIMDbMoviesWithSlidingWindow(url)

In [54]:
def scrapeMovieCredits(movieURL: str) -> tuple[List[Director], List[Thespian]]:
    """Fetch a movie's full-credits page and extract its directors and cast.

    Args:
        movieURL: Absolute URL of the movie's title page; a trailing slash is
            optional.

    Returns:
        ``(directors, thespians)``. Either list may be empty. Download and
        parsing failures are logged through ``logError`` and never raised, so
        a failed download yields two empty lists.
    """
    # Normalise the trailing slash so both ".../tt123" and ".../tt123/" work.
    fullCreditsURL = movieURL.rstrip("/") + "/fullcredits/"
    directors = []
    thespians = []
    try:
        # A timeout keeps one unresponsive request from stalling the whole run.
        response = requests.get(fullCreditsURL, timeout=30)
        response.raise_for_status()
    except Exception as e:
        logError(f"Failed to retrieve full credits page for {movieURL}\n{repr(e)}")
        return (directors, thespians)

    soup = BeautifulSoup(response.text, 'html.parser')

    # --- DIRECTORS ---
    # The director links live in the table that follows the "Directed by" heading.
    try:
        header = soup.find("h4", string=re.compile("Directed by"))
        if header:
            directorTable = header.find_next_sibling("table")
            if directorTable:
                for a in directorTable.find_all("a"):
                    name = a.text.strip()
                    url = "https://www.imdb.com" + a['href'].split('?')[0]
                    if name:
                        directors.append(Director(name=name, url=url))
    except Exception as e:
        logError(f"Failed parsing directors for {movieURL}\n{repr(e)}")

    # --- CAST ---
    # The name link is taken from the second cell of each cast row; rows whose
    # class is "castlist_label" are skipped.
    try:
        castTable = soup.find("table", class_="cast_list")
        if castTable:
            rows = castTable.find_all("tr", class_=lambda c: c != "castlist_label")
            for row in rows:
                cols = row.find_all("td")
                if len(cols) >= 2:
                    anchor = cols[1].find("a")
                    if anchor and anchor.text.strip():
                        name = anchor.text.strip()
                        url = "https://www.imdb.com" + anchor['href'].split('?')[0]
                        thespians.append(Thespian(name=name, url=url))
    except Exception as e:
        logError(f"Failed parsing cast for {movieURL}\n{repr(e)}")

    return (directors, thespians)

In [55]:
from tqdm import tqdm
import requests

# Enrich each scraped movie in place with the credits from its full-credits
# page; tqdm shows progress over the list.
for movie in tqdm(movies):
    directors, thespians = scrapeMovieCredits(movie.url)
    movie.directors = directors
    movie.thespians = thespians
    time.sleep(0.5)  # Respect IMDb: pause between requests

100%|██████████| 475/475 [09:44<00:00,  1.23s/it]


In [56]:
# Sanity check: how many movies were collected.
print(len(movies))

475


In [57]:
# Dump the full list using the dataclass reprs.
print(movies)



In [58]:
import json, csv

# Generic entity saver/loader functions
def saveEntityListAsJSON(entities: List, filename: str):
    """Write ``entities`` (dataclass instances) to ``filename`` as a JSON array.

    Each entity is converted with ``asdict``. The file is UTF-8, keeps
    non-ASCII characters as they are and is indented by two spaces.
    """
    with open(filename, 'w', encoding='utf-8') as sink:
        records = list(map(asdict, entities))
        json.dump(records, sink, ensure_ascii=False, indent=2)

def saveEntityListAsCSV(entities: List, filename: str):
    """Write ``entities`` (dataclass instances) to ``filename`` as CSV.

    The header comes from the fields of the first entity, and every entity
    becomes one row via ``asdict``. An empty list writes nothing and leaves
    the file untouched.
    """
    if entities:
        with open(filename, 'w', encoding='utf-8', newline='') as sink:
            columns = list(asdict(entities[0]))
            sheet = csv.DictWriter(sink, fieldnames=columns)
            sheet.writeheader()
            sheet.writerows(map(asdict, entities))

In [59]:
# Persist the enriched movie list in both formats, in the working directory.
saveEntityListAsJSON(movies, 'movies.json')
saveEntityListAsCSV(movies, 'movies.csv')