In [1]:
import pandas as pd
import ast
import requests
import re
import unidecode
from bs4 import BeautifulSoup
from bs4.element import Tag
import requests
from bs4 import BeautifulSoup
import time
from tqdm import tqdm

## Extracting Awards from Directors URLs

In [2]:
def parsing_movies_info(name_archive: str) -> pd.DataFrame:
    """
    Load the movies CSV and derive flat director name / URL columns.

    The ``directors`` column is read as text and each cell is parsed back into
    a Python list with ``ast.literal_eval``. Cells that are not strings, do not
    start with ``[``, or are not a valid literal become an empty list, so a
    single malformed row cannot abort the whole load.

    Args:
        name_archive: path of the CSV file with the movie info

    Returns:
        movies_df: dataframe with the movie info, where ``directors`` holds
            lists and two columns are added: ``director_names`` and
            ``director_urls``, each a ``', '``-joined string ('' when the
            movie has no parsed directors)
    """
    def _parse_directors(cell) -> list:
        # Only text that looks like a list literal is worth parsing.
        if not isinstance(cell, str) or not cell.startswith('['):
            return []
        try:
            parsed = ast.literal_eval(cell)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # Truncated or otherwise malformed literal: treat as "no directors".
            return []
        return parsed if isinstance(parsed, list) else []

    def _join_field(directors, field: str) -> str:
        if not isinstance(directors, list):
            return ''
        # Entries that are not dicts carry no name/url and are skipped.
        return ', '.join(d.get(field, '') for d in directors if isinstance(d, dict))

    movies_df = pd.read_csv(name_archive)

    # Convert stringified lists to actual Python lists
    movies_df['directors'] = movies_df['directors'].apply(_parse_directors)

    # Now extract names and URLs
    movies_df['director_names'] = movies_df['directors'].apply(
        lambda directors: _join_field(directors, 'name')
    )
    movies_df['director_urls'] = movies_df['directors'].apply(
        lambda directors: _join_field(directors, 'url')
    )
    return movies_df

def parsing_directors_URL(movies_df: pd.DataFrame) -> list:
    """
    Flatten the ``director_urls`` column into a plain list of URLs.

    Each cell is split on ``', '`` and the pieces are concatenated in row
    order. Missing cells and empty pieces (for example the trailing piece of
    ``'u1, '``) are dropped so that no blank or NaN entry reaches the scraper.
    Duplicates are kept: a URL appears once per movie row that lists it.

    Args:
        movies_df: dataframe with the movie info; must have a
            ``director_urls`` column of ``', '``-joined strings

    Returns:
        director_urls: list with the directors urls
    """
    url_fragments = (
        movies_df['director_urls']
        .dropna()            # NaN cells would otherwise explode into NaN entries
        .astype(str)
        .str.split(', ')
        .explode()
        .str.strip()
    )
    return url_fragments[url_fragments != ''].tolist()

def get_director_raw_info(url: str) -> tuple[list[Tag], str]:
    """
    Download a director's IMDb page and pull out the raw list items plus the
    director's name.

    Args:
        url: IMDb URL of the director's page

    Returns:
        dir_info_raw: every ``<li>`` element of the page that carries a
            ``data-testid`` attribute
        name: name of the director taken from the page ``<title>``, or
            'Nombre no encontrado' when the page has no title tag

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            non-success HTTP status (via ``raise_for_status``)
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }

    # requests never times out by default; bound the wait so one stalled
    # connection cannot hang an entire scraping loop.
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    dir_info_raw = soup.find_all('li', {'data-testid': True})

    # Extracting director name from the title
    title_tag = soup.find('title')
    if title_tag:
        page_title = title_tag.get_text(strip=True)
        # Prefer the spaced " - " separator so hyphenated names
        # ("Jean-Pierre ...") are not cut at their own hyphen; fall back to a
        # bare "-" only when the title has no spaced separator.
        separator = ' - ' if ' - ' in page_title else '-'
        name = page_title.split(separator)[0].strip()
    else:
        name = 'Nombre no encontrado'

    return dir_info_raw, name

def get_director_structured_info(name: str, item: Tag, director_url: str) -> dict:
    """
    Convert one raw credit element from a director's page into a flat record.

    Args:
        name: name of the director
        item: raw HTML item from the IMDb page
        director_url: URL of the director page the item came from; copied
            into the record unchanged

    Returns:
        response: dict with the keys ``name``, ``category``, ``movie_name``,
                  ``rating``, ``movie_url`` and ``director_url``. When the
                  item's ``data-testid`` does not start with "cred", or any
                  step raises, a dict holding only an ``error`` message is
                  returned instead.
    """
    try:
        category = item.get("data-testid", default="no_category")
        is_credit = category.startswith("cred")
        if not is_credit:
            return {"error": "unexpected category"}

        # The category label is the text between the first two underscores of
        # the test id; it is then passed through unidecode.
        match = re.search(r'_(.*?)_', category)
        label = unidecode.unidecode(match.group(1))

        # The anchor carrying an aria-label provides both the title and the link.
        info = item.find('a', {'aria-label': True})
        title_text = info["aria-label"]
        title_link = f"https://www.imdb.com{info.get('href')}"

        # Credits without a rating element are reported as 'N/A'.
        star = item.find('span', class_='ipc-rating-star--rating')
        if star:
            rating_text = star.get_text(strip=True)
        else:
            rating_text = 'N/A'

        return {
            "name": name,
            "category": label,
            "movie_name": title_text,
            "rating": rating_text,
            "movie_url": title_link,
            "director_url": director_url
        }
    except Exception as ex:
        return {"error": f"{ex}"}

def obtaining_info_per_url(urls: list[str]) -> list[dict]:
    """
    Scrape every director page in ``urls`` and collect the structured credits.

    For each URL the raw list items are fetched, each item is converted to a
    record, and records carrying an ``error`` key are discarded. A failure
    while processing one URL is printed and the loop moves on to the next.

    Args:
        urls: list of IMDb URLs for directors

    Returns:
        all_structured_data: list of dictionaries containing structured movie info for each director
    """
    all_structured_data = []

    for url in tqdm(urls):
        try:
            raw_items, director_name = get_director_raw_info(url)

            # Convert every raw item first, then keep only the clean records.
            records = [
                get_director_structured_info(director_name, raw_item, url)
                for raw_item in raw_items
            ]
            all_structured_data += [record for record in records if "error" not in record]
        except Exception as e:
            print(f"Error processing URL {url}: {e}")

    return all_structured_data

def scrapeDirectorAwards(director_url: str) -> list[dict]:
    """
    Scrape the ``/awards`` page of a director and return one record per award
    block found on it.

    Args:
        director_url: IMDb URL of the director's page; a trailing slash is
            tolerated

    Returns:
        List of dicts with the keys ``director_name``, ``director_url``,
        ``year_result``, ``award_name``, ``category`` and ``title``. The list
        is empty when the page cannot be downloaded. Award blocks that fail
        to parse are reported with a printed warning and skipped.
    """
    awards_url = director_url.rstrip('/') + "/awards"
    headers = {
        "User-Agent": "Mozilla/5.0"
    }

    try:
        # requests never times out by default; bound the wait so one stalled
        # connection cannot hang the whole scraping loop.
        response = requests.get(awards_url, headers=headers, timeout=30)
        response.raise_for_status()
    except Exception as e:
        print(f"❌ Failed to scrape {awards_url}: {e}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    award_divs = soup.select(".ipc-metadata-list-summary-item__tc")

    # Extract director's name from the <title> tag
    director_name = ""
    try:
        director_name_tag = soup.select_one("title")
        if director_name_tag:
            page_title = director_name_tag.text
            # Prefer the spaced " - " separator so hyphenated names are not
            # cut at their own hyphen; fall back to a bare "-" otherwise.
            separator = " - " if " - " in page_title else "-"
            director_name = page_title.split(separator)[0].strip()
    except Exception as e:
        # Still best-effort (the name stays empty), but no longer silent.
        print(f"⚠️ Failed to read director name from {awards_url}: {e}")

    results = []
    for award_div in award_divs:
        try:
            main_award = award_div.select_one("a.ipc-metadata-list-summary-item__t")
            if not main_award:
                continue
            # First child of the link is used as the "year result" text and
            # the nested <span> as the award name.
            year_result = main_award.contents[0].strip()
            award_name = main_award.select_one("span").text.strip()

            category_tag = award_div.select_one(".awardCategoryName")
            category = category_tag.text.strip() if category_tag else ""

            title_tag = award_div.select_one(".ipc-metadata-list-summary-item__stl a")
            title = title_tag.text.strip() if title_tag else ""

            results.append({
                "director_name": director_name,
                "director_url": director_url,
                "year_result": year_result,
                "award_name": award_name,
                "category": category,
                "title": title
            })
        except Exception as e:
            print(f"⚠️ Failed to parse one award block: {e}")
            continue

    return results

In [3]:
# movies_df = parsing_movies_info("outputs/movies.csv")
# urls = parsing_directors_URL(movies_df)
# all_structured_data = obtaining_info_per_url(urls)
# df_directors = pd.DataFrame(all_structured_data)

In [4]:
# all_awards = pd.DataFrame()
# for url in tqdm(urls):
#     awards_data = scrapeDirectorAwards(url)
#     if awards_data:  # Only proceed if something was scraped
#         awards_df = pd.DataFrame(awards_data)
#         all_awards = pd.concat([all_awards, awards_df], ignore_index=True)

In [5]:
# all_awards.to_csv("./outputs/directors_awards_info.csv", index=False)
# df_directors.to_csv("./outputs/directors_info.csv", index=False)

## Extracting top directors from Awards pages

In [6]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from selenium.webdriver.chrome.options import Options

## Oscars Awards

In [7]:
# Site root; relative hrefs scraped from the pages are appended to it.
BASE_URL = "https://www.imdb.com"
# Landing page of the Oscars section under the es-es path.
OSCARS_HOME = BASE_URL + "/es-es/oscars/"
# Header set sent with the plain `requests` calls.
HEADERS = {"User-Agent": "Mozilla/5.0"}

In [8]:
def extract_director_urls_from_section(section):
    """
    Return the absolute URL of the first person link inside ``section``.

    Despite the plural in the name, a single URL string is returned: the first
    ``<a>`` whose href contains "/name/", with its query string removed and
    ``BASE_URL`` prepended.

    Args:
        section: parsed HTML element to search (must support ``select_one``)

    Returns:
        Absolute URL string of the first matching link.

    Raises:
        IndexError: when ``section`` contains no link with "/name/" in its href
    """
    # Only the first match is ever used, so there is no need to collect all links.
    first_link = section.select_one('a[href*="/name/"]')
    if first_link is None:
        # Same exception type as indexing an empty list, but with a message
        # that says what was actually missing.
        raise IndexError('no link containing "/name/" found in section')

    relative_url = first_link.get("href")
    # Drop everything after "?" so the same person always yields the same URL.
    return BASE_URL + relative_url.split("?")[0]

def get_year_urls_by_decade_with_selenium():
    """
    Open the Oscars landing page in headless Chrome and collect the per-year
    event links shown under each of the first five decade tabs.

    Returns:
        decade_to_years: dict mapping each decade tab label to a list of
            ``(link_text, absolute_url)`` tuples. A decade whose tab could
            not be clicked is left out.
    """
    options = Options()
    options.add_argument("--headless")
    options.add_argument("window-size=1920x1080")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
    driver = webdriver.Chrome(options=options)

    decade_to_years = {}

    # try/finally guarantees the browser process is shut down even when
    # navigation or parsing raises part-way through.
    try:
        driver.get(OSCARS_HOME)
        time.sleep(2)

        decade_tabs = driver.find_elements(By.CSS_SELECTOR, '.ipc-tabs[role="tablist"] li[role="tab"]')
        decade_tabs = decade_tabs[:5]

        for tab in decade_tabs:
            decade_label = tab.text.strip()
            print(f"📆 Switching to: {decade_label}")

            try:
                # Scroll into view and wait until clickable
                driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", tab)
                WebDriverWait(driver, 10).until(EC.element_to_be_clickable(tab))
                time.sleep(1)
                tab.click()
                time.sleep(2)  # Let the page update
            except Exception as e:
                print(f"❌ Could not click tab for {decade_label}: {e}")
                continue

            # Parse page source after switching
            soup = BeautifulSoup(driver.page_source, "html.parser")
            year_links = soup.select(".ipc-chip-list__scroller a[href*='/event/ev0000003/']")
            # The last matching link is deliberately dropped — presumably it
            # is not a year entry; TODO confirm against the live page.
            years = [(a.text.strip(), "https://www.imdb.com" + a['href']) for a in year_links][:-1]

            decade_to_years[decade_label] = years
            time.sleep(1)
    finally:
        driver.quit()

    return decade_to_years

def extract_best_director(year, url, decade):
    """
    Download one Oscars ceremony page and list its Best Director nominees.

    Args:
        year: year label of the ceremony; copied into every record
        url: URL of the ceremony page
        decade: decade label the year belongs to; copied into every record

    Returns:
        List of dicts with the keys ``decade``, ``year``, ``result``,
        ``director``, ``movie`` and ``director url``; ``result`` defaults to
        "Nominee" when an item has no signpost text. Returns ``None`` when
        the page cannot be downloaded or holds no Best Director section, so
        callers must handle ``None``.
    """
    try:
        # requests never times out by default; bound the wait so one stalled
        # connection cannot hang the whole scrape.
        res = requests.get(url, headers=HEADERS, timeout=30)
        res.raise_for_status()
    except Exception as e:
        print(f"❌ Failed to get {url}: {e}")
        return None

    soup = BeautifulSoup(res.text, "html.parser")
    # Two section ids are tried in turn for the directing category.
    section = soup.find("section", {"data-testid": "BestAchievementinDirecting"})
    if not section:
        section = soup.find("section", {"data-testid": "BestDirector"})
        if not section:
            print(f"⚠️ No Best Director section found for {year}")
            return None

    nominees = []
    nominee_items = section.select("li.ipc-metadata-list__item")

    for item in nominee_items:
        try:
            result_tag = item.select_one(".ipc-signpost__text")
            result = result_tag.text.strip() if result_tag else "Nominee"

            director_tag = item.select_one('a.ipc-link[href*="/name/"]')
            director = director_tag.text.strip() if director_tag else ""

            # Raises when the item has no person link; the item is then
            # skipped by the except clause below.
            director_url = extract_director_urls_from_section(item)

            movie_tag = item.select_one('a[href*="/title/"]')
            movie = movie_tag.text.strip() if movie_tag else ""

            nominees.append({
                "decade": decade,
                "year": year,
                "result": result,
                "director": director,
                "movie": movie,
                "director url": director_url
            })
        except Exception as e:
            print(f"⚠️ Error parsing nominee: {e}")
            continue
    return nominees

def scrape_all_best_director_winners():
    """
    Scrape the Best Director records for every ceremony year found by
    ``get_year_urls_by_decade_with_selenium``.

    Despite the name, every record returned by ``extract_best_director`` is
    kept, not only winners; the ``result`` column tells them apart.

    Returns:
        pandas DataFrame with one row per scraped record (empty when nothing
        could be scraped).
    """
    decade_to_years = get_year_urls_by_decade_with_selenium()
    all_data = []

    for decade, year_list in decade_to_years.items():
        for year, url in year_list:
            print(f"🔍 Scraping {year} from {url}")
            nominees = extract_best_director(year, url, decade)
            # extract_best_director returns None on a failed download or a
            # missing section; extending with None would raise TypeError and
            # discard everything scraped so far.
            if nominees:
                all_data.extend(nominees)  # Append all nominees
            time.sleep(1)

    return pd.DataFrame(all_data)

In [9]:
# Run the full Oscars scrape: Selenium collects the per-year ceremony URLs,
# then each ceremony page is fetched for its Best Director records.
# The result is a DataFrame with one row per record.
oscars_info = scrape_all_best_director_winners()

📆 Switching to: Década de 2020
📆 Switching to: Década de 2010
📆 Switching to: Década de 2000
📆 Switching to: Década de 1990
📆 Switching to: Década de 1980
🔍 Scraping 2025 from https://www.imdb.com/es-es/event/ev0000003/2025/1/?ref_=fea_acd_ww_fea_eds_center-29_yr_1
🔍 Scraping 2024 from https://www.imdb.com/es-es/event/ev0000003/2024/1/?ref_=fea_acd_ww_fea_eds_center-29_yr_2
🔍 Scraping 2023 from https://www.imdb.com/es-es/event/ev0000003/2023/1/?ref_=fea_acd_ww_fea_eds_center-29_yr_3
🔍 Scraping 2022 from https://www.imdb.com/es-es/event/ev0000003/2022/1/?ref_=fea_acd_ww_fea_eds_center-29_yr_4
🔍 Scraping 2021 from https://www.imdb.com/es-es/event/ev0000003/2021/1/?ref_=fea_acd_ww_fea_eds_center-29_yr_5
🔍 Scraping 2020 from https://www.imdb.com/es-es/event/ev0000003/2020/1/?ref_=fea_acd_ww_fea_eds_center-29_yr_6
🔍 Scraping 2019 from https://www.imdb.com/es-es/event/ev0000003/2019/1/?ref_=fea_acd_ww_fea_eds_center-29_yr_1
🔍 Scraping 2018 from https://www.imdb.com/es-es/event/ev0000003/2018

In [10]:
# Persist the Oscars table as CSV without the index column.
# The path is relative to the notebook's working directory.
oscars_info.to_csv('outputs/oscars_awards_directors_info.csv', index=False)

In [11]:
# Deduplicate the director URLs so each person page is requested once, then
# scrape every page's credit items into a flat list of records and wrap it
# in a DataFrame.
oscars_directors_url = oscars_info["director url"].unique().tolist()
oscars_directors_complete_info = obtaining_info_per_url(oscars_directors_url)
df_oscars_directors_complete_info = pd.DataFrame(oscars_directors_complete_info)

100%|██████████| 147/147 [03:54<00:00,  1.60s/it]


In [12]:
# Persist the per-director credit records for the Oscars directors as CSV
# without the index column (path relative to the working directory).
df_oscars_directors_complete_info.to_csv('outputs/oscars_directors_complete_info.csv', index=False)

## Golden Globes Awards

In [13]:
# Site root; relative hrefs scraped from the pages are appended to it.
BASE_URL = "https://www.imdb.com"
# Landing page of the Golden Globes section under the es-es path.
GG_HOME = BASE_URL + "/es-es/golden-globes/"
# Header set sent with the plain `requests` calls.
HEADERS = {"User-Agent": "Mozilla/5.0"}

In [14]:
def get_year_urls_by_decade_with_selenium():
    """
    Open the Golden Globes landing page in headless Chrome and collect the
    per-year event links shown under each of the first five decade tabs.

    Returns:
        decade_to_years: dict mapping each decade tab label to a list of
            ``(link_text, absolute_url)`` tuples. A decade whose tab could
            not be clicked is left out.
    """
    options = Options()
    options.add_argument("--headless")
    options.add_argument("window-size=1920x1080")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
    driver = webdriver.Chrome(options=options)

    decade_to_years = {}

    # try/finally guarantees the browser process is shut down even when
    # navigation or parsing raises part-way through.
    try:
        driver.get(GG_HOME)
        time.sleep(2)

        decade_tabs = driver.find_elements(By.CSS_SELECTOR, '.ipc-tabs[role="tablist"] li[role="tab"]')
        decade_tabs = decade_tabs[:5]

        for tab in decade_tabs:
            decade_label = tab.text.strip()
            print(f"📆 Switching to: {decade_label}")

            try:
                # Scroll into view and wait until clickable
                driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", tab)
                WebDriverWait(driver, 10).until(EC.element_to_be_clickable(tab))
                time.sleep(1)
                tab.click()
                time.sleep(2)  # Let the page update
            except Exception as e:
                print(f"❌ Could not click tab for {decade_label}: {e}")
                continue

            # Parse page source after switching
            soup = BeautifulSoup(driver.page_source, "html.parser")
            year_links = soup.select(".ipc-chip-list__scroller a[href*='/event/ev0000292/']")
            # The last matching link is deliberately dropped — presumably it
            # is not a year entry; TODO confirm against the live page.
            years = [(a.text.strip(), "https://www.imdb.com" + a['href']) for a in year_links][:-1]

            decade_to_years[decade_label] = years
            time.sleep(1)
    finally:
        driver.quit()

    return decade_to_years

def extract_best_director(year, url, decade):
    """
    Download one Golden Globes ceremony page and list its Best Director
    (Motion Picture) nominees.

    Args:
        year: year label of the ceremony; copied into every record
        url: URL of the ceremony page
        decade: decade label the year belongs to; copied into every record

    Returns:
        List of dicts with the keys ``decade``, ``year``, ``result``,
        ``director``, ``movie`` and ``director url``; ``result`` defaults to
        "Nominee" when an item has no signpost text. Returns ``None`` when
        the page cannot be downloaded or holds no Best Director section, so
        callers must handle ``None``.
    """
    try:
        # requests never times out by default; bound the wait so one stalled
        # connection cannot hang the whole scrape.
        res = requests.get(url, headers=HEADERS, timeout=30)
        res.raise_for_status()
    except Exception as e:
        print(f"❌ Failed to get {url}: {e}")
        return None

    soup = BeautifulSoup(res.text, "html.parser")
    # Two spellings of the section id are tried in turn (comma and hyphen).
    section = soup.find("section", {"data-testid": "BestDirector,MotionPicture"})
    if not section:
        section = soup.find("section", {"data-testid": "BestDirector-MotionPicture"})
        if not section:
            print(f"⚠️ No Best Director section found for {year}")
            return None

    nominees = []
    nominee_items = section.select("li.ipc-metadata-list__item")

    for item in nominee_items:
        try:
            result_tag = item.select_one(".ipc-signpost__text")
            result = result_tag.text.strip() if result_tag else "Nominee"

            director_tag = item.select_one('a.ipc-link[href*="/name/"]')
            director = director_tag.text.strip() if director_tag else ""

            # Raises when the item has no person link; the item is then
            # skipped by the except clause below.
            director_url = extract_director_urls_from_section(item)

            movie_tag = item.select_one('a[href*="/title/"]')
            movie = movie_tag.text.strip() if movie_tag else ""

            nominees.append({
                "decade": decade,
                "year": year,
                "result": result,
                "director": director,
                "movie": movie,
                "director url": director_url
            })
        except Exception as e:
            print(f"⚠️ Error parsing nominee: {e}")
            continue
    return nominees

def scrape_all_best_director_winners():
    """
    Scrape the Best Director records for every ceremony year found by
    ``get_year_urls_by_decade_with_selenium``.

    Despite the name, every record returned by ``extract_best_director`` is
    kept, not only winners; the ``result`` column tells them apart.

    Returns:
        pandas DataFrame with one row per scraped record (empty when nothing
        could be scraped).
    """
    decade_to_years = get_year_urls_by_decade_with_selenium()
    all_data = []

    for decade, year_list in decade_to_years.items():
        for year, url in year_list:
            print(f"🔍 Scraping {year} from {url}")
            nominees = extract_best_director(year, url, decade)
            # extract_best_director returns None on a failed download or a
            # missing section; extending with None would raise TypeError and
            # discard everything scraped so far.
            if nominees:
                all_data.extend(nominees)  # Append all nominees
            time.sleep(1)

    return pd.DataFrame(all_data)

In [15]:
# Run the full Golden Globes scrape. The function names are the same as in
# the Oscars section; the definitions in the cell directly above shadow the
# earlier ones, so this call targets the Golden Globes pages.
golden_globes_info = scrape_all_best_director_winners()

📆 Switching to: Década de 2020
📆 Switching to: Década de 2010
📆 Switching to: Década de 2000
📆 Switching to: Década de 1990
📆 Switching to: Década de 1980
🔍 Scraping 2025 from https://www.imdb.com/es-es/event/ev0000292/2025/1/?ref_=fea_globes_ww_fea_eds_center-29_yr_1
🔍 Scraping 2024 from https://www.imdb.com/es-es/event/ev0000292/2024/1/?ref_=fea_globes_ww_fea_eds_center-29_yr_2
🔍 Scraping 2023 from https://www.imdb.com/es-es/event/ev0000292/2023/1/?ref_=fea_globes_ww_fea_eds_center-29_yr_3
🔍 Scraping 2022 from https://www.imdb.com/es-es/event/ev0000292/2022/1/?ref_=fea_globes_ww_fea_eds_center-29_yr_4
🔍 Scraping 2021 from https://www.imdb.com/es-es/event/ev0000292/2021/1/?ref_=fea_globes_ww_fea_eds_center-29_yr_5
🔍 Scraping 2020 from https://www.imdb.com/es-es/event/ev0000292/2020/1/?ref_=fea_globes_ww_fea_eds_center-29_yr_6
🔍 Scraping 2019 from https://www.imdb.com/es-es/event/ev0000292/2019/1/?ref_=fea_globes_ww_fea_eds_center-29_yr_1
🔍 Scraping 2018 from https://www.imdb.com/es-es

In [16]:
# Persist the Golden Globes table as CSV without the index column.
# The path is relative to the notebook's working directory.
golden_globes_info.to_csv('outputs/golden_globes_awards_directors_info.csv', index=False)

In [17]:
# Deduplicate the director URLs so each person page is requested once, then
# scrape every page's credit items into a flat list of records and wrap it
# in a DataFrame.
gg_directors_url = golden_globes_info["director url"].unique().tolist()
gg_directors_complete_info = obtaining_info_per_url(gg_directors_url)
df_gg_directors_complete_info = pd.DataFrame(gg_directors_complete_info)

100%|██████████| 133/133 [04:39<00:00,  2.10s/it]


In [18]:
# Persist the per-director credit records for the Golden Globes directors as
# CSV without the index column (path relative to the working directory).
df_gg_directors_complete_info.to_csv('outputs/golden_globes_directors_complete_info.csv', index=False)