In [134]:
import pandas as pd
import ast
import requests
import re
import unidecode
from bs4 import BeautifulSoup
from bs4.element import Tag
import requests
from bs4 import BeautifulSoup
import time
from tqdm import tqdm
from urllib.parse import urlparse, urlunparse

## Extracting Awards from Directors URLs

In [135]:
def parsing_movies_info(name_archive: str) -> pd.DataFrame:
    """
    Load the movie catalogue and expand its 'directors' column.

    The 'directors' column is stored as the text form of a Python list of
    dicts. It is parsed back into real lists, and two helper columns are
    derived from it: the directors' names and their URLs, each joined with
    ', ' into a single string.

    Args:
        name_archive: path (or file-like object) of the CSV file with the movie info

    Returns:
        movies_df: the loaded dataframe with 'directors' converted to lists and
                   the extra 'director_names' and 'director_urls' columns
    """
    def _as_list(raw):
        # Only strings that look like a list literal are evaluated;
        # anything else (NaN, other text) becomes an empty list.
        if isinstance(raw, str) and raw.startswith('['):
            return ast.literal_eval(raw)
        return []

    def _joined(entries, field):
        # Comma-separated values of one field across all director dicts.
        if not isinstance(entries, list):
            return ''
        return ', '.join([entry.get(field, '') for entry in entries])

    movies_df = pd.read_csv(name_archive)
    movies_df['directors'] = movies_df['directors'].apply(_as_list)
    movies_df['director_names'] = movies_df['directors'].apply(
        lambda entries: _joined(entries, 'name')
    )
    movies_df['director_urls'] = movies_df['directors'].apply(
        lambda entries: _joined(entries, 'url')
    )
    return movies_df

def parsing_directors_URL(movies_df: pd.DataFrame) -> list:
    """
    Flatten the 'director_urls' column into a plain list of URLs.

    Rows whose 'director_urls' is the empty string are ignored. The remaining
    values are split on ', ' and every piece becomes one list entry, so a URL
    that appears on several rows appears several times in the result.

    Args:
        movies_df: dataframe with the movie info (needs a 'director_urls' column)

    Returns:
        director_urls: list with the directors urls
    """
    url_column = movies_df['director_urls']
    non_empty = url_column[url_column != '']
    per_row_lists = non_empty.str.split(', ')
    return per_row_lists.explode().tolist()

def get_director_raw_info(url: str, timeout: float = 30.0) -> tuple[list[Tag], str]:
    """
    Download a director's IMDb page and return its raw list items together
    with the director's name.

    Args:
        url: IMDb URL of the director's page
        timeout: seconds to wait for the server before giving up

    Returns:
        dir_info_raw: every <li> element of the page that carries a
                      'data-testid' attribute
        name: director name taken from the page <title>, or
              'Nombre no encontrado' when the page has no <title>

    Raises:
        requests.HTTPError: if the server answers with an error status
        requests.RequestException: on connection problems or timeout
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }

    # Without a timeout a single unresponsive request would stall the whole
    # scraping loop indefinitely.
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    dir_info_raw = soup.find_all('li', {'data-testid': True})

    title_tag = soup.find('title')
    if title_tag is None:
        return dir_info_raw, 'Nombre no encontrado'

    # The name is the part of the title before the first spaced separator
    # (" - "). Splitting on a bare '-' would truncate hyphenated names such
    # as "Jean-Pierre ..." to "Jean".
    title_text = title_tag.get_text(strip=True)
    name = re.split(r'\s+-\s+', title_text, maxsplit=1)[0].strip()

    return dir_info_raw, name

def get_director_structured_info(name: str, item: Tag, director_url: str) -> dict:
    """
    Turn one raw list item of a director's page into a structured record.

    Only items whose 'data-testid' starts with "cred" are treated as credits.
    Any item that cannot be parsed produces a dictionary with a single
    "error" key instead of raising, so callers can filter failures out.

    Args:
        name: name of the director
        item: raw HTML item from the IMDb page
        director_url: URL of the director page the item was taken from; copied
                      into the record unchanged

    Returns:
        response: dictionary with the keys "name", "category", "movie_name",
                  "rating", "movie_url" and "director_url"; or
                  {"error": <message>} when the item is not a usable credit
    """
    try:
        category = item.get("data-testid", default="no_category")

        if not category.startswith("cred"):
            return {"error": "unexpected category"}

        # The category is the text between the first two underscores of the test id.
        match = re.search(r'_(.*?)_', category)
        if match is None:
            return {"error": f"could not parse category from '{category}'"}

        info = item.find('a', {'aria-label': True})
        if info is None:
            return {"error": "credit link not found"}

        # A link without href used to produce the bogus URL
        # "https://www.imdb.comNone"; reject the item instead.
        href = info.get('href')
        if not href:
            return {"error": "credit link has no href"}

        rating_span = item.find('span', class_='ipc-rating-star--rating')
        rating = rating_span.get_text(strip=True) if rating_span else 'N/A'

        return {
            "name": name,
            # unidecode strips accents so categories compare as plain ASCII
            "category": unidecode.unidecode(match.group(1)),
            "movie_name": info["aria-label"],
            "rating": rating,
            "movie_url": f"https://www.imdb.com{href}",
            "director_url": director_url
        }
    except Exception as ex:
        # Safety net: keep the "error dict" contract for anything unexpected.
        return {"error": f"{ex}"}

def obtaining_info_per_url(urls: list[str]) -> list[dict]:
    """
    Scrape every director page in `urls` and collect the parsed credits.

    Each URL is downloaded, every raw item of the page is converted into a
    structured record, and records flagged with an "error" key are discarded.
    A URL that fails is reported on stdout and skipped; it never aborts the
    whole run.

    Args:
        urls: list of IMDb URLs for directors

    Returns:
        all_structured_data: list of dictionaries containing structured movie info for each director
    """
    all_structured_data = []

    for director_url in tqdm(urls):
        try:
            page_items, director_name = get_director_raw_info(director_url)
            # Parse every item first, then keep only the successful ones.
            parsed_items = [
                get_director_structured_info(director_name, page_item, director_url)
                for page_item in page_items
            ]
            valid_records = [record for record in parsed_items if "error" not in record]
            all_structured_data.extend(valid_records)
        except Exception as error:
            print(f"Error processing URL {director_url}: {error}")

    return all_structured_data

def scrapeDirectorAwards(director_url: str, timeout: float = 30.0) -> list[dict]:
    """
    Scrapes a director's IMDb awards page and returns structured information about each award.

    The awards page is the director URL with "/awards" appended. Every award
    block found on the page becomes one dictionary. Blocks that cannot be
    parsed are reported on stdout and skipped; if the page itself cannot be
    downloaded an empty list is returned.

    Args:
        director_url: The base URL to the director's IMDb profile page (e.g., "https://www.imdb.com/name/nm0000001").
        timeout: Seconds to wait for the server before giving up.

    Returns:
        results: A list of dictionaries with the keys "director_name", "director_url", "year_result",
                 "award_name", "category" and "title". Empty if the page could not be downloaded.
    """
    awards_url = director_url.rstrip('/') + "/awards"
    headers = {
        "User-Agent": "Mozilla/5.0"
    }

    try:
        # The timeout prevents one stuck request from blocking the whole loop
        # that calls this function once per director.
        response = requests.get(awards_url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except Exception as e:
        print(f"Failed to scrape {awards_url}: {e}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    award_divs = soup.select(".ipc-metadata-list-summary-item__tc")

    # The director's name is the part of the <title> before the first spaced
    # separator (" - "). Splitting on a bare '-' would cut hyphenated names
    # such as "Jean-Pierre ..." down to "Jean".
    director_name = ""
    director_name_tag = soup.select_one("title")
    if director_name_tag:
        director_name = re.split(r'\s+-\s+', director_name_tag.text.strip(), maxsplit=1)[0].strip()

    results = []
    for award_div in award_divs:
        try:
            main_award = award_div.select_one("a.ipc-metadata-list-summary-item__t")
            if not main_award:
                continue
            # First child of the link is the "<year> <result>" text; the
            # nested <span> holds the award name.
            year_result = main_award.contents[0].strip()
            award_name = main_award.select_one("span").text.strip()

            category_tag = award_div.select_one(".awardCategoryName")
            category = category_tag.text.strip() if category_tag else ""

            title_tag = award_div.select_one(".ipc-metadata-list-summary-item__stl a")
            title = title_tag.text.strip() if title_tag else ""

            results.append({
                "director_name": director_name,
                "director_url": director_url,
                "year_result": year_result,
                "award_name": award_name,
                "category": category,
                "title": title
            })
        except Exception as e:
            print(f"⚠️ Failed to parse one award block: {e}")
            continue

    return results

def standardize_imdb_url(url: str) -> str:
    """
    Return a canonical form of an IMDb URL.

    Two things are normalised: every '/es-es/' segment in the path is
    collapsed to '/', and the query string and fragment are dropped. Scheme,
    host and path parameters are kept as they are.

    Args:
        url: The original IMDb URL, potentially containing locale info, query parameters, or fragments.

    Returns:
        A cleaned IMDb URL with a standardized path and no query or fragment components.
    """
    parts = urlparse(url)
    # Splitting on the locale segment and re-joining with '/' removes it
    # wherever it occurs in the path.
    path_without_locale = "/".join(parts.path.split("/es-es/"))
    return urlunparse(
        (parts.scheme, parts.netloc, path_without_locale, parts.params, "", "")
    )

In [137]:
# Load the movie catalogue and expand its 'directors' column into name/URL strings.
movies_df = parsing_movies_info("outputs/movies_all_countries.csv")
# One entry per (movie, director) pair: a director credited on several movies
# appears several times in this list and is therefore scraped several times.
urls = parsing_directors_URL(movies_df)
# Slow step: one HTTP request per URL (see the progress bar below for the cost).
all_structured_data = obtaining_info_per_url(urls)
df_directors = pd.DataFrame(all_structured_data)

0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
100%|██████████| 1130/1130 [33:36<00:00,  1.78s/it]


In [205]:
# Normalise both URL columns (locale segment, query and fragment removed) so
# they can be used as join keys later on.
df_directors['movie_url'] = df_directors['movie_url'].apply(standardize_imdb_url)
df_directors['director_url'] = df_directors['director_url'].apply(standardize_imdb_url)
# Collapse the one-row-per-credit table into one row per
# (name, movie_name, movie_url, director_url, rating), gathering the credit
# categories of each group into a list.
df_cleaned = (
    df_directors
    .groupby(['name', 'movie_name', 'movie_url', 'director_url', 'rating'], as_index=False)
    .agg({'category': lambda x: list(x)})
    .rename(columns={'category': 'categories'})
)

In [206]:
# Normalise the movie URL the same way as the scraped credits, then drop the
# columns that are not carried into the merge and rename the key column so it
# matches 'movie_url' in df_cleaned.
movies_df['url'] = movies_df['url'].apply(standardize_imdb_url)
movies_df_dropped = movies_df.drop(['title', 'director_names', 'director_urls'], axis=1)
movies_df_dropped = movies_df_dropped.rename(columns={'url': 'movie_url'})

In [239]:
# Left join: every scraped credit is kept; catalogue columns are filled only
# for movies whose normalised URL also exists in the catalogue.
df_merged = df_cleaned.merge(movies_df_dropped, on='movie_url', how='left')
df_merged = df_merged.drop(['awards', 'directors', 'name'], axis=1)
df_merged = df_merged.rename(columns={'categories': 'participation_categories'})

In [189]:
# Collect the award records of every director in a plain list and build the
# dataframe once at the end. Concatenating a growing dataframe inside the loop
# copies all previous rows on every iteration (quadratic cost).
award_records = []
for url in tqdm(urls):
    # scrapeDirectorAwards returns an empty list when nothing was scraped,
    # so extending unconditionally is safe.
    award_records.extend(scrapeDirectorAwards(url))
all_awards = pd.DataFrame(award_records)

100%|██████████| 1130/1130 [33:40<00:00,  1.79s/it]


In [240]:
# Normalise the director URL so it matches the key used in df_merged.
all_awards['director_url'] = all_awards['director_url'].apply(standardize_imdb_url)
# One row per (director, title, year/result, award); the categories of each
# group are gathered into a list.
all_awards_cleaned = (
    all_awards
    .groupby(['director_name', 'director_url', 'title', 'year_result', 'award_name'], as_index=False)
    .agg({'category': lambda x: list(x)})
)
# Rename to the column names used by df_merged so the two frames can be joined.
all_awards_cleaned = all_awards_cleaned.rename(columns={'title': 'movie_name',
                                                        'category':'award_categories'})

In [250]:
# Left join of awards onto credits by director URL and movie name: awards
# without a matching credit keep NaN in the credit/catalogue columns.
# NOTE(review): the join relies on the award title and the credit title being
# spelled identically — confirm, since many rows stay unmatched in the output below.
final_df = all_awards_cleaned.merge(df_merged, how='left', on=['director_url', 'movie_name'])

In [253]:
# Column overview: the non-null counts show how many awards found a matching
# credit and how many of those found a catalogue entry.
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22616 entries, 0 to 22615
Data columns (total 17 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   director_name             22616 non-null  object 
 1   director_url              22616 non-null  object 
 2   movie_name                22616 non-null  object 
 3   year_result               22616 non-null  object 
 4   award_name                22616 non-null  object 
 5   award_categories          22616 non-null  object 
 6   movie_url                 17870 non-null  object 
 7   rating                    17870 non-null  object 
 8   participation_categories  17870 non-null  object 
 9   country                   3491 non-null   object 
 10  imdbRating                3491 non-null   object 
 11  imdbVotes                 3491 non-null   object 
 12  metascore                 2982 non-null   float64
 13  budget                    352 non-null    object 
 14  domest

In [261]:
# Persist the merged table. The ./outputs/Q3 directory must already exist:
# to_csv does not create missing parent directories.
final_df.to_csv('./outputs/Q3/directors_info.csv', index=False)

## Extracting top directors from Awards pages

In [6]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from selenium.webdriver.chrome.options import Options

## Oscars Awards

In [7]:
# Root of the site; relative hrefs scraped from the pages are appended to it.
BASE_URL = "https://www.imdb.com"
# Landing page of the Oscars section (Spanish locale), opened with Selenium
# to discover the per-year event pages.
OSCARS_HOME = f"{BASE_URL}/es-es/oscars/"
# Headers sent with the plain `requests` calls of this section.
HEADERS = {
    "User-Agent": "Mozilla/5.0"
}

In [8]:
def extract_director_urls_from_section(section):
    """
    Return the absolute URL of the first person link found inside `section`.

    Every <a> whose href contains "/name/" is considered. Each href is
    stripped of its query string and prefixed with BASE_URL; the first
    resulting URL is returned.

    Args:
        section: parsed HTML element to search (must support `.select`)

    Returns:
        The first cleaned person URL, as a string.

    Raises:
        IndexError: if the section contains no "/name/" link
    """
    name_anchors = section.select('a[href*="/name/"]')
    cleaned_urls = [
        BASE_URL + anchor.get("href").split("?")[0]
        for anchor in name_anchors
    ]
    return cleaned_urls[0]

def get_year_urls_by_decade_with_selenium(home_url=None, event_id="ev0000003", max_decades=5):
    """
    Open an IMDb awards landing page in headless Chrome and collect the
    per-year event links shown under each decade tab.

    Args:
        home_url: landing page to open; defaults to OSCARS_HOME
        event_id: IMDb event identifier the year links must contain
                  (default "ev0000003", the id used for the Oscars pages here)
        max_decades: number of decade tabs to visit, starting from the first one

    Returns:
        decade_to_years: dict mapping the tab label to a list of
                         (year_text, absolute_url) tuples. Tabs that could not
                         be clicked are reported on stdout and left out.
    """
    options = Options()
    options.add_argument("--headless")
    options.add_argument("window-size=1920x1080")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
    driver = webdriver.Chrome(options=options)

    decade_to_years = {}
    # try/finally guarantees the browser process is closed even when loading
    # or parsing the page raises.
    try:
        driver.get(home_url or OSCARS_HOME)
        time.sleep(2)

        decade_tabs = driver.find_elements(By.CSS_SELECTOR, '.ipc-tabs[role="tablist"] li[role="tab"]')
        decade_tabs = decade_tabs[:max_decades]

        for tab in decade_tabs:
            decade_label = tab.text.strip()
            print(f"📆 Switching to: {decade_label}")

            try:
                # Scroll into view and wait until clickable
                driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", tab)
                WebDriverWait(driver, 10).until(EC.element_to_be_clickable(tab))
                time.sleep(1)
                tab.click()
                time.sleep(2)  # Let the page update
            except Exception as e:
                print(f"❌ Could not click tab for {decade_label}: {e}")
                continue

            # Parse page source after switching
            soup = BeautifulSoup(driver.page_source, "html.parser")
            year_links = soup.select(f".ipc-chip-list__scroller a[href*='/event/{event_id}/']")
            # NOTE(review): the last link is dropped on purpose by the original
            # code; presumably it is not a year of this decade — confirm.
            years = [(a.text.strip(), "https://www.imdb.com" + a['href']) for a in year_links][:-1]

            decade_to_years[decade_label] = years
            time.sleep(1)
    finally:
        driver.quit()

    return decade_to_years

def extract_best_director(year, url, decade, timeout=30):
    """
    Download one Oscars ceremony page and return its Best Director nominees.

    Args:
        year: label of the ceremony year, copied into every record
        url: URL of the ceremony page
        decade: label of the decade tab the year came from, copied into every record
        timeout: seconds to wait for the server before giving up

    Returns:
        A list with one dictionary per nominee (keys "decade", "year",
        "result", "director", "movie", "director url"). The list is empty when
        the page cannot be downloaded or has no Best Director section, so the
        result can always be passed to list.extend (returning None here made
        the caller raise TypeError).
    """
    try:
        res = requests.get(url, headers=HEADERS, timeout=timeout)
        res.raise_for_status()
    except Exception as e:
        print(f"❌ Failed to get {url}: {e}")
        return []

    soup = BeautifulSoup(res.text, "html.parser")
    # The section is looked up under two test ids; the second is the fallback.
    section = (
        soup.find("section", {"data-testid": "BestAchievementinDirecting"})
        or soup.find("section", {"data-testid": "BestDirector"})
    )
    if not section:
        print(f"⚠️ No Best Director section found for {year}")
        return []

    nominees = []
    nominee_items = section.select("li.ipc-metadata-list__item")

    for item in nominee_items:
        try:
            # Items without a signpost label are recorded as plain nominees.
            result_tag = item.select_one(".ipc-signpost__text")
            result = result_tag.text.strip() if result_tag else "Nominee"

            director_tag = item.select_one('a.ipc-link[href*="/name/"]')
            director = director_tag.text.strip() if director_tag else ""

            # Raises IndexError when the item has no person link; the except
            # below then skips this nominee.
            director_url = extract_director_urls_from_section(item)

            movie_tag = item.select_one('a[href*="/title/"]')
            movie = movie_tag.text.strip() if movie_tag else ""

            nominees.append({
                "decade": decade,
                "year": year,
                "result": result,
                "director": director,
                "movie": movie,
                "director url": director_url
            })
        except Exception as e:
            print(f"⚠️ Error parsing nominee: {e}")
            continue
    return nominees

def scrape_all_best_director_winners():
    """
    Scrape the Best Director nominees of every year discovered on the awards
    landing page.

    Returns:
        A dataframe with one row per nominee, built from the dictionaries
        returned by extract_best_director. Years that yield no nominees
        contribute no rows.
    """
    decade_to_years = get_year_urls_by_decade_with_selenium()
    all_data = []

    for decade, year_list in decade_to_years.items():
        for year, url in year_list:
            print(f"🔍 Scraping {year} from {url}")
            nominees = extract_best_director(year, url, decade)
            # A failed year may come back as None (or an empty list);
            # list.extend(None) would raise TypeError and abort the whole run.
            if nominees:
                all_data.extend(nominees)
            time.sleep(1)  # be polite to the server between requests

    return pd.DataFrame(all_data)

In [9]:
# Scrape the Best Director nominees for every year found under the decade
# tabs of the Oscars landing page (progress is printed below).
oscars_info = scrape_all_best_director_winners()

📆 Switching to: Década de 2020
📆 Switching to: Década de 2010
📆 Switching to: Década de 2000
📆 Switching to: Década de 1990
📆 Switching to: Década de 1980
🔍 Scraping 2025 from https://www.imdb.com/es-es/event/ev0000003/2025/1/?ref_=fea_acd_ww_fea_eds_center-29_yr_1
🔍 Scraping 2024 from https://www.imdb.com/es-es/event/ev0000003/2024/1/?ref_=fea_acd_ww_fea_eds_center-29_yr_2
🔍 Scraping 2023 from https://www.imdb.com/es-es/event/ev0000003/2023/1/?ref_=fea_acd_ww_fea_eds_center-29_yr_3
🔍 Scraping 2022 from https://www.imdb.com/es-es/event/ev0000003/2022/1/?ref_=fea_acd_ww_fea_eds_center-29_yr_4
🔍 Scraping 2021 from https://www.imdb.com/es-es/event/ev0000003/2021/1/?ref_=fea_acd_ww_fea_eds_center-29_yr_5
🔍 Scraping 2020 from https://www.imdb.com/es-es/event/ev0000003/2020/1/?ref_=fea_acd_ww_fea_eds_center-29_yr_6
🔍 Scraping 2019 from https://www.imdb.com/es-es/event/ev0000003/2019/1/?ref_=fea_acd_ww_fea_eds_center-29_yr_1
🔍 Scraping 2018 from https://www.imdb.com/es-es/event/ev0000003/2018

In [10]:
# Cache the scraped nominees on disk; the parsing section further down reads this file back.
oscars_info.to_csv('outputs/oscars_awards_directors_info.csv', index=False)

In [11]:
# Scrape the credits of every distinct nominated director (unique() avoids
# requesting the same page twice).
oscars_directors_url = oscars_info["director url"].unique().tolist()
oscars_directors_complete_info = obtaining_info_per_url(oscars_directors_url)
df_oscars_directors_complete_info = pd.DataFrame(oscars_directors_complete_info)

100%|██████████| 147/147 [03:54<00:00,  1.60s/it]


In [12]:
# Cache the scraped credits on disk; the parsing section further down reads this file back.
df_oscars_directors_complete_info.to_csv('outputs/oscars_directors_complete_info.csv', index=False)

## Golden Globes Awards

In [13]:
# BASE_URL and HEADERS are re-assigned here with the same values as in the
# Oscars section; only GG_HOME is new.
BASE_URL = "https://www.imdb.com"
# Landing page of the Golden Globes section (Spanish locale).
GG_HOME = f"{BASE_URL}/es-es/golden-globes/"
HEADERS = {
    "User-Agent": "Mozilla/5.0"
}

In [14]:
def get_year_urls_by_decade_with_selenium(home_url=None, event_id="ev0000292", max_decades=5):
    """
    Open an IMDb awards landing page in headless Chrome and collect the
    per-year event links shown under each decade tab.

    This definition replaces the Oscars version of the same name defined
    earlier in the notebook; its defaults point at the Golden Globes pages.

    Args:
        home_url: landing page to open; defaults to GG_HOME
        event_id: IMDb event identifier the year links must contain
                  (default "ev0000292", the id used for the Golden Globes pages here)
        max_decades: number of decade tabs to visit, starting from the first one

    Returns:
        decade_to_years: dict mapping the tab label to a list of
                         (year_text, absolute_url) tuples. Tabs that could not
                         be clicked are reported on stdout and left out.
    """
    options = Options()
    options.add_argument("--headless")
    options.add_argument("window-size=1920x1080")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
    driver = webdriver.Chrome(options=options)

    decade_to_years = {}
    # try/finally guarantees the browser process is closed even when loading
    # or parsing the page raises.
    try:
        driver.get(home_url or GG_HOME)
        time.sleep(2)

        decade_tabs = driver.find_elements(By.CSS_SELECTOR, '.ipc-tabs[role="tablist"] li[role="tab"]')
        decade_tabs = decade_tabs[:max_decades]

        for tab in decade_tabs:
            decade_label = tab.text.strip()
            print(f"📆 Switching to: {decade_label}")

            try:
                # Scroll into view and wait until clickable
                driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", tab)
                WebDriverWait(driver, 10).until(EC.element_to_be_clickable(tab))
                time.sleep(1)
                tab.click()
                time.sleep(2)  # Let the page update
            except Exception as e:
                print(f"❌ Could not click tab for {decade_label}: {e}")
                continue

            # Parse page source after switching
            soup = BeautifulSoup(driver.page_source, "html.parser")
            year_links = soup.select(f".ipc-chip-list__scroller a[href*='/event/{event_id}/']")
            # NOTE(review): the last link is dropped on purpose by the original
            # code; presumably it is not a year of this decade — confirm.
            years = [(a.text.strip(), "https://www.imdb.com" + a['href']) for a in year_links][:-1]

            decade_to_years[decade_label] = years
            time.sleep(1)
    finally:
        driver.quit()

    return decade_to_years

def extract_best_director(year, url, decade, timeout=30):
    """
    Download one Golden Globes ceremony page and return its Best Director nominees.

    This definition replaces the Oscars version of the same name defined
    earlier in the notebook; it differs only in the section test ids it looks for.

    Args:
        year: label of the ceremony year, copied into every record
        url: URL of the ceremony page
        decade: label of the decade tab the year came from, copied into every record
        timeout: seconds to wait for the server before giving up

    Returns:
        A list with one dictionary per nominee (keys "decade", "year",
        "result", "director", "movie", "director url"). The list is empty when
        the page cannot be downloaded or has no Best Director section, so the
        result can always be passed to list.extend (returning None here made
        the caller raise TypeError).
    """
    try:
        res = requests.get(url, headers=HEADERS, timeout=timeout)
        res.raise_for_status()
    except Exception as e:
        print(f"❌ Failed to get {url}: {e}")
        return []

    soup = BeautifulSoup(res.text, "html.parser")
    # The section is looked up under two test ids; the second is the fallback.
    section = (
        soup.find("section", {"data-testid": "BestDirector,MotionPicture"})
        or soup.find("section", {"data-testid": "BestDirector-MotionPicture"})
    )
    if not section:
        print(f"⚠️ No Best Director section found for {year}")
        return []

    nominees = []
    nominee_items = section.select("li.ipc-metadata-list__item")

    for item in nominee_items:
        try:
            # Items without a signpost label are recorded as plain nominees.
            result_tag = item.select_one(".ipc-signpost__text")
            result = result_tag.text.strip() if result_tag else "Nominee"

            director_tag = item.select_one('a.ipc-link[href*="/name/"]')
            director = director_tag.text.strip() if director_tag else ""

            # Raises IndexError when the item has no person link; the except
            # below then skips this nominee.
            director_url = extract_director_urls_from_section(item)

            movie_tag = item.select_one('a[href*="/title/"]')
            movie = movie_tag.text.strip() if movie_tag else ""

            nominees.append({
                "decade": decade,
                "year": year,
                "result": result,
                "director": director,
                "movie": movie,
                "director url": director_url
            })
        except Exception as e:
            print(f"⚠️ Error parsing nominee: {e}")
            continue
    return nominees

def scrape_all_best_director_winners():
    """
    Scrape the Best Director nominees of every year discovered on the awards
    landing page.

    This is a redefinition of the function of the same name from the Oscars
    section; which awards are scraped depends on the helper functions
    currently defined in the kernel.

    Returns:
        A dataframe with one row per nominee, built from the dictionaries
        returned by extract_best_director. Years that yield no nominees
        contribute no rows.
    """
    decade_to_years = get_year_urls_by_decade_with_selenium()
    all_data = []

    for decade, year_list in decade_to_years.items():
        for year, url in year_list:
            print(f"🔍 Scraping {year} from {url}")
            nominees = extract_best_director(year, url, decade)
            # A failed year may come back as None (or an empty list);
            # list.extend(None) would raise TypeError and abort the whole run.
            if nominees:
                all_data.extend(nominees)
            time.sleep(1)  # be polite to the server between requests

    return pd.DataFrame(all_data)

In [15]:
# Scrape the Best Director nominees for every year found under the decade
# tabs of the Golden Globes landing page (progress is printed below).
golden_globes_info = scrape_all_best_director_winners()

📆 Switching to: Década de 2020
📆 Switching to: Década de 2010
📆 Switching to: Década de 2000
📆 Switching to: Década de 1990
📆 Switching to: Década de 1980
🔍 Scraping 2025 from https://www.imdb.com/es-es/event/ev0000292/2025/1/?ref_=fea_globes_ww_fea_eds_center-29_yr_1
🔍 Scraping 2024 from https://www.imdb.com/es-es/event/ev0000292/2024/1/?ref_=fea_globes_ww_fea_eds_center-29_yr_2
🔍 Scraping 2023 from https://www.imdb.com/es-es/event/ev0000292/2023/1/?ref_=fea_globes_ww_fea_eds_center-29_yr_3
🔍 Scraping 2022 from https://www.imdb.com/es-es/event/ev0000292/2022/1/?ref_=fea_globes_ww_fea_eds_center-29_yr_4
🔍 Scraping 2021 from https://www.imdb.com/es-es/event/ev0000292/2021/1/?ref_=fea_globes_ww_fea_eds_center-29_yr_5
🔍 Scraping 2020 from https://www.imdb.com/es-es/event/ev0000292/2020/1/?ref_=fea_globes_ww_fea_eds_center-29_yr_6
🔍 Scraping 2019 from https://www.imdb.com/es-es/event/ev0000292/2019/1/?ref_=fea_globes_ww_fea_eds_center-29_yr_1
🔍 Scraping 2018 from https://www.imdb.com/es-es

In [16]:
# Cache the scraped nominees on disk; the parsing section further down reads this file back.
golden_globes_info.to_csv('outputs/golden_globes_awards_directors_info.csv', index=False)

In [17]:
# Scrape the credits of every distinct nominated director (unique() avoids
# requesting the same page twice).
gg_directors_url = golden_globes_info["director url"].unique().tolist()
gg_directors_complete_info = obtaining_info_per_url(gg_directors_url)
df_gg_directors_complete_info = pd.DataFrame(gg_directors_complete_info)

100%|██████████| 133/133 [04:39<00:00,  2.10s/it]


In [18]:
# Cache the scraped credits on disk; the parsing section further down reads this file back.
df_gg_directors_complete_info.to_csv('outputs/golden_globes_directors_complete_info.csv', index=False)

## Parsing Directors Info

In [94]:
import pandas as pd

### Oscars Awards

In [99]:
# oscars1: scraped credits of the nominated directors (one row per credit).
# oscars2: Best Director nominees per year.
oscars1 = pd.read_csv('./outputs/oscars_directors_complete_info.csv')
oscars2 = pd.read_csv('./outputs/oscars_awards_directors_info.csv')

In [100]:
# Normalise the movie URL so it can later be matched against the movie catalogue.
oscars1['movie_url'] = oscars1['movie_url'].apply(standardize_imdb_url)

In [101]:
# Align the credit columns with the nominee table's naming so both frames
# share the 'director' and 'movie' join keys.
oscars1 = oscars1.rename({'name':'director',
                          'movie_name':'movie',
                          'movie_url':'movie url',
                          'director_url':'director url'}, axis=1)
# 'director url' is kept from oscars1 only; dropping it here avoids a
# duplicated column after the merge.
oscars2 = oscars2.drop(['director url'], axis=1)

In [102]:
# Attach credit details to each nomination by (director, movie) and keep only
# nominations that found a matching credit. The original row index of the
# left merge is preserved, so it can contain gaps.
# Note: this re-uses the name `oscars_info`, replacing the scraped nominee
# table assigned earlier in the notebook.
oscars_info = oscars2.merge(oscars1, on=['director', 'movie'], how='left', indicator=True)
oscars_info = oscars_info[oscars_info['_merge']=='both'].drop('_merge', axis=1)
oscars_info.to_csv('./outputs/oscars_directors_info.csv', index=False)

### Golden Globes Awards

In [103]:
# golden1: scraped credits of the nominated directors (one row per credit).
# golden2: Best Director nominees per year.
golden1 = pd.read_csv('./outputs/golden_globes_directors_complete_info.csv')
golden2 = pd.read_csv('./outputs/golden_globes_awards_directors_info.csv')

In [104]:
# Normalise the movie URL so it can later be matched against the movie catalogue.
golden1['movie_url'] = golden1['movie_url'].apply(standardize_imdb_url)

In [105]:
# Align the credit columns with the nominee table's naming so both frames
# share the 'director' and 'movie' join keys.
golden1 = golden1.rename({'name':'director',
                          'movie_name':'movie',
                          'movie_url':'movie url',
                          'director_url':'director url'}, axis=1)
# 'director url' is kept from golden1 only; dropping it here avoids a
# duplicated column after the merge.
golden2 = golden2.drop(['director url'], axis=1)

In [106]:
# Attach credit details to each nomination by (director, movie) and keep only
# nominations that found a matching credit. The original row index of the
# left merge is preserved, so it can contain gaps.
golden_info = golden2.merge(golden1, on=['director', 'movie'], how='left', indicator=True)
golden_info = golden_info[golden_info['_merge']=='both'].drop('_merge', axis=1)
golden_info.to_csv('./outputs/golden_globes_directors_info.csv', index=False)

### Budget Info

In [107]:
# Reload the movie catalogue and rename its URL column to 'movie url', the
# key name used by the awards tables.
movies_budget = pd.read_csv('./outputs/movies_all_countries.csv')
movies_budget = movies_budget.rename(columns={'url': 'movie url'})

In [110]:
# Normalise the key the same way as the awards tables so the URLs compare equal.
movies_budget['movie url'] = movies_budget['movie url'].apply(standardize_imdb_url)

In [130]:
# Add the catalogue columns to every Oscars row; `_merge` records
# whether the movie was found in the catalogue.
movies_merged_oscars = oscars_info.merge(movies_budget, how='left', on='movie url', indicator=True)

In [131]:
# Keep only rows whose movie exists in the catalogue, then drop the merge
# marker and the columns not needed for the analysis.
movies_merged_oscars = movies_merged_oscars[movies_merged_oscars['_merge']=='both'].drop(['_merge','title','movie url', 'directors', 'rating'], axis=1)

In [132]:
# Display the result: one row per (nomination, credit category) with the catalogue figures.
movies_merged_oscars

Unnamed: 0,decade,year,result,director,movie,category,director url,country,imdbRating,imdbVotes,metascore,budget,domesticGross,worldwideGross,awards,thespians
0,Década de 2020,2025,Ganador,Sean Baker,Anora,guion,https://www.imdb.com/es-es/name/nm0048918/,US,7.5,(176K),91.0,"$6,000,000","$20,474,298","$56,556,258","['Best Motion Picture of the Year', 'Best Achi...","[{'name': 'Mikey Madison', 'url': 'https://www..."
1,Década de 2020,2025,Ganador,Sean Baker,Anora,produccion,https://www.imdb.com/es-es/name/nm0048918/,US,7.5,(176K),91.0,"$6,000,000","$20,474,298","$56,556,258","['Best Motion Picture of the Year', 'Best Achi...","[{'name': 'Mikey Madison', 'url': 'https://www..."
2,Década de 2020,2025,Ganador,Sean Baker,Anora,direccion,https://www.imdb.com/es-es/name/nm0048918/,US,7.5,(176K),91.0,"$6,000,000","$20,474,298","$56,556,258","['Best Motion Picture of the Year', 'Best Achi...","[{'name': 'Mikey Madison', 'url': 'https://www..."
3,Década de 2020,2025,Nominee,Brady Corbet,The Brutalist,produccion,https://www.imdb.com/es-es/name/nm1227232/,US,7.4,(76K),90.0,,"$16,249,415","$49,315,030",['Best Achievement in Music Written for Motion...,"[{'name': 'Adrien Brody', 'url': 'https://www...."
4,Década de 2020,2025,Nominee,Brady Corbet,The Brutalist,direccion,https://www.imdb.com/es-es/name/nm1227232/,US,7.4,(76K),90.0,,"$16,249,415","$49,315,030",['Best Achievement in Music Written for Motion...,"[{'name': 'Adrien Brody', 'url': 'https://www...."
8,Década de 2020,2025,Nominee,Coralie Fargeat,La sustancia,guion,https://www.imdb.com/es-es/name/nm0267287/,GB,7.3,(292K),78.0,,"$17,584,795","$77,311,451",['Best Make Up & Hair'],"[{'name': 'Demi Moore', 'url': 'https://www.im..."
9,Década de 2020,2025,Nominee,Coralie Fargeat,La sustancia,direccion,https://www.imdb.com/es-es/name/nm0267287/,GB,7.3,(292K),78.0,,"$17,584,795","$77,311,451",['Best Make Up & Hair'],"[{'name': 'Demi Moore', 'url': 'https://www.im..."
10,Década de 2020,2025,Nominee,James Mangold,A Complete Unknown,produccion,https://www.imdb.com/es-es/name/nm0003506/,US,7.4,(70K),70.0,,"$74,986,510","$137,917,419",[],"[{'name': 'Timothée Chalamet', 'url': 'https:/..."
11,Década de 2020,2025,Nominee,James Mangold,A Complete Unknown,direccion,https://www.imdb.com/es-es/name/nm0003506/,US,7.4,(70K),70.0,,"$74,986,510","$137,917,419",[],"[{'name': 'Timothée Chalamet', 'url': 'https:/..."
12,Década de 2020,2025,Nominee,James Mangold,A Complete Unknown,guion,https://www.imdb.com/es-es/name/nm0003506/,US,7.4,(70K),70.0,,"$74,986,510","$137,917,419",[],"[{'name': 'Timothée Chalamet', 'url': 'https:/..."


In [None]:
# Reverse direction of the merge above: every catalogue movie is kept and
# `_merge` flags which ones have Oscars data. (This cell has no execution count.)
movies_merged = movies_budget.merge(oscars_info, how='left', on='movie url', indicator=True)