In [None]:
import pandas as pd
import ast
import requests
import re
import unidecode
from bs4 import BeautifulSoup
from bs4.element import Tag
import requests
from bs4 import BeautifulSoup
import time

In [None]:
def _parse_directors_cell(cell) -> list:
    """Turn one raw ``directors`` cell into a list of director dictionaries.

    Cells that are not stringified lists (NaN, numbers, plain text) and strings
    that fail to parse yield an empty list, so a single bad row cannot abort
    loading the whole file.
    """
    if not isinstance(cell, str):
        return []
    text = cell.strip()
    if not text.startswith('['):
        return []
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError):
        # Truncated or otherwise malformed literal: treat as "no directors".
        return []
    if not isinstance(parsed, list):
        return []
    # Only dictionaries can be queried for 'name' / 'url' below.
    return [entry for entry in parsed if isinstance(entry, dict)]


def _join_director_field(directors: list, key: str) -> str:
    """Join the values stored under *key* for every director with ', '."""
    values = (director.get(key, '') for director in directors)
    # A key that is present but None is rendered as '' so the join cannot fail
    # and the positions in the names/urls strings stay aligned.
    return ', '.join('' if value is None else str(value) for value in values)


def parsing_movies_info(name_archive: str) -> pd.DataFrame:
    """
    Read the movie spreadsheet and expand its ``directors`` column.

    The ``directors`` column is expected to hold stringified lists of
    dictionaries. It is replaced by real Python lists, and two columns are
    added: ``director_names`` and ``director_urls``, each a ', '-joined string
    of the ``name`` / ``url`` values of the row's directors ('' when the row
    has no directors).

    Args:
        name_archive: name of the xlsx archive with the movie info

    Returns:
        movies_df: dataframe with the movie info and the added columns

    Raises:
        KeyError: if the spreadsheet has no ``directors`` column
    """
    movies_df = pd.read_excel(name_archive)

    # Convert stringified lists to actual Python lists (bad cells become []).
    movies_df['directors'] = movies_df['directors'].apply(_parse_directors_cell)

    # Now extract names and URLs.
    movies_df['director_names'] = movies_df['directors'].apply(
        lambda directors: _join_director_field(directors, 'name')
    )
    movies_df['director_urls'] = movies_df['directors'].apply(
        lambda directors: _join_director_field(directors, 'url')
    )
    return movies_df

def parsing_directors_URL(movies_df: pd.DataFrame) -> list:
    """
    Collect every director URL stored in the ``director_urls`` column.

    Each cell holds zero or more URLs joined with ', '. The cells are split,
    flattened in row order, and blank or missing entries are dropped so the
    caller never receives '' or NaN as a URL. Duplicates are kept.

    Args:
        movies_df: dataframe with the movie info

    Returns:
        director_urls: list with the directors urls
    """
    director_urls = []
    # dropna/astype(str) make the loop safe for NaN cells and for an empty
    # column whose dtype is not object.
    for cell in movies_df['director_urls'].dropna().astype(str):
        for piece in cell.split(', '):
            cleaned = piece.strip()
            if cleaned:
                director_urls.append(cleaned)
    return director_urls

def get_director_raw_info(url: str, timeout: float = 10.0) -> tuple[list[Tag], str]:
    """
    Download a director's IMDb page and extract the raw credit items and the
    director's name.

    Args:
        url: IMDb URL of the director's page
        timeout: seconds to wait for the server before giving up, so a stalled
            connection cannot block the whole scraping run

    Returns:
        dir_info_raw: every ``<li>`` element of the page that carries a
            ``data-testid`` attribute
        name: first ' - '-separated segment of the page ``<title>``, or
            'Nombre no encontrado' when the page has no ``<title>``

    Raises:
        requests.exceptions.RequestException: on connection problems, timeouts
            or a non-success HTTP status (via ``raise_for_status``)
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }

    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    dir_info_raw = soup.find_all('li', {'data-testid': True})

    # Extracting director name from the title. Split on the spaced separator
    # ' - ' rather than a bare '-', otherwise hyphenated names such as
    # "Jean-Pierre Jeunet - IMDb" would be cut down to "Jean".
    title_tag = soup.find('title')
    if title_tag is None:
        name = 'Nombre no encontrado'
    else:
        name = title_tag.get_text(strip=True).split(' - ')[0].strip()

    return dir_info_raw, name

def get_director_structured_info(name: str, item: Tag) -> dict:
    """
    Convert one raw credit element of a director's page into a flat record.

    Args:
        name: name of the director
        item: raw HTML item from the IMDb page

    Returns:
        response: dictionary with the keys ``name``, ``category`` (the text
                  between the first two underscores of the item's
                  ``data-testid``, transliterated to ASCII), ``movie_name``
                  (the ``aria-label`` of the item's link), ``rating`` ('N/A'
                  when the item shows none) and ``url`` ('N/A' when the link
                  has no ``href``);
                  if the item is not a usable credit, or anything unexpected
                  happens, a dictionary with a single ``error`` key is returned
    """
    try:
        category = item.get("data-testid", default="no_category")

        # Only elements whose test id starts with "cred" are credits.
        if not category.startswith("cred"):
            return {"error": "unexpected category"}

        match = re.search(r'_(.*?)_', category)
        if match is None:
            return {"error": f"category pattern not found in '{category}'"}
        category_cleaned = unidecode.unidecode(match.group(1))

        info = item.find('a', {'aria-label': True})
        if info is None:
            return {"error": "credit link not found"}
        movie_name = info["aria-label"]

        # Without this guard a missing href would be formatted into the
        # bogus URL "https://www.imdb.comNone".
        href = info.get('href')
        url = f"https://www.imdb.com{href}" if href else 'N/A'

        rating_span = item.find('span', class_='ipc-rating-star--rating')
        rating = rating_span.get_text(strip=True) if rating_span else 'N/A'

        response = {
            "name": name,
            "category": category_cleaned,
            "movie_name": movie_name,
            "rating": rating,
            "url": url,
        }
        return response
    except Exception as ex:
        # Last-resort boundary: callers rely on receiving an error dictionary
        # instead of an exception for any item that cannot be parsed.
        return {"error": f"{ex}"}

def obtaining_info_per_url(urls: list[str]) -> list[dict]:
    """
    Scrape every director page in *urls* and gather the structured credits.

    For each URL the raw credit items and the director name are fetched, every
    item is converted into a structured record, and records that contain an
    ``error`` key are discarded. A failure while handling one URL is printed
    and does not stop the remaining URLs from being processed.

    Args:
        urls: list of IMDb URLs for directors

    Returns:
        all_structured_data: list of dictionaries containing structured movie info for each director
    """
    all_structured_data = []

    for url in urls:
        try:
            credit_items, director = get_director_raw_info(url)

            # Collect this director's records separately so that nothing is
            # added to the overall result if processing the page fails midway.
            page_records = []
            for credit in credit_items:
                record = get_director_structured_info(director, credit)
                if "error" in record:
                    continue
                page_records.append(record)

            all_structured_data.extend(page_records)
        except Exception as e:
            print(f"Error processing URL {url}: {e}")

    return all_structured_data

def scrape_imdb_awards(imdb_id: str, timeout: float = 10.0) -> pd.DataFrame:
    """
    Scrape the awards page of an IMDb person into a dataframe.

    Args:
        imdb_id: IMDb person identifier used to build the awards URL
            (``https://www.imdb.com/name/<imdb_id>/awards``)
        timeout: seconds to wait for the server before giving up

    Returns:
        dataframe with the columns ``award``, ``outcome``,
        ``category_and_description``, ``title`` and ``year``; one row per
        matching table row, empty when nothing matches

    Raises:
        requests.exceptions.RequestException: on connection problems, timeouts
            or a non-success HTTP status (via ``raise_for_status``)
    """
    url = f"https://www.imdb.com/name/{imdb_id}/awards"
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an error page instead of silently returning no awards.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # NOTE(review): the selectors below assume the page is built from
    # <table class="awards"> elements whose <tr> rows carry the classes
    # "award_category" / "award_description" - confirm against the live page;
    # when nothing matches, the result is simply empty.
    columns = ["award", "outcome", "category_and_description", "title", "year"]
    data = []
    current_award = None

    for table in soup.find_all("table", class_="awards"):
        for row in table.find_all("tr"):
            row_classes = row.get("class", [])
            if "award_category" in row_classes:
                # Remember the award name for the description rows that follow.
                current_award = row.get_text(strip=True)
            elif "award_description" in row_classes:
                outcome_tag = row.find("b")
                desc_cell = row.find("td", class_="award_description")
                # The description cell may be absent; fall back to empty text
                # rather than failing on a None lookup.
                title_tag = desc_cell.find("a") if desc_cell is not None else None
                year_tag = row.find("span", class_="award_year")

                data.append({
                    "award": current_award,
                    "outcome": outcome_tag.get_text(strip=True) if outcome_tag else "",
                    "category_and_description": (
                        desc_cell.get_text(strip=True) if desc_cell is not None else ""
                    ),
                    "title": title_tag.get_text(strip=True) if title_tag else "",
                    "year": year_tag.get_text(strip=True) if year_tag else "",
                })

    # Explicit columns keep the schema stable even when no row was found.
    return pd.DataFrame(data, columns=columns)

In [None]:
import os  # used only in this cell, to create the output directory

# Load the movie sheet and collect every director page URL it references.
movies_df = parsing_movies_info("movies.xlsx")
urls = parsing_directors_URL(movies_df)

# Scrape each distinct director page once. The records produced by
# obtaining_info_per_url only hold name/category/movie_name/rating/url (the
# link found inside each credit item), so the director page URL is attached
# here; it is the only place the director's IMDb id can be read from.
unique_urls = [u for u in dict.fromkeys(urls) if isinstance(u, str) and u]
all_structured_data = []
for director_url in unique_urls:
    for record in obtaining_info_per_url([director_url]):
        all_structured_data.append({**record, "director_url": director_url})

# Explicit columns keep the frame usable even when nothing was scraped.
df_directors = pd.DataFrame(
    all_structured_data,
    columns=["name", "category", "movie_name", "rating", "url", "director_url"],
)
# IMDb person ids look like "nm" followed by digits; rows without one get NaN.
df_directors["imdb_id"] = df_directors["director_url"].str.extract(
    r"(nm\d+)", expand=False
)

# df_directors has one row per credit, so reduce it to one row per director
# before requesting the awards pages.
directors_to_scrape = df_directors.dropna(subset=["imdb_id"]).drop_duplicates(
    subset="imdb_id"
)

all_awards = []
for name, imdb_id in zip(directors_to_scrape["name"], directors_to_scrape["imdb_id"]):
    print(f"🔍 Scraping {name} ({imdb_id})...")
    try:
        df = scrape_imdb_awards(imdb_id)
        df["director_name"] = name
        all_awards.append(df)
    except Exception as e:
        print(f"⚠️ Error with {name}: {e}")
    # Pause after every request, failed ones included, to stay polite.
    time.sleep(2)

# Combine all individual dataframes (pd.concat raises on an empty list).
combined_awards_df = (
    pd.concat(all_awards, ignore_index=True) if all_awards else pd.DataFrame()
)

# Make sure the target directory exists before writing the CSV.
os.makedirs("./outputs", exist_ok=True)
df_directors.to_csv("./outputs/directors_info.csv", index=False)