In [1]:
import requests
from bs4 import BeautifulSoup

# url = 'https://www.imdb.com/es/name/nm0634240/'

In [2]:
import requests
import re
import unidecode
from bs4 import BeautifulSoup
from bs4.element import Tag
import json

def get_director_raw_info(url, timeout=10):
    """Download a person page and return its credit items plus the person's name.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds handed to ``requests.get`` as its ``timeout``. Without
            one, a stalled connection would block the notebook indefinitely.

    Returns:
        A ``(dir_info_raw, name)`` tuple. ``dir_info_raw`` holds every ``<li>``
        element that carries a ``data-testid`` attribute. ``name`` is the text
        of the ``<title>`` tag up to the first ``" - "`` separator, or
        ``'Nombre no encontrado'`` when the page has no ``<title>``.

    Raises:
        requests.HTTPError: The server answered with an error status.
        requests.RequestException: The request failed or timed out.
    """
    # Browser-like User-Agent sent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }

    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    dir_info_raw = soup.find_all('li', {'data-testid': True})

    # Extract the person's name from the page title.
    title_tag = soup.find('title')
    if title_tag is None:
        return dir_info_raw, 'Nombre no encontrado'

    # Split on the spaced separator only: a bare '-' would cut hyphenated
    # names ("Jean-Luc Godard - IMDb") down to their first part.
    name = title_tag.get_text(strip=True).split(' - ', 1)[0].strip()

    return dir_info_raw, name

def get_director_structured_info(name, item: "Tag"):
    """Turn one credit list item into a flat record.

    Args:
        name: Person name to store in the record's ``"name"`` field.
        item: Element whose ``data-testid`` attribute identifies the credit.

    Returns:
        On success, a dict with the keys ``"name"``, ``"category"``,
        ``"movie_name"``, ``"rating"`` and ``"url"``:

        * ``"category"`` is the text between the first two underscores of
          ``data-testid``, passed through ``unidecode``.
        * ``"rating"`` is the text of the rating span, or ``'N/A'`` when the
          item has none.
        * ``"url"`` is ``None`` when the link has no ``href``.

        Otherwise ``{"error": <message>}``. The function never raises, so
        callers can discard unusable items by testing for the ``"error"`` key.
    """
    try:
        category = item.get("data-testid", default="no_category")

        if not category.startswith("cred"):
            return {"error": "unexpected category"}

        match = re.search(r'_(.*?)_', category)
        if match is None:
            return {"error": f"no category segment in {category!r}"}

        info = item.find('a', {'aria-label': True})
        if info is None:
            return {"error": "no link with aria-label"}
        movie_name = info["aria-label"]

        # A missing href used to be formatted into the URL as the text "None".
        href = info.get('href')
        url = f"https://www.imdb.com{href}" if href else None

        rating_span = item.find('span', class_='ipc-rating-star--rating')
        rating = rating_span.get_text(strip=True) if rating_span is not None else 'N/A'

        return {
            "name": name,
            "category": unidecode.unidecode(match.group(1)),
            "movie_name": movie_name,
            "rating": rating,
            "url": url,
        }
    except Exception as ex:
        # Deliberate best-effort net: one malformed item must not abort the
        # whole page, so anything unforeseen is reported as an error record.
        return {"error": f"{ex}"}

In [3]:
# Person pages to scrape. Every address has the form
# https://www.imdb.com/name/<person id>/, so only the ids are listed.
urls = [
    f"https://www.imdb.com/name/{person_id}/"
    for person_id in (
        "nm0634240",
        "nm0000217",
        "nm0000229",
        "nm0000233",
        "nm0000033",
        "nm0000040",
        "nm0000338",
        "nm0000631",
        "nm0000116",
        "nm0001392",
        "nm0000318",
        "nm0868219",
        "nm0327944",
        "nm0000264",
        "nm0001068",
        "nm0000941",
        "nm0000759",
        "nm0027572",
        "nm0000399",
        "nm0000184",
    )
]

In [4]:
# Records gathered from every page, in the order the pages are visited.
all_structured_data = []

for url in urls:
    try:
        raw_info, director_name = get_director_raw_info(url)

        # Parse each list item lazily, then keep only the records that
        # parsed cleanly (failed ones carry an "error" key).
        parsed_items = (
            get_director_structured_info(director_name, raw_item)
            for raw_item in raw_info
        )
        structured_data = [record for record in parsed_items if "error" not in record]

        all_structured_data += structured_data
    except Exception as e:
        # A failing page is reported and skipped; the remaining pages still run.
        print(f"Error procesando URL {url}: {e}")

In [5]:
import pandas as pd

# One table row per collected record; the dict keys become the columns.
df = pd.DataFrame(data=all_structured_data)

In [6]:
# Show the table as this cell's output (the rendered view elides the middle rows).
df

Unnamed: 0,name,category,movie_name,rating,url
0,Christopher Nolan,writer,Oppenheimer,8.3,https://www.imdb.com/title/tt15398776/?ref_=nm...
1,Christopher Nolan,writer,Tenet,7.3,https://www.imdb.com/title/tt6723592/?ref_=nm_...
2,Christopher Nolan,writer,Dunkerque,7.8,https://www.imdb.com/title/tt5013056/?ref_=nm_...
3,Christopher Nolan,writer,Interstellar,8.7,https://www.imdb.com/title/tt0816692/?ref_=nm_...
4,Christopher Nolan,writer,El hombre de acero,7.1,https://www.imdb.com/title/tt0770828/?ref_=nm_...
...,...,...,...,...,...
851,George Lucas,director,Filmmaker,6.4,https://www.imdb.com/title/tt0062970/?ref_=nm_...
852,George Lucas,director,The Emperor,5.7,https://www.imdb.com/title/tt0061621/?ref_=nm_...
853,George Lucas,director,Anyone Lived in a Pretty How Town,5.1,https://www.imdb.com/title/tt0061362/?ref_=nm_...
854,George Lucas,director,6-18-67,5.1,https://www.imdb.com/title/tt0061318/?ref_=nm_...


In [7]:
# Write the table to a CSV file, resolved against the current working directory;
# index=False keeps the row index out of the file.
df.to_csv(path_or_buf="directors_info.csv", index=False)