In [3]:
import pandas as pd
import ast
# Extract director names

movies_df = pd.read_excel("movies.xlsx")

# Convert stringified lists to actual Python lists
movies_df['directors'] = movies_df['directors'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) and x.startswith('[') else []
)

# Now extract names and URLs as before
movies_df['director_names'] = movies_df['directors'].apply(
    lambda x: ', '.join([d.get('name', '') for d in x]) if isinstance(x, list) else ''
)

movies_df['director_urls'] = movies_df['directors'].apply(
    lambda x: ', '.join([d.get('url', '') for d in x]) if isinstance(x, list) else ''
)


In [4]:
movies_df

Unnamed: 0,title,url,imdbRating,imdbVotes,metascore,directors,thespians,director_names,director_urls
0,1. The Penguin Lessons,https://www.imdb.com/title/tt26677014/,73,(655),58.0,[],[],,
1,2. La fiebre de los ricos,https://www.imdb.com/title/tt17677434/,55,"(5,4 mil)",,[],[],,
2,3. El juego del asesino,https://www.imdb.com/title/tt0327785/,58,(17 mil),36.0,"[{'name': 'J.J. Perry', 'url': 'https://www.im...","[{'name': 'Dave Bautista', 'url': 'https://www...",J.J. Perry,https://www.imdb.com/name/nm0675102/
3,4. La habitación de al lado,https://www.imdb.com/title/tt29439114/,68,(19 mil),70.0,[],[],,
4,5. Hechizados,https://www.imdb.com/title/tt7215232/,56,"(7,1 mil)",54.0,"[{'name': 'Vicky Jenson', 'url': 'https://www....","[{'name': 'Rachel Zegler', 'url': 'https://www...",Vicky Jenson,https://www.imdb.com/name/nm0421776/
...,...,...,...,...,...,...,...,...,...
468,469. Juan Espino: El mejor luchador de todos l...,https://www.imdb.com/title/tt32991422/,,,,[],[],,
469,470. Athletes to Watch - Paris 2024,https://www.imdb.com/title/tt32992679/,,,,[],[],,
470,471. Película Nº1,https://www.imdb.com/title/tt31124592/,,,,[],[],,
471,472. El eco de otras voces,https://www.imdb.com/title/tt36386095/,,,,"[{'name': 'Adriana Domínguez', 'url': 'https:/...","[{'name': 'Adolfo Domínguez', 'url': 'https://...",Adriana Domínguez,https://www.imdb.com/name/nm1063497/


In [12]:
# Get all non-empty director_urls, split by comma, and flatten
director_urls = movies_df[movies_df['director_urls'] != '']['director_urls'] \
    .str.split(', ') \
    .explode() \
    .tolist()

director_urls

['https://www.imdb.com/name/nm0675102/',
 'https://www.imdb.com/name/nm0421776/',
 'https://www.imdb.com/name/nm0181579/',
 'https://www.imdb.com/name/nm2008067/',
 'https://www.imdb.com/name/nm5231416/',
 'https://www.imdb.com/name/nm3433064/',
 'https://www.imdb.com/name/nm3911679/',
 'https://www.imdb.com/name/nm2262403/',
 'https://www.imdb.com/name/nm2253409/',
 'https://www.imdb.com/name/nm0130714/',
 'https://www.imdb.com/name/nm2201717/',
 'https://www.imdb.com/name/nm1534594/',
 'https://www.imdb.com/name/nm1531686/',
 'https://www.imdb.com/name/nm0874096/',
 'https://www.imdb.com/name/nm3473876/',
 'https://www.imdb.com/name/nm0343903/',
 'https://www.imdb.com/name/nm1443023/',
 'https://www.imdb.com/name/nm4933787/',
 'https://www.imdb.com/name/nm0093081/',
 'https://www.imdb.com/name/nm0016176/',
 'https://www.imdb.com/name/nm0580793/',
 'https://www.imdb.com/name/nm2025290/',
 'https://www.imdb.com/name/nm4053711/',
 'https://www.imdb.com/name/nm7289977/',
 'https://www.im

In [13]:
len(director_urls)

288

In [14]:
import requests
from bs4 import BeautifulSoup

# url = 'https://www.imdb.com/es/name/nm0634240/'

In [17]:
import requests
import re
import unidecode
from bs4 import BeautifulSoup
from bs4.element import Tag
import json

def get_director_raw_info(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }

    response = requests.get(url, headers=headers)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    dir_info_raw = soup.find_all('li', {'data-testid': True})
    
    # Extraer nombre del director desde el título
    title_tag = soup.find('title')
    name = title_tag.get_text(strip=True).split('-')[0].strip() if title_tag else 'Nombre no encontrado'

    return dir_info_raw, name

def get_director_structured_info(name, item: Tag):
    try:
        category = item.get("data-testid", default="no_category")

        if not category.startswith("cred"):
            return {"error": "unexpected category"}

        match = re.search(r'_(.*?)_', category)
        category_cleaned = unidecode.unidecode(match.group(1))

        info = item.find('a', {'aria-label': True})
        movie_name = info["aria-label"]
        url = f"https://www.imdb.com{info.get('href')}"

        rating_span = item.find('span', class_='ipc-rating-star--rating')
        rating = rating_span.get_text(strip=True) if rating_span else 'N/A'

        response = {
            "name": name,
            "category": category_cleaned,
            "movie_name": movie_name,
            "rating": rating,
            "url": url,
        }
        return response
    except Exception as ex:
        return {"error": f"{ex}"}

In [19]:
urls = director_urls

In [20]:
all_structured_data = []

for url in urls:
    try:
        raw_info, director_name = get_director_raw_info(url)
        
        structured_data = list(
            filter(
                lambda item: "error" not in item, 
                map(lambda item: get_director_structured_info(director_name, item), raw_info)
            )
        )
        
        all_structured_data.extend(structured_data)
    except Exception as e:
        print(f"Error procesando URL {url}: {e}")

In [21]:
import pandas as pd

df = pd.DataFrame(all_structured_data)

In [22]:
df

Unnamed: 0,name,category,movie_name,rating,url
0,J.J. Perry,stunts,Avatar: El sentido del agua,7.5,https://www.imdb.com/title/tt1630029/?ref_=nm_...
1,J.J. Perry,stunts,Samaritan,5.7,https://www.imdb.com/title/tt5500218/?ref_=nm_...
2,J.J. Perry,stunts,Fast & Furious 9,5.2,https://www.imdb.com/title/tt5433138/?ref_=nm_...
3,J.J. Perry,stunts,Sombra y hueso,7.5,https://www.imdb.com/title/tt2403776/?ref_=nm_...
4,J.J. Perry,stunts,Falcon y el Soldado de Invierno,7.1,https://www.imdb.com/title/tt9208876/?ref_=nm_...
...,...,...,...,...,...
5250,Lucas Figueroa,producer,Viral,4.4,https://www.imdb.com/title/tt2594078/?ref_=nm_...
5251,Lucas Figueroa,producer,Clip,,https://www.imdb.com/title/tt36413583/?ref_=nm...
5252,Lucas Figueroa,producer,Boletos por favor,6.2,https://www.imdb.com/title/tt1298740/?ref_=nm_...
5253,Lucas Figueroa,producer,Porque hay cosas que nunca se olvidan,6.2,https://www.imdb.com/title/tt1292572/?ref_=nm_...


In [23]:
df.to_csv("directors_info.csv", index=False)