In [16]:
import requests
from bs4 import BeautifulSoup

# IMDb name page to inspect (served under the /es/ path).
url = 'https://www.imdb.com/es/name/nm0634240/'

# Browser-like User-Agent sent with the request.
# NOTE(review): presumably needed so the site does not reject the default
# python-requests agent - confirm.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}

# Always pass a timeout: without one, requests can wait forever on a stalled
# connection and block the kernel.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()  # raise on 4xx/5xx instead of parsing an error page

soup = BeautifulSoup(response.text, 'html.parser')

# Show only the beginning of the document; printing the entire page floods the
# cell output and bloats the saved notebook.
PREVIEW_CHARS = 2000
print(soup.prettify()[:PREVIEW_CHARS])

<!DOCTYPE html>
<html lang="es-MX" xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://opengraphprotocol.org/schema/">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width" name="viewport"/>
  <script>
   if(typeof uet === 'function'){ uet('bb', 'LoadTitle', {wb: 1}); }
  </script>
  <script>
   window.addEventListener('load', (event) => {
        if (typeof window.csa !== 'undefined' && typeof window.csa === 'function') {
            var csaLatencyPlugin = window.csa('Content', {
                element: {
                    slotId: 'LoadTitle',
                    type: 'service-call'
                }
            });
            csaLatencyPlugin('mark', 'clickToBodyBegin', 1741730456386);
        }
    })
  </script>
  <title>
   Christopher Nolan - IMDb
  </title>
  <meta content="Christopher Nolan. Guion: Tenet. Christopher Nolan nació el 30 de julio de 1970 en Londres, Inglaterra. Es un escritor y productor, conocido por Tenet (2020), El origen (2010) 

In [None]:
import re
from urllib.parse import urljoin

import requests
import unidecode
from bs4 import BeautifulSoup
from bs4.element import Tag

def get_director_raw_info(url, timeout=30):
    """Download a page and collect its ``<li>`` elements that have a ``data-testid``.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The result of ``soup.find_all('li', {'data-testid': True})``: every
        ``<li>`` tag in the parsed document that carries a ``data-testid``
        attribute, whatever its value.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx
            status (via ``raise_for_status``).
        requests.exceptions.RequestException: For connection problems,
            including a timeout.
    """
    # Browser-like User-Agent sent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }

    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    # Passing True as the attribute value matches any tag where it is present.
    dir_info_raw = soup.find_all('li', {'data-testid': True})
    return dir_info_raw

def get_director_structured_info(item: Tag):
    """Convert one ``<li data-testid=...>`` element into a flat dict.

    The function never raises. Any problem is reported as a dict with a single
    ``"error"`` key so that callers can filter failed items out.

    Args:
        item: A tag expected to carry a ``data-testid`` attribute starting
            with ``"cred"`` and to contain an ``<a>`` with an ``aria-label``.

    Returns:
        On success, a dict with:
            ``category``: text between the first two underscores of
                ``data-testid``, transliterated with ``unidecode``.
            ``movie_name``: the anchor's ``aria-label`` value.
            ``rating``: text of the ``ipc-rating-star--rating`` span, or
                ``'N/A'`` when that span is absent.
            ``url``: the anchor's ``href`` resolved against
                ``https://www.imdb.com``, or ``'N/A'`` when the anchor has no
                ``href``.
        On failure, ``{"error": <message>}``.
    """
    try:
        category = item.get("data-testid", default="no_category")

        if not category.startswith("cred"):
            return {"error": "unexpected category"}

        # The category name sits between the first two underscores.
        match = re.search(r'_(.*?)_', category)
        if match is None:
            return {"error": f"unexpected category format: {category}"}
        category_cleaned = unidecode.unidecode(match.group(1))

        info = item.find('a', {'aria-label': True})
        if info is None:
            return {"error": "no link with aria-label found"}
        movie_name = info["aria-label"]

        # urljoin handles relative and absolute hrefs alike; plain string
        # concatenation produced "https://www.imdb.comNone" when href was
        # missing and a doubled host when href was already absolute.
        href = info.get('href')
        movie_url = urljoin("https://www.imdb.com", href) if href else 'N/A'

        rating_span = item.find('span', class_='ipc-rating-star--rating')
        rating = rating_span.get_text(strip=True) if rating_span else 'N/A'

        return {
            "category": category_cleaned,
            "movie_name": movie_name,
            "rating": rating,
            "url": movie_url,
        }
    except Exception as ex:
        # Deliberate best-effort: one malformed element must not abort the
        # whole scrape, so the failure is reported in-band.
        return {"error": f"{ex}"}

In [43]:
# Page whose <li data-testid=...> elements are parsed below.
url = 'https://www.imdb.com/es/name/nm0634240/'
raw_info = get_director_raw_info(url)

# Parse every element lazily and keep only the dicts without an "error" key.
structured_data = [
    entry
    for entry in (get_director_structured_info(li_tag) for li_tag in raw_info)
    if "error" not in entry
]
structured_data

[{'category': 'guion',
  'movie_name': 'Oppenheimer',
  'rating': '8.3',
  'url': 'https://www.imdb.com/es/title/tt15398776/?ref_=nm_flmg_job_1_cdt_img'},
 {'category': 'guion',
  'movie_name': 'Tenet',
  'rating': '7.3',
  'url': 'https://www.imdb.com/es/title/tt6723592/?ref_=nm_flmg_job_1_cdt_img_1'},
 {'category': 'guion',
  'movie_name': 'Dunkerque',
  'rating': '7.8',
  'url': 'https://www.imdb.com/es/title/tt5013056/?ref_=nm_flmg_job_1_cdt_img_2'},
 {'category': 'guion',
  'movie_name': 'Interestelar',
  'rating': '8.7',
  'url': 'https://www.imdb.com/es/title/tt0816692/?ref_=nm_flmg_job_1_cdt_img_3'},
 {'category': 'guion',
  'movie_name': 'El hombre de acero',
  'rating': '7.1',
  'url': 'https://www.imdb.com/es/title/tt0770828/?ref_=nm_flmg_job_1_cdt_img_4'},
 {'category': 'guion',
  'movie_name': 'Batman: El caballero de la noche asciende',
  'rating': '8.4',
  'url': 'https://www.imdb.com/es/title/tt1345836/?ref_=nm_flmg_job_1_cdt_img_5'},
 {'category': 'guion',
  'movie_nam