In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_imdb_movies(url="https://www.imdb.com/list/ls009072108/?sort=popularity%2Casc", timeout=10):
    """Scrape an IMDb list page into a DataFrame of movie records.

    Args:
        url: Address of the IMDb list page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Title``, ``Year``,
        ``Duration``, ``Rating`` and ``Budget`` (all strings), one row per
        list item found. The columns are present even when no item matched.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    # Headers that make the request look like it comes from a browser.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
            " AppleWebKit/537.36 (KHTML, like Gecko)"
            " Chrome/112.0.0.0 Safari/537.36"
        )
    }

    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()  # Raises on failure (4xx, 5xx)

    soup = BeautifulSoup(response.text, 'html.parser')

    # Every movie of the list lives in one of these <li> elements.
    movie_items = soup.select('li.ipc-metadata-list-summary-item')

    movies_data = []
    for movie in movie_items:
        try:
            movies_data.append(_parse_movie_item(movie))
        except Exception as e:
            # Best effort: one malformed entry must not abort the whole scrape.
            print(f"Error procesando una película: {e}")

    # Explicit columns keep the schema stable when nothing was scraped.
    return pd.DataFrame(
        movies_data,
        columns=['Title', 'Year', 'Duration', 'Rating', 'Budget'],
    )


def _parse_movie_item(movie):
    """Extract title, year, duration, rating and budget from one list item."""
    title_tag = movie.select_one('h3.ipc-title__text')
    title = title_tag.get_text(strip=True) if title_tag else "No Title"

    # Year and duration are the first two metadata spans, in that order.
    # NOTE(review): the "sc-..." class names look auto-generated and may
    # change between IMDb deployments - confirm they still match the page.
    metadata_spans = movie.select('div.sc-d5ea4b9d-6 span.sc-d5ea4b9d-7')
    year = metadata_spans[0].get_text(strip=True) if len(metadata_spans) > 0 else ""
    duration = metadata_spans[1].get_text(strip=True) if len(metadata_spans) > 1 else ""

    rating_span = movie.select_one('span.ipc-rating-star--rating')
    rating = rating_span.get_text(strip=True) if rating_span else ""

    # The budget is read from the item's description text.
    budget_text = ""
    budget_div = movie.select_one('div[data-testid="title-list-item-description"]')
    if budget_div:
        text_all = budget_div.get_text(strip=True)
        # Keep only what follows the label. This also works when the space
        # after "Budget:" is missing or other text precedes the label.
        _, label, remainder = text_all.partition("Budget:")
        if label:
            budget_text = remainder.strip()

    return {
        'Title': title,
        'Year': year,
        'Duration': duration,
        'Rating': rating,
        'Budget': budget_text,
    }

# Example usage: scrape the default list, preview it and save it as CSV.
if __name__ == "__main__":
    from pathlib import Path

    df_imdb = scrape_imdb_movies()
    print(df_imdb.head())

    # Save to CSV. The target directory is created first because to_csv
    # fails with OSError when the parent directory does not exist.
    output_path = Path('data/movies_budgets.csv')
    output_path.parent.mkdir(parents=True, exist_ok=True)
    df_imdb.to_csv(output_path, index=False)


HTTPError: 403 Client Error: Forbidden for url: https://www.imdb.com/list/ls009072108/?sort=popularity%2Casc