In [None]:
import requests
from bs4 import BeautifulSoup

# Quick preview of the IMDb Top 250 chart using plain requests + BeautifulSoup.
# Prints one line per movie: "<title> (<year>) - IMDb Rating: <rating>".

# Define the IMDb URL for the top movies list
url = "https://www.imdb.com/chart/top/"

# Set headers to mimic a browser visit
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

# Number of chart entries to print (the chart itself lists up to 250 movies).
CHART_PREVIEW_LIMIT = 50

# Send an HTTP GET request. The timeout keeps the cell from hanging forever,
# and raise_for_status() surfaces HTTP errors (e.g. 403) instead of silently
# parsing an error page and printing nothing.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# Extract movie data: one <li> per chart entry
movies = soup.select("li.ipc-metadata-list-summary-item")

for movie in movies[:CHART_PREVIEW_LIMIT]:
    # Look each tag up once and fall back to "N/A" when it is missing, so a
    # single malformed entry cannot abort the whole listing.
    title_tag = movie.select_one("h3")
    year_tag = movie.select_one(".cli-title-metadata span")
    rating_tag = movie.select_one(".ipc-rating-star span")

    title = title_tag.text.strip() if title_tag else "N/A"
    year = year_tag.text.strip() if year_tag else "N/A"
    rating = rating_tag.text.strip() if rating_tag else "N/A"

    print(f"{title} ({year}) - IMDb Rating: {rating}")


1. The Shawshank Redemption (1994) - IMDb Rating: 9.3
2. The Godfather (1972) - IMDb Rating: 9.2
3. The Dark Knight (2008) - IMDb Rating: 9.0
4. The Godfather Part II (1974) - IMDb Rating: 9.0
5. 12 Angry Men (1957) - IMDb Rating: 9.0
6. The Lord of the Rings: The Return of the King (2003) - IMDb Rating: 9.0
7. Schindler's List (1993) - IMDb Rating: 9.0
8. Pulp Fiction (1994) - IMDb Rating: 8.9
9. The Lord of the Rings: The Fellowship of the Ring (2001) - IMDb Rating: 8.9
10. Il buono, il brutto, il cattivo (1966) - IMDb Rating: 8.8
11. Forrest Gump (1994) - IMDb Rating: 8.8
12. The Lord of the Rings: The Two Towers (2002) - IMDb Rating: 8.8
13. Fight Club (1999) - IMDb Rating: 8.8
14. Inception (2010) - IMDb Rating: 8.8
15. Star Wars: Episode V - The Empire Strikes Back (1980) - IMDb Rating: 8.7
16. The Matrix (1999) - IMDb Rating: 8.7
17. GoodFellas (1990) - IMDb Rating: 8.7
18. One Flew Over the Cuckoo's Nest (1975) - IMDb Rating: 8.7
19. Interstellar (2014) - IMDb Rating: 8.7
20. S

In [None]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

# Scrape per-movie details for the first entries of the IMDb Top 250 chart
# with Selenium, then save them to 'imdb_top_250.csv'.
#
# Year and rating are read from each movie's own entry on the chart page;
# genres, directors, actors and box office come from the movie's detail page.

# Path to the local chromedriver binary -- update with your path.
CHROMEDRIVER_PATH = "C:\\Users\\ASUS\\Downloads\\chromedriver-win64\\chromedriver-win64\\chromedriver.exe"

# Number of chart entries whose detail pages are visited (raise up to 250 for a full run).
MAX_DETAIL_PAGES = 10

# Set up the WebDriver options
options = Options()
options.add_argument("--headless")  # Run in headless mode
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")

# Initialize WebDriver
service = Service(CHROMEDRIVER_PATH)
driver = webdriver.Chrome(service=service, options=options)

# IMDb Top 250 URL
top_250_url = "https://www.imdb.com/chart/top/"

# Initialize list for data
movies_data = []

# try/finally guarantees the browser process is shut down even when a page
# load or wait raises.
try:
    # Open the URL
    driver.get(top_250_url)

    # Wait for the page to load and movie links to appear
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "ul.ipc-metadata-list li")))

    # Parse HTML with BeautifulSoup
    soup = BeautifulSoup(driver.page_source, "html.parser")

    # Extract movie links - Update the selector based on inspection of the page
    movies = soup.select("ul.ipc-metadata-list li a.ipc-title-link-wrapper")

    # Collect link, year and rating from each movie's own chart <li>.
    # These values must come from the entry that owns the link: reading them
    # from a variable left over from another cell gives every row the same
    # (wrong) year and rating.
    chart_entries = []
    for anchor in movies[:MAX_DETAIL_PAGES]:
        item = anchor.find_parent("li")
        year_tag = item.select_one(".cli-title-metadata span") if item else None
        rating_tag = item.select_one(".ipc-rating-star span") if item else None
        chart_entries.append({
            "link": "https://www.imdb.com" + anchor["href"],
            "year": year_tag.text.strip() if year_tag else "N/A",
            "rating": rating_tag.text.strip() if rating_tag else "N/A",
        })

    movie_links = [entry["link"] for entry in chart_entries]

    # Loop through each movie link
    for index, entry in enumerate(chart_entries):
        # Extract movie details; an error on one movie is reported and the
        # loop moves on to the next one.
        try:
            driver.get(entry["link"])
            time.sleep(1)  # Allow page to load

            # Wait for the title to load on the movie page
            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "h1")))

            movie_soup = BeautifulSoup(driver.page_source, "html.parser")

            title_tag = movie_soup.find("h1")
            title = title_tag.text.strip() if title_tag else "N/A"
            year = entry["year"]
            rating = entry["rating"]

            genre_tags = movie_soup.select("div.ipc-chip-list a span")
            genres = ", ".join([g.text for g in genre_tags]) if genre_tags else "N/A"
            directors_tags = movie_soup.select("li[data-testid='title-pc-principal-credit'] a")
            directors = ", ".join([d.text for d in directors_tags]) if directors_tags else "N/A"

            # Limit to top 3 actors (if available)
            actors_tags = movie_soup.select("a.sc-bfec09a1-1")
            actors = ", ".join([a.text for a in actors_tags[:3]]) if actors_tags else "N/A"

            # Box office revenue (optional)
            box_office_tag = movie_soup.select_one("span[data-testid='title-boxoffice-cumulativeworldwidegross']")
            box_office = box_office_tag.text.strip() if box_office_tag else "N/A"

            # Save data
            movies_data.append({
                "Title": title,
                "Year": year,
                "IMDb Rating": rating,
                "Genre": genres,
                "Director(s)": directors,
                "Lead Actors": actors,
                "Box Office Revenue": box_office
            })

            print(f"Scraped {index+1}: {title} ({year})")

        except Exception as e:
            print(f"Error scraping movie {index+1}: {e}")
finally:
    # Close the Selenium driver
    driver.quit()

# Save to CSV
df = pd.DataFrame(movies_data)
df.to_csv("imdb_top_250.csv", index=False)

# Report the number of movies actually scraped rather than claiming all 250.
print(f"\n✅ Scraped {len(movies_data)} IMDb Top 250 movies. Data saved to 'imdb_top_250.csv'")

Scraped 1: The Shawshank Redemption (2002)
Scraped 2: The Godfather (2002)
Scraped 3: The Dark Knight (2002)
Scraped 4: The Godfather Part II (2002)
Scraped 5: 12 Angry Men (2002)
Scraped 6: The Lord of the Rings: The Return of the King (2002)
Scraped 7: Schindler's List (2002)
Scraped 8: Pulp Fiction (2002)
Scraped 9: The Lord of the Rings: The Fellowship of the Ring (2002)
Scraped 10: The Good, the Bad and the Ugly (2002)

✅ IMDb Top 250 Movies Scraped Successfully! Data saved to 'imdb_top_250.csv'
