In [9]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd
import time
import requests
import json

# Setup Selenium WebDriver: headless Chrome presenting a desktop user agent
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-gpu")
# Chrome's --window-size switch takes "width,height"; the "1920x1080" form
# is not a valid value for it.
chrome_options.add_argument("--window-size=1920,1080")
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")

# Raw string so the backslashes in the Windows path are taken literally;
# "\c" in a normal string literal is an invalid escape sequence and triggers
# a warning at compile time.
service = Service(r"E:\chromedriver-win64\chromedriver.exe")
driver = webdriver.Chrome(service=service, options=chrome_options)

# Open IMDb Top 250 page
url = "https://www.imdb.com/chart/top/"

# Upper bound on scroll passes so a page that keeps growing cannot hang the
# script forever.
MAX_SCROLL_ROUNDS = 50

try:
    driver.get(url)

    # Scroll to bottom repeatedly until the document height stops changing,
    # giving lazily-loaded entries time to render between passes.
    last_height = driver.execute_script("return document.body.scrollHeight")
    for _ in range(MAX_SCROLL_ROUNDS):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    # Parse page source with BeautifulSoup
    soup = BeautifulSoup(driver.page_source, 'html.parser')
finally:
    # Always shut the browser down, even if loading or scrolling failed,
    # so no Chrome/chromedriver process is left behind.
    driver.quit()

# Collect every <li> element carrying the chart's summary-item class
movies = soup.find_all('li', attrs={'class': 'ipc-metadata-list-summary-item'})
print(f"Movies found: {len(movies)}")

# One accumulator list per output column; each gets one entry per movie
rank = []
title = []
year = []
rating = []
genre1 = []
genre2 = []
genre3 = []
duration = []
votes = []
director = []

# Seconds to wait for a movie detail page before giving up on it
REQUEST_TIMEOUT = 10


def _parse_votes(text):
    """Convert a vote-count label such as "(2.9M)", "(850K)" or "(1,234)" to an int.

    Returns 0 when the label cannot be interpreted as a number.
    """
    cleaned = text.replace(",", "").replace("(", "").replace(")", "").strip().upper()
    multiplier = 1
    if cleaned.endswith("M"):
        multiplier, cleaned = 1_000_000, cleaned[:-1]
    elif cleaned.endswith("K"):
        multiplier, cleaned = 1_000, cleaned[:-1]
    try:
        # round() before int() so every result is an int and float noise
        # from the multiplication cannot truncate the count downwards.
        return int(round(float(cleaned) * multiplier))
    except ValueError:
        return 0


def _extract_genres(movie_soup):
    """Return exactly three genre strings from the page's JSON-LD, padded with 'N/A'."""
    script = movie_soup.find('script', type='application/ld+json')
    if script is None or not script.string:
        return ['N/A', 'N/A', 'N/A']
    try:
        data = json.loads(script.string)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return ['N/A', 'N/A', 'N/A']

    genres = data.get('genre') if isinstance(data, dict) else None
    if isinstance(genres, str):
        # A single genre given as a bare string must not be indexed
        # character by character.
        genres = [genres]
    elif not isinstance(genres, list):
        genres = []
    return (genres + ['N/A'] * 3)[:3]


def _fetch_genres_and_director(movie):
    """Fetch the movie's own page and return ([genre1, genre2, genre3], director).

    Any missing link, network failure or HTTP error yields 'N/A' placeholders
    so the caller always gets one value per column.
    """
    fallback = (['N/A', 'N/A', 'N/A'], 'N/A')

    movie_link = movie.find('a', class_='ipc-title-link-wrapper', href=True)
    if movie_link is None:
        return fallback
    movie_url = "https://www.imdb.com" + movie_link['href']

    time.sleep(1)  # Avoid too many requests
    try:
        response = requests.get(
            movie_url,
            headers={'User-Agent': 'Mozilla/5.0'},
            timeout=REQUEST_TIMEOUT,  # never block indefinitely on one page
        )
        response.raise_for_status()
    except requests.RequestException:
        return fallback

    movie_soup = BeautifulSoup(response.content, 'html.parser')

    # First matching credit link on the page; presumably the director --
    # NOTE(review): confirm against the current IMDb page layout.
    director_link = movie_soup.select_one(
        'li.ipc-inline-list__item a.ipc-metadata-list-item__list-content-item'
    )
    director_name = director_link.text.strip() if director_link is not None else 'N/A'

    return _extract_genres(movie_soup), director_name


# Walk every chart entry and append exactly one value to each column list,
# substituting 'N/A' (or 0 for votes) for anything that cannot be extracted.
for movie in movies:
    # Rank & Title: the heading text has the form "<rank>. <title>"
    heading = movie.select_one('h3.ipc-title__text')
    heading_text = heading.text.strip() if heading is not None else ''
    rank1, separator, title1 = heading_text.partition('. ')
    if not separator:
        rank1, title1 = 'N/A', 'N/A'
    rank.append(rank1)
    title.append(title1)

    # Year & Duration: first and second metadata items of the entry
    metadata = movie.select('span.cli-title-metadata-item')
    year.append(metadata[0].text.strip() if len(metadata) > 0 else 'N/A')
    duration.append(metadata[1].text.strip() if len(metadata) > 1 else 'N/A')

    # Rating: first whitespace-separated token of the rating element
    rating_element = movie.select_one('span.ipc-rating-star--imdb')
    rating_tokens = rating_element.text.split() if rating_element is not None else []
    rating.append(rating_tokens[0] if rating_tokens else 'N/A')

    # Number of Reviews(Votes) - Convert to Numeric, 0 if not available
    vote_element = movie.select_one('span.ipc-rating-star--voteCount')
    votes.append(_parse_votes(vote_element.text) if vote_element is not None else 0)

    # Genre & Director come from the movie's own page
    genres, director1 = _fetch_genres_and_director(movie)
    genre1.append(genres[0])
    genre2.append(genres[1])
    genre3.append(genres[2])
    director.append(director1)

# Assemble the table column by column; the dict's insertion order determines
# the column order of the resulting frame and CSV.
table_columns = {}
table_columns['Rank'] = rank
table_columns['Title'] = title
table_columns['Year'] = year
table_columns['Rating'] = rating
table_columns['Genre 1'] = genre1
table_columns['Genre 2'] = genre2
table_columns['Genre 3'] = genre3
table_columns['Duration'] = duration
table_columns['Number of Reviews(Votes)'] = votes
table_columns['Director'] = director
df = pd.DataFrame(table_columns)

# Write the table to disk without the index column and report the row count
csv_filename = 'IMDB_TOP250.csv'
df.to_csv(csv_filename, index=False)
print(f'Successfully extracted {len(df)} movies to {csv_filename}')


  service = Service("E:\chromedriver-win64\chromedriver.exe")


Movies found: 250
Successfully extracted 250 movies to IMDB_TOP250.csv
