In [None]:
import requests
import re
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import ElementClickInterceptedException, NoSuchElementException
from selenium.webdriver.support import expected_conditions as EC

In [None]:
def _parse_runtime_minutes(runtime_text):
    """Convert a runtime label such as '2h 22m', '2h' or '45m' to minutes.

    Args:
        runtime_text: Text of the metadata span expected to hold the runtime.

    Returns:
        The total number of minutes as an int, or the string 'N/A' when the
        text is not in ``<H>h <M>m`` form (for example when the span holds
        some other label). Returning 'N/A' instead of raising keeps one odd
        entry from aborting the whole scrape.
    """
    match = re.fullmatch(r'(?:(\d+)\s*h)?\s*(?:(\d+)\s*m)?', runtime_text.strip())
    # Both groups are optional, so an empty string also "matches"; require
    # at least one of hours/minutes to be present.
    if match is None or not any(match.groups()):
        return 'N/A'
    hours, minutes = (int(part) if part else 0 for part in match.groups())
    return hours * 60 + minutes


def _parse_movie_item(item):
    """Extract one CSV row (as a dict) from a single result ``<li>`` element.

    Args:
        item: BeautifulSoup tag for one 'ipc-metadata-list-summary-item'.

    Returns:
        Dict with the keys 'title', 'year', 'runtime', 'rating' and 'id'.
        Any field that cannot be found is set to 'N/A'.
    """
    title_tag = item.find('h3', class_='ipc-title__text')
    title = title_tag.text if title_tag else 'N/A'

    # Year, runtime and rating are read positionally from spans sharing one
    # class. NOTE(review): 'sc-5179a348-7' looks like an auto-generated class
    # name -- verify it still matches the live page before a long run.
    metadata = [span.text for span in item.find_all('span', class_='sc-5179a348-7')]
    year = metadata[0] if metadata else 'N/A'
    runtime = _parse_runtime_minutes(metadata[1]) if len(metadata) > 1 else 'N/A'
    rating = metadata[2] if len(metadata) > 2 else 'N/A'

    # The id is the third '/'-separated piece of the first link's href;
    # guard against shorter hrefs instead of raising IndexError.
    link = item.find('a', href=True)
    href_parts = link['href'].split('/') if link else []
    movie_id = href_parts[2] if len(href_parts) > 2 else 'N/A'

    return {
        'title': title,
        'year': year,
        'runtime': runtime,
        'rating': rating,
        'id': movie_id,
    }


def _load_page_source(url, max_clicks, load_wait):
    """Open *url* in headless Chrome, expand the list and return the HTML.

    The "see more" button is clicked up to *max_clicks* times, pausing
    *load_wait* seconds after each click. The browser is always closed,
    even when loading or clicking fails.
    """
    options = Options()
    options.add_argument("--headless")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")

    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)

        clicks = 0
        while clicks < max_clicks:
            try:
                load_more_button = driver.find_element(By.CSS_SELECTOR, "button.ipc-see-more__button")
                # Click through JavaScript rather than a native click.
                driver.execute_script("arguments[0].click();", load_more_button)
            except (NoSuchElementException, ElementClickInterceptedException):
                print("No more 'Load More' button or couldn't click it.")
                break
            time.sleep(load_wait)  # Wait for new items to load
            clicks += 1
            print(f"Clicked 'Load More' {clicks} times...")

        return driver.page_source
    finally:
        # Release the browser process no matter how the block above exits.
        driver.quit()


def scrape_imdb_movies(url, max_clicks, output_path='movies.csv', load_wait=3):
    """Scrape an IMDb search-result page and write the movies to a CSV file.

    The page is opened in headless Chrome, the "see more" button is clicked
    up to *max_clicks* times to expand the list, and every
    'ipc-metadata-list-summary-item' entry is parsed into a row with the
    columns title, year, runtime (minutes), rating and id.

    Args:
        url: Address of the search-result page to scrape.
        max_clicks: Maximum number of times to click the "see more" button.
        output_path: Destination CSV file; it is overwritten if it exists.
        load_wait: Seconds to sleep after each click so new items can load.

    Returns:
        None. The result is written to *output_path*.
    """
    page_source = _load_page_source(url, max_clicks, load_wait)
    soup = BeautifulSoup(page_source, "lxml")

    movie_items = soup.find_all('li', class_='ipc-metadata-list-summary-item')
    movies = [_parse_movie_item(item) for item in movie_items]

    # Write to CSV
    with open(output_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=['title', 'year', 'runtime', 'rating', 'id'])
        writer.writeheader()
        writer.writerows(movies)
        
# Search URL, split into one query parameter per line for readability; the
# adjacent literals concatenate into a single string.
url = (
    "https://www.imdb.com/search/title/"
    "?title_type=feature"
    "&release_date=1970-01-01,"
    "&num_votes=3000,"
    "&colors=color"
    "&country_of_origin=US"
    "&primary_language=en"
    "&sort=boxoffice_gross_us,desc"
)
# Upper bound on how many times the "see more" button is clicked.
max_clicks = 180

scrape_imdb_movies(url=url, max_clicks=max_clicks)
print('Done!')