### Code for scraping the first website

In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import random
import pandas as pd

In [2]:
# Initialize WebDriver and open the URL
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get("https://www.imdb.com/list/ls098063263/")

In [3]:
# Make lists to store data
titles = []
release_years = []
ratings = []
gross_earnings = []
directors = []

In [4]:
def random_scroll(driver, total_wait_time):
    total_height = driver.execute_script("return document.body.scrollHeight")
    scroll_steps = random.randint(3, 10)
    scroll_increment = total_height // scroll_steps
    time_per_step = total_wait_time / scroll_steps

    for step in range(scroll_steps):
        driver.execute_script(f"window.scrollBy(0, {scroll_increment});")
        random_wait = random.uniform(0.5 * time_per_step, 1.5 * time_per_step)
        time.sleep(random_wait)

#Function to scroll from top to bottom of the page with random intervals
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

In [5]:
#Make the scraping function
def imdb_data(driver):
    page_number = 1 #Counts the pages

    while True:
        print(f"Scraping Page {page_number}...")

        #Scroll through the page so it loads
        random_scroll(driver, random.uniform(5, 12))
        
        #Add a delay to let contents load
        time.sleep(3)

        #Find the movie containers
        movie_elements = driver.find_elements(By.XPATH, '//*[@id="__next"]/main/div/section/div/section/div/div[1]/section/div[2]/ul/li')
        
        #Title scrape
        title_elements = driver.find_elements(By.XPATH, '//h3[@class="ipc-title__text"]')
        for title in title_elements:
            titles.append(title.text)

        #Release year scrape
        year_elements = driver.find_elements(By.XPATH, '//span[@class="sc-5bc66c50-6 OOdsw dli-title-metadata-item"]')
        for year in year_elements:
            release_years.append(year.text)

        #Movie rating scrape
        ratings_elements = driver.find_elements(By.XPATH, '//*[@id="__next"]/main/div/section/div/section/div/div[1]/section/div[2]/ul/li[1]/div/div/div/div[1]/div[2]/div[2]/span[3]')
        for rating in ratings_elements:
            ratings.append(rating.text)

        #Gross earning scrape
        gross_elements = driver.find_elements(By.XPATH, '//*[@id="__next"]/main/div/section/div/section/div/div[1]/section/div[2]/ul/li[1]/div/div/div/div[2]/div[2]/div/span')
        for gross_earning in gross_elements:
            gross_earnings.append(gross_earning.text)

        #Director scrape
        directors_elements = driver.find_elements(By.XPATH, '//*[@id="__next"]/main/div/section/div/section/div/div[1]/section/div[2]/ul/li[1]/div/div/div/div[2]/span/span[2]')
        for director in directors_elements:
            directors.append(director.text)
        
        #Wait a random time between 2-20 seconds
        wait_time = random.randint(2, 3)
        print(f"Waiting {wait_time} seconds...")
        time.sleep(wait_time)
        
        #Scroll to the bottom of the current page
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                              

In [6]:
imdb_data(driver)

Scraping Page 1...
Waiting 3 seconds...
Scraping Page 1...
Waiting 2 seconds...
Scraping Page 1...
Waiting 2 seconds...
Scraping Page 1...
Waiting 2 seconds...
Scraping Page 1...
Waiting 2 seconds...
Scraping Page 1...
Waiting 3 seconds...
Scraping Page 1...


KeyboardInterrupt: 

In [None]:
  # Create a DataFrame
highest_grossing_imdb = pd.DataFrame({
    'Title': titles,
    'Release Year': release_years,
    'Rating': ratings,
    'Worldwide Lifetime Gross': gross_earnings,
    'Directors': directors
    })

In [15]:
print(release_years)

['2009', '2h 42m', 'PG-13', '2019', '3h 1m', 'PG-13', '2022', '3h 12m', 'PG-13', '1997', '3h 14m', 'PG-13', '2015', '2h 18m', 'PG-13', '2018', '2h 29m', 'PG-13', '2021', '2h 28m', 'PG-13', '2024', '1h 36m', 'PG', '2015', '2h 4m', 'PG-13', '2019', '1h 58m', 'PG', '2012', '2h 23m', 'PG-13', '2015', '2h 17m', 'PG-13', '2022', '2h 10m', 'PG-13', '2019', '1h 43m', 'PG', '2023', '1h 54m', 'PG-13', '2015', '2h 21m', 'PG-13', '2023', '1h 32m', 'PG', '2018', '2h 14m', 'PG-13', '2011', '2h 10m', 'PG-13', '2017', '2h 32m', 'PG-13', '2024', '2h 8m', 'R', '2018', '2h 8m', 'PG-13', '2013', '1h 42m', 'PG', '2017', '2h 9m', 'PG', '2018', '1h 58m', 'PG', '2017', '2h 16m', 'PG-13', '2013', '2h 10m', 'PG-13', '2015', '1h 31m', 'PG', '2016', '2h 27m', 'PG-13', '2018', '2h 23m', 'PG-13', '2003', '3h 21m', 'PG-13', '2019', '2h 9m', 'PG-13', '2019', '2h 3m', 'PG-13', '2011', '2h 34m', 'PG-13', '2012', '2h 44m', 'PG-13', '2012', '2h 23m', 'PG-13', '2014', '2h 45m', 'PG-13', '1993', '2h 7m', 'PG-13', '2019', '

In [None]:
#Save to csv
#df.to_csv('imdb_gross_movies_scraped.csv', index=False)