### Code for scraping the first website

In [15]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import random
import pandas as pd
import re

In [2]:
# Initialize WebDriver and open the URL
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get("https://www.imdb.com/list/ls098063263/")

In [3]:
# Make lists to store data
titles = []
release_years = []
ratings = []
gross_earnings = []
directors = []

In [4]:
def random_scroll(driver, total_wait_time):
    total_height = driver.execute_script("return document.body.scrollHeight")
    scroll_steps = random.randint(3, 10)
    scroll_increment = total_height // scroll_steps
    time_per_step = total_wait_time / scroll_steps

    for step in range(scroll_steps):
        driver.execute_script(f"window.scrollBy(0, {scroll_increment});")
        random_wait = random.uniform(0.5 * time_per_step, 1.5 * time_per_step)
        time.sleep(random_wait)

#Function to scroll from top to bottom of the page with random intervals
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

In [5]:
#Make the scraping function
def imdb_data(driver):
    page_number = 1 #Counts the pages

    while True:
        print(f"Scraping Page {page_number}...")

        #Scroll through the page so it loads
        random_scroll(driver, random.uniform(5, 12))
        
        #Add a delay to let contents load
        time.sleep(3)

        #Find the movie containers
        movie_elements = driver.find_elements(By.XPATH, '//*[@id="__next"]/main/div/section/div/section/div/div[1]/section/div[2]/ul/li')
        
        #Title scrape
        title_elements = driver.find_elements(By.XPATH, '//h3[@class="ipc-title__text"]')
        for title in title_elements:
            titles.append(title.text)

        #Release year scrape
        #year_elements = driver.find_elements(By.CLASS_NAME, "sc-6ade9358-7.exckou.dli-title-metadata-item")
        #year_elements = driver.find_elements(By.XPATH, '//span[@class="sc-5bc66c50-6 OOdsw dli-title-metadata-item"]')
        try: 
            year_element = movie.find_element(By.XPATH, ".//span[contains(@class, 'cli-title-metadata-item')][1]")
            movie_year = year_element.text.strip()
        except Exception:
            movie_year = "N/A"
        for year in year_elements:
            release_years.append(year.text)

        #Movie rating scrape
        ratings_elements = driver.find_elements(By.XPATH, '//*[@id="__next"]/main/div/section/div/section/div/div[1]/section/div[2]/ul/li[1]/div/div/div/div[1]/div[2]/div[2]/span[3]')
        for rating in ratings_elements:
            ratings.append(rating.text)

        #Gross earning scrape
        #gross_elements = driver.find_elements(By.XPATH, '//*[@id="__next"]/main/div/section/div/section/div/div[1]/section/div[2]/ul/li[1]/div/div/div/div[2]/div[2]/div/span')
        gross_elements = driver.find_elements(By.XPATH, "//span[contains(text(),'Worldwide Lifetime Gross:')]")
        for gross_earning in gross_elements:
            gross_earnings.append(gross_earning.text)

        #Director scrape
        #directors_elements = driver.find_elements(By.XPATH, '//*[@id="__next"]/main/div/section/div/section/div/div[1]/section/div[2]/ul/li[1]/div/div/div/div[2]/span/span[2]')
        directors_elements = driver.find_elements(By.XPATH, "//a[@class='ipc-link ipc-link--base dli-director-item']")
        for director in directors_elements:
            directors.append(director.text)
        
        #Wait a random time between 2-20 seconds
        wait_time = random.randint(5, 10)
        print(f"Waiting {wait_time} seconds...")
        time.sleep(wait_time)
        
        #Scroll to the bottom of the current page
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                              

In [6]:
imdb_data(driver)

Scraping Page 1...
Waiting 5 seconds...
Scraping Page 1...
Waiting 5 seconds...
Scraping Page 1...
Waiting 9 seconds...
Scraping Page 1...
Waiting 9 seconds...
Scraping Page 1...
Waiting 8 seconds...
Scraping Page 1...
Waiting 6 seconds...
Scraping Page 1...


KeyboardInterrupt: 

In [None]:
  # Create a DataFrame
highest_grossing_imdb = pd.DataFrame({
    'Title': titles,
    'Release Year': release_years,
    'Rating': ratings,
    'Worldwide Lifetime Gross': gross_earnings,
    'Directors': directors
    })

In [17]:
print(gross_earnings)

['Worldwide Lifetime Gross: $2,923,706,026', 'Worldwide Lifetime Gross: $2,799,439,100', 'Worldwide Lifetime Gross: $2,320,250,281', 'Worldwide Lifetime Gross: $2,264,750,694', 'Worldwide Lifetime Gross: $2,071,310,218', 'Worldwide Lifetime Gross: $2,052,415,039', 'Worldwide Lifetime Gross: $1,921,373,347', 'Worldwide Lifetime Gross: $1,693,216,995', 'Worldwide Lifetime Gross: $1,671,537,444', 'Worldwide Lifetime Gross: $1,662,020,819', 'Worldwide Lifetime Gross: $1,520,538,536', 'Worldwide Lifetime Gross: $1,515,342,457', 'Worldwide Lifetime Gross: $1,495,696,292', 'Worldwide Lifetime Gross: $1,453,683,476', 'Worldwide Lifetime Gross: $1,446,938,421', 'Worldwide Lifetime Gross: $1,405,018,048', 'Worldwide Lifetime Gross: $1,362,566,989', 'Worldwide Lifetime Gross: $1,349,926,083', 'Worldwide Lifetime Gross: $1,342,480,797', 'Worldwide Lifetime Gross: $1,334,407,706', 'Worldwide Lifetime Gross: $1,332,494,933', 'Worldwide Lifetime Gross: $1,310,469,037', 'Worldwide Lifetime Gross: $1,3

In [19]:
print(release_years)

[]


In [16]:
cleaned_gross = [re.search(r'\$\S+', entry).group() for entry in gross_earnings if re.search(r'\$\S+', entry)]

In [None]:
#Save to csv
highest_grossing_imdb.to_csv('imdb_gross_movies_scraped.csv', index=False)

In [18]:
print(cleaned_gross)

['$2,923,706,026', '$2,799,439,100', '$2,320,250,281', '$2,264,750,694', '$2,071,310,218', '$2,052,415,039', '$1,921,373,347', '$1,693,216,995', '$1,671,537,444', '$1,662,020,819', '$1,520,538,536', '$1,515,342,457', '$1,495,696,292', '$1,453,683,476', '$1,446,938,421', '$1,405,018,048', '$1,362,566,989', '$1,349,926,083', '$1,342,480,797', '$1,334,407,706', '$1,332,494,933', '$1,310,469,037', '$1,306,450,154', '$1,266,115,964', '$1,243,225,667', '$1,236,009,236', '$1,215,577,205', '$1,159,457,503', '$1,155,046,416', '$1,152,028,393', '$1,138,027,091', '$1,132,705,055', '$1,131,416,446', '$1,123,794,079', '$1,114,975,066', '$1,108,594,137', '$1,105,261,713', '$1,104,379,926', '$1,078,958,629', '$1,077,022,372', '$1,073,841,394', '$1,067,316,101', '$1,066,179,747', '$1,058,684,742', '$1,054,304,000', '$1,046,721,266', '$1,046,515,409', '$1,034,800,131', '$1,029,266,989', '$1,025,521,689', '$1,025,468,216', '$1,024,583,854', '$1,017,107,150', '$1,009,025,676', '$1,001,978,080', '$981,708