# Project Scraping

In [1]:
pip install selenium

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Step 1: Install necessary packages (run these in a separate cell or before your script if needed)
# !pip install selenium pandas webdriver-manager

# Step 2: Import libraries
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager  # Optional, for automatic chromedriver installation

# Step 3: Set up the Selenium WebDriver
service = Service(ChromeDriverManager().install())  # Automatically install chromedriver
driver = webdriver.Chrome(service=service)

# CSS selectors used on the chart page. The container selector is shared by
# the wait and the lookup, so it is defined once to keep the two in sync.
# NOTE(review): the "sc-..." class names look auto-generated and may change
# when the site is redeployed -- confirm they still match before relying on them.
MOVIE_CONTAINER_SELECTOR = 'div.sc-6ade9358-0.ktYEKX'
TITLE_SELECTOR = 'h3.ipc-title__text'
METADATA_SELECTOR = 'span.sc-6ade9358-7.exckou.cli-title-metadata-item'
RATING_SELECTOR = 'span.ipc-rating-star--rating'

# Step 6: Initialize empty lists to store the movie details.
# Each list holds one column; index i across all four lists is one movie.
movie_name = []
year = []
time = []
rating = []

try:
    # Step 4: Open the IMDb Top 250 movies chart
    driver.get('https://www.imdb.com/chart/top')

    # Step 5: Wait (up to 20 s) for the movie containers to be present.
    # A failure here is reported but not fatal: the lookup below simply
    # returns whatever containers exist (possibly none).
    try:
        WebDriverWait(driver, 20).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, MOVIE_CONTAINER_SELECTOR))
        )
    except Exception as e:
        print(f"Error loading page: {e}")

    # Step 7: Locate the movie containers and loop through them to extract data
    movie_data = driver.find_elements(By.CSS_SELECTOR, MOVIE_CONTAINER_SELECTOR)

    for movie in movie_data:
        try:
            # Title text looks like "1. The Shawshank Redemption"; drop the rank prefix.
            title = movie.find_element(By.CSS_SELECTOR, TITLE_SELECTOR).text.split('. ', 1)[1]

            # The first two metadata items are the release year and the runtime.
            metadata = movie.find_elements(By.CSS_SELECTOR, METADATA_SELECTOR)
            movie_year = metadata[0].text  # Year
            movie_runtime = metadata[1].text  # Runtime

            # IMDb rating
            imdb_rating = movie.find_element(By.CSS_SELECTOR, RATING_SELECTOR).text
        except Exception as e:
            # Best-effort scrape: report the problem and skip this movie entirely.
            print(f"Error extracting data for a movie: {e}")
            continue

        # Append only after every field was read successfully, so the four
        # lists always stay the same length and the DataFrame can be built.
        movie_name.append(title)
        year.append(movie_year)
        time.append(movie_runtime)
        rating.append(imdb_rating)
finally:
    # Step 8: Always close the browser, even if navigation or scraping raised.
    driver.quit()

# Step 9: Assemble the scraped columns into one table.
# Keys become the column headers; each value is one of the lists filled
# during scraping.
scraped_columns = {
    'Name of movie': movie_name,
    'Year of release': year,
    'Watchtime': time,
    'Movie Rating': rating,
}
movie_df = pd.DataFrame(scraped_columns)

# Step 10: Show the table, with the rating placed before the watch time.
display_order = ['Name of movie', 'Year of release', 'Movie Rating', 'Watchtime']
print(movie_df[display_order])

# Step 11: Write the table to CSV at the chosen location, leaving out the
# numeric row index.
output_path = '/Users/schmidty/Desktop/Data Wrang/Project 1/Movies Box Office Collection Data/movie_data.csv'
movie_df.to_csv(output_path, index=False)

print(f"Data saved to: {output_path}")

                                         Name of movie Year of release  \
0                             The Shawshank Redemption            1994   
1                                        The Godfather            1972   
2                                      The Dark Knight            2008   
3                                The Godfather Part II            1974   
4                                         12 Angry Men            1957   
..                                                 ...             ...   
245                                            Rebecca            1940   
246                                           The Help            2011   
247                                     Cool Hand Luke            1967   
248                          A Silent Voice: The Movie            2016   
249  Demon Slayer: Kimetsu no Yaiba - The Movie: Mu...            2020   

    Movie Rating Watchtime  
0            9.3    2h 22m  
1            9.2    2h 55m  
2            9.0    2h 3