In [1]:
!pip install selenium
!pip install webdriver-manager



In [2]:
import subprocess

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

import time
import random


from concurrent.futures import ThreadPoolExecutor

from bs4 import BeautifulSoup
import json
import requests

import pandas as pd
import numpy as np
import re

from IPython.display import FileLink

from unittest.mock import MagicMock


## Check the version of Chrome and compare it with ChromeDriver

In [3]:
def chrome_driver_options():
    chrome_options = Options()

    chrome_options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.6834.160 Safari/537.36")
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--disable-dev-shm-usage')  # Important for Colab
    chrome_options.add_argument('--remote-debugging-port=9222')  # To fix DevToolsActivePort issue
    
    service = Service(ChromeDriverManager().install()) 
    driver = webdriver.Chrome(service=service, options=chrome_options)  # Return WebDriver instance
    driver.set_page_load_timeout(60)  # Increase timeout to 60 seconds
    driver.set_script_timeout(120)  # Increase script timeout as well

    return driver

In [4]:
# # Test that Chrome is working
# driver = chrome_driver_options()
# driver.get("https://www.google.com")
# print(driver.title)  
# driver.quit()


# Use Selenium and BeautifulSoup to collect the relevant top movie data

In [5]:
# Urls for IMDb Top 250 movies page
imdb_url = "https://www.imdb.com"
top_250_url = 'https://www.imdb.com/chart/top/'


In [6]:
# Function to scrape the IMDb top movies list
def get_top_movies(browser):
    try:
        # Navigate to the IMDB top 250 page
        
        browser.get(top_250_url)
        time.sleep(15)  # Allows JS to load
    
        # Get the page source and parse it with BeautifulSoup
        html = browser.page_source
        soup = BeautifulSoup(html, 'html.parser')

        # Extract the JSON data from the page
        script_tag = soup.find('script', {'type': "application/ld+json"})
        if not script_tag:
            raise Exception("No JSON data found in the page.")
        data = json.loads(script_tag.text)

        # Extract the list of movies from the JSON data
        movies = data.get('itemListElement', [])
        
        # Retrieve all metadata divs that might contain additional year and duration info.
        # (Note: year and duration are not in JSON and we need this to gather this extra info)
        metadata_divs = soup.find_all('div', class_='sc-d5ea4b9d-6 hBxwRe cli-title-metadata')
        

        # Extract movie details
        movie_data = []

        for index, movie in enumerate(movies):
            movie_info = movie['item']
            title = movie_info['name']
            rank = index + 1  # Assuming movies are in the correct order
            

            # Extract genre, content rating, IMDb rating, url from JSON
            genre = movie_info.get("genre", None)
            content_rating = movie_info.get("contentRating", None)
            imdb_rating = movie_info.get("aggregateRating", {}).get("ratingValue", np.nan)
            url = movie_info["url"] 
            
            # Ensure we have a metadata div for the movie before extracting
            if index < len(metadata_divs):
                metadata_div = metadata_divs[index]
                spans = metadata_div.find_all('span')

                # Extract the year, duration 
                year = spans[0].text if len(spans) > 0 else 'N/A'
                duration = spans[1].text if len(spans) > 1 else 'N/A'
            else:
                # Default values if no corresponding metadata div
                year = np.NaN
                duration = np.NaN


            # Append the extracted details to the movie_data list
            movie_data.append({
                'rank': rank,
                'title': title,
                'year': year,
                'rating': content_rating,
                'duration': duration,
                'imdb_rating': imdb_rating,
                'genre': genre,
                'movie_url': url
            })
            
            time.sleep(0.5)  # Be polite and avoid getting blocked

        # Return the data
        #print(movie_data[:50]) # Check our work
        return movie_data

    except Exception as e:
        print("There was an error scraping the top 250", e)
        return None


### Get the top 250 movies

In [7]:
# Initialize the Selenium browser
print("Trying...")

browser = chrome_driver_options()
print("Browser initialized!")

# # Set implicit wait for elements to load
browser.implicitly_wait(10)  # Wait for elements to load

# Call the scrape function to extract movie data
print("Scraping IMDB Top 250 movie data")
movies_list = get_top_movies(browser)  # Ensure this function returns a list of dicts

time.sleep(10)  

# Create a Dataframe of the top movie (using returned list)
top_movies_df = pd.DataFrame(movies_list)
print("Top movies dataframe created")

browser.quit()

Trying...
Browser initialized!
Scraping IMDB Top 250 movie data
Top movies dataframe created


### Get the movie details 

## Results from Scraping the Top Movies

In [10]:
len(top_movies_df)

250

In [11]:
top_movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   rank         250 non-null    int64  
 1   title        250 non-null    object 
 2   year         250 non-null    object 
 3   rating       245 non-null    object 
 4   duration     250 non-null    object 
 5   imdb_rating  250 non-null    float64
 6   genre        250 non-null    object 
 7   movie_url    250 non-null    object 
dtypes: float64(1), int64(1), object(6)
memory usage: 15.8+ KB


In [12]:
top_movies_df.isna().sum()

rank           0
title          0
year           0
rating         5
duration       0
imdb_rating    0
genre          0
movie_url      0
dtype: int64

In [13]:
top_movies_df.duplicated().sum()

0

In [14]:
# Checking our work
top_movies_df.head(20)

Unnamed: 0,rank,title,year,rating,duration,imdb_rating,genre,movie_url
0,1,The Shawshank Redemption,1994,R,2h 22m,9.3,Drama,https://www.imdb.com/title/tt0111161/
1,2,The Godfather,1972,R,2h 55m,9.2,"Crime, Drama",https://www.imdb.com/title/tt0068646/
2,3,The Dark Knight,2008,PG-13,2h 32m,9.0,"Action, Crime, Drama",https://www.imdb.com/title/tt0468569/
3,4,The Godfather Part II,1974,R,3h 22m,9.0,"Crime, Drama",https://www.imdb.com/title/tt0071562/
4,5,12 Angry Men,1957,Approved,1h 36m,9.0,"Crime, Drama",https://www.imdb.com/title/tt0050083/
5,6,The Lord of the Rings: The Return of the King,2003,PG-13,3h 21m,9.0,"Adventure, Drama, Fantasy",https://www.imdb.com/title/tt0167260/
6,7,Schindler&apos;s List,1993,R,3h 15m,9.0,"Biography, Drama, History",https://www.imdb.com/title/tt0108052/
7,8,Pulp Fiction,1994,R,2h 34m,8.9,"Crime, Drama",https://www.imdb.com/title/tt0110912/
8,9,The Lord of the Rings: The Fellowship of the Ring,2001,PG-13,2h 58m,8.9,"Adventure, Drama, Fantasy",https://www.imdb.com/title/tt0120737/
9,10,"Il buono, il brutto, il cattivo",1966,R,2h 58m,8.8,"Adventure, Drama, Western",https://www.imdb.com/title/tt0060196/


In [15]:
# Save the top movies dataframe as a .csv
top_movies_df.to_csv('top_movies_df.csv', index=False)

# Then create a download link for that file
display(FileLink('top_movies_df.csv'))

# Scrape the Movie Details with BeautifulSoup 

In [22]:
# Define headers to mimic a real browser
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.6834.160 Safari/537.36"
}

In [27]:
def fetch_movie_details_soup(movies_df):
    movie_details_list = []

    for index, row in movies_df.iterrows():
        movie_url = row['movie_url']
        movie_title = row['title']

        try:
            print(f"Fetching details for: {movie_title}")

            # Make a GET request to the IMDb movie URL
            response = requests.get(movie_url, headers=HEADERS)

            # Check if the request was successful
            if response.status_code != 200:
                print(f"Failed to retrieve {movie_title}. Status Code:", response.status_code)
                continue

            # Parse page with BeautifulSoup
            movie_soup = BeautifulSoup(response.text, 'html.parser')

            # --- Popularity Score ---
            pop_score = 0
            pop_score_element = movie_soup.find('div', {'data-testid': 'hero-rating-bar__popularity__score'})
            if pop_score_element:
                pop_score = int(pop_score_element.get_text(strip=True).replace(',', ''))

            # --- Star data ---
            star_data = {}
            for idx, actor_tag in enumerate(movie_soup.select('a[data-testid="title-cast-item__actor"]')[:3]):  
                actor_name = actor_tag.text.strip()
                actor_url = "https://www.imdb.com" + actor_tag["href"]

                # Assign the name and URL
                star_data[f"star_{idx + 1}"] = actor_name
                star_data[f"star_{idx + 1}_link"] = actor_url

            # --- Awards data ---
            awards_section = movie_soup.find('div', {'data-testid': 'awards'})
            oscar_wins = oscar_nominations = 0

            if awards_section:
                awards_text = awards_section.get_text(strip=True)
                win_match = re.search(r'Won (\d+)', awards_text)
                nom_match = re.search(r'Nominated for (\d+)', awards_text)

                if win_match:
                    oscar_wins = int(win_match.group(1))
                if nom_match:
                    oscar_nominations = int(nom_match.group(1))

            # Store movie details
            movie_details = {
                'title': movie_title,
                'popularity_score': pop_score,
                'oscar_wins': oscar_wins,
                'oscar_nominations': oscar_nominations
            }

            movie_details.update(star_data)
            movie_details_list.append(movie_details)

            # Randomized delay between requests (2-5 seconds)
            time.sleep(random.uniform(2, 5))

        except Exception as e:
            print(f"Error fetching details for {movie_title}: {e}")

    return pd.DataFrame(movie_details_list)


In [28]:
# Scrape for movie details 
print("Ready to scrape for movie details!")

# Search using the top 250 movies
movie_details = top_movies_df[['title', 'movie_url']]

# Fetch the movie details using the scraping function
movie_details_df = fetch_movie_details_soup(movie_details)

# Display the first few resulting rows to check our work
print(movie_details_df.head())

Ready to scrape for movie details!
Fetching details for: The Shawshank Redemption
Fetching details for: The Godfather
Fetching details for: The Dark Knight
Fetching details for: The Godfather Part II
Fetching details for: 12 Angry Men
Fetching details for: The Lord of the Rings: The Return of the King
Fetching details for: Schindler&apos;s List
Fetching details for: Pulp Fiction
Fetching details for: The Lord of the Rings: The Fellowship of the Ring
Fetching details for: Il buono, il brutto, il cattivo
Fetching details for: Forrest Gump
Fetching details for: The Lord of the Rings: The Two Towers
Fetching details for: Fight Club
Fetching details for: Inception
Fetching details for: Star Wars: Episode V - The Empire Strikes Back
Fetching details for: The Matrix
Fetching details for: GoodFellas
Fetching details for: One Flew Over the Cuckoo&apos;s Nest
Fetching details for: Interstellar
Fetching details for: Se7en
Fetching details for: It&apos;s a Wonderful Life
Fetching details for: Shic

Fetching details for: Mr. Smith Goes to Washington
Fetching details for: Maharaja
Fetching details for: Smultronstället
Fetching details for: The Third Man
Fetching details for: Logan
Fetching details for: Rocky
Fetching details for: Tôkyô monogatari
Fetching details for: Kimetsu no Yaiba: Tsuzumi Yashiki-hen
Fetching details for: The Big Lebowski
Fetching details for: Spotlight
Fetching details for: Det sjunde inseglet
Fetching details for: The Terminator
Fetching details for: Room
Fetching details for: Pirates of the Caribbean: The Curse of the Black Pearl
Fetching details for: Jai Bhim
Fetching details for: Hotel Rwanda
Fetching details for: Platoon
Fetching details for: La haine
Fetching details for: Before Sunset
Fetching details for: The Best Years of Our Lives
Fetching details for: La passion de Jeanne d&apos;Arc
Fetching details for: The Exorcist
Fetching details for: The Wizard of Oz
Fetching details for: The Incredibles
Fetching details for: Rush
Fetching details for: The Sou

## Collecting the Movie Data

In [None]:
len(movie_details_df)

In [None]:
movie_details_df.head(10)

In [None]:
movie_details_df.info()

In [None]:
movie_details_df.isna().sum()

In [None]:
movie_details_df.duplicated().sum()

In [None]:
# Merge the dataframes 
combined_movie_details = pd.merge(top_movies_df, movie_details_df, on='title', how='left')

In [None]:
combined_movie_details.info()

# Combining the data from the Top 250 movies with the Extracted Movie Details

In [None]:
combined_movie_details.head(100)d

In [None]:
combined_movie_details.duplicated().sum()

In [None]:
# Save the movies dataframe as a .csv
combined_movie_details.to_csv('movie_details_8FEB.csv', index=False)

# Then create a download link for that file
display(FileLink('movie_details_8FEB.csv'))