In [31]:
# Import the libraries
import requests                  # For making HTTP requests
from bs4 import BeautifulSoup     # For parsing HTML content
import pandas as pd           # For creating, storing and manipulating DataFrames
import json


In [19]:
# Define the URL of the IMDb Top 250 movies page
url = 'https://www.imdb.com/chart/top/'

# Set headers to mimic a browser request
headers = {
    'User-Agent': 'Chrome/91.0.4472.124'
}

# Send an HTTP GET request to fetch the page content with headers.
# requests never times out unless told to, so an explicit timeout (seconds)
# keeps this cell from hanging forever if the server stops responding.
response = requests.get(url, headers=headers, timeout=30)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    print("Successfully fetched the page!")
else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")

Successfully fetched the page!


In [28]:
# Re-send the HTTP GET request, reusing the `url` and `headers` defined in an earlier cell.
# An explicit timeout (seconds) is passed because requests waits indefinitely by default,
# which would leave the cell hanging if the server stops responding.
response = requests.get(url, headers=headers, timeout=30)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    print("Successfully fetched the page!")
else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")

# Leave the response object as the cell's last expression so the notebook displays it
response

Successfully fetched the page!


<Response [200]>

### Parse the HTML

In [41]:
# Build a navigable parse tree from the raw response bytes with Python's built-in HTML parser
soup = BeautifulSoup(markup=response.content, features="html.parser")

# Sanity check: show only the first 2000 characters of the pretty-printed document
print(soup.prettify()[0:2000])

<!DOCTYPE html>
<html lang="en-US" xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://opengraphprotocol.org/schema/">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width" name="viewport"/>
  <script>
   if(typeof uet === 'function'){ uet('bb', 'LoadTitle', {wb: 1}); }
  </script>
  <script>
   window.addEventListener('load', (event) => {
        if (typeof window.csa !== 'undefined' && typeof window.csa === 'function') {
            var csaLatencyPlugin = window.csa('Content', {
                element: {
                    slotId: 'LoadTitle',
                    type: 'service-call'
                }
            });
            csaLatencyPlugin('mark', 'clickToBodyBegin', 1742237429722);
        }
    })
  </script>
  <title>
   IMDb Top 250 Movies
  </title>
  <meta content="As rated by regular IMDb voters." data-id="main" name="description"/>
  <meta content="max-image-preview:large" name="robots"/>
  <script type="application/ld+json">
   {"@type":"It

In [48]:
# Find the JSON-LD script tag (first <script type="application/ld+json"> in the page)
json_data = soup.find('script', type='application/ld+json')

# Always bind `data`, even when nothing usable was found. Otherwise the next cell
# would fail with a NameError on a fresh kernel, or silently reuse a stale `data`
# left over from an earlier run.
if json_data is not None and json_data.string:
    # Parse the JSON data embedded in the script tag
    data = json.loads(json_data.string)
else:
    # `.string` is None when the tag is empty or has several child nodes
    data = {}
    print("No usable JSON-LD script tag found on the page; nothing to extract.")

In [50]:
# Empty accumulator lists, one per output column; each name is bound to its own list.
titles, urls, descriptions = [], [], []             # basic movie details
best_ratings, worst_ratings, ratings = [], [], []   # rating scale bounds and the rating itself
genres, durations = [], []                          # genre and running time

In [51]:
# Fill the column lists from the parsed JSON-LD, one entry per list element.
# Expects `data` and the eight column lists to have been created by earlier cells.

# Empty the lists first so that re-running this cell does not append
# the same movies a second time.
for column in (titles, urls, descriptions, best_ratings,
               worst_ratings, ratings, genres, durations):
    column.clear()

# Check if the data contains the expected structure
if 'itemListElement' in data:
    for item in data['itemListElement']:
        movie = item['item']

        # Fall back to an empty dict so the rating lookups below yield None
        # when a movie carries no 'aggregateRating' entry.
        rating_info = movie.get('aggregateRating') or {}
        rating_value = rating_info.get('ratingValue')

        # Extract movie details. `.get` stores None for an absent field instead of
        # raising halfway through a movie, which would leave the lists with
        # different lengths and break the DataFrame construction later on.
        titles.append(movie.get('name'))  # Movie name
        urls.append(movie.get('url'))  # Movie URL
        descriptions.append(movie.get('description'))  # Movie description

        # Extract ratings (best, worst, actual rating)
        best_ratings.append(rating_info.get('bestRating'))  # Best rating
        worst_ratings.append(rating_info.get('worstRating'))  # Worst rating
        # Rating value, converted to float; None is kept when the value is absent
        ratings.append(float(rating_value) if rating_value is not None else None)

        genres.append(movie.get('genre'))  # Movie genre
        durations.append(movie.get('duration'))  # Movie duration

In [53]:
# Assemble the scraped lists into one DataFrame by pairing each column label
# with its list; columns appear in the order the labels are given.
df = pd.DataFrame(dict(zip(
    ['Title', 'URL', 'Description', 'Best Rating', 'Worst Rating', 'Rating', 'Genre', 'Duration'],
    [titles, urls, descriptions, best_ratings, worst_ratings, ratings, genres, durations],
)))

In [54]:
# Preview the first five rows of the scraped table
df.head(n=5)

Unnamed: 0,Title,URL,Description,Best Rating,Worst Rating,Rating,Genre,Duration
0,The Shawshank Redemption,https://www.imdb.com/title/tt0111161/,A banker convicted of uxoricide forms a friend...,10,1,9.3,Drama,PT2H22M
1,The Godfather,https://www.imdb.com/title/tt0068646/,The aging patriarch of an organized crime dyna...,10,1,9.2,"Crime, Drama",PT2H55M
2,The Dark Knight,https://www.imdb.com/title/tt0468569/,When a menace known as the Joker wreaks havoc ...,10,1,9.0,"Action, Crime, Drama",PT2H32M
3,The Godfather Part II,https://www.imdb.com/title/tt0071562/,The early life and career of Vito Corleone in ...,10,1,9.0,"Crime, Drama",PT3H22M
4,12 Angry Men,https://www.imdb.com/title/tt0050083/,The jury in a New York City murder trial is fr...,10,1,9.0,"Crime, Drama",PT1H36M
