In [1]:
# Import the libraries
import requests                  # For making HTTP requests
from bs4 import BeautifulSoup     # For parsing HTML content
import pandas as pd           # For creating, storing and manipulating DataFrames
import json

### Send Request

In [2]:
# Define the URL
url = "https://www.imdb.com/list/ls095374765/"

# Get the request
response= requests.get(url)

# Set headers to mimic a browser request
headers = {
     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36'
}

# Send an HTTP GET request to fetch the page content with headers
response = requests.get(url, headers=headers)

# Check if the request was successful
if response.status_code == 200:
    print("Successfully fetched the page!")
else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")

Successfully fetched the page!


### Parse the HTML Content

In [4]:
#Parse the HTML content using Beautiful Soup
soup = BeautifulSoup(response.content, 'html.parser')

#Print the Parsed HTML to verify
print(soup.prettify()[:2000]) #readable format

<!DOCTYPE html>
<html lang="en-US" xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://opengraphprotocol.org/schema/">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width" name="viewport"/>
  <script>
   if(typeof uet === 'function'){ uet('bb', 'LoadTitle', {wb: 1}); }
  </script>
  <script>
   window.addEventListener('load', (event) => {
        if (typeof window.csa !== 'undefined' && typeof window.csa === 'function') {
            var csaLatencyPlugin = window.csa('Content', {
                element: {
                    slotId: 'LoadTitle',
                    type: 'service-call'
                }
            });
            csaLatencyPlugin('mark', 'clickToBodyBegin', 1742318719679);
        }
    })
  </script>
  <title>
   Movie Reviews
  </title>
  <meta content="" data-id="main" name="description"/>
  <meta content="max-image-preview:large" name="robots"/>
  <script type="application/ld+json">
   {"@type":"ItemList","itemListElement":[{"@type":"L

### Extract the movie details

In [5]:
# Find the JSON-LD script tag
json_data = soup.find('script', type='application/ld+json')

if json_data:    # Parse the JSON data
    data = json.loads(json_data.string) 

In [9]:
# Extract titles and ratings
titles = [] 
descriptions = []
ratings = []
durations = []


Movies came out to be empty. In this situation, we need to try and identify how each element has been stored.

In [10]:
# Check if the data contains the expected structure
if 'itemListElement' in data:
    for item in data['itemListElement']:
        movie = item['item']
        
# Extract movie details
        titles.append(movie['name'])  # Movie name
        descriptions.append(movie['description'])  # Movie description
        ratings.append(float(movie['aggregateRating']['ratingValue'])) # Rating value
        durations.append(movie['duration']) # Movie duration

In [11]:
# Create a DataFrame to store the scraped data
df = pd.DataFrame({
    'Title': titles,
    'Description': descriptions,
    'Rating': ratings,
    'Duration': durations
 })

In [12]:
df.head()

Unnamed: 0,Title,Description,Rating,Duration
0,Aladdin,"Aladdin, a kind thief, woos Jasmine, the princ...",6.9,PT2H8M
1,It Chapter Two,Twenty-seven years after their first encounter...,6.5,PT2H49M
2,Joker,"Arthur Fleck, a party clown and a failed stand...",8.3,PT2H2M
3,Dolemite Is My Name,Eddie Murphy portrays real-life legend Rudy Ra...,7.2,PT1H58M
4,Anna,Beneath Anna Poliatova&apos;s striking beauty ...,6.7,PT1H58M


### Data Filtering and Cleaning

In [91]:
# Display basic info about the DataFrame
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Title        250 non-null    object 
 1   Description  250 non-null    object 
 2   Rating       250 non-null    float64
 3   Duration     250 non-null    object 
dtypes: float64(1), object(3)
memory usage: 7.9+ KB
None


In [92]:
# Check for null values
print("\nMissing Values:")
print(df.isnull().sum())


Missing Values:
Title          0
Description    0
Rating         0
Duration       0
dtype: int64


In [13]:
# Check for duplicates and remove them
print("\nDuplicate Rows Before:", df.duplicated().sum())
df.drop_duplicates(inplace=True)
print("Duplicate Rows After:", df.duplicated().sum())


Duplicate Rows Before: 1
Duplicate Rows After: 0


In [15]:
# Display some sample data
print("\nSample Data:")
print(df.sample(5))


Sample Data:
                        Title  \
47      Death Race 3: Inferno   
246  Batman: The Killing Joke   
197           Too Big to Fail   
199            Into the Storm   
217            V for Vendetta   

                                           Description  Rating Duration  
47   Convicted cop-killer Carl Lucas, a.k.a. Franke...     5.4  PT1H44M  
246  As Batman hunts for the escaped Joker, the Clo...     6.4  PT1H16M  
197  Chronicles the financial meltdown of 2008 and ...     7.2  PT1H39M  
199  Continuing the storyline of The Gathering Stor...     7.0  PT1H39M  
217  In a future British dystopian society, a shado...     8.1  PT2H12M  


In [16]:
# Save the DataFrame to a CSV file
df.to_csv('topIMBDmovies.csv', index=False) # Setting index=False

In [17]:
# Filter: Show only movies with a rating above 9
high_rated_movies = df[df['Rating']>7 ]
print("\nHighly Rated Movies:")
print(high_rated_movies)


Highly Rated Movies:
                                         Title  \
2                                        Joker   
3                          Dolemite Is My Name   
6                                     Deadwood   
7                          The Ron Clark Story   
8                           Colorado Territory   
..                                         ...   
217                             V for Vendetta   
218                                   Hercules   
220                                   The Crow   
231                   Deadpool &amp; Wolverine   
232  Shang-Chi and the Legend of the Ten Rings   

                                           Description  Rating Duration  
2    Arthur Fleck, a party clown and a failed stand...     8.3   PT2H2M  
3    Eddie Murphy portrays real-life legend Rudy Ra...     7.2  PT1H58M  
6    As the residents of Deadwood gather to commemo...     7.3  PT1H50M  
7    A small-town teacher relocates to one of the c...     7.6  PT1H30M  
8    Ou