# Final - IMDB

In [46]:
import pandas as pd
import requests
import json
from bs4 import BeautifulSoup

# Step 1: Define the IMDb URL and headers for the request
URL = "https://www.imdb.com/chart/top/"
# A browser-like User-Agent header is sent along with the request.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

# Step 2: Make a request to IMDb and parse the response
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors (4xx/5xx) right here instead of
# letting an error page be parsed into an empty result further down.
response = requests.get(URL, headers=HEADERS, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# Step 3: Container for the final per-movie rows
movie_data = []

# Step 4: Locate the two script tags that carry JSON payloads
ratings_script_tag = soup.find('script', type='application/ld+json')
script_tag = soup.find('script', id='__NEXT_DATA__')

# Step 5: Collect the ratings count and content rating of every listed movie.
# Both lists are built in the order the entries appear in 'itemListElement'
# and stay empty when the ld+json script tag is not found.
ratings_count = []
content_ratings = []

if ratings_script_tag:
    ratings_data = json.loads(ratings_script_tag.string)
    listed_items = [entry['item'] for entry in ratings_data.get('itemListElement', [])]
    ratings_count = [
        item.get('aggregateRating', {}).get('ratingCount', "N/A")
        for item in listed_items
    ]
    content_ratings = [
        item.get('contentRating', 'No Rating')
        for item in listed_items
    ]

# Step 6: Extract movie details from the main JSON data
def _format_duration(seconds):
    """Return a runtime given in seconds as 'Xh Ym', or 'N/A' when unavailable.

    A missing runtime (None), zero, or anything shorter than one full minute
    yields "N/A".
    """
    minutes = seconds // 60 if seconds else 0
    if not minutes:
        return "N/A"
    return f"{minutes // 60}h {minutes % 60}m"


if script_tag:
    data = json.loads(script_tag.string)
    movies = data['props']['pageProps']['pageData']['chartTitles']['edges']

    for idx, movie in enumerate(movies):
        try:
            # Extract movie information
            movie_info = movie['node']
            title = movie_info['titleText']['text']
            release_year = movie_info['releaseYear']['year']
            rating = movie_info['ratingsSummary']['aggregateRating']
            genres = [genre['genre']['text'] for genre in movie_info['titleGenres']['genres']]
            plot = movie_info['plot']['plotText']['plainText']

            # Format runtime. The lookup tolerates a missing or null 'runtime'
            # entry so such a movie is kept with an "N/A" duration instead of
            # being dropped by the except branch below.
            runtime_info = movie_info.get('runtime') or {}
            duration = _format_duration(runtime_info.get('seconds'))

            # Format genres
            genre_str = ', '.join(genres)

            # Add ratings count and content rating. These come from the
            # ld+json lists and are matched to this movie purely by position,
            # with defaults when those lists are shorter than the chart.
            movie_ratings_count = ratings_count[idx] if idx < len(ratings_count) else "N/A"
            movie_content_rating = content_ratings[idx] if idx < len(content_ratings) else "No Rating"

            # Append movie data to the list
            movie_data.append([
                idx + 1,               # Rank
                title,                 # Movie Name
                release_year,          # Release Year
                duration,              # Duration
                genre_str,             # Genres
                rating,                # Rating
                movie_ratings_count,   # Ratings Count
                movie_content_rating,  # Content Rating
                plot                   # Description
            ])
        except (KeyError, TypeError, AttributeError) as e:
            # Only lookup failures on a malformed entry are treated as
            # "skip this movie"; any other exception is a real bug and
            # is allowed to propagate.
            print(f"Error parsing movie {idx + 1}: {e}")

# Step 7: Build a DataFrame from the collected rows and write it to a CSV file
csv_file_path = "D://Study//Parul University//PUStudy//4th sem//#Data Science With Python//@Projects//IMDB//FIMDB.csv"
columns = [
    'Rank',
    'Movie_Name',
    'Release_Year',
    'Duration',
    'Genres',
    'Rating',
    'Ratings_Count',
    'Content_Rating',
    'Description',
]
df = pd.DataFrame(data=movie_data, columns=columns)
# index=False keeps the DataFrame's row index out of the file.
df.to_csv(csv_file_path, index=False)

# Step 8: Load the CSV back from disk and display the resulting DataFrame
df_read = pd.read_csv(csv_file_path)
df_read

Unnamed: 0,Rank,Movie_Name,Release_Year,Duration,Genres,Rating,Ratings_Count,Content_Rating,Description
0,1,The Shawshank Redemption,1994,2h 22m,Drama,9.3,2975064,A,A banker convicted of uxoricide forms a friend...
1,2,The Godfather,1972,2h 55m,"Crime, Drama",9.2,2075430,A,The aging patriarch of an organized crime dyna...
2,3,The Dark Knight,2008,2h 32m,"Action, Crime, Drama",9.0,2955864,UA,When a menace known as the Joker wreaks havoc ...
3,4,The Godfather: Part II,1974,3h 22m,"Crime, Drama",9.0,1400487,A,The early life and career of Vito Corleone in ...
4,5,12 Angry Men,1957,1h 36m,"Crime, Drama",9.0,898138,U,The jury in a New York City murder trial is fr...
...,...,...,...,...,...,...,...,...,...
245,246,Amores perros,2000,2h 34m,"Drama, Thriller",8.0,257317,A,"An amateur dog fighter, a supermodel, and a de..."
246,247,Rebecca,1940,2h 10m,"Drama, Mystery, Romance",8.1,150176,Approved,A self-conscious woman juggles adjusting to he...
247,248,The Help,2011,2h 26m,Drama,8.1,504221,UA,An aspiring author during the civil rights mov...
248,249,Eiga Koe No Katachi,2016,2h 10m,"Animation, Drama",8.1,110616,16,"A deaf girl, Shoko, is bullied by the popular ..."


In [47]:
# Preview the first five rows of the data read back from the CSV file
df_read.head()

Unnamed: 0,Rank,Movie_Name,Release_Year,Duration,Genres,Rating,Ratings_Count,Content_Rating,Description
0,1,The Shawshank Redemption,1994,2h 22m,Drama,9.3,2975064,A,A banker convicted of uxoricide forms a friend...
1,2,The Godfather,1972,2h 55m,"Crime, Drama",9.2,2075430,A,The aging patriarch of an organized crime dyna...
2,3,The Dark Knight,2008,2h 32m,"Action, Crime, Drama",9.0,2955864,UA,When a menace known as the Joker wreaks havoc ...
3,4,The Godfather: Part II,1974,3h 22m,"Crime, Drama",9.0,1400487,A,The early life and career of Vito Corleone in ...
4,5,12 Angry Men,1957,1h 36m,"Crime, Drama",9.0,898138,U,The jury in a New York City murder trial is fr...


In [48]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,Rank,Release_Year,Rating,Ratings_Count
count,250.0,250.0,250.0,250.0
mean,125.5,1987.968,8.3116,715459.3
std,72.312977,25.560837,0.235387,577794.1
min,1.0,1921.0,8.0,26716.0
25%,63.25,1968.75,8.1,237550.0
50%,125.5,1995.0,8.2,580014.5
75%,187.75,2008.0,8.4,1052326.0
max,250.0,2024.0,9.3,2975064.0


In [49]:
# Boolean mask of the DataFrame: True wherever a value is missing
df.isnull()

Unnamed: 0,Rank,Movie_Name,Release_Year,Duration,Genres,Rating,Ratings_Count,Content_Rating,Description
0,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...
245,False,False,False,False,False,False,False,False,False
246,False,False,False,False,False,False,False,False,False
247,False,False,False,False,False,False,False,False,False
248,False,False,False,False,False,False,False,False,False


In [50]:
# Number of missing values in each column
df.isnull().sum()

Rank              0
Movie_Name        0
Release_Year      0
Duration          0
Genres            0
Rating            0
Ratings_Count     0
Content_Rating    0
Description       0
dtype: int64

In [51]:
# Remove rows that exactly repeat an earlier row
df = df.drop_duplicates()

# Confirm that no row is flagged as a duplicate any more
remaining_duplicates = df.duplicated().sum()
print(f"Total duplicates: {remaining_duplicates}")


Total duplicates: 0
