
## Scraping IMDB website at a time to fetch top 250 movies data sorted by Rating.

* Mounts Google Drive

In [14]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


* Importing all necessary libraries:

In [None]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from time import sleep
from random import randint


* Movie Data Scraping and Storage Initialization

In [None]:
#Declaring the headers
headers = {"Accept-Language": "en-US,en;q=0.5"}

#declaring the list of empty variables, So that we can append the data overall
movie_name = []
year = []
time=[]
rating=[]
metascore =[]
votes = []
description = []

* Page Range Initialization for Movie Scraping: 1 to 250

In [None]:
#creating an array of values and passing it in the url for dynamic webpages
pages = np.arange(1,5)

### collect and parse the information using requests and BeautifulSoup
* Iterates through a list of pages, sends HTTP requests to IMDb's top movie listings, and extracts information such as movie name, year of release, runtime, rating, Metascore, votes, and description. The script uses BeautifulSoup for web scraping, random delays to avoid overwhelming the server, and headers to mimic a web browser. Extracted data is appended to respective lists, and at the end, a Pandas DataFrame is created to organize the movie information for further analysis or presentation.


In [None]:
# Iterate through pages
for page in pages:
    # Set user-agent header to mimic a web browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    }

    print(page)

    # Make a request to IMDb
    page_response = requests.get("https://www.imdb.com/search/title/?sort=user_rating,desc&groups=top_1000&count=250", headers=headers)

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(page_response.text, 'html.parser')

    # Find movie data containers
    movie_data = soup.findAll('div', {'class': 'sc-b189961a-0 iqHBGn'})

    # Pause to avoid overwhelming the server
     #sleep(randint(2, 8))

    # Print request status code for debugging
    # print("Request status code:", page_response.status_code)

    # Iterate through movie data
    for store in movie_data:
        # Extract information
        name = store.find('h3', class_='ipc-title__text').text
        movie_name.append(name)

        year_of_release = store.find('span', class_='sc-b189961a-8 hCbzGp dli-title-metadata-item').text
        year.append(year_of_release)

        runtime = store.findAll('span', class_='sc-b189961a-8 hCbzGp dli-title-metadata-item')[1].text
        time.append(runtime)

        # Rating element gracefully
        rate = store.find('span', class_="ipc-rating-star ipc-rating-star--base ipc-rating-star--imdb ratingGroup--imdb-rating").text.replace('\n', '').split('\xa0')[0]
        rating.append(rate)

        meta = store.find('span', class_="sc-b0901df4-0 bcQdDJ metacritic-score-box").text if store.find('span', class_="sc-b0901df4-0 bcQdDJ metacritic-score-box") else "*****"
        metascore.append(meta)

        # Extracting the votes
        value = store.findNext("span", class_="ipc-rating-star--voteCount").text.split('Votes')[-1]
        votes.append(value)

        # Extracting the movie description
        describe = store.findNext('div', class_='ipc-html-content-inner-div').text
        description_ = describe.replace('\n', '') if len(describe) > 1 else '*****'
        description.append(description_)


1
2
3
4


* Creating and Displaying Movie Data DataFrame


In [None]:
# Creating a Pandas DataFrame to store movie data with columns for Movie Name, Year of Release, Watch Time, Movie Rating, Metascore of Movie, Votes, and Description
movie_list = pd.DataFrame({
    "Movie Name": movie_name,
    "Year of Release": year,
    "Watch Time": time,
    "Movie Rating": rating,
    "Meatscore of movie": metascore,
    "Votes": votes,
    "Description": description
})

# Displaying the movie list DataFrame
movie_list


Unnamed: 0,Movie Name,Year of Release,Watch Time,Movie Rating,Meatscore of movie,Votes,Description
0,1. The Shawshank Redemption,1994,2h 22m,9.3,*****,(2.9M),A banker convicted of uxoricide forms a friend...
1,2. The Godfather,1972,2h 55m,9.2,*****,(2M),The aging patriarch of an organized crime dyna...
2,3. The Dark Knight,2008,2h 32m,9.0,*****,(2.9M),When the menace known as the Joker wreaks havo...
3,4. The Lord of the Rings: The Return of the King,2003,3h 21m,9.0,*****,(2M),Gandalf and Aragorn lead the World of Men agai...
4,5. Schindler's List,1993,3h 15m,9.0,*****,(1.5M),"In German-occupied Poland during World War II,..."
...,...,...,...,...,...,...,...
95,21. Jai Bhim,2021,2h 44m,8.7,*****,(220K),When a tribal man is arrested for a case of al...
96,22. 777 Charlie,2022,2h 44m,8.7,*****,(41K),Dharma is stuck in a rut with his negative and...
97,23. Soorarai Pottru,2020,2h 33m,8.7,*****,(125K),"Nedumaaran Rajangam ""Maara"" sets out to make t..."
98,24. Rocketry: The Nambi Effect,2022,2h 37m,8.7,*****,(60K),The story of Indian Space Research Organizatio...


* Displaying the last three rows of the Movie Data DataFrame

In [None]:
movie_list[-3:]

Unnamed: 0,Movie Name,Year of Release,Watch Time,Movie Rating,Meatscore of movie,Votes,Description
97,23. Soorarai Pottru,2020,2h 33m,8.7,*****,(125K),"Nedumaaran Rajangam ""Maara"" sets out to make t..."
98,24. Rocketry: The Nambi Effect,2022,2h 37m,8.7,*****,(60K),The story of Indian Space Research Organizatio...
99,25. Maharaja,2024,2h 21m,8.6,*****,(44K),A barber seeks vengeance after his home is bur...


* Checking the data type of the Movie Data DataFrame

In [None]:
type(movie_list)

* Saving Movie Data DataFrame to Excel: IMDB_Movie_Ratings.xlsx

In [None]:
# Exporting them to excel
movie_list.to_excel('/content/drive/MyDrive/Colab Notebooks/IMDB_Movie_Ratings.xlsx', header=True, index=False)

In [None]:
# Getting the first 25 movies
top=movie_list.iloc[0:25]
#print(top)
print(top.shape)

(25, 7)


In [None]:
print(top.head)

<bound method NDFrame.head of                                            Movie Name Year of Release  \
0                         1. The Shawshank Redemption            1994   
1                                    2. The Godfather            1972   
2                                  3. The Dark Knight            2008   
3    4. The Lord of the Rings: The Return of the King            2003   
4                                 5. Schindler's List            1993   
5                                     6. 12 Angry Men            1957   
6                            7. The Godfather Part II            1974   
7   8. The Lord of the Rings: The Fellowship of th...            2001   
8                                     9. Pulp Fiction            1994   
9                                       10. 12th Fail            2023   
10                                     11. Fight Club            1999   
11                                      12. Inception            2010   
12                   

In [None]:
# Turning the values to numeric
top[['Movie Rating']]=top[['Movie Rating']].astype(float)


# Calculating the mean
mean_rating = top['Movie Rating'].mean()
print('Average Mean Rating:',mean_rating)


Average Mean Rating: 8.844


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top[['Movie Rating']]=top[['Movie Rating']].astype(float)


In [None]:
#Turning the values to numeric
top=top[['Year of Release']].astype(float)

#Finding the oldest movie
old=top['Year of Release'].max()
young=top['Year of Release'].min()
print('oldest movie:',old)
print('Newest Movie:', young)

oldest movie: 2024.0
Newest Movie: 1957.0
