In [1]:
pip install fake_useragent

Note: you may need to restart the kernel to use updated packages.


In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from fake_useragent import UserAgent

In [5]:
# Sanity check for fake_useragent: create the generator once (it is reused for
# the real request further down) and display one randomly chosen User-Agent string.

ua = UserAgent()
ua.random

'Mozilla/5.0 (iPhone; CPU iPhone OS 16_3_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3 Mobile/15E148 Safari/604.1'

In [7]:
# IMDb box office chart page that the rest of the notebook scrapes.
url = "https://www.imdb.com/chart/boxoffice/?ref_=nv_ch_cht"

# Send a randomly chosen User-Agent string as the request's User-Agent header.
userAgent = ua.random
headers = {"User-Agent": userAgent}

# Without a timeout, requests waits indefinitely on a stalled connection.
response = requests.get(url, headers = headers, timeout = 10)

# Stop here on a 4xx/5xx reply instead of silently parsing an error page below.
response.raise_for_status()


In [8]:
# Parse the raw response body with Python's built-in "html.parser" backend.
soup = BeautifulSoup(response.content, "html.parser")

In [9]:
# Peek at the flattened text of the first summary-item <div> to see which fields it carries.
soup.find("div", class_ = "ipc-metadata-list-summary-item__c").get_text()

'SinnersWeekend Gross: $48MTotal Gross: $78MWeeks Released: 18.2\xa0(43K)RateMark as watched'

In [10]:
# Collect every <div> carrying the summary-item class (the first one was inspected above).
movie_box = soup.find_all("div", attrs = {"class": "ipc-metadata-list-summary-item__c"})


In [11]:
def _text_or_placeholder(box, tag, placeholder = "None", **find_kwargs):
    """Return the text of the first matching element inside ``box``.

    Parameters
    ----------
    box : element to search in (anything exposing a BeautifulSoup-style ``find``).
    tag : tag name to look for, e.g. ``"h3"``.
    placeholder : value returned when nothing matches; defaults to the string ``"None"``.
    **find_kwargs : forwarded unchanged to ``box.find`` (``class_``, ``attrs``, ...).

    Returns
    -------
    str
        ``.text`` of the first match, or ``placeholder`` when ``find`` returns ``None``.
    """
    node = box.find(tag, **find_kwargs)   # single lookup, reused for both the check and the value
    return placeholder if node is None else node.text


titles = []
numbers = []
ratings = []

# One entry per box in every list, so the three lists always stay aligned row by row.
for box in movie_box:
    #titles
    titles.append(_text_or_placeholder(box, "h3", class_ = "ipc-title__text"))

    #numbers
    numbers.append(
        _text_or_placeholder(box, "ul", attrs = {"data-testid": 'title-metadata-box-office-data-container'})
    )

    #ratings
    rating_text = _text_or_placeholder(box, "span", attrs = {"data-testid": 'ratingGroup--imdb-rating'})
    # Remove the non-breaking space between score and vote count; a no-op on the placeholder.
    ratings.append(rating_text.replace("\xa0", ""))

ratings

['8.2(43K)',
 '5.9(72K)',
 '7.5(3.6K)',
 '6.7(15K)',
 '7.9(13K)',
 '6.5(7.7K)',
 '6.4(745)',
 '7.8(351K)',
 '7.8(153)',
 '1.6(346K)']

In [15]:
# Assemble the three parallel lists into one table: one row per box, one column per field.
movie_df = pd.DataFrame(dict(title = titles, number = numbers, rating = ratings))

movie_df

Unnamed: 0,title,number,rating
0,Sinners,Weekend Gross: $48MTotal Gross: $78MWeeks Rele...,8.2(43K)
1,A Minecraft Movie,Weekend Gross: $40MTotal Gross: $357MWeeks Rel...,5.9(72K)
2,The King of Kings,Weekend Gross: $18MTotal Gross: $50MWeeks Rele...,7.5(3.6K)
3,The Amateur,Weekend Gross: $7MTotal Gross: $30MWeeks Relea...,6.7(15K)
4,Warfare,Weekend Gross: $4.6MTotal Gross: $19MWeeks Rel...,7.9(13K)
5,Drop,Weekend Gross: $3.3MTotal Gross: $15MWeeks Rel...,6.5(7.7K)
6,Gekijô-ban Purojekuto Sekai Kowareta Sekai to ...,Weekend Gross: $2.8MTotal Gross: $3MWeeks Rele...,6.4(745)
7,Pride & Prejudice,Weekend Gross: $2.8MTotal Gross: $43MWeeks Rel...,7.8(351K)
8,Episode #5.6,Weekend Gross: $1.5MTotal Gross: $3.8MWeeks Re...,7.8(153)
9,Snow White,Weekend Gross: $1.2MTotal Gross: $85MWeeks Rel...,1.6(346K)


In [16]:
# Look at one raw "number" string (row label 0) to see how the three fields are glued together.
movie_df.loc[0, "number"]

'Weekend Gross: $48MTotal Gross: $78MWeeks Released: 1'

In [18]:
# Remove the three field labels from "number"; the 2nd and 3rd labels are turned
# into "|" so the remaining values stay separated, e.g. "$48M|$78M|1".
movie_df["number"] = (
    movie_df["number"]
    .str.replace("Weekend Gross: ", "")
    .str.replace("Total Gross: ", "|")
    .str.replace("Weeks Released: ", "|")
)

movie_df

Unnamed: 0,title,number,rating
0,Sinners,$48M|$78M|1,8.2(43K)
1,A Minecraft Movie,$40M|$357M|3,5.9(72K)
2,The King of Kings,$18M|$50M|2,7.5(3.6K)
3,The Amateur,$7M|$30M|2,6.7(15K)
4,Warfare,$4.6M|$19M|2,7.9(13K)
5,Drop,$3.3M|$15M|2,6.5(7.7K)
6,Gekijô-ban Purojekuto Sekai Kowareta Sekai to ...,$2.8M|$3M|1,6.4(745)
7,Pride & Prejudice,$2.8M|$43M|1,7.8(351K)
8,Episode #5.6,$1.5M|$3.8M|2,7.8(153)
9,Snow White,$1.2M|$85M|5,1.6(346K)


In [19]:
# Break the "|"-delimited "number" string into its three fields, one new column each.
number_fields = movie_df["number"].str.split("|", expand = True)
movie_df[["weekend gross", "total gross", "weeks release"]] = number_fields
movie_df

Unnamed: 0,title,number,rating,weekend gross,total gross,weeks release
0,Sinners,$48M|$78M|1,8.2(43K),$48M,$78M,1
1,A Minecraft Movie,$40M|$357M|3,5.9(72K),$40M,$357M,3
2,The King of Kings,$18M|$50M|2,7.5(3.6K),$18M,$50M,2
3,The Amateur,$7M|$30M|2,6.7(15K),$7M,$30M,2
4,Warfare,$4.6M|$19M|2,7.9(13K),$4.6M,$19M,2
5,Drop,$3.3M|$15M|2,6.5(7.7K),$3.3M,$15M,2
6,Gekijô-ban Purojekuto Sekai Kowareta Sekai to ...,$2.8M|$3M|1,6.4(745),$2.8M,$3M,1
7,Pride & Prejudice,$2.8M|$43M|1,7.8(351K),$2.8M,$43M,1
8,Episode #5.6,$1.5M|$3.8M|2,7.8(153),$1.5M,$3.8M,2
9,Snow White,$1.2M|$85M|5,1.6(346K),$1.2M,$85M,5


In [20]:
# Ratings look like "8.2(43K)": the part before "(" stays in "rating",
# the remainder (still ending in ")") goes to a new "vote count" column.
rating_fields = movie_df["rating"].str.split("(", expand = True)
movie_df[["rating", "vote count"]] = rating_fields
movie_df

Unnamed: 0,title,number,rating,weekend gross,total gross,weeks release,vote count
0,Sinners,$48M|$78M|1,8.2,$48M,$78M,1,43K)
1,A Minecraft Movie,$40M|$357M|3,5.9,$40M,$357M,3,72K)
2,The King of Kings,$18M|$50M|2,7.5,$18M,$50M,2,3.6K)
3,The Amateur,$7M|$30M|2,6.7,$7M,$30M,2,15K)
4,Warfare,$4.6M|$19M|2,7.9,$4.6M,$19M,2,13K)
5,Drop,$3.3M|$15M|2,6.5,$3.3M,$15M,2,7.7K)
6,Gekijô-ban Purojekuto Sekai Kowareta Sekai to ...,$2.8M|$3M|1,6.4,$2.8M,$3M,1,745)
7,Pride & Prejudice,$2.8M|$43M|1,7.8,$2.8M,$43M,1,351K)
8,Episode #5.6,$1.5M|$3.8M|2,7.8,$1.5M,$3.8M,2,153)
9,Snow White,$1.2M|$85M|5,1.6,$1.2M,$85M,5,346K)


In [21]:
# Drop the trailing ")" left over from splitting the rating on "(".
# ")" is a regex metacharacter, so request literal matching explicitly: the call
# then behaves the same (and stays warning-free) whatever the pandas default for
# `regex` happens to be.
movie_df["vote count"] = movie_df["vote count"].str.replace(")", "", regex = False)
movie_df

  movie_df["vote count"] = movie_df["vote count"].str.replace(")", "")


Unnamed: 0,title,number,rating,weekend gross,total gross,weeks release,vote count
0,Sinners,$48M|$78M|1,8.2,$48M,$78M,1,43K
1,A Minecraft Movie,$40M|$357M|3,5.9,$40M,$357M,3,72K
2,The King of Kings,$18M|$50M|2,7.5,$18M,$50M,2,3.6K
3,The Amateur,$7M|$30M|2,6.7,$7M,$30M,2,15K
4,Warfare,$4.6M|$19M|2,7.9,$4.6M,$19M,2,13K
5,Drop,$3.3M|$15M|2,6.5,$3.3M,$15M,2,7.7K
6,Gekijô-ban Purojekuto Sekai Kowareta Sekai to ...,$2.8M|$3M|1,6.4,$2.8M,$3M,1,745
7,Pride & Prejudice,$2.8M|$43M|1,7.8,$2.8M,$43M,1,351K
8,Episode #5.6,$1.5M|$3.8M|2,7.8,$1.5M,$3.8M,2,153
9,Snow White,$1.2M|$85M|5,1.6,$1.2M,$85M,5,346K


In [22]:
# The combined "number" column has already been split into separate columns, so remove it.
# Rebinding (rather than inplace = True) together with errors = "ignore" lets this
# cell be re-run without raising KeyError once the column is gone.
movie_df = movie_df.drop(columns = "number", errors = "ignore")
movie_df

Unnamed: 0,title,rating,weekend gross,total gross,weeks release,vote count
0,Sinners,8.2,$48M,$78M,1,43K
1,A Minecraft Movie,5.9,$40M,$357M,3,72K
2,The King of Kings,7.5,$18M,$50M,2,3.6K
3,The Amateur,6.7,$7M,$30M,2,15K
4,Warfare,7.9,$4.6M,$19M,2,13K
5,Drop,6.5,$3.3M,$15M,2,7.7K
6,Gekijô-ban Purojekuto Sekai Kowareta Sekai to ...,6.4,$2.8M,$3M,1,745
7,Pride & Prejudice,7.8,$2.8M,$43M,1,351K
8,Episode #5.6,7.8,$1.5M,$3.8M,2,153
9,Snow White,1.6,$1.2M,$85M,5,346K


In [23]:
# Write the cleaned table to movie_data.csv (path relative to the working directory);
# index = False keeps the row labels out of the file.
movie_df.to_csv("movie_data.csv", index = False)