In [1]:
import datetime
import re
import warnings

import bs4
import numpy as np
import pandas as pd
import requests

# Notebook display setting: frames longer than 50 rows are rendered in truncated form.
pd.set_option('display.max_rows', 50)

In [2]:
# Base URL of the Box Office Mojo daily chart pages; a "YYYY-MM-DD" date
# string is appended to it to address a single day.
base = "https://www.boxofficemojo.com/date/"


In [3]:
# Take a single day as an example to explore the page structure.
date = "2020-02-02"
url = base + date
# Time out instead of hanging forever on a stalled connection, and fail
# loudly on an HTTP error instead of parsing an error page.
r = requests.get(url, timeout=30)
r.raise_for_status()
soup = bs4.BeautifulSoup(r.content, "html.parser")
# All <tr> rows of the daily chart table (the loops below skip movies[0]).
movies = soup.find("div", class_="a-section imdb-scroll-table-inner").find_all("tr")

### Exploring the website

In [4]:
# Title: first child of the first "a-link-normal" link in the row
title_link = movies[1].find("a", class_="a-link-normal")
title_link.contents[0]

'Bad Boys for Life'

In [5]:
# Daily gross: first money cell of the row, with "," and "$" removed
money_cells = movies[1].find_all("td", class_="a-text-right mojo-field-type-money mojo-estimatable")
daily_text = money_cells[0].contents[0]
daily_text.replace(",","").replace("$","")

'2823509'

In [6]:
# Theaters: first estimatable positive-integer cell, thousands separator removed
theater_cells = movies[1].find_all("td", class_="a-text-right mojo-field-type-positive_integer mojo-estimatable")
theaters_text = theater_cells[0].contents[0]
theaters_text.replace(",","")

'3705'

In [7]:
# Accumulated gross: third money cell of the row, with "," and "$" removed
money_cells = movies[1].find_all("td", class_="a-text-right mojo-field-type-money mojo-estimatable")
gross_text = money_cells[2].contents[0]
gross_text.replace(",","").replace("$","")

'148059490'

In [8]:
# Days in release: second cell matched by the plain positive-integer class string
integer_cells = movies[1].find_all("td", class_="a-text-right mojo-field-type-positive_integer")
integer_cells[1].contents[0]

'17'

In [9]:
# Distributor: text of the link inside the release-studios cell
studio_cell = movies[1].find_all("td", class_="a-text-left mojo-field-type-release_studios")[0]
studio_link = studio_cell.find("a", class_="a-link-normal")
studio_link.contents[0]

'Sony Pictures Releasing'

In [10]:
# Rank: first child of the rank cell of the row
rank_cells = movies[1].find_all("td", class_="a-text-right mojo-header-column mojo-truncate mojo-field-type-rank mojo-sort-column")
rank_cells[0].contents[0]

'1'

### Range of dates to extract

In [11]:
# Small test range: the first two days of January 2018 (both ends inclusive).
date1 = '2018-01-01'
date2 = '2018-01-02'
mydates = pd.date_range(date1, date2)
# Format every timestamp as "YYYY-MM-DD", the form used in the page URLs.
mydates_list = mydates.strftime("%Y-%m-%d").tolist()


### Extract data from Box Office Mojo

In [12]:
base = "https://www.boxofficemojo.com/date/"

# Column order of the resulting frame; every value is kept as a string, as scraped.
columns = ["date", "rank", "title", "daily", "theaters", "gross", "days", "distributor"]

# Collect one dict per movie and build the frame once at the end.
# DataFrame.append was removed in pandas 2.0, and calling it per row copied
# the whole frame each time.
records = []

# Fetch the daily chart for every date in mydates_list.
for date in mydates_list:
    url = base + str(date)
    # Time out instead of hanging forever; raise on an HTTP error page.
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    soup = bs4.BeautifulSoup(r.content, "html.parser")
    table = soup.find("div", class_="a-section imdb-scroll-table-inner")
    if table is None:
        # No chart table on this page: report it and move on to the next date
        # rather than failing with an AttributeError on None.
        warnings.warn("no daily table found at {}; skipping {}".format(url, date))
        continue
    movies = table.find_all("tr")
    # movies[0] is skipped; only the remaining rows are parsed as movies.
    for movie in movies[1:]:
        # The money cells hold both the daily amount ([0]) and the accumulated
        # gross ([2]); look them up once per row.
        money_cells = movie.find_all("td", class_="a-text-right mojo-field-type-money mojo-estimatable")
        studio_link = movie.find_all("td", class_="a-text-left mojo-field-type-release_studios")[0].find("a", class_="a-link-normal")

        rank = str(movie.find_all("td", class_="a-text-right mojo-header-column mojo-truncate mojo-field-type-rank mojo-sort-column")[0].contents[0])
        title = str(movie.find("a", class_="a-link-normal").contents[0])
        daily = str(money_cells[0].contents[0]).replace(",", "").replace("$", "")
        theaters = str(movie.find_all("td", class_="a-text-right mojo-field-type-positive_integer mojo-estimatable")[0].contents[0]).replace(",", "")
        gross = str(money_cells[2].contents[0]).replace(",", "").replace("$", "")
        days = str(movie.find_all("td", class_="a-text-right mojo-field-type-positive_integer")[1].contents[0])
        # Rows without a studio link get "-" as a placeholder distributor.
        distributor = "-" if studio_link is None else str(studio_link.contents[0])

        records.append({"date": str(date), "rank": rank, "title": title, "daily": daily, "theaters": theaters, "gross": gross, "days": days, "distributor": distributor})

# Passing `columns` keeps the expected header even when nothing was scraped.
df = pd.DataFrame(records, columns=columns)

In [13]:
# Check data: display the scraped frame (one row per movie per date).
df

Unnamed: 0,date,rank,title,daily,theaters,gross,days,distributor
0,2018-01-01,1,Jumanji: Welcome to the Jungle,16222389,3765,185224946,13,Sony Pictures Releasing
1,2018-01-01,2,Star Wars: Episode VIII - The Last Jedi,14293461,4232,531511829,18,Walt Disney Studios Motion Pictures
2,2018-01-01,3,The Greatest Showman,5385815,3316,54422533,13,Twentieth Century Fox
3,2018-01-01,4,Pitch Perfect 3,4865865,3468,68166470,11,Universal Pictures
4,2018-01-01,5,Ferdinand,3465756,3337,57012473,18,Twentieth Century Fox
...,...,...,...,...,...,...,...,...
99,2018-01-02,38,The Breadwinner,1492,10,223841,47,GKIDS
100,2018-01-02,39,Marshall,1449,50,9472608,82,Open Road Films (II)
101,2018-01-02,40,Wonderstruck,1088,10,1057858,75,Roadside Attractions
102,2018-01-02,41,Let There Be Light,608,8,7209864,68,Atlas Distribution Company


In [14]:
%%time
# CSV export is disabled; uncomment the next line to save the scraped frame.
#df.to_csv("daily_bo_2019.csv")

CPU times: user 1 µs, sys: 0 ns, total: 1 µs
Wall time: 4.29 µs


### Create function

In [15]:
# Column order of the frame returned by get_daily_box_office.
_BOX_OFFICE_COLUMNS = ["date", "rank", "title", "daily", "theaters", "gross", "days", "distributor"]


def _strip_money(text):
    """Remove thousands separators and the dollar sign from a scraped amount."""
    return str(text).replace(",", "").replace("$", "")


def _parse_movie_row(movie, date):
    """Turn one ``<tr>`` of the daily chart into a dict of strings keyed by column name."""
    # The money cells hold both the daily amount ([0]) and the accumulated
    # gross ([2]); look them up once per row.
    money_cells = movie.find_all("td", class_="a-text-right mojo-field-type-money mojo-estimatable")
    rank_cell = movie.find_all("td", class_="a-text-right mojo-header-column mojo-truncate mojo-field-type-rank mojo-sort-column")[0]
    theaters_cell = movie.find_all("td", class_="a-text-right mojo-field-type-positive_integer mojo-estimatable")[0]
    days_cell = movie.find_all("td", class_="a-text-right mojo-field-type-positive_integer")[1]
    studio_cell = movie.find_all("td", class_="a-text-left mojo-field-type-release_studios")[0]
    studio_link = studio_cell.find("a", class_="a-link-normal")
    return {
        "date": str(date),
        "rank": str(rank_cell.contents[0]),
        "title": str(movie.find("a", class_="a-link-normal").contents[0]),
        "daily": _strip_money(money_cells[0].contents[0]),
        "theaters": str(theaters_cell.contents[0]).replace(",", ""),
        "gross": _strip_money(money_cells[2].contents[0]),
        "days": str(days_cell.contents[0]),
        # Rows without a studio link get "-" as a placeholder distributor.
        "distributor": "-" if studio_link is None else str(studio_link.contents[0]),
    }


def get_daily_box_office(date1, date2, timeout=30):
    """Scrape the Box Office Mojo daily chart for every day from date1 to date2.

    Parameters
    ----------
    date1, date2 : str or datetime-like
        Start and end of the range, passed to ``pd.date_range`` (both ends
        inclusive). If ``date1`` is after ``date2`` no page is requested.
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per movie per day with the columns ``date``, ``rank``,
        ``title``, ``daily``, ``theaters``, ``gross``, ``days`` and
        ``distributor``. All values are strings as scraped, with ``,`` and
        ``$`` removed from the amounts. The frame is empty, but keeps these
        columns, when nothing was scraped.

    Raises
    ------
    requests.HTTPError
        If a page responds with an HTTP error status.
    IndexError, AttributeError
        If a table row lacks one of the expected cells.

    Notes
    -----
    A page without the chart table is skipped with a warning.
    """
    base = "https://www.boxofficemojo.com/date/"

    # Collect plain dicts and build the frame once at the end.
    # DataFrame.append was removed in pandas 2.0, and calling it per row
    # copied the whole frame each time.
    records = []

    # One request per day; dates are formatted the way the site's URLs expect.
    for date in pd.date_range(date1, date2).strftime("%Y-%m-%d"):
        url = base + date
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
        soup = bs4.BeautifulSoup(r.content, "html.parser")
        table = soup.find("div", class_="a-section imdb-scroll-table-inner")
        if table is None:
            warnings.warn(
                "no daily table found at {}; skipping {}".format(url, date),
                stacklevel=2,
            )
            continue
        # The first <tr> is skipped; the remaining rows are parsed as movies.
        for movie in table.find_all("tr")[1:]:
            records.append(_parse_movie_row(movie, date))

    # Passing the column list keeps the header even when `records` is empty.
    return pd.DataFrame(records, columns=_BOX_OFFICE_COLUMNS)

In [16]:
%%time
# Scrape every day of January 2018; the cell magic above reports how long it takes.
get_daily_box_office(date1="2018-01-01", date2="2018-01-31")

CPU times: user 6.88 s, sys: 82.1 ms, total: 6.96 s
Wall time: 38.4 s


Unnamed: 0,date,rank,title,daily,theaters,gross,days,distributor
0,2018-01-01,1,Jumanji: Welcome to the Jungle,16222389,3765,185224946,13,Sony Pictures Releasing
1,2018-01-01,2,Star Wars: Episode VIII - The Last Jedi,14293461,4232,531511829,18,Walt Disney Studios Motion Pictures
2,2018-01-01,3,The Greatest Showman,5385815,3316,54422533,13,Twentieth Century Fox
3,2018-01-01,4,Pitch Perfect 3,4865865,3468,68166470,11,Universal Pictures
4,2018-01-01,5,Ferdinand,3465756,3337,57012473,18,Twentieth Century Fox
...,...,...,...,...,...,...,...,...
1484,2018-01-31,47,Only the Brave,899,26,18342947,104,Sony Pictures Releasing
1485,2018-01-31,48,Wonder Wheel,136,5,1403991,62,Amazon Studios
1486,2018-01-31,49,On the Beach at Night Alone,126,1,34139,76,The Cinema Guild
1487,2018-01-31,50,Parchi,48,1,67620,20,Atlas Distribution Company
