In [1]:
import requests
import sys
import numpy as np
import time
import pandas as pd
from bs4 import BeautifulSoup
#time.sleep(1)

In [2]:
def get_links(year):
    """returns a list of link suffixes that connect to every movie in the
    boxofficemojo page for the given year"""
    page = requests.get("https://www.boxofficemojo.com/year/{}/".format(year))
    soup = BeautifulSoup(page.content, 'html.parser')
    if page.status_code >=400:
        return [page.status_code,"https://www.boxofficemojo.com/year/{}/".format(year)]
    links = soup.select(".a-text-left.mojo-field-type-release.mojo-cell-wide a")
    return links


In [3]:
def cash_str_to_int(string):
    #takes string containing a number with a symbol at the front and returns the int
    return int(string[1:].replace(",",""))

In [4]:
def make_movie_dict(links):
    """accepts a list of link suffixes and returns a dictionary that
    has a key for each link. The value of each item is also a dictionary
    containing keys
                    domestic, international, worldwide, budget, genres, and title,
    with types      float     float          float      float   list[strs]   str
    
    if an int is missing its replaced with none and if genres is missing it is
    replaced with an empty list
    """
    movie_dict = {}
    for link in links:
        time.sleep(np.random.random())
        current = requests.get("https://www.boxofficemojo.com" + link["href"])
        currysoup = BeautifulSoup(current.content, "html.parser")
        title = currysoup.find("h1").text
        print("scraping ", title)
        try:
            budget = cash_str_to_int(currysoup.find(text="Budget").next_element.text)
        except AttributeError:
            budget = None
        grossli = currysoup.select(".a-section.a-spacing-none.mojo-performance-summary-table .a-section.a-spacing-none")
        grosses = []
        for i in grossli:
            try:
                grosses.append(cash_str_to_int(i.select(".money")[0].text))
            except IndexError:
                grosses.append(None)
        try:
            genres = list(currysoup.select(".a-section.a-spacing-none.mojo-summary-values.mojo-hidden-from-mobile")[0])[6].find_all("span")[1].text.split()
        except IndexError:
            genres = []
        movie_dict[link["href"]] = {"domestic":grosses[0],
                                    "international":grosses[1],
                                    "worldwide":grosses[2],
                                    "budget":budget,
                                    "genres":genres,
                                    "title":title
                                   }
        print(movie_dict[link["href"]])
        #uncomment the below to see only one pull
        #return(movie_dict)
    return movie_dict
    
    

In [5]:
links_2020 = get_links(2020)
movie_dict_2020 = make_movie_dict(links_2020)

scraping  Bad Boys for Life
{'domestic': 204417855, 'international': 220200000, 'worldwide': 424617855, 'budget': 90000000, 'genres': ['Action', 'Comedy', 'Crime', 'Thriller'], 'title': 'Bad Boys for Life'}
scraping  1917
{'domestic': 159227644, 'international': 221775855, 'worldwide': 381003499, 'budget': 95000000, 'genres': ['Drama', 'War'], 'title': '1917'}
scraping  Sonic the Hedgehog
{'domestic': 146066470, 'international': 160700000, 'worldwide': 306766470, 'budget': 85000000, 'genres': ['Action', 'Adventure', 'Comedy', 'Family', 'Sci-Fi'], 'title': 'Sonic the Hedgehog'}
scraping  Jumanji: The Next Level
{'domestic': 316831246, 'international': 479744747, 'worldwide': 796575993, 'budget': 125000000, 'genres': ['Action', 'Adventure', 'Comedy', 'Fantasy'], 'title': 'Jumanji: The Next Level'}
scraping  Star Wars: The Rise of Skywalker
{'domestic': 515202542, 'international': 558941706, 'worldwide': 1074144248, 'budget': 275000000, 'genres': ['Action', 'Adventure', 'Fantasy', 'Sci-Fi

scraping  2020 Oscar Nominated Short Films
{'domestic': 3306611, 'international': None, 'worldwide': 3306611, 'budget': None, 'genres': ['See', 'more', 'details', 'at', 'IMDbPro'], 'title': '2020 Oscar Nominated Short Films'}
scraping  Queen & Slim
{'domestic': 43808310, 'international': 3893946, 'worldwide': 47702256, 'budget': 18000000, 'genres': ['Crime', 'Drama', 'Romance'], 'title': 'Queen & Slim'}
scraping  Las Pildoras De Mi Novio
{'domestic': 2971116, 'international': 3321664, 'worldwide': 6292780, 'budget': None, 'genres': ['179', 'days/25', 'weeks'], 'title': 'Las Pildoras De Mi Novio'}
scraping  The Last Full Measure
{'domestic': 2949212, 'international': 415356, 'worldwide': 3364568, 'budget': None, 'genres': ['207', 'days/29', 'weeks'], 'title': 'The Last Full Measure'}
scraping  Ip Man 4: The Finale
{'domestic': 3956031, 'international': 172449312, 'worldwide': 176405343, 'budget': None, 'genres': ['125', 'theaters'], 'title': 'Ip Man 4: The Finale'}
scraping  The Wretche

scraping  Seberg
{'domestic': 434702, 'international': 151143, 'worldwide': 585845, 'budget': None, 'genres': ['179', 'days/25', 'weeks'], 'title': 'Seberg'}
scraping  Iron Man
{'domestic': 430000, 'international': None, 'worldwide': 430000, 'budget': 140000000, 'genres': ['Action', 'Adventure', 'Sci-Fi'], 'title': 'Iron Man'}
scraping  Fantastic Fungi
{'domestic': 1788322, 'international': 89353, 'worldwide': 1877675, 'budget': None, 'genres': ['37', 'theaters'], 'title': 'Fantastic Fungi'}
scraping  Infamous
{'domestic': 429148, 'international': None, 'worldwide': 429148, 'budget': None, 'genres': ['58', 'theaters'], 'title': 'Infamous'}
scraping  Black Panther
{'domestic': 367000, 'international': 1463934, 'worldwide': 1830934, 'budget': None, 'genres': ['39', 'days/5', 'weeks'], 'title': 'Black Panther'}
scraping  Greed
{'domestic': 355308, 'international': 1103518, 'worldwide': 1458826, 'budget': None, 'genres': ['172', 'days/24', 'weeks'], 'title': 'Greed'}
scraping  Ride Your Wa

scraping  Back to the Future
{'domestic': 107000, 'international': 137635, 'worldwide': 244635, 'budget': 19000000, 'genres': ['18', 'days/2', 'weeks'], 'title': 'Back to the Future'}
scraping  Hope Gap
{'domestic': 104732, 'international': 19679, 'worldwide': 124411, 'budget': None, 'genres': ['165', 'days/23', 'weeks'], 'title': 'Hope Gap'}
scraping  Citizen K
{'domestic': 120411, 'international': 25530, 'worldwide': 145941, 'budget': None, 'genres': ['15', 'theaters'], 'title': 'Citizen K'}
scraping  Grease
{'domestic': 102000, 'international': 36981, 'worldwide': 138981, 'budget': 6000000, 'genres': ['Musical', 'Romance'], 'title': 'Grease'}
scraping  First Cow
{'domestic': 101068, 'international': None, 'worldwide': 101068, 'budget': None, 'genres': ['165', 'days/23', 'weeks'], 'title': 'First Cow'}
scraping  The Woman Who Loves Giraffes
{'domestic': 113858, 'international': None, 'worldwide': 113858, 'budget': None, 'genres': ['22', 'theaters'], 'title': 'The Woman Who Loves Gira

scraping  Countdown
{'domestic': 25621766, 'international': 22400000, 'worldwide': 48021766, 'budget': 6500000, 'genres': ['Horror', 'Thriller'], 'title': 'Countdown'}
scraping  Cane River
{'domestic': 27317, 'international': None, 'worldwide': 27317, 'budget': None, 'genres': ['See', 'more', 'details', 'at', 'IMDbPro'], 'title': 'Cane River'}
scraping  Midnight Family
{'domestic': 42310, 'international': 9402, 'worldwide': 51712, 'budget': None, 'genres': ['13', 'theaters'], 'title': 'Midnight Family'}
scraping  José
{'domestic': 26563, 'international': None, 'worldwide': 26563, 'budget': None, 'genres': ['2', 'theaters'], 'title': 'José'}
scraping  Synonyms
{'domestic': 206003, 'international': 295220, 'worldwide': 501223, 'budget': None, 'genres': ['13', 'theaters'], 'title': 'Synonyms'}
scraping  The Booksellers
{'domestic': 25694, 'international': 127766, 'worldwide': 153460, 'budget': None, 'genres': ['3', 'theaters'], 'title': 'The Booksellers'}
scraping  Promare
{'domestic': 23

scraping  The Truth
{'domestic': 9619, 'international': 5233647, 'worldwide': 5243266, 'budget': None, 'genres': ['46', 'days/6', 'weeks'], 'title': 'The Truth'}
scraping  Under the Sea 3D
{'domestic': 36262926, 'international': 19639698, 'worldwide': 55902624, 'budget': None, 'genres': ['108', 'theaters'], 'title': 'Under the Sea 3D'}
scraping  Kind Hearts and Coronets
{'domestic': 35948, 'international': 44847, 'worldwide': 80795, 'budget': None, 'genres': ['3', 'theaters'], 'title': 'Kind Hearts and Coronets'}
scraping  The Trip to Greece
{'domestic': 8340, 'international': 224674, 'worldwide': 233014, 'budget': None, 'genres': ['11', 'theaters'], 'title': 'The Trip to Greece'}
scraping  Inside the Rain
{'domestic': 8140, 'international': None, 'worldwide': 8140, 'budget': None, 'genres': ['1', 'theater'], 'title': 'Inside the Rain'}
scraping  The Wave
{'domestic': 8101, 'international': None, 'worldwide': 8101, 'budget': None, 'genres': ['13', 'theaters'], 'title': 'The Wave'}
scra

scraping  Christ Stopped at Eboli
{'domestic': 78736, 'international': None, 'worldwide': 78736, 'budget': None, 'genres': ['3', 'theaters'], 'title': 'Christ Stopped at Eboli'}
scraping  To the Arctic 3D
{'domestic': 14560838, 'international': 9134550, 'worldwide': 23695388, 'budget': None, 'genres': ['3,042', 'days/434', 'weeks'], 'title': 'To the Arctic 3D'}
scraping  Be Natural: The Untold Story of Alice Guy-Blaché
{'domestic': 115685, 'international': 49934, 'worldwide': 165619, 'budget': None, 'genres': ['14', 'theaters'], 'title': 'Be Natural: The Untold Story of Alice Guy-Blaché'}
scraping  Edie
{'domestic': 71790, 'international': 2659380, 'worldwide': 2731170, 'budget': None, 'genres': ['11', 'theaters'], 'title': 'Edie'}
scraping  The Painted Bird
{'domestic': 1460, 'international': 577566, 'worldwide': 579026, 'budget': None, 'genres': ['8', 'theaters'], 'title': 'The Painted Bird'}
scraping  End of the Century
{'domestic': 66309, 'international': 13298, 'worldwide': 79607,

In [6]:
bom_df_2020 = pd.DataFrame.from_dict(movie_dict_2020,orient="index")
bom_df_2020.head()

Unnamed: 0,domestic,international,worldwide,budget,genres,title
/release/rl1182631425/?ref_=bo_yld_table_1,204417855,220200000.0,424617855,90000000.0,"[Action, Comedy, Crime, Thriller]",Bad Boys for Life
/release/rl2969994753/?ref_=bo_yld_table_2,159227644,221775855.0,381003499,95000000.0,"[Drama, War]",1917
/release/rl4244997633/?ref_=bo_yld_table_3,146066470,160700000.0,306766470,85000000.0,"[Action, Adventure, Comedy, Family, Sci-Fi]",Sonic the Hedgehog
/release/rl755467777/?ref_=bo_yld_table_4,316831246,479744747.0,796575993,125000000.0,"[Action, Adventure, Comedy, Fantasy]",Jumanji: The Next Level
/release/rl3305145857/?ref_=bo_yld_table_5,515202542,558941706.0,1074144248,275000000.0,"[Action, Adventure, Fantasy, Sci-Fi]",Star Wars: The Rise of Skywalker


In [7]:
bom_df_2020.to_csv("zippedData/bom-budget-and-genres-2020.csv")