In [1]:
import requests
import sys
import numpy as np
import time
import pandas as pd
from bs4 import BeautifulSoup
#time.sleep(1)

In [2]:
def get_links(year):
    """Fetch the Box Office Mojo page for ``year`` and collect its release links.

    On success the result is whatever the release-column CSS selector matches
    on the parsed page: the anchor elements of the yearly table, from which
    callers read ``link["href"]``.

    If the server answers with an HTTP status of 400 or above, a two-item
    list ``[status_code, url]`` is returned instead of the links.
    """
    url = "https://www.boxofficemojo.com/year/{}/".format(year)
    response = requests.get(url)

    # Error responses are reported to the caller rather than parsed.
    if response.status_code >= 400:
        return [response.status_code, url]

    parsed = BeautifulSoup(response.content, 'html.parser')
    return parsed.select(".a-text-left.mojo-field-type-release.mojo-cell-wide a")


In [3]:
def cash_str_to_int(string):
    """Convert a money string such as ``"$1,234"`` into the integer ``1234``.

    The first character is dropped (it is treated as the currency symbol)
    and every comma is removed before the remainder is passed to ``int``.
    """
    without_symbol = string[1:]
    digits_only = without_symbol.replace(",", "")
    return int(digits_only)

In [4]:
def make_movie_dict(links):
    """Scrape the Box Office Mojo release page behind every link.

    Parameters
    ----------
    links : iterable
        Items supporting ``link["href"]``; each ``href`` is a URL suffix that
        is appended to ``https://www.boxofficemojo.com``.

    Returns
    -------
    dict
        Maps each ``href`` to a dictionary with the keys

        ``domestic``, ``international``, ``worldwide``, ``budget``
            int, or None when the figure is absent or cannot be parsed
        ``genres``
            list of str (empty when the page has no genres entry)
        ``title``
            str, or None when the page has no ``<h1>``
    """
    gross_selector = ".a-section.a-spacing-none.mojo-performance-summary-table .a-section.a-spacing-none"
    movie_dict = {}
    for link in links:
        # Random sub-second pause between requests (presumably to throttle
        # the scrape and be polite to the server).
        time.sleep(np.random.random())
        current = requests.get("https://www.boxofficemojo.com" + link["href"])
        currysoup = BeautifulSoup(current.content, "html.parser")

        try:
            title = currysoup.find("h1").text
        except AttributeError:
            # find() returned None: no <h1> on the page.
            title = None

        try:
            budget = cash_str_to_int(currysoup.find(text="Budget").next_element.text)
        except (AttributeError, ValueError):
            # AttributeError: no "Budget" label on the page.
            # ValueError: the label exists but its value is not a parsable
            # amount; treat it as missing rather than aborting the scrape.
            budget = None

        grosses = []
        for row in currysoup.select(gross_selector):
            try:
                grosses.append(cash_str_to_int(row.select(".money")[0].text))
            except (IndexError, ValueError):
                # No money element in this row, or its text is not a number.
                grosses.append(None)
        # The summary table may yield fewer than three rows; pad with None so
        # the positional unpacking below cannot raise IndexError.
        grosses.extend([None] * (3 - len(grosses)))
        domestic, international, worldwide = grosses[:3]

        try:
            genres = currysoup.find(text="Genres").next_element.text.split()
        except AttributeError:
            genres = []

        movie_dict[link["href"]] = {"domestic": domestic,
                                    "international": international,
                                    "worldwide": worldwide,
                                    "budget": budget,
                                    "genres": genres,
                                    "title": title
                                   }
    return movie_dict
    
    

In [21]:
## Build a dictionary whose keys are year integers and whose values are the
## per-movie dictionaries returned by make_movie_dict, scraping every year
## in the range below.

# Create the container only when it does not exist yet: the cell then runs on
# a fresh kernel, and re-running it in a live kernel keeps years that were
# already scraped.
if "movie_dict" not in globals():
    movie_dict = {}

for year in range(2005, 2021):
    current_links = get_links(year)
    # get_links reports an HTTP failure as [status_code, url]; feeding that
    # to make_movie_dict would fail on link["href"], so skip the year.
    if current_links and isinstance(current_links[0], int):
        status, url = current_links
        print("skipping {}: HTTP {} from {}".format(year, status, url))
        continue
    movie_dict[year] = make_movie_dict(current_links)
    print(year)
    

2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020


In [22]:
# Spot-check the scraped entries stored under the 2018 key.
movie_dict[2018]

{'/release/rl2992866817/?ref_=bo_yld_table_1': {'domestic': 700059566,
  'international': 646853595,
  'worldwide': 1346913161,
  'budget': None,
  'genres': ['Action', 'Adventure', 'Sci-Fi'],
  'title': 'Black Panther'},
 '/release/rl3043198465/?ref_=bo_yld_table_2': {'domestic': 678815482,
  'international': 1369544272,
  'worldwide': 2048359754,
  'budget': None,
  'genres': ['Action', 'Adventure', 'Sci-Fi'],
  'title': 'Avengers: Infinity War'},
 '/release/rl2071758337/?ref_=bo_yld_table_3': {'domestic': 608581744,
  'international': 634223615,
  'worldwide': 1242805359,
  'budget': None,
  'genres': ['Action', 'Adventure', 'Animation', 'Comedy', 'Family', 'Sci-Fi'],
  'title': 'The Incredibles 2'},
 '/release/rl1602061825/?ref_=bo_yld_table_4': {'domestic': 417719760,
  'international': 890748184,
  'worldwide': 1308467944,
  'budget': 170000000,
  'genres': ['Action', 'Adventure', 'Sci-Fi'],
  'title': 'Jurassic World: Fallen Kingdom'},
 '/release/rl2488436225/?ref_=bo_yld_table_

In [23]:
# Turn each scraped year into a DataFrame (one row per release link) and
# write it to its own CSV file under zippedData/.
bom_df_dict = {}
missing_years = []
for year in range(1977, 2021):
    # Only years that were actually scraped have an entry in movie_dict;
    # indexing a missing year would raise KeyError and stop the export.
    if year not in movie_dict:
        missing_years.append(year)
        continue
    bom_df_dict[year] = pd.DataFrame.from_dict(movie_dict[year], orient="index")
    bom_df_dict[year].to_csv("zippedData/bom-budget-and-genres-{}.csv".format(year))

if missing_years:
    print("no scraped data for years:", missing_years)
