In [1]:
import requests
import sys
import numpy as np
import time
import pandas as pd
from bs4 import BeautifulSoup
#time.sleep(1)

In [None]:
def get_links(year):
    """Fetch the Box Office Mojo yearly listing and collect its movie links.

    Requests ``https://www.boxofficemojo.com/year/<year>/`` and selects the
    anchor elements located in the release-name cells of that page.

    Returns:
        On success, the BeautifulSoup selection of ``<a>`` elements (callers
        read each element's ``href`` to get the link suffix).
        If the response status is 400 or higher, a two-item list
        ``[status_code, url]`` is returned instead, so callers must check
        for that shape before iterating.
    """
    url = "https://www.boxofficemojo.com/year/{}/".format(year)
    response = requests.get(url)
    # Error responses are reported to the caller rather than parsed.
    if response.status_code >= 400:
        return [response.status_code, url]
    listing = BeautifulSoup(response.content, 'html.parser')
    return listing.select(".a-text-left.mojo-field-type-release.mojo-cell-wide a")


In [None]:
def cash_str_to_int(string):
    """Convert a money string such as ``"$1,234,567"`` to the int ``1234567``.

    The first character is always discarded (it is expected to be a currency
    symbol) and comma separators are removed before the remainder is passed
    to ``int``. Raises ``ValueError`` if what is left is not an integer.
    """
    without_symbol = string[1:]
    digits = "".join(without_symbol.split(","))
    return int(digits)

In [None]:
def make_movie_dict(links):
    """Scrape every linked Box Office Mojo movie page into a dictionary.

    Parameters:
        links: iterable of elements supporting ``link["href"]``, where the
            value is a URL suffix appended to ``https://www.boxofficemojo.com``.

    Returns:
        dict keyed by each link's ``href``. Every value is a dict with keys
            domestic, international, worldwide, budget : int, or None when
                the figure is missing or cannot be parsed
            genres : list[str], empty when no genres could be read
            title  : str, or None when the page has no ``<h1>``

    Side effects:
        Sleeps a random fraction of a second before each request and prints
        progress plus the scraped record for every movie.
    """
    movie_dict = {}
    for link in links:
        # Random pause (under one second) between requests to space them out.
        time.sleep(np.random.random())
        href = link["href"]
        current = requests.get("https://www.boxofficemojo.com" + href)
        currysoup = BeautifulSoup(current.content, "html.parser")

        # A page without a heading (e.g. an error page) must not abort the run.
        heading = currysoup.find("h1")
        title = heading.text if heading is not None else None
        print("scraping ", title)

        # AttributeError: no "Budget" label on the page.
        # ValueError: the label exists but its value is not a parsable amount.
        try:
            budget = cash_str_to_int(currysoup.find(text="Budget").next_element.text)
        except (AttributeError, ValueError):
            budget = None

        grossli = currysoup.select(".a-section.a-spacing-none.mojo-performance-summary-table .a-section.a-spacing-none")
        grosses = []
        for row in grossli:
            try:
                grosses.append(cash_str_to_int(row.select(".money")[0].text))
            except (IndexError, ValueError):
                grosses.append(None)
        # Pad to three entries so a page with fewer summary rows yields None
        # values instead of raising IndexError on the lookups below.
        grosses.extend([None] * (3 - len(grosses)))

        # NOTE(review): genres are read from a fixed child position (index 6)
        # of the summary block; presumably this shifts if the page omits or
        # adds rows — confirm against real pages.
        try:
            summary = currysoup.select(".a-section.a-spacing-none.mojo-summary-values.mojo-hidden-from-mobile")[0]
            genres = list(summary)[6].find_all("span")[1].text.split()
        except (IndexError, AttributeError):
            # AttributeError covers a child at that position that is not a tag.
            genres = []

        movie_dict[href] = {"domestic": grosses[0],
                            "international": grosses[1],
                            "worldwide": grosses[2],
                            "budget": budget,
                            "genres": genres,
                            "title": title
                            }
        print(movie_dict[href])
        #uncomment the below to see only one pull
        #return(movie_dict)
    return movie_dict
    
    

In [None]:
links_2019 = get_links(2019)
# get_links reports an HTTP error by returning [status_code, url] instead of
# link elements; stop here with a clear message rather than letting
# make_movie_dict fail on link["href"] with an unrelated-looking TypeError.
if links_2019 and isinstance(links_2019[0], int):
    raise RuntimeError(
        "get_links failed with HTTP status {} for {}".format(links_2019[0], links_2019[1])
    )
movie_dict_2019 = make_movie_dict(links_2019)

In [None]:
# Build one row per movie: with orient="index" the outer dict keys (the link
# hrefs) become the row index and the inner dict keys become the columns.
bom_df_2019 = pd.DataFrame.from_dict(movie_dict_2019,orient="index")
# Preview the first rows as a quick sanity check of the scrape.
bom_df_2019.head()

In [None]:
# Persist the scraped table as CSV. The path is relative, so this presumably
# expects a zippedData/ directory under the working directory — TODO confirm
# it exists before running.
bom_df_2019.to_csv("zippedData/bom-budget-and-genres-2019.csv")