In [1]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd

In [2]:
import os, re, sys, pickle, requests, resource, json
import time
from random import random
from IPython.core.display import clear_output

### Webscraping Movie Budgets and Domestic Grosses

For webscraping budget numbers, I used [The Numbers](https://www.the-numbers.com/).  The budget information can be accessed by individual movie page, or by a ranked list in the form of tables containing 100 titles at a time.


Although the list has a great many more titles, I chose initially to scrape the budget and gross information for the first 5000 titles in their ranking tables.  Since the ranking includes all genres, and not just science fiction, I assessed that this number would yield a sufficient number of records for a baseline linear regression.  I plan to do a larger scrape, including all budget and gross information available, at a future point.

The code below formats for the url for each of the 50 table pages, 100 titles each.  It then selects table information for title, release date, budget, domestic grosses, and worldwide grosses.

In [73]:
budget_data_list = []

In [80]:

budget_url = "https://www.the-numbers.com/movie/budgets/all/{}"

pages = [str(i) for i in range(1,5000,100)]

# preparing to monitor
start_time = time.time()
tries =0

for page in pages:

    response = requests.get(budget_url.format(page))
    source = response.text

    time.sleep(0.5+2*random())
    tries +=1
    elapsed_time = time.time() - start_time
    print('Request:{}; Frequency: {} requests/s'.format(tries, tries/elapsed_time))
    clear_output(wait = True)

    if tries > 2200:
        print("Number of requests exceeded")
        break

    budget_soup = BeautifulSoup(source, "lxml")

    budget_table = budget_soup.find('table')

    rows = [row for row in budget_table.find_all('tr')]

    for row in rows:
        items = row.find_all('td')
        if (len(items)<1) : 
            release_date = np.nan
            title = np.nan
            budget = np.nan
            dom_gross = np.nan
            ww_gross = np.nan
            
        else:
            release_date = items[1].text
            if not release_date: 
                next_element = item.findNext()

                if next_element:
                    release_date = next_element.text 
                else:
                    release_date = np.nan
            
            title = items[2].text
            if not title: 
                next_element = item.findNext()

                if next_element:
                    title = next_element.text 
                else:
                    title= np.nan 

            budget = items[3].text
            if not budget: 
                next_element = item.findNext()

                if next_element:
                    budget = next_element.text 
                else:
                    budget = np.nan

            dom_gross = items[4].text
            if not dom_gross: 
                next_element = item.findNext()

                if next_element:
                    dom_gross = next_element.text 
                else:
                    dom_gross = np.nan

            ww_gross = items[5].text
            if not ww_gross: 
                next_element = item.findNext()

                if next_element:
                    ww_gross = next_element.text 
                else:
                    ww_gross = np.nan

        budget_dict = dict(zip(headers, [title, release_date, budget, 
                                     dom_gross, ww_gross]))

        budget_data_list.append(budget_dict)

Request:50; Frequency: 0.37544668561893496 requests/s


Checking that data was scraped appropriately for all records, converting the list of dictionaries to dataframe

In [81]:
len(budget_data_list)

5100

In [82]:
len(budget_data_list[49])

5

In [83]:
budget_df = pd.DataFrame(budget_data_list)

In [84]:
budget_df.shape

(5100, 5)

In [92]:
budget_df = budget_df[budget_df['title'].notna()]

In [93]:
budget_df['title'].notna().sum()

5050

Assessing to see that all intended categories of data were scraped

In [99]:
budget_df.sort_values(by="ww_gross", ascending=False)

Unnamed: 0,title,release_date,budget,dom_gross,ww_gross
133,The Dark Knight,"Jul 11, 2008","$185,000,000","$533,720,947","$999,434,419"
4003,Fifty Dead Men Walking,"Aug 21, 2009","$10,000,000",$0,"$997,921"
3671,Duma,"Sep 30, 2005","$12,000,000","$870,067","$994,790"
2659,Robot Overlords,"Mar 15, 2015","$21,000,000",$0,"$991,329"
982,Hellboy,"Apr 2, 2004","$60,000,000","$59,623,958","$99,823,958"
...,...,...,...,...,...
4367,The Kings of Appletown,"Dec 12, 2008","$7,000,000",$0,$0
4368,Stargate - The Ark of Truth,"Mar 11, 2008","$7,000,000",$0,$0
71,No Time to Die,"Mar 31, 2021","$250,000,000",$0,$0
4369,Alpha & Omega: The Legend of the Saw Tooth Cave,"Sep 1, 2015","$7,000,000",$0,$0


#### Preliminary cleaning and formatting

The budget data will need to be merged with the IMDB and Rotten Tomatoes data.  Since I have already created a lowercase title string with underscores replacing spaces in order to format the Rotten Tomatoes url, I am formatting the budget data titles here to match this, to facilitate the merge.

In [114]:
budget_df['title_cc'] = budget_df['title'].str.lower()

In [160]:
budget_df['title_cc'] = budget_df['title_cc'].str.replace('&','and',regex=False)
budget_df['title_cc'] = budget_df['title_cc'].str.replace('\\','_',regex=True)
budget_df['title_cc'] = budget_df['title_cc'].str.replace('episode\s([ivx]*)\s-\s','',regex=True)
budget_df['title_cc'] = budget_df['title_cc'].str.replace("\s",'_',regex=True)
budget_df['title_cc'] = budget_df['title_cc'].str.replace("\W",'',regex=True)
budget_df['title_cc'] = budget_df['title_cc'].str.replace('_+','_',regex=True)

In [119]:
budget_df.head()

Unnamed: 0,title,release_date,budget,dom_gross,ww_gross,title_cc
0,Valerian and the City of a Thousand Planets,"Jul 20, 2017","$180,000,000","$40,479,370","$215,098,356",valerian_and_the_city_of_a_thousand_planets
1,Bolt,"Nov 21, 2008","$150,000,000","$114,053,579","$328,015,029",bolt
2,Monster Trucks,"Dec 29, 2016","$125,000,000","$33,370,166","$61,642,798",monster_trucks
3,The Expendables 2,"Aug 15, 2012","$100,000,000","$85,028,192","$311,979,256",the_expendables_2
4,Bicentennial Man,"Dec 17, 1999","$90,000,000","$58,220,776","$87,420,776",bicentennial_man


In [161]:
budget_df['budget'] = budget_df['budget'].str.strip('$')

In [162]:
budget_df['dom_gross'] = budget_df['dom_gross'].str.strip('$')

In [163]:
budget_df['ww_gross'] = budget_df['ww_gross'].str.strip('$')

In [169]:
budget_df.head()

Unnamed: 0,title,release_date,budget,dom_gross,ww_gross,title_cc,release_date2
0,Valerian and the City of a Thousand Planets,"Jul 20, 2017","$180,000,000","$40,479,370","$215,098,356",valerian_and_the_city_of_a_thousand_planets,"Jul 20, 2017"
1,Bolt,"Nov 21, 2008","$150,000,000","$114,053,579","$328,015,029",bolt,"Nov 21, 2008"
2,Monster Trucks,"Dec 29, 2016","$125,000,000","$33,370,166","$61,642,798",monster_trucks,"Dec 29, 2016"
3,The Expendables 2,"Aug 15, 2012","$100,000,000","$85,028,192","$311,979,256",the_expendables_2,"Aug 15, 2012"
4,Bicentennial Man,"Dec 17, 1999","$90,000,000","$58,220,776","$87,420,776",bicentennial_man,"Dec 17, 1999"


In [120]:
budget_df.shape

(5050, 6)

In [148]:
budget_df['release_date2'] = budget_df['release_date'].replace('Unknown', np.nan)

In [149]:
budget_df['release_date2'] = budget_df['release_date2'].replace('Fall, 2021', np.nan)

#### Exporting and Saving Data

I exported the data in json format, for easy import into other notebooks for cleaning, merging, and analysis.

In [168]:
budget_df.to_json('budget_data.json', orient='records', lines=True)