# Get movie info from title URLs on boxofficemojo.com

In [100]:
from bs4 import BeautifulSoup
import requests
import re
import dateutil.parser

In [101]:
def get_movie_info_from_title(url, timeout=30):
    '''
    Parse the following data from a boxofficemojo.com Title url:
    ['Movie_Title','Domestic_Distributor','Domestic_Total_Gross',
    'Runtime','Rating','Release_Date','Budget', 'Cast1','Cast2','Cast3','Cast4']

    Input:
        url: boxofficemojo.com url like
            'https://www.boxofficemojo.com/title/tt0848228/credits/?ref_=bo_tt_tab'
            Needs to have 'credits/?ref_=bo_tt_tab' in url after title id
        timeout: seconds to wait for the server before giving up
            (passed straight to requests.get).

    Returns a list holding a single dict keyed by the headers above.
    A field that cannot be found on the page is stored as None instead of
    aborting the whole scrape.

    Raises requests.HTTPError when the server answers with an error status.
    '''

    headers = ['Movie_Title','Domestic_Distributor','Domestic_Total_Gross',
           'Runtime','Rating','Release_Date','Budget', 'Cast1','Cast2','Cast3','Cast4']

    # Without a timeout a stalled connection would hang the scrape forever.
    response = requests.get(url, timeout=timeout)
    print('requests.get status: ',response.status_code)
    # Fail loudly on 4xx/5xx rather than parsing an error page into a row of Nones.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    # title: keep everything before the first ' - ' separator; splitting on a
    # bare '-' would cut hyphenated titles such as 'Spider-Man' short.
    title_tag = soup.find('title')
    title = None
    if title_tag is not None:
        title = title_tag.text.split(' - ')[0].strip()

    # domestic distributor: the value text is followed by a 'See ...' link
    raw_distributor = get_movie_value(soup,'Domestic Distributor')
    distributor = raw_distributor.split('See')[0] if raw_distributor else None

    # domestic total gross: first money figure in the performance summary table
    domestic_total_gross = None
    summary_table = soup.find(class_='mojo-performance-summary-table')
    if summary_table is not None:
        money_spans = summary_table.find_all('span', class_='money')
        if money_spans:
            domestic_total_gross = money_to_int(money_spans[0].text)

    # runtime
    raw_runtime = get_movie_value(soup,'Running')
    runtime = runtime_to_minutes(raw_runtime) if raw_runtime else None

    # rating
    rating = get_movie_value(soup,'MPAA')

    # release date: only the first line of the value holds the date
    raw_release_date = get_movie_value(soup,'Release Date')
    release_date = None
    if raw_release_date:
        first_line = raw_release_date.split('\n')[0]
        if first_line.strip():
            release_date = to_date(first_line)

    # Budget (not listed for every title)
    raw_budget = get_movie_value(soup,'Budget')
    budget = money_to_int(raw_budget) if raw_budget else None

    # Get Cast info: row 0 of the table is skipped, rows 1-4 are the first
    # four cast members. Pad with None when fewer rows are present.
    cast = [None, None, None, None]
    cast_table = soup.find(id="principalCast")
    if cast_table is not None:
        for position, row in enumerate(cast_table.find_all('tr')[1:5]):
            cast[position] = row.text.split('\n')[0]

    movie_dict = dict(zip(headers, [title,
                                    distributor,
                                    domestic_total_gross,
                                    runtime,
                                    rating,
                                    release_date,
                                    budget] + cast))

    return [movie_dict]

In [102]:
def get_movie_value(soup, field_name):

    '''Grab a value from Box Office Mojo HTML

    Finds the first text node in soup matching field_name (used as a regular
    expression) and returns the text of the element that follows it in the
    document (the value for that attribute), or None if the label or the
    following element is not found.
    '''

    # `string=` is the current name of the deprecated `text=` argument.
    label = soup.find(string=re.compile(field_name))

    if not label:
        return None

    # this works for most of the values
    # (`find_next` replaces the deprecated camelCase alias `findNext`)
    value_element = label.find_next()

    if not value_element:
        return None

    return value_element.text

def money_to_int(moneystring):
    '''Convert a money string such as '$85,000,000' to an int.

    Returns None when moneystring is None or contains no digits at all
    (e.g. the field was missing from the page), so one absent value does not
    abort a whole scrape. Raises ValueError if what remains after removing
    '$' and ',' is not an integer.
    '''
    if moneystring is None:
        return None

    digits = moneystring.replace('$', '').replace(',', '').strip()
    if not digits:
        return None

    return int(digits)

def runtime_to_minutes(runtimestring):
    '''Convert a runtime string such as '1 hr 39 min' to total minutes.

    Either part may be absent ('2 hr' -> 120, '45 min' -> 45).
    Returns None when runtimestring is None, empty, or contains neither an
    hour nor a minute figure.
    '''
    if not runtimestring:
        return None

    # Look for each unit by name instead of by position so that a runtime
    # with only hours or only minutes is still understood.
    hours = re.search(r'(\d+)\s*hr', runtimestring)
    minutes = re.search(r'(\d+)\s*min', runtimestring)

    if hours is None and minutes is None:
        return None

    total = 0
    if hours is not None:
        total += int(hours.group(1)) * 60
    if minutes is not None:
        total += int(minutes.group(1))
    return total

def to_date(datestring):
    '''Parse a date string into a datetime using dateutil.

    Returns None when datestring is None or blank, so a missing field on the
    page yields an empty value rather than an exception or an arbitrary
    default date. Any other unparseable text still raises from dateutil.
    '''
    if datestring is None or not datestring.strip():
        return None

    return dateutil.parser.parse(datestring)

# Test it

In [110]:
# Scrape the credits page of a single title and keep the parsed record in `data`.
url = 'https://www.boxofficemojo.com/title/tt3794354/credits/?ref_=bo_tt_tab'
data = get_movie_info_from_title(url)
# Bare expression so the notebook displays the result as the cell output.
data

requests.get status:  200


[{'Movie_Title': 'Sonic the Hedgehog',
  'Domestic_Distributor': 'Paramount Pictures',
  'Domestic_Total_Gross': 146066470,
  'Runtime': 99,
  'Rating': 'PG',
  'Release_Date': datetime.datetime(2020, 2, 12, 0, 0),
  'Budget': 85000000,
  'Cast1': 'Ben Schwartz',
  'Cast2': 'James Marsden',
  'Cast3': 'Jim Carrey',
  'Cast4': 'Tika Sumpter'}]

In [106]:
import pandas as pd

In [107]:
# Turn the list of movie dicts into a DataFrame (one row per dict).
df = pd.DataFrame(data)
# Bare expression so the notebook renders the table as the cell output.
df

Unnamed: 0,Movie_Title,Domestic_Distributor,Domestic_Total_Gross,Runtime,Rating,Release_Date,Budget,Cast1,Cast2,Cast3,Cast4
0,The Avengers,Walt Disney Studios Motion Pictures,623357910,143,PG-13,2012-04-25,220000000,Robert Downey Jr.,Chris Evans,Scarlett Johansson,Jeremy Renner


In [112]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the new
# rows are added with pd.concat instead. Index labels are left untouched (no
# ignore_index), which is what append did, so the new row keeps its label 0.
df = pd.concat([df, pd.DataFrame(data)])
df


Unnamed: 0,Movie_Title,Domestic_Distributor,Domestic_Total_Gross,Runtime,Rating,Release_Date,Budget,Cast1,Cast2,Cast3,Cast4
0,The Avengers,Walt Disney Studios Motion Pictures,623357910,143,PG-13,2012-04-25,220000000,Robert Downey Jr.,Chris Evans,Scarlett Johansson,Jeremy Renner
0,Sonic the Hedgehog,Paramount Pictures,146066470,99,PG,2020-02-12,85000000,Ben Schwartz,James Marsden,Jim Carrey,Tika Sumpter
