In [1]:
import os
import re
import time
from tqdm import tqdm

import dateutil.parser

import requests

from bs4 import BeautifulSoup

import numpy as np
import pandas as pd

import urllib


In [2]:
def monetary_to_int(monetary):
    """Convert a formatted amount such as '$1,234,567' to an int.

    Dollar signs and comma separators are stripped before conversion.

    Args:
        monetary: Text to convert (anything without a string-like
            ``replace`` method is treated as "no value").

    Returns:
        The integer value, or None when the input cannot be converted
        (e.g. None, or a placeholder such as '-').
    """
    try:
        return int(monetary.replace('$', '').replace(',', ''))
    except (AttributeError, TypeError, ValueError):
        # Only conversion failures mean "no value"; anything else
        # (KeyboardInterrupt, MemoryError, ...) must propagate.
        return None

def runtime_to_minutes(runtime):
    """Convert a running-time string such as '2 hr 15 min' to total minutes.

    With three or more whitespace-separated tokens, the first token is read
    as hours and the third as minutes. With exactly two tokens
    ('2 hr' / '45 min') the unit label decides whether the number is hours
    or minutes.

    Args:
        runtime: Running-time text, or None when the field was not found.

    Returns:
        Total minutes as an int, or None when the input is missing or
        cannot be parsed.
    """
    try:
        parts = runtime.split()
    except AttributeError:
        # None (field missing on the page) or another non-string value.
        return None

    try:
        if len(parts) >= 3:
            return int(parts[0]) * 60 + int(parts[2])
        if len(parts) == 2:
            amount = int(parts[0])
            unit = parts[1].lower()
            if unit.startswith('h'):
                return amount * 60
            if unit.startswith('m'):
                return amount
    except (TypeError, ValueError):
        pass
    return None

def to_date(date):
    """Parse a free-form date string with ``dateutil.parser.parse``.

    The parsed result is returned unchanged; parsing errors raised by
    dateutil propagate to the caller.
    """
    return dateutil.parser.parse(date)

def height_to_float(height):
    """Convert a height string such as '1.80 m' to a float.

    Non-ASCII characters (for example a non-breaking space) are dropped and
    every 'm' character is removed before conversion.

    Args:
        height: Text to convert.

    Returns:
        The numeric value as a float, or None when the input is missing or
        is not a valid number after cleaning.
    """
    try:
        ascii_only = height.encode('ascii', 'ignore').decode()
        return float(ascii_only.replace('m', ''))
    except (AttributeError, TypeError, ValueError):
        # Only conversion failures mean "no value"; other exceptions
        # (e.g. KeyboardInterrupt) must propagate.
        return None


In [3]:
def get_movie_value(soup, field_name):
    """Return the text of the element that follows a labelled field.

    The first string in ``soup`` matching ``field_name`` (interpreted as a
    regular expression) is located, and the text of the next element in the
    document is returned.

    Args:
        soup: Parsed BeautifulSoup document to search.
        field_name: Regular-expression pattern identifying the field label.

    Returns:
        The following element's text, or None when the label is not found
        or nothing follows it.
    """
    # `string=` and `find_next` are the current spellings of the deprecated
    # `text=` and `findNext`.
    label = soup.find(string=re.compile(field_name))
    if not label:
        return None

    value_element = label.find_next()
    return value_element.text if value_element else None
    

In [4]:
def get_gender_female(search_res):
    """Report whether a search-results page listed any match.

    Looks at the first element with class ``desc`` after ``search_res`` and
    compares its stripped text with 'No results.'.

    Args:
        search_res: Node in the parsed results page to start from, or None
            when the expected anchor text was not found.

    Returns:
        False when the description reads 'No results.', True for any other
        description, and None when ``search_res`` is None or no ``desc``
        element follows it (the answer cannot be determined).
    """
    if search_res is None:
        return None

    description = search_res.find_next(class_='desc')
    if description is None:
        return None

    return description.text.strip() != 'No results.'
    

In [5]:
def get_details(url):
    """Scrape summary fields from a Box Office Mojo title page.

    Args:
        url: Title identifier inserted into
            ``https://www.boxofficemojo.com/title/{url}/``.

    Returns:
        A 7-item list:
        ``[distributor, domestic_opening, budget, release_date, rating,
        run_time, genres]``. Every item is None when the request does not
        return HTTP 200; an individual item is None when its field is
        missing from the page.
    """
    detailed_url = f'https://www.boxofficemojo.com/title/{url}/'
    resp = requests.get(detailed_url)

    if resp.status_code != 200:
        return [None] * 7

    soup = BeautifulSoup(resp.text, 'html.parser')

    distributor = get_movie_value(soup, 'Domestic Distributor')
    if distributor:
        # Keep only the name that precedes the trailing link text, instead
        # of blindly slicing a fixed number of characters off the end.
        distributor = distributor.split('See full company information')[0].strip()

    domestic_opening = get_movie_value(soup, 'Domestic Opening')
    if domestic_opening:
        domestic_opening = monetary_to_int(domestic_opening)

    budget = get_movie_value(soup, 'Budget')
    if budget:
        budget = monetary_to_int(budget)

    release_date = None
    raw_release_date = get_movie_value(soup, 'Release Date')
    if raw_release_date:
        # Drop bracketed/parenthesised qualifiers before parsing the date.
        cleaned = re.sub(r'[\(\[].*?[\)\]]', '', raw_release_date).strip()
        if cleaned:
            release_date = to_date(cleaned)

    rating = get_movie_value(soup, 'MPAA')

    run_time = get_movie_value(soup, 'Run')
    if run_time is not None:
        run_time = runtime_to_minutes(run_time)

    genres = get_movie_value(soup, 'Genres')
    if genres is not None:
        # Collapse the whitespace/newlines between genre names.
        genres = " ".join(genres.split())

    return [distributor, domestic_opening, budget, release_date, rating, run_time, genres]
    
    

In [6]:
def get_lead(url):
    """Scrape the first listed cast member from a Box Office Mojo credits page.

    Args:
        url: Title identifier inserted into
            ``https://www.boxofficemojo.com/title/{url}/credits/``.

    Returns:
        ``[actor, bio_link]`` where ``actor`` is the stripped text of the
        first cell in the second row of the ``principalCast`` table and
        ``bio_link`` is the fifth '/'-separated segment of that row's first
        link. Returns ``[None, None]`` when the request does not return
        HTTP 200 or the cast table has no data row; ``bio_link`` is None
        when the row has no usable link.
    """
    detailed_url = f'https://www.boxofficemojo.com/title/{url}/credits/'
    resp = requests.get(detailed_url)

    if resp.status_code != 200:
        return [None, None]

    soup = BeautifulSoup(resp.text, 'html.parser')

    cast = soup.find(id='principalCast')
    if cast is None:
        return [None, None]

    cast_rows = cast.find_all('tr')
    if len(cast_rows) < 2:
        # Only a header row (or nothing): no lead to report.
        return [None, None]
    lead_row = cast_rows[1]

    name_cell = lead_row.find('td')
    actor = name_cell.text.strip() if name_cell is not None else None

    bio_link = None
    link = lead_row.find('a')
    if link is not None:
        href_parts = link.get('href', '').split('/')
        if len(href_parts) > 4:
            bio_link = href_parts[4]

    return [actor, bio_link]
    

In [7]:
def get_lead_bio(actor_list):
    """Scrape birth date, height and a female-search flag for an actor from IMDb.

    Args:
        actor_list: ``[name, imdb_id]`` pair; either item may be None when
            the lead could not be determined.

    Returns:
        ``[birth_date, height, gender]``:
        * ``birth_date`` - parsed text of the page's first ``<time>``
          element, or None.
        * ``height`` - float parsed from the parenthesised part of the
          ``details-height`` element, or None.
        * ``gender`` - result of ``get_gender_female`` on an IMDb name
          search restricted with ``gender=female``, or None when that
          search fails or its results header is missing.
        All three are None when the name or id is missing or the profile
        request does not return HTTP 200.
    """
    from urllib.parse import quote_plus

    name, url = actor_list[0], actor_list[1]
    if not name or not url:
        # Nothing to look up (e.g. the credits page could not be scraped).
        return [None, None, None]

    detailed_url = f'https://www.imdb.com/name/{url}/'
    resp = requests.get(detailed_url)

    if resp.status_code != 200:
        return [None, None, None]

    soup = BeautifulSoup(resp.text, 'html.parser')

    birth_date = soup.find('time')
    if birth_date:
        birth_date = to_date(birth_date.text)

    height = soup.find(id='details-height')
    if height:
        height_text = height.text
        # The value of interest sits between the parentheses.
        height = height_to_float(height_text[height_text.find("(") + 1:height_text.find(")")])

    # Collapse whitespace, then percent-encode so characters such as '&'
    # or accented letters cannot corrupt the query string.
    query_name = quote_plus(' '.join(name.split()))
    search_url = f'https://www.imdb.com/search/name/?name={query_name}&gender=female'
    resp = requests.get(search_url)

    if resp.status_code != 200:
        return [birth_date, height, None]

    soup = BeautifulSoup(resp.text, 'html.parser')

    results_header = soup.find(string=re.compile('Name Matching'))
    gender = get_gender_female(results_header) if results_header is not None else None

    return [birth_date, height, gender]


In [8]:
def try_title(title):
    """Fetch audience and Tomatometer scores for a Rotten Tomatoes slug.

    Args:
        title: Slug inserted into ``https://www.rottentomatoes.com/m/{title}``.

    Returns:
        ``[audience_score, tomatometer]`` read from the ``audiencescore``
        and ``tomatometerscore`` attributes of the page's ``score-board``
        element. Returns ``[None, None]`` when the request does not return
        HTTP 200 or the page has no ``score-board`` element; a missing
        attribute yields None for that item.
    """
    search_url = f'https://www.rottentomatoes.com/m/{title}'
    resp = requests.get(search_url)

    if resp.status_code != 200:
        return [None, None]

    soup = BeautifulSoup(resp.text, 'html.parser')
    scores = soup.find('score-board')

    if scores is None:
        # Without this guard the score variables would never be bound and
        # the return below would raise UnboundLocalError.
        return [None, None]

    return [scores.get('audiencescore'), scores.get('tomatometerscore')]
    

In [9]:
def get_rotten_tomatoes(title, year):
    """Look up Rotten Tomatoes scores for a film, retrying with the year.

    The title is turned into a slug: apostrophes are removed, every run of
    non-word characters or underscores becomes a single separator, the text
    is lower-cased and the words are joined with '_'. If the plain slug
    does not yield both scores, ``{slug}_{year}`` is tried and its result
    is returned as-is.

    Args:
        title: Film title.
        year: Release year (string or int) appended for the retry.

    Returns:
        ``[audience_score, tomatometer]`` as returned by ``try_title``.
    """
    slug = title.replace("'", "")
    slug = '_'.join(re.sub(r'[\W_\s]+', ' ', slug).lower().split())

    audience_score, tomatometer = try_title(slug)
    if audience_score and tomatometer:
        return [audience_score, tomatometer]

    # Formatting (rather than '+') keeps this working for a non-string year.
    audience_score, tomatometer = try_title(f'{slug}_{year}')
    return [audience_score, tomatometer]
    

In [10]:
def get_top_grossers(offset):
    """Scrape one page of Box Office Mojo's worldwide lifetime-gross chart.

    For every row after the header row of the page's first table, the chart
    figures are combined with the results of ``get_details``, ``get_lead``,
    ``get_lead_bio`` and ``get_rotten_tomatoes``.

    Args:
        offset: Value for the chart URL's ``offset`` query parameter.

    Returns:
        A DataFrame indexed by chart rank with one row per film and the
        columns listed in ``headers`` below. An empty DataFrame with those
        columns is returned when the page has no data rows.

    Raises:
        AssertionError: If the chart request does not return HTTP 200.
    """
    url = f'https://www.boxofficemojo.com/chart/ww_top_lifetime_gross/?offset={offset}'

    resp = requests.get(url)

    assert resp.status_code == 200, f'No response from url: {url}.'

    soup = BeautifulSoup(resp.text, 'html.parser')

    table = soup.find('table')
    rows = list(table.find_all('tr'))

    headers = ['title', 'world_lifetime_gross', 'domestic_lifetime_gross', 'international_lifetime_gross',
               'studio', 'domestic_opening_gross', 'budget', 'release_date', 'MPAA_rating', 'run_time',
               'genre_list', 'lead_name', 'lead_birth_date', 'lead_height_m', 'lead_is_female',
               'audience_score', 'tomatometer']

    movies = {}

    # rows[0] is the header row. To debug specific movies with issues,
    # iterate over a narrower slice instead (e.g. rows[10:]).
    for row in tqdm(rows[1:]):
        items = row.find_all('td')
        rank = monetary_to_int(items[0].text)
        link = items[1].find('a')
        title, basic_url = link.text, link['href']
        worldwide_gross = monetary_to_int(items[2].text)
        domestic_gross = monetary_to_int(items[3].text)
        foreign_gross = monetary_to_int(items[5].text)
        year = items[7].text

        movie_list = [title, worldwide_gross, domestic_gross, foreign_gross]

        # The title id is the third '/'-separated segment of the link.
        basic_url = basic_url.split('/')[2]

        detail_list = get_details(basic_url)

        actor_list = get_lead(basic_url)

        # Skip the bio lookup when the lead could not be identified, so a
        # failed credits request does not abort the whole page.
        if actor_list[0] and actor_list[1]:
            actor_bio = get_lead_bio(actor_list)
        else:
            actor_bio = [None, None, None]

        scores = get_rotten_tomatoes(title, year)

        movies[rank] = movie_list + detail_list + [actor_list[0]] + actor_bio + scores

    if not movies:
        # Transposing an empty frame yields zero columns, so assigning the
        # 17 headers would raise a length-mismatch ValueError.
        return pd.DataFrame(columns=headers)

    movies_df = pd.DataFrame(movies).T
    movies_df.columns = headers

    return movies_df
    

In [None]:
# Output CSV and the range of chart offsets to scrape.
filename = 'top_gross_films.csv'
page_size = 200
min_movies = 0    # must be a multiple of 200
max_movies = 200  # must be a multiple of 200, should be ranking of last movie scraped
offsets = np.arange(min_movies, max_movies, page_size)

for num in offsets:
    df = get_top_grossers(num)

    # Append every page to the CSV. The header row is written only when the
    # file does not exist yet (append mode creates it in that case).
    df.to_csv(filename, mode='a', header=not os.path.exists(filename))


  0%|          | 1/200 [00:06<22:32,  6.79s/it]