In [137]:
import os
import pickle
import re

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [138]:
def save_requests(main, pages, f_prefix='request', adj='', start=1, stop=None):
    """
    Download a run of subpages and cache each one through save_request.

    Nothing is returned; the function is called for its side effect. The
    cache file for the i-th page (1-based) is named f_prefix followed by i.
    Arguments:
    main: homepage url of requests as string
    pages: list of subpages as strings
    f_prefix: filename prefix for the cached responses
    adj: extension appended to every url (for the adjusted pages)
    start: 1-based index of the first page to fetch
    stop: 1-based index of the last page to fetch (None = through the end)
    """
    selected = pages[start - 1:stop]
    index = start
    for subpage in selected:
        url = main + subpage + adj
        save_request(url, f'{f_prefix}{index}')
        index += 1

def save_request(url, filename):
    """
    Fetch url and store the raw response body at requests/<filename>.

    Arguments:
    url: full url to download
    filename: name of the cache file inside the requests/ subdirectory

    Raises requests.exceptions.RequestException when the request times out,
    fails, or comes back with an HTTP error status; no file is written then.
    """
    # A timeout keeps one stalled connection from hanging a long crawl forever.
    response = requests.get(url, timeout=30)
    # Never cache a 4xx/5xx error page as if it were real content.
    response.raise_for_status()
    # Create the cache directory on first use so a fresh checkout works.
    os.makedirs('requests', exist_ok=True)
    with open(f'requests/{filename}', 'wb') as f:
        f.write(response.content)

def make_soup(filename):
    """Parse the cached response requests/<filename> with the lxml parser."""
    cache_path = f'requests/{filename}'
    with open(cache_path, 'rb') as handle:
        # Parsing happens inside the with-block, while the file is still open.
        soup = BeautifulSoup(handle, 'lxml')
    return soup

def get_soups(f_prefix, n):
    """
    Parse the cached files f_prefix1 .. f_prefixn and return the soups as a list.

    All n parsed documents are held in the returned list at the same time.
    """
    return [make_soup(f'{f_prefix}{index}') for index in range(1, n + 1)]

def get_soup(url):
    """
    Download url and return its body parsed with the lxml parser.

    Raises requests.exceptions.RequestException when the request times out,
    fails, or comes back with an HTTP error status.
    """
    # A timeout keeps a stalled connection from blocking the notebook forever.
    response = requests.get(url, timeout=30)
    # Fail loudly instead of parsing an error page as if it were content.
    response.raise_for_status()
    return BeautifulSoup(response.text, "lxml")

def get_links(soup, page):
    """
    Return the set of href values in soup that start with page.

    Arguments:
    soup: parsed document exposing find_all('a')
    page: url prefix the links have to start with

    Anchors without an href attribute (e.g. named anchors) are ignored
    instead of raising KeyError.
    """
    # .get() yields None for anchors that carry no href attribute.
    hrefs = (a.get('href') for a in soup.find_all('a'))
    return {href for href in hrefs if href is not None and href.startswith(page)}

def get_AZ(main, page):
    """
    Collect the alphabetical index pages reachable from main + page.

    Every link on the entry page that starts with page is fetched, and the
    matching links found on those pages are gathered. The result is a list
    with the hard-coded NUM page first, followed by the other links sorted.
    Raises KeyError if the NUM page is not among the gathered links.
    """
    num_page = '/movies/alphabetical.htm?letter=NUM&p=.htm'
    entry_links = get_links(get_soup(main + page), page)
    found = set()
    for href in entry_links:
        found |= get_links(get_soup(main + href), page)
    # The NUM page is pulled out so it can be placed ahead of the sorted rest.
    found.remove(num_page)
    ordered = sorted(found)
    ordered.insert(0, num_page)
    return ordered

def get_tables_movies(AZ_soups):
    """
    Extract the movie rows and movie links from the parsed index pages.

    Arguments:
    AZ_soups: iterable of parsed index pages

    Returns (tables, movies):
    tables: list of rows; the first row is the header, every other row is
            the movie link followed by the text of each cell in its row
    movies: list of movie links in first-seen order, without duplicates

    The rows are read from the fourth <table> of each page and its first
    row is skipped. Rows without a link are ignored.
    """
    tables = [['Link', 'Title', 'Studio', 'Total Gross', 'Total Theaters',
               'Opening', 'Opening Theaters', 'Open']]
    movies = []
    # Set membership keeps de-duplication linear; `link in movies` on the
    # list would rescan every previously collected link for each new row.
    seen = set()
    for soup in AZ_soups:
        rows = soup.find_all('table')[3].find_all('tr')
        for tr in rows[1:]:
            anchor = tr.find("a")
            link = anchor.get('href') if anchor is not None else None
            if link is None or link in seen:
                continue
            seen.add(link)
            cells = [td.get_text(separator=' ') for td in tr.find_all('td')]
            tables.append([link] + cells)
            movies.append(link)
    return tables, movies

def get_title_year(soup):
    """
    Split the page title into (title, year).

    The text before " - Box Office Mojo" is used. When it ends in a
    parenthesised four-digit year, e.g. "Avatar (2009)", that year is
    returned as a string and removed from the title; otherwise year is None.
    A missing or empty <title> yields ('', None) instead of raising.
    """
    title_tag = soup.title
    raw = title_tag.text if title_tag is not None else ''
    text = raw.partition(" - Box Office Mojo")[0].strip()
    # Only a trailing "(YYYY)" counts as a year; other parentheticals such as
    # "(Re-issue)" stay part of the title.
    match = re.fullmatch(r'(.*?)\s*\((\d{4})\)', text, flags=re.DOTALL)
    if match:
        return match.group(1), match.group(2)
    return text, None

def get_series(soup):
    """
    Return the text that follows 'Series: ' in the first string of soup
    containing 'Series:', or None when there is no such string.
    """
    hit = soup.find(text=re.compile('Series:'))
    if not hit:
        return hit
    # partition() leaves the part after the label in the last slot.
    _, _, name = hit.partition('Series: ')
    return name

def get_movie_value(soup, field_name):
    '''Grab a value from boxofficemojo HTML

    Finds the first string in soup matching field_name (used as a regular
    expression) and returns the text of the node that follows it, with
    nested text pieces joined by ', '.

    Lookup order: the next sibling if there is one; otherwise the next
    element, and, if that has no text accessor, the element after it.
    Returns None if nothing is found.
    '''
    obj = soup.find(text=re.compile(field_name))
    if not obj:
        return None
    # this works for most of the values
    next_sibling = obj.next_sibling
    if next_sibling:
        return next_sibling.get_text(separator=', ')
    next_element = obj.next_element
    if not next_element:
        return None
    try:
        return next_element.get_text(separator=', ')
    except AttributeError:
        pass
    try:
        return next_element.next_element.get_text(separator=', ')
    except AttributeError:
        # Only a missing attribute means "no value"; anything else
        # (including KeyboardInterrupt) must propagate to the caller.
        return None

def get_movie_info(movies, soups, adj=False, header=False):
    """
    Build one row of scraped fields per (movie link, parsed page) pair.

    Arguments:
    movies: movie links, stored in the 'page' column
    soups: parsed movie pages, in the same order as movies
    adj: if true, collect only page, title, year and the three money
         fields (header names carry an _adj suffix)
    header: if true, the first row of the result holds the column names

    Returns a list of rows (lists); fields not found on a page are None.
    """
    money_labels = ('Weekend:', 'Domestic Total', 'Budget:')
    detail_labels = ('Distributor', 'MPAA Rating', 'Release Date', 'Genre:',
                     'Runtime', 'Director', 'Writer', 'Actor', 'Producer')

    if header and adj:
        movie_info = [['page', 'title', 'year', 'opening_adj', 'domestic_adj',
                       'budget_adj']]
    elif header:
        movie_info = [['page', 'title', 'year', 'opening', 'domestic',
                       'budget', 'series', 'distr', 'rating', 'release', 'genre',
                       'runtime', 'directors', 'writers', 'actors', 'producers']]
    else:
        movie_info = []

    for page, soup in zip(movies, soups):
        title, year = get_title_year(soup)
        row = [page, title, year]
        row.extend(get_movie_value(soup, label) for label in money_labels)
        if not adj:
            # The series sits between the money fields and the other details.
            row.append(get_series(soup))
            row.extend(get_movie_value(soup, label) for label in detail_labels)
        movie_info.append(row)
    return movie_info

def make_pickles(objs):
    """Pickle each object to its file; objs is an iterable of (filename, object) pairs."""
    for entry in objs:
        target, payload = entry
        with open(target, "wb") as sink:
            pickle.dump(payload, sink)

def get_pickles(pkls):
    """
    Load pickled objects from the given list of filenames.

    Returns the object itself when exactly one filename is given, otherwise
    a list of objects in the same order as pkls (empty list for no files).
    Unpickling can execute arbitrary code, so only load files you trust.
    """
    def _load(path):
        with open(path, "rb") as source:
            return pickle.load(source)

    if len(pkls) != 1:
        return [_load(path) for path in pkls]
    return _load(pkls[0])

import dateutil.parser

def to_date(datestring):
    """Parse datestring with dateutil.parser.parse and return the result."""
    return dateutil.parser.parse(datestring)

def money_to_int(moneystring):
    """
    Convert a money string to an integer amount.

    Accepts plain amounts such as '$1,234,567' as well as amounts written
    with a magnitude word, e.g. '$25 million' or '$1.5 billion'.
    Raises ValueError when the text is not a recognisable amount (e.g. 'N/A').
    """
    multipliers = {'thousand': 10**3, 'million': 10**6, 'billion': 10**9}
    cleaned = moneystring.replace('$', '').replace(',', '').strip().lower()
    parts = cleaned.split()
    if len(parts) == 2 and parts[1] in multipliers:
        # round() guards against float artefacts such as 2.3 * 10**6.
        return int(round(float(parts[0]) * multipliers[parts[1]]))
    return int(cleaned)

def runtime_to_minutes(runtimestring):
    """
    Convert a runtime string to total minutes.

    The first whitespace-separated token is read as hours and the third as
    minutes (e.g. '2 hrs. 15 min.' gives 135). Returns None when the value
    is missing (not a string) or does not have that shape.
    """
    try:
        runtime = runtimestring.split()
        return int(runtime[0]) * 60 + int(runtime[2])
    except (AttributeError, IndexError, ValueError):
        # AttributeError: not a string (e.g. None); IndexError: too few
        # tokens; ValueError: non-numeric token. Other errors propagate.
        return None

In [139]:
# Base url of the site, the entry page of its alphabetical movie index, and
# the url suffix passed as `adj` when the adjusted movie pages are saved
# (presumably adjusts grosses to 2017 dollars — confirm).
main = 'http://www.boxofficemojo.com'
page = '/movies/alphabetical.htm'
adj = '&adjust_yr=2017&p=.htm'

In [66]:
# Index crawl steps, currently disabled (presumably to avoid re-downloading).
#AZ = get_AZ(main, page)
#save_requests(main, AZ, 'AZ')
#AZ_soups = get_soups('AZ', len(AZ))
# NOTE(review): AZ_soups is only assigned by the commented-out line above, so
# on a fresh kernel this call raises NameError unless that line is re-enabled.
AZ_tables, movies = get_tables_movies(AZ_soups)

In [69]:
# Cache the index page list, the index rows and the movie links on disk.
# NOTE(review): AZ is only assigned by a commented-out line or by loading
# AZ.pkl, so this cell depends on earlier kernel state.
make_pickles([('AZ.pkl', AZ), ('AZ_tables.pkl', AZ_tables), ('movies.pkl',movies)])

In [3]:
# Restore the three cached objects; with several filenames get_pickles
# returns a list in the same order, which is unpacked here.
AZ, AZ_tables, movies = get_pickles(['AZ.pkl', 'AZ_tables.pkl', 'movies.pkl'])

In [110]:
# Download the unadjusted movie pages from the 17557th entry of `movies`
# through the end (presumably resuming an interrupted crawl — confirm the
# start index before re-running).
start = 17557
stop = None
save_requests(main, movies, 'movies', '', start, stop)

In [111]:
# Same download for the adjusted pages: `adj` is appended to every url and
# the files are saved under the 'movieadj' prefix.
start = 17557
save_requests(main, movies, 'movieadj', adj, start)

In [112]:
# Parse every cached 'movies<i>' file; all len(movies) soups are kept in one list.
movies_soups = get_soups('movies', len(movies))

In [113]:
# Parse every cached 'movieadj<i>' file, in the same order as `movies`.
movieadj_soups = get_soups('movieadj', len(movies))

In [29]:
# Download the budget listing pages; the page suffixes are '1', '101', ...,
# '5401' (55 pages) and the files are saved under the 'numbers' prefix.
# NOTE(review): this rebinds `main` (and `start`), so cells that expect the
# Box Office Mojo base url must not be run after this one without resetting it.
main = 'http://www.the-numbers.com/movie/budgets/all/'
pages = [str(100*x+1) for x in range(55)]

start = 1
save_requests(main, pages, 'numbers', '', start)

In [114]:
%%time
# Extract the full set of fields from the unadjusted pages; row 0 is the header.
movie_info = get_movie_info(movies, movies_soups, adj=False, header=True)

Wall time: 5min 55s


In [115]:
# Cache the extracted rows so the slow extraction need not be repeated.
make_pickles([('movie_info.pkl', movie_info)])

In [116]:
%%time
# Extract only page, title, year and the three money fields from the adjusted pages.
movieadj_info = get_movie_info(movies, movieadj_soups, adj=True, header=True)

Wall time: 1min 17s


In [117]:
# Cache the adjusted rows on disk.
make_pickles([('movieadj_info.pkl', movieadj_info)])

In [121]:
# Build the frame straight from the row lists (row 0 supplies the column
# names). Going through np.array first is unnecessary and forces every cell
# through a single NumPy dtype before pandas sees the data.
movie_df = pd.DataFrame(movie_info[1:], columns=movie_info[0])

In [122]:
# Build the frame straight from the row lists (row 0 supplies the column
# names). Going through np.array first is unnecessary and forces every cell
# through a single NumPy dtype before pandas sees the data.
movieadj_df = pd.DataFrame(movieadj_info[1:], columns=movieadj_info[0])

In [123]:
# Cache both frames on disk.
make_pickles([('movie_df.pkl',movie_df),('movieadj_df.pkl',movieadj_df)])

In [124]:
# Join the unadjusted and adjusted fields on the shared key columns; no `how`
# is given, so pandas' default join type applies.
df = movie_df.merge(movieadj_df, on=['page', 'title', 'year'])

In [125]:
# Cache the merged frame on disk.
make_pickles([('df.pkl',df)])

In [126]:
# Tabulate the index rows; the first entry of AZ_tables is the header row.
df_AZ = pd.DataFrame(AZ_tables[1:], columns=AZ_tables[0])

In [127]:
# Cache the index frame on disk.
make_pickles([('df_AZ.pkl',df_AZ)])

In [128]:
# Parse the cached 'numbers1' .. 'numbers55' files; 55 matches the number of
# page suffixes generated with range(55) when the pages were downloaded.
numbers_soups = get_soups('numbers', 55 )

In [129]:
def get_tables_budgets(numbers_soups):
    """
    Collect the budget rows from the parsed budget listing pages.

    Only the odd-indexed <tr> rows of the first <table> on each page are
    read, and the first cell of every row is dropped. A mis-decoded
    apostrophe sequence in the cell text is replaced with a plain one.
    Returns a list of rows whose first entry is the header row.
    """
    tables = [['Release Date', 'Movie', 'Production Budget', 'Domestic Gross', 'Worldwide Gross']]
    for soup in numbers_soups:
        rows = soup.find('table').find_all('tr')
        for position, tr in enumerate(rows):
            if position % 2 == 0:
                continue
            cells = [td.get_text().replace('â\x80\x99', '\'') for td in tr.find_all('td')]
            tables.append(cells[1:])
    return tables

In [130]:
# Extract the budget rows (header row first) from all parsed listing pages.
numbers_tables = get_tables_budgets(numbers_soups)

In [132]:
# Tabulate the budget rows; the first entry of numbers_tables is the header row.
df_numbers = pd.DataFrame(numbers_tables[1:], columns=numbers_tables[0])

In [133]:
# Show every row that has an exact duplicate (all copies are kept in the
# mask), sorted by movie name for inspection.
df_numbers[df_numbers.duplicated(keep=False)].sort_values(by='Movie')

Unnamed: 0,Release Date,Movie,Production Budget,Domestic Gross,Worldwide Gross
5308,12/31/2007,A Dog's Breakfast,"$120,000",$0,$0
5309,12/31/2007,A Dog's Breakfast,"$120,000",$0,$0


In [134]:
# Remove the exact duplicate rows; this mutates df_numbers in place.
df_numbers.drop_duplicates(inplace=True)

In [135]:
# Cache the frame and the raw row list. Only df_numbers had duplicates
# dropped; numbers_tables is saved as extracted.
make_pickles([('df_numbers.pkl', df_numbers), ('numbers_tables.pkl', numbers_tables)])