In [215]:
import requests
from bs4 import BeautifulSoup
import string
import pickle
import re

In [13]:
# NOTE(review): movie_urls is not referenced by any other cell visible here — confirm it is still needed.
movie_urls = []

In [24]:
# The lowercase ASCII letters, one list item each; get_all_movie_links walks
# the alphabetical index once per letter in this list.
alphabet = list(string.ascii_lowercase)

In [144]:
# Site root; create_url and the weekly-link cell prepend it to relative paths.
base_url = 'http://www.boxofficemojo.com'
# Path of the alphabetical movie index. It ends in '?' so that create_url can
# append the query parameters directly.
rel_by_alphabet = '/movies/alphabetical.htm?'

In [145]:
def create_url(letter,page):
    '''Build the absolute URL of one page of the alphabetical movie index.

    letter and page are formatted with %s into the `letter` and `page`
    query parameters, joined with '&', and appended to
    base_url + rel_by_alphabet. The finished URL string is returned.
    '''
    query = '&'.join(['letter=%s' % letter, 'page=%s' % page])
    return base_url + rel_by_alphabet + query

In [86]:
def get_soup(url, parser='html.parser', timeout=30):
    '''Download url and return its HTML parsed into a BeautifulSoup tree.

    url     -- absolute URL fetched with an HTTP GET
    parser  -- name of the BeautifulSoup parser to use. Naming it explicitly
               keeps the parse the same on every machine instead of letting
               BeautifulSoup pick whichever parser library is installed.
               Pass e.g. 'lxml' to use a different one.
    timeout -- seconds requests waits on the server before raising a
               timeout error; without it a stalled connection can block
               the whole scrape.

    The HTTP status is not checked: whatever body the server returns is parsed.
    '''
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.text, parser)

In [149]:
 def get_all_movie_links():
    '''Collect the relative link of every movie in the alphabetical index.

    For each letter in alphabet, index pages are requested starting at
    page 1 and continuing until a page contributes no movie links. After
    each letter the links gathered so far are pickled to movie_links.pkl,
    so an interrupted run still leaves the partial result on disk.

    Returns the list of hrefs that start with '/movies/?id='.
    '''
    movie_links = []
    for letter in alphabet:
        page = 1
        while True:
            body = get_soup(create_url(letter,page)).find('div',id='body')
            # A page without the expected container is treated as an empty
            # page instead of failing on None. href=True skips anchors that
            # carry no href attribute, which would raise KeyError below.
            anchors = body.find_all('a', href=True) if body is not None else []
            new_links = [a['href'] for a in anchors
                         if a['href'].startswith('/movies/?id=')]
            movie_links += new_links
            # Single pre-formatted string: prints the same text on Python 2 and 3.
            print('%s %s' % (letter, page))
            # Stop paging through this letter once a page adds nothing.
            if not new_links:
                break
            page += 1
        # Checkpoint after every letter.
        with open('movie_links.pkl', 'wb') as output:
            pickle.dump(movie_links, output, pickle.HIGHEST_PROTOCOL)
    return movie_links

In [150]:
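# Uncomment to rebuild the link list from the site (one request per index page).
# NOTE(review): the weekly_movie_links cell further down reads movie_links, so it
# must be defined first — by this call or by loading the movie_links.pkl file
# that get_all_movie_links writes. Confirm which is intended on a fresh kernel.
# movie_links = get_all_movie_links()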

In [213]:
import dateutil.parser

def to_date(datestring):
    '''Parse a date string into a datetime using dateutil.

    Returns None when datestring is None. get_movie_value returns None for a
    field it cannot find, and that value is passed straight in here, so a
    missing field yields None instead of an exception from the parser.
    Any error dateutil raises for text it cannot parse is propagated.
    '''
    if datestring is None:
        return None
    return dateutil.parser.parse(datestring)

def money_to_int(moneystring):
    '''Convert a dollar amount such as '$26,439' to an int.

    Dollar signs and commas are removed before the conversion.
    Returns None when moneystring is None. get_movie_value returns None for a
    field it cannot find, and that value is passed straight in here, so a
    missing field no longer fails on None.replace.
    Raises ValueError if the remaining text is not an integer.
    '''
    if moneystring is None:
        return None
    digits = moneystring.replace('$', '').replace(',', '')
    return int(digits)

def runtime_to_minutes(runtimestring):
    '''Convert a runtime string to a total number of minutes.

    The string is split on whitespace; the first token is read as hours and
    the third as minutes (a shape like '1 hrs. 28 min.').
    Returns None when the input is None or does not have that shape.
    '''
    try:
        # split() is inside the try so that a None input (no .split) also
        # ends in None rather than an uncaught AttributeError.
        parts = runtimestring.split()
        return int(parts[0])*60 + int(parts[2])
    except (AttributeError, IndexError, ValueError):
        # Only the failures these operations can produce are caught.
        return None

In [212]:
def get_movie_value(soup, field_name):
    '''Look up the value shown next to a field label in boxofficemojo HTML.

    The first text node matching field_name (used as a regular
    expression) is located, and the text of its next sibling element is
    returned. None is returned when no label matches or when the label
    has no following sibling.
    '''
    label = soup.find(text=re.compile(field_name))
    # for most fields the value sits in the sibling right after the label
    value_node = label.findNextSibling() if label else None
    return value_node.text if value_node else None

In [286]:
def get_weekly_dict(soup):
    '''Read the first 'chart-wide' table of a page into a dict of columns.

    Every row's <td> texts are collected and the first row is dropped.
    Columns 1 through 8 are then returned as lists of str keyed by column
    name. Column 0 ('week') is named but not included in the result.
    '''
    column_names = ['week','rank','weekly_gross','change_gross','num_theaters',
                    'theaters_change','avg_gross_by_theatre','cum_gross','week_num']
    chart = soup.find_all('table', { "class" : 'chart-wide'})[0]
    rows = []
    for tr in chart.find_all("tr"):
        rows.append([td.text for td in tr.find_all("td")])
    body_rows = rows[1:]
    return dict(
        (column_names[col], [str(row[col]) for row in body_rows])
        for col in range(1,9)
    )

In [287]:
def get_movie_attributes(soup):
    '''Assemble the scraped attributes of one movie page into a dict.

    Keys: 'title' (page <title> text up to the first '(', stripped),
    'domestic_total', 'release_date', 'runtime', 'rating' (each read with
    get_movie_value and, except for the rating, converted by the matching
    helper) and 'weekly_data' (the dict built by get_weekly_dict).
    '''
    page_title = soup.find('title').text
    return {
        'title': page_title.split('(')[0].strip(),
        'domestic_total': money_to_int(get_movie_value(soup,'Domestic Total')),
        'release_date': to_date(get_movie_value(soup,'Release Date')),
        'runtime': runtime_to_minutes(get_movie_value(soup,'Runtime')),
        'rating': get_movie_value(soup,'MPAA Rating'),
        'weekly_data': get_weekly_dict(soup),
    }

In [290]:
# NOTE(review): `soup` is not assigned in any earlier cell shown here — presumably it
# came from a get_soup(...) call on a weekly movie page; confirm before a fresh run.
s = get_movie_attributes(soup)

In [292]:
import pprint
# Pretty-print the record returned by get_movie_attributes.
pprint.pprint(s)

{'domestic_total': 175705,
 'rating': u'PG-13',
 'release_date': datetime.datetime(2013, 10, 4, 0, 0),
 'runtime': 88,
 'title': u'A.C.O.D.',
 'weekly_data': {'avg_gross_by_theatre': ['$8,813',
                                          '$2,717',
                                          '$1,546',
                                          '$1,298',
                                          '$719'],
                 'change_gross': ['-',
                                  '+85.0%',
                                  '+32.7%',
                                  '-62.0%',
                                  '-56.3%'],
                 'cum_gross': ['$26,439',
                               '$75,348',
                               '$140,264',
                               '$164,920',
                               '$175,705'],
                 'num_theaters': ['3', '18', '42', '19', '15'],
                 'rank': ['61', '55', '47', '54', '67'],
                 'theaters_change': ['-', '+15',

In [293]:
# Turn each relative movie link into an absolute URL, inserting page=weekly
# ahead of the link's existing query string.
# NOTE(review): in the cells shown, movie_links is only assigned by the
# commented-out get_all_movie_links() call — confirm how it is populated on a fresh kernel.
weekly_movie_links = [base_url+x.replace('/movies/?','/movies/?page=weekly&') for x in movie_links]
# Text searched for on a fetched page to detect that it has no weekly table.
no_weekly = 'NO WEEKLY DATA AVAILABLE'

In [294]:
# First ten weekly URLs, used as the sample for the scrape loop below.
test = weekly_movie_links[:10]

In [296]:
# Fetch every sample page and pretty-print its attributes. Pages that contain
# the no_weekly marker text are reported and skipped instead of being parsed.
for link in test:
    soup = get_soup(link)
    if soup.find(text=no_weekly):
        # Parenthesised single string: prints the same text on Python 2 and 3.
        print('no weekly')
        continue
    pprint.pprint(get_movie_attributes(soup))

{'domestic_total': 77222099,
 'rating': u'PG-13',
 'release_date': datetime.datetime(2010, 6, 11, 0, 0),
 'runtime': 117,
 'title': u'The A-Team',
 'weekly_data': {'avg_gross_by_theatre': ['$10,190',
                                          '$5,875',
                                          '$4,278',
                                          '$2,824',
                                          '$3,235',
                                          '$1,641',
                                          '$1,531',
                                          '$1,533',
                                          '$1,298',
                                          '$1,105',
                                          '$1,040',
                                          '$989',
                                          '$1,406',
                                          '$832'],
                 'change_gross': ['-',
                                  '-42.2%',
                                  '-55.6%',
