### Scrapes all movies listed at www.boxofficemojo.com/alltime/domestic and pickles the results for further processing

In [6]:
import urllib2
from bs4 import BeautifulSoup
import re
import pickle
import time
import logging
logging.basicConfig(level=logging.DEBUG)

In [31]:
def get_all_movies(max_p=141):
    """Return the unique movie hrefs found on the all-time domestic chart pages.

    Requests pages 1 through ``max_p`` of
    www.boxofficemojo.com/alltime/domestic, takes the second table inside the
    element with id 'body', and collects every ``<a>`` whose href matches
    '/movies/?'. The 'page=releases&' fragment is removed from each href
    (per the original author's note, to keep only the first release).

    Args:
        max_p: Highest page number to request; pages are numbered from 1.
            Defaults to 141. A value below 1 requests nothing.

    Returns:
        A list of href strings in first-seen order with duplicates removed.
        A page that cannot be fetched or parsed is logged and skipped, so
        the result may be partial.
    """
    movie_link = re.compile('/movies/?')  # same pattern for every page
    urls = []
    seen = set()  # constant-time duplicate check; ``urls`` keeps the order

    for p in range(1, max_p + 1):
        url = "http://www.boxofficemojo.com/alltime/domestic.htm?page=%s&p=.htm" % p

        try:
            page = urllib2.urlopen(url)
            try:
                soup = BeautifulSoup(page, 'html.parser')
            finally:
                # release the connection even if parsing fails
                page.close()
            movie_urls = (soup.find(id='body').find_all('table')[1].
                          find_all("a", attrs={'href': movie_link}))
        except Exception:
            # best effort: record the failure and carry on with the next page
            logging.exception("Skipping chart page %s (%s)", p, url)
            continue

        for link in movie_urls:
            # str.replace is a no-op when the fragment is absent
            movie_url = link['href'].replace('page=releases&', '')
            if movie_url not in seen:
                seen.add(movie_url)
                urls.append(movie_url)

    return urls

In [18]:
def get_url(m_url):
    """Build the absolute boxofficemojo address for a site-relative movie link.

    ``m_url`` is appended unchanged to 'http://www.boxofficemojo.com'.
    """
    return 'http://www.boxofficemojo.com' + m_url

In [19]:
def get_movie_value(soup,field_name):
    """Return the text of the element that follows a labelled field.

    Searches ``soup`` for the first text node matching the regular
    expression ``field_name`` and returns the ``.text`` of that node's next
    sibling element.

    Args:
        soup: Parsed page (or any object exposing ``find(text=...)``).
        field_name: Label to look for; it is compiled as a regex, so regex
            metacharacters in the label are interpreted as such.

    Returns:
        The sibling's text, or None when the label is not found, has no
        next sibling, or ``soup`` lacks the needed attributes.
    """
    try:
        label = soup.find(text=re.compile(field_name))
        # The original tested the builtin ``object`` here, which is always
        # truthy, so this guard never fired; test the search result instead.
        if label is None:
            return None
        next_sibling = label.findNextSibling()
        if next_sibling is not None:
            return next_sibling.text
    except AttributeError:
        # e.g. ``soup`` is None because the page could not be parsed
        pass
    return None

In [20]:
def get_title(soup):
    """Return the movie title taken from the page's <title> element.

    Only the part after the LAST ' - ' separator is dropped, so hyphens
    inside the title itself survive ('Spider-Man (2002)', 'Star Wars:
    Episode I - The Phantom Menace'). The previous ``split("-")[0]`` cut
    such titles at their first hyphen.

    NOTE(review): assumes the <title> text has the form
    '<movie title> - <site name>' -- confirm against a live page.

    Args:
        soup: Parsed page (or any object exposing ``find("title")``).

    Returns:
        The stripped title string, or None when the page has no <title>
        element or ``soup`` lacks the needed attributes.
    """
    try:
        page_title = soup.find("title").text
        return page_title.rsplit(" - ", 1)[0].strip()
    except AttributeError:
        # no <title> element, or ``soup`` is None
        return None

In [21]:
def get_grosses(soup,gross):
    """Read a dollar amount from one of the page's 'mp_box_content' boxes.

    ``gross == 'TLG'`` (total lifetime gross) reads the first box; any other
    value (e.g. 'openWDG', opening weekend gross) reads the second. In both
    cases the value is the second cell of the box's first table row.

    Returns the amount as a float with '$' and ',' removed, or None when the
    cell is missing, empty, has no '$', or cannot be converted.
    """
    try:
        box_index = 0 if gross == 'TLG' else 1
        boxes = soup.find_all(class_='mp_box_content')
        first_row = boxes[box_index].find_all('tr')[0]
        amount = first_row.find_all('td')[1].text

        if not amount or "$" not in amount:
            return None
        return float(amount.replace("$", "").replace(",", ""))
    except Exception:
        # a missing box/row/cell or an unparsable number all mean "no value"
        return None
                

In [22]:
def opening_theaters(soup):
    """Return the opening-weekend theaters text, or None when it is missing.

    The text is the first cell of the second table row inside the page's
    second 'mp_box_content' box; it is returned unparsed.
    """
    try:
        second_box = soup.find_all(class_='mp_box_content')[1]
        second_row = second_box.find_all('tr')[1]
        return second_row.find_all('td')[0].text
    except Exception:
        # any missing box/row/cell means the page carries no such info
        return None

In [23]:
def players(soup,player):
    """List the names linked for one role (e.g. 'Actor', 'Director') on a page.

    Collects every <a> whose href matches '/people/chart/?view=<player>' and
    returns the link texts with leading/trailing '*' characters removed.
    Returns None when ``soup`` (or a matched link) lacks the needed attributes.
    """
    try:
        href_pattern = re.compile("/people/chart/[?]view=%s" % player)
        links = soup.find_all('a', attrs={'href': href_pattern})
        return [link.text.strip('*') for link in links]
    except AttributeError:
        return None

In [34]:
# Scrape every movie page into a dict and pickle the resulting list.
# NOTE(review): ``urls`` is not assigned in any visible cell; presumably it
# holds the result of get_all_movies() -- confirm before re-running.
all_movies = []

# Labels whose value is the text of the element following the label.
tab1 = ['Domestic Total', 'Distributor', 'Genre:', 'MPAA Rating', 'Release Date',
        'Runtime', 'Production Budget']
# Keys understood by get_grosses: lifetime gross / opening-weekend gross.
grosses = ['TLG', 'openWDG']
# Roles looked up through the '/people/chart/?view=<role>' links.
cast = ['Director', 'Writer', 'Actor', 'Producer', 'Composer']

for num, i in enumerate(urls):
    try:
        data = {}  # one record per movie
        url_m = get_url(i)
        page_m = urllib2.urlopen(url_m)
        try:
            soup_m = BeautifulSoup(page_m, 'html.parser')
        finally:
            # release the connection even if parsing fails
            page_m.close()

        data['title'] = get_title(soup_m)

        for field in tab1:
            data[field] = get_movie_value(soup_m, field)

        for gross in grosses:
            data[gross] = get_grosses(soup_m, gross)

        data['OWtheaters'] = opening_theaters(soup_m)

        for member in cast:
            data[member] = players(soup_m, member)

        # append dictionary to a list containing all movies
        all_movies.append(data)

    except Exception:
        # best effort: keep going, but leave a trace of what was skipped
        logging.exception('Failed to scrape %s', i)

    # Throttle the scraper. The longest pause must be tested first: every
    # multiple of 500 or 1000 is also a multiple of 100, so the original
    # 100/500/1000 elif order never reached the longer pauses.
    # Done outside the try so a failed page does not skip the pause.
    if num % 100 == 0:
        print('Events processed %d' % num)
        if num % 1000 == 0:
            time.sleep(60)
        elif num % 500 == 0:
            time.sleep(30)
        else:
            time.sleep(10)

# pickle data is binary; open the file in binary mode
with open('all_data.pkl', 'wb') as picklefile:
    pickle.dump(all_movies, picklefile)
    

Events processed 0
Events processed 100
Events processed 200
Events processed 300
Events processed 400
Events processed 500
Events processed 600
Events processed 700
Events processed 800
Events processed 900
Events processed 1000
Events processed 1100
Events processed 1200
Events processed 1300
Events processed 1400
Events processed 1500
Events processed 1600
Events processed 1700
Events processed 1800
Events processed 1900
Events processed 2000
Events processed 2100
Events processed 2200
Events processed 2300
Events processed 2400
Events processed 2500
Events processed 2600
Events processed 2700
Events processed 2800
Events processed 2900
Events processed 3000
Events processed 3100
Events processed 3200
Events processed 3300
Events processed 3400
Events processed 3500
Events processed 3600
Events processed 3700
Events processed 3800
Events processed 3900
Events processed 4000
Events processed 4100
Events processed 4200
Events processed 4300
Events processed 4400
Events processed 4500


In [36]:
# Number of movie records collected in all_movies by the scraping loop.
len(all_movies)

14006

In [25]:
# Reload a previously pickled list of movie records.
# Pickle data is binary, so the file is opened in 'rb' rather than text mode.
# NOTE: pickle.load can execute arbitrary code -- only load files produced
# by this notebook, never a pickle from an untrusted source.
with open("all_boxofficemojo.pkl", 'rb') as picklefile:
    my_old_data = pickle.load(picklefile)

In [30]:
# Report how many records were reloaded, then show the first ten titles.
print('Total number of events/movies pickled = %d' % len(my_old_data))
print('')  # blank separator line
for movie in my_old_data[:10]:
    print(movie['title'])

Total number of events/movies pickled = 14006

Star Wars: The Force Awakens (2015)
Avatar (2009)
Titanic (1997)
Jurassic World (2015)
Marvel's The Avengers (2012)
The Dark Knight (2008)
Star Wars: Episode I
Star Wars (1977)
Avengers: Age of Ultron (2015)
The Dark Knight Rises (2012)
