### Get the HTML from a page, and convert to a BeautifulSoup object
#### We'll start by scraping some information from Box Office Mojo.

In [73]:
import urllib2
import re
from bs4 import BeautifulSoup
import pickle

### My Own Testing

In [55]:
def build_soup_page(url):
    """
    builds a beautifulsoup object from a url

    url: address of the page to download (handed straight to urllib2.urlopen)

    returns: a BeautifulSoup object built from the response

    raises: nothing is caught here, so any error from urllib2.urlopen or from
    the parser propagates to the caller
    """
    page = urllib2.urlopen(url)
    try:
        # No parser is named, so bs4 falls back to whichever one is installed.
        return BeautifulSoup(page)
    finally:
        # BeautifulSoup has consumed the response by the time it returns, so
        # release the connection instead of leaving it open for the garbage
        # collector.
        page.close()

In [15]:
def build_top_index(soup, to_search, beg_url):
    """
    builds an index from a soup object with a search string

    soup: parsed page; only find_all('a', href=True) is called on it
    to_search: prefix an href must start with to be kept
    beg_url: string prepended to every kept href

    returns: the first half (rounded down) of the matching urls, in page order
    """
    index = [beg_url + a['href']
             for a in soup.find_all('a', href=True)
             if a['href'].startswith(to_search)]

    # Only the first half of the matches is returned -- presumably the page
    # lists its index links twice; confirm against the live markup.
    # Floor division keeps the slice bound an int even under true division.
    return index[:len(index) // 2]

In [16]:
def build_sub_index(soup, search_one, search_two, beg_url):
    """
    builds an index from a soup object with a prefix and a pattern

    soup: parsed page; only find_all('a', href=True) is called on it
    search_one: prefix an href must start with to be kept
    search_two: regular expression that must also match somewhere in the href
    beg_url: string prepended to every kept href

    returns: the first half (rounded down) of the matching urls, in page order
    """
    # Compile once instead of handing the raw pattern to re.search per link.
    pattern = re.compile(search_two)

    index = [beg_url + a['href']
             for a in soup.find_all('a', href=True)
             if a['href'].startswith(search_one)
             and pattern.search(a['href']) is not None]

    # Only the first half of the matches is returned -- presumably the page
    # lists its paging links twice; confirm against the live markup.
    # Floor division keeps the slice bound an int even under true division.
    return index[:len(index) // 2]

In [83]:
def build_single_movie_url_list(total_urls, box_url):
    """
    builds the collection of every movie's url

    total_urls: iterable of index-page urls; each one is downloaded with
        build_soup_page
    box_url: string prepended to every kept href

    returns: a set of unique urls (box_url + href)
    """
    # Collect straight into a set. The earlier list-based "not in" check
    # compared the raw href against already-prefixed urls, so it never filtered
    # anything and only added a linear scan per link.
    single_movie_urls = set()

    for full_site in total_urls:
        soup = build_soup_page(full_site)
        for a in soup.find_all('a', href=True):
            # Keeps any href that contains "id" anywhere.
            # NOTE(review): this also matches hrefs such as ".../video..." --
            # confirm that is acceptable for the pages being scraped.
            if 'id' in a['href']:
                single_movie_urls.add(box_url + a['href'])
    return single_movie_urls

### Build soup document and build index of movie pages from A-Z

In [56]:
# Download and parse the Box Office Mojo movies landing page; the resulting
# soup object holds the hrefs for all index pages A-Z & NUM.
movie_pages_az_level = build_soup_page("http://www.boxofficemojo.com/movies")

In [57]:
# Build the full url of each NUM and A-Z movie index page: hrefs that start
# with 'alphabetical' are prefixed with the movies base url.
movie_pages_az_urls = build_top_index(movie_pages_az_level, 'alphabetical', "http://www.boxofficemojo.com/movies/")

In [64]:
#movie_pages_az_urls[1]

### Build sub level pages for each letter of title

In [70]:
# Start from a copy of the letter-level index pages, then add the paginated
# sub-pages found on each of them.
total_urls = list(movie_pages_az_urls)

for movie_page in movie_pages_az_urls:
    top_level_soup = build_soup_page(movie_page)
    sub_level = build_sub_index(top_level_soup, '/movies/', 'page', "http://www.boxofficemojo.com")
    # Keep only the sub-page links that do not contain "id".
    total_urls.extend(url for url in sub_level if 'id' not in url)

In [203]:
# Preview the first ten urls in lexicographic order (so 'page=10' sorts before
# 'page=2'); the notebook echoes the result below.
sorted(total_urls)[:10]

['http://www.boxofficemojo.com/movies/alphabetical.htm?letter=A&p=.htm',
 'http://www.boxofficemojo.com/movies/alphabetical.htm?letter=A&page=10&p=.htm',
 'http://www.boxofficemojo.com/movies/alphabetical.htm?letter=A&page=2&p=.htm',
 'http://www.boxofficemojo.com/movies/alphabetical.htm?letter=A&page=3&p=.htm',
 'http://www.boxofficemojo.com/movies/alphabetical.htm?letter=A&page=4&p=.htm',
 'http://www.boxofficemojo.com/movies/alphabetical.htm?letter=A&page=5&p=.htm',
 'http://www.boxofficemojo.com/movies/alphabetical.htm?letter=A&page=6&p=.htm',
 'http://www.boxofficemojo.com/movies/alphabetical.htm?letter=A&page=7&p=.htm',
 'http://www.boxofficemojo.com/movies/alphabetical.htm?letter=A&page=8&p=.htm',
 'http://www.boxofficemojo.com/movies/alphabetical.htm?letter=A&page=9&p=.htm']

### Need to loop through and filter out urls that have foreign pages and have u.k. values

In [84]:
#movie_urls
#single_movies_A = build_single_movie_url_list([total_urls[1]], 'http://www.boxofficemojo.com')
#for url in sorted(total_urls)[0:10]:

In [210]:
#single_movies_A

### Testing on a single movie page if there is a foreign page

In [164]:
# Download and parse one movie page (id "ateam") to experiment on.
test = build_soup_page('http://www.boxofficemojo.com/movies/?id=ateam.htm')

In [187]:
test_list2 = []
#print test.find(attrs={'valign': re.compile('Genre')})
# Locate the text node matching 'Genre: ', then print the text of the element
# that follows it.
genre_label = test.find(text = re.compile('Genre: '))
print(str(genre_label.findNextSibling().text))
#test_val = str(obj.findNextSibling().text)
#test_val == 'Action'

Action


In [194]:
# Accumulates the urls of movies that pass the filter loop in the next cell.
test_list = []

In [195]:
# Keep movies that link to an international page ('page=intl') and whose genre
# is not 'Foreign'.
# NOTE: single_movies_A is not defined in the visible cells.
# NOTE: a page without a 'Genre: ' label raises AttributeError here.
intl_link = re.compile('page=intl')
genre_label = re.compile('Genre: ')

for movie in single_movies_A:
    movie_soup = build_soup_page(movie)
    if movie_soup.find(attrs={'href': intl_link}) is None:
        continue
    genre = str(movie_soup.find(text = genre_label).findNextSibling().text)
    if genre != 'Foreign':
        test_list.append(movie)

### The urls in test_list are the movies that aren't foreign but have an international page

In [213]:
#for url in sorted(test_list):
#    print url

### Pickle things (save them to a file) as needed

In [74]:
def store_pickles(filename, to_store):
    """
    pickles an object to a file

    filename: path of the file to write (an existing file is overwritten)
    to_store: the object to pickle
    """
    # Pickle output is bytes, so the file has to be opened in binary mode;
    # text mode fails outright on Python 3.
    with open(filename, 'wb') as f:
        pickle.dump(to_store, f)

In [205]:
def eat_pickles(filename):
    """
    loads a pickled object back from a file

    filename: path of the pickle file to read

    returns: the unpickled object
    """
    # SECURITY: unpickling can execute arbitrary code -- only load files this
    # notebook wrote itself.
    # Pickle data is bytes, so the file has to be opened in binary mode.
    with open(filename, 'rb') as f:
        return pickle.load(f)

In [206]:
# Load the previously pickled data from 'page_data.pkl' (relative path).
# Presumably a dict keyed by movie url, going by the commented-out lookup in
# the next cell -- TODO confirm.
jar_of_pickles = eat_pickles('page_data.pkl')

In [223]:
#jar_of_pickles['http://www.boxofficemojo.com/movies/?id=ateam.htm']

### Build dictionary of actor name as key, number of movies as value

In [25]:
# Intended mapping of actor name -> number of movies (per the heading above);
# nothing in the visible cells fills it in.
actor_names = {}

In [42]:
# Filled by the loop in the next cell with element texts that contain no '$'.
actor_data = []

In [35]:
# Collect the text of every right-aligned element that has no dollar sign.
# NOTE: sub_people_one is not defined in the visible cells.
for el in sub_people_one.findAll(align='right'):
    # A plain substring test replaces the old regex, whose non-raw pattern
    # relied on an invalid string escape.
    if '$' not in el.text:
        actor_data.append(el.text)

In [3]:
#actor_data

In [8]:
# Filled by the loop in the next cell with the text of matching links.
new_actor_data = []

In [9]:
# Gather the link text of every anchor whose href starts with './chart/?view'.
# NOTE: test2 is not defined in the visible cells.
new_actor_data.extend(
    a.text
    for a in test2.find_all('a', href=True)
    if a['href'].startswith('./chart/?view')
)

### Start building data structure of features

In [11]:
def get_movie_value(soup, field_name):
    """
    looks up a field label on a movie page and returns the text of the element
    that follows it (the value for that attribute)

    soup: parsed movie page
    field_name: regular expression matched against the page's text nodes

    returns: the text of the label's next sibling, or None when either the
    label or its sibling is missing
    """
    pattern = re.compile(field_name)
    label = soup.find(text = pattern)
    # A missing label means there is no sibling to look at either.
    sibling = label.findNextSibling() if label else None
    if sibling:
        return sibling.text
    return None

In [16]:
# Collect [domestic total, runtime, MPAA rating, release date] for the first
# 1000 movie urls.
# NOTE: single_movies is not defined in the visible cells; it must be sliceable
# (a list), unlike the set returned by build_single_movie_url_list.
test = []

for movie in single_movies[:1000]:
    try:
        soup = build_soup_page(movie)
    except Exception:
        # Best-effort scrape: skip pages that fail to download or parse, but
        # let KeyboardInterrupt / SystemExit through so a long run can still
        # be stopped.
        continue
    # Each value is None when its label is not found on the page.
    dtg = get_movie_value(soup, "Domestic Total")
    runtime = get_movie_value(soup, "Runtime")
    rating = get_movie_value(soup, "MPAA Rating")
    release_date = get_movie_value(soup, "Release Date")
    test.append([dtg, runtime, rating, release_date])

In [106]:
#for el in test:
#    print el