### Get the HTML from a page, and convert to a BeautifulSoup object
#### We'll start by scraping some information from Box Office Mojo.

In [2]:
import urllib2
import re
from bs4 import BeautifulSoup
import string

In [12]:
# Download a single Box Office Mojo movie page and parse it.
# `soup` is the parsed document that the following cells query.
url = "http://www.boxofficemojo.com/movies/?id=biglebowski.htm"
page = urllib2.urlopen(url)
soup = BeautifulSoup(page)

In [7]:
#print soup

### soup.find()
  Search for a type of tag by using the tag as a string (like 'body', 'div', 'a', 'p', etc) as an argument

In [13]:
soup.find("a")

<a href="/goto.php?a=5" target="4"><font face="Verdana" size="3"><b>'Furious 7' hits $800 million worldwide... &gt;</b></font><br/></a>

In [11]:
#print soup.a
print soup.a.prettify()

<a href="/goto.php?a=5" target="4">
 <font face="Verdana" size="3">
  <b>
   'Furious 7' hits $800 million worldwide... &gt;
  </b>
 </font>
 <br/>
</a>



### soup.find_all()
  Returns a list of all matches.

In [17]:
#for link in soup.find_all('a'):
#    print link

# retrieve the url from an anchor tag: indexing a tag with an attribute name
# (here 'href') gives that attribute's value
print soup.find('a')
print
print soup.find('a')['href']

<a href="/goto.php?a=5" target="4"><font face="Verdana" size="3"><b>'Furious 7' hits $800 million worldwide... &gt;</b></font><br/></a>

/goto.php?a=5


In [28]:
# you can match on an attribute like a class, or an id
print soup.find(class_="mp_box_content").find_all('td')

[<td width="40%"><b>Domestic:</b></td>, <td align="right" width="35%"> <b>$17,451,873</b></td>]


In [36]:
#soup.find(id="hp_footer").encode('utf-8')
#print soup.find(id="hp_footer")

## Items to scrape for in each movie:

### movie title, total domestic gross, release date, runtime, rating

In [69]:
# Movie title
print soup.find('title')
title_string = soup.find('title').text
title_string =  title_string.split("(")[0].strip()

<title>The Big Lebowski (1998) - Box Office Mojo</title>


In [58]:
# Domestic Total Gross
# exact match: the text node must equal this string, trailing space included
print soup.find(text='Domestic Total Gross: ')

# regex match: finds the text node containing the label, then reads the
# text of the element that follows it (the dollar amount)
dtg_string = soup.find(text=re.compile("Domestic Total Gross:"))
print dtg_string.findNextSibling().text

Domestic Total Gross: 
$17,451,873


In [62]:
def get_movie_value(soup, field_name):
    """
    Look up a labelled value on a movie page.

    Finds the first text node matching the regular expression field_name and
    returns the text of the element that follows it (the value for that
    attribute). Returns None when the label is not found or nothing follows it.
    """
    label = soup.find(text=re.compile(field_name))
    value_tag = label.findNextSibling() if label else None
    return value_tag.text if value_tag else None

In [66]:
dtg = get_movie_value(soup, "Domestic Total")
print dtg

#run time
runtime = get_movie_value(soup, "Runtime")
print runtime

#rating
rating = get_movie_value(soup, "MPAA Rating")
print rating

#release date
release_date = get_movie_value(soup, "Release Date")
print release_date

$17,451,873
1 hrs. 57 min.
R
March 6, 1998


In [77]:
# Pair each column name with the value scraped for this movie and start the
# list of per-movie records with that single dictionary.
headers = ["movie title", "domestic total gross", "release date", "runtime", "rating"]
values = [title_string, dtg, release_date, runtime, rating]
movie_dict = dict(zip(headers, values))
movie_data = [movie_dict]

In [78]:
print movie_data

[{'rating': u'R', 'runtime': u'1 hrs. 57 min.', 'movie title': u'The Big Lebowski', 'domestic total gross': u'$17,451,873', 'release date': u'March 6, 1998'}]


### My Own Testing

In [201]:
box_office_movies = "http://www.boxofficemojo.com/movies/"
box_office = "http://www.boxofficemojo.com"

# Every href containing 'id' found on the letter pages and sub-letter pages.
single_movie_url_tags = []

# NOTE(review): alpha_index and sub_alpha_index are not defined in this cell;
# both must already exist in the session before it is run.
for letter_url in alpha_index:
    letter_movies_url = box_office_movies + letter_url
    url = letter_movies_url
    try:
        page = urllib2.urlopen(url)
    except Exception:
        # Best effort: skip a letter page that fails to load. Catching
        # Exception rather than using a bare except keeps Ctrl-C working.
        continue
    soup = BeautifulSoup(page)
    for a in soup.find_all('a', href=True):
        if 'id' in a['href']:
            single_movie_url_tags.append(a['href'])
    for sub_letter_url in sub_alpha_index:
        sub_letter_movies_url = box_office + sub_letter_url
        url = sub_letter_movies_url
        try:
            page = urllib2.urlopen(url)
        except Exception:
            # Same best-effort policy for the sub-pages.
            continue
        soup = BeautifulSoup(page)
        for a in soup.find_all('a', href=True):
            if 'id' in a['href']:
                single_movie_url_tags.append(a['href'])

### Fresh Start

In [3]:
def build_soup_page(url):
    """
    builds a beautifulsoup object from a url

    url: address of the page to download
    Returns the BeautifulSoup object parsed from the response.
    Any error raised by urllib2.urlopen propagates to the caller.
    """
    page = urllib2.urlopen(url)
    try:
        # BeautifulSoup reads the whole response while constructing the
        # object, so the connection can be released as soon as it returns.
        return BeautifulSoup(page)
    finally:
        page.close()

In [4]:
def build_index(soup, to_search):
    """
    builds an index from a soup object with a search string

    soup: parsed page exposing find_all
    to_search: prefix that an anchor's href must start with
    Returns the first half (rounded down) of the matching hrefs, in document
    order.
    NOTE(review): presumably the page repeats the same links twice, which is
    why only the first half is kept -- confirm against the live page.
    """
    index = [
        a['href']
        for a in soup.find_all('a', href=True)
        if a['href'].startswith(to_search)
    ]
    # Floor division keeps the slice bound an integer even where "/" means
    # true division (Python 3, or "from __future__ import division").
    end_index = len(index) // 2

    return index[:end_index]

In [5]:
def build_sub_index(soup, to_search, to_count):
    """
    builds an index from a soup object with a search and count string

    soup: parsed page exposing find_all
    to_search: prefix that an anchor's href must start with
    to_count: substring that must also occur somewhere in the href
    Returns the first half (rounded down) of the matching hrefs, in document
    order.
    NOTE(review): presumably the page repeats the same links twice, which is
    why only the first half is kept -- confirm against the live page.
    """
    index = [
        a['href']
        for a in soup.find_all('a', href=True)
        if a['href'].startswith(to_search) and to_count in a['href']
    ]
    # Floor division keeps the slice bound an integer even where "/" means
    # true division (Python 3, or "from __future__ import division").
    end_index = len(index) // 2

    return index[:end_index]
    

In [6]:
# builds the top level pages, i.e. NUM and A-Z movie pages
top_level = build_soup_page("http://www.boxofficemojo.com/movies")

In [7]:
# builds an alpha index of each site of NUM and A-Z movie pages
alpha_index = build_index(top_level, 'alphabetical')

In [87]:
#alpha_index

In [85]:
# builds a total index of all urls for all movies on all pages
tot_index = []
box_office_movies = "http://www.boxofficemojo.com/movies/"
box_office = "http://www.boxofficemojo.com"

for site in alpha_index:
    # The page that is fetched is the same page that is recorded, so build
    # its address once from box_office_movies.
    site_url = box_office_movies + site
    sub_level = build_soup_page(site_url)
    tot_index.append(site_url)
    # extra result pages for this letter: '/movies/...' hrefs containing 'page'
    sub_index_lst = build_sub_index(sub_level, '/movies/', 'page')
    for sub_site in sub_index_lst:
        # hrefs containing 'id' are left out of the index
        if 'id' not in sub_site:
            tot_index.append(box_office + sub_site)

In [10]:
def build_single_movie_url(comp_url):
    """
    Collects absolute urls from every index page in comp_url.

    Each page is downloaded with build_soup_page; every anchor href that
    contains 'id' is prefixed with the module-level box_office and kept.
    The '/movies/?id=fast7.htm' href is left out while scanning and added
    exactly once at the end of the returned list.
    """
    fast7_href = '/movies/?id=fast7.htm'
    movie_urls = []

    for page_url in comp_url:
        page_soup = build_soup_page(page_url)
        movie_urls.extend(
            box_office + anchor['href']
            for anchor in page_soup.find_all('a', href=True)
            if 'id' in anchor['href'] and anchor['href'] != fast7_href
        )

    movie_urls.append(box_office + fast7_href)
    return movie_urls

In [11]:
def get_movie_value(soup, field_name):
    """
    Returns the value shown next to a field label on a movie page.

    The label is the first text node matching the regular expression
    field_name; the value is the text of the element that follows it.
    None is returned when either of the two is missing.
    """
    matched_text = soup.find(text=re.compile(field_name))
    if matched_text:
        following = matched_text.findNextSibling()
        if following:
            return following.text
    return None

In [12]:
single_movies = build_single_movie_url(tot_index)

In [28]:
#gets rid of duplicate movies, keeping the first occurrence of each url
single_movies_copy = single_movies[:]

# Removing one entry per encounter while scanning the copy would delete a
# url that occurs k times all k times, losing the movie completely. Track
# what has been seen instead and keep the first occurrence of every url.
seen_urls = set()
unique_movies = []
for el in single_movies_copy:
    if el not in seen_urls:
        seen_urls.add(el)
        unique_movies.append(el)

# update the list in place so existing references to it see the result
single_movies[:] = unique_movies

In [16]:
# One row per scraped movie: [domestic gross, runtime, rating, release date]
test = []

for movie in single_movies[:1000]:
    try:
        soup = build_soup_page(movie)
    except Exception:
        # Best effort: skip a movie page that cannot be downloaded or parsed.
        # Catching Exception rather than using a bare except means Ctrl-C
        # can still stop this long-running loop.
        continue
    dtg = get_movie_value(soup, "Domestic Total")
    runtime = get_movie_value(soup, "Runtime")
    rating = get_movie_value(soup, "MPAA Rating")
    release_date = get_movie_value(soup, "Release Date")
    test.append([dtg, runtime, rating, release_date])

In [19]:
for el in test:
    print el

[u'$52,384', u'1 hrs. 18 min.', u'R', u'December 12, 2008']
[u'$15,919', u'1 hrs. 36 min.', u'Unrated', u'August 10, 2012']
[u'$32,391,374', u'1 hrs. 35 min.', u'PG-13', u'July 17, 2009']
[u'$230,600', u'1 hrs. 36 min.', u'R', u'October 23, 2009']
[u'$33,300,000', u'1 hrs. 59 min.', u'R', u'October 19, 1979']
[None, u'1 hrs. 27 min.', u'Not Yet Rated', u'June 7, 2013']
[None, u'N/A', u'Unrated', u'March 28, 2003']
[u'$53,895', u'1 hrs. 57 min.', u'Unrated', u'October 24, 2014']
[u'$74,865,517', u'2 hrs. 1 min.', u'R', u'October 5, 1979']
[None, u'1 hrs. 51 min.', u'Unrated', u'September 25, 2015']
[u'$83,291', u'1 hrs. 22 min.', u'R', u'December 1, 2006']
[u'$224,546', u'1 hrs. 29 min.', u'Unrated', u'April 27, 2007']
[None, u'1 hrs. 34 min.', u'R', u'April 4, 2014']
[u'$38,178,166', u'1 hrs. 37 min.', u'PG-13', u'March 31, 1999']
[u'$7,175,592', u'1 hrs. 43 min.', u'R', u'March 11, 1983']
[u'$203,373', u'1 hrs. 40 min.', u'PG-13', u'September 14, 2012']
[u'$94,784,201', u'1 hrs. 49 mi