### Get the HTML from a page, and convert to a BeautifulSoup object
#### We'll start by scraping some information from Box Office Mojo.

In [2]:
import urllib2
import re
from bs4 import BeautifulSoup
import string

In [12]:
# Fetch the Box Office Mojo page for a single movie and parse it.
url = "http://www.boxofficemojo.com/movies/?id=biglebowski.htm"
page = urllib2.urlopen(url)  # file-like HTTP response object
# NOTE(review): no parser is named here, so BeautifulSoup chooses one itself --
# confirm the parsed tree is the same on every machine this runs on.
soup = BeautifulSoup(page)

In [7]:
#print soup

### soup.find()
  Search for a type of tag by passing the tag name as a string (like 'body', 'div', 'a', 'p', etc) as an argument. Only the first match is returned.

In [13]:
# find() gives back a single tag: the first <a> on the page
soup.find("a")

<a href="/goto.php?a=5" target="4"><font face="Verdana" size="3"><b>'Furious 7' hits $800 million worldwide... &gt;</b></font><br/></a>

In [11]:
#print soup.a
# soup.a is attribute-style access to the first <a> tag; prettify() renders it
# as an indented string with one element per line
print soup.a.prettify()

<a href="/goto.php?a=5" target="4">
 <font face="Verdana" size="3">
  <b>
   'Furious 7' hits $800 million worldwide... &gt;
  </b>
 </font>
 <br/>
</a>



### soup.find_all()
  Returns a list of all matches.

In [17]:
#for link in soup.find_all('a'):
#    print link

# retrive url from an anchor tag
print soup.find('a')
print
print soup.find('a')['href']

<a href="/goto.php?a=5" target="4"><font face="Verdana" size="3"><b>'Furious 7' hits $800 million worldwide... &gt;</b></font><br/></a>

/goto.php?a=5


In [28]:
# you can match on an attribute like a class, or an id
print soup.find(class_="mp_box_content").find_all('td')

[<td width="40%"><b>Domestic:</b></td>, <td align="right" width="35%"> <b>$17,451,873</b></td>]


In [36]:
#soup.find(id="hp_footer").encode('utf-8')
#print soup.find(id="hp_footer")

## Items to scrape for each movie:

### movie title, total domestic gross, release date, runtime, rating

In [69]:
# Movie title
print soup.find('title')
title_string = soup.find('title').text
title_string =  title_string.split("(")[0].strip()

<title>The Big Lebowski (1998) - Box Office Mojo</title>


In [58]:
# Domestic Total Gross
print soup.find(text='Domestic Total Gross: ')

#regex
dtg_string = soup.find(text=re.compile("Domestic Total Gross:"))
print dtg_string.findNextSibling().text

Domestic Total Gross: 
$17,451,873


In [62]:
def get_movie_value(soup, field_name):
    """
    Look up the value displayed next to a labelled field on a movie page.

    field_name is compiled as a regular expression and searched for in the
    page's text; the text of the element following the matched label (the
    value for that attribute) is returned.

    Returns None when nothing matches field_name or when the matched label
    has no next sibling.
    """
    label = soup.find(text=re.compile(field_name))
    value_tag = label.findNextSibling() if label else None
    return value_tag.text if value_tag else None

In [66]:
dtg = get_movie_value(soup, "Domestic Total")
print dtg

#run time
runtime = get_movie_value(soup, "Runtime")
print runtime

#rating
rating = get_movie_value(soup, "MPAA Rating")
print rating

#release date
release_date = get_movie_value(soup, "Release Date")
print release_date

$17,451,873
1 hrs. 57 min.
R
March 6, 1998


In [77]:
# column names, listed in the same order as the values paired with them below
headers = ["movie title", "domestic total gross", "release date", "runtime", "rating"]
movie_values = [title_string, dtg, release_date, runtime, rating]
movie_dict = dict(zip(headers, movie_values))
# one dict per movie; only the single page scraped above so far
movie_data = [movie_dict]

In [78]:
# show the collected records: a list holding one dict per movie
print movie_data

[{'rating': u'R', 'runtime': u'1 hrs. 57 min.', 'movie title': u'The Big Lebowski', 'domestic total gross': u'$17,451,873', 'release date': u'March 6, 1998'}]


### My Own Testing

In [150]:
def build_soup_page(url):
    """
    Download a page and parse it into a BeautifulSoup object.

    url: address of the page to fetch.

    Returns the parsed BeautifulSoup document. Errors raised while opening
    the url (for example urllib2.URLError) propagate to the caller.
    """
    page = urllib2.urlopen(url)
    try:
        # NOTE(review): no parser is named, so BeautifulSoup chooses one
        # itself -- confirm before pinning a specific parser.
        return BeautifulSoup(page)
    finally:
        # Close the HTTP response explicitly instead of leaving the socket to
        # the garbage collector; the crawl loops call this once per page.
        page.close()

In [191]:
def build_top_index(soup, to_search, beg_url):
    """
    Build a list of absolute urls from the links on a page.

    soup: parsed page whose <a href=...> tags are scanned.
    to_search: prefix an href must start with to be kept.
    beg_url: string prepended to every kept href.

    Returns the first len(matches) // 2 matching urls, in page order.
    """
    index = [
        beg_url + a['href']
        for a in soup.find_all('a', href=True)
        if a['href'].startswith(to_search)
    ]
    # NOTE(review): presumably each matching link appears twice on the page
    # (e.g. a repeated navigation bar), hence the halving -- confirm.
    # `//` keeps the bound an integer under true division as well; a float
    # bound would make the slice raise TypeError.
    end_index = len(index) // 2

    return index[:end_index]

In [206]:
def build_sub_index(soup, search_one, search_two, beg_url):
    """
    Build a list of absolute urls from the links on a page, using two filters.

    soup: parsed page whose <a href=...> tags are scanned.
    search_one: prefix an href must start with to be kept.
    search_two: regular expression that must also match somewhere in the href.
    beg_url: string prepended to every kept href.

    Returns the first len(matches) // 2 matching urls, in page order.
    """
    index = []

    for a in soup.find_all('a', href=True):
        href = a['href']
        if href.startswith(search_one) and re.search(search_two, href) is not None:
            index.append(beg_url + href)
    # NOTE(review): presumably each matching link appears twice on the page,
    # hence the halving -- confirm.
    # `//` keeps the bound an integer under true division as well; a float
    # bound would make the slice raise TypeError.
    end_index = len(index) // 2

    return index[:end_index]

In [193]:
# creates a soup object containing the href for all pages A-Z & NUM
top_level = build_soup_page("http://www.boxofficemojo.com/movies")
# builds a list of each url ending for NUM and A-Z movie pages
top_level_urls = build_top_index(top_level, 'alphabetical', "http://www.boxofficemojo.com/movies/")

In [203]:
#for a in top_level.find_all('a', href=True):
#    if a['href'].find('alphabetical') != -1:
#        print a['href']
# start from an empty list of index-page urls; the following cell rebuilds it
# from top_level_urls
total_urls = []

In [204]:
# start from the top-level index urls, then add the 'page' sub-index urls
# found on each of them
total_urls = list(top_level_urls)

for top_level_url in top_level_urls:
    top_level_soup = build_soup_page(top_level_url)
    sub_level = build_sub_index(top_level_soup, '/movies/', 'page', "http://www.boxofficemojo.com")
    for sub_level_url in sub_level:
        # urls that contain 'id' are not kept as index pages
        if 'id' in sub_level_url:
            continue
        total_urls.append(sub_level_url)

In [207]:
#sorted(total_urls)

In [208]:
def build_single_movie_url_list(total_urls, box_url, pinned_href='/movies/?id=fast7.htm'):
    """
    Build a list of movie page urls by scanning a collection of index pages.

    total_urls: urls of the index pages to download and scan.
    box_url: site root prepended to every collected href.
    pinned_href: an href that is skipped while scanning and appended exactly
        once at the end of the result; pass None to disable this.
        NOTE(review): the default is presumably a link repeated on every
        index page -- confirm.

    Returns a list with one url per link whose href contains 'id'. The same
    url may appear more than once if several pages link to it.
    """
    single_movie_url_tags = []

    for full_site in total_urls:
        soup = build_soup_page(full_site)
        for a in soup.find_all('a', href=True):
            href = a['href']
            # plain substring test: any href containing 'id' is kept
            if 'id' in href and href != pinned_href:
                single_movie_url_tags.append(box_url + href)
    if pinned_href is not None:
        single_movie_url_tags.append(box_url + pinned_href)

    return single_movie_url_tags

In [210]:
#builds a list of every movies full url to scrape
single_movies = build_single_movie_url_list(total_urls, 'http://www.boxofficemojo.com')

In [212]:
# number of urls collected so far (sorting first is unnecessary for a count)
len(single_movies)

14966

In [165]:
# gets rid of duplicate movies, keeping the first occurrence of each url
# (removing one copy per occurrence, as before, deleted every copy of a
# duplicated url instead of leaving one behind)
single_movies_copy = single_movies[:]

seen_urls = set()
unique_movies = []
for el in single_movies_copy:
    if el not in seen_urls:
        seen_urls.add(el)
        unique_movies.append(el)

# slice assignment updates the existing list object in place
single_movies[:] = unique_movies

In [167]:
#sorted(single_movies)

### Start building data structure of features

In [11]:
def get_movie_value(soup, field_name):
    """
    Return the text shown next to a field label on a movie page.

    The page text is searched with field_name used as a regular expression,
    and the text of the matched label's next sibling element (the value for
    that attribute) is returned. The result is None if the label is not found
    or has no next sibling.
    """
    pattern = re.compile(field_name)
    match = soup.find(text=pattern)
    if match:
        sibling = match.findNextSibling()
        if sibling:
            return sibling.text
    return None

In [16]:
# rows of [domestic gross, runtime, rating, release date], one per scraped movie
test = []

# only the first 1000 movie urls are scraped here
for movie in single_movies[:1000]:
    try:
        soup = build_soup_page(movie)
    except Exception:
        # Best effort: skip any page that fails to download or parse.
        # Catching Exception instead of using a bare except lets
        # KeyboardInterrupt through, so a long scrape can still be stopped.
        continue
    dtg = get_movie_value(soup, "Domestic Total")
    runtime = get_movie_value(soup, "Runtime")
    rating = get_movie_value(soup, "MPAA Rating")
    release_date = get_movie_value(soup, "Release Date")
    test.append([dtg, runtime, rating, release_date])

In [106]:
#for el in test:
#    print el