In [1]:
# Python 2 & 3 Compatibility
from __future__ import print_function, division

# Chapter 2 - More BeautifulSoup - Getting all the information from one movie

## Summmary of this notebook:
TODO:
We'll pick one movie from our list, the harry potter movie, and see how we can extract these following information:
* movie title
* domestic total gross -- adjusted
* movie production budget
* movie genre
* movie MPAA ratings
* movie runtime
* movie release date

### Consistency Web scraping is made simple by the consistent format of information among like pages of a website.



In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
# Let's see what our top 5 movies are
!head -n5 ../data/movie_urls.txt

/movies/?id=gonewiththewind.htm
/movies/?id=starwars4.htm
/movies/?id=soundofmusic.htm
/movies/?id=et.htm
/movies/?id=titanic.htm


In [4]:
base_url = 'http://www.boxofficemojo.com'
movie_url = '/movies/?id=soundofmusic.htm'

In [5]:
response = requests.get(base_url + movie_url)

In [6]:
# this is another way to check if response status_code is 200
if response.ok:
    soup = BeautifulSoup(response.text,'html.parser')

## Let's go to the url and take a look at the page. 
## Something looks a little weird here: the movie only made $ 158 million.        

This is NOT the adjusted gross. After we go to the individual page, the link took us to the default "actual gross" values. How do we adjust to year 2017? What do you discover in the url?

In [7]:
# Let's try again to get the correct adjusted values:
response = requests.get(base_url + movie_url + '&adjust_yr=2017')
if response.ok:
    soup = BeautifulSoup(response.text,'html.parser')

# Soup methods table

## `Soup.find()` refresher


In [24]:
# Movie Title
soup.find('title')

<title>The Sound of Music (1965) - Box Office Mojo</title>

In [29]:
(
    soup.find('title')
    .string 
    .split('(')[0]  # take the substring before the left paran
    .strip()        # strip off the trailing spaces
)


'The Sound of Music'

In [30]:
movie_title = _

In [35]:
# Adjusted Domestic Total Gross
## text does an exact match search!
print(soup.find(string="Domestic Total"))

None


In [37]:
# You have to find a perfect match in order for it to work:
print(soup.find(text="Domestic Total Adj. Gross: "))

Domestic Total Adj. Gross: 


## You could also use regular expressions
![regular expressions](http://imgs.xkcd.com/comics/regular_expressions.png)

[Handy Tool for making RegEx](http://pythex.org/)

In [8]:
import re
soup.find(text=re.compile('Domestic Total'))

'Domestic Total Adj. Gross: '

In [41]:
%%html
<td align="center" colspan="2">
    <font size="4">
        Domestic Total Adj. Gross: 
        <b>$580,914,300</b>
    </font>
</td>

In [69]:
soup.find(text=re.compile('Domestic Total')).findParent()

<font size="4">Domestic Total Adj. Gross: <b>$1,258,951,900</b></font>

In [64]:
(
soup.find(text=re.compile('Domestic Total'))
.findNextSibling()
)

<b>$1,258,951,900</b>

In [None]:
# you could also use 
# soup.find(text=re.compile('Domestic Total')).parent
# and
# soup.find(text=re.compile('Domestic Total')).next_sibling
# to achieve the same effect

In [None]:
# Ok, now we have our domestic total gross adjusted. But it's a string, lets re-format it and convert it to a number

In [66]:
dtg  = (
soup.find(text=re.compile('Domestic Total'))
.findNextSibling()
.text
)

In [67]:
dtg = dtg.replace('$','').replace(',','')
domestic_total_gross = int(dtg)
print(domestic_total_gross)

1258951900


# Exercise: 

In [None]:
# Can you also find the unadjusted actual domestic total on this page?

In [42]:
actual_dtg = (
soup.find(text=re.compile('Domestic:'))
.findParent()
.findParent()
.findNextSibling()
.find('b').text
)

In [48]:
actual_dtg = int(actual_dtg.replace('$','').replace(',',''))
print(actual_dtg)

158671368


### Since we are going to re-use some of the soup.find code, let's wrap them in functions

In [9]:
### We can actually do several of these using the text matching method, so let's make a function for that

def get_movie_value(soup, field_name):
    '''Grab a value from boxofficemojo HTML
    
    Takes a string attribute of a movie on the page and
    returns the string in the next sibling object
    (the value for that attribute)
    or None if nothing is found.
    '''
    obj = soup.find(text=re.compile(field_name))
    if not obj: 
        return None
    # this works for most of the values
    next_sibling = obj.findNextSibling()
    if next_sibling:
        return next_sibling.text 
    else:
        return None

In [12]:
# domestic total gross
dtg = get_movie_value(soup,'Domestic Total')
print(dtg)

# production budget
budget = get_movie_value(soup, 'Production Budget')
print(budget)

# runtime
runtime = get_movie_value(soup,'Runtime')
print(runtime)

# rating
rating = get_movie_value(soup,'MPAA Rating')
print(rating)

release_date = get_movie_value(soup,'Release Date')
print(release_date)


$1,258,951,900
$8.2 million
2 hrs. 54 min.
G
March 2, 1965


In [26]:
### We need a few helper methods to parse the strings we've gotten

import dateutil.parser

def to_date(datestring):
    date = dateutil.parser.parse(datestring)
    return date

def money_to_int(moneystring):
    moneystring = moneystring.replace('$', '').replace(',', '')
    return int(moneystring)

def budget_to_num(budgetstring):
    budgetstring = budgetstring.replace('$', '').split()[0]
    return float(budgetstring) * 1.0e6

def runtime_to_minutes(runtimestring):
    runtime = runtimestring.split()
    try:
        minutes = int(runtime[0])*60 + int(runtime[2])
        return minutes
    except:
        return None

In [27]:
# Let's get these again and format them all in one swoop

from pprint import pprint

raw_release_date = get_movie_value(soup,'Release Date')
release_date = to_date(raw_release_date)

raw_domestic_total_gross = get_movie_value(soup,'Domestic Total')
domestic_total_gross = money_to_int(raw_domestic_total_gross)

raw_production_budget = get_movie_value(soup,'Production Budget')
budget = budget_to_num(raw_production_budget)

raw_runtime = get_movie_value(soup,'Runtime')
runtime = runtime_to_minutes(raw_runtime)

In [49]:
headers = ['movie title', 'adjusted domestic total gross', 'actual dtg',
           'release date', 'runtime (mins)', 'rating', 'budget']

movie_data = []
movie_dict = dict(zip(headers, [movie_title,
                                domestic_total_gross,
                                actual_dtg,
                                release_date,
                                runtime,
                                rating,
                                budget]))
movie_data.append(movie_dict)

pprint(movie_data)


[{'actual dtg': 158671368,
  'adjusted domestic total gross': 1258951900,
  'budget': 8199999.999999999,
  'movie title': 'The Sound of Music',
  'rating': 'G',
  'release date': datetime.datetime(1965, 3, 2, 0, 0),
  'runtime (mins)': 174}]


# Great! Now we scraped all the info for this movie. We can move on scrape all the movies!