In [1]:
# Python 2 & 3 Compatibility
from __future__ import print_function, division

# Chapter 3 - Scraping all movies - loop magic

## TODO:
Summary: In this notebook, we will assemble the pieces from the previous chapters and put all the scraping code into a loop that goes through all 200 movies automatically.

We'll also save our scraped information in a data frame and answer some questions.


In [32]:
import requests
from bs4 import BeautifulSoup

import re
import time
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

ModuleNotFoundError: No module named 'matplotlib'

In [2]:
# Let's see what our top 5 movies are
!head -n5 ../data/movie_urls.txt

/movies/?id=gonewiththewind.htm
/movies/?id=starwars4.htm
/movies/?id=soundofmusic.htm
/movies/?id=et.htm
/movies/?id=titanic.htm


In [19]:
base_url = 'http://www.boxofficemojo.com'

In [16]:
# The URL file has no header row (see the `head` output above), so read it
# with header=None and name the single column 'url' ourselves.
movie_urls = pd.read_csv('../data/movie_urls.txt', names = ['url'], header=None)

In [17]:
movie_urls.head()

Unnamed: 0,url
0,/movies/?id=gonewiththewind.htm
1,/movies/?id=starwars4.htm
2,/movies/?id=soundofmusic.htm
3,/movies/?id=et.htm
4,/movies/?id=titanic.htm


In [25]:
def get_movie_value(soup, field_name):
    '''Look up the value displayed next to a labelled field.

    Searches `soup` for the first text node matching `field_name`
    (the name is compiled as a regular expression) and returns the
    `.text` of the sibling element that follows that node.

    Returns None when no matching text is found, or when the
    matching text has no following sibling.
    '''
    label = soup.find(text=re.compile(field_name))
    if label:
        # the value normally sits in the element right after the label text
        value_node = label.findNextSibling()
        if value_node:
            return value_node.text
    return None

In [33]:
# If you want to scrape all movies, uncomment this section.
# Note: `movie_data`, which the cells below use, is only defined by this loop.
##

# headers = ['movie title', 'domestic total gross','genre',
#            'release date', 'runtime (mins)', 'rating']

# movie_data = []


# for movie_url in movie_urls['url']:
#     response = requests.get(base_url + movie_url + '&adjust_yr=2017')
#     if response.ok:
#         soup = BeautifulSoup(response.text,'html.parser')
                
#         # Scrape all fields for each movie
#         #######################
#         # movie title
#         title = soup.find('title').string.split('(')[0].strip()        
#         # domestic total gross
#         dtg = get_movie_value(soup,'Domestic Total')        
#         # genre
#         genre = get_movie_value(soup,'Genre:')   
#         # release date
#         release_date = get_movie_value(soup,'Release Date')
#         # runtime
#         runtime = get_movie_value(soup,'Runtime')
#         # rating
#         rating = get_movie_value(soup,'MPAA Rating')
        
        
#         # store it in a movie object
#         movie_dict = dict(zip(headers, [title,
#                                 dtg,
#                                 genre,
#                                 release_date,
#                                 runtime,
#                                 rating]))
#         movie_data.append(movie_dict)
        
#         time.sleep(0.5)
#     else:
#         print("Request errored out with code {}".format(response.status_code))


In [None]:
len(movie_data)

In [36]:
# Save our precious data into a file
pd.DataFrame(movie_data).to_csv('../data/alltime_movies200.csv', index=False)

In [37]:
# Let's use our old trick to peek at the output data
!head -n5 ../data/alltime_movies200.csv

domestic total gross,genre,movie title,rating,release date,runtime (mins)
"$1,768,000,000",Historical Epic,Gone with the Wind,G,"December 15, 1939",3 hrs. 58 min.
"$1,261,768,700",Sci-Fi Fantasy,Star Wars,PG,"May 25, 1977",2 hrs. 1 min.
"$1,258,951,900",Musical,The Sound of Music,G,"March 2, 1965",2 hrs. 54 min.
"$1,099,155,300",Family Adventure,E.T.: The Extra-Terrestrial,PG,"June 11, 1982",1 hrs. 57 min.


## We are almost done. But remember how we re-formatted the data in the last section? Let's do that to the dataframe too. We'll use a series of `.map()` calls to operate directly on the columns.

In [38]:
df = pd.read_csv('../data/alltime_movies200.csv')

In [39]:
df.head()

Unnamed: 0,domestic total gross,genre,movie title,rating,release date,runtime (mins)
0,"$1,768,000,000",Historical Epic,Gone with the Wind,G,"December 15, 1939",3 hrs. 58 min.
1,"$1,261,768,700",Sci-Fi Fantasy,Star Wars,PG,"May 25, 1977",2 hrs. 1 min.
2,"$1,258,951,900",Musical,The Sound of Music,G,"March 2, 1965",2 hrs. 54 min.
3,"$1,099,155,300",Family Adventure,E.T.: The Extra-Terrestrial,PG,"June 11, 1982",1 hrs. 57 min.
4,"$1,134,577,600",Romance,Titanic,PG-13,"December 19, 1997",3 hrs. 14 min.


In [46]:
# convert domestic total gross into numbers
# Going through str() first makes this cell safe to re-run: once the column
# already holds ints, calling .replace() on them would raise AttributeError.
df['domestic total gross'] = df['domestic total gross'].map(
    lambda gross: int(str(gross).replace('$', '').replace(',', ''))
)

In [50]:
# convert release date into datetime
df['release date'] = pd.to_datetime(df['release date'])

In [51]:
def runtime_to_minutes(runtimestring):
    '''Convert a runtime like "3 hrs. 58 min." into total minutes.

    The string is split on whitespace; the first token is read as the
    hours and the third token as the minutes.

    Returns the runtime as an int number of minutes, or None when the
    value is missing (None or NaN) or the string does not have the
    expected "<hours> <unit> <minutes> ..." layout.
    '''
    # A missing value is None or float NaN (NaN is the only value that is
    # not equal to itself); neither has .split(), so handle them up front.
    if runtimestring is None or runtimestring != runtimestring:
        return None
    tokens = runtimestring.split()
    try:
        return int(tokens[0]) * 60 + int(tokens[2])
    except (IndexError, ValueError):
        # too few tokens, or the hour/minute tokens are not integers
        return None

In [54]:
# turn each runtime string into a whole number of minutes
df['runtime (mins)'] = df['runtime (mins)'].map(runtime_to_minutes)

In [55]:
df.head()

Unnamed: 0,domestic total gross,genre,movie title,rating,release date,runtime (mins)
0,1768000000,Historical Epic,Gone with the Wind,G,1939-12-15,238
1,1261768700,Sci-Fi Fantasy,Star Wars,PG,1977-05-25,121
2,1258951900,Musical,The Sound of Music,G,1965-03-02,174
3,1099155300,Family Adventure,E.T.: The Extra-Terrestrial,PG,1982-06-11,117
4,1134577600,Romance,Titanic,PG-13,1997-12-19,194


In [None]:
# Awesome! Now we have all our data in one place, in the desired format. 
# Now we get to actually have fun with our data! 
# There are many questions I want to ask about the highest-grossing movies.

In [64]:
# With the inflation adjustment in mind, which decade has produced the most high-grossing movies?
# Each release year is floored to its decade (e.g. 1977 -> 1970) and the rows are counted per decade.
df.sort_values(by='release date').groupby(df['release date'].map(lambda x: int(x.year/10)*10)).count()

Unnamed: 0_level_0,domestic total gross,genre,movie title,rating,release date,runtime (mins)
release date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1920,1,1,1,1,1,1
1930,2,2,2,2,2,2
1940,7,7,7,7,7,7
1950,13,13,13,13,13,13
1960,19,19,19,19,19,19
1970,26,26,26,26,26,26
1980,25,25,25,25,25,25
1990,32,32,32,32,32,32
2000,42,42,42,42,42,42
2010,33,33,33,33,33,33


In [70]:
# What are the median gross and median runtime for each rating?
# Rows are sorted by median domestic total gross, lowest first.
df.groupby('rating').median().sort_values('domestic total gross')

Unnamed: 0_level_0,domestic total gross,runtime (mins)
rating,Unnamed: 1_level_1,Unnamed: 2_level_1
R,408598050.0,122.0
PG-13,424230800.0,133.0
PG,425264300.0,115.0
G,476258700.0,94.0
Unrated,486200000.0,135.0
M,500452400.0,109.5
GP,602437800.0,104.0


In [84]:
# Which animated movies grossed the most?
# na=False treats a missing genre as "no match"; without it the mask would
# contain NaN and could not be used for boolean indexing.
df[df.genre.str.contains('Animation', na=False)].sort_values('domestic total gross', ascending=False)

Unnamed: 0,domestic total gross,genre,movie title,rating,release date,runtime (mins)
9,684844800,Animation,Snow White and the Seven Dwarfs,G,1937-12-21,83
22,660693100,Animation,Fantasia,G,1941-01-29,120
18,659679500,Animation,The Lion King,G,1994-06-15,89
34,628090200,Animation,Shrek 2,PG,2004-05-19,93
32,606428700,Animation,Sleeping Beauty,G,1959-01-29,75
54,498023300,Animation,Finding Nemo,G,2003-05-30,104
11,497243800,Animation,101 Dalmatians,G,1961-01-25,79
72,496437500,Animation,Finding Dory,PG,2016-06-17,103
31,496314400,Animation,The Jungle Book,G,1967-10-18,78
40,488985200,Animation,Pinocchio,G,1940-02-09,88


### TODO 

Some closing remarks!!

In [44]:
df.rating.value_counts()

PG-13      67
PG         63
G          31
R          22
Unrated    11
GP          4
M           2
Name: rating, dtype: int64