# Scrape movie data from BoxOfficeMojo.com

### Collect URLs for top grossing movies categorized by MPAA rating

In [None]:
from bs4 import BeautifulSoup
import requests
import pickle

In [None]:
# Chart pages that list the top lifetime-grossing movies, grouped by MPAA
# rating.  Every page shares the same chart path; only the query string
# (rating filter and paging offset, which advances in steps of 200) varies.
Top_URL_List = [
    'https://www.boxofficemojo.com/chart/mpaa_title_lifetime_gross/' + query
    for query in (
        # G
        '?by_mpaa=G&ref_=bo_cso_ac',
        '?by_mpaa=G&offset=200',
        # PG
        '?by_mpaa=PG&ref_=bo_cso_ac',
        '?by_mpaa=PG&offset=200',
        '?by_mpaa=PG&offset=400',
        '?by_mpaa=PG&offset=600',
        '?offset=800&by_mpaa=PG',
        # PG-13
        '?by_mpaa=PG-13&ref_=bo_cso_ac',
        '?by_mpaa=PG-13&offset=200',
        '?by_mpaa=PG-13&offset=400',
        '?offset=600&by_mpaa=PG-13',
        '?offset=800&by_mpaa=PG-13',
        # R
        '?by_mpaa=R&ref_=bo_cso_ac',
        '?by_mpaa=R&offset=200',
        '?offset=400&by_mpaa=R',
        '?by_mpaa=R&offset=600',
        '?by_mpaa=R&offset=800',
    )
]

In [None]:
# Collect the link to every movie page found on the chart pages.
# Each entry appended to movie_pages is the raw href of a title link.
movie_pages = []
for URL in Top_URL_List:
    try:
        # Without a timeout a single stalled connection would hang the
        # whole crawl indefinitely.
        response = requests.get(URL, timeout=30)
    except requests.exceptions.RequestException as err:
        # Best effort: report the failed chart page and keep going so the
        # links gathered from the other pages are not lost.
        print('Request failed for', URL, '-', err)
        continue
    # Printed so a non-200 response is visible while the cell runs.
    print(response.status_code)
    page = response.text
    soup = BeautifulSoup(page,'lxml')
    # Title cells of the chart table are located by their full class string.
    for div in soup.find_all(class_="a-text-left mojo-header-column mojo-truncate mojo-field-type-title"):
        # href=True skips anchors with no href, which would otherwise put
        # None into the list and break URL concatenation downstream.
        for link in div.find_all('a', href=True):
            movie_pages.append(link.get('href'))

In [None]:
# Save the collected movie-page links to disk so the scraping step can be
# run later without repeating the chart crawl.
with open('movie_pages.pickle', 'wb') as url_file:
    pickle.dump(movie_pages, url_file)

### Scrape features for each movie with BeautifulSoup

In [None]:
# Reload the list of movie-page links from the pickle file on disk.
# NOTE: unpickling executes arbitrary code, so only load a file that this
# notebook produced itself.
with open('movie_pages.pickle','rb') as url_file:
    movie_pages = pickle.load(url_file)

In [None]:
# Store movie features in a list of dictionaries; one dictionary per movie.
# Any feature that cannot be found on a page is recorded as the string 'na'.
main_page = 'https://www.boxofficemojo.com'


def _labelled_text(soup, label):
    """Return the text of the element that follows the string *label*.

    Returns None when *label* does not occur on the page or when no
    element follows it.
    """
    marker = soup.find(text=label)
    if marker is None:
        return None
    value = marker.find_next()
    return None if value is None else value.text


def _summary_amount(soup, position):
    """Return the text of the *position*-th "money" element in the
    performance summary table, or 'na' when it is not present.
    """
    summary = soup.find(class_="a-section a-spacing-none mojo-performance-summary-table")
    if summary is None:
        return 'na'
    amounts = summary.find_all(class_="money")
    # Indexing past the end used to raise an IndexError that the
    # AttributeError handlers did not catch, aborting the whole scrape.
    if position >= len(amounts):
        return 'na'
    return amounts[position].text


def _extract_movie(soup):
    """Build the feature dictionary for one parsed movie page."""
    features = {}

    title_tag = soup.find(name="title")
    if title_tag is None:
        features['Title'] = 'na'
    else:
        features['Title'] = title_tag.text.replace(" - Box Office Mojo","")

    studio = _labelled_text(soup, 'Domestic Distributor')
    if studio is None:
        features['Studio'] = 'na'
    else:
        # Drop the link caption that trails the distributor name.
        features['Studio'] = studio.replace("See full company information\n\n","")

    # Fields whose value is stored exactly as it appears on the page.
    for key, label in (('Opening', 'Domestic Opening'), ('Budget', 'Budget')):
        value = _labelled_text(soup, label)
        features[key] = 'na' if value is None else value

    release = _labelled_text(soup, 'Earliest Release Date')
    # Only the first line of the release cell is kept.
    features['Release'] = 'na' if release is None else release.split('\n')[0]

    for key, label in (('Rating', 'MPAA'), ('Runtime', 'Running Time')):
        value = _labelled_text(soup, label)
        features[key] = 'na' if value is None else value

    genres = _labelled_text(soup, 'Genres')
    # Genres become a list of whitespace-separated tokens; the missing
    # marker stays the plain string 'na'.
    features['Genre'] = 'na' if genres is None else genres.replace("\n","").split()

    features['Domestic'] = _summary_amount(soup, 0)
    features['International'] = _summary_amount(soup, 1)
    return features


movies_data = []
for page in movie_pages:
    current_page = main_page + page
    try:
        # Without a timeout a single stalled connection would hang the run.
        response = requests.get(current_page, timeout=30)
    except requests.exceptions.RequestException as err:
        # Best effort: report the failed page and keep the rows already
        # collected instead of losing the whole scrape.
        print('Request failed for', current_page, '-', err)
        continue
    soup = BeautifulSoup(response.text,'lxml')
    current_dict = _extract_movie(soup)
    movies_data.append(current_dict)

In [None]:
# Save the raw, uncleaned per-movie feature dictionaries to disk.
with open('movies_data_raw.pickle', 'wb') as data_file:
    pickle.dump(movies_data, data_file)