In [1]:
import requests
import string
import time
from bs4 import BeautifulSoup
import re
import pprint 
pp = pprint.PrettyPrinter(indent=4)



def get_page(url):
    response = requests.get(url)
    page = response.text
    return page

'''
Parse film list
'''

def get_film_list(url):
    page = get_page(url)
    films_list = {}
    
    list_soup=BeautifulSoup(page,"html5lib")
    tables=list_soup.find_all("table")
    
    tables=list_soup.find_all("table")
    rows=[row for row in tables[2].find_all('tr')]
    rows.pop(0) #remove header
    for i,r in enumerate(rows):
        items = r.find_all('td')
        link=items[-1].find('a')['href']
        title= items[-1].find('a').text
        films_list[title] = "%s%s"%(base_url,link)
    return films_list

'''
Parse film details
'''
def get_film_details(url,title):
    page = get_page(url)
    film = {}
    film['title'] = title

    page_soup = BeautifulSoup(page,"html5lib")
    cast_table = get_table_values(page_soup,'The Players')
    sub_genre_table = get_table_values(page_soup,'Genres')
    
    tables = page_soup.find_all("table")
    parse_image(film,page_soup)
    parse_details(film,tables[5])
    parse_finances2(film,page_soup)
    if cast_table is not None:
        parse_cast(film,cast_table,page_soup)
    if sub_genre_table is not None:
        parse_sub_genre(film,sub_genre_table)
    return film
    
def clean_string(raw_string):
    printable = set(string.printable)
    clean=[s for s in raw_string if s in printable]
    return ''.join(clean)

'''Parse poster image'''
def parse_image(film,page_soup):
    poster_img = page_soup.find_all("img")[6]['src']
    film['image'] = poster_img
    
'''Parse details table'''
def parse_details(film,table):
    rows=[row for row in table.find_all('tr')]
    rows.pop(0) # remove header
    prop_map = {"Distributor":"distributor", "Release Date": "release_date","Genre":"genre","Runtime":"duration","MPAA Rating":"rating","Production Budget":"budget"}
    for row in rows:
        # each row will have a few cells: (td~ cell tag)
        items=row.find_all('td')
        [parse_details_row(item,film,prop_map) for item in items]

'''Parse dtails table rows'''
def parse_details_row(row,film,prop_map):
    data = row.text.split(':')
    text = data[1]
    prop = prop_map[data[0]]
    film[prop] = text
    
'''Parse finance table'''
def parse_finances(film,table):
    rows=[row for row in table.find_all('tr')]
    prop_map = {0:"gross_domestic",1:"gross_foreign"}
    for i,row in enumerate(rows):
    # prevent out of index error on irregular cell
        if i < len(rows)-2:
            items=row.find_all('td')
            film[prop_map[i]] = clean_string(items[1].text)
            
def parse_finances2(film,page_soup):
    film['gross_domestic'] = get_movie_value(page_soup,'Domestic')
    film['gross_foreign'] = get_movie_value(page_soup,'Foreign')
    film['gross_worldwide'] = get_movie_value(page_soup,'Worldwide')
    
    


'''Parse cast table'''
def parse_cast(film,table,page_soup):
    rows=[row for row in table.find_all('tr')]
    #print(rows)

    film['director'] = rows[0].text.split(':')[1]
    cast = [row.text for row in rows[1].find_all('a')]
    cast.pop(0)
    film['cast'] = ", ".join(cast) # comma seperated list

'''Parse genre table'''
def parse_sub_genre(film,table):
    rows=[row for row in table.find_all('tr')]
    rows.pop(0) # remove header
    sub_genres = []
    for row in rows:
        g = row.find('a').text
        sub_genres.append(g)
    sg = clean_string(', '.join(sub_genres)) # comma seperated list
    film['sub_genres'] = sg

    
base_url = 'http://www.boxofficemojo.com/'
url = 'http://www.boxofficemojo.com/yearly/'

def get_movie_value(soup, field_name):
    '''Grab a value from boxofficemojo HTML
    
    Takes a string attribute of a movie on the page and
    returns the string in the next sibling object
    (the value for that attribute)
    or None if nothing is found.
    '''
    obj = soup.find(text=re.compile(field_name))
    if not obj: 
        return None
    # this works for most of the values
    next_sibling = obj.findNextSibling()
    if next_sibling:
        return next_sibling.text 
    else:
        return None

def get_table_values(soup, field_name):
    '''Grab a value from boxofficemojo HTML
    
    Takes a string attribute of a movie on the page and
    returns the string in the next sibling object
    (the value for that attribute)
    or None if nothing is found.
    '''
    obj = soup.find('div',text=re.compile(field_name))
    if obj == None:
        return None
    table = obj.findNext('table')
    return table


'''Get film list by metric'''
film_list = get_film_list(url)

url = 'http://www.boxofficemojo.com//movies/?id=marvel2017b.htm'
films = []

for film,url in film_list.items():
    print(film)
    title = film
    film = get_film_details(url,title)
    films.append(film)
    #time.sleep(1)

pp.pprint(films)


    


[   {   'budget': ' N/A',
        'cast': "Chadwick Boseman, Lupita Nyong'o, Michael B. Jordan, Angela "
                'Bassett, Martin Freeman, Forest Whitaker, Andy Serkis',
        'director': 'Ryan Coogler',
        'distributor': ' Buena Vista',
        'duration': ' 2 hrs. 20 min.',
        'genre': ' Action / Adventure',
        'gross_domestic': None,
        'gross_foreign': None,
        'gross_worldwide': None,
        'image': 'https://ia.media-imdb.com/images/M/MV5BMTg1MTY2MjYzNV5BMl5BanBnXkFtZTgwMTc4NTMwNDI@._V1_UY222_CR0,0,150,222_AL.jpg',
        'rating': ' PG-13',
        'release_date': ' February 16, 2018',
        'sub_genres': '3D, Comic Book Adaptation, IMAX (Feature-length), '
                      'Superhero, Superhero - Origin',
        'title': 'Black Panther'},
    {   'budget': ' N/A',
        'cast': 'Rian Johnson',
        'director': 'Rian Johnson',
        'distributor': ' Buena Vista',
        'duration': ' 2 hrs. 31 min.',
        'genre': ' Sci-Fi 