## List of Imports 

In [90]:
from __future__ import print_function, division

import requests
import pandas as pd
import numpy as np 
from bs4 import BeautifulSoup
import re 
from dateutil.parser import parse
from pprint import pprint
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

%matplotlib inline

## Data Scraping 

In [91]:
def create_soups(url):
    """
    Download a page and parse it into a BeautifulSoup tree.

    url: address of the page, fetched with ``requests.get``.

    Returns the BeautifulSoup object built from the response text with
    the "html5lib" parser.
    """
    html = requests.get(url).text
    return BeautifulSoup(html, "html5lib")

In [92]:
# Genre-chart listing pages on boxofficemojo.com that the cells below scrape.
# url_p1..url_p3 all use id=3d.htm; p2 and p3 ask for pagenum=2 and pagenum=3
# with sort=gross, order=DESC.
url_p1 = 'http://www.boxofficemojo.com/genres/chart/?id=3d.htm'
url_p2 = 'http://www.boxofficemojo.com/genres/chart/?view=main&sort=gross&order=DESC&pagenum=2&id=3d.htm'
url_p3 = 'http://www.boxofficemojo.com/genres/chart/?view=main&sort=gross&order=DESC&pagenum=3&id=3d.htm'
# url_p4..url_p9: one chart per genre id, each requested with sort=date, order=DESC.
url_p4 = 'http://www.boxofficemojo.com/genres/chart/?id=creaturefeature.htm&sort=date&order=DESC&p=.htm'
url_p5 = 'http://www.boxofficemojo.com/genres/chart/?id=environment.htm&sort=date&order=DESC&p=.htm'
url_p6 = 'http://www.boxofficemojo.com/genres/chart/?id=romanticdrama.htm&sort=date&order=DESC&p=.htm'
url_p7 = 'http://www.boxofficemojo.com/genres/chart/?id=disaster.htm&sort=date&order=DESC&p=.htm'
url_p8 = 'http://www.boxofficemojo.com/genres/chart/?id=actionheroine.htm&sort=date&order=DESC&p=.htm'
url_p9 = 'http://www.boxofficemojo.com/genres/chart/?id=liveactionfantasy.htm&sort=date&order=DESC&p=.htm'

In [93]:
# Download and parse every chart page once, in page order.
(soup_p1, soup_p2, soup_p3,
 soup_p4, soup_p5, soup_p6,
 soup_p7, soup_p8, soup_p9) = map(create_soups, (
    url_p1, url_p2, url_p3,
    url_p4, url_p5, url_p6,
    url_p7, url_p8, url_p9,
))

In [94]:
def links_to_movies(soup, n=3, m=0, l=300):
    """
    Returns the list of movie links that data will be scraped from.
    -----
    soup:
    BeautifulSoup object of a chart page;
    n=3:
    Stride applied to the selected links. The chart rows are expected to
    carry three links per movie, so a stride of 3 keeps only the first
    (the link to the movie's main page);
    m=0:
    Index of the first collected link to consider;
    l=300:
    Index one past the last collected link to consider.
    """
    row_colors = ['#ffffff', '#f4f4ff', '#ffff99']
    site_root = 'http://www.boxofficemojo.com/'
    # Every anchor inside a table row whose bgcolor is one of row_colors,
    # in document order, turned into an absolute URL.
    collected = [
        site_root + anchor.attrs['href']
        for row in soup.findAll('tr', {'bgcolor': row_colors})
        for anchor in row.findAll('a')
    ]
    window = collected[m:l]
    return window[::n]

In [95]:
# Each (start, stop) pair is a window into the links collected from one chart
# page, passed to links_to_movies as m and l. Several windows on the same page
# are concatenated in the order listed.
links_p1 = links_to_movies(soup_p1)
links_p2 = links_to_movies(soup_p2)
links_p3 = links_to_movies(soup_p3, l=150)
links_p4 = [movie_url
            for start, stop in [(0, 21), (36, 75)]
            for movie_url in links_to_movies(soup_p4, m=start, l=stop)]
links_p5 = links_to_movies(soup_p5, m=21, l=66)
links_p6 = [movie_url
            for start, stop in [(0, 21), (24, 81), (84, 204)]
            for movie_url in links_to_movies(soup_p6, m=start, l=stop)]
links_p7 = links_to_movies(soup_p7, l=75)
links_p8 = [movie_url
            for start, stop in [(0, 27), (36, 81)]
            for movie_url in links_to_movies(soup_p8, m=start, l=stop)]
links_p9 = [movie_url
            for start, stop in [(0, 21), (24, 75), (78, 90), (96, 108), (135, 213)]
            for movie_url in links_to_movies(soup_p9, m=start, l=stop)]

In [96]:
def get_movie_value(soup, field_name):
    '''Look up a labelled value in boxofficemojo HTML.

    Finds the first text node matching ``field_name`` (used as a regular
    expression) and returns the text of the element that follows it as
    its next sibling.

    Returns None when the label is not found or has no next sibling.
    '''
    label = soup.find(text=re.compile(field_name))
    if not label:
        return None
    value_tag = label.findNextSibling()
    return value_tag.text if value_tag else None

def find_box1_info(soup):
    """
    Return the first 25 table rows of the page's second "mp_box_content" box.

    soup: parsed movie page.

    Returns a list of at most 25 ``tr`` elements taken from the div at
    index 1 among the ``div.mp_box_content`` elements, or None when the page
    has fewer than two such divs.
    """
    # Query the page once; the result doubles as the existence check.
    boxes = soup.findAll('div', {'class': 'mp_box_content'})
    if len(boxes) < 2:
        # A lone box used to raise IndexError; report "not found" instead.
        return None
    return list(boxes[1].find_all('tr'))[:25]
    
def find_box2_info(soup):
    """
    Return every table row of the page's third "mp_box_content" box.

    soup: parsed movie page.

    Returns a list of the ``tr`` elements found in the div at index 2 among
    the ``div.mp_box_content`` elements, or None when the page has fewer
    than three such divs.
    """
    # Query the page once; the result doubles as the existence check.
    boxes = soup.findAll('div', {'class': 'mp_box_content'})
    if len(boxes) < 3:
        # One or two boxes used to raise IndexError; report "not found" instead.
        return None
    return list(boxes[2].find_all('tr'))

In [97]:
# Raw per-movie fields scraped from the movie pages listed in links_p1.
# Every list gains one entry per link, in link order.
title_p1 = []
raw_domestic_total_gross_p1 = []
raw_release_date_p1 = []
raw_runtime_p1 = []
rating_p1 = []
raw_pro_budget_p1 = []
find_box1_p1 = []  # rows returned by find_box1_info
find_box2_p1 = []  # rows returned by find_box2_info
raw_opening_gross_p1 = []

for link in links_p1:
    source_code1 = requests.get(link)
    soup1 = BeautifulSoup(source_code1.text, 'html5lib')

    # Query box 1 once and reuse the rows for the opening gross, rather than
    # calling find_box1_info a second time on the same page.
    box1_rows = find_box1_info(soup1)
    find_box1_p1.append(box1_rows)
    find_box2_p1.append(find_box2_info(soup1))

    # Second <td> of the first box-1 row, kept as the raw opening gross.
    raw_opening_wknd_gross = box1_rows[0].find_all('td')[1].text
    raw_opening_gross_p1.append(raw_opening_wknd_gross)

    # Title is the part of the page <title> before the first '('.
    title_string = soup1.find('title').text
    title_p1.append(title_string.split('(')[0].strip())

    raw_dtg = get_movie_value(soup1, 'Domestic Total')
    raw_domestic_total_gross_p1.append(raw_dtg)

    raw_releasedate = get_movie_value(soup1, 'Release Date')
    raw_release_date_p1.append(raw_releasedate)

    raw_run_time = get_movie_value(soup1, 'Runtime')
    raw_runtime_p1.append(raw_run_time)

    rating_string = get_movie_value(soup1, 'MPAA Rating')
    rating_p1.append(rating_string)

    pro_budget = get_movie_value(soup1, 'Production Budget')
    raw_pro_budget_p1.append(pro_budget)

In [98]:
%%capture
# Director names for page 1. The text of the first box-2 row is taken as
# '<label>:<names>'; the label is dropped by splitting on ':'.
raw_directors_p1=[]
for box in find_box2_p1:
    raw_directors_p1.append(box[0].text)
# Hand-entered replacements, applied BEFORE the split so that they go through
# the same '<label>:<names>' handling as the scraped rows.
raw_directors_p1[75] = 'Directors:Phil LordChristopher Miller'
raw_directors_p1[84] = 'Directors:Carlos SaldanhaMike Thurmeier'
raw_directors_p1[90] = 'Director:Brian Fee'
# Keep the text between the first and second ':' of each entry.
directors_p1 = [str(raw.split(':')[1]) for raw in raw_directors_p1]
# Hand-entered replacements applied AFTER the split: entries with two
# directors, rewritten with a space between the names.
directors_p1[6] = 'Angus MacLane Andrew Stanton'
directors_p1[11] = 'Anthony Russo Joe Russo'
directors_p1[12] = 'Chris Buck Jennifer Lee'
directors_p1[16] = 'Pierre Coffin Chris Renaud'
directors_p1[20] = 'Byron Howard Rich Moore'
directors_p1[28] = 'Pete Docter Bob Peterson'
directors_p1[35] = 'Anthony Russo Joe Russo'
directors_p1[39] = 'Pierre Coffin Chris Renaud'
directors_p1[40] = 'Ron Clements John Musker'
directors_p1[45] = 'Mark Andrews Brenda Chapman'
directors_p1[53] = 'Dean DeBlois Chris Sanders'
directors_p1[60] = 'Nathan Greno Byron Howard'
directors_p1[62] = 'Rob Letterman Conrad Vernon'
directors_p1[64] = 'John Lasseter Brad Lewis'
directors_p1[67] = 'Kirk De Micco Chris Sanders'
directors_p1[75] = 'Phil Lord Christopher Miller'
directors_p1[77] = 'Joachim Ronning Espen Sandberg'
directors_p1[83] = 'Mike Mitchell Paul Tibbitt'
directors_p1[84] = 'Carlos Saldanha Mike Thurmeier'
directors_p1[97] = 'Alessandro Carloni Jennifer Yuh Nelson'
directors_p1

In [99]:
# Pick the box-2 row used as the actors row from the number of rows in the
# box: index 2 when there are more than three rows, index 1 for two or three
# rows, index 0 otherwise.
raw_actors_p1 = []
for box in find_box2_p1:
    raw_actors_p1.append(
        box[2 if len(box) > 3 else 1 if len(box) >= 2 else 0].text
    )
# Drop the '<label>:' prefix, keeping the text between the first two ':'.
raw_actors_p1 = [str(entry.split(':')[1]) for entry in raw_actors_p1]
# Hand-entered replacement for entry 92.
raw_actors_p1[92] = 'Will Ferrell (Voice)Brad Pitt (Voice)Tina Fey (Voice)Jonah Hill (Voice)David Cross (Voice)Justin Theroux (Voice)Ben Stiller (Voice)Tom McGrath* (Voice)J.K. Simmons (Voice)'

In [100]:
%%capture
# Collapse voice casts to the single marker "Animation": an entry is replaced
# when it contains more '(Voice)' tags than one third of its
# whitespace-separated word count.
for i, raw in enumerate(raw_actors_p1):
    if raw.count('(Voice)') > len(raw.split())/3:
        raw_actors_p1[i] = "Animation"
# Hand-entered cast strings for specific positions, applied after the
# "Animation" pass so they are not affected by it. Names are run together
# without a separator, in the same shape as the other entries.
raw_actors_p1[3] = 'Robert Downey, Jr.Chris HemsworthChris EvansJeremy RennerMark RuffaloScarlett JohanssonClark GreggSamuel L. JacksonCobie SmuldersTom HiddlestonStellan SkarsgardGwyneth Paltrow'
raw_actors_p1[4] = 'Felicity JonesMads MikkelsenBen MendelsohnDonnie YenForest WhitakerDiego LunaRiz Ahmed'
raw_actors_p1[7] = 'Robert Downey, Jr.Chris HemsworthMark RuffaloChris EvansScarlett JohanssonJeremy RennerDon CheadleAaron JohnsonElizabeth OlsenPaul BettanyCobie SmuldersAnthony MackieHayley AtwellIdris ElbaStellan Skarsgard'
raw_actors_p1[10] = 'Fan BingbingRobert Downey, Jr.Gwyneth PaltrowDon CheadleGuy PearceRebecca HallJames Badge DaleJon FavreauBen Kingsley'
raw_actors_p1[13] = 'Chris PrattZoe SaldanaDave Bautista'
raw_actors_p1[19] = 'Shia LaBeoufRosie Huntington-WhiteleyJosh DuhamelTyrese GibsonFrances McDormandPatrick DempseyJohn MalkovichAlan TudykKen JeongJohn Turturro'
raw_actors_p1[22] = 'Mia WasikowskaJohnny DeppHelena Bonham CarterAnne HathawayCrispin Glover'
raw_actors_p1[23] = 'Chris PrattZoe SaldanaDave BautistaLee PaceKaren GillanDjimon HounsouJohn C. ReillyGlenn CloseBenicio Del ToroJosh Brolin'
raw_actors_p1[27] = 'Martin FreemanIan McKellenRichard ArmitageCate BlanchettHugo WeavingChristopher Lee*Ian Holm*Elijah Wood*'
raw_actors_p1[36] = 'Martin FreemanIan McKellenRichard ArmitageOrlando BloomEvangeline LillyLuke EvansLee PaceCate Blanchett'
raw_actors_p1[38] = 'Ian McKellenMartin FreemanRichard ArmitageLee PaceLuke EvansOrlando BloomEvangeline LillyHugo WeavingChristopher LeeCate Blanchett'
raw_actors_p1[46] = 'Tony CoxJames FrancoMila KunisMichelle Williams'
raw_actors_p1[65] = 'Megan FoxWill ArnettWilliam Fichtner'
raw_actors_p1[98] = 'Neil Patrick HarrisJayma MaysHank Azaria'
raw_actors_p1

In [101]:
%%capture
# Break each run-together cast string into pieces that start at an uppercase
# letter, then strip whitespace and remove '*', spaces, parentheses and the
# 'Voice)' tag from every piece.
actors_p1 = []
for a in raw_actors_p1:
    a = [
        piece.strip()
             .replace('*', '')
             .replace(' ', '')
             .replace('(', '')
             .replace('Voice)', '')
             .replace(')', '')
        for piece in re.findall('[A-Z][^A-Z]*', a)
    ]
    actors_p1.append(a)
actors_p1

In [102]:
%%capture
# Convert the raw "Domestic Total" strings for page 1 to integers by removing
# '$' and ','. Missing values (None) are kept as None so positions stay
# aligned with the other per-movie lists.
domestic_total_gross_p1 = []
for dtg in raw_domestic_total_gross_p1:
    # Identity test: None is a singleton, so `is` is the correct comparison.
    if dtg is not None:
        dtg = int(dtg.replace('$', '').replace(',', ''))
    domestic_total_gross_p1.append(dtg)
# Hand-entered grosses for specific positions.
domestic_total_gross_p1[9] = 412038809
domestic_total_gross_p1[24] = 332003538
domestic_total_gross_p1[34] = 261863570
domestic_total_gross_p1[76] = 174980616
domestic_total_gross_p1[90] = 152412453
domestic_total_gross_p1[94] = 146288952
domestic_total_gross_p1

In [103]:
%%capture
# Theater count for page 1: third whitespace-separated token of the first
# <td> in the second box-1 row.
raw_theater_p1 = [
    box[1].find_all('td')[0].text.split()[2]
    for box in find_box1_p1
]
# Hand-entered value for entry 12.
raw_theater_p1[12] = '3,742'
theater_p1 = [int(count.replace(',', '')) for count in raw_theater_p1]
theater_p1

In [104]:
%%capture
# Runtime in minutes for page 1: first token is hours, third token is minutes.
runtime_p1 = []
for rt in raw_runtime_p1:
    tokens = rt.split()
    runtime_p1.append(int(tokens[0]) * 60 + int(tokens[2]))
runtime_p1

In [105]:
%%capture
# Hand-entered opening gross for entry 12, then convert every raw string to
# an integer by trimming whitespace and removing '$' and ','.
raw_opening_gross_p1[12] = '\xa0$67,391,326'
opening_weekend_gross_p1 = [
    int(raw.strip().replace('$', '').replace(',', ''))
    for raw in raw_opening_gross_p1
]
opening_weekend_gross_p1

In [106]:
%%capture
# Parse each raw release-date string for page 1 with dateutil's parse.
release_date_p1 = [parse(raw_date) for raw_date in raw_release_date_p1]
release_date_p1

In [107]:
%%capture
# Evaluate the scraped page-1 MPAA rating strings (this cell runs under %%capture).
rating_p1

In [108]:
%%capture
# Production budget for page 1. The raw strings are read as '$<number> ...':
# the first token, minus '$', becomes an integer number of millions. 'N/A'
# entries are left as-is until the hand-entered values below are applied.
prod_budget_p1 = []
for budget in raw_pro_budget_p1:
    if budget != 'N/A':
        budget = int(budget.split()[0].replace('$', ''))
    prod_budget_p1.append(budget)
# Hand-entered budgets (in millions) for specific positions.
for position, millions in (
    (14, 250), (6, 200), (20, 150), (27, 250), (32, 200), (36, 217),
    (38, 250), (40, 150), (58, 200), (76, 125), (90, 175),
):
    prod_budget_p1[position] = millions
# Scale millions to dollars.
production_budget_p1 = [int(i * 1000000) for i in prod_budget_p1]
production_budget_p1

In [111]:
# Raw per-movie fields scraped from the movie pages listed in links_p2.
# Every list gains one entry per link, in link order.
title_p2 = []
raw_domestic_total_gross_p2 = []
raw_release_date_p2 = []
raw_runtime_p2 = []
rating_p2 = []
raw_pro_budget_p2 = []
find_box1_p2 = []  # rows returned by find_box1_info
find_box2_p2 = []  # rows returned by find_box2_info
raw_opening_gross_p2 = []

for link in links_p2:
    source_code2 = requests.get(link)
    soup2 = BeautifulSoup(source_code2.text, 'html5lib')

    # Query box 1 once and reuse the rows for the opening gross, rather than
    # calling find_box1_info a second time on the same page.
    box1_rows = find_box1_info(soup2)
    find_box1_p2.append(box1_rows)
    find_box2_p2.append(find_box2_info(soup2))

    # Second <td> of the first box-1 row, kept as the raw opening gross.
    raw_opening_wknd_gross = box1_rows[0].find_all('td')[1].text
    raw_opening_gross_p2.append(raw_opening_wknd_gross)

    # Title is the part of the page <title> before the first '('.
    title_string = soup2.find('title').text
    title_p2.append(title_string.split('(')[0].strip())

    raw_dtg = get_movie_value(soup2, 'Domestic Total')
    raw_domestic_total_gross_p2.append(raw_dtg)

    raw_releasedate = get_movie_value(soup2, 'Release Date')
    raw_release_date_p2.append(raw_releasedate)

    raw_run_time = get_movie_value(soup2, 'Runtime')
    raw_runtime_p2.append(raw_run_time)

    rating_string = get_movie_value(soup2, 'MPAA Rating')
    rating_p2.append(rating_string)

    pro_budget = get_movie_value(soup2, 'Production Budget')
    raw_pro_budget_p2.append(pro_budget)

In [171]:
# Raw per-movie fields scraped from the movie pages listed in links_p3.
# Every list gains one entry per link, in link order.
title_p3 = []
raw_domestic_total_gross_p3 = []
raw_release_date_p3 = []
raw_runtime_p3 = []
rating_p3 = []
raw_pro_budget_p3 = []
find_box1_p3 = []  # rows returned by find_box1_info
find_box2_p3 = []  # rows returned by find_box2_info
raw_opening_gross_p3 = []

for link in links_p3:
    source_code3 = requests.get(link)
    soup3 = BeautifulSoup(source_code3.text, 'html5lib')

    # Query box 1 once and reuse the rows for the opening gross, rather than
    # calling find_box1_info a second time on the same page.
    box1_rows = find_box1_info(soup3)
    find_box1_p3.append(box1_rows)
    find_box2_p3.append(find_box2_info(soup3))

    # Second <td> of the first box-1 row, kept as the raw opening gross.
    raw_opening_wknd_gross_p3 = box1_rows[0].find_all('td')[1].text
    raw_opening_gross_p3.append(raw_opening_wknd_gross_p3)

    # Title is the part of the page <title> before the first '('.
    title_string_p3 = soup3.find('title').text
    title_p3.append(title_string_p3.split('(')[0].strip())

    raw_dtg_p3 = get_movie_value(soup3, 'Domestic Total')
    raw_domestic_total_gross_p3.append(raw_dtg_p3)

    raw_releasedate_p3 = get_movie_value(soup3, 'Release Date')
    raw_release_date_p3.append(raw_releasedate_p3)

    raw_run_time_p3 = get_movie_value(soup3, 'Runtime')
    raw_runtime_p3.append(raw_run_time_p3)

    rating_string_p3 = get_movie_value(soup3, 'MPAA Rating')
    rating_p3.append(rating_string_p3)

    pro_budget_p3 = get_movie_value(soup3, 'Production Budget')
    raw_pro_budget_p3.append(pro_budget_p3)

In [113]:
# Raw per-movie fields scraped from the movie pages listed in links_p4.
# Every list gains one entry per link, in link order.
title_p4 = []
raw_domestic_total_gross_p4 = []
raw_release_date_p4 = []
raw_runtime_p4 = []
rating_p4 = []
raw_pro_budget_p4 = []
find_box1_p4 = []  # rows returned by find_box1_info
find_box2_p4 = []  # rows returned by find_box2_info
raw_opening_gross_p4 = []

for link in links_p4:
    source_code4 = requests.get(link)
    soup4 = BeautifulSoup(source_code4.text, 'html5lib')

    # Query box 1 once and reuse the rows for the opening gross, rather than
    # calling find_box1_info a second time on the same page.
    box1_rows = find_box1_info(soup4)
    find_box1_p4.append(box1_rows)
    find_box2_p4.append(find_box2_info(soup4))

    # Second <td> of the first box-1 row, kept as the raw opening gross.
    raw_opening_wknd_gross = box1_rows[0].find_all('td')[1].text
    raw_opening_gross_p4.append(raw_opening_wknd_gross)

    # Title is the part of the page <title> before the first '('.
    title_string = soup4.find('title').text
    title_p4.append(title_string.split('(')[0].strip())

    raw_dtg = get_movie_value(soup4, 'Domestic Total')
    raw_domestic_total_gross_p4.append(raw_dtg)

    raw_releasedate = get_movie_value(soup4, 'Release Date')
    raw_release_date_p4.append(raw_releasedate)

    raw_run_time = get_movie_value(soup4, 'Runtime')
    raw_runtime_p4.append(raw_run_time)

    rating_string = get_movie_value(soup4, 'MPAA Rating')
    rating_p4.append(rating_string)

    pro_budget = get_movie_value(soup4, 'Production Budget')
    raw_pro_budget_p4.append(pro_budget)

In [114]:
# Raw per-movie fields scraped from the movie pages listed in links_p5.
# Every list gains one entry per link, in link order.
title_p5 = []
raw_domestic_total_gross_p5 = []
raw_release_date_p5 = []
raw_runtime_p5 = []
rating_p5 = []
raw_pro_budget_p5 = []
find_box1_p5 = []  # rows returned by find_box1_info
find_box2_p5 = []  # rows returned by find_box2_info
raw_opening_gross_p5 = []

for link in links_p5:
    source_code5 = requests.get(link)
    soup5 = BeautifulSoup(source_code5.text, 'html5lib')

    # Query box 1 once and reuse the rows for the opening gross, rather than
    # calling find_box1_info a second time on the same page.
    box1_rows = find_box1_info(soup5)
    find_box1_p5.append(box1_rows)
    find_box2_p5.append(find_box2_info(soup5))

    # Second <td> of the first box-1 row, kept as the raw opening gross.
    raw_opening_wknd_gross = box1_rows[0].find_all('td')[1].text
    raw_opening_gross_p5.append(raw_opening_wknd_gross)

    # Title is the part of the page <title> before the first '('.
    title_string = soup5.find('title').text
    title_p5.append(title_string.split('(')[0].strip())

    raw_dtg = get_movie_value(soup5, 'Domestic Total')
    raw_domestic_total_gross_p5.append(raw_dtg)

    raw_releasedate = get_movie_value(soup5, 'Release Date')
    raw_release_date_p5.append(raw_releasedate)

    raw_run_time = get_movie_value(soup5, 'Runtime')
    raw_runtime_p5.append(raw_run_time)

    rating_string = get_movie_value(soup5, 'MPAA Rating')
    rating_p5.append(rating_string)

    pro_budget = get_movie_value(soup5, 'Production Budget')
    raw_pro_budget_p5.append(pro_budget)

In [115]:
# Raw per-movie fields scraped from the movie pages listed in links_p6.
# Every list gains one entry per link, in link order.
title_p6 = []
raw_domestic_total_gross_p6 = []
raw_release_date_p6 = []
raw_runtime_p6 = []
rating_p6 = []
raw_pro_budget_p6 = []
find_box1_p6 = []  # rows returned by find_box1_info
find_box2_p6 = []  # rows returned by find_box2_info
raw_opening_gross_p6 = []

for link in links_p6:
    source_code6 = requests.get(link)
    soup6 = BeautifulSoup(source_code6.text, 'html5lib')

    # Query box 1 once and reuse the rows for the opening gross, rather than
    # calling find_box1_info a second time on the same page.
    box1_rows = find_box1_info(soup6)
    find_box1_p6.append(box1_rows)
    find_box2_p6.append(find_box2_info(soup6))

    # Second <td> of the first box-1 row, kept as the raw opening gross.
    raw_opening_wknd_gross = box1_rows[0].find_all('td')[1].text
    raw_opening_gross_p6.append(raw_opening_wknd_gross)

    # Title is the part of the page <title> before the first '('.
    title_string = soup6.find('title').text
    title_p6.append(title_string.split('(')[0].strip())

    raw_dtg = get_movie_value(soup6, 'Domestic Total')
    raw_domestic_total_gross_p6.append(raw_dtg)

    raw_releasedate = get_movie_value(soup6, 'Release Date')
    raw_release_date_p6.append(raw_releasedate)

    raw_run_time = get_movie_value(soup6, 'Runtime')
    raw_runtime_p6.append(raw_run_time)

    rating_string = get_movie_value(soup6, 'MPAA Rating')
    rating_p6.append(rating_string)

    pro_budget = get_movie_value(soup6, 'Production Budget')
    raw_pro_budget_p6.append(pro_budget)

In [116]:
# Raw per-movie fields scraped from the movie pages listed in links_p7.
# Every list gains one entry per link, in link order.
title_p7 = []
raw_domestic_total_gross_p7 = []
raw_release_date_p7 = []
raw_runtime_p7 = []
rating_p7 = []
raw_pro_budget_p7 = []
find_box1_p7 = []  # rows returned by find_box1_info
find_box2_p7 = []  # rows returned by find_box2_info
raw_opening_gross_p7 = []

for link in links_p7:
    source_code7 = requests.get(link)
    soup7 = BeautifulSoup(source_code7.text, 'html5lib')

    # Query box 1 once and reuse the rows for the opening gross, rather than
    # calling find_box1_info a second time on the same page.
    box1_rows = find_box1_info(soup7)
    find_box1_p7.append(box1_rows)
    find_box2_p7.append(find_box2_info(soup7))

    # Second <td> of the first box-1 row, kept as the raw opening gross.
    raw_opening_wknd_gross = box1_rows[0].find_all('td')[1].text
    raw_opening_gross_p7.append(raw_opening_wknd_gross)

    # Title is the part of the page <title> before the first '('.
    title_string = soup7.find('title').text
    title_p7.append(title_string.split('(')[0].strip())

    raw_dtg = get_movie_value(soup7, 'Domestic Total')
    raw_domestic_total_gross_p7.append(raw_dtg)

    raw_releasedate = get_movie_value(soup7, 'Release Date')
    raw_release_date_p7.append(raw_releasedate)

    raw_run_time = get_movie_value(soup7, 'Runtime')
    raw_runtime_p7.append(raw_run_time)

    rating_string = get_movie_value(soup7, 'MPAA Rating')
    rating_p7.append(rating_string)

    pro_budget = get_movie_value(soup7, 'Production Budget')
    raw_pro_budget_p7.append(pro_budget)

In [117]:
# Raw per-movie fields scraped from the movie pages listed in links_p8.
# Every list gains one entry per link, in link order.
title_p8 = []
raw_domestic_total_gross_p8 = []
raw_release_date_p8 = []
raw_runtime_p8 = []
rating_p8 = []
raw_pro_budget_p8 = []
find_box1_p8 = []  # rows returned by find_box1_info
find_box2_p8 = []  # rows returned by find_box2_info
raw_opening_gross_p8 = []

for link in links_p8:
    source_code8 = requests.get(link)
    soup8 = BeautifulSoup(source_code8.text, 'html5lib')

    # Query box 1 once and reuse the rows for the opening gross, rather than
    # calling find_box1_info a second time on the same page.
    box1_rows = find_box1_info(soup8)
    find_box1_p8.append(box1_rows)
    find_box2_p8.append(find_box2_info(soup8))

    # Second <td> of the first box-1 row, kept as the raw opening gross.
    raw_opening_wknd_gross = box1_rows[0].find_all('td')[1].text
    raw_opening_gross_p8.append(raw_opening_wknd_gross)

    # Title is the part of the page <title> before the first '('.
    title_string = soup8.find('title').text
    title_p8.append(title_string.split('(')[0].strip())

    raw_dtg = get_movie_value(soup8, 'Domestic Total')
    raw_domestic_total_gross_p8.append(raw_dtg)

    raw_releasedate = get_movie_value(soup8, 'Release Date')
    raw_release_date_p8.append(raw_releasedate)

    raw_run_time = get_movie_value(soup8, 'Runtime')
    raw_runtime_p8.append(raw_run_time)

    rating_string = get_movie_value(soup8, 'MPAA Rating')
    rating_p8.append(rating_string)

    pro_budget = get_movie_value(soup8, 'Production Budget')
    raw_pro_budget_p8.append(pro_budget)

In [118]:
# Raw per-movie fields scraped from the movie pages listed in links_p9.
# Every list gains one entry per link, in link order.
title_p9 = []
raw_domestic_total_gross_p9 = []
raw_release_date_p9 = []
raw_runtime_p9 = []
rating_p9 = []
raw_pro_budget_p9 = []
find_box1_p9 = []  # rows returned by find_box1_info
find_box2_p9 = []  # rows returned by find_box2_info
raw_opening_gross_p9 = []

for link in links_p9:
    source_code9 = requests.get(link)
    soup9 = BeautifulSoup(source_code9.text, 'html5lib')

    # Query box 1 once and reuse the rows for the opening gross, rather than
    # calling find_box1_info a second time on the same page.
    box1_rows = find_box1_info(soup9)
    find_box1_p9.append(box1_rows)
    find_box2_p9.append(find_box2_info(soup9))

    # Second <td> of the first box-1 row, kept as the raw opening gross.
    raw_opening_wknd_gross = box1_rows[0].find_all('td')[1].text
    raw_opening_gross_p9.append(raw_opening_wknd_gross)

    # Title is the part of the page <title> before the first '('.
    title_string = soup9.find('title').text
    title_p9.append(title_string.split('(')[0].strip())

    raw_dtg = get_movie_value(soup9, 'Domestic Total')
    raw_domestic_total_gross_p9.append(raw_dtg)

    raw_releasedate = get_movie_value(soup9, 'Release Date')
    raw_release_date_p9.append(raw_releasedate)

    raw_run_time = get_movie_value(soup9, 'Runtime')
    raw_runtime_p9.append(raw_run_time)

    rating_string = get_movie_value(soup9, 'MPAA Rating')
    rating_p9.append(rating_string)

    pro_budget = get_movie_value(soup9, 'Production Budget')
    raw_pro_budget_p9.append(pro_budget)

In [119]:
def transform_raw_release_date(raw_release_date):
    """
    Parse raw release-date strings.

    raw_release_date: iterable of date strings.

    Returns a list holding the result of dateutil's ``parse`` for each
    input, in the same order.
    """
    return [parse(raw_date) for raw_date in raw_release_date]

In [120]:
# Parse the raw release-date strings of pages 2-9, one list per page.
(release_date_p2, release_date_p3, release_date_p4, release_date_p5,
 release_date_p6, release_date_p7, release_date_p8, release_date_p9) = map(
    transform_raw_release_date,
    (raw_release_date_p2, raw_release_date_p3, raw_release_date_p4,
     raw_release_date_p5, raw_release_date_p6, raw_release_date_p7,
     raw_release_date_p8, raw_release_date_p9),
)

In [121]:
def transform_raw_runtime(raw_runtime):
    """
    Convert raw runtime strings to total minutes.

    raw_runtime: iterable whose items are either a runtime string whose
    first whitespace-separated token is the hour count and whose third token
    is the minute count (e.g. '2 hrs. 23 min.'), or a missing-value marker:
    the string 'N/A' or None.

    Returns a list with one entry per input, in order: an int number of
    minutes, or the missing-value marker unchanged.
    """
    runtime = []
    for rt in raw_runtime:
        if rt is None or rt == 'N/A':
            # Missing runtime: pass the marker through untouched so the
            # result stays aligned with the other per-movie lists. None
            # used to crash on .split().
            runtime.append(rt)
            continue
        tokens = rt.split()
        runtime.append(int(tokens[0]) * 60 + int(tokens[2]))
    return runtime

In [122]:
# Runtime in minutes for every page, one list per page.
(runtime_p1, runtime_p2, runtime_p3,
 runtime_p4, runtime_p5, runtime_p6,
 runtime_p7, runtime_p8, runtime_p9) = map(
    transform_raw_runtime,
    (raw_runtime_p1, raw_runtime_p2, raw_runtime_p3,
     raw_runtime_p4, raw_runtime_p5, raw_runtime_p6,
     raw_runtime_p7, raw_runtime_p8, raw_runtime_p9),
)

In [123]:
# Overwrite single entries of the scraped opening-gross lists with
# hand-entered strings.
for raw_list, position, value in (
    (raw_opening_gross_p2, 95, '\xa0$13,242,895'),
    (raw_opening_gross_p3, 37, '\xa0$5,062,479'),
    (raw_opening_gross_p6, 63, '\xa0$168,051'),
    (raw_opening_gross_p7, 18, '\xa0$143,818'),
    (raw_opening_gross_p9, 2, '\xa0$2,080,051'),
):
    raw_list[position] = value

In [124]:
def transform_raw_opening_gross(raw_opening_gross):
    """
    Convert raw opening-gross strings to integers.

    raw_opening_gross: iterable of strings such as '\\xa0$67,391,326'.

    Each string is stripped of surrounding whitespace, has every '$' and ','
    removed, and is converted with ``int``. Returns the list of results in
    input order.
    """
    return [
        int(raw.strip().replace('$', '').replace(',', ''))
        for raw in raw_opening_gross
    ]

In [125]:
# Integer opening gross for pages 2-9, one list per page.
(opening_weekend_gross_p2, opening_weekend_gross_p3,
 opening_weekend_gross_p4, opening_weekend_gross_p5,
 opening_weekend_gross_p6, opening_weekend_gross_p7,
 opening_weekend_gross_p8, opening_weekend_gross_p9) = map(
    transform_raw_opening_gross,
    (raw_opening_gross_p2, raw_opening_gross_p3,
     raw_opening_gross_p4, raw_opening_gross_p5,
     raw_opening_gross_p6, raw_opening_gross_p7,
     raw_opening_gross_p8, raw_opening_gross_p9),
)

In [126]:
%%capture
# Theater count for page 2: third whitespace-separated token of the first
# <td> in the second box-1 row.
raw_theater_p2 = [
    box[1].find_all('td')[0].text.split()[2]
    for box in find_box1_p2
]
# Hand-entered value for entry 95.
raw_theater_p2[95] = '3,006'
theater_p2 = [int(count.replace(',', '')) for count in raw_theater_p2]
theater_p2

In [127]:
%%capture
# Theater count strings for page 3. Only cells whose text starts with '(#'
# are read (third token); every other movie gets 'N/A'.
theater_p3 = []
for box in find_box1_p3:
    cell_text = box[1].find_all('td')[0].text
    theater_p3.append(
        cell_text.split()[2] if cell_text.startswith('(#') else 'N/A'
    )
# Hand-entered value for entry 37.
theater_p3[37] = '2,324'
theater_p3

In [128]:
def transform_raw_theaters(find_box1_):
    """
    Extract an integer theater count from each movie's box-1 rows.

    find_box1_: iterable of row lists; for each one, the text of the first
    ``td`` in the row at index 1 is split on whitespace.

    The token used depends on how many tokens the text has:
    more than four -> the third token; exactly three -> '0';
    anything else -> the first token. ',' and '(' are removed before the
    token is converted with ``int``.

    Returns the list of counts in input order.
    """
    def pick_token(words):
        # Choose the token that holds the count for this text layout.
        if len(words) > 4:
            return words[2]
        if len(words) == 3:
            return '0'
        return words[0]

    raw_theater = [
        pick_token(box[1].find_all('td')[0].text.split())
        for box in find_box1_
    ]
    return [int(token.replace(',', '').replace('(', '')) for token in raw_theater]

In [129]:
# Integer theater counts for pages 4-9, one list per page.
(theater_p4, theater_p5, theater_p6,
 theater_p7, theater_p8, theater_p9) = map(
    transform_raw_theaters,
    (find_box1_p4, find_box1_p5, find_box1_p6,
     find_box1_p7, find_box1_p8, find_box1_p9),
)

In [130]:
def transform_raw_dtg(raw_domestic_total_gross):
    """
    Convert raw domestic-total-gross strings to integers.

    raw_domestic_total_gross: iterable of strings such as '$412,038,809',
    or None where the value is missing.

    Returns a list with one entry per input, in order: the int obtained
    after removing every '$' and ',', or None where the input was None.
    """
    domestic_total_gross = []
    for dtg in raw_domestic_total_gross:
        # Identity test: None is a singleton, so `is` is the correct comparison.
        if dtg is not None:
            dtg = int(dtg.replace('$', '').replace(',', ''))
        domestic_total_gross.append(dtg)
    return domestic_total_gross

In [131]:
# Integer domestic totals for page 2, then hand-entered grosses for specific
# positions.
domestic_total_gross_p2 = transform_raw_dtg(raw_domestic_total_gross_p2)
for position, gross in (
    (38, 84651983),
    (53, 73863410),
    (78, 51871619),
    (87, 46262620),
):
    domestic_total_gross_p2[position] = gross

In [132]:
# Integer domestic totals for page 3, then hand-entered grosses for specific
# positions.
domestic_total_gross_p3 = transform_raw_dtg(raw_domestic_total_gross_p3)
for position, gross in (
    (9, 35916045),
    (29, 26256065),
    (30, 25545476),
):
    domestic_total_gross_p3[position] = gross

In [133]:
# Hand-entered raw gross strings, set before the page 8 and 9 lists are
# converted to integers.
raw_domestic_total_gross_p8[0], raw_domestic_total_gross_p8[1] = (
    '$51,573,925',
    '$412,080,447',
)
raw_domestic_total_gross_p9[0] = '$50,430,148'

In [134]:
# Integer domestic totals for pages 4-9, one list per page.
(domestic_total_gross_p4, domestic_total_gross_p5,
 domestic_total_gross_p6, domestic_total_gross_p7,
 domestic_total_gross_p8, domestic_total_gross_p9) = map(
    transform_raw_dtg,
    (raw_domestic_total_gross_p4, raw_domestic_total_gross_p5,
     raw_domestic_total_gross_p6, raw_domestic_total_gross_p7,
     raw_domestic_total_gross_p8, raw_domestic_total_gross_p9),
)

In [168]:
# Production budget for page 2. The first token of each raw string, minus
# '$', becomes an integer number of millions; 'N/A' entries are left as-is.
prod_budget_p2 = []
for budget in raw_pro_budget_p2:
    if budget != 'N/A':
        budget = int(budget.split()[0].replace('$', ''))
    prod_budget_p2.append(budget)
# Hand-entered budgets (in millions) for specific positions.
for position, millions in (
    (12, 175), (33, 36), (35, 150), (46, 135), (50, 18), (51, 18),
    (52, 180), (63, 7), (68, 135), (86, 100), (87, 1), (89, 18),
):
    prod_budget_p2[position] = millions
# Scale millions to dollars; any remaining 'N/A' stays 'N/A'.
production_budget_p2 = [
    i if i == 'N/A' else int(i * 1000000)
    for i in prod_budget_p2
]

In [135]:
%%capture
# Production budget for page 3. The first token of each raw string, minus
# '$', becomes a float number of millions; 'N/A' entries are left as-is.
prod_budget_p3 = []
for budget in raw_pro_budget_p3:
    if budget != 'N/A':
        budget = float(budget.split()[0].replace('$', ''))
    prod_budget_p3.append(budget)
# Hand-entered budgets (in millions) for specific positions.
for position, millions in (
    (2, 50.0), (7, 2.3), (15, 20.0), (17, 125.0), (20, 90.0),
    (26, 80.0), (29, 70.0), (32, 20.0), (41, 17.0), (48, 13.0),
):
    prod_budget_p3[position] = millions
# Scale millions to dollars; any remaining 'N/A' stays 'N/A'.
production_budget_p3 = [
    i if i == 'N/A' else int(i * 1000000)
    for i in prod_budget_p3
]
len(production_budget_p3)

In [136]:
def transform_raw_prod_budget(raw_pro_budget):
    """
    Convert raw production-budget strings to integer dollar amounts.

    raw_pro_budget: iterable whose items are either a string whose first
    whitespace-separated token is '$' followed by a number of millions
    (e.g. '$200 million', '$2.5 million'), or a missing-value marker:
    the string 'N/A' or None.

    Returns a list with one entry per input, in order: the budget in dollars
    as an int, or the missing-value marker unchanged.
    """
    production_budget = []
    for budget in raw_pro_budget:
        if budget is None or budget == 'N/A':
            # Missing budget: pass the marker through untouched so the
            # result stays aligned with the other per-movie lists. None
            # used to crash on .split().
            production_budget.append(budget)
            continue
        millions = float(budget.split()[0].replace('$', ''))
        # Round instead of truncating so binary floating-point error in the
        # product cannot drop the result by one dollar.
        production_budget.append(int(round(millions * 1000000)))
    return production_budget

In [137]:
# Hand-entered replacement for entry 51 of the page-6 raw budgets, written in
# the '$<number> million' form.
raw_pro_budget_p6[51] = '$1 million'

In [138]:
# Dollar production budgets for pages 4 and 6-9, one list per page.
(production_budget_p4, production_budget_p6, production_budget_p7,
 production_budget_p8, production_budget_p9) = map(
    transform_raw_prod_budget,
    (raw_pro_budget_p4, raw_pro_budget_p6, raw_pro_budget_p7,
     raw_pro_budget_p8, raw_pro_budget_p9),
)

In [139]:
%%capture
# Production budget for page 5. The literal '$1,000,000' is mapped to
# 1 (million); otherwise the first token, minus '$', becomes an integer
# number of millions. 'N/A' entries are left as-is.
prod_budget_p5 = []
for budget in raw_pro_budget_p5:
    if budget == '$1,000,000':
        budget = 1
    elif budget != 'N/A':
        budget = int(budget.split()[0].replace('$', ''))
    prod_budget_p5.append(budget)
# Scale millions to dollars; 'N/A' stays 'N/A'.
production_budget_p5 = [
    i if i == 'N/A' else int(i * 1000000)
    for i in prod_budget_p5
]
production_budget_p5

In [142]:
%%capture
# Director names for page 2. The text of the first box-2 row is taken as
# '<label>:<names>'.
raw_directors_p2=[]
for box in find_box2_p2:
    raw_directors_p2.append(box[0].text)
# Hand-entered replacements, applied BEFORE the split below so every entry
# has the '<label>:<names>' shape. The label itself is discarded by the
# split, so its singular/plural spelling does not affect the result.
raw_directors_p2[0] = 'Directors:Mark Dindal'
raw_directors_p2[5] = 'Directors:Steve Martino'
raw_directors_p2[14] = 'Director:Cody CameronKris Pearn'
raw_directors_p2[15] = 'Directors:Hoyt Yeatman'
raw_directors_p2[29] = 'Director:Eric Brevig'
raw_directors_p2[30] = 'Directors:Eric Brevig'
raw_directors_p2[35] = 'Director:Stephen Anderson'
raw_directors_p2[38] = 'Directors:Tony Leondis'
raw_directors_p2[48] = 'Director:David Lowery'
raw_directors_p2[50] = 'Director:Henry Selick'
raw_directors_p2[51] = 'Director:Henry Selick'
raw_directors_p2[63] = 'Directors:Bruce Hendricks'
raw_directors_p2[78] = 'Director:Toni Myers'
raw_directors_p2[87] = 'Director:Howard Hall'
raw_directors_p2[88] = 'Director:Kevin Greutert'
raw_directors_p2[89] = 'Directors:Joe Alves'
# Keep the text between the first and second ':' of each entry.
raw_directors_p2 = [str(raw.split(':')[1]) for raw in raw_directors_p2]
directors_p2 = []
for a in raw_directors_p2:
    # Break the run-together names into pieces that each start at an
    # uppercase letter and run up to the next uppercase letter.
    a = re.findall('[A-Z][^A-Z]*', a)
    directors_p2.append(a) 
# Trim each piece and remove spaces and parentheses.
directors_p2 = [[name.strip().replace(' ','').replace(')','').replace('(','') for name in d] for d in directors_p2] 
directors_p2

In [183]:
%%capture
# Director names for page 3. The text of the first box-2 row is taken as
# '<label>:<names>'.
raw_directors_p3=[]
for box in find_box2_p3:
    raw_directors_p3.append(box[0].text)
# Disabled debugging aid: lists the entries whose first row does not start
# with 'Director'.
# for i, raw in enumerate(raw_directors_p3):
#     if not raw.startswith('Director'):
#         print(i,raw)
# Hand-entered replacements, applied BEFORE the split below so every entry
# has the '<label>:<names>' shape. The label itself is discarded by the
# split, so its singular/plural spelling does not affect the result.
raw_directors_p3[4] = 'Directors:Steve Miner'
raw_directors_p3[8] = 'Directors:Howard Hall'
raw_directors_p3[10] = 'Director:Cody CameronKris Pearn'
raw_directors_p3[15] = 'Directors:Rachel Talalay'
raw_directors_p3[21] = 'Director:Ericson Core'
raw_directors_p3[25] = 'Directors:Charlie BeanPaul FisherBob Logan'
raw_directors_p3[30] = 'Director:David Lickley'
raw_directors_p3[31] = 'Directors:Dan CutforthJane Lipsitz'
raw_directors_p3[32] = 'Director:David Lowery'
raw_directors_p3[41] = 'Director:Anthony BellBen Gluck'
raw_directors_p3[49] = 'Director:Lamont Johnson'
# Keep the text between the first and second ':' of each entry.
raw_directors_p3 = [str(raw.split(':')[1]) for raw in raw_directors_p3]
directors_p3 = []
for a in raw_directors_p3:
    # Break the run-together names into pieces that each start at an
    # uppercase letter and run up to the next uppercase letter.
    a = re.findall('[A-Z][^A-Z]*', a)
    directors_p3.append(a) 
# Trim each piece and remove spaces and parentheses.
directors_p3 = [[name.strip().replace(' ','')
                 .replace(')','')
                 .replace('(','') for name in d] for d in directors_p3] 
directors_p3

In [184]:
def transform_raw_directors(find_box2_):
    """
    Returns the labelled director credit of every info box.
    -----
    find_box2_:
    iterable of info boxes; the first element of each box must expose
    a `.text` attribute.
    -----
    The text of the first element is kept when it starts with 'Director';
    any other text is replaced by the placeholder 'Directors:N/A'.
    """
    placeholder = 'Directors:N/A'
    first_texts = [box[0].text for box in find_box2_]
    return [text if text.startswith('Director') else placeholder
            for text in first_texts]

In [185]:
def _second_colon_field(labelled_credits):
    """Returns, for every entry, the second ':'-separated field as a str."""
    return [str(entry.split(':')[1]) for entry in labelled_credits]


# Labelled credits per page, followed by the same credits without the label.
directors_4 = transform_raw_directors(find_box2_p4)
directors_p4 = _second_colon_field(directors_4)

directors_5 = transform_raw_directors(find_box2_p5)
directors_p5 = _second_colon_field(directors_5)

directors_6 = transform_raw_directors(find_box2_p6)
directors_p6 = _second_colon_field(directors_6)

directors_7 = transform_raw_directors(find_box2_p7)
directors_p7 = _second_colon_field(directors_7)

directors_8 = transform_raw_directors(find_box2_p8)
directors_p8 = _second_colon_field(directors_8)

directors_9 = transform_raw_directors(find_box2_p9)
directors_p9 = _second_colon_field(directors_9)

In [186]:
%%capture
# Choose which entry of each info box to read, depending on the box length:
# more than 3 entries -> index 2, 2 or 3 entries -> index 1, otherwise index 0.
raw_actors_p2 = []
for box in find_box2_p2:
    size = len(box)
    if size > 3:
        slot = 2
    elif size in (2, 3):
        slot = 1
    else:
        slot = 0
    raw_actors_p2.append(box[slot].text)

# Hand-entered credits, keyed by position in raw_actors_p2.
animated = 'Actors:Animation'
manual_actors_p2 = {
    0: animated,
    5: animated,
    6: 'Actors:Mark Wahlberg Anthony Hopkins Josh Duhamel Stanley Tucci',
    14: animated,
    15: animated,
    29: 'Actors:Brendan Fraser Josh Hutcherson',
    33: animated,
    35: animated,
    40: 'Actors:Henry Cavill Stephen Dorff Isabel Lucas Freida Pinto Luke Evans Kellan Lutz John Hurt Mickey Rourke',
    41: animated,
    50: animated,
    51: animated,
    53: 'Actors:Ben Kingsley Chloe Moretz Sacha Baron Cohen Jude Law Emily Mortimer* Christopher Lee* Richard Griffiths* Ray Winstone*',
    54: animated,
    56: 'Actors:Justin Bieber',
    62: 'Actors:Devon Sawa Ali Larter Kerr Smith Kristen Cloke Daniel Roebuck',
    71: animated,
    77: animated,
    78: 'Actors:Leonardo DiCaprio*',
    80: 'Actors:Tom Hanniger Sarah Palmer Megan Boone',
    87: animated,
    98: 'Actors:Alyson Stoner Harry Shum Jr Sharni Vinson',
}
for position, manual_credit in manual_actors_p2.items():
    raw_actors_p2[position] = manual_credit

# Keep only the second ':'-separated field (the text after the label).
raw_actors_p2 = [str(labelled.split(':')[1]) for labelled in raw_actors_p2]
raw_actors_p2

In [187]:
%%capture
# An entry in which '(Voice)' occurs more often than a third of its
# whitespace-separated words is collapsed to the single label "Animation".
for position in range(len(raw_actors_p2)):
    credit = raw_actors_p2[position]
    voice_tags = credit.count('(Voice)')
    if voice_tags > len(credit.split())/3:
        raw_actors_p2[position] = "Animation"

# Hand-entered replacements for two entries.
manual_fixes_p2 = {
    64: 'Ewen BremnerWarwick DavisNicholas HoultEleanor TomlinsonEwan McGregorStanley TucciEddie MarsanIan',
    93: 'Christina ApplegateMichael',
}
for position, manual_credit in manual_fixes_p2.items():
    raw_actors_p2[position] = manual_credit
raw_actors_p2

In [188]:
%%capture
# Split every credit at uppercase letters, then remove, in this order,
# '*', spaces, '(', 'Voice)' and ')' from each chunk.
actors_p2 = []
for credit in raw_actors_p2:
    cleaned = []
    for chunk in re.findall('[A-Z][^A-Z]*', credit):
        chunk = chunk.strip()
        for junk in ('*', ' ', '(', 'Voice)', ')'):
            chunk = chunk.replace(junk, '')
        cleaned.append(chunk)
    actors_p2.append(cleaned)
actors_p2

In [189]:
%%capture
# Choose which entry of each info box to read, depending on the box length.
raw_actors_p3 = []
for box in find_box2_p3:
    if len(box) > 3:
        # NOTE(review): this page reads index 3 for long boxes, whereas the
        # other pages read index 2 -- confirm this is intentional.
        raw_actors_p3.append(box[3].text)
    elif len(box) == 3:
        raw_actors_p3.append(box[1].text)
    elif len(box) == 2:
        # Was a second, unreachable `len(box) == 3` test; two-entry boxes
        # used to fall through to box[0].
        raw_actors_p3.append(box[1].text)
    else:
        raw_actors_p3.append(box[0].text)

# Disabled diagnostic: lists the entries that do not start with 'Actor'.
# for i, raw in enumerate(raw_actors_p3):
#     if not raw.startswith('Actor'):
#         print(i,raw)

# Hand-entered credits for individual positions.
raw_actors_p3[7] = 'Actors:Dana KimmellPaul KratkaRichard Brooker'
raw_actors_p3[8] = 'Actors:Animation'
raw_actors_p3[9] = 'Actors:Animation'
raw_actors_p3[14] = 'Actors:Robert EnglundLisa ZaneShon GreenblattLezlie DeaneYaphet Kotto'
raw_actors_p3[23] = 'Actors:Niall HoranZayn MalikLiam PayneHarry StylesLouis Tomlinson'
raw_actors_p3[24] = 'Actors:Luke BraceyDelroy LindoTeresa PalmerEdgar RamirezRay Winstone'
raw_actors_p3[27] = 'Actors:Milla Jovovich'
raw_actors_p3[28] = 'Actors:Jack HustonToby KebbellRodrigo SantoroAyelet ZurerMorgan Freeman'
raw_actors_p3[29] = 'Actors:Animation'
raw_actors_p3[30] = 'Actors:Animation'
raw_actors_p3[31] = 'Actors:Katy Perry'
raw_actors_p3[32] = 'Actors:Animation'
raw_actors_p3[33] = 'Actors:Ben Kingsley Chloe Moretz Sacha Baron Cohen Jude Law Emily Mortimer* Christopher Lee* Richard Griffiths* Ray Winstone*'
raw_actors_p3[34] = 'Actors:Animation'
raw_actors_p3[35] = 'Actors:Tom HollandChris HemsworthBenjamin WalkerCillian MurphyBen WhishawBrendan Gleeson'
raw_actors_p3[36] = 'Actors:Ioan GruffuddRichard RoxburghRhys Wakefield'
raw_actors_p3[41] = 'Actors:Kevin JonasJoe JonasNick JonasDemi LovatoTaylor Swift'
raw_actors_p3[43] = 'Actors:Animation'
raw_actors_p3[48] = 'Actors:Animation*'
raw_actors_p3[49] = 'Actors:Peter StraussMolly RingwaldErnie HudsonMichael Ironside'

# Keep only the second ':'-separated field (the text after the label).
raw_actors_p3 = [str(raw.split(':')[1]) for raw in raw_actors_p3]
raw_actors_p3

In [190]:
%%capture
# Split every credit at uppercase letters and clean each chunk.
# '(executive)' is removed BEFORE the bare parentheses: it contains no
# uppercase letter, so it stays attached to the preceding chunk, and once
# '(' and ')' are stripped the literal '(executive)' can no longer match.
actors_p3 = []
for credit in raw_actors_p3:
    cleaned = []
    for chunk in re.findall('[A-Z][^A-Z]*', credit):
        chunk = chunk.strip()
        for junk in ('*', '(executive)', ' ', '(', ')'):
            chunk = chunk.replace(junk, '')
        cleaned.append(chunk)
    actors_p3.append(cleaned)
actors_p3

In [191]:
%%capture
# Choose which entry of each info box to read, depending on the box length:
# more than 3 entries -> index 2, 2 or 3 entries -> index 1, otherwise index 0.
raw_actors_p4 = []
for box in find_box2_p4:
    size = len(box)
    if size > 3:
        slot = 2
    elif size in (2, 3):
        slot = 1
    else:
        slot = 0
    raw_actors_p4.append(box[slot].text)

# Start from an empty cast list for this page.
actors_p4 = []

# Entries that do not start with 'Actor' fall back to the placeholder
# "Actor:N/a"; afterwards only the second ':'-separated field is kept.
actorss_p4 = [credit if credit.startswith('Actor') else "Actor:N/a"
              for credit in raw_actors_p4]
actorss_p4 = [credit.split(':')[1] for credit in actorss_p4]

In [192]:
%%capture
# Rebuild the list from scratch so that re-running this cell does not append
# a second copy of every entry.
actors_p4 = []
for credit in actorss_p4:
    cleaned = []
    # Chunks start at an uppercase letter, so "(Cameo)" is split into a
    # trailing "(" on the previous chunk and a separate "Cameo)" chunk.
    for chunk in re.findall('[A-Z][^A-Z]*', credit):
        chunk = chunk.strip()
        # 'Cameo)' replaces the former '(Cameo)' pattern, which could never
        # match: '(' was already removed and the regex had split the tag.
        for junk in ('*', ' ', '(', 'Cameo)', ')'):
            chunk = chunk.replace(junk, '')
        cleaned.append(chunk)
    actors_p4.append(cleaned)
actors_p4

In [193]:
%%capture
# Choose which entry of each info box to read, depending on the box length:
# more than 3 entries -> index 2, 2 or 3 entries -> index 1, otherwise index 0.
raw_actors_p5 = []
for box in find_box2_p5:
    size = len(box)
    if size > 3:
        slot = 2
    elif size in (2, 3):
        slot = 1
    else:
        slot = 0
    raw_actors_p5.append(box[slot].text)

# Start from an empty cast list for this page.
actors_p5 = []

# Entries that do not start with 'Actor' fall back to the placeholder
# "Actor:N/a"; afterwards only the second ':'-separated field is kept.
actorss_p5 = [credit if credit.startswith('Actor') else "Actor:N/a"
              for credit in raw_actors_p5]
actorss_p5 = [str(credit.split(':')[1]) for credit in actorss_p5]
actorss_p5

In [194]:
%%capture
# Rebuild the list from scratch so that re-running this cell does not append
# a second copy of every entry.
actors_p5 = []
for credit in actorss_p5:
    cleaned = []
    # Chunks start at an uppercase letter, so "(Cameo)" / "(Voice)" are split
    # into a trailing "(" on the previous chunk and a "Cameo)" / "Voice)" chunk.
    for chunk in re.findall('[A-Z][^A-Z]*', credit):
        chunk = chunk.strip()
        # 'Cameo)' replaces the former '(Cameo)' pattern, which could never
        # match: '(' was already removed and the regex had split the tag.
        for junk in ('*', ' ', '(', 'Cameo)', 'Voice)', ')'):
            chunk = chunk.replace(junk, '')
        cleaned.append(chunk)
    actors_p5.append(cleaned)
actors_p5

In [195]:
%%capture
# Choose which entry of each info box to read, depending on the box length:
# more than 3 entries -> index 2, 2 or 3 entries -> index 1, otherwise index 0.
raw_actors_p6 = []
for box in find_box2_p6:
    size = len(box)
    if size > 3:
        slot = 2
    elif size in (2, 3):
        slot = 1
    else:
        slot = 0
    raw_actors_p6.append(box[slot].text)

# Start from an empty cast list for this page.
actors_p6 = []

# Entries that do not start with 'Actor' fall back to the placeholder
# "Actor:N/a"; afterwards only the second ':'-separated field is kept.
actorss_p6 = [credit if credit.startswith('Actor') else "Actor:N/a"
              for credit in raw_actors_p6]
actorss_p6 = [str(credit.split(':')[1]) for credit in actorss_p6]
actorss_p6

In [196]:
%%capture
# Rebuild the list from scratch so that re-running this cell does not append
# a second copy of every entry.
actors_p6 = []
for credit in actorss_p6:
    cleaned = []
    # Chunks start at an uppercase letter, so "(Cameo)" / "(Voice)" are split
    # into a trailing "(" on the previous chunk and a "Cameo)" / "Voice)" chunk.
    for chunk in re.findall('[A-Z][^A-Z]*', credit):
        chunk = chunk.strip()
        # 'Cameo)' replaces the former '(Cameo)' pattern, which could never
        # match: '(' was already removed and the regex had split the tag.
        for junk in ('*', ' ', '(', 'Cameo)', 'Voice)', ')'):
            chunk = chunk.replace(junk, '')
        cleaned.append(chunk)
    actors_p6.append(cleaned)
actors_p6

In [197]:
%%capture
# Choose which entry of each info box to read, depending on the box length:
# more than 3 entries -> index 2, 2 or 3 entries -> index 1, otherwise index 0.
raw_actors_p7 = []
for box in find_box2_p7:
    size = len(box)
    if size > 3:
        slot = 2
    elif size in (2, 3):
        slot = 1
    else:
        slot = 0
    raw_actors_p7.append(box[slot].text)

# Start from an empty cast list for this page.
actors_p7 = []

# Entries that do not start with 'Actor' fall back to the placeholder
# "Actor:N/a"; afterwards only the second ':'-separated field is kept.
actorss_p7 = [credit if credit.startswith('Actor') else "Actor:N/a"
              for credit in raw_actors_p7]
actorss_p7 = [str(credit.split(':')[1]) for credit in actorss_p7]
actorss_p7

In [198]:
%%capture
# Rebuild the list from scratch so that re-running this cell does not append
# a second copy of every entry.
actors_p7 = []
for credit in actorss_p7:
    cleaned = []
    # Chunks start at an uppercase letter, so "(Cameo)" / "(Voice)" are split
    # into a trailing "(" on the previous chunk and a "Cameo)" / "Voice)" chunk.
    for chunk in re.findall('[A-Z][^A-Z]*', credit):
        chunk = chunk.strip()
        # 'Cameo)' replaces the former '(Cameo)' pattern, which could never
        # match: '(' was already removed and the regex had split the tag.
        for junk in ('*', ' ', '(', 'Cameo)', 'Voice)', ')'):
            chunk = chunk.replace(junk, '')
        cleaned.append(chunk)
    actors_p7.append(cleaned)
actors_p7

In [199]:
%%capture
# Choose which entry of each info box to read, depending on the box length:
# more than 3 entries -> index 2, 2 or 3 entries -> index 1, otherwise index 0.
raw_actors_p8 = []
for box in find_box2_p8:
    size = len(box)
    if size > 3:
        slot = 2
    elif size in (2, 3):
        slot = 1
    else:
        slot = 0
    raw_actors_p8.append(box[slot].text)

# Start from an empty cast list for this page.
actors_p8 = []

# Entries that do not start with 'Actor' fall back to the placeholder
# "Actor:N/a"; afterwards only the second ':'-separated field is kept.
actorss_p8 = [credit if credit.startswith('Actor') else "Actor:N/a"
              for credit in raw_actors_p8]
actorss_p8 = [str(credit.split(':')[1]) for credit in actorss_p8]
actorss_p8

In [200]:
%%capture
# Rebuild the list from scratch so that re-running this cell does not append
# a second copy of every entry.
actors_p8 = []
for credit in actorss_p8:
    cleaned = []
    # Chunks start at an uppercase letter, so "(Cameo)" / "(Voice)" are split
    # into a trailing "(" on the previous chunk and a "Cameo)" / "Voice)" chunk.
    for chunk in re.findall('[A-Z][^A-Z]*', credit):
        chunk = chunk.strip()
        # 'Cameo)' replaces the former '(Cameo)' pattern, which could never
        # match: '(' was already removed and the regex had split the tag.
        for junk in ('*', ' ', '(', 'Cameo)', 'Voice)', ')'):
            chunk = chunk.replace(junk, '')
        cleaned.append(chunk)
    actors_p8.append(cleaned)
actors_p8

In [201]:
%%capture
# Choose which entry of each info box to read, depending on the box length:
# more than 3 entries -> index 2, 2 or 3 entries -> index 1, otherwise index 0.
raw_actors_p9 = []
for box in find_box2_p9:
    size = len(box)
    if size > 3:
        slot = 2
    elif size in (2, 3):
        slot = 1
    else:
        slot = 0
    raw_actors_p9.append(box[slot].text)

# Start from an empty cast list for this page.
actors_p9 = []

# Entries that do not start with 'Actor' fall back to the placeholder
# "Actor:N/a"; afterwards only the second ':'-separated field is kept.
actorss_p9 = [credit if credit.startswith('Actor') else "Actor:N/a"
              for credit in raw_actors_p9]
actorss_p9 = [str(credit.split(':')[1]) for credit in actorss_p9]
actorss_p9

In [202]:
%%capture
# Rebuild the list from scratch so that re-running this cell does not append
# a second copy of every entry.
actors_p9 = []
for credit in actorss_p9:
    cleaned = []
    # Chunks start at an uppercase letter, so "(Cameo)" / "(Voice)" are split
    # into a trailing "(" on the previous chunk and a "Cameo)" / "Voice)" chunk.
    for chunk in re.findall('[A-Z][^A-Z]*', credit):
        chunk = chunk.strip()
        # 'Cameo)' replaces the former '(Cameo)' pattern, which could never
        # match: '(' was already removed and the regex had split the tag.
        for junk in ('*', ' ', '(', 'Cameo)', 'Voice)', ')'):
            chunk = chunk.replace(junk, '')
        cleaned.append(chunk)
    actors_p9.append(cleaned)
actors_p9

In [203]:
# Total number of cast entries across pages 4-9.
s = sum(len(page_actors) for page_actors in (actors_p4, actors_p5, actors_p6,
                                             actors_p7, actors_p8, actors_p9))
s

208

In [204]:

# Concatenate the per-page lists (pages 4-9) into one list per column.
# Each name is assigned exactly once; actors_2D and directors_2D used to be
# built twice with identical expressions.
title_2D = title_p4 + title_p5 + title_p6 + title_p7 + title_p8 + title_p9

# One '2D' label per movie, derived from the data instead of a fixed count.
genre_2D = ['2D'] * len(title_2D)

domestic_total_gross_2D = (domestic_total_gross_p4
                           + domestic_total_gross_p5
                           + domestic_total_gross_p6
                           + domestic_total_gross_p7
                           + domestic_total_gross_p8
                           + domestic_total_gross_p9)

release_date_2D = (release_date_p4 + release_date_p5
                   + release_date_p6 + release_date_p7
                   + release_date_p8 + release_date_p9)

runtime_2D = runtime_p4 + runtime_p5 + runtime_p6 + runtime_p7 + runtime_p8 + runtime_p9

theater_2D = theater_p4 + theater_p5 + theater_p6 + theater_p7 + theater_p8 + theater_p9

rating_2D = rating_p4 + rating_p5 + rating_p6 + rating_p7 + rating_p8 + rating_p9

production_budget_2D = (production_budget_p4 + production_budget_p5
                        + production_budget_p6 + production_budget_p7
                        + production_budget_p8 + production_budget_p9)

directors_2D = (directors_p4 + directors_p5
                + directors_p6 + directors_p7
                + directors_p8 + directors_p9)

actors_2D = actors_p4 + actors_p5 + actors_p6 + actors_p7 + actors_p8 + actors_p9

opening_weekend_gross_2D = (opening_weekend_gross_p4
                            + opening_weekend_gross_p5
                            + opening_weekend_gross_p6
                            + opening_weekend_gross_p7
                            + opening_weekend_gross_p8
                            + opening_weekend_gross_p9)

In [205]:
# Number of director entries in the combined 2D list.
len(directors_2D)

208

In [218]:
def create_movies_dict(title, genre,
                       domestic_total_gross,
                       release_date,
                       runtime, theater,
                       rating, production_budget,
                       directors, actors,
                       opening_weekend_gross):
    """
    Returns a pandas DataFrame with one column per argument.
    -----
    Every argument supplies the values of one column:
    title -> 'Movie Title', genre -> 'Genre',
    domestic_total_gross -> 'Domestic Total Gross ($)',
    release_date -> 'Release Date', runtime -> 'Runtime (mins)',
    theater -> 'Wildest Release', rating -> 'Rating',
    production_budget -> 'Production Budget ($)',
    directors -> 'Directors', actors -> 'Actors',
    opening_weekend_gross -> 'Opening Weekend Gross ($)'.
    -----
    Despite the function name, the result is a DataFrame, not a dict.
    """
    movies = {
        'Movie Title': title,
        'Genre': genre,
        'Domestic Total Gross ($)': domestic_total_gross,
        'Release Date': release_date,
        'Runtime (mins)': runtime,
        'Wildest Release': theater,
        'Rating': rating,
        'Production Budget ($)': production_budget,
        'Directors': directors,
        'Actors': actors,
        'Opening Weekend Gross ($)': opening_weekend_gross,
    }
    # Column order of the returned frame (descriptive columns first,
    # monetary columns last).
    column_order = [
        'Movie Title',
        'Genre',
        'Release Date',
        'Runtime (mins)',
        'Wildest Release',
        'Rating',
        'Directors',
        'Actors',
        'Production Budget ($)',
        'Opening Weekend Gross ($)',
        'Domestic Total Gross ($)',
    ]
    frame = pd.DataFrame(movies)
    return frame[column_order]

In [219]:
# Assemble the combined 2D lists into a single frame.
df_2D = create_movies_dict(
    title=title_2D,
    genre=genre_2D,
    domestic_total_gross=domestic_total_gross_2D,
    release_date=release_date_2D,
    runtime=runtime_2D,
    theater=theater_2D,
    rating=rating_2D,
    production_budget=production_budget_2D,
    directors=directors_2D,
    actors=actors_2D,
    opening_weekend_gross=opening_weekend_gross_2D,
)

In [231]:
# One '3D' label per movie on this page; the length follows the title list
# instead of a hard-coded 100.
genre_p1 = ['3D'] * len(title_p1)
df_p1 = create_movies_dict(title=title_p1,
                           genre=genre_p1,
                           domestic_total_gross=domestic_total_gross_p1,
                           release_date=release_date_p1,
                           runtime=runtime_p1,
                           theater=theater_p1,
                           rating=rating_p1,
                           production_budget=production_budget_p1,
                           directors=directors_p1,
                           actors=actors_p1,
                           opening_weekend_gross=opening_weekend_gross_p1)

In [223]:
# One '3D' label per movie on this page; the length follows the title list
# instead of a hard-coded 100.
genre_p2 = ['3D'] * len(title_p2)
df_p2 = create_movies_dict(title=title_p2,
                           genre=genre_p2,
                           domestic_total_gross=domestic_total_gross_p2,
                           release_date=release_date_p2,
                           runtime=runtime_p2,
                           theater=theater_p2,
                           rating=rating_p2,
                           production_budget=production_budget_p2,
                           directors=directors_p2,
                           actors=actors_p2,
                           opening_weekend_gross=opening_weekend_gross_p2)

In [225]:
# One '3D' label per movie on this page; the length follows the title list
# instead of a hard-coded 50.
genre_p3 = ['3D'] * len(title_p3)
df_p3 = create_movies_dict(title=title_p3,
                           genre=genre_p3,
                           domestic_total_gross=domestic_total_gross_p3,
                           release_date=release_date_p3,
                           runtime=runtime_p3,
                           theater=theater_p3,
                           rating=rating_p3,
                           production_budget=production_budget_p3,
                           directors=directors_p3,
                           actors=actors_p3,
                           opening_weekend_gross=opening_weekend_gross_p3)

In [232]:
# Stack the three 3D pages. ignore_index=True renumbers the rows 0..n-1;
# without it each page keeps its own labels, so labels would repeat.
df_3D = pd.concat([df_p1, df_p2, df_p3], ignore_index=True)

In [233]:
# Stack 3D and 2D movies. ignore_index=True renumbers the rows 0..n-1 so that
# every row label in the combined frame is unique.
df_movies = pd.concat([df_3D, df_2D], ignore_index=True)

In [234]:
# Dimensions (rows, columns) of the combined frame.
df_movies.shape

(458, 11)

In [236]:
# Preview the first rows instead of rendering the whole frame.
df_movies.head(10)

Unnamed: 0,Movie Title,Genre,Release Date,Runtime (mins),Wildest Release,Rating,Directors,Actors,Production Budget ($),Opening Weekend Gross ($),Domestic Total Gross ($)
0,Star Wars: The Force Awakens,3D,2015-12-18,136,4134,PG-13,J.J. Abrams,"[John, Boyega, Daisy, Ridley, Adam, Driver, Os...",245000000,247966675,936662225.0
1,Avatar,3D,2009-12-18,160,3452,PG-13,James Cameron,"[Sam, Worthington, Zoe, Saldana, Sigourney, We...",237000000,77025481,749766139.0
2,Jurassic World,3D,2015-06-12,124,4274,PG-13,Colin Trevorrow,"[Nick, Robinson, Omar, Sy, Chris, Pratt, Bryce...",150000000,208806270,652270625.0
3,Marvel's The Avengers,3D,2012-05-04,142,4349,PG-13,Joss Whedon,"[Robert, Downey,, Jr., Chris, Hemsworth, Chris...",220000000,207438708,623357910.0
4,Rogue One: A Star Wars Story,3D,2016-12-16,133,4157,PG-13,Gareth Edwards,"[Felicity, Jones, Mads, Mikkelsen, Ben, Mendel...",200000000,155081681,532177324.0
5,Beauty and the Beast,3D,2017-03-17,129,4210,PG,Bill Condon,"[Emma, Watson, Luke, Evans, Emma, Thompson, Ke...",160000000,174750616,504014165.0
6,Finding Dory,3D,2016-06-17,103,4305,PG,Angus MacLane Andrew Stanton,[Animation],200000000,135060273,486295561.0
7,Avengers: Age of Ultron,3D,2015-05-01,141,4276,PG-13,Joss Whedon,"[Robert, Downey,, Jr., Chris, Hemsworth, Mark,...",250000000,191271109,459005868.0
8,Toy Story 3,3D,2010-06-18,103,4028,G,Lee Unkrich,[Animation],200000000,110307189,415004880.0
9,Wonder Woman,3D,2017-06-02,141,4165,PG-13,Patty Jenkins,"[Gal, Gadot, Chris, Pine, Connie, Nielsen, Rob...",149000000,103251471,412038809.0


In [214]:
# with open('../../Data/movies.pickle', 'wb') as handle: 
#     pickle.dump(df_movies, handle, pickle.HIGHEST_PROTOCOL)