# Predicting the ratings of films by one film director at a time

Joshua Banks Mailman: joshuabankmailman@gmail.com

#### This project applies linear regression to analysis of viewer ratings on IMDB.com

### The data processing and analysis pipeline divides into _five_ jupyter notebooks

## 1. Scrape and clean
(this notebook)

## 2. Wrangle and EDA

## 3. (a) EDA and exploratory linear regression

##    3. (b) Regularized linear regression 
(Lasso and Ridge w/ Cross Validation)

## 3. (c) Regularized linear regression with _polynomial features_

## Data processing files -- 
#### Between these notebooks, _pickled_ datastructures are stored in separate folders for each director

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
#import numpy as np
import re
#import math
#import seaborn as sns
#from matplotlib import pyplot as plt
#from collections import defaultdict
from IPython.core.display import display, HTML
import pickle


In [2]:
dirs = {'fassbinder': 'https://www.imdb.com/name/nm0001202/', \
        'godard':'https://www.imdb.com/name/nm0000419/', \
        'hitchcock': 'https://www.imdb.com/name/nm0000033', \
        'spielberg': 'https://www.imdb.com/name/nm0000229', \
        'kurosawa': 'https://www.imdb.com/name/nm0000041/', \
        'bergman': 'https://www.imdb.com/name/nm0000005/', \
        'allen': 'https://www.imdb.com/name/nm0000095',\
        'herzog': 'https://www.imdb.com/name/nm0001348/',
        'scorsese': 'https://www.imdb.com/name/nm0000217/'}
        # scorcese

dirs_box_office = {'godard':'https://www.the-numbers.com/person/55430401-Jean-Luc-Godard#tab=technical', \
                   'hitchcock':'https://www.the-numbers.com/person/66230401-Alfred-Hitchcock#tab=technical', \
                   'spielberg': 'https://www.the-numbers.com/person/135430401-Steven-Spielberg#tab=technical',\
                   'allen':'https://www.the-numbers.com/person/4010401-Woody-Allen#tab=technical', \
                   'bergman': 'https://www.the-numbers.com/person/13600401-Ingmar-Bergman#tab=technical',\
                   'herzog':'https://www.the-numbers.com/person/65080401-Werner-Herzog#tab=technical',
                   'scorsese': 'https://www.the-numbers.com/person/128910401-Martin-Scorsese#tab=technical'}

In [3]:
director = 'hitchcock'

In [4]:

dirs_page = dirs[director]
url = dirs_page
response = requests.get(url)
print( response.status_code )
page = response.text
soup = BeautifulSoup(page, "lxml")
#need to distinguish 'filmo-head-director' from 'assistant_director' etc.
head_director_soup = soup.find('div', id='filmo-head-director')
head_director_section_soup= head_director_soup.find_next_sibling('div', class_='filmo-category-section')
films = head_director_section_soup.find_all('div', class_=re.compile('filmo-row'), id=re.compile("director"))
#films = soup.find_all('div', class_=re.compile('filmo-row'), id=re.compile("director"))

200


* Clean the datapoints below
* Ensure they are available or handled for each film's page
* Convert into numeric datatypes
* Populate a list of dictionaries with these 
* Within the loop that goes through every flick, populate the list of dictionaries with these
* Do one-hot-encoding on categorical variables
* Train and test the linear regression model

In [5]:
def is_tv_series( the_film_soup ):
    try:
        parenthesis = the_film_soup.find('a').next
        try:
            if 'TV ' in parenthesis.next:
                return True
            else:
                return False
        except:
            return False
    except:
        return False

In [6]:
def is_documentary( the_film_soup ):
    try:
        parenthesis = the_film_soup.find('a').next
        #print(parenthesis)
        try:
            if 'documentary' in parenthesis.next.lower():
                return 1
            else:
                return 0
        except:
            return 0
    except:
        return 0

In [7]:
def parse_crew_data(the_crew_soup):
    crew_data = {}
    try: 
        #print( the_crew_soup.find('h4', id='cinematographer').next_sibling )
        crew_data['cinematographer'] = the_crew_soup.find('h4', id='cinematographer').find_next_sibling('table', class_='simpleTable').find('a').text.strip()
    except: 
        crew_data['cinematographer'] = ''
    return (crew_data)

In [8]:
def parse_cast_data(the_film_soup):
    try:
        cast_table = the_film_soup.find('table', class_='cast_list').find_all('td', class_='primary_photo')
        cast_list = [(row.find_next_sibling().text.strip(), 1) for row in cast_table ]
        #cast_dict= defaultdict(cast_list)
        #cast_dict.setdefault(1)
        cast_dict = dict(cast_list)
        #print(cast_dict)
        return(cast_dict)
        #return(  the_film_soup.find('table', class_='cast_list').find_all('a', class_='primary_photo').find_next_sibling().text )
        
  #     the_film_soup.find('table', class_='cast_list').find_all('a', class_=re.compile('filmo-row'), id=re.compile("director"))
  #  return the_film_soup.find('table', class_='cast_list').find('a')
    except:
        return dict()

In [9]:
#crew_url = test_url + '/fullcredits'
crew_url = 'https://www.imdb.com/title/tt0084654/fullcredits'
crew_response = requests.get(crew_url)
crew_page = crew_response.text
crew_soup = BeautifulSoup(crew_page, "lxml")

the_crew_data = parse_crew_data( crew_soup )
#print(the_crew_data)
#print(is_tv_series(test_row))

In [10]:
def get_film_data_only_title( film_soup ):
    film_title_string = film_soup.find(class_ = 'title_wrapper').find('h1').text
    print(film_title_string)
    
# searching from the page_content might be more efficient. To be tested later
def parse_film_data( film_soup ):
    film_data = {}
    
    try:        
        film_title_string = (film_soup.find(class_ = 'title_wrapper').find('h1').text).strip()
    except:
        film_title_string = ''
    title_line = film_title_string.split('\xa0(')
    film_data['title'] = title_line[0].replace('\xa0','').rstrip() # title part of title line
    
    film_data['year'] = title_line[1].replace(') ', '').strip(')') if len(title_line) > 1 else '' # year part of title line
    
    try:
        film_rating_tag = film_soup.find('span', itemprop='ratingValue')
        try:
            film_rating_string = film_rating_tag.text   #find_next_sibling().text
        except:
            film_rating_string = None      
    except:
        film_rating_string = None 
    film_data['rating'] = film_rating_string
   
    try:
        film_rating_count_tag = film_soup.find('span', itemprop='ratingCount')
        try:
            film_rating_count = float( film_rating_count_tag.text.replace(',', '')) #find_next_sibling().text
        except:
            film_rating_count = None 
    except:
            film_rating_count = None 
    film_data['rating_count'] = film_rating_count
    
    try:
        film_duration_string = film_soup.find('time').text
        dur_list = film_duration_string.replace('min','').rstrip().split('h ')
        film_data['duration'] = (int(dur_list[-1]) + 60*int(dur_list[0])) if (len(dur_list)>1) else int(dur_list[-1])
    except:
        film_data['duration'] = None
      
    try:
        film_date_string = film_soup.find('a', title='see more release dates')
    except:
        film_date_string = ''  
    
    try:
        film_release_date_tag = film_soup.find('h4', text=re.compile('Release Date:'))
        try:
            film_release_date = film_release_date_tag.next_sibling.replace('\n', '')
        except:
            film_release_date = '' 
    except:
            film_release_date = '' 
    film_data['release_date'] = film_release_date
    
    
    try:
        film_metacritic_score_tag = film_soup.find('div', class_=re.compile('metacriticScore'))
        try:
            film_metacritic_score = film_metacritic_score_tag.find('span').text
        except:
            film_metacritic_score = None #math.nan  
    except:
        film_metacritic_score = None #math.nan  
    film_data['metacritic_score'] = film_metacritic_score
    try:
        film_details = film_soup.find('div', class_='article', id='titleDetails')
        try:        
            film_gross_string = film_details.find('h4', text=re.compile('Cumulative Worldwide Gross:')).next_sibling
            film_gross = int( re.sub("[^0-9]", "", film_gross_string ))
        except:
             film_gross = None 
    except:
        film_gross = None 
    film_data['worldwide_gross_imdb']  = film_gross
    
    try:
        film_runtime_tag = film_soup.find('h4', text=re.compile('Runtime:'))
        try:
             film_runtime = int(film_runtime_tag.find_next_sibling().text.strip(' min'))
        except:
            film_runtime = None 
    except:
            film_runtime = None
    film_data['runtime'] = film_runtime
    try:
        film_budget_tag = film_soup.find('h4', text=re.compile('Budget'))
        try:
            film_budget = int( re.sub("[^0-9]", "", film_budget_tag.next_sibling ))
        except:
            film_budget = None #math.nan  
    except:
            film_budget = None #math.nan  
    film_data['budget'] = film_budget
      
    return film_data

In [11]:
list_of_films = []
dict_of_all_actors = {}

index_ = 0 # temporary for record keeping
for film in films:
    cast_dict = {}
    if ( not is_tv_series(film)): 
        film_url_suffix = film.find('a').get('href')
        film_url = 'https://imdb.com' + film_url_suffix
        print('\n', film_url)
        film_response = requests.get(film_url)
        if(film_response.status_code < 400):
            film_page = film_response.text
            film_soup = BeautifulSoup(film_page, "lxml")
            film_record = parse_film_data( film_soup )
            film_record['tv_series'] = 0
    else:  # To be customized 
        film_url_suffix = film.find('a').get('href')
        film_url = 'https://imdb.com' + film_url_suffix
        print('\n', film_url)
        film_response = requests.get(film_url)
        if(film_response.status_code < 400):
            film_page = film_response.text
            film_soup = BeautifulSoup(film_page, "lxml")
            film_record = parse_film_data( film_soup )
            film_record['tv_series'] = 1
            
    film_record['documentary'] = is_documentary(film)
    #print(film_record)
    
    crew_url = film_url + 'fullcredits'
   # print(crew_url)
    crew_response = requests.get(crew_url)
    crew_page = crew_response.text
    crew_soup = BeautifulSoup(crew_page, "lxml")
    film_record.update( parse_crew_data(crew_soup) )
    cast_dict = parse_cast_data(film_soup)
    film_record.update( cast_dict )
    dict_of_all_actors.update( cast_dict )
    #dict_of_all_actors = parse_cast_data(film_soup)
   
    print(index_) # temporary for record keeping
    print(film_record)
    
    list_of_films.append(film_record)
    index_+=1
    
film_catalog = pd.DataFrame(list_of_films )


 https://imdb.com/title/tt6914094/
0
{'title': 'Kaleidoscope', 'year': '1967', 'rating': None, 'rating_count': None, 'duration': None, 'release_date': '', 'metacritic_score': None, 'worldwide_gross_imdb': None, 'runtime': None, 'budget': None, 'tv_series': 0, 'documentary': 0, 'cinematographer': 'Arthur Schatz'}

 https://imdb.com/title/tt3455796/
1
{'title': 'Memory of the Camps', 'year': '2014', 'rating': '8.3', 'rating_count': 481.0, 'duration': 70, 'release_date': ' 26 July 2015 (USA)    ', 'metacritic_score': None, 'worldwide_gross_imdb': 4468, 'runtime': 70, 'budget': None, 'tv_series': 1, 'documentary': 1, 'cinematographer': '', 'Jasper Britton': 1, 'Adolf Hitler': 1}

 https://imdb.com/title/tt0074512/
2
{'title': 'Family Plot', 'year': '1976', 'rating': '6.8', 'rating_count': 20794.0, 'duration': None, 'release_date': ' 9 April 1976 (USA)    ', 'metacritic_score': '79', 'worldwide_gross_imdb': None, 'runtime': 120, 'budget': 4490375, 'tv_series': 0, 'documentary': 0, 'cinemat

15
{'title': 'The Wrong Man', 'year': '1956', 'rating': '7.4', 'rating_count': 25655.0, 'duration': 105, 'release_date': ' 26 January 1957 (USA)    ', 'metacritic_score': None, 'worldwide_gross_imdb': None, 'runtime': 105, 'budget': 1200000, 'tv_series': 0, 'documentary': 0, 'cinematographer': 'Robert Burks', 'Henry Fonda': 1, 'Vera Miles': 1, 'Anthony Quayle': 1, 'Harold J. Stone': 1, 'Charles Cooper': 1, 'John Heldabrand': 1, 'Esther Minciotti': 1, 'Doreen Lang': 1, 'Laurinda Barrett': 1, 'Norma Connolly': 1, 'Nehemiah Persoff': 1, "Lola D'Annunzio": 1, 'Kippy Campbell': 1, 'Robert Essen': 1, 'Richard Robbins': 1}

 https://imdb.com/title/tt0049470/
16
{'title': 'The Man Who Knew Too Much', 'year': '1956', 'rating': '7.4', 'rating_count': 58462.0, 'duration': None, 'release_date': ' 1 June 1956 (USA)    ', 'metacritic_score': '78', 'worldwide_gross_imdb': 8190, 'runtime': 120, 'budget': 2500000, 'tv_series': 0, 'documentary': 0, 'cinematographer': 'Robert Burks', 'James Stewart': 1, 

29
{'title': 'Watchtower Over Tomorrow', 'year': '1945', 'rating': '6.5', 'rating_count': 213.0, 'duration': 15, 'release_date': ' 29 March 1945 (USA)    ', 'metacritic_score': None, 'worldwide_gross_imdb': None, 'runtime': 15, 'budget': None, 'tv_series': 0, 'documentary': 1, 'cinematographer': 'Lester White', 'John Nesbitt': 1, 'Edward R. Stettinius Jr.': 1}

 https://imdb.com/title/tt1375299/
30
{'title': 'The Fighting Generation', 'year': '1944', 'rating': '5.9', 'rating_count': 213.0, 'duration': 2, 'release_date': ' October 1944 (USA)    ', 'metacritic_score': None, 'worldwide_gross_imdb': None, 'runtime': 2, 'budget': None, 'tv_series': 0, 'documentary': 0, 'cinematographer': 'Gregg Toland', 'Jennifer Jones': 1, 'Rhonda Fleming': 1, 'Steve Dunhill': 1, 'Tony Devlin': 1}

 https://imdb.com/title/tt0036621/
31
{'title': 'Aventure malgache', 'year': '1944', 'rating': '5.5', 'rating_count': 1448.0, 'duration': 32, 'release_date': ' June 1944 (Portugal)    ', 'metacritic_score': None

44
{'title': 'Sabotage', 'year': '1936', 'rating': '7.0', 'rating_count': 14941.0, 'duration': 76, 'release_date': ' 11 January 1937 (USA)    ', 'metacritic_score': '85', 'worldwide_gross_imdb': None, 'runtime': 76, 'budget': None, 'tv_series': 0, 'documentary': 0, 'cinematographer': 'Bernard Knowles', 'Sylvia Sidney': 1, 'Oskar Homolka': 1, 'Desmond Tester': 1, 'John Loder': 1, 'Joyce Barbour': 1, 'Matthew Boulton': 1, 'S.J. Warmington': 1, 'William Dewhurst': 1}

 https://imdb.com/title/tt0028231/
45
{'title': 'Secret Agent', 'year': '1936', 'rating': '6.5', 'rating_count': 7767.0, 'duration': 86, 'release_date': ' 15 June 1936 (USA)    ', 'metacritic_score': None, 'worldwide_gross_imdb': None, 'runtime': 86, 'budget': None, 'tv_series': 0, 'documentary': 0, 'cinematographer': 'Bernard Knowles', 'Madeleine Carroll': 1, 'Peter Lorre': 1, 'John Gielgud': 1, 'Robert Young': 1, 'Percy Marmont': 1, 'Florence Kahn': 1, 'Charles Carson': 1, 'Lilli Palmer': 1}

 https://imdb.com/title/tt0026

60
{'title': 'The Manxman', 'year': '1929', 'rating': '6.2', 'rating_count': 2603.0, 'duration': 110, 'release_date': ' 16 December 1929 (USA)    ', 'metacritic_score': None, 'worldwide_gross_imdb': None, 'runtime': 110, 'budget': None, 'tv_series': 0, 'documentary': 0, 'cinematographer': 'Jack E. Cox', 'Carl Brisson': 1, 'Malcolm Keen': 1, 'Anny Ondra': 1, 'Randle Ayrton': 1, 'Clare Greet': 1}

 https://imdb.com/title/tt0018756/
61
{'title': 'Champagne', 'year': '1928', 'rating': '5.6', 'rating_count': 2025.0, 'duration': 86, 'release_date': ' 20 August 1928 (UK)    ', 'metacritic_score': None, 'worldwide_gross_imdb': None, 'runtime': 86, 'budget': None, 'tv_series': 0, 'documentary': 0, 'cinematographer': 'Jack E. Cox', 'Betty Balfour': 1, 'Jean Bradin': 1, 'Ferdinand von Alten': 1, 'Gordon Harker': 1}

 https://imdb.com/title/tt0018876/
62
{'title': "The Farmer's Wife", 'year': '1928', 'rating': '5.9', 'rating_count': 2496.0, 'duration': 129, 'release_date': ' 4 January 1930 (USA)  

In [12]:
film_catalog['year'] = pd.to_numeric(film_catalog['year'])
film_catalog['rating'] = pd.to_numeric(film_catalog['rating'])
film_catalog['worldwide_gross_imdb'] = pd.to_numeric(film_catalog['worldwide_gross_imdb'])
film_catalog['budget'] = pd.to_numeric(film_catalog['budget'])
film_catalog['metacritic_score'] = pd.to_numeric(film_catalog['metacritic_score'])

In [13]:
len(film_catalog.columns)

646

In [14]:
len(dict_of_all_actors)

633

In [15]:
len(film_catalog.columns) - len(dict_of_all_actors)

13

In [16]:
first_cast_column = len(film_catalog.columns) - len(dict_of_all_actors)
first_cast_column

13

In [17]:
main_column_names = list(film_catalog.columns[:first_cast_column])
main_column_names 

['title',
 'year',
 'rating',
 'rating_count',
 'duration',
 'release_date',
 'metacritic_score',
 'worldwide_gross_imdb',
 'runtime',
 'budget',
 'tv_series',
 'documentary',
 'cinematographer']

In [18]:
film_catalog.iloc[:,first_cast_column:] # temporary, to check work

Unnamed: 0,Jasper Britton,Adolf Hitler,Karen Black,Bruce Dern,Barbara Harris,William Devane,Ed Lauter,Cathleen Nesbitt,Katherine Helmond,Warren J. Kemmerling,...,Ferdinand Martini,Florence Helminger,Georg H. Schnell,Karl Falkenberg,Seymour Hicks,Stanley Logan,Gertrude McCoy,Ellaline Terriss,Ian Wilson,Ernest Thesiger
0,,,,,,,,,,,...,,,,,,,,,,
1,1.0,1.0,,,,,,,,,...,,,,,,,,,,
2,,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,,,,,,,,,,,...,,,,,,,,,,
67,,,,,,,,,,,...,,,,,,,,,,
68,,,,,,,,,,,...,1.0,1.0,1.0,1.0,,,,,,
69,,,,,,,,,,,...,,,,,1.0,1.0,1.0,1.0,1.0,


In [19]:
import copy
df_actors = copy.deepcopy(film_catalog.iloc[:, first_cast_column:])
df_actors

Unnamed: 0,Jasper Britton,Adolf Hitler,Karen Black,Bruce Dern,Barbara Harris,William Devane,Ed Lauter,Cathleen Nesbitt,Katherine Helmond,Warren J. Kemmerling,...,Ferdinand Martini,Florence Helminger,Georg H. Schnell,Karl Falkenberg,Seymour Hicks,Stanley Logan,Gertrude McCoy,Ellaline Terriss,Ian Wilson,Ernest Thesiger
0,,,,,,,,,,,...,,,,,,,,,,
1,1.0,1.0,,,,,,,,,...,,,,,,,,,,
2,,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,,,,,,,,,,,...,,,,,,,,,,
67,,,,,,,,,,,...,,,,,,,,,,
68,,,,,,,,,,,...,1.0,1.0,1.0,1.0,,,,,,
69,,,,,,,,,,,...,,,,,1.0,1.0,1.0,1.0,1.0,


In [20]:
df_actors.sum().sort_values(ascending = False)

Leo G. Carroll      6.0
John Longden        5.0
Edmund Gwenn        4.0
Gordon Harker       4.0
James Stewart       4.0
                   ... 
Raymond Burr        1.0
Judith Evelyn       1.0
Ross Bagdasarian    1.0
Georgine Darcy      1.0
Ernest Thesiger     1.0
Length: 633, dtype: float64

In [21]:
df_actors_reordered = df_actors[df_actors.sum().sort_values(ascending = False).index]
df_actors_reordered

Unnamed: 0,Leo G. Carroll,John Longden,Edmund Gwenn,Gordon Harker,James Stewart,Hannah Jones,Donald Calthrop,Cary Grant,Edward Chapman,Basil Radford,...,Brigitte Auber,Georgette Anys,Sara Berner,Wendell Corey,Thelma Ritter,Raymond Burr,Judith Evelyn,Ross Bagdasarian,Georgine Darcy,Ernest Thesiger
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,,,,,,,,,,,...,,,,,,,,,,
67,,,,,,,,,,,...,,,,,,,,,,
68,,,,,,,,,,,...,,,,,,,,,,
69,,,,,,,,,,,...,,,,,,,,,,


In [22]:
df_trunk = copy.deepcopy(film_catalog .iloc[:, :first_cast_column])
df_trunk

Unnamed: 0,title,year,rating,rating_count,duration,release_date,metacritic_score,worldwide_gross_imdb,runtime,budget,tv_series,documentary,cinematographer
0,Kaleidoscope,1967.0,,,,,,,,,0,0,Arthur Schatz
1,Memory of the Camps,2014.0,8.3,481.0,70.0,26 July 2015 (USA),,4468.0,70.0,,1,1,
2,Family Plot,1976.0,6.8,20794.0,,9 April 1976 (USA),79.0,,120.0,4490375.0,0,0,Leonard J. South
3,Frenzy,1972.0,7.4,41424.0,116.0,21 June 1972 (USA),92.0,4121.0,116.0,2000000.0,0,0,Gilbert Taylor
4,Topaz,1969.0,6.3,16714.0,143.0,19 December 1969 (USA),,,143.0,4000000.0,0,0,Jack Hildyard
...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,The Lodger: A Story of the London Fog,1927.0,7.3,10391.0,92.0,14 February 1927 (UK),,83260.0,92.0,12000.0,0,0,Gaetano di Ventimiglia
67,The Mountain Eagle,1926.0,,,57.0,1 November 1926 (USA),,,,,0,0,Gaetano di Ventimiglia
68,The Pleasure Garden,1925.0,6.0,2170.0,75.0,1926 (USA),,,75.0,,0,0,Gaetano di Ventimiglia
69,Always Tell Your Wife,1923.0,7.7,115.0,20.0,February 1923 (UK),,,20.0,,0,0,


In [23]:
film_catalog_cast_columns_reordered = pd.concat( [df_trunk, df_actors_reordered], axis=1)
film_catalog_cast_columns_reordered

Unnamed: 0,title,year,rating,rating_count,duration,release_date,metacritic_score,worldwide_gross_imdb,runtime,budget,...,Brigitte Auber,Georgette Anys,Sara Berner,Wendell Corey,Thelma Ritter,Raymond Burr,Judith Evelyn,Ross Bagdasarian,Georgine Darcy,Ernest Thesiger
0,Kaleidoscope,1967.0,,,,,,,,,...,,,,,,,,,,
1,Memory of the Camps,2014.0,8.3,481.0,70.0,26 July 2015 (USA),,4468.0,70.0,,...,,,,,,,,,,
2,Family Plot,1976.0,6.8,20794.0,,9 April 1976 (USA),79.0,,120.0,4490375.0,...,,,,,,,,,,
3,Frenzy,1972.0,7.4,41424.0,116.0,21 June 1972 (USA),92.0,4121.0,116.0,2000000.0,...,,,,,,,,,,
4,Topaz,1969.0,6.3,16714.0,143.0,19 December 1969 (USA),,,143.0,4000000.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,The Lodger: A Story of the London Fog,1927.0,7.3,10391.0,92.0,14 February 1927 (UK),,83260.0,92.0,12000.0,...,,,,,,,,,,
67,The Mountain Eagle,1926.0,,,57.0,1 November 1926 (USA),,,,,...,,,,,,,,,,
68,The Pleasure Garden,1925.0,6.0,2170.0,75.0,1926 (USA),,,75.0,,...,,,,,,,,,,
69,Always Tell Your Wife,1923.0,7.7,115.0,20.0,February 1923 (UK),,,20.0,,...,,,,,,,,,,


In [24]:
film_catalog_cast_columns_reordered.iloc[:, :15].head(50)

Unnamed: 0,title,year,rating,rating_count,duration,release_date,metacritic_score,worldwide_gross_imdb,runtime,budget,tv_series,documentary,cinematographer,Leo G. Carroll,John Longden
0,Kaleidoscope,1967.0,,,,,,,,,0,0,Arthur Schatz,,
1,Memory of the Camps,2014.0,8.3,481.0,70.0,26 July 2015 (USA),,4468.0,70.0,,1,1,,,
2,Family Plot,1976.0,6.8,20794.0,,9 April 1976 (USA),79.0,,120.0,4490375.0,0,0,Leonard J. South,,
3,Frenzy,1972.0,7.4,41424.0,116.0,21 June 1972 (USA),92.0,4121.0,116.0,2000000.0,0,0,Gilbert Taylor,,
4,Topaz,1969.0,6.3,16714.0,143.0,19 December 1969 (USA),,,143.0,4000000.0,0,0,Jack Hildyard,,
5,Torn Curtain,1966.0,6.7,24727.0,128.0,27 July 1966 (USA),55.0,,128.0,6000000.0,0,0,John F. Warren,,
6,Marnie,1964.0,7.2,45220.0,130.0,17 July 1964 (USA),73.0,7095.0,130.0,3000000.0,0,0,Robert Burks,,
7,The Birds,1963.0,7.7,171549.0,119.0,29 March 1963 (USA),90.0,32655.0,119.0,2500000.0,0,0,Robert Burks,,
8,The Alfred Hitchcock Hour,,8.5,3877.0,50.0,20 September 1962 (USA),,,50.0,,1,0,John F. Warren,,
9,Alfred Hitchcock Presents,,8.5,14767.0,25.0,2 October 1955 (USA),,,25.0,,1,0,John L. Russell,,


In [25]:
def get_box_office( the_box_office_url):
    response = requests.get(the_box_office_url)
    print( response.status_code )
    box_office_page = response.text
    box_office_soup = BeautifulSoup(box_office_page, "lxml")
    
    box_office_chart = box_office_soup.find('div', id='technical').find('div', id='page_filling_chart')
    box_office_chart_body = box_office_chart.find('table', id='all_technical_credits').find('tbody')
    box_office_chart_body
    box_office_rows = box_office_chart_body.find_all('tr')

    flix_box_office = []
    for row in box_office_rows:
        try:
            title_data = row.find('td').find_next_sibling('td')
            title=(title_data.find('a').text).strip()
        except:
            title=''
        try:
            worldwide_box_office = title_data.find_next_sibling('td').find_next_sibling().find_next_sibling().find_next_sibling().text   
            worldwide_box_office = int( re.sub("[^0-9]", "", worldwide_box_office ))
            flix_box_office.append({'title': title, 'worldwide_gross_bo': float(worldwide_box_office)})
        except:
            flix_box_office.append({'title': title, 'worldwide_gross_bo':''})
    return(pd.DataFrame(flix_box_office))

In [26]:
box_office_df = get_box_office(dirs_box_office[director])

200


In [27]:
film_catalog_cast_columns_reordered.iloc[:, 10:30].head(50)

Unnamed: 0,tv_series,documentary,cinematographer,Leo G. Carroll,John Longden,Edmund Gwenn,Gordon Harker,James Stewart,Hannah Jones,Donald Calthrop,Cary Grant,Edward Chapman,Basil Radford,George Curzon,Leslie Banks,Anny Ondra,Alfred Hitchcock,Grace Kelly,Malcolm Keen,Ian Hunter
0,0,0,Arthur Schatz,,,,,,,,,,,,,,,,,
1,1,1,,,,,,,,,,,,,,,,,,
2,0,0,Leonard J. South,,,,,,,,,,,,,,,,,
3,0,0,Gilbert Taylor,,,,,,,,,,,,,,,,,
4,0,0,Jack Hildyard,,,,,,,,,,,,,,,,,
5,0,0,John F. Warren,,,,,,,,,,,,,,,,,
6,0,0,Robert Burks,,,,,,,,,,,,,,,,,
7,0,0,Robert Burks,,,,,,,,,,,,,,,,,
8,1,0,John F. Warren,,,,,,,,,,,,,,1.0,,,
9,1,0,John L. Russell,,,,,,,,,,,,,,1.0,,,


In [28]:
first_cast_column = len(film_catalog.columns) - len(dict_of_all_actors)
first_cast_column

13

In [29]:
#director='hitchcock'
dir_str = director + '/'
pickled_film_catalog_filename = 'film_catalog_' + director +'.pickle'
pickled_box_office_filename = 'box_office_' + director +'.pickle'
pickled_fc_columns_filename =  'fc_cols_' + director +'.pickle'
pickled_dict_of_actors_filename =  'dict_of_actors_' + director +'.pickle'

In [30]:
#pickled_film_catalog_filename = 'film_catalog_' + director +'.pickle'
with open(dir_str + pickled_film_catalog_filename, 'wb') as to_write:
    pickle.dump(film_catalog_cast_columns_reordered, to_write)

In [31]:

with open(dir_str + pickled_box_office_filename, 'wb') as to_write:
    pickle.dump(box_office_df, to_write)

In [32]:
# film_catalog.columns
with open(dir_str + pickled_fc_columns_filename, 'wb') as to_write:
    pickle.dump(film_catalog.columns, to_write)

In [33]:
# dict_of_all_actors
with open(dir_str + pickled_dict_of_actors_filename, 'wb') as to_write:
    pickle.dump(dict_of_all_actors, to_write)