In [1]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd

In [2]:
import os, re, sys, pickle, requests, resource, json
import time
from random import random
from IPython.core.display import clear_output

### Webscraping Box Office Mojo

Since I encountered some difficulties in obtaining director information for all of the film entries on IMDB, in particular for films which had more than one director, I attempted to scrape this information from [Box Office Mojo](https://www.boxofficemojo.com/), which had a more approachable tabular format. I used the `imdb_id` field sliced from the `title_link`  from the cleaned IMDB dataframe to do this.

I also attempted to scrape other cast and crew data, but ultimately abandoned this portion due to time constraints.

In [70]:
imdb_lookup_df3 = pd.read_json("imdb_lookup_df3.json", lines=True)

In [71]:
imdb_lookup_df3.head()

Unnamed: 0,title,imdb_id,title_link,director_name,director_id,director_link,title_cc
0,Beneath,tt2325518,/title/tt2325518/?ref_=adv_li_tt,Larry Fessenden,nm0275244,/name/nm0275244/?ref_=adv_li_dr_0,beneath
1,The Million Dollar Duck,tt0066728,/title/tt0066728/?ref_=adv_li_tt,Vincent McEveety,nm0568546,/name/nm0568546/?ref_=adv_li_dr_0,the_million_dollar_duck
2,Cities of Last Things,tt4397342,/title/tt4397342/?ref_=adv_li_tt,Wi Ding Ho,nm0387399,/name/nm0387399/?ref_=adv_li_dr_0,cities_of_last_things
3,Lazer Team,tt3864024,/title/tt3864024/?ref_=adv_li_tt,Matt Hullum,nm0401502,/name/nm0401502/?ref_=adv_li_dr_0,lazer_team
4,The Powerpuff Girls Movie,tt0289408,/title/tt0289408/?ref_=adv_li_tt,Craig McCracken,nm0566833,/name/nm0566833/?ref_=adv_li_dr_0,the_powerpuff_girls_movie


In [72]:
id_list = imdb_lookup_df3['imdb_id']

In [73]:
def get_movie_value(soup, field_name):
    obj = soup.find(text = re.compile(field_name))
    
    if not obj:
        return None
    
    next_element = obj.findNext()
    
    if next_element:
        return next_element.text
    else:
        
        return None

In [74]:
mojo_data_list = []

def get_movie_dict(id_list):

    mojo_url = "https://www.boxofficemojo.com/title/{}/?ref_=bo_se_r_1"

# preparing to monitor
    start_time = time.time()
    tries =0

    for movie_id in id_list:
    
        response = requests.get(mojo_url.format(movie_id))
        page = response.text
        
        time.sleep(0.5+2*random())
        tries +=1
        elapsed_time = time.time() - start_time
        print('Request:{}; Frequency: {} requests/s'.format(tries, tries/elapsed_time))
        clear_output(wait = True)
        
        if tries > 2200:
            return "Number of requests exceeded"
            break

        mojo_soup = BeautifulSoup(page, "lxml")

        headers = ["id","title_string","dom_gross", "release_date"]

        title_string = mojo_soup.find('title').text
        
        try:
            domestic_gross = (mojo_soup.find(
                class_='mojo-performance-summary-table')\
                              .find_all('span', class_='money')[0].text)
        except:
            domestic_gross = np.nan
                         
        release_date = get_movie_value(mojo_soup, 'Release Date')
        
        mojo_dict = dict(zip(headers, [movie_id, title_string,
                                       domestic_gross,
                                       release_date]))
        mojo_data_list.append(mojo_dict)

In [50]:
crew_id_dict ={}
cast_id_dict ={}

def get_movie_cast_n_crew(id_list):

    
    mojo_crew_url = "https://www.boxofficemojo.com/title/{}/credits/?ref_=bo_tt_tab#tabs"

# preparing to monitor
    start_time = time.time()
    tries =0

    for movie_id in id_list:
    
        response = requests.get(mojo_crew_url.format(movie_id))
        page = response.text
        
        time.sleep(0.5+2*random())
        tries +=1
        elapsed_time = time.time() - start_time
        print('Request:{}; Frequency: {} requests/s'.format(tries, tries/elapsed_time))
        clear_output(wait = True)
        
        if tries > 2200:
            return "Number of requests exceeded"
            break

        mojo_crew_soup = BeautifulSoup(page, "lxml")
        
        crew_table = mojo_crew_soup.find('table', attrs={'id':'principalCrew'})
        
        crew_rows = [row for row in crew_table.find_all('tr')]
        
     
        crew_list = [movie_id]
        crew_header = ['movie_id']
                
        for row in crew_rows[1:3]:
            crew = row.find_all('td')
            crew_name = crew[0].find('a').text
            crew_list.append(crew_name)
            crew_role = crew[-1].text
            crew_header.append(crew_role)
        
        while crew_list < 4:
            crew_list.append(np.nan)
            
        crew_dict = dict(zip(crew_header, crew_list))

            
            
    mojo_cast_url = "https://www.boxofficemojo.com/title/{}/credits/?ref_=bo_tt_tab#tabs"

    for id in id_list:
        response = requests.get(mojo_cast_url.format(id))
        page = response.text

        mojo_cast_soup = BeautifulSoup(page, "lxml")

        cast_table = mojo_cast_soup.find('table', attrs={'id':'principalCast'})
        
        cast_rows = [row for row in cast_table.find_all('tr')]
        
        crew_list = [movie_id]
        crew_header = ['movie_id', 'actor_01', 'actor_02', 'actor_03']
        
        for row in crew_rows[1:3]:
            crew = row.find_all('td')
            crew_name = crew[0].find('a').text
            crew_list.append(crew_name)
        
        while crew_list < 4:
            crew_list.append(np.nan)
            
               imdb_dict = dict(zip(headers, [title, title_link, mpaa_rating, year, runtime, imdb_rating, metascore,
                              imdb_votes, director_name, director_link, imdb_genres]))

            
        cast_id_dict[id] = cast_dict 

In [75]:
get_movie_dict(id_list)

Request:83; Frequency: 0.49158886154006853 requests/s


In [76]:
len(mojo_data_list)

83

In [45]:
mojo_crew_url = "https://www.boxofficemojo.com/title/{}/credits/?ref_=bo_tt_tab#tabs"
    
# preparing to monitor
start_time = time.time()
tries =0

for movie_id in id_list:

    response = requests.get(mojo_crew_url.format(movie_id))
    page = response.text

    time.sleep(0.5+2*random())
    tries +=1
    elapsed_time = time.time() - start_time
    print('Request:{}; Frequency: {} requests/s'.format(tries, tries/elapsed_time))
    clear_output(wait = True)

    if tries > 2200:
        print("Number of requests exceeded")
        break

    mojo_crew_soup = BeautifulSoup(page, "lxml")

    crew_table = mojo_crew_soup.find('table', attrs={'id':'principalCrew'})

    crew_rows = [row for row in crew_table.find_all('tr')]
    print(crew_rows)

    crew_dict = {}

    for row in crew_rows[1:3]:
        crew = row.find_all('td')
        crew_link = crew[0].find('a')

        crew_name, crew_url = crew_link.text, crew_link['href']
        crew_role = crew[-1].text
        crew_dict[crew_name] = {crew_role, crew_url}

    crew_id_dict[movie_id] = crew_dict 

[<tr><th class="a-span6">Crew Member</th><th class="a-span6">Role</th></tr>, <tr><td><a class="a-link-normal" href="https://pro.imdb.com/name/nm0009190/?ref_=mojo_tt_cw_1&amp;rf=mojo_tt_cw_1" rel="noopener" target="_blank">J.J. Abrams<svg class="mojo-new-window-svg" viewbox="0 0 32 32" xmlns="http://www.w3.org/2000/svg">
<path d="M24,15.57251l3,3V23.5A3.50424,3.50424,0,0,1,23.5,27H8.5A3.50424,3.50424,0,0,1,5,23.5V8.5A3.50424,3.50424,0,0,1,8.5,5h4.92755l3,3H8.5a.50641.50641,0,0,0-.5.5v15a.50641.50641,0,0,0,.5.5h15a.50641.50641,0,0,0,.5-.5ZM19.81952,8.56372,12.8844,17.75a.49989.49989,0,0,0,.04547.65479l.66534.66528a.49983.49983,0,0,0,.65479.04553l9.18628-6.93518,2.12579,2.12585a.5.5,0,0,0,.84741-.27526l1.48273-9.35108a.50006.50006,0,0,0-.57214-.57214L17.969,5.59058a.5.5,0,0,0-.27526.84741Z"></path>
</svg></a></td><td>Director</td></tr>, <tr><td><a class="a-link-normal" href="https://pro.imdb.com/name/nm0006516/?ref_=mojo_tt_cw_2&amp;rf=mojo_tt_cw_2" rel="noopener" target="_blank">Chris T

In [46]:
crew_id_dict

{'tt6723592': {'Christopher Nolan\n\n': {'Writer',
   'https://pro.imdb.com/name/nm0634240/?ref_=mojo_tt_cw_2&rf=mojo_tt_cw_2'}},
 'tt10539608': {'George Clooney\n\n': {'Director',
   'https://pro.imdb.com/name/nm0000123/?ref_=mojo_tt_cw_1&rf=mojo_tt_cw_1'},
  'Lily Brooks-Dalton\n\n': {'Writer',
   'https://pro.imdb.com/name/nm10791512/?ref_=mojo_tt_cw_2&rf=mojo_tt_cw_2'}},
 'tt0451279': {'Patty Jenkins\n\n': {'Director',
   'https://pro.imdb.com/name/nm0420941/?ref_=mojo_tt_cw_1&rf=mojo_tt_cw_1'},
  'Allan Heinberg\n\n': {'Writer',
   'https://pro.imdb.com/name/nm0374302/?ref_=mojo_tt_cw_2&rf=mojo_tt_cw_2'}},
 'tt4154796': {'Anthony Russo\n\n': {'Director',
   'https://pro.imdb.com/name/nm0751577/?ref_=mojo_tt_cw_1&rf=mojo_tt_cw_1'},
  'Joe Russo\n\n': {'Director',
   'https://pro.imdb.com/name/nm0751648/?ref_=mojo_tt_cw_2&rf=mojo_tt_cw_2'}},
 'tt10451914': {'Mikael Håfström\n\n': {'Director',
   'https://pro.imdb.com/name/nm0405632/?ref_=mojo_tt_cw_1&rf=mojo_tt_cw_1'},
  'Rob Yescom

In [77]:
mojo_df2 = pd.DataFrame(mojo_data_list)

In [78]:
mojo_df2.tail()

Unnamed: 0,id,title_string,dom_gross,release_date
78,tt0270688,Teknolust - Box Office Mojo,"$28,811","August 22, 2003\n (Domestic)"
79,tt6170484,Monster Hunt 2 - Box Office Mojo,"$706,153","February 15, 2018\n (APAC)"
80,tt2320924,War of the Worlds: Goliath - Box Office Mojo,"$13,385","March 7, 2014\n (Domestic)"
81,tt1867091,The Reconstruction of William Zero - Box Offic...,,
82,tt0312358,Anatomy 2 - Box Office Mojo,$623,"February 6, 2003\n (Germany)"


In [68]:
import json

In [79]:
mojo_df2.to_json('mojo_data2.json', orient='records', lines=True)

In [81]:
len(mojo_df)

2000