In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import os, re, sys, pickle, requests, resource, json
import time
from random import random
from IPython.core.display import clear_output

### Webscraping IMDB

I started by scraping data for the top 2000 science fiction films from the [Internet Movie Database](https://www.imdb.com/search/title/?title_type=movie&genres=sci-fi&start=51&explore=title_type,genres&ref_=adv_nxt) website, sorted by popularity per IMDB's audience rankings. The search results were organized as 40 pages of 50 titles each.

I formatted each search URL and then scraped each results page of 50 titles for title, title link, year, runtime, MPAA rating, IMDB rating, number of IMDB votes, Metascore rating, and three genre crossover keyword tags. I also attempted to scrape director information, but this returned many missing (NaN) values, so I turned to BoxOfficeMojo for that information instead.

Initially, I scraped only the top 2000 titles on their popularity list.  However, in order to include more titles that had Metascore, Rotten Tomatoes, and budget data, I scraped an additional 2000 titles, which yielded 83 titles matching the critical reception score requirement.  For this reason, there are two IMDB dataframes that are exported to JSON at the end of the notebook.

In [2]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

# Path to the chromedriver executable. The default matches the original
# hard-coded location; set the CHROMEDRIVER_PATH environment variable to point
# at the binary on machines where it lives elsewhere.
chromedriver = os.environ.get('CHROMEDRIVER_PATH', "/Applications/chromedriver")
os.environ['webdriver.chrome.driver'] = chromedriver

In [10]:
# Scrape a block of IMDB sci-fi search results (50 titles per page) into
# `imdb_data_list`, one dict per title keyed by `headers`.

# Column names for each record; the order must match the value list that is
# zipped against it at the bottom of the per-title loop.
headers = ['title', 'title_link', 'mpaa_rating', 'year', 'runtime',
           'imdb_rating', 'metascore', 'votes', 'director_name',
           'director_link', 'imdb_genre_list']

imdb_data_list = []

start_time = time.time()
tries = 0
max_tries = 2000  # safety cap on page requests; far above the 40 pages requested below

search_url = "https://www.imdb.com/search/title/?title_type=movie&genres=sci-fi&start={}&explore=title_type,genres"

# `start` offsets of the first title on each results page: 2001, 2051, ..., 3951
pages = [str(i) for i in range(2001,4000,50)]

# Compiled once and reused for every title.
# NOTE(review): this pattern does not match a "Directors:" label, which may
# explain some of the missing director values -- confirm against the page markup.
director_label = re.compile("Director:")

# What a failed lookup raises: attribute access on a missing tag (None) gives
# AttributeError, subscripting None gives TypeError, and a tag lacking the
# requested attribute gives KeyError. Anything else (e.g. KeyboardInterrupt)
# is deliberately allowed to propagate.
missing_field_errors = (AttributeError, TypeError, KeyError)


def _scrape_field(extract, default="NaN"):
    """Return ``extract()``, or ``default`` when the tag/attribute it needs is absent.

    Parameters
    ----------
    extract : callable
        Zero-argument callable performing a single BeautifulSoup lookup.
    default : object
        Value returned when the lookup raises one of ``missing_field_errors``.
    """
    try:
        return extract()
    except missing_field_errors:
        return default


# One browser session is reused for every page and always closed afterwards,
# instead of launching a new Chrome window per page and never quitting it.
driver = webdriver.Chrome(chromedriver)
try:
    # for every 50 titles
    for page in pages:
        html_url = search_url.format(page)
        # driver.get() returns None; the HTML is read from driver.page_source below
        driver.get(html_url)

        # pause the loop
        time.sleep(0.5+2*random())

        # monitor the requests
        tries += 1
        elapsed_time = time.time() - start_time
        print('Request:{}; Frequency: {} requests/s'.format(tries, tries/elapsed_time))
        clear_output(wait = True)

        # break if # of requests too large
        if tries > max_tries:
            print('Number of requests greater than expected.')
            break

        # parse content
        imdb_soup = BeautifulSoup(driver.page_source, 'html.parser')

        # select all 50 lister_items from a page
        lister_items = imdb_soup.find_all('div', class_ = 'lister-item mode-advanced')

        # for each movie of the 50:
        for lister_item in lister_items:

            # A title is mandatory: skip any item that does not have one.
            try:
                title = lister_item.h3.a.text
            except missing_field_errors:
                continue

            # Every other field is optional and falls back to the string "NaN".
            title_link = _scrape_field(lambda: lister_item.h3.a.get('href'))
            year = _scrape_field(
                lambda: lister_item.h3.find('span', class_ = 'lister-item-year').text)
            runtime = _scrape_field(lambda: lister_item.find('span', class_='runtime').text)
            imdb_rating = _scrape_field(lambda: lister_item.strong.text)
            mpaa_rating = _scrape_field(
                lambda: lister_item.find('span', class_='certificate').text)
            metascore = _scrape_field(
                lambda: lister_item.find('span', class_ = 'metascore').text)
            imdb_votes = _scrape_field(
                lambda: lister_item.find('span', attrs = {'name':'nv'})['data-value'])

            # Locate the element following the "Director:" label once, then read
            # both the name and the link from it. A missing label leaves
            # director_tag as None, so both lookups fall back to "NaN".
            director_tag = _scrape_field(
                lambda: lister_item.find(string=director_label).next_sibling,
                default=None)
            director_name = _scrape_field(lambda: director_tag.text)
            director_link = _scrape_field(lambda: director_tag.get('href'))

            imdb_genres = _scrape_field(lambda: lister_item.find('span', class_='genre').text)

            imdb_dict = dict(zip(headers, [title, title_link, mpaa_rating, year, runtime,
                                           imdb_rating, metascore, imdb_votes,
                                           director_name, director_link, imdb_genres]))

            imdb_data_list.append(imdb_dict)
finally:
    # Close the browser even if the run is interrupted or a page load fails.
    driver.quit()

        
    


Request:40; Frequency: 0.09840765928396693 requests/s


In [11]:
# Sanity check: 40 result pages of 50 titles each should yield 2000 records.
len(imdb_data_list)

2000

In [3]:
# `imdb_df` is otherwise only created by the read_json cell that follows this
# one in the notebook, so on a fresh kernel (Restart & Run All) this preview
# raised a NameError. Load the frame here so the cell is self-contained.
imdb_df = pd.read_json('imdb_data.json', lines=True)
imdb_df.head()

Unnamed: 0,title,title_link,mpaa_rating,year,runtime,imdb_rating,metascore,votes,director_name,director_link,imdb_genre_list
0,Tenet,/title/tt6723592/?ref_=adv_li_tt,PG-13,(2020),,7.5,69.0,266993.0,Christopher Nolan,/name/nm0634240/?ref_=adv_li_dr_0,"\nAction, Sci-Fi, Thriller"
1,The Midnight Sky,/title/tt10539608/?ref_=adv_li_tt,PG-13,(2020),,5.6,58.0,57348.0,George Clooney,/name/nm0000123/?ref_=adv_li_dr_0,"\nDrama, Fantasy, Sci-Fi"
2,Wonder Woman,/title/tt0451279/?ref_=adv_li_tt,PG-13,(2017),,7.4,76.0,563463.0,Patty Jenkins,/name/nm0420941/?ref_=adv_li_dr_0,"\nAction, Adventure, Fantasy"
3,Avengers: Endgame,/title/tt4154796/?ref_=adv_li_tt,PG-13,(2019),,8.4,78.0,801485.0,,,"\nAction, Adventure, Drama"
4,Outside the Wire,/title/tt10451914/?ref_=adv_li_tt,R,(2021),,5.4,47.0,6075.0,Mikael Håfström,/name/nm0405632/?ref_=adv_li_dr_0,"\nAction, Adventure, Fantasy"


In [2]:
# Read the line-delimited JSON file (one record per line) into a DataFrame.
# Presumably this is the export of the first 2000-title scrape -- confirm.
imdb_df = pd.read_json('imdb_data.json', lines=True)

In [12]:
# Turn the list of per-title dicts from the scrape into a DataFrame
# and preview its first five rows.
imdb_df2 = pd.DataFrame(data=imdb_data_list)
imdb_df2.head(n=5)

Unnamed: 0,title,title_link,mpaa_rating,year,runtime,imdb_rating,metascore,votes,director_name,director_link,imdb_genre_list
0,Trancers II,/title/tt0103116/?ref_=adv_li_tt,R,(1991),88 min,5.4,,1847,Charles Band,/name/nm0023929/?ref_=adv_li_dr_0,"\nAction, Horror, Sci-Fi"
1,Drive,/title/tt0116147/?ref_=adv_li_tt,R,(1997),100 min,6.8,,5451,Steve Wang,/name/nm0911036/?ref_=adv_li_dr_0,"\nAction, Adventure, Sci-Fi"
2,Timerider: The Adventure of Lyle Swann,/title/tt0086443/?ref_=adv_li_tt,PG,(1982),94 min,5.4,,2005,William Dear,/name/nm0213100/?ref_=adv_li_dr_0,"\nAction, Adventure, Sci-Fi"
3,Thirst,/title/tt3955808/?ref_=adv_li_tt,,(III) (2015),87 min,4.4,,1045,Greg Kiefer,/name/nm2279788/?ref_=adv_li_dr_0,"\nAction, Adventure, Horror"
4,I Married a Strange Person!,/title/tt0119346/?ref_=adv_li_tt,R,(1997),75 min,7.0,,1781,Bill Plympton,/name/nm0687739/?ref_=adv_li_dr_0,"\nAnimation, Comedy, Drama"


In [13]:
# (rows, columns): expect one row per scraped title and one column per entry in `headers`.
imdb_df2.shape

(2000, 11)

In [14]:
# The columns hold scraped text (missing values are the literal string "NaN"),
# so describe() reports count/unique/top/freq rather than numeric statistics.
imdb_df2.describe()

Unnamed: 0,title,title_link,mpaa_rating,year,runtime,imdb_rating,metascore,votes,director_name,director_link,imdb_genre_list
count,2000,2000,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000
unique,1981,2000,19.0,142.0,119.0,78.0,49.0,1363.0,1436.0,1437.0,220
top,Almost Human,/title/tt2299206/?ref_=adv_li_tt,,,,,,,,,\nSci-Fi
freq,3,1,722.0,128.0,192.0,200.0,1917.0,200.0,268.0,268.0,189


In [15]:
# Persist this scrape as line-delimited JSON: one record (row) per line.
imdb_df2.to_json('imdb_data2.json', orient='records', lines=True)