In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import os, re, sys, pickle, requests, resource, json
import time
from random import random
from IPython.core.display import clear_output

In [2]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

chromedriver = "/Applications/chromedriver"  # path to the chromedriver executable
os.environ['webdriver.chrome.driver'] = chromedriver
driver = webdriver.Chrome(chromedriver)

## Webscraping Rotten Tomatoes

In order to acquire Tomatometer data, it was necessary to search out the url for each film title individually.  To do this, I imported the cleaned IMDB data and formatted a title string to append to the base Rotten Tomatoes url.  

In [170]:
imdb_lookup_df3 = pd.read_json("imdb_lookup_df3.json", lines=True)

In [177]:
imdb_lookup_df3.shape

(83, 7)

In [171]:
imdb_lookup_df3.columns

Index(['title', 'imdb_id', 'title_link', 'director_name', 'director_id',
       'director_link', 'title_cc'],
      dtype='object')

In [17]:
imdb_lookup_df3['title_cc'] = imdb_lookup_df3['title_cc'].str.strip(':')

In [172]:
title_list = imdb_lookup_df3['title_cc']

In [173]:
title_list[0:20]

0                                               beneath
1                               the_million_dollar_duck
2                                 cities_of_last_things
3                                            lazer_team
4                             the_powerpuff_girls_movie
5                                  these_are_the_damned
6                                                   bug
7                                                poison
8                                         american_hero
9                                                undead
10    aqua_teen_hunger_force_colon_movie_film_for_th...
11                                                stung
12                                        the_objective
13                                            curvature
14                                        the_minds_eye
15                                             metropia
16                                         time_changer
17                                save_the_green

In [22]:
# rotten tomatoes url - takes title, lower case, underscores
tomato_url = "https://www.rottentomatoes.com/m/{}"

In [155]:
movie_html = []
tomato_dict = {}
tomato_list = []


def get_page_source(title_list):
    t_success = 0
    t_nan = 0
    
    headers = ['title', 'tomatometer', 'tomato_ct', 'aud_score', 'aud_score_ct']
    
    start_time = time.time()
    tries = 0
    
    for title in title_list:
        url = tomato_url.format(title)
        user_agent = {'User-agent': 'Mozilla/5.0'}
        driver = webdriver.Chrome(chromedriver)
        tomato_response = driver.get(url)
        movie_html.append(driver.page_source)
        
        tries += 1
        time.sleep(2+2*random())
        elapsed_time = time.time() - start_time
        print('Request: {}; Frequency: {} requests/s'.format(tries, tries/elapsed_time))
        clear_output(wait = True)

        tomato_soup = BeautifulSoup(driver.page_source, 'html5lib')
       
        percents = tomato_soup.find_all('span', class_ = 'mop-ratings-wrap__percentage')
        if not percents:
            pass
        try:
            tomatometer = percents[0].text.strip().strip('%')
        except:
            tomatometer = np.nan
        try:
            aud_score = percents[1].text.strip().strip('%')
        except:
            aud_score = np.nan
                
        aud_score_cts = tomato_soup.find_all('strong', class_='mop-ratings-wrap__text--small')
        if not aud_score_cts:
            pass

        try:
            tomato_ct = tomato_soup.find('small', class_="mop-ratings-wrap__text--small").text.strip()
        except:
            tomato_ct = np.nan
        try:
            aud_score_ct = aud_score_cts[1].text.strip('User Ratings: ').strip()
        except:
            aud_score_ct = np.nan

        tomato_dict = dict(zip(headers, [title, tomatometer,tomato_ct,aud_score, aud_score_ct]))

        tomato_list.append(tomato_dict)



In [174]:
get_page_source(title_list)

Request: 1; Frequency: 0.09651267054628379 requests/s
Request: 2; Frequency: 0.1052446846880484 requests/s
Request: 3; Frequency: 0.10583214917210933 requests/s
Request: 4; Frequency: 0.10544584811674267 requests/s
Request: 5; Frequency: 0.10406853674412553 requests/s
Request: 6; Frequency: 0.1041812703857576 requests/s
Request: 7; Frequency: 0.10254388546574102 requests/s
Request: 8; Frequency: 0.10351448883246708 requests/s
Request: 9; Frequency: 0.10415415010313055 requests/s
Request: 10; Frequency: 0.10634002864994331 requests/s
Request: 11; Frequency: 0.10744948449912252 requests/s
Request: 12; Frequency: 0.10938345626721385 requests/s
Request: 13; Frequency: 0.11032251064019022 requests/s
Request: 14; Frequency: 0.11003541014757644 requests/s
Request: 15; Frequency: 0.11064321309124951 requests/s
Request: 16; Frequency: 0.11067666974645807 requests/s
Request: 17; Frequency: 0.10984446792971275 requests/s
Request: 18; Frequency: 0.1108828724122148 requests/s
Request: 19; Frequency

In [175]:
tomato_list[0:10]

[{'title': 'tenet',
  'tomatometer': '70',
  'tomato_ct': '324',
  'aud_score': '76',
  'aud_score_ct': 'Verified Ratings: 5,837'},
 {'title': 'the_midnight_sky',
  'tomatometer': '51',
  'tomato_ct': '226',
  'aud_score': '26',
  'aud_score_ct': '2,122'},
 {'title': 'wonder_woman',
  'tomatometer': '83',
  'tomato_ct': '12',
  'aud_score': '78',
  'aud_score_ct': '1,910'},
 {'title': 'avengers_endgame',
  'tomatometer': '94',
  'tomato_ct': '532',
  'aud_score': '90',
  'aud_score_ct': '70,830'},
 {'title': 'outside_the_wire',
  'tomatometer': '36',
  'tomato_ct': '58',
  'aud_score': '33',
  'aud_score_ct': '295'},
 {'title': 'dune',
  'tomatometer': '75',
  'tomato_ct': 'N/A',
  'aud_score': nan,
  'aud_score_ct': '1,261'},
 {'title': 'interstellar',
  'tomatometer': '72',
  'tomato_ct': '365',
  'aud_score': '86',
  'aud_score_ct': '176,217'},
 {'title': 'love_and_monsters',
  'tomatometer': '92',
  'tomato_ct': '75',
  'aud_score': '89',
  'aud_score_ct': 'Verified Ratings: 54'},


In [178]:
len(tomato_list)

2083

In [179]:
tomato_df2 = pd.DataFrame(tomato_list)
tomato_df2.head()

Unnamed: 0,title,tomatometer,tomato_ct,aud_score,aud_score_ct
0,tenet,70,324,76,"Verified Ratings: 5,837"
1,the_midnight_sky,51,226,26,2122
2,wonder_woman,83,12,78,1910
3,avengers_endgame,94,532,90,70830
4,outside_the_wire,36,58,33,295


In [180]:
len(tomato_df2)

2083

In [181]:
tomato_df2['tomato_ct']= tomato_df2['tomato_ct'].replace('N/A', np.NaN)

In [182]:
tomato_df2['aud_score_ct']= tomato_df2['aud_score_ct'].replace('Not yet availabl', np.NaN)

In [183]:
tomato_df2.describe()

Unnamed: 0,title,tomatometer,tomato_ct,aud_score,aud_score_ct
count,2083,1369,1098,1081,1352
unique,2017,100,274,92,1170
top,cargo,0,5,45,30
freq,3,35,38,26,6


In [185]:
tomato_df2.to_json('tomato_data2.json', orient='records', lines=True)