# Web Scraping Rotten Tomatoes Website

In [1]:
import pandas as pd
import numpy as np
import time
from selenium import webdriver
import urllib.parse

#### Import list of films from Mojo

In [2]:
bk_file_nm = 'bk_rt_4_raw.csv'

df_mojo_full = pd.read_csv('bk_mojo.csv')
df_mojo = df_mojo_full[600:800]

print('# films: ', len(df_mojo))
df_mojo.head()

# films:  200


Unnamed: 0,Title,Worldwide Lifetime Gross,Domestic Lifetime Gross,Domestic %,Foreign Lifetime Gross,Foreign %,Year
600,Super 8,"$260,095,986","$127,004,179",48.8%,"$133,091,807",51.2%,2011
601,Mojin: The Lost Legend,"$259,368,448","$1,243,810",0.5%,"$258,124,638",99.5%,2015
602,Blade Runner 2049,"$259,334,548","$92,054,159",35.5%,"$167,280,389",64.5%,2017
603,Charlie's Angels: Full Throttle,"$259,175,788","$100,830,111",38.9%,"$158,345,677",61.1%,2003
604,Parasite,"$258,668,626","$53,369,749",20.6%,"$205,298,877",79.4%,2019


In [3]:
len(df_mojo_full[0:250])
len(df_mojo_full[250:500])
len(df_mojo_full[500:750])
len(df_mojo_full[750:])

250

#### Use Selenium to web scrape movie review snippets from Rotten Tomatoes

In [4]:
%%time
#for each in mojo, search on rt, click first result, scrape first 5 pages of results, append to df

l_rev = []
l_mojo_movie_name = []
l_rt_movie_name = []
l_genre = []
l_year = []
l_cc = []
l_desc = []
l_rel_year = []
l_stream_year = []
lst_rt_combined = []
loop_counter = 1

driver = webdriver.Chrome('./chromedriver')

for index, row in df_mojo.iterrows(): 
    
    rt_movie_name=''
    
    # Print stage in loop every 25 movies
    if loop_counter % 25 == 0:
        print('Loops completed: ', loop_counter)
    loop_counter += 1
        
    try:
        # Search for movie
        driver.get('https://www.rottentomatoes.com/search?search='+urllib.parse.quote(row['Title']))
        
        # Filter to movies
        button_list=driver.find_element_by_class_name('search__nav').find_elements_by_class_name('js-search-filter')
        for button in button_list: 
            for attr in button.get_property('attributes'):
                if 'movie' in attr['value']: movie_button=button
        movie_button.click()
        
        # Fetch results of first result
        res = driver.find_elements_by_xpath('//*[@id="main-page-content"]/div/section[1]/search-page-result-container/search-page-result')
        for r in res: 
            for attr in r.get_property('attributes'):
                if 'movie' in attr['value']: 
                    movie_area = r.find_element_by_tag_name("ul")
        for aa in movie_area.find_element_by_tag_name("search-page-media-row").find_elements_by_tag_name("a"):
            for attr in aa.get_property('attributes'):
                if 'title' in attr['value'] and 'slot' in attr['name']: ListlinkerHref=aa
        
        # Navigate to Movie's page
        ListlinkerHref=ListlinkerHref.get_attribute("href")
        driver.get(ListlinkerHref  + '') 
        time.sleep(2) #page load
        
        try:
            # Extract 'Critics Consensus'
            cc = driver.find_element_by_xpath('//*[@id="what-to-know"]/div/section/p/span').text

            # Extract Description
            desc = driver.find_element_by_xpath('//*[@id="movieSynopsis"]').text
            
        except:
            pass

        # Navigate to 'Top Critics' Reviews page
        driver.get(ListlinkerHref  + '/reviews?type=top_critics') 
        time.sleep(2) #page load
        
        # Movie name being scraped
        rt_movie_name=driver.title.replace(' - Rotten Tomatoes','')

        try:
            # Movie Genre
            genre = []
            for g in driver.find_element_by_xpath('//div[@class="bottom_divider"]').find_elements_by_tag_name("li"):
                genre.append(g.text)
        
            # Fetch review snippets
            for rev in driver.find_elements_by_css_selector("div.review_table div.the_review"):
                try: 
                    l_rev.append(rev.text)
                except: 
                    l_rev.append('error')

                l_mojo_movie_name.append(row['Title']) # mojo movie name
                l_rt_movie_name.append(rt_movie_name) # RT movie name
                l_year.append(row['Year'])
                l_genre.append(genre[1]) # Genre
                l_rel_year.append(genre[3].split(',')[-1].strip()) # In theatre year
                l_stream_year.append(genre[4].split(',')[-1].strip()) # streaming year
                l_cc.append(cc)
                l_desc.append(desc)                

        except: 
            l_rev.append('no reviews found')
            l_mojo_movie_name.append(row['Title'])
            l_rt_movie_name.append(rt_movie_name)
            l_year.append(row['Year'])
            l_genre.append('no genre found')
            l_rel_year.append('no year')
            l_stream_year.append('no year')
            l_cc.append('no critics consensus')
            l_desc.append('no description')
            
    except: 
        l_rev.append('misc error')
        l_mojo_movie_name.append(row['Title']) #join to watchmojo moviename
        l_year.append(row['Year'])
        l_rt_movie_name.append('error')
        l_genre.append('error')
        l_rel_year.append('error')
        l_stream_year.append('error')
        l_cc.append('error')
        l_desc.append('error')
    
    # Combine everything except the agggregated movie reviews
    lst_rt_combined.append([l_mojo_movie_name, l_rt_movie_name, l_rev, l_genre, l_rel_year])
        
driver.close()      



Loops completed:  25
Loops completed:  50
Loops completed:  75
Loops completed:  100
Loops completed:  125
Loops completed:  150
Loops completed:  175
Loops completed:  200
Wall time: 1h 53min 20s


#### Save output to CSV

In [5]:
df_reviews = pd.DataFrame(
    np.column_stack([
            l_rev
            ,l_mojo_movie_name
            ,l_year
            ,l_rt_movie_name  
            ,l_genre
            ,l_rel_year
            ,l_stream_year
            ,l_cc
            ,l_desc
            ]), 
    columns=['review','mojo_name','mojo_year', 'rt_name', 'genre'
             , 'release_year', 'stream_year', 'critical_consensus','description'])

# Backup Movie list
df_reviews.to_csv(bk_file_nm, index = False)

df_reviews.tail()

Unnamed: 0,review,mojo_name,mojo_year,rt_name,genre,release_year,stream_year,critical_consensus,description
3151,Mr. Serling would be pleased.,The Others,2001,The Others - Movie Reviews,"Mystery & Thriller , Horror",2001,2016,The Others is a spooky thriller that reminds u...,"Grace (Nicole Kidman), the devoutly religious ..."
3152,"Bumpy at first, with a few too many clichs and...",The Others,2001,The Others - Movie Reviews,"Mystery & Thriller , Horror",2001,2016,The Others is a spooky thriller that reminds u...,"Grace (Nicole Kidman), the devoutly religious ..."
3153,"Clever as the film is, though, it turns out to...",The Others,2001,The Others - Movie Reviews,"Mystery & Thriller , Horror",2001,2016,The Others is a spooky thriller that reminds u...,"Grace (Nicole Kidman), the devoutly religious ..."
3154,The Others is a thoughtfully chilling movie. I...,The Others,2001,The Others - Movie Reviews,"Mystery & Thriller , Horror",2001,2016,The Others is a spooky thriller that reminds u...,"Grace (Nicole Kidman), the devoutly religious ..."
3155,Creepy-verging -on-campy thriller with a great...,The Others,2001,The Others - Movie Reviews,"Mystery & Thriller , Horror",2001,2016,The Others is a spooky thriller that reminds u...,"Grace (Nicole Kidman), the devoutly religious ..."
