# Web Scraping the Rotten Tomatoes Website

In [1]:
import pandas as pd
import numpy as np
import time
from selenium import webdriver
import urllib.parse

#### Import list of films from Mojo

In [2]:
# File that the scraped review table is written to at the end of the notebook.
bk_file_nm = 'bk_rt_2_raw.csv'

# Load the full film list, then keep only this run's batch:
# the films at row positions 200 through 399.
df_mojo_full = pd.read_csv('bk_mojo.csv')
batch_rows = slice(200, 400)
df_mojo = df_mojo_full.iloc[batch_rows]

# Report the batch size and preview the first few films.
n_films = len(df_mojo)
print('# films: ', n_films)
df_mojo.head()

# films:  200


Unnamed: 0,Title,Worldwide Lifetime Gross,Domestic Lifetime Gross,Domestic %,Foreign Lifetime Gross,Foreign %,Year
200,The Revenant,"$532,950,503","$183,637,894",34.5%,"$349,312,609",65.5%,2015
201,The Meg,"$530,438,278","$145,443,742",27.4%,"$384,994,536",72.6%,2018
202,Ralph Breaks the Internet,"$529,323,962","$201,091,711",38%,"$328,232,251",62%,2018
203,Hotel Transylvania 3: Summer Vacation,"$528,583,774","$167,510,016",31.7%,"$361,073,758",68.3%,2018
204,The Boss Baby,"$527,965,936","$175,003,033",33.2%,"$352,962,903",66.8%,2017


#### Use Selenium to web scrape movie review snippets from Rotten Tomatoes

In [3]:
%%time
#for each in mojo, search on rt, click first result, scrape first 5 pages of results, append to df

l_rev = []
l_mojo_movie_name = []
l_rt_movie_name = []
l_genre = []
l_year = []
l_cc = []
l_desc = []
l_rel_year = []
l_stream_year = []
lst_rt_combined = []
loop_counter = 1

driver = webdriver.Chrome('./chromedriver')

for index, row in df_mojo.iterrows(): 
    
    rt_movie_name=''
    
    # Print stage in loop every 25 movies
    if loop_counter % 25 == 0:
        print('Loops completed: ', loop_counter)
    loop_counter += 1
        
    try:
        # Search for movie
        driver.get('https://www.rottentomatoes.com/search?search='+urllib.parse.quote(row['Title']))
        
        # Filter to movies
        button_list=driver.find_element_by_class_name('search__nav').find_elements_by_class_name('js-search-filter')
        for button in button_list: 
            for attr in button.get_property('attributes'):
                if 'movie' in attr['value']: movie_button=button
        movie_button.click()
        
        # Fetch results of first result
        res = driver.find_elements_by_xpath('//*[@id="main-page-content"]/div/section[1]/search-page-result-container/search-page-result')
        for r in res: 
            for attr in r.get_property('attributes'):
                if 'movie' in attr['value']: 
                    movie_area = r.find_element_by_tag_name("ul")
        for aa in movie_area.find_element_by_tag_name("search-page-media-row").find_elements_by_tag_name("a"):
            for attr in aa.get_property('attributes'):
                if 'title' in attr['value'] and 'slot' in attr['name']: ListlinkerHref=aa
        
        # Navigate to Movie's page
        ListlinkerHref=ListlinkerHref.get_attribute("href")
        driver.get(ListlinkerHref  + '') 
        time.sleep(2) #page load
        
        try:
            # Extract 'Critics Consensus'
            cc = driver.find_element_by_xpath('//*[@id="what-to-know"]/div/section/p/span').text

            # Extract Description
            desc = driver.find_element_by_xpath('//*[@id="movieSynopsis"]').text
            
        except:
            pass

        # Navigate to 'Top Critics' Reviews page
        driver.get(ListlinkerHref  + '/reviews?type=top_critics') 
        time.sleep(2) #page load
        
        # Movie name being scraped
        rt_movie_name=driver.title.replace(' - Rotten Tomatoes','')

        try:
            # Movie Genre
            genre = []
            for g in driver.find_element_by_xpath('//div[@class="bottom_divider"]').find_elements_by_tag_name("li"):
                genre.append(g.text)
        
            # Fetch review snippets
            for rev in driver.find_elements_by_css_selector("div.review_table div.the_review"):
                try: 
                    l_rev.append(rev.text)
                except: 
                    l_rev.append('error')

                l_mojo_movie_name.append(row['Title']) # mojo movie name
                l_rt_movie_name.append(rt_movie_name) # RT movie name
                l_year.append(row['Year'])
                l_genre.append(genre[1]) # Genre
                l_rel_year.append(genre[3].split(',')[-1].strip()) # In theatre year
                l_stream_year.append(genre[4].split(',')[-1].strip()) # streaming year
                l_cc.append(cc)
                l_desc.append(desc)                

        except: 
            l_rev.append('no reviews found')
            l_mojo_movie_name.append(row['Title'])
            l_rt_movie_name.append(rt_movie_name)
            l_year.append(row['Year'])
            l_genre.append('no genre found')
            l_rel_year.append('no year')
            l_stream_year.append('no year')
            l_cc.append('no critics consensus')
            l_desc.append('no description')
            
    except: 
        l_rev.append('misc error')
        l_mojo_movie_name.append(row['Title']) #join to watchmojo moviename
        l_year.append(row['Year'])
        l_rt_movie_name.append('error')
        l_genre.append('error')
        l_rel_year.append('error')
        l_stream_year.append('error')
        l_cc.append('error')
        l_desc.append('error')
    
    # Combine everything except the agggregated movie reviews
    lst_rt_combined.append([l_mojo_movie_name, l_rt_movie_name, l_rev, l_genre, l_rel_year])
        
driver.close()      



Loops completed:  25
Loops completed:  50
Loops completed:  75
Loops completed:  100
Loops completed:  125
Loops completed:  150
Loops completed:  175
Loops completed:  200
Wall time: 1h 51min 21s


#### Save output to CSV

In [4]:
# Pair each output column name with the list that feeds it, in output order.
review_columns = {
    'review': l_rev,
    'mojo_name': l_mojo_movie_name,
    'mojo_year': l_year,
    'rt_name': l_rt_movie_name,
    'genre': l_genre,
    'release_year': l_rel_year,
    'stream_year': l_stream_year,
    'critical_consensus': l_cc,
    'description': l_desc,
}

# Stack the lists side by side into a single 2-D array (one row per review
# snippet); every list must have the same length for this to succeed.
stacked = np.column_stack(list(review_columns.values()))
df_reviews = pd.DataFrame(stacked, columns=list(review_columns.keys()))

# Backup Movie list
df_reviews.to_csv(bk_file_nm, index=False)

df_reviews.tail()

Unnamed: 0,review,mojo_name,mojo_year,rt_name,genre,release_year,stream_year,critical_consensus,description
3247,To return to why Murder on the Orient Express ...,Murder on the Orient Express,2017,Murder on the Orient Express - Movie Reviews,"Mystery & Thriller , Crime , Drama",2017,2018,Stylish production and an all-star ensemble ke...,A lavish trip through Europe quickly unfolds i...
3248,Whodunit? Who took the life of a famous murder...,Murder on the Orient Express,2017,Murder on the Orient Express - Movie Reviews,"Mystery & Thriller , Crime , Drama",2017,2018,Stylish production and an all-star ensemble ke...,A lavish trip through Europe quickly unfolds i...
3249,There is too little levity and cleverness afoo...,Murder on the Orient Express,2017,Murder on the Orient Express - Movie Reviews,"Mystery & Thriller , Crime , Drama",2017,2018,Stylish production and an all-star ensemble ke...,A lavish trip through Europe quickly unfolds i...
3250,Throughout the newest version of Murder on the...,Murder on the Orient Express,2017,Murder on the Orient Express - Movie Reviews,"Mystery & Thriller , Crime , Drama",2017,2018,Stylish production and an all-star ensemble ke...,A lavish trip through Europe quickly unfolds i...
3251,Branagh fusses about with unusual camera angle...,Murder on the Orient Express,2017,Murder on the Orient Express - Movie Reviews,"Mystery & Thriller , Crime , Drama",2017,2018,Stylish production and an all-star ensemble ke...,A lavish trip through Europe quickly unfolds i...
