# Web Scraping the Rotten Tomatoes Website

In [1]:
import pandas as pd
import numpy as np
import time
from selenium import webdriver
import urllib.parse

#### Import list of films from Mojo

In [2]:
# Batch configuration: the film list is scraped in fixed-size batches, and
# each batch is backed up to its own CSV. Deriving both the slice and the
# file name from the same two constants keeps them from drifting apart.
BATCH_SIZE = 108  # films per scraping batch
BATCH_NUM = 3     # 1-based batch number handled by this notebook run

bk_file_nm = 'bk_rt_{}_raw.csv'.format(BATCH_NUM)

df_wiki_full = pd.read_csv('bk_wiki.csv')

# Positional slice [start, stop) of the full list for this batch.
batch_start = (BATCH_NUM - 1) * BATCH_SIZE
batch_stop = BATCH_NUM * BATCH_SIZE
df_wiki = df_wiki_full.iloc[batch_start:batch_stop]

print('# films: ', len(df_wiki))
df_wiki.head()

# films:  108


Unnamed: 0,Title,Year,Worldwide Gross
216,Pirates of the Caribbean: On Stranger Tides,2011,"$1,045,713,802"
217,Alice in Wonderland,2010,"$1,025,467,110"
218,The Hobbit: An Unexpected Journey,2012,"$1,017,003,568"
219,Harry Potter and the Philosopher's Stone,2001,"$1,006,968,171"
220,Harry Potter and the Deathly Hallows – Part 1,2010,"$976,941,486"


#### Use Selenium to web scrape movie review snippets from Rotten Tomatoes

In [3]:
%%time
#for each in mojo, search on rt, click first result, scrape first 5 pages of results, append to df

l_rev = []
l_mojo_movie_name = []
l_rt_movie_name = []
l_genre = []
l_year = []
l_cc = []
l_desc = []
l_rel_year = []
l_stream_year = []
lst_rt_combined = []
loop_counter = 1

driver = webdriver.Chrome('./chromedriver')

for index, row in df_wiki.iterrows(): 
    
    rt_movie_name=''
    
    # Print stage in loop every 25 movies
    if loop_counter % 25 == 0:
        print('Loops completed: ', loop_counter)
    loop_counter += 1
        
    try:
        # Search for movie
        driver.get('https://www.rottentomatoes.com/search?search='+urllib.parse.quote(row['Title']))
        
        # Filter to movies
        button_list=driver.find_element_by_class_name('search__nav').find_elements_by_class_name('js-search-filter')
        for button in button_list: 
            for attr in button.get_property('attributes'):
                if 'movie' in attr['value']: movie_button=button
        movie_button.click()
        
        # Fetch results of first result
        res = driver.find_elements_by_xpath('//*[@id="main-page-content"]/div/section[1]/search-page-result-container/search-page-result')
        for r in res: 
            for attr in r.get_property('attributes'):
                if 'movie' in attr['value']: 
                    movie_area = r.find_element_by_tag_name("ul")
        for aa in movie_area.find_element_by_tag_name("search-page-media-row").find_elements_by_tag_name("a"):
            for attr in aa.get_property('attributes'):
                if 'title' in attr['value'] and 'slot' in attr['name']: ListlinkerHref=aa
        
        # Navigate to Movie's page
        ListlinkerHref=ListlinkerHref.get_attribute("href")
        driver.get(ListlinkerHref  + '') 
        time.sleep(2) #page load
        
        try:
            # Extract 'Critics Consensus'
            cc = driver.find_element_by_xpath('//*[@id="what-to-know"]/div/section/p/span').text

            # Extract Description
            desc = driver.find_element_by_xpath('//*[@id="movieSynopsis"]').text
            
        except:
            pass

        # Navigate to 'Top Critics' Reviews page
        driver.get(ListlinkerHref  + '/reviews?type=top_critics') 
        time.sleep(2) #page load
        
        # Movie name being scraped
        rt_movie_name=driver.title.replace(' - Rotten Tomatoes','')

        try:
            # Movie Genre
            genre = []
            for g in driver.find_element_by_xpath('//div[@class="bottom_divider"]').find_elements_by_tag_name("li"):
                genre.append(g.text)
        
            # Fetch review snippets
            for rev in driver.find_elements_by_css_selector("div.review_table div.the_review"):
                try: 
                    l_rev.append(rev.text)
                except: 
                    l_rev.append('error')

                l_mojo_movie_name.append(row['Title']) # mojo movie name
                l_rt_movie_name.append(rt_movie_name) # RT movie name
                l_year.append(row['Year'])
                l_genre.append(genre[1]) # Genre
                l_rel_year.append(genre[3].split(',')[-1].strip()) # In theatre year
                l_stream_year.append(genre[4].split(',')[-1].strip()) # streaming year
                try: 
                    l_cc.append(cc)
                    l_desc.append(desc)
                except:
                    l_cc.append('no critics consensus')
                    l_desc.append('no description')            

        except: 
            l_rev.append('no reviews found')
            l_mojo_movie_name.append(row['Title'])
            l_rt_movie_name.append(rt_movie_name)
            l_year.append(row['Year'])
            l_genre.append('no genre found')
            l_rel_year.append('no year')
            l_stream_year.append('no year')
            l_cc.append('no critics consensus')
            l_desc.append('no description')
            
    except: 
        l_rev.append('misc error')
        l_mojo_movie_name.append(row['Title']) #join to watchmojo moviename
        l_year.append(row['Year'])
        l_rt_movie_name.append('error')
        l_genre.append('error')
        l_rel_year.append('error')
        l_stream_year.append('error')
        l_cc.append('error')
        l_desc.append('error')
        
driver.close()      



Loops completed:  25
Loops completed:  50
Loops completed:  75
Loops completed:  100
Wall time: 1h 5min 16s


#### Save output to CSV

In [4]:
# Pair every output column name with the list that feeds it, so the column
# order and the labels are declared in exactly one place.
review_columns = [
    ('review', l_rev),
    ('mojo_name', l_mojo_movie_name),
    ('mojo_year', l_year),
    ('rt_name', l_rt_movie_name),
    ('genre', l_genre),
    ('release_year', l_rel_year),
    ('stream_year', l_stream_year),
    ('critical_consensus', l_cc),
    ('description', l_desc),
]

# Stack the lists side by side into one 2-D array (one row per review snippet).
stacked_values = np.column_stack([values for _, values in review_columns])
df_reviews = pd.DataFrame(stacked_values, columns=[name for name, _ in review_columns])

# Back up the scraped reviews to CSV
df_reviews.to_csv(bk_file_nm, index=False)

df_reviews.head()

Unnamed: 0,review,mojo_name,mojo_year,rt_name,genre,release_year,stream_year,critical_consensus,description
0,Remember how fresh and novel Pirates of the Ca...,Pirates of the Caribbean: On Stranger Tides,2011,Pirates of the Caribbean: On Stranger Tides - ...,"Action , Fantasy , Adventure",2011,2014,It's shorter and leaner than the previous sequ...,The checkered past of Capt. Jack Sparrow (John...
1,just wish this movie had made me feel somethin...,Pirates of the Caribbean: On Stranger Tides,2011,Pirates of the Caribbean: On Stranger Tides - ...,"Action , Fantasy , Adventure",2011,2014,It's shorter and leaner than the previous sequ...,The checkered past of Capt. Jack Sparrow (John...
2,"Depp, grimacing, edges in and out of the actio...",Pirates of the Caribbean: On Stranger Tides,2011,Pirates of the Caribbean: On Stranger Tides - ...,"Action , Fantasy , Adventure",2011,2014,It's shorter and leaner than the previous sequ...,The checkered past of Capt. Jack Sparrow (John...
3,I've never seen a film in which what was actua...,Pirates of the Caribbean: On Stranger Tides,2011,Pirates of the Caribbean: On Stranger Tides - ...,"Action , Fantasy , Adventure",2011,2014,It's shorter and leaner than the previous sequ...,The checkered past of Capt. Jack Sparrow (John...
4,A poorly scripted film has its forked tongue s...,Pirates of the Caribbean: On Stranger Tides,2011,Pirates of the Caribbean: On Stranger Tides - ...,"Action , Fantasy , Adventure",2011,2014,It's shorter and leaner than the previous sequ...,The checkered past of Capt. Jack Sparrow (John...
