# Web Scraping Rotten Tomatoes Website

In [1]:
import pandas as pd
import numpy as np
import time
from selenium import webdriver
import urllib.parse

#### Import list of films from Mojo

In [2]:
# Name of the CSV file that the scraped review table is backed up to.
bk_file_nm = 'bk_rt_1_raw.csv'

# Read the full Mojo film list, then keep only the first 250 rows
# as the batch of films handled in this notebook run.
df_mojo_full = pd.read_csv('bk_mojo.csv')
df_mojo = df_mojo_full.iloc[:250]

print('# films: ', df_mojo.shape[0])
df_mojo.head()

# films:  250


Unnamed: 0,Title,Worldwide Lifetime Gross,Domestic Lifetime Gross,Domestic %,Foreign Lifetime Gross,Foreign %,Year
0,Avatar,"$2,847,246,203","$760,507,625",26.7%,"$2,086,738,578",73.3%,2009
1,Avengers: Endgame,"$2,797,501,328","$858,373,000",30.7%,"$1,939,128,328",69.3%,2019
2,Titanic,"$2,201,647,264","$659,363,944",30%,"$1,542,283,320",70%,1997
3,Star Wars: Episode VII - The Force Awakens,"$2,069,521,700","$936,662,225",45.3%,"$1,132,859,475",54.7%,2015
4,Avengers: Infinity War,"$2,048,359,754","$678,815,482",33.1%,"$1,369,544,272",66.9%,2018


In [3]:
# Sanity-check how many films fall into each 250-row batch of the Mojo list.
# A notebook cell only displays its LAST bare expression, so four separate
# len(...) lines would silently discard the first three results; collecting
# the sizes into one list shows every batch at once.
batch_bounds = [(0, 250), (250, 500), (500, 750), (750, None)]
[len(df_mojo_full[start:stop]) for start, stop in batch_bounds]

250

#### Use Selenium to web scrape movie review snippets from Rotten Tomatoes

In [4]:
%%time
#for each in mojo, search on rt, click first result, scrape first 5 pages of results, append to df

l_rev = []
l_mojo_movie_name = []
l_rt_movie_name = []
l_genre = []
l_year = []
l_rel_year = []
l_stream_year = []
lst_rt_combined = []
loop_counter = 1

driver = webdriver.Chrome('./chromedriver')

for index, row in df_mojo.iterrows(): #dont judge me 
    
    rt_movie_name=''
    
    # Print stage in loop every 25 movies
    if loop_counter % 25 == 0:
        print('Loops completed: ', loop_counter)
    loop_counter += 1
        
    try:
        # Search for movie
        driver.get('https://www.rottentomatoes.com/search?search='+urllib.parse.quote(row['Title']))
        
        # Filter to movies
        button_list=driver.find_element_by_class_name('search__nav').find_elements_by_class_name('js-search-filter')
        for button in button_list: 
            for attr in button.get_property('attributes'):
                if 'movie' in attr['value']: movie_button=button
        movie_button.click()
        
        # Fetch results of first result
        res = driver.find_elements_by_xpath('//*[@id="main-page-content"]/div/section[1]/search-page-result-container/search-page-result')
        for r in res: 
            for attr in r.get_property('attributes'):
                if 'movie' in attr['value']: 
                    movie_area = r.find_element_by_tag_name("ul")
        for aa in movie_area.find_element_by_tag_name("search-page-media-row").find_elements_by_tag_name("a"):
            for attr in aa.get_property('attributes'):
                if 'title' in attr['value'] and 'slot' in attr['name']: ListlinkerHref=aa
        
        # Navigate to 'Top Critics' Reviews page
        ListlinkerHref=ListlinkerHref.get_attribute("href")
        driver.get(ListlinkerHref  + '/reviews?type=top_critics') 
        time.sleep(2) #page load
        
        # Movie name being scraped
        rt_movie_name=driver.title.replace(' - Rotten Tomatoes','')

        try:
            # Movie Genre
            genre = []
            for g in driver.find_element_by_xpath('//div[@class="bottom_divider"]').find_elements_by_tag_name("li"):
                genre.append(g.text)
        
            # Fetch review snippets
            for rev in driver.find_elements_by_css_selector("div.review_table div.the_review"):
                try: 
                    l_rev.append(rev.text)
                except: 
                    l_rev.append('error')

                l_mojo_movie_name.append(row['Title']) # mojo movie name
                l_rt_movie_name.append(rt_movie_name) # RT movie name
                l_year.append(row['Year'])
                l_genre.append(genre[1]) # Genre
                l_rel_year.append(genre[3].split(',')[-1].strip()) # In theatre year
                l_stream_year.append(genre[4].split(',')[-1].strip()) # streaming year

        except: 
            l_rev.append('no reviews found')
            l_mojo_movie_name.append(row['Title'])
            l_rt_movie_name.append(rt_movie_name)
            l_year.append(row['Year'])
            l_genre.append('no genre found')
            l_rel_year.append('no year')
            l_stream_year.append('no year')
            
    except: 
        l_rev.append('misc error')
        l_mojo_movie_name.append(row['Title']) #join to watchmojo moviename
        l_year.append(row['Year'])
        l_rt_movie_name.append('error')
        l_genre.append('error')
        l_rel_year.append('error')
        l_stream_year.append('error')
    
    # Combine everything except the agggregated movie reviews
    lst_rt_combined.append([l_mojo_movie_name, l_rt_movie_name, l_rev, l_genre, l_rel_year])
        
driver.close()      



Loops completed:  25
Loops completed:  50
Loops completed:  75
Loops completed:  100
Loops completed:  125
Loops completed:  150
Loops completed:  175
Loops completed:  200
Loops completed:  225
Loops completed:  250
Wall time: 1h 21min 55s


#### Save output to CSV

In [5]:
# Output column name -> scraped list, in the column order of the backup CSV.
review_columns = {
    'review': l_rev,
    'mojo_name': l_mojo_movie_name,
    'mojo_year': l_year,
    'rt_name': l_rt_movie_name,
    'genre': l_genre,
    'release_year': l_rel_year,
    'stream_year': l_stream_year,
}

# Stack the parallel lists side by side as the columns of one 2-D array,
# then label those columns with the mapping's keys.
stacked_values = np.column_stack(list(review_columns.values()))
df_reviews = pd.DataFrame(stacked_values, columns=list(review_columns.keys()))

# Backup Movie list
df_reviews.to_csv(bk_file_nm, index=False)

df_reviews.head()

Unnamed: 0,review,mojo_name,mojo_year,rt_name,genre,release_year,stream_year
0,Five hundred million dollars [wasted].,Avatar,2009,Avatar - Movie Reviews,"Fantasy , Action , Adventure , Sci-Fi",2009,2016
1,"Rewatching it with fresh eyes 10 years later, ...",Avatar,2009,Avatar - Movie Reviews,"Fantasy , Action , Adventure , Sci-Fi",2009,2016
2,"In my opinion, Avatar has been hyped beyond th...",Avatar,2009,Avatar - Movie Reviews,"Fantasy , Action , Adventure , Sci-Fi",2009,2016
3,Learning your visuals from Jack Kirby is one t...,Avatar,2009,Avatar - Movie Reviews,"Fantasy , Action , Adventure , Sci-Fi",2009,2016
4,"Overall, Avatar was really about the jaw-dropp...",Avatar,2009,Avatar - Movie Reviews,"Fantasy , Action , Adventure , Sci-Fi",2009,2016
