In [3]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import re
import json
import csv

In [53]:
# Accumulator for the individual movie page URLs found on the guide pages
movie_urls = list()

In [54]:
# Rotten Tomatoes "summer movie scorecard" guide pages for 2017-2019
# (each page lists about 50 movies); ordered newest first.
url_list = [
    'https://editorial.rottentomatoes.com/guide/summer-movie-scorecard-' + str(year) + '/'
    for year in range(2019, 2016, -1)
]

In [55]:
# Collect the movie page URL behind every poster link on each guide page.
# The list is rebuilt from scratch here so that re-running this cell does not
# append the same URLs a second time.
movie_urls = []
for url in url_list:
    # timeout: without one, requests can wait indefinitely on a stalled connection
    page = requests.get(url, timeout=30)
    # fail loudly instead of silently parsing an error page that contains no posters
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'html.parser')
    # find_all is the current BeautifulSoup name; findAll is a deprecated alias
    for link in soup.find_all('a', {'class': 'article_movie_poster'}):
        movie_link = link.get('href')
        if movie_link:
            movie_urls.append(movie_link)
        else:
            # poster anchor without a target: report it and move on
            print('Skipping poster link without href:', link)

In [57]:
# Number of movie URLs collected from the guide pages (142 when this notebook was run)
len(movie_urls)

142

In [72]:
# Look at the first five collected URLs to check their format
movie_urls[:5]

['https://www.rottentomatoes.com/m/the_hustle_2019/',
 'https://www.rottentomatoes.com/m/the_kitchen/',
 'https://www.rottentomatoes.com/m/men_in_black_international/',
 'https://www.rottentomatoes.com/m/dark_phoenix/',
 'https://www.rottentomatoes.com/m/uglydolls/']

In [66]:
# The Rotten Tomatoes review API is addressed by a movie id rather than by the movie URL.
# That id is read from the JSON assigned to `movieReview` in the HTML of each movie's
# user-review page and stored next to the URL as a (url, movie_id) tuple.
movieID_list = []
# Raw string: in a normal string literal "\s" is an invalid escape sequence
# (a warning on current Python versions); here it must reach `re` untouched.
movie_review_pattern = re.compile(r'movieReview\s=\s(.*);')
for movie in movie_urls:
    review_url = movie + 'reviews?type=user'
    response = requests.get(review_url, timeout=30)
    match = movie_review_pattern.search(response.text)
    if match is None:
        # The page did not contain the expected script data (layout change, blocked
        # request, ...). Skip this movie instead of failing with AttributeError on None
        # and losing the ids gathered so far.
        print('No movieReview data found, skipping', review_url)
        continue
    data = json.loads(match.group(1))
    movieId = data["movieId"]
    movieID_list.append((movie, movieId))

In [75]:
# Show the first five (movie URL, movie id) tuples
movieID_list[:5]

[('https://www.rottentomatoes.com/m/the_hustle_2019/',
  'b71aaf1e-eaf4-3c45-a9fb-369a0f916108'),
 ('https://www.rottentomatoes.com/m/the_kitchen/',
  '83e3861c-8a25-3f03-ab9a-1e8f0994775e'),
 ('https://www.rottentomatoes.com/m/men_in_black_international/',
  'e3d56bd8-a285-31dd-8306-fd8e331d05b7'),
 ('https://www.rottentomatoes.com/m/dark_phoenix/',
  '1e586d32-284c-356d-ba6f-e5184693d838'),
 ('https://www.rottentomatoes.com/m/uglydolls/',
  '296a363c-5b4e-30c1-a290-6c13a2062429')]

In [71]:
# Tabulate the (URL, MovieID) pairs so they can be written to disk and reloaded later
id_columns = ['URL', 'MovieID']
df_movieid = pd.DataFrame(data=movieID_list, columns=id_columns)
df_movieid.head(n=5)

Unnamed: 0,URL,MovieID
0,https://www.rottentomatoes.com/m/the_hustle_2019/,b71aaf1e-eaf4-3c45-a9fb-369a0f916108
1,https://www.rottentomatoes.com/m/the_kitchen/,83e3861c-8a25-3f03-ab9a-1e8f0994775e
2,https://www.rottentomatoes.com/m/men_in_black_...,e3d56bd8-a285-31dd-8306-fd8e331d05b7
3,https://www.rottentomatoes.com/m/dark_phoenix/,1e586d32-284c-356d-ba6f-e5184693d838
4,https://www.rottentomatoes.com/m/uglydolls/,296a363c-5b4e-30c1-a290-6c13a2062429


In [73]:
# Cache the URL / MovieID table on disk so the scraping above does not have to be repeated.
# The row index is written too; it comes back as the 'Unnamed: 0' column when the file is reloaded.
df_movieid.to_csv('movieids.csv', index=True)

In [17]:
# Reload the cached URL / MovieID table instead of scraping again.
# A CSV written together with its row index yields an extra 'Unnamed: 0' column on load;
# it is dropped without mutating in place, and errors='ignore' keeps this cell working
# when the file was written without an index and the column is absent.
df_movieids = pd.read_csv('movieids.csv').drop(columns='Unnamed: 0', errors='ignore')
df_movieids.head()

Unnamed: 0,URL,MovieID
0,https://www.rottentomatoes.com/m/the_hustle_2019/,b71aaf1e-eaf4-3c45-a9fb-369a0f916108
1,https://www.rottentomatoes.com/m/the_kitchen/,83e3861c-8a25-3f03-ab9a-1e8f0994775e
2,https://www.rottentomatoes.com/m/men_in_black_...,e3d56bd8-a285-31dd-8306-fd8e331d05b7
3,https://www.rottentomatoes.com/m/dark_phoenix/,1e586d32-284c-356d-ba6f-e5184693d838
4,https://www.rottentomatoes.com/m/uglydolls/,296a363c-5b4e-30c1-a290-6c13a2062429


In [18]:
# Rebuild the list of (url, movie_id) tuples from the reloaded dataframe
movie_urls = list(df_movieids['URL'])
movie_ids = list(df_movieids['MovieID'])

movieID_list = [(page_url, page_id) for page_url, page_id in zip(movie_urls, movie_ids)]
movieID_list[:3]

[('https://www.rottentomatoes.com/m/the_hustle_2019/',
  'b71aaf1e-eaf4-3c45-a9fb-369a0f916108'),
 ('https://www.rottentomatoes.com/m/the_kitchen/',
  '83e3861c-8a25-3f03-ab9a-1e8f0994775e'),
 ('https://www.rottentomatoes.com/m/men_in_black_international/',
  'e3d56bd8-a285-31dd-8306-fd8e331d05b7')]

In [19]:
# Empty list that will collect the audience reviews of all movies
all_movie_reviews = list()

In [21]:
# We have a total of 142 movie URLs. Reviews from a single movie are limited to at most
# MAX_PER_MOVIE, which should give a dataset close to 150K reviews (some movies have
# fewer than 1000 reviews).
MAX_PER_MOVIE = 2500
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'

# Start from an empty list so that re-running this cell does not duplicate reviews.
all_movie_reviews = []

# loop through all movies
for movie_index, (movie_url, movie_id) in enumerate(movieID_list):
    print('Getting movie index', movie_index, movie_url)
    headers = {
        'Referer': movie_url + 'reviews?type=user',
        'User-Agent': USER_AGENT,
        'X-Requested-With': 'XMLHttpRequest',
    }

    url = 'https://www.rottentomatoes.com/napi/movie/' + movie_id + '/reviews/user'

    # Paging state: empty cursors request the first page.
    payload = {
        'direction': 'next',
        'endCursor': '',
        'startCursor': '',
    }

    count = 0  # reviews stored for the current movie

    # The context manager closes the session's connections once this movie is done.
    with requests.Session() as sess:
        while count < MAX_PER_MOVIE:
            try:
                r = sess.get(url, headers=headers, params=payload, timeout=30)
                r.raise_for_status()
                data = r.json()
                page_info = data['pageInfo']
                # (audience review text, star rating) for every review on this page
                page_reviews = [(x['review'], x['rating']) for x in data['reviews']]
                has_next_page = page_info['hasNextPage']
                if has_next_page:
                    # cursors are only read when another page is announced
                    next_end = page_info['endCursor']
                    next_start = page_info['startCursor']
            except Exception as err:
                # Repeating the identical request after a failure would loop forever,
                # so report the problem and continue with the next movie.
                print('Stopping', movie_url, 'after error:', err)
                break

            # Store this page BEFORE looking at hasNextPage: the last page (or the only
            # page of a movie with few reviews) still carries reviews that must be kept.
            # The slice keeps the per-movie total from exceeding MAX_PER_MOVIE.
            kept = page_reviews[:MAX_PER_MOVIE - count]
            all_movie_reviews.extend(kept)
            count += len(kept)

            # if there is no next page then we have reached the end
            if not has_next_page:
                break

            payload['endCursor'] = next_end
            payload['startCursor'] = next_start

Getting movie index 0 https://www.rottentomatoes.com/m/the_hustle_2019/
Getting movie index 1 https://www.rottentomatoes.com/m/the_kitchen/
Getting movie index 2 https://www.rottentomatoes.com/m/men_in_black_international/
Getting movie index 3 https://www.rottentomatoes.com/m/dark_phoenix/
Getting movie index 4 https://www.rottentomatoes.com/m/uglydolls/
Getting movie index 5 https://www.rottentomatoes.com/m/the_intruder_2019/
Getting movie index 6 https://www.rottentomatoes.com/m/anna_2019/
Getting movie index 7 https://www.rottentomatoes.com/m/shaft_2019/
Getting movie index 8 https://www.rottentomatoes.com/m/poms/
Getting movie index 9 https://www.rottentomatoes.com/m/el_chicano/
Getting movie index 10 https://www.rottentomatoes.com/m/overcomer/
Getting movie index 11 https://www.rottentomatoes.com/m/angel_has_fallen/
Getting movie index 12 https://www.rottentomatoes.com/m/godzilla_king_of_the_monsters_2019/
Getting movie index 13 https://www.rottentomatoes.com/m/stuber/
Getting mo

In [22]:
# Turn the collected (review, star rating) tuples into a two-column dataframe
review_columns = ['Reviews', 'StarRating']
rotten_reviews = pd.DataFrame(data=all_movie_reviews, columns=review_columns)

In [16]:
# Write the collected reviews to a CSV file (relative path, so it lands in the current
# working directory); index=False leaves the dataframe's row index out of the file.
rotten_reviews.to_csv('rottenReview_backup_xl.csv', index=False)