In [1]:
# source: https://howtofix.io/scraping-all-reviews-of-a-movie-from-rotten-tomato-using-soup-id39891
import pandas as pd
import requests
import re
import time
# Shared HTTP session: get_reviews() reuses it for the paginated review API calls
session = requests.Session()

In [5]:
def get_reviews(movie_title,
                reviewer = 'user',
                scraping_limit = 10,
                write_data = False,
                request_delay = 0.1
    ):
    '''
    Scrapes Rotten Tomatoes for a movie's reviews and related reviewer info.

    args:
        movie_title = the movie's URL slug on rottentomatoes.com, e.g. 'dune_2021'
        reviewer = reviewer pool to scrape, one of ['critic', 'user']. Default is user.
        scraping_limit = maximum number of pages to scrape. Default is 10.
            None (or a negative number) means no limit.
        write_data = if True, pickle the result to '{movie_title}_data.pkl'
            in the working directory. Default is False.
        request_delay = seconds to wait between consecutive API requests.
            Default is 0.1.

    returns:
        a pandas dataframe with one row per review (empty if none were scraped)

    raises:
        ValueError if reviewer is not 'critic' or 'user', or if no movie id
            can be found in the movie's review page
        requests.HTTPError if the site answers with an error status
    '''

    endpoints = {
        'critic': 'criticsReviews/all',
        'user': 'reviews/user',
    }
    # Validate up front so a typo fails clearly, before any network traffic.
    if reviewer not in endpoints:
        raise ValueError(
            f"reviewer must be one of {list(endpoints)}, got {reviewer!r}"
        )

    # The review API is keyed by an internal movie id that is embedded in the
    # HTML of the public review page, so fetch that page first.
    url = f'https://www.rottentomatoes.com/m/{movie_title}/reviews'
    r = session.get(url)
    r.raise_for_status()

    # [^"]+ stops at the id's closing quote; a greedy .* would run on to the
    # last '","type' on the line and capture unrelated JSON.
    id_match = re.search(r'(?<=movieId":")([^"]+)(?=","type)', r.text)
    if id_match is None:
        raise ValueError(f'could not find a movie id in {url}')
    movie_id = id_match.group(1)

    api_url = f"https://www.rottentomatoes.com/napi/movie/{movie_id}/{endpoints[reviewer]}"
    payload = {
        'direction': 'next',
        'endCursor': '',
        'startCursor': '',
    }

    unlimited = scraping_limit is None or scraping_limit < 0
    pages_scraped = 0
    review_data = []
    # The limit is checked BEFORE fetching, so no page is requested only to be
    # thrown away.
    while unlimited or pages_scraped < scraping_limit:
        if pages_scraped:
            # Pause between consecutive requests to go easy on the server.
            time.sleep(request_delay)

        r = session.get(api_url, params=payload)
        r.raise_for_status()
        page = r.json()

        # Keep this page's reviews before looking at hasNextPage; otherwise
        # the final page (or the only page) would be lost.
        review_data.extend(page.get('reviews', []))
        pages_scraped += 1
        if pages_scraped % 10 == 0:
            print(f'pages scraped: {pages_scraped}')

        page_info = page['pageInfo']
        if not page_info.get('hasNextPage'):
            print('scraping completed')
            break

        # Advance the cursors so the next request returns the following page.
        payload['endCursor'] = page_info['endCursor']
        payload['startCursor'] = page_info.get('startCursor') or ''
    else:
        # Loop ended because the page budget ran out, not because the
        # reviews were exhausted.
        print('scraping limit reached')

    reviews_df = pd.json_normalize(review_data)

    if write_data:
        reviews_df.to_pickle(f'{movie_title}_data.pkl')

    return reviews_df

In [7]:
# Scrape up to 20 pages of user reviews for The Matrix Resurrections,
# then show how many reviews came back and preview the first rows.
movie_id = 'the_matrix_resurrections'
df = get_reviews(movie_id, scraping_limit=20)

print(len(df))
df.head()

pages scraped: 10
pages scraped: 20
scraping limit reached
200


Unnamed: 0,rating,review,displayName,displayImageUrl,isVerified,isSuperReviewer,hasSpoilers,hasProfanity,score,timeFromCreation,user.displayName,user.accountLink,user.realm,user.userId
0,0.5,I'm so heartbroken by this movie. Everything t...,Stephanie H,https://graph.facebook.com/v3.3/10216072520244...,False,False,False,False,0.5,2h ago,Stephanie H,/user/id/979453002,RT,979453002
1,5.0,Definitely has a lot of opportunities to carry...,Precious P,https://graph.facebook.com/v3.3/669112949/picture,False,False,False,False,5.0,9h ago,Precious P,/user/id/921750821,RT,921750821
2,2.0,The first Matrix is my all time favourite and ...,Matt T,https://graph.facebook.com/v3.3/743245149/picture,False,False,False,False,2.0,10h ago,Matt T,/user/id/852557762,RT,852557762
3,5.0,I really enjoyed seeing how they were going to...,Luke,,True,False,False,False,5.0,12h ago,Luke,,Fandango,2adea5c1-0620-4ba9-9a8a-d419041b6e8c
4,3.0,Real sh it. It was kick ass for all fans. But ...,Kirill D,,False,False,False,True,3.0,15h ago,Kirill D,/user/id/979452489,RT,979452489


In [8]:
# Scrape up to 100 pages of user reviews for Dune (2021); write_data=True
# also pickles the resulting dataframe to 'dune_2021_data.pkl'.
movie_id = 'dune_2021'
df2 = get_reviews(movie_id, scraping_limit=100, write_data=True)

pages scraped: 10
pages scraped: 20
pages scraped: 30
pages scraped: 40
pages scraped: 50
pages scraped: 60
pages scraped: 70
pages scraped: 80
pages scraped: 90
pages scraped: 100
scraping limit reached


In [9]:
# (rows, columns) of the scraped Dune reviews dataframe
df2.shape

(1000, 14)

In [10]:
# Preview the first rows of the scraped Dune reviews
df2.head()

Unnamed: 0,rating,review,displayName,displayImageUrl,isVerified,isSuperReviewer,hasSpoilers,hasProfanity,score,timeFromCreation,user.displayName,user.accountLink,user.realm,user.userId
0,5.0,Excellent! One of the best movies I have seen ...,Kellie G,https://graph.facebook.com/v3.3/22821587819351...,False,False,False,False,5.0,6h ago,Kellie G,/user/id/979452817,RT,979452817
1,4.5,The only draw back is that its 2 hours 35 minutes,George M,https://graph.facebook.com/v3.3/1755748644/pic...,False,False,False,False,4.5,13h ago,George M,/user/id/895190686,RT,895190686
2,5.0,Fantastic. Can't wait for part 2.,Julian C,https://graph.facebook.com/v3.3/10158297875246...,False,False,False,False,5.0,1d ago,Julian C,/user/id/979451578,RT,979451578
3,2.0,Another Sci-Fi alternate universe that fails t...,Diane L,,False,False,False,False,2.0,2d ago,Diane L,/user/id/979449083,RT,979449083
4,2.0,Dune 2021 (Part 1) Review - I'm a Dune fanatic...,Martin D,,False,False,False,False,2.0,2d ago,Martin D,/user/id/979451296,RT,979451296
