In [1]:
# source: https://howtofix.io/scraping-all-reviews-of-a-movie-from-rotten-tomato-using-soup-id39891
import pandas as pd
import requests
import re
import time
# Shared HTTP session; get_reviews uses it for the paginated review API requests.
session = requests.Session()

In [2]:
def get_reviews(movie_id, reviewer='user', scraping_limit=10):
    '''
    Scrapes rotten tomatoes for reviews and related info

    The movie's public reviews page is fetched first to discover the internal
    movie id, which is then used to page through the site's JSON review
    endpoint by following the cursor returned with each page.

    args:
        movie_id = the movie's URL slug on rottentomatoes.com
            (e.g. 'dune_2021'), not its display title.
        reviewer = select reviewer pool from list: ['critic', 'user'].
            Default is user.
        scraping_limit = maximum number of pages to fetch. Default is 10.

    returns:
        a pandas dataframe of reviewer data, one row per review
        (empty when no page was fetched or no reviews were returned)

    raises:
        ValueError: if reviewer is not 'critic' or 'user', or if the internal
            movie id cannot be found in the reviews page.
        requests.HTTPError: if any request comes back with an error status.
    '''
    # Path suffix of the JSON endpoint for each supported reviewer pool.
    endpoints = {
        'critic': 'criticsReviews/all',
        'user': 'reviews/user',
    }
    # Validate before touching the network so a typo fails fast and clearly
    # instead of leaving the API url undefined.
    if reviewer not in endpoints:
        raise ValueError(
            f"reviewer must be one of {sorted(endpoints)}, got {reviewer!r}"
        )

    url = f'https://www.rottentomatoes.com/m/{movie_id}/reviews'
    r = session.get(url)
    r.raise_for_status()

    # The internal id is embedded in the page as ..."movieId":"<id>","type"...
    # [^"]+ keeps the match inside a single quoted value (no greedy over-match).
    id_match = re.search(r'movieId":"([^"]+)","type', r.text)
    if id_match is None:
        raise ValueError(
            f'could not find the internal movie id for {movie_id!r} at {url}'
        )
    internal_id = id_match.group(1)

    api_url = f"https://www.rottentomatoes.com/napi/movie/{internal_id}/{endpoints[reviewer]}"
    payload = {
        'direction': 'next',
        'endCursor': '',
        'startCursor': '',
    }

    pages_scraped = 0
    review_data = []
    while True:
        # Check the limit before requesting, so no page is fetched and then
        # thrown away.
        if pages_scraped >= scraping_limit:
            print('scraping limit reached')
            break

        r = session.get(api_url, params=payload)
        r.raise_for_status()
        data = r.json()

        # Store the page BEFORE deciding whether to stop: the last page still
        # carries reviews even though it has no successor.
        review_data.extend(data.get('reviews') or [])
        pages_scraped += 1
        if pages_scraped % 10 == 0:
            print(f'pages scraped: {pages_scraped}')

        page_info = data.get('pageInfo') or {}
        if not page_info.get('hasNextPage'):
            print('scaping completed')
            break

        # Advance the cursor for the next request.
        payload['endCursor'] = page_info['endCursor']
        payload['startCursor'] = page_info.get('startCursor') or ''

        # Small pause between requests to be polite to the server.
        time.sleep(.1)

    return pd.json_normalize(review_data)

In [3]:
# Quick sanity run: scrape with a two-page limit (default user reviews) and preview.
movie_id = 'the_matrix_resurrections'
df = get_reviews(movie_id, scraping_limit=2)

print(len(df))
df.head()

scraping limit reached
20


Unnamed: 0,rating,review,isVerified,isSuperReviewer,hasSpoilers,hasProfanity,score,timeFromCreation,user.realm,user.userId,displayName,displayImageUrl,user.displayName,user.accountLink
0,0.5,"The story line plodded along, was sadly predic...",False,False,False,False,0.5,2h ago,Flixster,4cffb798-26e3-4511-b291-cd0983e95c52,,,,
1,1.0,Utter waste of time. Don't know why it even ex...,False,False,False,False,1.0,2h ago,RT,979425008,Alan C,https://graph.facebook.com/v3.3/10158665953625...,Alan C,/user/id/979425008
2,0.5,This movie is a waste of time. It will be forg...,False,False,False,False,0.5,4h ago,RT,955250125,Jay D,https://graph.facebook.com/v3.3/10000599544905...,Jay D,/user/id/955250125
3,3.0,Better than I thought it was going to be based...,False,False,False,False,3.0,4h ago,RT,928846930,Susan S,,Susan S,/user/id/928846930
4,0.5,Unfortunately this iteration of matrix fell ex...,False,False,False,False,0.5,5h ago,Flixster,ad519f4e-1544-4ecc-a9d0-a552693d8188,,,,


In [4]:
# Larger run: scrape user reviews for the 'dune_2021' slug, capped at 100 pages.
movie_id = 'dune_2021'
data = get_reviews(movie_id, scraping_limit=100)

pages scraped: 10
pages scraped: 20
pages scraped: 30
pages scraped: 40
pages scraped: 50
pages scraped: 60
pages scraped: 70
pages scraped: 80
pages scraped: 90
pages scraped: 100
scraping limit reached


In [5]:
# Dimensions of the scraped reviews frame as (rows, columns).
data.shape

(1000, 14)

In [6]:
# Preview the first rows of the scraped reviews.
data.head()

Unnamed: 0,rating,review,displayName,isVerified,isSuperReviewer,hasSpoilers,hasProfanity,score,timeFromCreation,user.displayName,user.accountLink,user.realm,user.userId,displayImageUrl
0,5.0,So I got to rewatch Dune again this time in a ...,Md T,False,False,False,False,5.0,7h ago,Md T,/user/id/979110234,RT,979110234,
1,5.0,Movie was beautifully shot and acted. Sound de...,Rohan P,False,False,False,False,5.0,10h ago,Rohan P,/user/id/977222113,RT,977222113,
2,5.0,Showing this magnificent book fron this side i...,Grzegorz B,False,False,False,False,5.0,11h ago,Grzegorz B,/user/id/979424531,RT,979424531,
3,3.0,Beautiful. \nBut.\nI was predisposed to love t...,Susan S,False,False,False,False,3.0,16h ago,Susan S,/user/id/948447923,RT,948447923,
4,5.0,"Blew my expectations out of the water, brillia...",kieran h,False,False,False,False,5.0,16h ago,kieran h,/user/id/979421884,RT,979421884,


In [7]:
# Persist the scraped reviews to a pickle file in the current working directory.
data.to_pickle('dune_rt_data.pkl')