In [1]:
# source: https://howtofix.io/scraping-all-reviews-of-a-movie-from-rotten-tomato-using-soup-id39891
import pandas as pd
import requests
import re
import time
# Shared HTTP session; get_reviews sends its paginated API requests through it.
session = requests.Session()

In [2]:
def get_reviews(movie_id, reviewer='user', scraping_limit=10):
    '''
    Scrapes Rotten Tomatoes for reviews and related info.

    args:
        movie_id = the movie's URL slug on rottentomatoes.com (the part after
            /m/, e.g. 'the_matrix_resurrections'), not its display title.
        reviewer = select reviewer pool from list: ['critic', 'user']. Default is user.
        scraping_limit = maximum number of pages to scrape. Default is 10.
            Pass None to keep going until the last page.

    returns:
        a list of review dicts, in the order the API returned them
        (pass it to pd.json_normalize to get a dataframe).

    raises:
        ValueError if reviewer is not 'critic' or 'user', or if no movieId
            can be found on the movie's reviews page.
        requests.HTTPError if either endpoint answers with an error status.
    '''
    # Validate before touching the network so a typo fails fast and clearly.
    api_paths = {'critic': 'criticsReviews/all', 'user': 'reviews/user'}
    if reviewer not in api_paths:
        raise ValueError(
            f"reviewer must be one of {sorted(api_paths)}, got {reviewer!r}"
        )

    # The review API is keyed by an internal id that is embedded in the HTML
    # of the public reviews page, so that page has to be fetched first.
    url = f'https://www.rottentomatoes.com/m/{movie_id}/reviews'
    r = session.get(url)
    r.raise_for_status()

    # [^"]+ stops at the closing quote of the id; a greedy .* would run on to
    # the last '","type' on the same line.
    match = re.search(r'(?<=movieId":")([^"]+)(?=","type)', r.text)
    if match is None:
        raise ValueError(f'could not find a movieId on {url}')
    internal_id = match.group(1)

    api_url = f"https://www.rottentomatoes.com/napi/movie/{internal_id}/{api_paths[reviewer]}"
    payload = {
        'direction': 'next',
        'endCursor': '',
        'startCursor': '',
    }

    pages_scraped = 0
    review_data = []
    while True:
        # Check the limit before requesting, so no page is fetched and then
        # thrown away.
        if scraping_limit is not None and pages_scraped >= scraping_limit:
            print('scraping limit reached')
            break

        r = session.get(api_url, params=payload)
        r.raise_for_status()
        data = r.json()
        page_info = data['pageInfo']

        # Collect this page before deciding whether to stop; otherwise the
        # reviews on the final page would be lost.
        review_data.extend(data.get('reviews') or [])
        pages_scraped += 1
        print(f'pages scraped: {pages_scraped}')

        if not page_info['hasNextPage']:
            print('scaping completed')
            break

        # Advance the cursors so the next request returns the following page.
        payload['endCursor'] = page_info['endCursor']
        payload['startCursor'] = page_info.get('startCursor') or ''
        time.sleep(.1)  # brief pause between consecutive requests

    return review_data

In [3]:
# Pull two pages of user reviews for The Matrix Resurrections and flatten the
# nested review records into one row per review.
movie_id = 'the_matrix_resurrections'
data = get_reviews(movie_id, scraping_limit=2)
df = pd.json_normalize(data)

# Row count first, then a preview of the first few reviews.
print(df.shape[0])
df.head()

pages scraped: 1
pages scraped: 2
scraping limit reached
20


Unnamed: 0,rating,review,isVerified,isSuperReviewer,hasSpoilers,hasProfanity,score,timeFromCreation,user.realm,user.userId,displayName,user.displayName,user.accountLink,displayImageUrl
0,5.0,"Superb, loved it. Great cast, pacing, action, ...",False,False,False,False,5.0,2h ago,Flixster,c297075d-7272-426e-96f2-baab5631bc5e,,,,
1,3.5,")\nIt's a fun, nostalgic, meta movie that is e...",False,False,False,False,3.5,2h ago,RT,979423830,Ricky L,Ricky L,/user/id/979423830,
2,3.0,It's hard to put a finger on what's wrong with...,False,False,False,False,3.0,2h ago,RT,823783380,Mela N,Mela N,/user/id/823783380,https://graph.facebook.com/v3.3/1169751812/pic...
3,2.0,The Matrix Awakens.\n\nDidn't hate it. Also gl...,False,False,False,False,2.0,2h ago,Fandango,50c0c39c-c3d5-486f-8c96-59e40991101c,Eric H,Eric H,,
4,2.5,"Well, well, well. So you've come to read a re...",False,False,False,True,2.5,2h ago,RT,979147760,R.J. E,R.J. E,/user/id/979147760,


In [4]:
# Same procedure for Avengers: Endgame, this time with a three-page limit.
movie_id = 'avengers_endgame'
data = get_reviews(movie_id, scraping_limit=3)
df2 = pd.json_normalize(data)

# Row count first, then a preview of the first few reviews.
print(df2.shape[0])
df2.head()

pages scraped: 1
pages scraped: 2
pages scraped: 3
scraping limit reached
30


Unnamed: 0,rating,review,displayName,displayImageUrl,isVerified,isSuperReviewer,hasSpoilers,hasProfanity,score,timeFromCreation,user.displayName,user.accountLink,user.realm,user.userId
0,5.0,"Honestly, an epic movie, for fans and the gene...",Gabriel A,https://graph.facebook.com/v3.3/11352663306121...,False,False,False,False,5.0,2h ago,Gabriel A,/user/id/979421365,RT,979421365
1,5.0,"Filme incrível, cativante, emocionante e histó...",Marcela A,https://graph.facebook.com/v3.3/96402369786835...,False,False,False,False,5.0,3h ago,Marcela A,/user/id/979423722,RT,979423722
2,5.0,"Filme emocionante, com tons cômicos,envolvente...",Bruno S,https://graph.facebook.com/v3.3/46581044708929...,False,False,False,False,5.0,10h ago,Bruno S,/user/id/979423056,RT,979423056
3,5.0,"I love this movie, it is so captivating and sc...",Marcos Bismarck d,,False,False,False,False,5.0,10h ago,Marcos Bismarck d,/user/id/979423040,RT,979423040
4,5.0,An incredible ending to an incredible franchis...,Ciaran W,,False,False,False,False,5.0,18h ago,Ciaran W,/user/id/979422608,RT,979422608


In [5]:
# Full text of the last review in the Endgame frame.
df2['review'].iloc[-1]

'This is my 2nd favorite marvel movie had great scenes tons of action comedy and hope and man the ending was just great.'