In [39]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import calendar

In [3]:
#regex patterns
page_pat = re.compile(r'Page 1 of \d+')
review_pat = re.compile(r'<div class=\"review-data\">[;a-zA-Z\s,-.\'\/\?\[\]\":\']*</div>')
rating_pat = re.compile(r'Original Score:\s([A-Z](\+|-)?|\d(.\d)?(\/\d)?)')
fresh_pat = re.compile(r'small\s(fresh|rotten)\"')
#critic_pat = re.compile(r'\/\"\>([A-Z][a-zA-Z]+\s[A-Z][a-zA-Z\-]+)|([A-Z][a-zA-Z.]+\s[A-Z].?\s[A-Z][a-zA-Z]+)|([A-Z][a-zA-Z]+\s[A-Z]+\'[A-Z][a-zA-Z]+)')
critic_pat = re.compile(r'data-qa="review-critic-link".*?>(.|\n)*?</a>')
publisher_pat = re.compile(r'\"subtle\">[a-zA-Z\s,.\(\)\'\-&;!\/\d+]+</em>')
date_pat = re.compile(r'[a-zA-Z]+\s\d+,\s\d+')

In [4]:
def make_soup(url):
    try:
        r = requests.get(url)
        if r.status_code == 404:
            return ''
        soup = BeautifulSoup(r.content, 'html.parser')
    except requests.TooManyRedirects:
        soup = ''
    return soup

In [5]:
def get_num_pages(soup):
    match = re.findall(page_pat,str(list(soup)))
    print(match)
    if len(match) > 0:
        match = match[0]
        match = match.split(' of ')[-1]
        return match
    else:
        return None

In [6]:
def get_critic_reviews_from_page(soup):
    reviews = list()
    rating = list()
    fresh = list()
    critic = list()
    top_critic = list()
    publisher = list()
    date = list()
    
    soup = str(soup)
    review_soup = soup.split('="review_table')[1].split('<div class="review-row" data-qa="review-item" tabindex="0">\n')
    review_soup.pop(0)
    print(review_soup)

    for review in review_soup:
        # match = re.findall(review_pat, str(review))
        # print(len(match))
        # if len(match) > 0:
            # m = match[0]            
            # for iden in ['<div class="the_review" data-qa="review-item"> ','</div>']:
            #     m = m.replace(iden,'')
            # reviews.append(m.strip('"'))            
            # extract rating
            match = re.findall(rating_pat, str(review))
            if len(match) > 0:
                m = match[0][0]
                if '/1' in m:
                    sp_m = m.split('/')
                    if sp_m[-1] == '1':
                        sp_m[-1] = '10'
                    m = '/'.join(sp_m)
                rating.append(m)
            else:
                rating.append(None)                    
            # extract critic
            match = re.findall(critic_pat, str(review))
            print(str(review))
            print(match)
            if len(match) > 0:
                critic.append(''.join(match[0]))
            else:
                critic.append(None)            
            # check if top critic
            if '> Top Critic<' in str(review):
                top_critic.append(1)
            else:
                top_critic.append(0)            
            # extract publisher
            # match = re.findall(publisher_pat, str(review))
            # if len(match) > 0:
            #     m = match[0]
            #     m = m.replace('"subtle">', '')
            #     m = m.replace('</em>','')
            #     publisher.append(m)
            # else:
            #     publisher.append(None)            
            # extract date
            match = re.findall(date_pat, str(review))
            if len(match) > 0:
                date.append(match[0].strip('"'))
            else:
                date.append(None)
            
    return [reviews, rating, fresh, critic, top_critic, publisher, date]

In [7]:
def get_critic_reviews(page):
    info = [[],[],[],[],[],[],[]]
    soup = make_soup(page + "reviews")
#    print(soup)
    pages = get_num_pages(soup)
    print(pages)
    if pages is not None:
        for page_num in range(1,int(pages)+1):
            soup = make_soup(page + "reviews?page=" + str(page_num) + "&sort=")
            c_info = get_critic_reviews_from_page(soup)
            
            # accumulate review info
            for i in range(len(c_info)):
                info[i] = info[i] + c_info[i]
        
        c_info = dict()
        keys = ['reviews', 'rating', 'fresh', 'critic', 'top_critic', 'publisher', 'date']
        for k in range(len(keys)):
            c_info[keys[k]] = info[k]        
    else:
        c_info = None        
    return c_info

In [8]:
movie_df = pd.read_csv('movie_list.csv', delimiter=' ')
movie_list = movie_df['Title'].tolist()
print(movie_list)

['The Shawshank Redemption', 'The Godfather', 'The Dark Knight', 'The Godfather: Part II', '12 Angry Men', 'The Lord of the Rings: The Return of the King', 'Pulp Fiction', "Schindler's List", 'Inception', 'Fight Club', 'The Lord of the Rings: The Fellowship of the Ring', 'Forrest Gump', 'Il buono, il brutto, il cattivo', 'The Lord of the Rings: The Two Towers', 'The Matrix', 'Goodfellas', 'Star Wars: Episode V - The Empire Strikes Back', "One Flew Over the Cuckoo's Nest", 'Hamilton', 'Gisaengchung', 'Soorarai Pottru', 'Interstellar', 'Cidade de Deus', 'Sen to Chihiro no kamikakushi', 'Saving Private Ryan', 'The Green Mile', 'La vita è bella', 'Se7en', 'The Silence of the Lambs', 'Star Wars', 'Seppuku', 'Shichinin no samurai', "It's a Wonderful Life", 'Joker', 'Whiplash', 'The Intouchables', 'The Prestige', 'The Departed', 'The Pianist', 'Gladiator', 'American History X', 'The Usual Suspects', 'Léon', 'The Lion King', 'Terminator 2: Judgment Day', 'Nuovo Cinema Paradiso', 'Hotaru no hak

In [9]:

movies = [m.replace(' ', '_') for m in movie_list]


movie = list()
critic = list()
publisher = list()
rating = list()
date = list()
comment = list()

count = 0

for movie_name in movies:
    count += 1
    print(count)
    url = f'https://www.rottentomatoes.com/m/{movie_name}/' + "reviews"
    soup = make_soup(url)
    if soup == '':
        # print("Soup not found")
        continue
    
    print(url)

    review_list = list()
    for r in soup.find_all("div", "review-row"):
        review_list.append(r)

    for review in review_list:
        #Get Critic Name
        try:
            critic.append(review.find_all("a", "display-name")[0].get_text().strip())
        except:
            continue
        #Get Publisher Name
        try:
            publisher.append(review.find_all("a", "publication")[0].get_text().strip())
        except:
            critic.pop()
            continue
        #Get Rating
        #Get Date
        try:
            rating_text = review.find_all("p", "original-score-and-url")[0].get_text().strip()
        except:
            critic.pop()
            publisher.pop()
            continue
        rating_text = rating_text.split('Original Score: ')
        if len(rating_text) > 1:
            rating.append(rating_text[1].split('\n')[0])
            date.append(rating_text[1].split('| ')[1])
        else:
            rating.append(None)
            date.append(rating_text[0].split('| ')[1])
        #Get Comment
        try:
            comment.append(review.find_all("p", "review-text")[0].get_text().strip())
        except:
            critic.pop()
            publisher.pop()
            rating.pop()
            date.pop()
            continue
        #Append Movie Name
        movie.append(movie_name.replace('_', ' '))

frame = {"movie_name": movie, "critic_name": critic, "publisher": publisher, "rating": rating, "date": date, "comment": comment}

df = pd.DataFrame.from_dict(frame)
    


1
https://www.rottentomatoes.com/m/The_Shawshank_Redemption/reviews
2
https://www.rottentomatoes.com/m/The_Godfather/reviews
3
https://www.rottentomatoes.com/m/The_Dark_Knight/reviews
4
5
6
7
https://www.rottentomatoes.com/m/Pulp_Fiction/reviews
8
9
https://www.rottentomatoes.com/m/Inception/reviews
10
https://www.rottentomatoes.com/m/Fight_Club/reviews
11
12
https://www.rottentomatoes.com/m/Forrest_Gump/reviews
13
14
15
https://www.rottentomatoes.com/m/The_Matrix/reviews
16
https://www.rottentomatoes.com/m/Goodfellas/reviews
17
18
19
20
21
https://www.rottentomatoes.com/m/Soorarai_Pottru/reviews
22
https://www.rottentomatoes.com/m/Interstellar/reviews
23
24
https://www.rottentomatoes.com/m/Sen_to_Chihiro_no_kamikakushi/reviews
25
https://www.rottentomatoes.com/m/Saving_Private_Ryan/reviews
26
https://www.rottentomatoes.com/m/The_Green_Mile/reviews
27
28
29
https://www.rottentomatoes.com/m/The_Silence_of_the_Lambs/reviews
30
https://www.rottentomatoes.com/m/Star_Wars/reviews
31
https:/

In [10]:
display(df)

Unnamed: 0,movie_name,critic_name,publisher,rating,date,comment
0,The Shawshank Redemption,Jonathan Romney,Guardian,,"Mar 4, 2024",If this is a feel-good movie -- and by the end...
1,The Shawshank Redemption,Jay Boyar,Orlando Sentinel,,"Mar 4, 2024",The Shawshank Redemption is both resigned and ...
2,The Shawshank Redemption,Adam Mars-Jones,Independent (UK),,"Mar 4, 2024",The sad thing is that The Shawshank Redemption...
3,The Shawshank Redemption,Malcolm Johnson,Hartford Courant,3.5/4,"Mar 4, 2024","Next to Stand By Me, The Shawshank Redemption ..."
4,The Shawshank Redemption,Yardena Arar,Los Angeles Daily News,,"Mar 4, 2024","It's Robbins and Freeman who carry the film, w..."
...,...,...,...,...,...,...
9941,Lifeboat,Bosley Crowther,New York Times,4/5,"Mar 25, 2006","That old master of screen melodrama, Alfred Hi..."
9942,Lifeboat,Jeffrey M. Anderson,Combustible Celluloid,3.5/4,"Dec 17, 2005",It's a technical tour-de-force.
9943,Lifeboat,Philip Martin,Arkansas Democrat-Gazette,3/5,"Nov 7, 2005",
9944,Lifeboat,Dan Callahan,Slant Magazine,4/4,"Oct 24, 2005",Hitchcock's shifting sympathies guarantee our ...


In [11]:
new_df = df.dropna()

In [12]:
display(new_df)

Unnamed: 0,movie_name,critic_name,publisher,rating,date,comment
3,The Shawshank Redemption,Malcolm Johnson,Hartford Courant,3.5/4,"Mar 4, 2024","Next to Stand By Me, The Shawshank Redemption ..."
6,The Shawshank Redemption,William Arnold,Seattle Post-Intelligencer,2.5/4,"Mar 4, 2024","The acting is so strong, so seductive in The S..."
9,The Shawshank Redemption,Michael Wilmington,Chicago Tribune,3.5/4,"Mar 4, 2024",The Shawshank Redemption may be working with s...
12,The Shawshank Redemption,Jeff Simon,Buffalo News,4/5,"Mar 4, 2024",It's a movie with dignity and surprises and be...
14,The Shawshank Redemption,Jack Mathews,Newsday,3/4,"Mar 4, 2024",You've never seen a prison movie quite like it...
...,...,...,...,...,...,...
9941,Lifeboat,Bosley Crowther,New York Times,4/5,"Mar 25, 2006","That old master of screen melodrama, Alfred Hi..."
9942,Lifeboat,Jeffrey M. Anderson,Combustible Celluloid,3.5/4,"Dec 17, 2005",It's a technical tour-de-force.
9943,Lifeboat,Philip Martin,Arkansas Democrat-Gazette,3/5,"Nov 7, 2005",
9944,Lifeboat,Dan Callahan,Slant Magazine,4/4,"Oct 24, 2005",Hitchcock's shifting sympathies guarantee our ...


In [13]:
comment_nan_count = new_df['comment'].isna().sum()
print(comment_nan_count)

0


In [14]:
new_df.to_csv('reviews.csv', encoding='utf-8', index=False)

In [34]:
filtered_ratings = new_df[new_df['rating'].str.contains(r'\d*\.?\d*\/\d*')]
new_ratings = list()
for rating in filtered_ratings['rating']:
    split_rate = rating.replace(' ' , '').split('/')
    numerator = split_rate[0]
    denominator = split_rate[1]
    new_ratings.append((float(numerator)/float(denominator))*10)

filtered_ratings['rating'] = new_ratings

display(filtered_ratings)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_ratings['rating'] = new_ratings


Unnamed: 0,movie_name,critic_name,publisher,rating,date,comment
3,The Shawshank Redemption,Malcolm Johnson,Hartford Courant,8.75,"Mar 4, 2024","Next to Stand By Me, The Shawshank Redemption ..."
6,The Shawshank Redemption,William Arnold,Seattle Post-Intelligencer,6.25,"Mar 4, 2024","The acting is so strong, so seductive in The S..."
9,The Shawshank Redemption,Michael Wilmington,Chicago Tribune,8.75,"Mar 4, 2024",The Shawshank Redemption may be working with s...
12,The Shawshank Redemption,Jeff Simon,Buffalo News,8.00,"Mar 4, 2024",It's a movie with dignity and surprises and be...
14,The Shawshank Redemption,Jack Mathews,Newsday,7.50,"Mar 4, 2024",You've never seen a prison movie quite like it...
...,...,...,...,...,...,...
9941,Lifeboat,Bosley Crowther,New York Times,8.00,"Mar 25, 2006","That old master of screen melodrama, Alfred Hi..."
9942,Lifeboat,Jeffrey M. Anderson,Combustible Celluloid,8.75,"Dec 17, 2005",It's a technical tour-de-force.
9943,Lifeboat,Philip Martin,Arkansas Democrat-Gazette,6.00,"Nov 7, 2005",
9944,Lifeboat,Dan Callahan,Slant Magazine,10.00,"Oct 24, 2005",Hitchcock's shifting sympathies guarantee our ...


In [44]:
final_df = filtered_ratings
month_map = {month: index for index, month in enumerate(calendar.month_abbr) if month}
date_list = list()
for date in final_df['date']:
    split_date = date.split(' ')
    year = split_date[2]
    day = split_date[1].replace(',','')
    month = month_map[split_date[0]]
    date_str = str(year) + '-' + str(month) + '-' + str(day)
    date_list.append(date_str)

final_df['date'] = date_list

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['date'] = date_list


In [45]:
final_df.to_csv('reviews.csv', encoding='utf-8', index=False)