In [1]:
from bs4 import BeautifulSoup as bs
import numpy as np
import pandas as pd
import requests
import re

### Movie name & Consensus

In [2]:
def name_consensus_fn(soup):
    """Extract the movie title and the critics-consensus text from a movie page.

    Args:
        soup: Parsed page object exposing ``findAll(tag, attrs)``
            (``scrape_rotten`` passes a BeautifulSoup document).

    Returns:
        Tuple ``(movie_name, critic_consensus)`` holding the raw, unstripped
        text of the first matching ``<h1>`` and ``<p>`` elements.

    Raises:
        IndexError: if either element is missing from the page.
    """
    # The original looked up the ratings wrapper <div> first and immediately
    # overwrote the result; that dead lookup has been removed.
    title_nodes = soup.findAll(
        'h1', {'class': 'mop-ratings-wrap__title mop-ratings-wrap__title--top'})
    movie_name = title_nodes[0].get_text()

    # The 'concensus' spelling is part of the selector string and must stay
    # as-is (presumably it mirrors the site's CSS class name).
    consensus_nodes = soup.findAll(
        'p', {'class': 'mop-ratings-wrap__text mop-ratings-wrap__text--concensus'})
    critic_consensus = consensus_nodes[0].get_text()

    return movie_name, critic_consensus

### Ratings

In [3]:
def critic_score_fn(critic):
    """Return the critic score text, or NaN when it is unavailable.

    Args:
        critic: Element exposing ``find(tag, attrs)``; may be ``None`` when
            the ratings block was not found by the caller.

    Returns:
        The raw text of the percentage ``<span>`` (whitespace included), or
        ``np.nan`` when the element or the span is missing.
    """
    try:
        return critic.find('span', {'class': 'mop-ratings-wrap__percentage'}).get_text()
    except AttributeError:
        # `np.NaN` was removed in NumPy 2.0; referencing it here would itself
        # raise AttributeError from inside this handler. `np.nan` works on
        # every NumPy version.
        return np.nan

In [4]:
def num_critics_fn(critic):
    """Return the raw text of the critic-count ``<small>`` element.

    Raises AttributeError when ``critic`` has no such element (``find``
    returns ``None``).
    """
    count_node = critic.find('small', {'class': 'mop-ratings-wrap__text--small'})
    return count_node.get_text()

In [5]:
def audience_score_fn(audience):
    """Return the audience score text, or NaN when it is unavailable.

    Args:
        audience: Element exposing ``find(tag, attrs)``; may be ``None`` when
            the audience block was not found by the caller.

    Returns:
        The raw text of the percentage ``<span>`` (whitespace included), or
        ``np.nan`` when the element or the span is missing.
    """
    try:
        return audience.find('span', {'class': 'mop-ratings-wrap__percentage'}).get_text()
    except AttributeError:
        # `np.NaN` no longer exists in NumPy 2.0+; `np.nan` is the portable name.
        return np.nan

In [6]:
def num_audience_fn(audience):
    """Return the raw text of the user-ratings ``<strong>`` element.

    Raises AttributeError when ``audience`` has no such element.
    """
    ratings_node = audience.find('strong', {'class': 'mop-ratings-wrap__text--small'})
    return ratings_node.get_text()

### Movie info

In [7]:
def synopsis_fn(tables):
    """Return the raw text of the first synopsis ``<div>`` inside ``tables``.

    Raises IndexError when no synopsis element is present.
    """
    synopsis_nodes = tables.findAll(
        'div', {'class': 'movie_synopsis clamp clamp-6 js-clamp'})
    first_synopsis = synopsis_nodes[0]
    return first_synopsis.get_text()

In [8]:
def movie_info_fn(movie_info_box, max_rows=12):
    """Collect the label/value pairs of the movie-info rows into a dict.

    Args:
        movie_info_box: Sequence of row elements, each exposing
            ``findAll(tag, attrs)`` and ``find(tag, attrs)``.
        max_rows: Maximum number of leading rows to read. Defaults to 12,
            the limit that used to be hard-coded; ``None`` reads every row.

    Returns:
        Dict mapping each row's label text to its value text (both raw).
        Rows lacking a label or a value element are skipped.
    """
    dict_data = {}

    for row in movie_info_box[:max_rows]:
        try:
            label = row.findAll('div', {'class': 'meta-label subtle'})[0].get_text()
            value = row.find('div', {'class': 'meta-value'}).get_text()
        except (IndexError, AttributeError):
            # IndexError: no label element; AttributeError: no value element.
            # Anything else is a real error and is no longer swallowed by a
            # bare `except`.
            continue
        dict_data[label] = value

    return dict_data

def pg_fn(tables):
    """Return the raw text of the first ``meta-value`` ``<div>`` in ``tables``.

    Raises IndexError when there is no such element.
    """
    value_nodes = tables.findAll('div', {'class': 'meta-value'})
    return value_nodes[0].get_text()

def genre_fn(small_box):
    """Return the raw ``meta-value`` text of the row at index 1 of ``small_box``."""
    genre_row = small_box[1]
    value_node = genre_row.find('div', {'class': 'meta-value'})
    return value_node.get_text()

def directors_fn(small_box):
    """Return the raw text of the first ``meta-value`` in row 2 of ``small_box``."""
    director_row = small_box[2]
    value_nodes = director_row.findAll('div', {'class': 'meta-value'})
    return value_nodes[0].get_text()

def written_by_fn(small_box):
    """Return the raw text of the first ``meta-value`` in row 3 of ``small_box``."""
    writer_row = small_box[3]
    value_nodes = writer_row.findAll('div', {'class': 'meta-value'})
    return value_nodes[0].get_text()

def release_date_fn(small_box):
    """Return the raw text of the first ``meta-value`` in row 4 of ``small_box``."""
    release_row = small_box[4]
    value_nodes = release_row.findAll('div', {'class': 'meta-value'})
    return value_nodes[0].get_text()

def run_time_fn(small_box):
    """Return the raw text of the first ``meta-value`` in row 7 of ``small_box``.

    The original computed this value but never returned it, so every call
    yielded ``None``.

    NOTE(review): the exploratory cell further down this notebook shows
    index 7 producing a studio name for one page, so the row position of the
    runtime is not stable across pages -- confirm the index before relying on it.

    Raises:
        IndexError: if ``small_box`` has fewer than 8 rows or the row has no
            ``meta-value`` element.
    """
    run_time = small_box[7].findAll('div', {'class': 'meta-value'})[0].get_text()
    return run_time

def studio_fn(small_box):
    """Return the raw text of the first ``meta-value`` in row 8, or NaN.

    Args:
        small_box: Sequence of row elements exposing ``findAll(tag, attrs)``.

    Returns:
        The value text of the row at index 8, or ``np.nan`` when that row (or
        its value element) does not exist.
    """
    try:
        return small_box[8].findAll('div', {'class': 'meta-value'})[0].get_text()
    except IndexError:
        # `np.NaN` was removed in NumPy 2.0; `np.nan` is the supported spelling.
        return np.nan

### Cast

In [9]:
def list_celeb_names_fn(all_celebrities):
    """Return the name slugs of at most the first five cast links.

    Each slug is the link's ``href`` with its first 11 characters removed
    (presumably the ``/celebrity/`` prefix -- confirm against the site markup).
    Fewer than five links simply yields a shorter list.
    """
    leading_links = all_celebrities[:5]
    return [link.attrs['href'][11:] for link in leading_links]

In [35]:
def scrape_celeb_names_fn(soup):
    """Return the top cast name slugs as one comma-separated string.

    Args:
        soup: Parsed page exposing ``find(tag, attrs)``.

    Returns:
        ``", "``-joined slugs from ``list_celeb_names_fn``. An empty string is
        returned when the page has no cast section. The original returned an
        empty *list* in that case, which made the return type inconsistent and
        broke the caller's single-row DataFrame column assignment.
    """
    cast = soup.find('div', {'class': 'castSection'})
    try:
        all_celebs = cast.findAll('a', {'class': 'pull-left'})
        return ", ".join(list_celeb_names_fn(all_celebs))
    except AttributeError:
        # `cast` is None (no cast section) or an element lacked the expected
        # attribute: fall back to "no names", as a string.
        return ""

### Critics review

In [26]:
def critic_line_url_sub_fn(critic_lines):
    """Split up to 20 critic quote elements into parallel text and URL lists.

    Args:
        critic_lines: Sequence of quote elements exposing ``attrs`` (with a
            ``'cite'`` key) and ``get_text()``.

    Returns:
        Tuple ``(critic_line, url)`` of equally long lists; position ``k`` of
        each list refers to the same quote. Elements missing either piece are
        skipped entirely.
    """
    critic_line = []
    url = []
    for quote in critic_lines[:20]:
        try:
            # Read both pieces before storing either, so a failure on the
            # second cannot leave the two lists out of step (the original
            # appended the URL first).
            cite = quote.attrs['cite']
            text = quote.get_text()
        except (KeyError, AttributeError):
            continue
        url.append(cite)
        critic_line.append(text)
    return critic_line, url

In [27]:
def critic_line_url_fn(soup):
    """Return the critic quotes and their source URLs as two joined strings.

    Reads the ``<blockquote>`` elements of the page panel at index 3, hands
    them to ``critic_line_url_sub_fn``, and joins each resulting list with
    ``", "``.

    Raises IndexError when the page has fewer than four matching panels.
    """
    panels = soup.findAll('div', {'class': 'panel-body content_body'})
    quote_nodes = panels[3].findAll(
        'blockquote', {'class': 'media-body quote_bubble__quote'})
    lines, urls = critic_line_url_sub_fn(quote_nodes)
    return ", ".join(lines), ", ".join(urls)

### Audience review

In [28]:
def audience_line_sub_fn(audience_review):
    """Return the text of up to the first 10 audience review comments.

    Args:
        audience_review: Element exposing ``findAll(tag, attrs)``, or ``None``
            when the page has no audience-review list.

    Returns:
        List of raw comment strings (at most 10); empty when
        ``audience_review`` is ``None`` or holds no comments.
    """
    if audience_review is None:
        # `soup.find` yields None for pages without reviews; treat as "none".
        return []

    # Query once instead of re-running the same search on every iteration.
    comment_nodes = audience_review.findAll(
        'div', {'class': 'mop-audience-reviews__review--comment clamp clamp-4 js-clamp'})
    return [node.get_text() for node in comment_nodes[:10]]

In [29]:
def audience_review_fn(soup):
    """Return all scraped audience comments concatenated into one string.

    The comments come from ``audience_line_sub_fn`` and are joined with no
    separator between them.
    """
    review_list = soup.find('ul', {'class': 'mop-audience-reviews__reviews-wrap clearfix'})
    return "".join(audience_line_sub_fn(review_list))

### Quotes 

In [30]:
def quotes_helper_fn(quotes_lines):
    """Concatenate the inner text of every second ``<span>`` element.

    Args:
        quotes_lines: Sequence of elements whose ``str()`` is their markup.
            Only the odd-indexed items (1, 3, 5, ...) are used.

    Returns:
        The captured inner text of each used item that renders as
        ``<span>...</span>``, joined with no separator. Items that do not
        match are skipped.
    """
    cleaned_quotes = []
    for node in quotes_lines[1::2]:
        # Newlines are dropped first so `.` in the pattern can span the
        # whole element.
        markup = str(node).replace('\n', '')
        match = re.search('<span>(.*)</span>', markup)
        # Explicit no-match check replaces the bare `except` that used to
        # hide the AttributeError from `None.group`.
        if match is not None:
            cleaned_quotes.append(match.group(1))
    return "".join(cleaned_quotes)

In [31]:
def quotes_fn(soup):
    """Follow the movie page's quotes link and return the quotes as one string.

    Args:
        soup: Parsed movie page exposing ``findAll(tag, attrs)``.

    Returns:
        The string built by ``quotes_helper_fn`` from the quotes page, or
        ``np.nan`` when the link or the quotes panel is missing, or the
        quotes page cannot be downloaded.
    """
    try:
        # The quotes link lives in the page panel at index 4.
        link_panel = soup.findAll('div', {'class': 'panel-body content_body'})[4]
        quotes_path = link_panel.find('a', {'class': 'unstyled articleLink fr'}).attrs['href']
        quotes_url = 'https://www.rottentomatoes.com' + quotes_path

        quotes_page = requests.get(quotes_url)
        quotes_page = bs(quotes_page.text)

        quotes_panel = quotes_page.findAll('div', {'class': 'panel-body content_body'})[1]
        quotes_lines = quotes_panel.findAll('span')
    except (IndexError, AttributeError, KeyError):
        # Expected "page has no quotes" shapes: missing panel, link or href.
        # `np.nan` replaces `np.NaN`, which NumPy 2.0 removed.
        return np.nan
    except requests.RequestException:
        # Network / HTTP failure while fetching the quotes page.
        return np.nan

    return quotes_helper_fn(quotes_lines)

### Scraping to get movie names

In [120]:
# Build the list of per-genre "top 100" page URLs from the genre dropdown.
# NOTE(review): `soup` is not defined in any cell above this one; it must
# already hold a parsed page that contains the genre dropdown -- confirm
# which page and add the fetch so Restart & Run All works.
all_genre_100 = soup.findAll('ul',{'class':'dropdown-menu'})
little_cleaned_genre_100 = all_genre_100[0].findAll('a')

links_100_movies_various_genre = []

# Exactly the first 17 links of the first dropdown are used; fewer than 17
# links raises IndexError.
for i in range(17):
    extension = str(little_cleaned_genre_100[i].attrs['href'])
    extension = "https://www.rottentomatoes.com" + extension
    links_100_movies_various_genre.append(extension)

In [158]:
# Visit every genre page and collect (movie url, genre) rows into
# `movie_url_genre`.
genre_frames = []

for genre_page_url in links_100_movies_various_genre:
    try:
        page = requests.get(genre_page_url)
        soup = bs(page.text)

        # The dropdown button's label is the genre name; newlines and spaces
        # are stripped out of it.
        movie_genre = soup.find('button', {'class': 'dropdown-toggle btn btn-primary-border fullWidth'}).get_text()
        movie_genre = str(movie_genre).replace('\n', '').replace(' ', '')

        full_table = soup.find('table', {'class': 'table'})
        # Search the table once (the original repeated this search for each
        # of the 100 rows) and keep at most the first 100 links.
        movie_anchors = full_table.findAll('a', {'class': 'unstyled articleLink'})
        movies_list_url = [
            'https://www.rottentomatoes.com' + anchor.attrs['href']
            for anchor in movie_anchors[:100]
        ]
    except (requests.RequestException, AttributeError, KeyError) as err:
        # Report the skipped page instead of silently dropping it.
        print("skipping", genre_page_url, "-", repr(err))
        continue

    if not movies_list_url:
        continue

    movie_links_genre = pd.DataFrame(movies_list_url, columns=['url'])
    movie_links_genre['genre'] = movie_genre
    genre_frames.append(movie_links_genre)

# DataFrame.append was removed in pandas 2.0; concatenate once at the end.
# Per-page row indexes are kept, exactly as repeated append() calls did.
if genre_frames:
    movie_url_genre = pd.concat(genre_frames)
else:
    movie_url_genre = pd.DataFrame(columns=['url'])

https://www.rottentomatoes.com/top/bestofrt/top_100_action__adventure_movies/
https://www.rottentomatoes.com/top/bestofrt/top_100_animation_movies/
https://www.rottentomatoes.com/top/bestofrt/top_100_art_house__international_movies/
https://www.rottentomatoes.com/top/bestofrt/top_100_classics_movies/
https://www.rottentomatoes.com/top/bestofrt/top_100_comedy_movies/
https://www.rottentomatoes.com/top/bestofrt/top_100_documentary_movies/
https://www.rottentomatoes.com/top/bestofrt/top_100_drama_movies/
https://www.rottentomatoes.com/top/bestofrt/top_100_horror_movies/
https://www.rottentomatoes.com/top/bestofrt/top_100_kids__family_movies/
https://www.rottentomatoes.com/top/bestofrt/top_100_musical__performing_arts_movies/
https://www.rottentomatoes.com/top/bestofrt/top_100_mystery__suspense_movies/
https://www.rottentomatoes.com/top/bestofrt/top_100_romance_movies/
https://www.rottentomatoes.com/top/bestofrt/top_100_science_fiction__fantasy_movies/
https://www.rottentomatoes.com/top/be

In [165]:
# Persist the collected url/genre table next to the notebook, without the index column.
movie_url_genre.to_csv("movie_url_genre.csv", index=False)

In [33]:
def scrape_rotten(movies_list):
    """Scrape every movie page in ``movies_list`` into one DataFrame.

    For each URL the page is downloaded and parsed, and one row is built from
    the movie-info label/value pairs plus the columns ``movie_name``,
    ``critic_consensus``, ``critic_score``, ``audience_score``,
    ``top_5_celeb_names``, ``critic_line``, ``critic_url``,
    ``audience_review`` and ``quotes``. The URL and its zero-based position
    are printed as a progress trace.

    Args:
        movies_list: Iterable of movie page URLs.

    Returns:
        DataFrame with one row per movie; every row keeps index label 0, as
        the original row-by-row ``append`` produced. An empty DataFrame is
        returned for an empty ``movies_list``.

    Raises:
        Whatever the per-field helpers raise for a page that lacks an
        expected element (e.g. IndexError, AttributeError); nothing is
        caught here.
    """
    movie_frames = []

    for position, movie_url in enumerate(movies_list):
        print(movie_url)
        print(position)
        page = requests.get(movie_url)
        soup = bs(page.text)

        movie_name, critic_consensus = name_consensus_fn(soup)

        critic = soup.find('div', {'class': 'mop-ratings-wrap__half'})
        critic_score = critic_score_fn(critic)
        # NOTE(review): num_critics, num_audience and synopsis are scraped but
        # never stored in the result. The calls are kept so pages missing
        # these elements fail exactly as before -- decide whether to add
        # columns for them or drop the calls.
        num_critics = num_critics_fn(critic)

        audience = soup.find('div', {'class': 'mop-ratings-wrap__half audience-score'})
        audience_score = audience_score_fn(audience)
        num_audience = num_audience_fn(audience)

        tables = soup.find('div', {'class': 'panel-body content_body'})
        synopsis = synopsis_fn(tables)

        movie_info = tables.findAll('li', {'class': 'meta-row clearfix'})
        movie_info_dict = movie_info_fn(movie_info)

        top_5_celeb_names = scrape_celeb_names_fn(soup)
        critic_line, critic_url = critic_line_url_fn(soup)
        audience_review = audience_review_fn(soup)
        quotes = quotes_fn(soup)

        # Build the row as a dict so every value is stored as a single cell.
        # Assigning columns one by one raised ValueError whenever a helper
        # handed back an empty list.
        row = dict(movie_info_dict)
        row['movie_name'] = movie_name
        row['critic_consensus'] = critic_consensus
        row['critic_score'] = critic_score
        row['audience_score'] = audience_score
        row['top_5_celeb_names'] = top_5_celeb_names
        row['critic_line'] = critic_line
        row['critic_url'] = critic_url
        row['audience_review'] = audience_review
        row['quotes'] = quotes

        movie_frames.append(pd.DataFrame([row]))

    if not movie_frames:
        return pd.DataFrame()

    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every iteration; a single concat gives the same result.
    return pd.concat(movie_frames)
        

In [34]:
# Run the full scrape.
# NOTE(review): `movies_list` is not defined in any visible cell -- presumably
# the 'url' column of movie_url_genre; confirm and define it explicitly.
ff = scrape_rotten(movies_list)

https://www.rottentomatoes.com/m/black_panther_2018
0
https://www.rottentomatoes.com/m/avengers_endgame
1
https://www.rottentomatoes.com/m/mission_impossible_fallout
2
https://www.rottentomatoes.com/m/spider_man_into_the_spider_verse
3
https://www.rottentomatoes.com/m/mad_max_fury_road
4
https://www.rottentomatoes.com/m/wonder_woman_2017
5
https://www.rottentomatoes.com/m/coco_2017
6
https://www.rottentomatoes.com/m/dunkirk_2017
7
https://www.rottentomatoes.com/m/thor_ragnarok_2017
8
https://www.rottentomatoes.com/m/logan_2017
9
https://www.rottentomatoes.com/m/star_wars_the_last_jedi
10
https://www.rottentomatoes.com/m/1011615-king_kong
11
https://www.rottentomatoes.com/m/star_wars_episode_vii_the_force_awakens
12
https://www.rottentomatoes.com/m/1000355-adventures_of_robin_hood
13
https://www.rottentomatoes.com/m/spider_man_far_from_home
14
https://www.rottentomatoes.com/m/incredibles_2
15
https://www.rottentomatoes.com/m/zootopia
16
https://www.rottentomatoes.com/m/war_for_the_plane

UnboundLocalError: local variable 'top_5_celeb_names' referenced before assignment

### Movie name and consensus

In [225]:
# Fetch one sample movie page; the exploratory cells below all read `soup`.
page = requests.get("https://www.rottentomatoes.com/m/seven_samurai_1956")
soup=bs(page.text)

# Reuse the helper defined above instead of repeating its selectors. This
# also drops the dead `soup.find(...)` whose result was overwritten at once.
movie_name, critic_consensus = name_consensus_fn(soup)

In [226]:
# Display the scraped title.
movie_name

'Seven Samurai (Shichinin no Samurai)'

In [228]:

# Display the scraped critics-consensus text.
critic_consensus

"Arguably Akira Kurosawa's masterpiece, The Seven Samurai is an epic adventure classic with an engrossing story, memorable characters, and stunning action sequences that make it one of the most influential films ever made."

## Ratings

In [209]:
# First element carrying the ratings "half" class; the critic cells below read from it.
critic = soup.find('div',{'class':'mop-ratings-wrap__half'})

In [107]:
# Critic score text, or NaN when the ratings block / percentage span is missing.
try:
    critic_score = critic.find('span',{'class':'mop-ratings-wrap__percentage'}).get_text()
    print(critic_score)
except AttributeError:
    # `np.NaN` was removed in NumPy 2.0; `np.nan` works on every version.
    critic_score = np.nan


                    100%
                


In [108]:
# Raw text of the critic-count element.
num_critics = critic.find('small',{'class':'mop-ratings-wrap__text--small'}).get_text()
# Displayed only: the newline-free copy is not assigned back to num_critics.
num_critics.replace('\n','')

'                            64                    '

In [109]:
# Audience score text, or NaN when the audience block / percentage span is missing.
audience = soup.find('div',{'class':'mop-ratings-wrap__half audience-score'})
try:
    audience_score = audience.find('span',{'class':'mop-ratings-wrap__percentage'}).get_text()
    print(audience_score)
except AttributeError:
    # `np.NaN` was removed in NumPy 2.0; `np.nan` works on every version.
    audience_score = np.nan


                    97%
                


In [110]:
# Raw text of the user-ratings element inside the audience block.
num_audience = audience.find('strong',{'class':'mop-ratings-wrap__text--small'}).get_text()
num_audience

'User Ratings: 90,992'

### Movie Info

In [163]:
# First content panel of the page; the movie-info cells below read from it.
tables = soup.find('div',{'class':'panel-body content_body'})
synopsis = tables.findAll('div',{'class':'movie_synopsis clamp clamp-6 js-clamp'})[0].get_text()
# Displayed only: `synopsis` itself keeps its newlines.
synopsis.replace('\n','')

"                Akira Kurosawa's epic tale concerns honor and duty during a time when the old traditional order is breaking down. The film opens with master samurai Kambei (Takashi Shimura) posing as a monk to save a kidnapped farmer's child. Impressed by his selflessness and bravery, a group of farmers begs him to defend their terrorized village from bandits. Kambei agrees, although there is no material gain or honor to be had in the endeavor. Soon he attracts a pair of followers: a young samurai named Katsushiro (Isao Kimura), who quickly becomes Kambei's disciple, and boisterous Kikuchiyo (Toshiro Mifune), who poses as a samurai but is later revealed to be the son of a farmer. Kambei assembles four other samurais, including Kyuzo (Seiji Miyaguchi), a master swordsman, to round out the group. Together they consolidate the village's defenses and shape the villagers into a militia, while the bandits loom menacingly nearby. Soon raids and counter-raids build to a final bloody heart-wre

In [162]:
# Text of the first meta-value in the panel (the output below shows a rating, 'G', for this page).
pg = tables.find('div',{'class':'meta-value'}).get_text()
# Displayed only: `pg` itself keeps its newlines.
pg.replace('\n','')

'G                    '

In [173]:
# NOTE(review): `synopsis` is rebound here from the synopsis *text* to the list
# of movie-info rows; the cells below index into this list, so re-running the
# earlier synopsis cell breaks them.
synopsis = tables.findAll('li',{'class':'meta-row clearfix'})
# Row 1 holds the genre list for this page (see output below).
genre = synopsis[1].find('div',{'class':'meta-value'}).get_text()
genre.replace('\n','')

'                                                Action & Adventure,                                                 Art House & International,                                                 Classics,                                                 Drama                                            '

In [114]:
# Row 2 of the movie-info list: director(s) for this page (see output below).
directors = synopsis[2].findAll('div',{'class':'meta-value'})[0].get_text()
directors.replace('\n','')

'Akira Kurosawa'

In [115]:
# Row 3 of the movie-info list: writer(s) for this page (see output below).
written_by = synopsis[3].findAll('div',{'class':'meta-value'})[0].get_text()
written_by.replace('\n','')

'Akira Kurosawa,                                                 Shinobu Hashimoto,                                                 Hideo Oguni'

In [116]:
# Row 4 of the movie-info list: release date for this page (see output below).
release_date = synopsis[4].findAll('div',{'class':'meta-value'})[0].get_text()
release_date.replace('\n','')

'Nov 19, 1956                        \xa0wide'

In [120]:
# NOTE(review): for this page row 7 yields the studio name ('Columbia Pictures'
# in the output below), not a runtime -- the fixed index does not match this
# page's row layout.
run_time = synopsis[7].findAll('div',{'class':'meta-value'})[0].get_text()
run_time

'\n                        Columbia Pictures\n                    '

# Map each movie-info row's label text to its value text, for up to the
# first 12 rows.
dict_data = {}

for i in range(12):
    try:
        ii = synopsis[i].findAll('div',{'class':'meta-label subtle'})[0].get_text()
        dict_data[ii] = synopsis[i].find('div',{'class':'meta-value'}).get_text()
    except (IndexError, AttributeError):
        # IndexError: row or label missing; AttributeError: value missing.
        # Other exceptions are no longer hidden by a bare `except`.
        continue

In [119]:
# Row 8 of the movie-info list, or NaN when the page has no such row.
try:
    studio = synopsis[8].findAll('div',{'class':'meta-value'})[0].get_text()

except IndexError:
    # `np.NaN` was removed in NumPy 2.0; `np.nan` works on every version.
    studio = np.nan
print(studio)


nan


## Cast

In [None]:
def list_celeb_names(all_celebrities):
    """Return the name slugs of at most the first five cast links.

    Each slug is the link's ``href`` with its first 11 characters removed
    (presumably the ``/celebrity/`` prefix -- confirm against the site markup).
    Fewer than five links yields a shorter list.

    The original wrapped the loop in ``try`` and used ``continue`` in the
    ``except`` clause, outside any loop, which is a SyntaxError; slicing
    removes the need for the exception handler altogether.
    """
    return [link.attrs['href'][11:] for link in all_celebrities[:5]]

In [None]:
cast = soup.find('div',{'class':'castSection'})
# Start from "no names" so the join below still works when the page has no
# cast section; otherwise the name was unbound on that path.
top_5_celeb_names = []
try :
    all_celebs = cast.findAll('a',{'class':'pull-left'})
    top_5_celeb_names = list_celeb_names(all_celebs)
except AttributeError:
    # `cast` is None: keep the empty default.
    pass
top_5_celeb_names = ", ".join(top_5_celeb_names)
top_5_celeb_names

## Critics Review

In [None]:
def critic_line_url(critic_lines):
    """Split up to 20 critic quote elements into parallel text and URL lists.

    Args:
        critic_lines: Sequence of quote elements exposing ``attrs`` (with a
            ``'cite'`` key) and ``get_text()``.

    Returns:
        Tuple ``(critic_line, url)`` of equally long lists; elements missing
        either the ``cite`` attribute or the text are skipped entirely.
    """
    critic_line = []
    url = []
    for quote in critic_lines[:20]:
        try:
            # Read both pieces before storing either so the lists cannot
            # fall out of step when the second lookup fails.
            cite = quote.attrs['cite']
            text = quote.get_text()
        except (KeyError, AttributeError):
            continue
        url.append(cite)
        critic_line.append(text)
    return critic_line, url

In [None]:
# critic_review = soup.find('section',{'class':'panel panel-rt panel-box'})
# The critic quotes are read from the page panel at index 3.
critic_lines = soup.findAll('div',{'class':'panel-body content_body'})[3]
critic_lines = critic_lines.findAll('blockquote',{'class':'media-body quote_bubble__quote'})
critic_line, critic_url = critic_line_url(critic_lines)
# Collapse each list into one ", "-separated string.
critic_line, critic_url = ", ".join(critic_line), ", ".join(critic_url)
critic_url

## Audience Review

In [None]:
def audience_line(audience_review):
    """Return the text of up to the first 10 audience review comments.

    Args:
        audience_review: Element exposing ``findAll(tag, attrs)``, or ``None``
            when the page has no audience-review list.

    Returns:
        List of raw comment strings (at most 10); empty when
        ``audience_review`` is ``None`` or holds no comments.
    """
    if audience_review is None:
        return []

    # Search once rather than repeating the identical query per iteration.
    comment_nodes = audience_review.findAll(
        'div', {'class': 'mop-audience-reviews__review--comment clamp clamp-4 js-clamp'})
    return [node.get_text() for node in comment_nodes[:10]]

In [None]:
# Audience review list element of the sample page.
audience_review = soup.find('ul',{'class':'mop-audience-reviews__reviews-wrap clearfix'})
audience_lines = audience_line(audience_review)
# NOTE(review): `audience_review` is rebound from the element to the joined
# text (no separator between comments), so this cell is not re-runnable as-is.
audience_review = "".join(audience_lines)
audience_review

## Quotes

In [None]:
# Locate the quotes link in the page panel at index 4, build its absolute
# URL, then download and parse the quotes page.
quotes_url = soup.findAll('div',{'class':'panel-body content_body'})[4]
quotes_url = quotes_url.find('a',{'class':'unstyled articleLink fr'}).attrs['href']
quotes_url = 'https://www.rottentomatoes.com' + quotes_url
quotes_page = requests.get(quotes_url)
quotes_page=bs(quotes_page.text)

In [None]:
# Every <span> inside the quotes page panel at index 1.
quotes_lines = quotes_page.findAll('div',{'class':'panel-body content_body'})[1]
# quotes_lines.findAll('span',{'class':'bold quote_actor'})
quotes_lines = quotes_lines.findAll('span')

In [None]:
# Keep every second span (indexes 1, 3, 5, ...) and pull out its inner text.
quote_lines = quotes_lines[1::2]
cleaned_quotes = []
for line in quote_lines:
    # Drop newlines so `.` in the pattern can span the whole element.
    line = str(line).replace('\n','')
    match = re.search('<span>(.*)</span>',line)
    # Explicit no-match check instead of a bare `except` around `.group`.
    if match is not None:
        cleaned_quotes.append(match.group(1))

quotes = "".join(cleaned_quotes)

In [None]:
# for i in list(quotes_lines):
#     items = re.findall("bold quote_actor.*$",str(i),re.MULTILINE)
#     print(items)

In [None]:
# ,'num_critics','audience_score','num_audience',
#                           'synopsis','pg','genre','directors','written_by','release_date',
#                            'run_time','studio','top_5_celeb_names', 'critic_line', 'critic_url',
#                             'audience_review', 'quotes'

In [None]:
# Assemble the values scraped by the exploratory cells above into a
# single-row frame. The first two columns are assigned as one-element lists,
# which fixes the frame at one row; the remaining scalar values are
# broadcast to that row.
df=pd.DataFrame()
df['critic_score'] = [critic_score]
df['audience_score'] = [audience_score]
df['pg'] = pg
df['genre'] = genre
df['directors'] = directors
df['written_by'] = written_by
df['release_date'] = release_date
df['run_time'] = run_time
df['studio'] = studio
df['top_5_celeb_names'] = top_5_celeb_names
df['critic_line'] = critic_line
df['critic_url'] = critic_url
df['audience_review'] = audience_review
df['quotes'] = quotes

In [None]:
# Display the assembled single-row frame.
df