# This notebook does the following: 
    
1. Scrape multiple pages on imdb.com to get a list of notable movies
2. Google-search each of these movies to find its Rotten Tomatoes page
3. Scrape the movie homepage on rotten tomatoes to get its known consensus summary
    - this text will be used as the target labels for a fine-tuned summarization model
4. Scrape the movie user/critic reviews
    - this text will be used as the training input for a fine-tuned summarization model


In [None]:
import pandas as pd
import requests
import re
import time
from bs4 import BeautifulSoup
import google
from tqdm import tqdm
session = requests.Session()

# setting up for scrape of imdb

In [None]:
# Genre names to scrape, stored lower-cased because they are interpolated
# into the search URL's `genres=` query parameter further down.
categories = [
    genre_name.lower()
    for genre_name in (
        'Action',
        'Adventure',
        'Animation',
        'Biography',
        'Comedy',
        'Crime',
        'Documentary',
        'Drama',
        'Family',
        'Fantasy',
        'Film Noir',
        'History',
        'Horror',
        'Music',
        'Musical',
        'Mystery',
        'Romance',
        'Sci-Fi',
        'Sport',
        'Superhero',
        'Thriller',
        'War',
        'Western',
    )
]
print(len(categories))
categories

# 1. Scraping IMDB 

## Scraping top 250 movies from all movie genres on imdb 

In [None]:
# Build {genre: [titles]} by walking the paginated IMDb search results for
# each genre: 5 pages of 50 titles (start=1, 51, ..., 201), sorted by rating.
movie_dict = dict()
for genre in tqdm(categories[:]):
    movie_dict[genre] = list()
    for count in range(1, 251, 50):
        try:
            base_url = f'https://www.imdb.com/search/title/?title_type=feature&num_votes=25000,&genres={genre}&view=simple&sort=user_rating,desc&start={count}&ref_=adv_nxt'
            # A timeout keeps one stalled connection from hanging the whole scrape.
            response = requests.get(base_url, timeout=30)
            # Treat HTTP errors as failures instead of parsing an error page.
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')
            listing = soup.find_all(class_="lister-list")
            title_cells = listing[0].find_all(class_="col-title")

            for cell in title_cells:
                # The title is taken from the third newline-separated piece of
                # the cell text, as in the original scrape.
                title = cell.text.strip().split('\n')[2].strip()
                movie_dict[genre].append(title)
        except Exception as exc:
            # `Exception` (not a bare except) so Ctrl-C still stops the run;
            # the reason is printed so failures can be diagnosed.
            print(f'failed at {genre} at {count}: {exc!r}')

In [None]:
# Flatten the per-genre lists into a single list of titles.
# A movie listed under several genres would otherwise be searched and scraped
# once per genre, so duplicates are dropped here; dict.fromkeys keeps the
# first occurrence of each title and preserves the original order.
all_movies_list = list(
    dict.fromkeys(
        title
        for genre_titles in movie_dict.values()
        for title in genre_titles
    )
)

len(all_movies_list)

# 2) Google each movie to get exact movie title url for rotten tomatoes 
# 3) then scrape movie homepage on rotten tomatoes

Note: Rotten Tomatoes doesn't want their search used for scraping, so using Google search is the workaround.

In [None]:
from googlesearch import search

In [None]:
def google_movie(query):
    '''
    Run a Google search for `query` and return the first hit that
    `search` yields (only one result is requested).
    '''
    hits = list(search(query, num=1, stop=1, pause=2))
    return hits[0]

In [None]:
def get_rt_consensus(url: str) -> tuple:
    '''
    Extracts critic and user summaries when they are on the homepage
    of the rt movie title

    Args:
        url: address of the movie page to download. (The original signature
            used `url = str`, which made the builtin type the default value;
            it is now a proper annotation.)

    Returns:
        (critic, user): the text of the first <span> inside the first and
        second `what-to-know__section-body` paragraphs respectively. Either
        element is None when that paragraph or its <span> is missing.

    Raises:
        requests.RequestException: if the page cannot be downloaded
            (including when the 30 second timeout expires).
    '''
    # A timeout keeps one stalled connection from hanging the calling loop.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')
    sections = soup.find_all('p', class_="what-to-know__section-body")

    def first_span_text(position):
        # IndexError covers both "no such section" and "section has no <span>";
        # any other error is a real bug and is allowed to propagate.
        try:
            return sections[position].find_all('span')[0].text
        except IndexError:
            return None

    critic = first_span_text(0)
    user = first_span_text(1)
    return critic, user

In [None]:
#Note: tasks 2. and 3. are in same loop
info = []
# `failed` must exist before the loop: without it the first failure raises
# NameError inside the except handler and aborts the whole run.
failed = []
count = 0
# NOTE(review): the [:2] slice limits this run to the first two titles --
# presumably a smoke-test leftover; widen it for a full scrape.
for movie in tqdm(all_movies_list[:2]):
    try:
        query = f'{movie} rotten tomatoes'
        url = google_movie(query)  # task number 2
        critic, user = get_rt_consensus(url=url)  # task number 3
        info.append((url, movie, critic, user))
    except Exception:
        # `Exception` rather than a bare except so Ctrl-C still interrupts.
        failed.append(movie)
    count += 1
    # Checkpoint every 50 movies so a crash does not lose everything.
    if count % 50 == 0:
        print(f'count is at {count} so writing out data')
        df = pd.DataFrame(info, columns=['url', 'movie', 'critic_summary', 'user_summary'])
        df.to_csv('movie_data.csv', index=False)

In [None]:
# saving out final dataframe
# Rebuild the frame from `info` first: any `df` left over from the scraping
# loop is only the last 50-movie checkpoint (so it can miss the most recent
# rows), and it does not exist at all if fewer than 50 movies were processed.
df = pd.DataFrame(info, columns=['url', 'movie', 'critic_summary', 'user_summary'])
df.to_csv('movie_data_final.csv', index=False)

In [None]:
# One row per successfully scraped movie, in the order stored in `info`.
summary_columns = ['url', 'movie', 'critic_summary', 'user_summary']
df = pd.DataFrame(data=info, columns=summary_columns)
df

In [None]:
# Rows actually scraped vs. number of candidate titles.
df.shape[0], len(all_movies_list)

# 4) Scrape Rotten Tomatoes for critic/user reviews

In [None]:
class ScrapeRtForDB():
    '''
    Instance scrapes rotten tomatoes reviews
    ...
    Attributes:
        movie_title: str
            Title of the movie as it appears in the rotten tomatoes URL,
            i.e. the part interpolated into `/m/<movie_title>/reviews`.
        reviewer: str
            Select reviewer pool from list: ['critic', 'user']. Default is user.
        scraping_limit: int
            Number of pages to stop scraping. Default is 10.
        write_data : boolean
            Writes data out to pickle object. Default is False.
        review_df: pandas.DataFrame or None
            Result of the most recent scrape; None until a scrape succeeds.
        user_cols / critic_cols: list of str
            Columns kept by filter_df for each reviewer pool.
    Methods:
        scrape_reviews
            Scrapes rotten tomatoes page
        filter_df
            Filters scraped df
        write_df
            Writes out data
        run_for_reviews
            Runs all methods above
    '''
    # Seconds to wait for any single HTTP response before giving up.
    REQUEST_TIMEOUT = 30

    # Review-API path suffix for each supported reviewer pool.
    _API_ENDPOINTS = {
        'critic': 'criticsReviews/all',
        'user': 'reviews/user',
    }

    def __init__(
        self,
        movie_title = '',
        reviewer = 'user',
        scraping_limit = 10,
        write_data = False
    ):

        self.movie_title = movie_title
        self.reviewer = reviewer
        self.scraping_limit = scraping_limit
        self.write_data = write_data
        self.review_df = None
        self.user_cols = ['rating', 'review', 'displayName', 'isVerified', 'isSuperReviewer']
        self.critic_cols = ['creationDate', 'isFresh', 'isRotten', 'isTop', 'reviewUrl',
                           'quote','scoreSentiment', 'critic.name','publication.name']
        print('\n --------------------- \n ')
        print('Scrapped Initiated')

    def _review_api_url(self, movie_id):
        '''
        Builds the review API url for `movie_id` and the selected reviewer pool.

        Raises ValueError when self.reviewer is not 'critic' or 'user'
        (previously this surfaced later as an unbound `api_url` variable).
        '''
        try:
            endpoint = self._API_ENDPOINTS[self.reviewer]
        except KeyError:
            raise ValueError(
                f"reviewer must be one of {sorted(self._API_ENDPOINTS)}, got {self.reviewer!r}"
            ) from None
        return f"https://www.rottentomatoes.com/napi/movie/{movie_id}/{endpoint}"

    def scrape_reviews(self):
        '''
        Scrapes rotten tomatoes for reviews and related info, and updates self.review_df with data

        Collects at most self.scraping_limit pages, following the cursor
        returned in each response's `pageInfo`.

        Raises:
            IndexError: the movie id could not be found in the reviews page.
            ValueError: self.reviewer is not a supported pool.
        '''
        # The instance is reused across movies; clear the previous result so a
        # failure below cannot leave another movie's data in review_df.
        self.review_df = None

        url = f'https://www.rottentomatoes.com/m/{self.movie_title}/reviews'
        r = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        # Non-greedy capture: stop at the FIRST `","type` after the id rather
        # than the last one on the line.
        id_matches = re.findall(r'(?<=movieId":")(.*?)(?=","type)', r.text)
        if not id_matches:
            # IndexError kept for compatibility with the original `findall(...)[0]`.
            raise IndexError(f'movieId not found in {url}')
        movie_id = id_matches[0]

        api_url = self._review_api_url(movie_id)
        payload = {
            'direction': 'next',
            'endCursor': '',
            'startCursor': '',
        }
        pages_scraped = 0
        review_data = []
        while True:
            # Checked before fetching so no request is wasted once the limit
            # is hit (and a limit of 0 collects nothing, as before).
            if pages_scraped == self.scraping_limit:
                print('Scraping limit reached')
                break

            r = session.get(api_url,
                      params=payload,
                      timeout=self.REQUEST_TIMEOUT)
            data = r.json()
            page_info = data['pageInfo']

            # Store the page BEFORE testing hasNextPage: the original broke out
            # first, which dropped the final page (and therefore every review
            # of a movie whose reviews fit on a single page).
            review_data.extend(data['reviews'])
            pages_scraped += 1
            if pages_scraped % 50 == 0:
                print(f'Pages scraped: {pages_scraped}')

            if not page_info['hasNextPage']:
                print('Scaping completed')
                break

            payload['endCursor'] = page_info['endCursor']
            payload['startCursor'] = page_info.get('startCursor') or ''
            # Short pause between consecutive page requests.
            time.sleep(.1)

        self.review_df =  pd.json_normalize(review_data)
        return

    def filter_df(self):
        '''
        takes in self.review_df and reduces it to the columns of interest:
        self.user_cols for user reviews, self.critic_cols for critic reviews.
        Any other reviewer value leaves self.review_df untouched.

        Raises KeyError if an expected column is missing from the scraped data.
        '''
        if self.reviewer == 'user':
            wanted_cols = self.user_cols
        elif self.reviewer == 'critic':
            wanted_cols = self.critic_cols
        else:
            return
        self.review_df = self.review_df[wanted_cols].copy()
        return

    def write_df(self):
        '''
        If self.write_data is truthy, a pickled dataframe will be written to
        data/<reviewer>_<movie_title>.pkl
        '''
        if not self.write_data:
            return
        import os
        # to_pickle does not create missing directories.
        os.makedirs('data', exist_ok=True)
        self.review_df.to_pickle(f'data/{self.reviewer}_{self.movie_title}.pkl')

    def run_for_reviews(self):
        '''
        runs all class methods -> scrape_reviews, filter_df, and write_df

        Returns self.review_df on success. On any failure a message is
        printed and None is returned, so callers detect failure by checking
        for None rather than by catching an exception.
        '''
        try:
            self.scrape_reviews()
            self.filter_df()
            self.write_df()
            return self.review_df

        except Exception as exc:
            # `Exception` rather than a bare except so Ctrl-C can still stop a
            # long scraping loop.
            print(f'Could not find reviews for *{self.movie_title}*... Please try to find another one' )
            print(f'  reason: {exc!r}')
            return None

In [None]:
# Keep only the text after the last 'm/' in each url, which is the piece
# later interpolated into the reviews url as the movie title. A url without
# 'm/' is kept whole.
rt_movie_names = [movie_url.rpartition('m/')[2] for movie_url in df['url']]
rt_movie_names[:5]

In [None]:
# Scrape critic reviews for every movie, reusing one scraper instance and
# recording the titles for which nothing could be scraped.
failed_critic = []

scraper = ScrapeRtForDB(movie_title = None,
                      scraping_limit= 200,
                      reviewer = 'critic',
                      write_data = True
                    )

for title in tqdm(rt_movie_names[:]):
    scraper.movie_title = title
    try:
        critic_reviews = scraper.run_for_reviews()
    except Exception:
        critic_reviews = None
    # run_for_reviews handles its own errors and returns None on failure, so
    # an except clause alone never fires and the failed list would stay empty.
    # `is None` is used because a DataFrame has no unambiguous truth value.
    if critic_reviews is None:
        failed_critic.append(title)


In [None]:
# Scrape user reviews for every movie, reusing one scraper instance and
# recording the titles for which nothing could be scraped.
failed_user = []

scraper = ScrapeRtForDB(movie_title = None,
                      scraping_limit= 10,
                      reviewer = 'user',
                      write_data = True
                    )

for title in tqdm(rt_movie_names[:]):
    scraper.movie_title = title
    try:
        user_reviews = scraper.run_for_reviews()
    except Exception:
        user_reviews = None
    # run_for_reviews handles its own errors and returns None on failure, so
    # an except clause alone never fires and the failed list would stay empty.
    # `is None` is used because a DataFrame has no unambiguous truth value.
    if user_reviews is None:
        failed_user.append(title)
