### Web Scraping on Movie Info and Reviews from rottentomatoes.com 
1. Obtain a list of urls of the latest 250 movies with DVD or Streaming options available  
2. For each movie, scrape the information and ratings. (Structured Data)
3. For each movie, scrape the user reviews. (Unstructured Data)

In [None]:
import time                  
import requests
from bs4 import BeautifulSoup
from selenium import webdriver                    
from selenium.webdriver.common.keys import Keys                      
import re
import pandas as pd
import joblib

In [None]:
def find_latest_list_of_movies(URL = 'https://www.rottentomatoes.com/browse/dvd-streaming-all', maxcount = 250):
    """Return the URLs of the movie pages listed on a Rotten Tomatoes browse page.

    The browse page is opened in Firefox through Selenium, its "Show More"
    button is clicked repeatedly so that more movies are listed, and the
    resulting HTML is parsed for the link of each listed movie.

    Args:
        URL: Address of the browse page to open.
        maxcount: Maximum number of movie URLs to return.

    Returns:
        A list of at most ``maxcount`` absolute movie-page URLs, in page order.
    """
    # Imported here so the function does not need any extra file-level import.
    from selenium.common.exceptions import (
        ElementNotInteractableException,
        NoSuchElementException,
    )

    # Launch Firefox using the selenium package
    browser = webdriver.Firefox(executable_path="./geckodriver-v0.25.0-win64/geckodriver.exe")
    try:
        # Browse to the page listing all movies with DVD or Streaming options available
        browser.get(URL)
        time.sleep(2)  # Wait for the "Show More" button to appear

        # Click "Show More" enough times to list the required number of movies.
        # NOTE(review): the divisor 32 presumably is the number of movies added
        # per click, with 2 extra clicks as a safety margin - confirm.
        total_click_counts = int(maxcount/32)+2
        for _ in range(total_click_counts):
            try:
                browser.find_element_by_class_name('mb-load-btn').click()
            except (NoSuchElementException, ElementNotInteractableException):
                # The button is gone or hidden: nothing more can be loaded,
                # so keep whatever is already listed instead of crashing.
                break
            time.sleep(1.5)  # Wait for the button to appear again
        html_source = browser.page_source
    finally:
        # Always end the browser session, even when loading the page failed.
        browser.quit()

    soup = BeautifulSoup(html_source, 'html.parser')

    # Obtain the url of the page containing the movie information for each movie
    each_movies_urls = []
    for entry in soup.find_all("div", class_="movie_info"):
        anchor = entry.find("a")
        href = anchor.get("href") if anchor is not None else None
        if href is None:
            continue  # entry without a usable link: nothing to visit
        each_movies_urls.append("https://www.rottentomatoes.com" + href)

    return each_movies_urls[:maxcount]

def get_movie_info_and_rating(url_movie):
    """Scrape the information and ratings shown on one movie page.

    Args:
        url_movie: URL of the movie page to download.

    Returns:
        A dictionary holding one entry per "meta-row" of the page (label ->
        value) followed by the keys ``title``, ``critics_consensus``,
        ``tomatometer``, ``audience_score``, ``tomatometer_count``,
        ``user_rating_count`` and ``url``. A value that cannot be found on the
        page is ``None`` (``critics_consensus`` falls back to ``""``).
    """
    content = requests.get(url_movie)
    soup = BeautifulSoup(content.text, 'html.parser')

    movie_info = {}

    # Title (kept unstripped, exactly as it appears in the page)
    title_tag = soup.find("h1", class_="mop-ratings-wrap__title mop-ratings-wrap__title--top")
    title = title_tag.text if title_tag is not None else None

    # Critics Consensus
    critics_consensus = soup.find("p", class_="mop-ratings-wrap__text mop-ratings-wrap__text--concensus")
    critics_consensus = critics_consensus.text if critics_consensus is not None else ""

    # Both scores use the same markup: the first percentage is read as the
    # Rotten Tomatoes rating and the second one as the audience rating.
    percentages = [item.text.strip() for item in soup.find_all("span", class_="mop-ratings-wrap__percentage")]
    tomatometer = percentages[0] if len(percentages) > 0 else None
    audience_score = percentages[1] if len(percentages) > 1 else None

    # Number of Rotten Tomatoes ratings given to the movie
    count_tag = soup.find("small", class_="mop-ratings-wrap__text--small")
    tomatometer_count = count_tag.text.strip() if count_tag is not None else None

    # Number of user ratings given to the movie: the text after the colon
    # in the second matching <strong> element.
    strong_tags = soup.find_all("strong", class_="mop-ratings-wrap__text--small")
    try:
        user_rating_count = strong_tags[1].text.strip().split(":")[1]
    except IndexError:
        # Fewer than two elements, or no colon in the text
        user_rating_count = None

    # Every "meta-row" contributes one label -> value pair
    for item in soup.find_all("li", class_ = "meta-row clearfix"):
        label = item.find("div", class_ = "meta-label subtle")
        value = item.find("div", class_ = "meta-value")
        if label is None or value is None:
            continue  # incomplete row: skip it rather than abort the movie
        movie_info[label.text.strip()] = value.text.strip()

    # The fixed keys are added last, so they win over a meta-row of the same name
    movie_info["title"] = title
    movie_info["critics_consensus"] = critics_consensus
    movie_info["tomatometer"] = tomatometer
    movie_info["audience_score"] = audience_score
    movie_info["tomatometer_count"] = tomatometer_count
    movie_info["user_rating_count"] = user_rating_count
    movie_info["url"] = url_movie

    return movie_info

def get_reviews_list(url_movie):
    """Collect the review texts from every review page of one movie.

    Pages ``<url_movie>/reviews?page=1``, ``page=2``, ... are downloaded in
    turn and the stripped text of each review is accumulated across pages.

    Scraping stops when a page yields no reviews, when a page repeats the
    previous page exactly, or when a review row has no review text.

    Args:
        url_movie: URL of the movie page (without the ``/reviews`` suffix).

    Returns:
        A list of review strings gathered from all pages, in page order.
    """
    reviews_list = []  # accumulated over all pages, never reset per page
    previous_page_reviews = None
    page_number = 1
    while True:
        # For each page of reviews, scrape all the reviews of that page.
        url_review = url_movie + "/reviews?page=" + str(page_number)
        content = requests.get(url_review)
        soup = BeautifulSoup(content.text, 'html.parser')

        page_reviews = []
        for item in soup.find_all("div", class_="row review_table_row"):
            review_item = item.find("div", class_ = "the_review")
            if review_item is None:
                # A row without review text ends the scraping; keep what
                # has been gathered so far, including this page.
                return reviews_list + page_reviews
            page_reviews.append(review_item.text.strip())

        # An empty page means we ran past the last page. An identical page
        # means the site ignored the page number. Either way, stop here
        # instead of requesting pages forever.
        if not page_reviews or page_reviews == previous_page_reviews:
            return reviews_list

        reviews_list.extend(page_reviews)
        previous_page_reviews = page_reviews
        page_number = page_number + 1

### Get the list of urls to the latest 250 movies with DVD or Streaming options available  

In [None]:
# Gather the movie-page URLs (capped at 250), then display the first ten as a preview
movies_urls = find_latest_list_of_movies(maxcount=250)
movies_urls[:10]

### For each movie, scrape the information and ratings.

In [None]:
%%time
# Build one info/rating dictionary per movie URL, in the order of movies_urls.
# A progress line is printed after each movie so a long run can be followed.
movie_info = []
for url_movie in movies_urls:
    movie_info.append(get_movie_info_and_rating(url_movie))
    print("Completed:", url_movie)

In [None]:
# One row per movie; the columns come from the keys of the scraped dictionaries
df = pd.DataFrame(movie_info)
df.head() # Preview of the data

In [None]:
# Temporarily store the scraped table as movie_info.csv in the working directory.
# index=False leaves the DataFrame row index out of the file.
df.to_csv("movie_info.csv", index=False)

### For each movie, scrape the user reviews.

In [None]:
%%time
# Map each movie URL (the key) to the list of reviews scraped for that movie.
# A progress line is printed after each movie so a long run can be followed.
movie_reviews = {}
for url_movie in movies_urls:
    reviews = get_reviews_list(url_movie)
    movie_reviews[url_movie] = reviews     
    print("Completed:", url_movie)

In [None]:
# Preview the first ten reviews stored for the movie at index 10 of movies_urls
movie_reviews[movies_urls[10]][:10]

In [None]:
# Persist the url -> reviews dictionary for now as a pickled file (*.pkl)
joblib.dump(movie_reviews, "movie_reviews.pkl")