### Web Scraping of Movie Info and Reviews from rottentomatoes.com
1. Obtain a list of URLs for the latest 250 movies with DVD or streaming options available.
2. For each movie, scrape the information and ratings. (Structured Data)
3. For each movie, scrape the user reviews. (Unstructured Data)

In [None]:
import time                  
import requests
from bs4 import BeautifulSoup
from selenium import webdriver                    
from selenium.webdriver.common.keys import Keys                      
import re
import pandas as pd
import joblib

In [None]:
def find_latest_list_of_movies(URL = 'https://www.rottentomatoes.com/browse/dvd-streaming-all', maxcount = 250):
    """Return up to ``maxcount`` absolute movie-page URLs from the browse page.

    A Firefox window is driven with Selenium so that the "Show More" button
    can be clicked repeatedly before the rendered HTML is parsed.

    Args:
        URL: Listing page to open.
        maxcount: Maximum number of movie URLs to return.

    Returns:
        A list of at most ``maxcount`` URL strings, in page order.
    """
    # Local import: only this function needs the exception types, and the
    # file already depends on selenium.
    from selenium.common.exceptions import (
        InvalidElementStateException,
        NoSuchElementException,
    )

    # Launch Firefox using selenium package
    browser = webdriver.Firefox(executable_path="./geckodriver-v0.25.0-win64/geckodriver.exe")
    try:
        # Browse to the page listing all movies with DVD or Streaming options available
        browser.get(URL)
        time.sleep(2)  # Wait for "Show More" button to appear

        # Click the "Show More" button enough times to list the required movies.
        # NOTE(review): 32 is presumably the number of movies added per click -- confirm.
        total_click_counts = int(maxcount / 32) + 2
        for _ in range(total_click_counts):
            try:
                browser.find_element_by_class_name('mb-load-btn').click()  # Click "Show More"
            except (NoSuchElementException, InvalidElementStateException):
                # The button is gone or hidden: stop clicking and parse
                # whatever has been loaded instead of crashing.
                break
            time.sleep(1.5)  # Wait for button to appear
        html_source = browser.page_source
    finally:
        # Always end the WebDriver session, even when scraping fails part-way.
        browser.quit()

    soup = BeautifulSoup(html_source, 'html.parser')

    # Obtain the url to the page containing the movie information for each movie,
    # skipping entries that carry no link rather than raising on them.
    each_movies_urls = []
    for info in soup.find_all("div", class_="movie_info"):
        anchor = info.find("a")
        href = anchor.get("href") if anchor is not None else None
        if href:
            each_movies_urls.append("https://www.rottentomatoes.com" + href)

    return each_movies_urls[:maxcount]

def get_movie_info_and_rating(url_movie):
    """Scrape the metadata and rating figures from one movie page.

    Args:
        url_movie: Absolute URL of the movie page.

    Returns:
        A dict holding one entry per "meta-row" label found on the page, plus
        the fixed keys ``title``, ``critics_consensus``, ``tomatometer``,
        ``audience_score``, ``tomatometer_count``, ``user_rating_count`` and
        ``url``. Fields that cannot be located are ``None`` (the critics
        consensus falls back to ``""``).

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched,
            including when the timeout expires.
    """
    # A timeout keeps one unresponsive request from hanging the whole run.
    content = requests.get(url_movie, timeout=30)
    soup = BeautifulSoup(content.text, 'html.parser')

    movie_info = {}

    # Free-form label/value rows; rows missing either half are skipped.
    for item in soup.find_all("li", class_="meta-row clearfix"):
        label = item.find("div", class_="meta-label subtle")
        value = item.find("div", class_="meta-value")
        if label is None or value is None:
            continue
        movie_info[label.text.strip()] = value.text.strip()

    # Title
    title_tag = soup.find("h1", class_="mop-ratings-wrap__title mop-ratings-wrap__title--top")
    title = title_tag.text if title_tag is not None else None

    # Critics Consensus
    consensus_tag = soup.find("p", class_="mop-ratings-wrap__text mop-ratings-wrap__text--concensus")
    critics_consensus = consensus_tag.text if consensus_tag is not None else ""

    # Rotten Tomatoes rating (first percentage) and audience rating (second).
    # NOTE(review): this is purely positional -- if a page shows only one
    # percentage it is reported as the tomatometer; confirm against the site.
    percentages = [
        item.text.strip()
        for item in soup.find_all("span", class_="mop-ratings-wrap__percentage")
    ]
    tomatometer = percentages[0] if len(percentages) > 0 else None
    audience_score = percentages[1] if len(percentages) > 1 else None

    # Number of Rotten Tomatoes rating given to the movie
    count_tag = soup.find("small", class_="mop-ratings-wrap__text--small")
    tomatometer_count = count_tag.text.strip() if count_tag is not None else None

    # Number of user rating given to the movie: the text after the first ":"
    # of the second matching <strong> element.
    user_rating_count = None
    strong_tags = soup.find_all("strong", class_="mop-ratings-wrap__text--small")
    if len(strong_tags) > 1:
        parts = strong_tags[1].text.strip().split(":")
        if len(parts) > 1:
            user_rating_count = parts[1]

    movie_info["title"] = title
    movie_info["critics_consensus"] = critics_consensus
    movie_info["tomatometer"] = tomatometer
    movie_info["audience_score"] = audience_score
    movie_info["tomatometer_count"] = tomatometer_count
    movie_info["user_rating_count"] = user_rating_count
    movie_info["url"] = url_movie

    return movie_info

def get_reviews_list(url_movie, max_pages=None):
    """Collect the review texts of a movie by walking its paginated reviews.

    Pages ``<url_movie>/reviews?page=1``, ``?page=2``, ... are fetched until a
    page contains no review rows.

    Args:
        url_movie: Absolute URL of the movie page (no trailing slash).
        max_pages: Optional upper bound on the number of pages fetched.
            ``None`` (the default) keeps paging until an empty page is seen.

    Returns:
        A list of stripped review strings in page order.

    Raises:
        requests.exceptions.RequestException: If a page cannot be fetched,
            including when the timeout expires.
    """
    reviews_list = []
    page_number = 1
    while max_pages is None or page_number <= max_pages:
        # For each page of reviews, scrap all the reviews and append them to the reviews_list.
        url_review = url_movie + "/reviews?page=" + str(page_number)
        # A timeout keeps one unresponsive request from hanging the whole run.
        content = requests.get(url_review, timeout=30)
        soup = BeautifulSoup(content.text, 'html.parser')
        item_list = soup.find_all("div", class_="row review_table_row")
        if not item_list:
            break

        for item in item_list:
            review_item = item.find("div", class_="the_review")
            if review_item is None:
                # A row without review text ends the whole scrape; this is
                # kept as-is and returns what has been collected so far.
                return reviews_list
            reviews_list.append(review_item.text.strip())

        page_number += 1

    return reviews_list
        
        
def get_text(browser, xpath):
    """Return the text of the first element matching ``xpath``, or ``None``.

    Args:
        browser: Object exposing ``find_elements_by_xpath``.
        xpath: XPath expression to look up.

    Returns:
        The ``text`` attribute of the first match, or ``None`` when nothing
        matches or the lookup fails.
    """
    try:
        return browser.find_elements_by_xpath(xpath)[0].text
    except Exception:
        # Deliberately best-effort: a missing element or a driver error just
        # means "no value". Catching Exception instead of using a bare
        # ``except`` lets KeyboardInterrupt/SystemExit propagate, so a long
        # scrape can still be interrupted.
        return None
    
def open_browser():
    """Start a Firefox WebDriver session with the bundled geckodriver and return it."""
    return webdriver.Firefox(
        executable_path="./geckodriver-v0.25.0-win64/geckodriver.exe"
    )

def scrap_box_office_data(browser, movie_title):
    """Search boxofficemojo.com for a title and scrape its box-office figures.

    The first search hit is opened and the figures are read from fixed XPath
    positions on the resulting page.

    Args:
        browser: An open Selenium WebDriver session; it is navigated but not
            closed by this function.
        movie_title: Text typed into the site's search box.

    Returns:
        ``{}`` when the search box or a first search hit cannot be found.
        Otherwise a dict with ``domestic_gross``, ``foreign_gross``,
        ``total_gross``, ``opening_weekend``, ``stats`` and ``title``, plus up
        to three entries whose keys are the row labels read from the page.
        Values that cannot be located are ``None``.
    """
    URL = "https://www.boxofficemojo.com/"
    browser.get(URL)
    time.sleep(1)

    # find_elements returns a (possibly empty) list, so a missing search box
    # can be reported instead of raising IndexError.
    search_boxes = browser.find_elements_by_xpath('/html/body/div/div[3]/div[1]/ul/li[2]/form/input[1]')
    if not search_boxes:
        print("Search box not found for:", movie_title)
        return {}
    search = search_boxes[0]
    search.send_keys(movie_title)
    time.sleep(1)
    search.send_keys(Keys.ENTER)
    time.sleep(1)

    first_search = browser.find_elements_by_xpath('/html/body/div/div[3]/div[2]/table[2]/tbody/tr/td/table[2]/tbody/tr[2]/td[1]/b/font/a')
    if not first_search:
        print("No movie found:", movie_title)
        return {}
    first_search[0].click()

    # Every figure lives below the same container, so only the tails differ.
    base = '/html/body/div/div[3]/div[2]/table[2]/tbody/tr/td/table[2]/tbody/tr[2]/td/table/tbody/tr/td[1]/table/tbody/tr/td[1]/'
    totals = base + 'div[1]/div[2]/table/tbody/'
    details = base + 'div[2]/div[2]/'

    box_office_dict = {}
    box_office_dict["domestic_gross"] = get_text(browser, totals + 'tr[1]/td[2]/b')
    box_office_dict["foreign_gross"] = get_text(browser, totals + 'tr[2]/td[2]')
    box_office_dict["total_gross"] = get_text(browser, totals + 'tr[4]/td[2]/b')
    box_office_dict["opening_weekend"] = get_text(browser, details + 'table[1]/tbody/tr[1]/td[2]')
    box_office_dict["stats"] = get_text(browser, details + 'table[1]/tbody/tr[2]/td/font')

    # These three rows are stored under the label shown on the page. A row
    # whose label is missing or empty is skipped: previously every such row
    # was written under the same None key, so they overwrote one another and
    # produced a meaningless column.
    for table_index in (2, 3, 4):
        row = details + 'table[%d]/tbody/tr/' % table_index
        label = get_text(browser, row + 'td[1]')
        if label:
            box_office_dict[label] = get_text(browser, row + 'td[2]')

    box_office_dict["title"] = movie_title

    return box_office_dict

### Get the list of urls to the latest 250 movies with DVD or Streaming options available  

In [None]:
# Drive a browser through the listing page and keep at most 250 movie-page URLs.
movies_urls = find_latest_list_of_movies(maxcount = 250)
movies_urls[0:10] # preview a few urls

### For each movie, scrape the information and ratings.

In [None]:
%%time
# Scrape every movie page in turn; each call yields one dict, collected as a list of records.
movie_info = []
for url_movie in movies_urls:
    movie_info.append(get_movie_info_and_rating(url_movie))
    print("Completed:", url_movie)  # progress trace, one line per movie

In [None]:
# Build a table with one row per scraped movie dict.
df = pd.DataFrame(movie_info)
df.head() # Preview of the data

In [None]:
# Temporarily store the scraped table as CSV (row index omitted) so it can be
# reloaded later without scraping again.
df.to_csv("movie_info.csv", index=False)

### For each movie, scrape the user reviews.

In [None]:
%%time
# Map each movie URL to the list of review texts scraped for it.
movie_reviews = {}
for url_movie in movies_urls:
    reviews = get_reviews_list(url_movie)
    movie_reviews[url_movie] = reviews     
    print("Completed:", url_movie)  # progress trace, one line per movie

In [None]:
# First 10 reviews of the movie at index 10; needs at least 11 scraped URLs.
movie_reviews[movies_urls[10]][0:10] # Preview some user reviews of a movie

In [None]:
# Temporarily save the {url: [review, ...]} dictionary with joblib as *.pkl.
# NOTE(review): pickle-based files should only be loaded from trusted sources.
joblib.dump(movie_reviews, filename = "movie_reviews.pkl")

### Scrape Box Office Data

In [92]:
# Reload the movie table from the CSV file; its "title" column drives the box-office search below.
df = pd.read_csv("movie_info.csv")

In [93]:
%%time 
browser = open_browser()

box_office_list = []
for movie_title in df.title.values:
    data = scrap_box_office_data(browser, movie_title)
    box_office_list.append(data)
    
browser.close()

No movie found: The Grief of Others
No movie found: My Son (Mon garçon)
No movie found: Four Hands (Die Vierhändige)
No movie found: In the Aisles (In den Gängen)
No movie found: The Wedding Year
No movie found: The Weekend
No movie found: A Bread Factory, Part One: For the Sake of Gold
No movie found: A Bread Factory, Part Two: Walk with Me a While
No movie found: Skin
No movie found: Empathy, Inc.
No movie found: Tall Girl
No movie found: Corporate Animals
No movie found: Auggie
No movie found: Between Two Ferns: The Movie
No movie found: Running with the Devil
No movie found: The Last Photograph
No movie found: Blue Note Records: Beyond the Notes
No movie found: Ladyworld
No movie found: Night Hunter (Nomis)
No movie found: Darlin'
No movie found: Anna
No movie found: Apocalypse Now: Final Cut
No movie found: Satanic Panic
No movie found: Strange But True
No movie found: Angel of Mine
No movie found: Itsy Bitsy
No movie found: The Parting Glass
No movie found: Artik
No movie found: 

In [94]:
# One row per looked-up title, built from the list of result dicts.
box_office_df = pd.DataFrame(box_office_list)

In [103]:
# Save the box-office table as CSV, omitting the row index.
box_office_df.to_csv("movie_box_office.csv", index = False)