# Data Mining Project: Milestone 1 (Part 2)
### Web Scraping (Part 2): Scrape box office data from boxofficemojo.com
1. Using selenium, open a browser and browse to https://www.boxofficemojo.com/
2. Search the desired movie title on boxofficemojo.com
3. Click the first search result to open the movie page
4. Extract the box office data of the movie

In [None]:
import re
import time    
import joblib
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver                    
from selenium.webdriver.common.keys import Keys                      

def open_browser(executable_path="./geckodriver-v0.25.0-win64/geckodriver.exe"):
    '''Open a Firefox browser using selenium.

    Parameters:
        executable_path: path to the geckodriver binary. Defaults to the
            Windows build bundled next to this notebook; pass another path
            to run on a different machine or operating system.

    Returns:
        The selenium Firefox WebDriver instance. The caller is responsible
        for closing it.
    '''
    browser = webdriver.Firefox(executable_path=executable_path)
    return browser


def get_text(browser, xpath):
    '''Extract the text of the first element matching xpath.

    Parameters:
        browser: object exposing find_elements_by_xpath (a selenium WebDriver).
        xpath: XPath expression locating the element.

    Returns:
        The .text of the first matching element, or None when nothing
        matches or the lookup fails.
    '''
    try:
        return browser.find_elements_by_xpath(xpath)[0].text
    except Exception:
        # Best-effort lookup: a missing element (IndexError on the empty
        # result list) or a driver error yields None. Catching Exception
        # rather than using a bare except lets KeyboardInterrupt and
        # SystemExit propagate, so a long scraping run can still be stopped.
        return None

def scrap_box_office_data(browser, movie_title):
    '''Scrape the box office data of one movie from boxofficemojo.com.

    Loads the home page, types movie_title into the search box, opens the
    first search result and reads the box office figures from that page.

    Parameters:
        browser: selenium WebDriver used to drive the site.
        movie_title: title typed into the site's search box.

    Returns:
        A dict with the keys "domestic_gross", "foreign_gross",
        "total_gross", "opening_weekend", "stats" and "title", plus one
        entry per labelled release row found on the page (the row's own
        label text is used as the key). Values are the element texts, or
        None when an element is missing. An empty dict is returned when
        the search yields no result.

    Raises:
        IndexError: if the search box is not found on the home page.
    '''
    URL = "https://www.boxofficemojo.com/"
    browser.get(URL)
    time.sleep(1)

    # Search the movie title on boxofficemojo.com
    search = browser.find_elements_by_xpath('/html/body/div/div[3]/div[1]/ul/li[2]/form/input[1]')[0]
    search.send_keys(movie_title)
    time.sleep(1)
    search.send_keys(Keys.ENTER)
    time.sleep(1)

    # On the search result page, click on the first result to go to the desired movie page
    first_search = browser.find_elements_by_xpath('/html/body/div/div[3]/div[2]/table[2]/tbody/tr/td/table[2]/tbody/tr[2]/td[1]/b/font/a')
    if not first_search:
        print("No movie found:", movie_title)
        return {}
    first_search[0].click()

    # Every figure lives under the same summary cell of the movie page, so the
    # shared XPath prefix is written once and the per-field suffix appended.
    summary_cell = '/html/body/div/div[3]/div[2]/table[2]/tbody/tr/td/table[2]/tbody/tr[2]/td/table/tbody/tr/td[1]/table/tbody/tr/td[1]'
    gross_rows = summary_cell + '/div[1]/div[2]/table/tbody'
    release_box = summary_cell + '/div[2]/div[2]'

    fixed_fields = (
        ("domestic_gross", gross_rows + '/tr[1]/td[2]/b'),
        ("foreign_gross", gross_rows + '/tr[2]/td[2]'),
        ("total_gross", gross_rows + '/tr[4]/td[2]/b'),
        ("opening_weekend", release_box + '/table[1]/tbody/tr[1]/td[2]'),
        ("stats", release_box + '/table[1]/tbody/tr[2]/td/font'),
    )

    # On the movie page, scrap all the box office data
    box_office_dict = {}
    for field_name, field_xpath in fixed_fields:
        box_office_dict[field_name] = get_text(browser, field_xpath)

    # Tables 2-4 of the release box hold one labelled row each (the original
    # variable names suggest widest release, close date and days in release).
    # The label shown on the page is used as the dict key.
    for table_index in (2, 3, 4):
        row_xpath = release_box + '/table[%d]/tbody/tr' % table_index
        label = get_text(browser, row_xpath + '/td[1]')
        value = get_text(browser, row_xpath + '/td[2]')
        # A missing row has no label; skip it so that several missing rows do
        # not collide on a single None key (which would also become a
        # nameless column once the results are put into a DataFrame).
        if label is not None:
            box_office_dict[label] = value

    box_office_dict["title"] = movie_title

    return box_office_dict

### Load the 250 movie titles that we obtained from Part 1.

In [None]:
# Read the movie list saved as movie_info.csv; its `title` column provides
# the titles that are searched on boxofficemojo.com further down.
df = pd.read_csv("movie_info.csv")

### Scrape the box office data of the 250 movies.
*This will take approximately 20 minutes.*

In [None]:
%%time 
# Drive a single browser session through every title read from the CSV.
browser = open_browser()

box_office_list = []
try:
    # Append row by row (rather than building the list in one expression) so
    # that the rows scraped so far are kept if a later title raises.
    for movie_title in df.title.values:
        data = scrap_box_office_data(browser, movie_title)
        box_office_list.append(data)
finally:
    # Always end the WebDriver session, even when a scrape fails, so the
    # browser and driver process are not left running.
    browser.quit()


#### Temporarily store these data in a CSV file.
*These data will be stored in the Hive Data Warehouse in the next milestone.*

In [None]:
# Turn the list of per-movie dicts into a table (one row per movie) and
# write it to disk without the index column.
box_office_df = pd.DataFrame(data=box_office_list)
box_office_df.to_csv(path_or_buf="movie_box_office.csv", index=False)