# Data Mining Project: Milestone 1 (Part 2)
### Web Scraping (Part 2): Scrape box office data from boxofficemojo.com
1. Using selenium, open a browser and browse to https://www.boxofficemojo.com/
2. Search the desired movie title on boxofficemojo.com
3. Click into the first result page
4. Extract the box office data of the movie

In [1]:
import re
import time    
import joblib
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver                    
from selenium.webdriver.common.keys import Keys                      

def open_browser(executable_path="./geckodriver-v0.25.0-win64/geckodriver.exe"):
    '''Open a Firefox browser using selenium.

    Parameters
    ----------
    executable_path : str, optional
        Path to the geckodriver executable. Defaults to the relative path of
        the Windows geckodriver v0.25.0 binary used by this notebook, so
        existing calls without arguments keep working.

    Returns
    -------
    The ``webdriver.Firefox`` instance. The caller is responsible for
    shutting it down when finished.
    '''
    return webdriver.Firefox(executable_path=executable_path)


def get_text(browser, xpath):
    '''Extract the text of the first element matching an xpath.

    Parameters
    ----------
    browser : object exposing ``find_elements_by_xpath(xpath)``
        The selenium browser to query.
    xpath : str
        XPath expression locating the element.

    Returns
    -------
    The ``.text`` of the first matching element, or None when nothing
    matches or the lookup fails.
    '''
    try:
        elements = browser.find_elements_by_xpath(xpath)
        return elements[0].text if elements else None
    except Exception:
        # Best-effort scraping: any lookup failure maps to None. Catching
        # Exception (rather than a bare except) lets KeyboardInterrupt and
        # SystemExit propagate, so a long scrape can still be interrupted.
        return None

def scrap_box_office_data(browser, movie_title):
    '''Search boxofficemojo.com for a movie and scrape its box office data.

    Opens the site, types ``movie_title`` into the search box, follows the
    first search result and reads the box office figures from the movie page
    using fixed absolute xpaths.

    Parameters
    ----------
    browser : selenium browser
        An already opened browser (see ``open_browser``).
    movie_title : str
        Title to search for.

    Returns
    -------
    dict
        Keys ``movie_title``, ``domestic_gross``, ``foreign_gross``,
        ``total_gross``, ``domestic_opening``, ``mpaa`` and ``markets``.
        A value is None when its element cannot be read. An empty dict is
        returned when the search has no result or any step raises; in both
        cases a message is printed.
    '''
    search_box_xpath = '//*[@id="mojo-search-text-input"]'
    first_result_xpath = '/html/body/div[1]/main/div/div/div/div[1]/div/div[2]/a'

    # Absolute xpaths of the fields on the movie page.
    # The distributor ('/html/body/div[1]/main/div/div[3]/div[4]/div[1]/span[2]')
    # is currently not collected.
    domestic_xpath = '/html/body/div[1]/main/div/div[3]/div[1]/div/div[1]/span[2]/span'
    foreign_xpath = '/html/body/div[1]/main/div/div[3]/div[1]/div/div[2]/span[2]/span'
    total_xpath = '/html/body/div[1]/main/div/div[3]/div[1]/div/div[3]/span[2]/span'
    opening_xpath = '/html/body/div[1]/main/div/div[3]/div[4]/div[2]/span[2]/span'
    mpaa_xpath = '/html/body/div[1]/main/div/div[3]/div[4]/div[4]/span[2]'
    mpaa_confirm_xpath = '/html/body/div[1]/main/div/div[3]/div[4]/div[4]/span[1]'
    markets_xpath = '/html/body/div[1]/main/div/div[5]/div/div/table/tbody/tr[2]/td[3]'

    try:
        browser.get("https://www.boxofficemojo.com/date")
        time.sleep(0.5)

        # Search the movie title on boxofficemojo.com.
        # NOTE(review): ESCAPE presumably dismisses an overlay/suggestion box
        # before typing - confirm against the live page.
        search = browser.find_elements_by_xpath(search_box_xpath)[0]
        search.send_keys(Keys.ESCAPE)

        # The input is looked up again after ESCAPE before typing the title.
        search = browser.find_elements_by_xpath(search_box_xpath)[0]
        time.sleep(1)
        search.send_keys(movie_title)
        time.sleep(0.5)
        search.send_keys(Keys.ENTER)
        time.sleep(2)

        # On the search result page, click on the first result to go to the
        # desired movie page.
        first_search = browser.find_elements_by_xpath(first_result_xpath)
        if not first_search:
            print("No movie found:", movie_title)
            return {}
        first_search[0].click()

        # On the movie page, scrape all the box office data. The MPAA value is
        # only trusted when the label next to it actually reads "MPAA",
        # because the row position is fixed in the xpath.
        mpaa_confirm = get_text(browser, mpaa_confirm_xpath)

        return {
            "movie_title": movie_title,
            "domestic_gross": get_text(browser, domestic_xpath),
            "foreign_gross": get_text(browser, foreign_xpath),
            "total_gross": get_text(browser, total_xpath),
            "domestic_opening": get_text(browser, opening_xpath),
            "mpaa": get_text(browser, mpaa_xpath) if mpaa_confirm == "MPAA" else None,
            "markets": get_text(browser, markets_xpath),
        }
    except Exception as err:
        # Best-effort per movie: report the failure and let the caller move
        # on to the next title.
        print("No movie found err:", movie_title, err)
        return {}

### Load the movie titles that we obtained from Part 1 (`movie_info_5000.csv`).

In [3]:
# Read the movie table saved as CSV; its `title` column drives the scrape.
df = pd.read_csv(filepath_or_buffer="movie_info_5000.csv")
df.shape

(4878, 16)

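### Scrape the box office data of the movies (titles from index 3000 onward in this run).
*The original estimate of approximately 20 minutes was for 250 movies; the run time grows with the number of titles scraped.*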

In [4]:
%%time 
browser = open_browser()

box_office_list = []
for movie_title in df.title.values[3000:]:
    data = scrap_box_office_data(browser, movie_title)
    box_office_list.append(data)
    
browser.close()


No movie found: Elimination Game (Turkey Shoot)
No movie found: Electric Slide
No movie found: Cas & Dylan
No movie found: Bravetown
No movie found: 3 Holes and a Smoking Gun
No movie found: Minuscule: Valley of the Lost Ants (Minuscule - La vallée des fourmis perdues)
No movie found: Sword Of Vengeance
No movie found: Da Sweet Blood of Jesus
No movie found: See You in Valhalla
No movie found: Jen Kirkman: I'm Gonna Die Alone (And I Feel Fine)
No movie found: Second Opinion: Laetrile at Sloan-Kettering
No movie found: Glass Chin
No movie found: The Rise and Rise of Bitcoin
No movie found: Lost Soul: The Doomed Journey of Richard Stanley's Island of Dr. Moreau
No movie found: Justice League: Gods and Monsters
No movie found: Brush With Danger
No movie found: Bedlam
No movie found: John Doe: Vigilante
No movie found: Romeo And Juliet (Broadway Hd)
No movie found: Affluenza
No movie found: The Walking Deceased
No movie found: Batman Vs. Robin
No movie found: The Strange Affair of Uncle Ha

#### Temporarily store these data in CSV. 
*These data will be stored in the Hive Data Warehouse in the next milestone.*

In [5]:
# One row per scraped movie; movies that returned {} become all-NaN rows.
box_office_df = pd.DataFrame(data=box_office_list)

print(box_office_df.shape)
box_office_df.head()


(1878, 7)


Unnamed: 0,domestic_gross,domestic_opening,foreign_gross,markets,movie_title,mpaa,total_gross
0,,,"$41,760",6 markets,The Road Within,R,"$41,760"
1,"$33,078,266","$8,540,370","$52,900,000",54 markets,The Second Best Exotic Marigold Hotel,,"$85,978,266"
2,"$308,156","$20,300",,Domestic,Merchants Of Doubt,PG-13,"$308,156"
3,"$24,296","$15,477","$77,511",5 markets,Kill Me Three Times,R,"$101,807"
4,"$502,294","$37,321",,Domestic,Deli Man,PG-13,"$502,294"


In [7]:
# Keep only rows that have at least one non-missing value, i.e. discard the
# all-NaN rows left behind by movies for which nothing was scraped.
has_any_value = box_office_df.notna().any(axis=1)
box_office_df = box_office_df.loc[has_any_value, :]
box_office_df.shape

(1640, 7)

In [8]:
# Persist the cleaned table without writing the row index as a column.
box_office_df.to_csv(path_or_buf="movie_box_office_5000_4.csv", index=False)