# Data Mining Project: Milestone 1 (Part 2)
### Web Scraping (Part 2): Scrape box office data from boxofficemojo.com
1. Using selenium, open a browser and browse to https://www.boxofficemojo.com/
2. Search the desired movie title on boxofficemojo.com
3. Click into the first result page
4. Extract the box office data of the movie

In [1]:
import re
import time    
import joblib
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver                    
from selenium.webdriver.common.keys import Keys                      

def open_browser():
    '''Start a Firefox session through selenium and return the driver object.'''
    # Path of the geckodriver executable, relative to the notebook's working directory.
    driver_path = "./geckodriver-v0.25.0-win64/geckodriver.exe"
    return webdriver.Firefox(executable_path=driver_path)


def get_text(browser, xpath):
    '''Return the text of the first element matching an XPath, or None.

    Args:
        browser: selenium WebDriver (any object exposing
            ``find_elements_by_xpath``).
        xpath: XPath expression evaluated against the current page.

    Returns:
        The ``.text`` of the first matching element, or None when nothing
        matches or the lookup raises an error.
    '''
    try:
        return browser.find_elements_by_xpath(xpath)[0].text
    except Exception:
        # Best-effort lookup: no match (IndexError) or a driver error gives None.
        # Catching Exception rather than using a bare ``except`` lets
        # KeyboardInterrupt and SystemExit propagate, so a long scraping run
        # can still be stopped.
        return None

def scrap_box_office_data(browser, movie_title):
    '''Look up a movie on boxofficemojo.com and scrape its box office figures.

    Types the title into the site's search box, opens the first search result
    and reads the figures from fixed XPaths on the page that follows.

    Args:
        browser: selenium WebDriver used to drive the site.
        movie_title: Title to type into the search box.

    Returns:
        A dict with the keys ``movie_title``, ``domestic_gross``,
        ``foreign_gross``, ``total_gross``, ``domestic_opening``, ``mpaa`` and
        ``markets``. Values are the raw page text, or None where the element
        could not be read. An empty dict is returned when the search yields no
        result or when any step raises an exception.
    '''
    search_box_xpath = '//*[@id="mojo-search-text-input"]'
    first_result_xpath = '/html/body/div[1]/main/div/div/div/div[1]/div/div[2]/a'

    # Absolute XPaths of the figures on the movie page, in output-column order.
    figure_xpaths = (
        ("domestic_gross", '/html/body/div[1]/main/div/div[3]/div[1]/div/div[1]/span[2]/span'),
        ("foreign_gross", '/html/body/div[1]/main/div/div[3]/div[1]/div/div[2]/span[2]/span'),
        ("total_gross", '/html/body/div[1]/main/div/div[3]/div[1]/div/div[3]/span[2]/span'),
        ("domestic_opening", '/html/body/div[1]/main/div/div[3]/div[4]/div[2]/span[2]/span'),
    )
    mpaa_label_xpath = '/html/body/div[1]/main/div/div[3]/div[4]/div[4]/span[1]'
    mpaa_value_xpath = '/html/body/div[1]/main/div/div[3]/div[4]/div[4]/span[2]'
    markets_xpath = '/html/body/div[1]/main/div/div[5]/div/div/table/tbody/tr[2]/td[3]'

    try:
        browser.get("https://www.boxofficemojo.com/date")
        time.sleep(0.5)

        # Send ESCAPE to the search box, then look the box up again before
        # typing. NOTE(review): presumably this dismisses an overlay or
        # suggestion list; confirm before simplifying.
        browser.find_elements_by_xpath(search_box_xpath)[0].send_keys(Keys.ESCAPE)

        search = browser.find_elements_by_xpath(search_box_xpath)[0]
        time.sleep(1)
        search.send_keys(movie_title)
        time.sleep(0.5)
        search.send_keys(Keys.ENTER)
        time.sleep(2)

        # On the search result page, open the first result (the movie page).
        first_search = browser.find_elements_by_xpath(first_result_xpath)
        if not first_search:
            print("No movie found:", movie_title)
            return {}
        first_search[0].click()

        # On the movie page, collect the box office figures.
        box_office_dict = {"movie_title": movie_title}
        for field, xpath in figure_xpaths:
            box_office_dict[field] = get_text(browser, xpath)

        # Only keep the rating when the label next to it actually reads "MPAA";
        # otherwise that slot holds some other field.
        if get_text(browser, mpaa_label_xpath) == "MPAA":
            box_office_dict["mpaa"] = get_text(browser, mpaa_value_xpath)
        else:
            box_office_dict["mpaa"] = None
        box_office_dict["markets"] = get_text(browser, markets_xpath)

        return box_office_dict
    except Exception as err:
        # Any failure (missing search box, driver error, ...) is reported and
        # turned into an empty record so the calling loop keeps going.
        print("No movie found err:", movie_title, err)
        return {}

### Load the movie titles that we obtained from Part 1 (`movie_info_5000.csv`).

In [2]:
# Read the movie metadata CSV into a DataFrame.
df = pd.read_csv("movie_info_5000.csv")
# Bare final expression: the notebook displays the (rows, columns) tuple.
df.shape

(4878, 16)

### Scrape the box office data for one batch of 1,000 movies (rows 2000–2999).
*This is slow: each title incurs about 4 seconds of fixed waits, so a 1,000-title batch takes over an hour.*

In [3]:
%%time 
browser = open_browser()

box_office_list = []
for movie_title in df.title.values[2000:3000]:
    data = scrap_box_office_data(browser, movie_title)
    box_office_list.append(data)
    
browser.close()


No movie found: Jack Goes Home
No movie found: Sharknado: The 4th Awakens
No movie found: Justin Timberlake + The Tennessee Kids
No movie found: The Mind's Eye
No movie found: Audrie & Daisy
No movie found: Transpecos
No movie found: My Blind Brother
No movie found: The Blackout Experiments
No movie found: Youth in Oregon
No movie found: Sandy Wexler
No movie found: Speech & Debate
No movie found: Louis C.K.: 2017
No movie found: Buddymoon
No movie found: The White Helmets
No movie found: Moments of Clarity
No movie found: Brothers Hypnotic
No movie found: The Creeping Garden
No movie found: Deidra & Laney Rob a Train
No movie found: The Most Hated Woman In America
No movie found: Dig Two Graves
No movie found: What Happened, Miss Simone?
No movie found: Beautiful Something
No movie found: Downriver
No movie found: Jane Wants a Boyfriend
No movie found: Ace the Case: Manhattan Mystery
No movie found: Floyd Norman: An Animated Life
No movie found: Kampai! For the Love of Sake
No movie f

#### Temporarily store these data in CSV. 
*These data will be stored in the Hive Data Warehouse in the next milestone.*

In [4]:
# Turn the list of scraped dicts into a DataFrame, one row per dict
# (an empty dict becomes a row with every column missing).
box_office_df = pd.DataFrame(box_office_list)

# Print the shape, then leave head() as the final expression so the notebook
# renders a preview of the first rows.
print(box_office_df.shape)
box_office_df.head()


(1000, 7)


Unnamed: 0,domestic_gross,domestic_opening,foreign_gross,markets,movie_title,mpaa,total_gross
0,"$255,388","$21,883","$87,276",5 markets,"Life, Animated",PG,"$342,664"
1,,,"$277,761","EMEA, China",Sky Ladder: The Art of Cai Guo-Qiang,,"$277,761"
2,,,"$71,723",Australia,Chicken People,,"$71,723"
3,,,"$333,385",9 markets,The Late Bloomer,,"$333,385"
4,,,,,,,


In [6]:
# Discard the rows in which every column is missing (titles that returned no
# data), then display the remaining shape.
box_office_df = box_office_df.dropna(axis=0, how="all")
box_office_df.shape

(869, 7)

In [7]:
# Save this batch to CSV; index=False keeps the DataFrame index out of the file.
box_office_df.to_csv("movie_box_office_5000_3.csv", index = False)