# Movie Maps
---
Web scraper for movie data.
---
### Dependencies

In [3]:
from bs4 import BeautifulSoup
import requests
from splinter import Browser
from os.path import basename
import time
import pandas as pd

In [4]:
# Splinter set up: drive Chrome through the chromedriver.exe executable
# (path is relative to the working directory) with a visible browser window.
executable_path = dict(executable_path='chromedriver.exe')
browser = Browser('chrome', headless=False, **executable_path)

---
### Genre vs Performance Data
- Scrape data from Box Office Mojo

In [7]:
# Point the browser at the 2018 yearly chart on Box Office Mojo so the page
# is loaded and ready for scraping.
base_url = 'https://www.boxofficemojo.com'
url = ''.join([base_url, '/yearly/chart/?yr=2018&p=.htm'])
browser.visit(url)

In [12]:
# Parse the HTML of the page currently loaded in the browser
html = browser.html
bom_soup = BeautifulSoup(html, 'html.parser')

# Focus on 'body' content; fail with a clear message if it is missing
# (find() returns None when nothing matches).
main_body = bom_soup.find('div', id='body')
if main_body is None:
    raise ValueError("No <div id='body'> found on the page; the site layout may have changed.")

# Focus on the 4th table in body (index 3)
body_tables = main_body.find_all('table')
if len(body_tables) < 4:
    raise ValueError(
        "Expected at least 4 tables inside <div id='body'>, found %d." % len(body_tables)
    )
main_table = body_tables[3]

# Inspect only the beginning of the html; the full table is very large
print(str(main_table)[:1000])


<table border="0" cellpadding="0" cellspacing="0" width="100%"><tbody><tr><td align="center" valign="top"><br/>
<h1>2018 DOMESTIC GROSSES</h1><p>Total Grosses of all Movies Released in 2018</p>
<center><font face="Verdana" size="4"><b><font face="Verdana" size="5">#1–100</font> - <a href="/yearly/chart/?page=2&amp;view=releasedate&amp;view2=domestic&amp;yr=2018&amp;p=.htm">#101–200</a> - <a href="/yearly/chart/?page=3&amp;view=releasedate&amp;view2=domestic&amp;yr=2018&amp;p=.htm">#201–300</a> - <a href="/yearly/chart/?page=4&amp;view=releasedate&amp;view2=domestic&amp;yr=2018&amp;p=.htm">#301–400</a> - <a href="/yearly/chart/?page=5&amp;view=releasedate&amp;view2=domestic&amp;yr=2018&amp;p=.htm">#401–500</a> - <a href="/yearly/chart/?page=6&amp;view=releasedate&amp;view2=domestic&amp;yr=2018&amp;p=.htm">#501–600</a> - <a href="/yearly/chart/?page=7&amp;view=releasedate&amp;view2=domestic&amp;yr=2018&amp;p=.htm">#601–700</a> - <a href="/yearly/chart/?page=8&amp;view=releasedate&amp;vie

In [58]:
# Extract the rows associated with movie links. Rows are selected by their
# background colour, one list per colour.
white_rows = main_table.find_all("tr", bgcolor="#ffffff")
bluey_rows = main_table.find_all("tr", bgcolor="#f4f4ff")

# Fail early with a clear message instead of a bare IndexError below
if not white_rows:
    raise ValueError("No rows with bgcolor '#ffffff' found; the page layout may have changed.")

# Examine row structure
print(white_rows[0])

<tr bgcolor="#ffffff"><td align="center"><font size="2">1</font></td><td><b><font size="2"><a href="/movies/?id=marvel2017b.htm">Black Panther</a></font></b></td><td><font size="2"><a href="/studio/chart/?studio=buenavista.htm">BV</a></font></td><td align="right"><font size="2"><b>$700,059,566</b></font></td><td align="right"><font size="2">4,084</font></td><td align="right"><font size="2">$202,003,951</font></td><td align="right"><font size="2">4,020</font></td><td align="right"><font size="2"><a href="/schedule/?view=bydate&amp;release=theatrical&amp;date=2018-02-16&amp;p=.htm">2/16</a></font></td><td align="right"><font size="2">8/9</font></td></tr>


In [43]:
def movie_id_from_href(href):
    """Return the movie id carried by a link's ``id`` query parameter.

    Example: ``/movies/?id=marvel2017b.htm`` -> ``marvel2017b``.

    Only a trailing ``.htm`` is removed, so ids that themselves contain a
    dot are kept intact, and the ``id`` parameter is found by name rather
    than by position in the query string.

    Parameters
    ----------
    href : str
        The ``href`` attribute value of a link.

    Returns
    -------
    str or None
        The movie id, or ``None`` when the link has no non-empty ``id``
        parameter.
    """
    suffix = '.htm'
    query = href.partition('?')[2]
    for param in query.split('&'):
        key, _, value = param.partition('=')
        if key == 'id' and value:
            return value[:-len(suffix)] if value.endswith(suffix) else value
    return None


# Extract movie IDs from the first link of every row. White rows are
# processed before bluey rows, so the list is grouped by row colour.
movie_ids = []

for row in list(white_rows) + list(bluey_rows):
    link = row.find('a')
    if link is None:
        # Row without any link: nothing to extract
        continue
    movie_id = movie_id_from_href(link.get('href', ''))
    if movie_id:
        movie_ids.append(movie_id)

# Check extraction results
print(len(movie_ids))
print(movie_ids)

100
['marvel2017b', 'theincredibles2', 'foxmarvel18', 'missionimpossible6', 'untitledhansolostarwarsanthologyfilm', 'astarisborn2018', 'bohemianrhapsody', 'hoteltransylvania3', 'wbevent2018', 'wbeventfilm2018', 'readyplayerone', 'thenun', 'theequalizer2', 'fiftyshadesfreed', 'disneyfairytale2017', 'icanonlyimagine', 'nightschool2018', 'gamenight', 'thehousewithaclockinitswalls', 'insidious4', 'pacificrim2', 'dcfilm0318', 'tag', 'nutcrackerandthefourrealms', 'thepredator', 'sicario2', 'blackkklansman', 'goosebumps2', 'horsesoldiers2018', 'firstman', 'acrimony', 'uncledrew', 'untitledgregberlantifilm', 'widows', '1517toparis', 'sonyeventfilm2017', 'deathwish2017', 'annihilation', 'untitledtylerperrymovie', 'supertroopers2', 'wbanimation62018', 'robinhood2018', 'winchester', 'whiteboyrick', 'overlord', 'proudmary', 'superfly', 'badtimesattheelroyale', 'paulapostleofchrist', 'chappaquiddick', 'marvel0518', 'jurassicworldsequel', 'grinch2017', 'ant-manandthewasp', 'venom2018', 'aquietplace'

In [66]:
# Determine how to scrape movie data from webpage
# Prototype on a single movie's weekly page (id "marvel2017b").
# pd.read_html parses the HTML tables found at the URL; only the first parsed
# table (index 0) is used below.
wk_html = pd.read_html('https://www.boxofficemojo.com/movies/?page=weekly&id=marvel2017b.htm')
# Row 10 of that table (first 9 columns) is taken as the header row
wk_col = wk_html[0].iloc[10, 0:9]
# Rows 11 onward (same 9 columns) are taken as the weekly data
wk_pd = wk_html[0].iloc[11:, 0:9]
# Copy the header entries of slice 5:8 into slice 6:9, i.e. move them one slot right.
# NOTE(review): presumably the scraped header row has one cell fewer than the
# data rows (a header cell spanning two columns) — confirm against the page.
wk_col[6:9] = wk_col[5:8]
# Replace the first header entry with a short label
wk_col[0] = 'Date'
# Give entries 4 and 5 their own labels
wk_col[4:6] = ['Theaters', 'Change']
# Use the repaired header as the column labels of the data slice
wk_pd.columns = wk_col
# Preview the first rows
wk_pd.head()

10,Date,Rank,WeeklyGross,%Change,Theaters,Change,Avg.,Gross-to-Date,Week#
11,Feb 1622,1,"$291,954,422",-,4020,-,"$72,625","$291,954,422",1
12,Feb 23Mar 1,1,"$143,445,615",-50.9%,4020,-,"$35,683","$435,400,037",2
13,Mar 28,1,"$85,479,564",-40.4%,4084,+64,"$20,930","$520,879,601",3
14,Mar 915,1,"$57,496,927",-32.7%,3942,-142,"$14,586","$578,376,528",4
15,Mar 1622,1,"$35,881,708",-37.6%,3834,-108,"$9,359","$614,258,236",5


In [None]:
# URL prefix of the weekly box-office page. It ends in "id=", so a movie id
# is presumably meant to be appended to build each movie's URL — TODO confirm once used.
wk_base_url = 'https://www.boxofficemojo.com/movies/?page=weekly&id='

