# Web Scraping Development
## IMDB Film Details

## Objectives
* To learn more about web scraping
* The worked example pulls film details from IMDb, such as the IMDb rating and box-office results.

In [99]:
# Install packages, if necessary:
# pip install requests
# pip install beautifulsoup4

In [98]:
# Load libraries and URL:
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
# import numpy as np
# import seaborn as sns

# Example URL is for Top Gun.
# The href is kept separate from the site root so that hrefs produced by the
# filmography scraper can be plugged in directly.
href = '/title/tt0092099/'
url = 'https://www.imdb.com' + href

# Fetch the page. Without a timeout, requests can wait forever on a stalled
# connection; raise_for_status() stops the notebook here on a 4xx/5xx response
# instead of letting later cells fail while parsing an error page.
r = requests.get(url, timeout=10)
r.raise_for_status()

# Confirm success by printing the first 100 bytes of the response body:
print(r.content[:100])

# Parse HTML with BeautifulSoup:
soup = BeautifulSoup(r.content, 'html.parser')

b'\n\n\n\n\n\n\n<!DOCTYPE html>\n<html\n    xmlns:og="http://ogp.me/ns#"\n    xmlns:fb="http://www.facebook.com/'


In [2]:
# Dump the parsed document as indented HTML so the page structure can be
# inspected by eye ('minimal' is BeautifulSoup's default output formatter).
print(soup.prettify(formatter='minimal'))

<!DOCTYPE html>
<html xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#">
 <head>
  <meta charset="utf-8"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="app-id=342792525, app-argument=imdb:///title/tt0092099?src=mdot" name="apple-itunes-app"/>
  <script type="text/javascript">
   var IMDbTimer={starttime: new Date().getTime(),pt:'java'};
  </script>
  <script>
   if (typeof uet == 'function') {
      uet("bb", "LoadTitle", {wb: 1});
    }
  </script>
  <script>
   (function(t){ (t.events = t.events || {})["csm_head_pre_title"] = new Date().getTime(); })(IMDbTimer);
  </script>
  <title>
   Top Gun (1986) - IMDb
  </title>
  <script>
   (function(t){ (t.events = t.events || {})["csm_head_post_title"] = new Date().getTime(); })(IMDbTimer);
  </script>
  <script>
   if (typeof uet == 'function') {
      uet("be", "LoadTitle", {wb: 1});
    }
  </script>
  <script>
   if (typeof uex == 'function') {
      uex("ld", "LoadTitle", {wb: 1});


In [70]:
# --- Title and year -------------------------------------------------------
# Take the first <div class="title_wrapper">; its <h1> text is "Title (Year)"
# and the <span> nested in that <h1> holds just the bracketed year.
imdbFilmdata = soup.find('div', class_='title_wrapper')
title_year = imdbFilmdata.h1.text
yearbrackets = imdbFilmdata.h1.span.text

# Title: cut the bracketed year off the end, plus two further characters
# (presumably the separator before it and the trailing character visible in
# the printed output).
title = title_year[:-(len(yearbrackets) + 2)]

# Year: drop the first and last character (the brackets), then cast to int.
yearstr = yearbrackets[1:-1]
year = int(yearstr)

# Show what was parsed:
print("Title (Year):", title_year)
print("Title:", title)
print("Year:", year)

# --- Rating and number of ratings -----------------------------------------
# Take the first <div class="imdbRating">; <strong> carries the score and the
# first <a> carries the comma-grouped ratings count.
imdbRatingdata = soup.find('div', class_='imdbRating')
str_imdbRating = imdbRatingdata.strong.text
str_imdbRatingQty = imdbRatingdata.a.text.replace(',', '')

# Numeric conversions: score -> float, count (commas removed above) -> int.
imdbRating = float(str_imdbRating)
imdbRatingQty = int(str_imdbRatingQty)

# To double-check the resulting types, print type(year), type(imdbRating)
# and type(imdbRatingQty).

# Show the parsed values:
print("IMDB Rating:", imdbRating)
print("From", imdbRatingQty, "ratings")

Title (Year): Top Gun (1986) 
Title: Top Gun
Year: 1986
IMDB Rating: 6.9
From 284517 ratings


In [38]:
# # Within 'soup', find all <div> tag with 'id' attribute "titleDetails"
# titleDetails = soup.find('div', id = 'titleDetails').find_all('h4')
# for details in titleDetails:
#     print(details.text, details.nextSibling)

Official Sites: 

Country: 

Language: 

Release Date:  16 May 1986 (USA)
    
Also Known As:  Top Gun
      
Filming Locations: 

Budget: $15,000,000
            
Opening Weekend USA:  $8,193,052,

Gross USA:  $179,800,601        
Cumulative Worldwide Gross:  $356,830,601        
Production Co: 

Runtime: 

Sound Mix: 

Color: 

Aspect Ratio:  2.39 : 1
    


## Parsing titleDetails
Only take the necessary data.  In this case, focus is on:
- Budget
- Opening weekend
- Gross USA
- Cumulative worldwide gross

Note: The scraped values are strings (e.g. `$15,000,000`), so the `$` and `,` characters must be removed and the result converted to an integer before any calculations.

In [90]:
def _find_money(parsed_page, label_pattern):
    """Locate an <h4> label and parse the dollar amount that follows it.

    Parameters
    ----------
    parsed_page : BeautifulSoup
        Parsed film page to search.
    label_pattern : str
        Regular expression matched against the <h4> text (e.g. '^Budg').

    Returns
    -------
    tuple
        (tag, raw_text, value): the matching <h4> tag, the untouched text
        node that follows it, and that text converted to an int.

    Raises
    ------
    LookupError
        If no <h4> matches ``label_pattern``.
    ValueError
        If the text after the label is not a plain dollar amount.
    """
    tag = parsed_page.find('h4', text=re.compile(label_pattern))
    if tag is None:
        raise LookupError("No <h4> label matching %r found" % label_pattern)
    raw_text = tag.next_sibling
    # Strip the currency symbol and thousands separators; int() itself
    # tolerates the surrounding whitespace/newlines left in the text node.
    value = int(raw_text.replace('$', '').replace(',', ''))
    return tag, raw_text, value


# Each figure is looked up by the start of its label. The previous budget code
# referenced an undefined name (budget_test) and never removed the commas, so
# it could not run on a fresh kernel; all four values now share one parser.
budgetTag, str_budgetVal, budgetVal = _find_money(soup, '^Budg')
print(budgetTag.text, budgetVal)

openingTag, str_openingVal, openingVal = _find_money(soup, '^Opening Weekend')
print(openingTag.text, openingVal)

# The trailing space in the pattern is kept from the original lookup.
domesticTag, str_domesticVal, domesticVal = _find_money(soup, '^Gross ')
print(domesticTag.text, domesticVal)

worldwideTag, str_worldwideVal, worldwideVal = _find_money(soup, '^Cumulative Worldwide Gross')
print(worldwideTag.text, worldwideVal)

Budget: 15000000
Opening Weekend USA: 8193052
Gross USA: 179800601
Cumulative Worldwide Gross: 356830601


In [94]:
# Collect the scraped values into a one-row sample DataFrame.
# Each inner list is one film, in this field order:
#   title, year, imdbRating, imdbRatingQty,
#   budgetVal, openingVal, domesticVal, worldwideVal
filmdata = [
    [title, year, imdbRating, imdbRatingQty,
     budgetVal, openingVal, domesticVal, worldwideVal],
]

# Column labels line up one-to-one with the fields above.
pdfilmdata = pd.DataFrame(
    filmdata,
    columns=[
        'Title', 'Year', 'IMDB_Rating', 'IMDB_Ratings',
        'Budget', 'Opening_Weekend', 'Domestic_Gross', 'Worldwide_Gross',
    ],
)
pdfilmdata

Unnamed: 0,Title,Year,IMDB_Rating,IMDB_Ratings,Budget,Opening_Weekend,Domestic_Gross,Worldwide_Gross
0,Top Gun,1986,6.9,284517,15000000,8193052,179800601,356830601


## Repeat to Check (Out of date!)
Repeat the same exercise with another movie; in this case, Mission: Impossible.

In [95]:
# Repeat for another film:
# Example url is for Mission Impossible:
url_test = 'https://www.imdb.com/title/tt0117060/'

# Fetch the page. The timeout keeps a stalled connection from hanging the
# kernel, and raise_for_status() fails here on a 4xx/5xx response rather than
# letting an error page be parsed as if it were the film page.
r_test = requests.get(url_test, timeout=10)
r_test.raise_for_status()
# For a manual check, print r_test.content[:100].

# Parse HTML with BeautifulSoup
soup_test = BeautifulSoup(r_test.content, 'html.parser')

In [96]:
# Take the first <div class="imdbRating"> from the second film's page and read
# the score (<strong>) and the ratings count (<a>) as raw strings.
imdbRatingdata_test = soup_test.find('div', class_='imdbRating')
imdbRating_test = imdbRatingdata_test.strong.text
imdbRatingValue_test = imdbRatingdata_test.a.text  # ratings count, still comma-grouped text

# Report the two strings exactly as scraped:
print("IMDB Rating:", imdbRating_test)
print("From", imdbRatingValue_test, "ratings")

IMDB Rating: 7.1
From 378,567 ratings


In [108]:
# Within 'soup_test', take the <div id="titleDetails"> section and list every
# <h4> label together with the node that follows it.
# (This cell previously referenced 'soup2', which is never defined; the second
# film's page is parsed into 'soup_test'.)
titleDetails2 = soup_test.find('div', id = 'titleDetails').find_all('h4')
for details in titleDetails2:
    # next_sibling is the current spelling; nextSibling is a legacy alias.
    print(details.text, details.next_sibling)

Official Sites: 

Country: 

Language: 

Release Date:  22 May 1996 (Canada)
    
Also Known As:  Mission: Impossible
      
Filming Locations: 

Budget: $80,000,000
            
Opening Weekend USA:  $45,436,830,

Gross USA:  $180,981,856        
Cumulative Worldwide Gross:  $457,731,198        
Production Co: 

Runtime: 

Sound Mix: 

Color: 

Aspect Ratio:  2.39 : 1
    
