# IMDB Film Data Scraper Function
## Web Scraping Development

## Objectives
* Apply the web scraping development work to build a reusable film data scraper function

In [1]:
# Install packages, if necessary:
# pip install requests
# pip install beautifulsoup4

# Load libraries and URL:
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
# import numpy as np
# import seaborn as sns

## Summarizing Film Data Scraper
### UX Summary:
* User provides href to complete IMDB title URL
* Function retrieves the following data:
    * Title
    * Year
    * IMDB Rating
    * Number of Ratings
    * Budget
    * Opening Weekend
    * Domestic Gross
    * Worldwide Gross
* Function returns a list of IMDB film data in the order shown above

In [2]:
# Exchange rate function to convert budget and box office values to USD
# Ref. Development - IMDB Film Details Jupyter Notebook for more details
def perf_usd_conversion(native_value):
    """Convert a scraped money string to an amount in USD.

    Args:
        native_value: Amount text, prefixed either with ``$`` (treated as
            USD) or with a three-character currency code, for example
            ``'$15,000,000'`` or ``'EUR1,000,000'``. Surrounding whitespace
            and thousands separators are ignored.

    Returns:
        The amount in USD as a float, or None when the input is not a
        string, is empty, has a non-numeric amount, or no exchange rate is
        available for its currency code.
    """
    # Scraped sibling nodes may be missing or may not be text at all.
    if not isinstance(native_value, str):
        return None
    native_value = native_value.strip()
    if not native_value:
        return None

    # Split the currency marker from the numeric part.
    if native_value[0] == '$':
        currency = None
        num_value = native_value.replace('$', '')
    else:
        currency = native_value[:3]
        num_value = native_value[3:]
    num_value = num_value.replace(',', '').strip()

    # Accept plain digits with at most one decimal point; isdecimal() only
    # passes characters that float() can parse.
    if not num_value.replace('.', '', 1).isdecimal():
        return None
    amount = float(num_value)

    # USD values need no exchange rate, so skip the network round trip.
    if currency is None:
        return amount

    # Look up the latest USD-based exchange rates only when actually needed.
    # NOTE(review): this endpoint may now require an access key - confirm.
    r_usd = 'https://api.exchangeratesapi.io/latest?base=USD'
    usd_response = requests.get(r_usd, timeout=10)
    rates = usd_response.json().get('rates', {})
    exchange_rate = rates.get(currency)
    if not exchange_rate:
        # Unknown currency code, or the API response carried no usable rate.
        return None
    return amount / exchange_rate

In [3]:
# Function input is IMDB title href
# Ref. Development - IMDB Film Details Jupyter Notebook for more details

def imdb_film_data(href):
    """Scrape title, year, rating and box office figures for one IMDB title.

    Args:
        href: Path of the title page, e.g. ``'/title/tt0092099/'``; it is
            appended to ``'https://www.imdb.com'``.

    Returns:
        A list in the fixed order ``[title, year, imdbRating, imdbRatingQty,
        budgetVal, openingVal, domesticVal, worldwideVal]``. Any value that
        cannot be found on the page is None. Monetary values are whatever
        ``perf_usd_conversion`` returns for the scraped text.
    """
    # Append href input to full IMDB URL
    url = 'https://www.imdb.com' + href

    # Parse URL with BeautifulSoup
    r = requests.get(url, timeout=30)
    soup = BeautifulSoup(r.content, 'html.parser')

    title, year = _imdb_title_and_year(soup)
    imdbRating, imdbRatingQty = _imdb_rating(soup)

    # Budget and box office values are all "<h4>Label</h4>value" pairs.
    budgetVal = _imdb_usd_field(soup, '^Budg')
    openingVal = _imdb_usd_field(soup, '^Opening Weekend')
    domesticVal = _imdb_usd_field(soup, '^Gross ')
    worldwideVal = _imdb_usd_field(soup, '^Cumulative Worldwide Gross')

    # Return list of film data in prescribed order
    return [title, year, imdbRating, imdbRatingQty,
            budgetVal, openingVal, domesticVal, worldwideVal]


def _imdb_title_and_year(soup):
    """Return ``(title, year)`` from the title wrapper; None for missing parts."""
    wrapper = soup.find('div', class_='title_wrapper')
    heading = wrapper.h1 if wrapper is not None else None
    if heading is None:
        # Page has no recognizable title block; report nothing, don't crash.
        return None, None

    title_year = heading.text
    if heading.span is None:
        # No year span: the whole heading is the title.
        return title_year.replace(u'\xa0', u' ').strip(), None

    yearbrackets = heading.span.text
    # Drop the bracketed year plus the two characters surrounding it
    # (one before, one after) from the end of the heading text.
    title = title_year[:-len(yearbrackets) - 2]
    yearstr = yearbrackets[1:-1]
    year = int(yearstr) if yearstr.isdecimal() else None
    return title, year


def _imdb_rating(soup):
    """Return ``(rating, number_of_ratings)``, or ``(None, None)`` if absent."""
    rating_block = soup.find('div', class_='imdbRating')
    if (rating_block is None
            or rating_block.strong is None
            or rating_block.a is None):
        return None, None
    rating = float(rating_block.strong.text)
    rating_qty = int(rating_block.a.text.replace(',', ''))
    return rating, rating_qty


def _imdb_usd_field(soup, label_pattern):
    """Return the USD value following the ``h4`` whose text matches the pattern."""
    label_tag = soup.find('h4', text=re.compile(label_pattern))
    if label_tag is None:
        return None
    raw_value = label_tag.next_sibling
    # The value is expected to be the text node right after the label.
    if not isinstance(raw_value, str):
        return None
    return perf_usd_conversion(raw_value)

In [4]:
# Example: Top Gun
imdb_film_data('/title/tt0092099/')

['Top Gun', 1986, 6.9, 284631, 15000000.0, 8193052.0, 179800601.0, 356830601.0]


## Summarizing Filmography Data Scraper
### UX Summary:
* User provides href to complete IMDB actor URL
* Function retrieves the following data:
    * All films with Actor credits
    * Associated href for each film to be used in IMDB film data scraper
* Function returns a DataFrame with all IMDB film data for each film

In [12]:
# Function input is IMDB actor href
# Ref. Development - IMDB Actor Filmography Jupyter Notebook for more details

def imdb_filmography_data(href):
    """Scrape IMDB film data for every acting credit on a person's page.

    Args:
        href: Path of the person page, e.g. ``'/name/nm0000129/'``; it is
            appended to ``'https://www.imdb.com'``.

    Returns:
        A pandas DataFrame with one row per credit and the columns Title,
        Year, IMDB_Rating, #_of_Ratings, Budget, Opening_Weekend,
        Domestic_Gross and Worldwide_Gross. Each row is the list returned
        by ``imdb_film_data`` for that credit, so one title page is fetched
        per credit.
    """
    # Append href input to full IMDB URL
    url = 'https://www.imdb.com' + href

    # Parse URL with BeautifulSoup
    r = requests.get(url, timeout=30)
    soup = BeautifulSoup(r.content, 'html.parser')

    # Retrieve filmography data: one div per credit, id starting with "actor-".
    # NOTE(review): credits under a different id prefix would be missed - confirm.
    films = soup.find_all('div', id=re.compile('^actor-'))

    # Collect the film data for each credit that links to a title page.
    filmsarray = []
    for film in films:
        link = film.a
        filmhref = link.get('href') if link is not None else None
        if not filmhref:
            # A credit without a link cannot be looked up; skip it.
            continue
        filmsarray.append(imdb_film_data(filmhref))

    return pd.DataFrame(filmsarray, columns=['Title',
                                             'Year',
                                             'IMDB_Rating',
                                             '#_of_Ratings',
                                             'Budget',
                                             'Opening_Weekend',
                                             'Domestic_Gross',
                                             'Worldwide_Gross',
                                             ])

In [13]:
# Example: Tom Cruise
imdb_filmography_data('/name/nm0000129/')

# Example: Brad Pitt
# imdb_filmography_data('/name/nm0000093/')

# Example: John David Washington
# imdb_filmography_data('/name/nm0913475/')

# Example: Tom Holland
# European currencies
# imdb_filmography_data('/name/nm4043618/')


Unnamed: 0,Title,Year,IMDB_Rating,#_of_Ratings,Budget,Opening_Weekend,Domestic_Gross,Worldwide_Gross
0,Mission: Impossible 7,2021.0,,,,,,
1,Luna Park,,,,,,,
2,Untitled Tom Cruise/SpaceX Project,,,,200000000.0,,,
3,Mission: Impossible 8,2022.0,,,,,,
4,Live Die Repeat and Repeat,,,,,,,
5,Top Gun: Maverick,2021.0,,,,,,
6,Mission: Impossible - Fallout,2018.0,7.7,278264.0,178000000.0,61236534.0,220159104.0,791115104.0
7,American Made,2017.0,7.2,152266.0,50000000.0,16776390.0,51342000.0,134866593.0
8,The Mummy,2017.0,5.4,168198.0,125000000.0,31688375.0,80227895.0,409231607.0
9,Jack Reacher: Never Go Back,2016.0,6.1,133436.0,60000000.0,22872490.0,58697076.0,162146076.0
