# IMDB Film Data Scraper Function
## Web Scraping Development

## Objectives
* Turn the web scraping development work into reusable film data scraper functions

In [1]:
# Install packages, if necessary:
# pip install requests
# pip install beautifulsoup4

# Load libraries:
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
# import numpy as np
# import seaborn as sns

## Summarizing Film Data Scraper
### UX Summary:
* User provides href to complete IMDB title URL
* Function retrieves the following data:
    * Title
    * Year
    * IMDB Rating
    * Number of Ratings
    * Budget
    * Opening Weekend
    * Domestic Gross
    * Worldwide Gross
* Function returns a list of IMDB film data, with monetary values converted to USD

In [2]:
# Exchange rate function to convert budget and box office values to USD
# Ref. Development - IMDB Film Details Jupyter Notebook for more details
def perf_usd_conversion(native_value):
    """Convert a scraped money string to a USD amount.

    Parameters
    ----------
    native_value : str
        Amount as reported on the page: either ``'$'`` followed by the
        number (``'$15,000,000'``) or a three-character currency code
        followed by the number (``'GBP1,000,000'``). Surrounding
        whitespace and thousands separators are ignored.

    Returns
    -------
    float or None
        The amount in USD. ``None`` is returned when the input is not a
        string, is blank, has no plain numeric amount, or uses a currency
        code that is absent from the exchange-rate response.
    """
    # Callers pass whatever follows a heading tag; that may be missing.
    if not isinstance(native_value, str):
        return None
    native_value = native_value.strip()
    if not native_value:
        return None

    # Parse reported value to determine currency used and remove currency code from string
    if native_value[0] == '$':
        currency = None
        num_value = native_value.replace('$', '')
    else:
        currency = native_value[:3]
        num_value = native_value[3:]
    num_value = num_value.replace(',', '').strip()

    # Accept integers and decimals; str.isnumeric() would reject '1234.50'.
    if re.fullmatch(r'\d+(\.\d+)?', num_value) is None:
        return None

    if currency is None:
        exchange_rate = 1
    else:
        # Only non-USD values need the Exchange Rates API, so the request is
        # made here rather than on every call.
        r_usd = 'https://api.exchangeratesapi.io/latest?base=USD'
        rates = requests.get(r_usd).json()
        exchange_rate = rates.get('rates', {}).get(currency)
        if not exchange_rate:
            # Unknown currency code (or no rates in the response): nothing to convert with.
            return None

    # The URL requests rates with base=USD, so dividing the native amount
    # by the rate yields USD.
    return float(num_value) / exchange_rate

In [3]:
# Function input is IMDB title href
# Ref. Development - IMDB Film Details Jupyter Notebook for more details

def _usd_value_after_heading(soup, heading_pattern):
    """Return the USD value following the first <h4> whose text matches heading_pattern, or None if there is no such heading."""
    heading = soup.find('h4', string=re.compile(heading_pattern))
    if heading is None:
        return None
    # The amount is the node immediately after the heading tag.
    return perf_usd_conversion(heading.next_sibling)


def imdb_film_data(href):
    """Scrape title, year, rating and box-office figures for one IMDB title.

    Parameters
    ----------
    href : str
        Site-relative title path, e.g. ``'/title/tt0092099/'``; it is
        appended to ``https://www.imdb.com``.

    Returns
    -------
    list
        ``[title, year, imdbRating, imdbRatingQty, budgetVal, openingVal,
        domesticVal, worldwideVal]``. Any field whose markup is not found
        on the page is ``None``; monetary fields are passed through
        ``perf_usd_conversion``.
    """
    # Append href input to full IMDB URL
    url = 'https://www.imdb.com' + href

    # Parse URL with BeautifulSoup
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')

    # Retrieve title and year from the <h1> inside the title wrapper.
    title, year = None, None
    title_block = soup.find('div', class_='title_wrapper')
    heading = title_block.h1 if title_block is not None else None
    if heading is not None:
        heading_text = heading.text.replace(u'\xa0', u' ')
        year_text = ''
        if heading.span is not None:
            year_text = heading.span.text.replace(u'\xa0', u' ').strip()
        if year_text:
            # Drop the trailing year span from the heading instead of slicing
            # by a fixed offset, so extra whitespace cannot clip the title.
            title = heading_text.rsplit(year_text, 1)[0].strip()
            year_match = re.search(r'\d{4}', year_text)
            year = int(year_match.group()) if year_match else None
        else:
            title = heading_text.strip()

    # Retrieve IMDB Rating data
    imdbRating, imdbRatingQty = None, None
    rating_block = soup.find('div', class_='imdbRating')
    if rating_block is not None:
        imdbRating = float(rating_block.strong.text)
        imdbRatingQty = int(rating_block.a.text.replace(',', ''))

    # Retrieve budget and box office data (each is None when its heading is absent)
    budgetVal = _usd_value_after_heading(soup, '^Budg')
    openingVal = _usd_value_after_heading(soup, '^Opening Weekend')
    domesticVal = _usd_value_after_heading(soup, '^Gross ')
    worldwideVal = _usd_value_after_heading(soup, '^Cumulative Worldwide Gross')

    # Return list of film data in prescribed order
    return [title, year, imdbRating, imdbRatingQty, budgetVal, openingVal, domesticVal, worldwideVal]

In [4]:
# Example: Top Gun
# The returned list is ordered as:
# [title, year, IMDB rating, number of ratings, budget, opening weekend, domestic gross, worldwide gross]
imdb_film_data('/title/tt0092099/')

['Top Gun', 1986, 6.9, 284569, 15000000.0, 8193052.0, 179800601.0, 356830601.0]


## Summarizing Filmography Data Scraper
### UX Summary:
* User provides href to complete IMDB actor URL
* Function retrieves the following data:
    * All films with Actor credits
    * Associated href for each film to be used in IMDB film data scraper
* Function returns a DataFrame with all IMDB film data for each film

In [5]:
# Function input is IMDB actor href
# Ref. Development - IMDB Actor Filmography Jupyter Notebook for more details

def imdb_filmography_data(href):
    """Scrape IMDB film data for every acting credit on an actor page.

    Parameters
    ----------
    href : str
        Site-relative actor path, e.g. ``'/name/nm0000093/'``; it is
        appended to ``https://www.imdb.com``.

    Returns
    -------
    pandas.DataFrame
        One row per credit, with columns ``Title``, ``Year``,
        ``IMDB_Rating``, ``#_of_Ratings``, ``Budget``, ``Opening_Weekend``,
        ``Domestic_Gross`` and ``Worldwide_Gross``, filled from
        ``imdb_film_data``. Credits without a link are skipped.
    """
    # Append href input to full IMDB URL
    url = 'https://www.imdb.com' + href

    # Parse URL with BeautifulSoup
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')

    # Retrieve filmography data: one div per credit, with an id starting 'actor-'
    films = soup.find_all('div', id=re.compile('^actor-'))

    # Collect the title href of each credit; a credit without an anchor or
    # href cannot be looked up, so it is skipped rather than raising.
    film_hrefs = []
    for film in films:
        link = film.a
        film_href = link.get('href') if link is not None else None
        if film_href:
            film_hrefs.append(film_href)

    # Build a plain list of rows and convert it to a DataFrame once; growing
    # a DataFrame row by row inside the loop is not efficient.
    filmsarray = [imdb_film_data(film_href) for film_href in film_hrefs]
    filmspd = pd.DataFrame(filmsarray, columns=['Title',
                                                'Year',
                                                'IMDB_Rating',
                                                '#_of_Ratings',
                                                'Budget',
                                                'Opening_Weekend',
                                                'Domestic_Gross',
                                                'Worldwide_Gross'])
    return filmspd

In [7]:
# Example calls, one per actor page. Only the uncommented call below runs;
# swap the comment markers to try another actor.

# Example: Tom Cruise
# Author's note: works
# imdb_filmography_data('/name/nm0000129/')

# Example: Brad Pitt
imdb_filmography_data('/name/nm0000093/')

# Example: John David Washington
# imdb_filmography_data('/name/nm0913475/')

# Example: Tom Holland
# Author's note: European currencies (presumably exercises the non-USD
# conversion path in perf_usd_conversion; confirm)
# imdb_filmography_data('/name/nm4043618/')


Unnamed: 0,Title,Year,IMDB_Rating,#_of_Ratings,Budget,Opening_Weekend,Domestic_Gross,Worldwide_Gross
0,Bullet Train,,,,,,,
1,Babylon,2021.0,,,,,,
2,Ad Astra,2019.0,6.6,179520.0,90000000.0,19001398.0,50188370.0,127461872.0
3,Once Upon a Time... in Hollywood,2019.0,7.7,494579.0,90000000.0,41082018.0,142502728.0,374343626.0
4,The Jim Jefferies Show,,7.1,2102.0,,,,
...,...,...,...,...,...,...,...,...
77,Dallas,,6.9,11139.0,,,,
78,Less Than Zero,1987.0,6.5,17805.0,,3008987.0,12396383.0,12396383.0
79,No Man's Land,1987.0,6.1,4103.0,8000000.0,1088273.0,2877571.0,2877571.0
80,Another World,,6.8,797.0,,,,
