# IMDB Film Data Scraper Function
## Web Scraping Development

## Objectives
* To apply web scraping development work into a film data scraper function

In [19]:
# Install packages, if necessary:
# pip install requests
# pip install beautifulsoup4

# Load libraries and URL:
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
# import numpy as np
# import seaborn as sns

## Summarizing Film Data Scraper
### UX Summary:
* User provides href to complete IMDB title URL
* Function retrieves the following data:
    * Title
    * Year
    * Budget
    * Opening Weekend
    * Domestic Gross
    * Worldwide Gross
* Function returns DataFrame of IMDB film data

In [31]:
# Function input is IMDB title href
# Ref. Development - IMDB Film Details Jupyter Notebook for more details

def imdb_film_data(href):
    # Append href input to full IMDB URL
    url = 'https://www.imdb.com' + href
    
    # Parse URL with BeautifulSoup
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    
    # Retrieve film data
    imdbFilmdata = soup.find('div', class_ = 'title_wrapper')
    title_year = imdbFilmdata.h1.text
    if imdbFilmdata.h1.span is None:
        title = title_year
        year = None
        pass
    else:
        yearbrackets = imdbFilmdata.h1.span.text
        title = title_year[:-len(yearbrackets)-2]
        yearstr = yearbrackets[1:len(yearbrackets)-1]
        year = int(yearstr)
    
    # Retrieve IMDB Rating data
    imdbRatingdata = soup.find('div', class_ = 'imdbRating')
    if imdbRatingdata is None:
        imdbRating = None
        imdbRatingQty = None
        pass
    else:
        str_imdbRating = imdbRatingdata.strong.text
        str_imdbRatingQty = imdbRatingdata.a.text
        imdbRating = float(str_imdbRating)
        str_imdbRatingQty = str_imdbRatingQty.replace(',','')
        imdbRatingQty = int(str_imdbRatingQty)
    
    # Retrieve budget data
    budgetTag = soup.find('h4', text = re.compile('^Budg'))
    if budgetTag is None:
        budgetVal = None
        pass
    else:
        str_budgetVal = budgetTag.next_sibling
        budgetVal = str_budgetVal.replace('$','')
        budgetVal = budgetVal.replace(',','')
        budgetVal = int(budgetVal)

    # Retrieve box office data
    openingTag = soup.find('h4', text = re.compile('^Opening Weekend'))
    if openingTag is None:
        openingVal = None
        pass
    else:
        str_openingVal = openingTag.next_sibling
        openingVal = str_openingVal.replace('$','')
        openingVal = openingVal.replace(',','')
        openingVal = int(openingVal)
    
    domesticTag = soup.find('h4', text = re.compile('^Gross '))
    if domesticTag is None:
        domesticVal = None
        pass
    else:
        str_domesticVal = domesticTag.next_sibling
        domesticVal = str_domesticVal.replace('$','')
        domesticVal = domesticVal.replace(',','')
        domesticVal = int(domesticVal)
    
    worldwideTag = soup.find('h4', text = re.compile('^Cumulative Worldwide Gross'))
    if worldwideTag is None:
        worldwideVal = None
        pass
    else:
        str_worldwideVal = worldwideTag.next_sibling
        worldwideVal = str_worldwideVal.replace('$','')
        worldwideVal = worldwideVal.replace(',','')
        worldwideVal = int(worldwideVal)
    
    # Append data to table and convert to DataFrame
    filmdata = []
    filmdata.append([title, year, imdbRating, imdbRatingQty, budgetVal, openingVal, domesticVal, worldwideVal])
    pdfilmdata = pd.DataFrame(filmdata, columns = ['Title',
                                               'Year',
                                               'IMDB_Rating',
                                               'IMDB_Ratings',
                                               'Budget',
                                               'Opening_Weekend',
                                               'Domestic_Gross',
                                               'Worldwide_Gross'
                                              ])
    return(pdfilmdata)

In [32]:
# Example: Top Gun
imdb_film_data('/title/tt0092099/')

Unnamed: 0,Title,Year,IMDB_Rating,IMDB_Ratings,Budget,Opening_Weekend,Domestic_Gross,Worldwide_Gross
0,Top Gun,1986,6.9,284538,15000000,8193052,179800601,356830601



## Summarizing Filmography Data Scraper
### UX Summary:
* User provides href to complete IMDB actor URL
* Function retrieves the following data:
    * All films with Actor credits
    * Associated href for each film to be used in IMDB film data scraper
* Function returns a DataFrame with all IMDB film data for each film

In [42]:
# Function input is IMDB actor href
# Ref. Development - IMDB Actor Filmography Jupyter Notebook for more details

def imdb_filmography_data(href):
    # Append href input to full IMDB URL
    url = 'https://www.imdb.com' + href
    
    # Parse URL with BeautifulSoup
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    
    # Retrieve filmography data
    films = soup.find_all('div', id = re.compile('^actor-'))
    
    # Basic:
    Append data to table and convert to DataFrame
    filmsarray = []
    for film in films:
        filmsarray.append([film.a.text, film.a.get('href')])
    pdfilms = pd.DataFrame(filmsarray, columns = ['Title', 'href'])
    return(pdfilms)

#     # Advanced:
#     # Apply each href in table to IMDB film data function and convert to DataFrame
#     filmsarray = []
#     for film in films:
#         filmsarray.append(imdb_film_data(film.a.get('href')))
#     pdfilms = pd.DataFrame(filmsarray)
#     return(pdfilms)    


SyntaxError: invalid syntax (<ipython-input-42-e2fe4538b41f>, line 16)

In [41]:
# Example: Tom Cruise
# imdb_filmography_data('/name/nm0000129/')

# Example: Tom Holland
imdb_filmography_data('/name/nm4043618/')


ValueError: invalid literal for int() with base 10: 'EUR4749500\n            '