## Description

#### Purpose: To scrape movie details from Box Office Mojo using each movie's IMDb ID.

#### Input: `2.1.5_IMDb_IDs.csv`

#### Outputs: `2.1.6_BOM_Data.csv`

This notebook houses functions that scrape the revenue, title, and other movie metadata from Box Office Mojo for a given IMDb ID.

In [None]:
!pip install beautifulsoup4
!pip3 install html5lib   

In [None]:
import csv
import re

import html5lib
import numpy as np
import requests
from bs4 import BeautifulSoup

In [None]:
# This gets the worldwide revenues
def get_worldwide_rev(sp):
    """Extract the revenue figures from the performance-summary table.

    Parameters
    ----------
    sp : BeautifulSoup
        Parsed Box Office Mojo title page.

    Returns
    -------
    dict
        Maps each revenue label (the first purely alphabetic word of an
        ``a-size-small`` span) to its dollar amount as an ``int``. Labels
        whose amount has no ``money`` span are omitted. An empty dict is
        returned when the page has no performance-summary table.
    """
    table = sp.find("div", attrs={"class": "a-section a-spacing-none mojo-performance-summary-table"})
    if table is None:
        return {}

    # Keep labels in document order; spans without an alphabetic word are not labels.
    labels = []
    for span in table.find_all("span", attrs={"class": "a-size-small"}):
        words = [word for word in span.get_text().split() if word.isalpha()]
        if words:
            labels.append(words[0])

    # Pair each label with the amount span at the same position *before*
    # dropping empty amounts, so a missing figure cannot shift later values
    # onto the wrong label. zip() also stops safely if the counts differ.
    amounts = table.find_all("span", attrs={"class": "a-size-medium a-text-bold"})

    rev = {}
    for label, amount in zip(labels, amounts):
        money = amount.find("span", attrs={"class": "money"})
        if money is None:
            continue
        match = re.search(r"\$\s*(\d[\d,]*)", money.get_text())
        if match:
            rev[label] = int(match.group(1).replace(",", ""))
    return rev

In [None]:
# Gets all data available in the page
def _format_summary_value(raw):
    """Clean one summary value: collapse whitespace, drop the trailing
    "See full ..." link text, and turn a dollar amount into an ``int``."""
    text = re.sub(r"[ \n]+", " ", raw).strip()
    if "See full" in text:
        return text.split("See full")[0].strip()
    match = re.search(r"\$\s*(\d[\d,]*)", text)
    if match:
        return int(match.group(1).replace(",", ""))
    # No parsable dollar amount: keep the cleaned text as-is.
    return text


def get_data(sp):
    """Collect the label/value pairs from the summary-values table.

    Parameters
    ----------
    sp : BeautifulSoup
        Parsed Box Office Mojo title page.

    Returns
    -------
    dict
        Maps each row label (text of the row's first ``span``, stripped) to
        its value: an ``int`` for dollar amounts, otherwise a cleaned string.
        Rows whose label mentions "IMDbPro" are skipped. An empty dict is
        returned when the page has no summary-values table.
    """
    table = sp.find("div", attrs={"class": "a-section a-spacing-none mojo-summary-values mojo-hidden-from-mobile"})
    if table is None:
        return {}

    data_dict = {}
    for row in table.find_all("div", attrs={"class": "a-section a-spacing-none"}):
        label_span = row.find("span")
        if label_span is None:
            continue
        label = label_span.get_text().strip()
        if not label or "IMDbPro" in label:
            continue
        # The value is the next span in document order after the label.
        # Reading it straight from the label keeps every value attached to
        # its own label instead of re-pairing two separate lists by index.
        value_span = label_span.find_next("span")
        if value_span is None:
            continue
        data_dict[label] = _format_summary_value(value_span.get_text())

    return data_dict

In [None]:
def get_title(sp):
    """Return the text of the page's ``h1`` heading with class ``a-size-extra-large``."""
    heading = sp.find("h1", attrs={"class": "a-size-extra-large"})
    return heading.getText()

In [None]:
def get_data_by_id(id, timeout=30):
    """Download and parse the Box Office Mojo title page for one IMDb ID.

    Parameters
    ----------
    id : str
        IMDb title ID, e.g. ``'tt0848228'``. The name shadows the builtin
        ``id`` but is kept so existing keyword callers continue to work.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests.get`` can block indefinitely on a stalled connection.

    Returns
    -------
    dict
        The summary values from ``get_data`` plus a ``"Title"`` entry from
        ``get_title`` and a ``"Revenue"`` entry from ``get_worldwide_rev``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.RequestException
        On connection problems or when the timeout expires.
    """
    URL = f"https://www.boxofficemojo.com/title/{id}/?ref_=bo_se_r_2"
    page = requests.get(URL, timeout=timeout)
    # Fail here with a clear HTTP error rather than later with an
    # AttributeError while parsing an error page.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html5lib")
    title = get_title(soup)
    revenue = get_worldwide_rev(soup)
    data = get_data(soup)
    data["Title"] = title
    data["Revenue"] = revenue
    return data

In [None]:
# Smoke test: scrape a single example IMDb ID and print the parsed record
sample_record = get_data_by_id('tt0848228')
print(sample_record)

In [None]:
# Function to Run Scrape and Output CSV
def process_csv(input_file, output_file):
    """Scrape every IMDb ID listed in ``input_file`` and write one CSV row each.

    Parameters
    ----------
    input_file : str
        Path to a CSV whose first column holds IMDb IDs. The first row is
        always treated as a header and skipped.
    output_file : str
        Path of the CSV to create (overwritten if it already exists).

    Notes
    -----
    Fields that a title page does not provide are written as empty cells
    instead of aborting the run with a ``KeyError``. The ``Revenue`` column
    receives the string form of the dict returned under ``"Revenue"``.
    """
    # Single source of truth for both the header row and the per-title lookup.
    columns = ['Domestic Distributor', 'Domestic Opening', 'Budget', 'Earliest Release Date', 'MPAA', 'Running Time', 'Genres', 'Title', 'Revenue']

    with open(input_file, 'r') as csv_file, open(output_file, 'w', newline='') as output_csv:
        reader = csv.reader(csv_file)
        writer = csv.writer(output_csv)

        # Skip the input header row
        next(reader, None)

        # Write header to output CSV
        writer.writerow(columns)

        # Process each row in the input CSV
        for row in reader:
            # Blank lines (e.g. a trailing newline) carry no ID to scrape
            if not row or not row[0].strip():
                continue
            imdb_id = row[0].strip()

            # Call get_data_by_id and get the result
            result = get_data_by_id(imdb_id)

            # Write the information to the output CSV; missing fields become ''
            writer.writerow([result.get(column, '') for column in columns])

In [None]:
# Both paths are relative to the notebook's working directory. The output path
# must not start with '/', which would point at the filesystem root.
input_csv_file = '../2.1.5 Get IMDb IDs/Outputs/2.1.5_IMDb_ids.csv'
output_csv_file = 'Outputs/2.1.6_BOM_Data.csv'
process_csv(input_csv_file, output_csv_file)