# Introduction
<p> The stakeholders noticed that the IMDB dataset contains no financial information. We will use the TMDB API to supply the missing financial data. </p>

Imports

In [1]:
import json
import tmdbsimple as tmdb
import pandas as pd
import os
from time import sleep

Load API Key and start TMDB interactions

In [2]:
# Read the stored credentials, then register the v3 key with tmdbsimple.
with open('creds.json') as creds_file:
    login = json.load(creds_file)

tmdb.API_KEY = login['tmdb-api-key-v3']

Here we define two custom functions: one that fetches a movie's information and adds its US certification, and one that appends to or extends the JSON files.

In [3]:
# Use a movie ID to get the financials and certification
def movie_info(movie_id):
    """Fetch a movie's TMDB record and attach its US certification.

    Parameters
    ----------
    movie_id
        Identifier handed unchanged to ``tmdb.Movies``.

    Returns
    -------
    dict
        The dictionary returned by ``Movies.info()``. A ``'certification'``
        key is added from the first ``'US'`` entry of the release list; when
        the release list has no US entry the key is not added.
    """
    movie = tmdb.Movies(movie_id)
    info = movie.info()
    # Work from the payload returned by the call instead of relying on the
    # attribute it sets on ``movie`` as a side effect. A payload without a
    # 'countries' list then yields a record with no certification rather
    # than an AttributeError that would discard the whole record.
    releases = movie.releases()
    for country in releases.get('countries', []):
        if country.get('iso_3166_1') == 'US':
            # Only the first US entry is used, even if later ones exist.
            info['certification'] = country.get('certification')
            break
    return info

In [4]:
# Adapted from the Coding Dojo LP helper
def write_json(new_data, filename):
    """Add ``new_data`` to the JSON array stored in ``filename``.

    The file must already exist and contain a JSON list. If ``new_data`` is
    itself a list its items are added one by one (extend); anything else is
    added as a single element (append). The whole file is then rewritten.

    Parameters
    ----------
    new_data
        A JSON-serialisable value, or a list of such values.
    filename
        Path of an existing JSON file, opened in ``'r+'`` mode.

    Raises
    ------
    FileNotFoundError
        If ``filename`` does not exist (``'r+'`` never creates a file).
    json.JSONDecodeError
        If the current file content is not valid JSON.
    """
    with open(filename, 'r+') as file:
        # Load what is already stored so the new data can be merged into it.
        file_data = json.load(file)
        # Extend when both sides are lists, otherwise append a single item.
        if isinstance(new_data, list) and isinstance(file_data, list):
            file_data.extend(new_data)
        else:
            file_data.append(new_data)
        # Rewind and overwrite from the start of the file.
        file.seek(0)
        json.dump(file_data, file)
        # Drop leftover bytes when the new text is shorter than the old one
        # (e.g. the file was pretty-printed); otherwise the JSON is corrupt.
        file.truncate()

For POC, stakeholders want years 2000-2001

In [5]:
# Folder the extraction loop writes its per-year JSON and CSV outputs to.
FOLDER = 'Data'
# Proof-of-concept scope: the start years to fetch.
YEARS = [2000, 2001]

In [6]:
# Load the basics table; it supplies the `tconst` and `startYear` columns used
# by the extraction loop. The path is built from FOLDER so that the input and
# the generated outputs stay in one configurable location.
basics = pd.read_csv(f'{FOLDER}/basics.csv.gz', compression='gzip')

We use a nested loop to go through each of the years we've specified, fetch each movie's information (skipping movies that were already saved), and save each year's results to a .csv.gz file.

In [7]:
# Outer loop over years, inner loop over that year's movie ids
for year in YEARS:
    # Per-year checkpoint file: every fetched record is written to it, so a
    # re-run only requests the ids that are not stored yet.
    JSON_FILE = f'{FOLDER}/basics_in_progress_{year}.json'
    if not os.path.isfile(JSON_FILE):
        print(f"[!] {JSON_FILE} not found. Saving key imdb_id to file.")
        # Seed the file with a placeholder record so it holds a JSON list
        # with an 'imdb_id' field when it is read back below.
        with open(JSON_FILE, 'w') as file:
            json.dump([{'imdb_id': 0}], file)

    # Ids of the titles whose startYear matches the current year
    df = basics.loc[basics['startYear'] == year].copy()
    movie_ids = df['tconst'].copy()

    previous_df = pd.read_json(JSON_FILE)

    # Filter out any ids that are already in the JSON_FILE
    movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]

    # Number of skipped ids per exception type, reported once the year is done
    error_counts = {}
    for movie_id in movie_ids_to_get:
        print(f"Year: {year} Movie: {movie_id}", end='\r')
        try:
            temp = movie_info(movie_id)
            write_json(temp, JSON_FILE)
            # Brief pause after each successful request
            sleep(0.02)
        except Exception as err:
            # Best effort: one failing id must not stop the run, but the
            # failure is counted instead of being silently discarded. The id
            # is not written to JSON_FILE, so a later run will retry it.
            error_name = type(err).__name__
            error_counts[error_name] = error_counts.get(error_name, 0) + 1
            continue

    if error_counts:
        skipped = sum(error_counts.values())
        print(f"\n[!] Year {year}: skipped {skipped} of {len(movie_ids_to_get)} ids after errors: {error_counts}")

    # Save the year's results as a compressed CSV.
    # NOTE: the placeholder record seeded above is part of JSON_FILE, so it
    # is exported as a row as well.
    final_year_df = pd.read_json(JSON_FILE)
    final_year_df.to_csv(f"{FOLDER}/final_tmdb_data_{year}.csv.gz", compression="gzip", index=False)
    print(f"Saved {FOLDER}/final_tmdb_data_{year}.csv.gz")

[!] Data/basics_in_progress_2000.json not found. Saving key imdb_id to file.
[!] Data/basics_in_progress_2001.json not found. Saving key imdb_id to file.
Year: 2001 Movie: tt95784622