In [1]:
# imports
import os, time, math, json
import numpy as np
import pandas as pd
import tmdbsimple as tmdb
from tqdm.notebook import tqdm_notebook

Your stakeholder would like you to extract and save the TMDB API results for movies that meet all of the criteria established in Part 1 of the project. (You should already have a filtered dataframe saved from Part 1 as a .csv.gz file.)

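As a proof of concept, they requested that you perform a test extraction of movies with a start year of 2000 or 2001. (Note: the configuration cell below is currently set to extract start years 2010 through 2020.)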

Each year's results should be saved as a separate .csv.gz file.

In [2]:
# api credentials: read the TMDB login from a JSON file in the current user's
# ~/.secret directory. The path is expanded from "~" instead of hard-coding one
# machine's absolute home directory, so the notebook runs for any user.
with open(os.path.expanduser('~/.secret/tmdb_api.json'), 'r') as f:
    login = json.load(f)
# show only the key names so the secret value itself is never rendered
login.keys()

dict_keys(['api-key'])

In [3]:
# hand the key loaded from the credentials file to tmdbsimple by setting
# its module-level API_KEY attribute
tmdb.API_KEY = login['api-key']

In [7]:
# directory prefix used when building the per-year output file names
FOLDER = "Data/"

# start years to extract: 2010 through 2020 inclusive
years_wanted = [*range(2010, 2021)]

In [8]:
# function for getting US certification
def get_certification(imdb_id):
    """Fetch a movie's info dict and attach its US certification.

    Args:
        imdb_id: Identifier passed to ``tmdb.Movies``.

    Returns:
        The dict returned by ``.info()``. A ``'certification'`` key is added
        when the release data contains an entry whose ``'iso_3166_1'`` is
        ``'US'``; if several such entries exist, the last one wins. When no
        US entry exists, this function does not add the key.
    """
    film = tmdb.Movies(imdb_id)
    # request the info first, then the release data
    details = film.info()
    release_data = film.releases()
    for country in release_data['countries']:
        # skip everything that is not a US release entry
        if country['iso_3166_1'] != 'US':
            continue
        details['certification'] = country['certification']
    return details

In [9]:
# function to add new results to existing json
def write_json(new_data, filename):
    """Add new results to the JSON list stored in an existing file.

    Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/

    Args:
        new_data: A single record to append, or a list of records to extend
            the stored list with.
        filename: Path to an existing JSON file whose top-level value is a
            list. The file is opened in 'r+' mode, so it is not created here.

    Raises:
        FileNotFoundError: If ``filename`` does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
        AttributeError: If the stored top-level value has no ``append``
            (for example, it is a dict rather than a list).
    """
    with open(filename, 'r+') as file:
        # r+ = read and write on the same handle
        file_data = json.load(file)
        # a list of new records is merged in; anything else is one record
        if isinstance(new_data, list) and isinstance(file_data, list):
            file_data.extend(new_data)
        else:
            file_data.append(new_data)
        # rewind and overwrite the old contents from the start
        file.seek(0)
        json.dump(file_data, file)
        # drop leftover bytes if the new serialization is shorter than the old
        # one (e.g. the file was pretty-printed); otherwise the JSON is corrupt
        file.truncate()

In [None]:
# load basics df once: it is the same for every year, so reading it inside
# the loop only repeated the same disk read and parse
basics = pd.read_csv('Data/title_basics.csv.gz')

# outer loop: one pass per requested year
# progress bar
for YEAR in tqdm_notebook(years_wanted, desc='YEARS', position=0):
    # define json file for saving results
    JSON_FILE = f'{FOLDER}tmdb_api_results_{YEAR}.json'

    # check if file exists
    file_exists = os.path.isfile(JSON_FILE)
    # if file doesn't exist, make it
    if not file_exists:
        # seed the new json with a one-element list holding a placeholder
        # record, so the file loads as a frame with an 'imdb_id' column
        with open(JSON_FILE, 'w') as f:
            json.dump([{"imdb_id":0}], f)

    # save new year as df
    df = basics.loc[basics['startYear']==YEAR]
    # save movie ids to list
    movie_ids = df['tconst']

    # load existing data from json to prev_df
    prev_df = pd.read_json(JSON_FILE)
    # filter any ids already in the file, so an interrupted run can resume
    movie_ids_to_get = movie_ids[~movie_ids.isin(prev_df['imdb_id'])].tolist()

    # (movie_id, error) pairs for every request that failed this year
    failed_ids = []

    # inner loop
    # iterate through movie ids and get api calls
    for movie_id in tqdm_notebook(movie_ids_to_get,
                                 desc=f'Movies from {YEAR}',
                                 position=1,
                                 leave=True):
        # try to retrieve data for movie id
        try:
            temp = get_certification(movie_id)
            # append/extend results to existing file
            write_json(temp, JSON_FILE)
        # best-effort: a failed movie is skipped (nothing is written for it),
        # but the failure is recorded instead of being silently discarded
        except Exception as e:
            failed_ids.append((movie_id, repr(e)))
        finally:
            # pause after every request, successful or not, to not overload server
            time.sleep(.02)

    # surface failures so a systematic problem (e.g. a rejected API key)
    # is not mistaken for a successful run
    if failed_ids:
        print(f"{YEAR}: skipped {len(failed_ids)} of {len(movie_ids_to_get)} movies; "
              f"first error: {failed_ids[0][1]}")

    # save results as a .csv.gz file
    final_year_df = pd.read_json(JSON_FILE)
    final_year_df.to_csv(f"{FOLDER}final_tmdb_data_{YEAR}.csv.gz",
                        compression='gzip',
                        index=False)

YEARS:   0%|          | 0/11 [00:00<?, ?it/s]

Movies from 2010:   0%|          | 0/3756 [00:00<?, ?it/s]

Movies from 2011:   0%|          | 0/4142 [00:00<?, ?it/s]

Movies from 2012:   0%|          | 0/4432 [00:00<?, ?it/s]

Movies from 2013:   0%|          | 0/4635 [00:00<?, ?it/s]

Movies from 2014:   0%|          | 0/4780 [00:00<?, ?it/s]

Movies from 2015:   0%|          | 0/4939 [00:00<?, ?it/s]