# API Calls

**Goal:** Extract the budget, revenue, and MPAA Rating (G/PG/PG-13/R; also called "Certification") from The Movie Database (TMDB; https://www.themoviedb.org/)

## Specifications - Financial Data

- Your stakeholder would like you to extract and save the results for movies that meet all of the criteria established in part 1 of the project (the filtered dataframe saved from part one as a csv.gz file).

- Each year should be saved as a separate .csv.gz file.

# 1. Preliminary Steps

In [3]:
# imports
import json, os, time
import tmdbsimple as tmdb
import pandas as pd
from tqdm.notebook import tqdm_notebook
import glob

In [2]:
# load api key
with open('/Users/yang0108/.secret/tmdb_api.json') as f:
    login = json.load(f)

login.keys()

dict_keys(['api-key'])

In [3]:
# set API_KEY variable
tmdb.API_KEY = login['api-key']

In [4]:
# make folder for API call data
FOLDER = "Data/final_tmdb_data_by_year/"
os.makedirs(FOLDER, exist_ok = True)
os.listdir(FOLDER)

['final_tmdb_data_2000.csv.gz',
 'final_tmdb_data_2001.csv.gz',
 'final_tmdb_data_2002.csv.gz',
 'final_tmdb_data_2003.csv.gz',
 'final_tmdb_data_2004.csv.gz',
 'final_tmdb_data_2005.csv.gz',
 'final_tmdb_data_2006.csv.gz',
 'final_tmdb_data_2007.csv.gz',
 'final_tmdb_data_2008.csv.gz',
 'final_tmdb_data_2009.csv.gz',
 'final_tmdb_data_2010.csv.gz',
 'final_tmdb_data_2011.csv.gz',
 'final_tmdb_data_2012.csv.gz',
 'final_tmdb_data_2013.csv.gz',
 'final_tmdb_data_2014.csv.gz',
 'final_tmdb_data_2015.csv.gz',
 'final_tmdb_data_2016.csv.gz',
 'final_tmdb_data_2017.csv.gz',
 'final_tmdb_data_2018.csv.gz',
 'final_tmdb_data_2019.csv.gz',
 'final_tmdb_data_2020.csv.gz',
 'final_tmdb_data_2021.csv.gz',
 'final_tmdb_data_2022.csv.gz']

# 2. Helper Functions

In [5]:
# function to add certification to movie.info dict
def get_movie_with_rating(movie_id):
    """Adapted from source = https://github.com/celiao/tmdbsimple"""
    
    # set movie id
    movie = tmdb.Movies(movie_id)

    # save the .info and .releases dictionaries
    info = movie.info()
    releases = movie.releases()
    
    # only get releases of the movie in the US
    for c in releases['countries']:
        
        # if country abbreviation == US
        if c['iso_3166_1'] == 'US':
            
            # save certification key into info dict
            info['certification'] = c['certification']
            
    return info

In [6]:
# test get_certification function
movie_id = 'tt0133093'
info = get_movie_with_rating(movie_id)
info['certification']

'R'

In [7]:
def write_json(new_data, filename):
    """Appends a list of records (new_data) to a json file
    (filename). Adapted from: https://www.geeksforgeeks.org/
    append-to-json-file-using-python/"""
    
    with open(filename, 'r+') as file:
        
        # load existing data from file into dict
        file_data = json.load(file)
        
        # if both types of data are lists, extend
        if (type(new_data) == list) & (type(file_data) == list):
            file_data.extend(new_data)
            
        # else, append
        else:
            file_data.append(new_data)
            
        # set current position at offset
        file.seek(0)
        
        # convert to json
        json.dump(file_data, file)

# 3. Load Data

In [8]:
# load basics from Data folder
basics = pd.read_csv('Data/title_basics.csv.gz')

# check
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126,Drama


# 4. API Calls

In [11]:
# create list of years to extract from API
# YEARS_TO_GET = list(range(2000, 2023))

YEARS_TO_GET = [2022]
print(YEARS_TO_GET)

[2022]


In [12]:
# create empty list to save errors
errors = []

In [13]:
# iterate over YEARS_TO_GET (outer loop)

# set up progress bar for outer loop
for YEAR in tqdm_notebook(YEARS_TO_GET, desc = 'YEARS', position = 0):
    
    # select JSON_FILE name to save results in progress
    JSON_FILE = f'{FOLDER}tmdb_api_results_{YEAR}.json'
    
    # check if file already exists
    file_exists = os.path.isfile(JSON_FILE)
    if file_exists == False:
        
        # save an empty dict with "imdb_id" = 0 to new file
        with open(JSON_FILE, 'w') as f:
            json.dump([{'imdb_id': 0}], f)
    
    # define df for current year
    df = basics.loc[basics['startYear'] == YEAR].copy()
    
    # save movie ids from df['tconst'] to list
    movie_ids = df['tconst'].copy()
    
    # load any existing data from json into previous_df
    previous_df = pd.read_json(JSON_FILE)
    
    # filter out any movies that already exist in JSON_FILE
    # (so as not to repeat any API calls for the same movie)
    movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]
    
    # iterate over movie_ids_to_get (inner loop)
    # some movies with imdb_ids don't exist in tmdb, so use
    # try/except to work through errors
    for movie_id in tqdm_notebook(movie_ids_to_get,
                                 desc = f'Movies from {YEAR}',
                                 position = 1,
                                 leave = True):
        
        try:
            # retrieve data for movie_id
            temp = get_movie_with_rating(movie_id)
            
            # append/extend results to JSON_FILE
            write_json(temp, JSON_FILE)
            
            # pause to prevent overwhelming server with calls
            time.sleep(0.02)
            
        except Exception as e:
            errors.append([movie_id, e])
            
    # save year's results in csv.gz file
    final_year_df = pd.read_json(JSON_FILE)
    final_year_df.to_csv(f"{FOLDER}final_tmdb_data_{YEAR}.csv.gz",
                        compression = "gzip",
                        index = False)

YEARS:   0%|          | 0/1 [00:00<?, ?it/s]

Movies from 2022:   0%|          | 0/4987 [00:00<?, ?it/s]

In [15]:
# print number of errors encountered
print(f"- Total errors: {len(errors)}")

- Total errors: 1209


# 5. Combining Results

In [5]:
# check head and info of each file, save each to variable

# use glob to get all the .csv.gz files in FOLDER
csv_files = glob.glob(FOLDER + '*.csv.gz')

# create empty dictionary to store dfs
dfs = {}

# iterate through files
for file in csv_files:
    file_name = file.split('/')[-1].split('.')[0][-4:]
    dfs[file_name] = pd.read_csv(file)
    print(file_name)
    print(dfs[file_name].info())
    print(dfs[file_name].head())
    print()

2000
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1239 entries, 0 to 1238
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_id                1239 non-null   object 
 1   adult                  1238 non-null   float64
 2   backdrop_path          687 non-null    object 
 3   belongs_to_collection  116 non-null    object 
 4   budget                 1238 non-null   float64
 5   genres                 1238 non-null   object 
 6   homepage               63 non-null     object 
 7   id                     1238 non-null   float64
 8   original_language      1238 non-null   object 
 9   original_title         1238 non-null   object 
 10  overview               1217 non-null   object 
 11  popularity             1238 non-null   float64
 12  poster_path            1116 non-null   object 
 13  production_companies   1238 non-null   object 
 14  production_countries   1238 non-null   object 
 15 

2006
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1886 entries, 0 to 1885
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_id                1886 non-null   object 
 1   adult                  1885 non-null   float64
 2   backdrop_path          1007 non-null   object 
 3   belongs_to_collection  140 non-null    object 
 4   budget                 1885 non-null   float64
 5   genres                 1885 non-null   object 
 6   homepage               310 non-null    object 
 7   id                     1885 non-null   float64
 8   original_language      1885 non-null   object 
 9   original_title         1885 non-null   object 
 10  overview               1843 non-null   object 
 11  popularity             1885 non-null   float64
 12  poster_path            1643 non-null   object 
 13  production_companies   1885 non-null   object 
 14  production_countries   1885 non-null   object 
 15 

2011
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3013 entries, 0 to 3012
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_id                3013 non-null   object 
 1   adult                  3012 non-null   float64
 2   backdrop_path          1653 non-null   object 
 3   belongs_to_collection  219 non-null    object 
 4   budget                 3012 non-null   float64
 5   genres                 3012 non-null   object 
 6   homepage               810 non-null    object 
 7   id                     3012 non-null   float64
 8   original_language      3012 non-null   object 
 9   original_title         3012 non-null   object 
 10  overview               2938 non-null   object 
 11  popularity             3012 non-null   float64
 12  poster_path            2626 non-null   object 
 13  production_companies   3012 non-null   object 
 14  production_countries   3012 non-null   object 
 15 

2015
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3835 entries, 0 to 3834
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_id                3835 non-null   object 
 1   adult                  3834 non-null   float64
 2   backdrop_path          2469 non-null   object 
 3   belongs_to_collection  238 non-null    object 
 4   budget                 3834 non-null   float64
 5   genres                 3834 non-null   object 
 6   homepage               1037 non-null   object 
 7   id                     3834 non-null   float64
 8   original_language      3834 non-null   object 
 9   original_title         3834 non-null   object 
 10  overview               3743 non-null   object 
 11  popularity             3834 non-null   float64
 12  poster_path            3550 non-null   object 
 13  production_companies   3834 non-null   object 
 14  production_countries   3834 non-null   object 
 15 

2018
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4682 entries, 0 to 4681
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_id                4682 non-null   object 
 1   adult                  4681 non-null   float64
 2   backdrop_path          3633 non-null   object 
 3   belongs_to_collection  271 non-null    object 
 4   budget                 4681 non-null   float64
 5   genres                 4681 non-null   object 
 6   homepage               1266 non-null   object 
 7   id                     4681 non-null   float64
 8   original_language      4681 non-null   object 
 9   original_title         4681 non-null   object 
 10  overview               4597 non-null   object 
 11  popularity             4681 non-null   float64
 12  poster_path            4585 non-null   object 
 13  production_companies   4681 non-null   object 
 14  production_countries   4681 non-null   object 
 15 

2021
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4013 entries, 0 to 4012
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_id                4013 non-null   object 
 1   adult                  4012 non-null   float64
 2   backdrop_path          3134 non-null   object 
 3   belongs_to_collection  231 non-null    object 
 4   budget                 4012 non-null   float64
 5   genres                 4012 non-null   object 
 6   homepage               1161 non-null   object 
 7   id                     4012 non-null   float64
 8   original_language      4012 non-null   object 
 9   original_title         4012 non-null   object 
 10  overview               3960 non-null   object 
 11  popularity             4012 non-null   float64
 12  poster_path            3939 non-null   object 
 13  production_companies   4012 non-null   object 
 14  production_countries   4012 non-null   object 
 15 

In [6]:
# concatenate dataframes
concat_df = pd.concat(dfs.values(), ignore_index = True)

# check
concat_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66778 entries, 0 to 66777
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_id                66778 non-null  object 
 1   adult                  66755 non-null  float64
 2   backdrop_path          43661 non-null  object 
 3   belongs_to_collection  4435 non-null   object 
 4   budget                 66755 non-null  float64
 5   genres                 66755 non-null  object 
 6   homepage               16251 non-null  object 
 7   id                     66755 non-null  float64
 8   original_language      66755 non-null  object 
 9   original_title         66755 non-null  object 
 10  overview               65369 non-null  object 
 11  popularity             66755 non-null  float64
 12  poster_path            61654 non-null  object 
 13  production_companies   66755 non-null  object 
 14  production_countries   66755 non-null  object 
 15  re

In [8]:
# save merged df to file
concat_df.to_csv(f"Data/tmdb_results_2000_to_2022.csv.gz",
          compression = "gzip",
          index = False)