### LIBRARIES

In [1]:
# Import Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os, json, math, time
import tmdbsimple as tmdb
from tqdm.notebook import tqdm_notebook

### API CREDENTIALS

In [13]:
# Load API Credentials
# The credentials file lives in a '.secret' folder inside the current user's
# home directory. expanduser() resolves '~' at run time, so the notebook is
# not tied to one machine's absolute path.

with open(os.path.expanduser('~/.secret/tmdb_api.json'), 'r') as f:
    creds = json.load(f)

In [14]:
# Check that the credentials were loaded by displaying the keys of the parsed
# JSON. Only the key names are shown here, not the values stored under them.

creds.keys()

dict_keys(['api-key'])

In [15]:
# Give tmdbsimple the API key by setting its module-level API_KEY attribute.
# The value comes from the 'api-key' entry of the loaded credentials.

tmdb.API_KEY = creds['api-key']

### FUNCTION TO ADD RATINGS

In [32]:
# TEST MOVIE
# Smoke-test the API by requesting the info dict for TMDB id 603 and
# displaying it (the output below shows this id is 'The Matrix').

movie = tmdb.Movies(603)
movie_info = movie.info()
movie_info

{'adult': False,
 'backdrop_path': '/y9wuhlrqSHvhTLNVNwKMKe6HZzY.jpg',
 'belongs_to_collection': {'id': 2344,
  'name': 'The Matrix Collection',
  'poster_path': '/bV9qTVHTVf0gkW0j7p7M0ILD4pG.jpg',
  'backdrop_path': '/bRm2DEgUiYciDw3myHuYFInD7la.jpg'},
 'budget': 63000000,
 'genres': [{'id': 28, 'name': 'Action'},
  {'id': 878, 'name': 'Science Fiction'}],
 'homepage': 'http://www.warnerbros.com/matrix',
 'id': 603,
 'imdb_id': 'tt0133093',
 'original_language': 'en',
 'original_title': 'The Matrix',
 'overview': 'Set in the 22nd century, The Matrix tells the story of a computer hacker who joins a group of underground insurgents fighting the vast and powerful computers who now rule the earth.',
 'popularity': 62.049,
 'poster_path': '/f89U3ADr1oiB1s9GkdPOEpXUk5H.jpg',
 'production_companies': [{'id': 79,
   'logo_path': '/tpFpsqbleCzEE2p5EgvUq6ozfCA.png',
   'name': 'Village Roadshow Pictures',
   'origin_country': 'US'},
  {'id': 174,
   'logo_path': '/IuAlhI9eVC9Z8UQWOIDdWRKSEJ.png'

In [31]:
def write_json(new_data, filename):
    """Add new_data to the JSON list stored in an existing file.

    Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/

    The file is read in full, updated in memory, and rewritten in place.

    Parameters
    ----------
    new_data : list or any JSON-serialisable object
        If both new_data and the file's content are lists, the file's list is
        extended with the items of new_data. Otherwise new_data is appended
        to the file's list as a single element.
    filename : str
        Path of an existing JSON file whose top-level value is a list.

    Raises
    ------
    FileNotFoundError
        If filename does not exist (the file is opened in 'r+' mode).
    json.JSONDecodeError
        If the file does not contain valid JSON.
    AttributeError
        If the file's top-level value is not a list (it has no .append).
    """
    with open(filename, 'r+') as file:
        # First we load existing data.
        file_data = json.load(file)

        # Choose extend or append.
        if isinstance(new_data, list) and isinstance(file_data, list):
            file_data.extend(new_data)
        else:
            file_data.append(new_data)

        # Rewind so the updated content overwrites the file from the start.
        file.seek(0)
        # Convert back to json.
        json.dump(file_data, file)
        # Drop any bytes left over from the previous content. Without this, a
        # file that was longer on disk (e.g. pretty-printed) would keep
        # trailing characters and no longer be valid JSON.
        file.truncate()

In [27]:
# This function will add the certification (MPAA rating) to movie.info

def get_movie_with_rating(movie_id):
    """Return the TMDB info dict for a movie with a 'certification' key added.

    Parameters
    ----------
    movie_id
        Identifier passed straight to tmdb.Movies().

    Returns
    -------
    dict
        The dict returned by movie.info(), with 'certification' set to the
        certification of the US entry in movie.releases()['countries'].
        If there are several US entries, the last one wins. If there is no
        US entry, 'certification' is None, so every record has the same keys.

    Any exception raised by the tmdb calls propagates to the caller.
    """
    # Get movie for the current id
    movie = tmdb.Movies(movie_id)

    # Save the .info and .releases dictionaries
    movie_info = movie.info()
    releases = movie.releases()

    # Loop through countries in releases (tolerate a missing 'countries' key)
    for country in releases.get('countries', []):

        # If country abbreviation = US
        if country['iso_3166_1'] == 'US':

            # Save a 'certification' key in the info dict with the certification
            movie_info['certification'] = country['certification']

    # Guarantee the key exists even when no US release was found.
    movie_info.setdefault('certification', None)

    return movie_info

In [28]:
# Test the function on TMDB id 603; the returned dict is displayed as the
# cell output.

get_movie_with_rating(603)

{'adult': False,
 'backdrop_path': '/y9wuhlrqSHvhTLNVNwKMKe6HZzY.jpg',
 'belongs_to_collection': {'id': 2344,
  'name': 'The Matrix Collection',
  'poster_path': '/bV9qTVHTVf0gkW0j7p7M0ILD4pG.jpg',
  'backdrop_path': '/bRm2DEgUiYciDw3myHuYFInD7la.jpg'},
 'budget': 63000000,
 'genres': [{'id': 28, 'name': 'Action'},
  {'id': 878, 'name': 'Science Fiction'}],
 'homepage': 'http://www.warnerbros.com/matrix',
 'id': 603,
 'imdb_id': 'tt0133093',
 'original_language': 'en',
 'original_title': 'The Matrix',
 'overview': 'Set in the 22nd century, The Matrix tells the story of a computer hacker who joins a group of underground insurgents fighting the vast and powerful computers who now rule the earth.',
 'popularity': 62.049,
 'poster_path': '/f89U3ADr1oiB1s9GkdPOEpXUk5H.jpg',
 'production_companies': [{'id': 79,
   'logo_path': '/tpFpsqbleCzEE2p5EgvUq6ozfCA.png',
   'name': 'Village Roadshow Pictures',
   'origin_country': 'US'},
  {'id': 174,
   'logo_path': '/IuAlhI9eVC9Z8UQWOIDdWRKSEJ.png'

### DEFINE EXTRACTION PARAMETERS


In [30]:
# Set Parameters
# Years to extract. The extraction loop runs once per entry and selects the
# rows of the basics table whose 'startYear' equals that entry.

YEARS_TO_GET = [2000, 2001]

### STORAGE FOLDER


In [29]:
# Folder that holds the previously saved data files; list its contents to
# confirm the expected files are present. The trailing slash matters because
# the extraction loop builds file paths by concatenating FOLDER with a name.

FOLDER = "Data/"
os.listdir(FOLDER)

['title_basics.csv.gz', 'title_akas.csv.gz', 'title_ratings.csv.gz']

### LOOP TO EXTRACT & SAVE THE DATA

In [37]:
# Load the basics dataframe from part 1 once, before the loop. It does not
# depend on the year, so re-reading it for every year is wasted work.
basics = pd.read_csv(f'{FOLDER}title_basics.csv.gz')

# Every failed request is recorded here as (year, movie id, error) instead of
# being silently discarded, so failures can be inspected after the run.
failed_requests = []

# Start Outer Loop
for YEAR in tqdm_notebook(YEARS_TO_GET, desc= 'YEARS', position= 0):

    # Define the JSON file to store results for year
    JSON_FILE = f'{FOLDER}tmdb_api_results_{YEAR}.json'

    # Check if file exists
    file_exists = os.path.isfile(JSON_FILE)

    # If it doesn't exist create it
    if not file_exists:

        # Seed the new JSON_FILE with a one-element list holding a placeholder
        # record, so the file has an 'imdb_id' column when it is read back.
        with open(JSON_FILE, 'w') as f:
            json.dump([{'imdb_id': 0}], f)

    # Saving the new year as the current data frame
    df = basics.loc[basics['startYear'] == YEAR].copy()

    # Saving movie ids to list
    movie_ids = df['tconst'].copy()

    # Load Existing Data from JSON into previous_df dataframe
    previous_df = pd.read_json(JSON_FILE)

    # Filter out any ids that are already in the JSON_FILE
    # (~ is the boolean NOT for a pandas mask)
    movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]

    # INNER LOOP over the movie ids still to be fetched
    for movie_id in tqdm_notebook(movie_ids_to_get,
                                  desc= f'Movies from {YEAR}',
                                  position= 1,
                                  leave= True):

        # Attempt to retrieve the data for the movie id
        try:
            # Add rating to movie using function
            temp = get_movie_with_rating(movie_id)

            # Save each result immediately so an interrupted run keeps
            # everything fetched so far.
            write_json(temp, JSON_FILE)

            # Short pause between requests
            time.sleep(0.02)

        # If it fails, note the failure and move on to the next id; nothing
        # is written to the JSON file for this movie.
        except Exception as e:
            failed_requests.append((YEAR, movie_id, repr(e)))
            continue

    # Export everything collected for the year as a compressed CSV
    final_year_df = pd.read_json(JSON_FILE)
    final_year_df.to_csv(f"{FOLDER}final_tmdb_data_{YEAR}.csv.gz",
                         compression= "gzip", index= False)

# Report how many requests failed; details are in failed_requests
print(f'{len(failed_requests)} request(s) failed; see failed_requests for details.')

YEARS:   0%|          | 0/2 [00:00<?, ?it/s]

Movies from 2000:   0%|          | 0/1409 [00:00<?, ?it/s]

Movies from 2001:   0%|          | 0/1525 [00:00<?, ?it/s]