![image.png](attachment:image.png)

# How to Make a Movie Successful

- Kevin Barnett
> Data Dictionary: https://www.imdb.com/interfaces/


## Import Libraries

In [1]:
import pandas as pd
import json
import tmdbsimple as tmdb
import os, json, time
from tqdm.notebook import tqdm_notebook

In [2]:
def write_json(new_data, filename):
    """Append record(s) to the JSON array stored in an existing file.

    Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/

    Parameters
    ----------
    new_data : list or any JSON-serializable object
        When both `new_data` and the file's content are lists, the items of
        `new_data` are added individually (extend); otherwise `new_data` is
        added as a single element (append).
    filename : str
        Path of an existing file that holds valid JSON. It is opened in 'r+'
        mode, so a missing file raises FileNotFoundError.

    Returns
    -------
    None
        The file is rewritten in place.
    """
    with open(filename, 'r+') as f:
        # Load the existing content
        file_data = json.load(f)
        # Choose extend or append
        if isinstance(new_data, list) and isinstance(file_data, list):
            file_data.extend(new_data)
        else:
            file_data.append(new_data)
        # Rewind so the updated content overwrites the old content
        f.seek(0)
        json.dump(file_data, f)
        # Drop any bytes left over from a longer previous version (e.g. a
        # pretty-printed file); without this the file could end with stale
        # text and stop being valid JSON.
        f.truncate()

In [3]:
# Create function to return movies with certifications included
def get_movie_with_rating(movie_id):
    """Return a movie's info dictionary with its US certification added.

    Parameters
    ----------
    movie_id : str or int
        Identifier passed straight to `tmdb.Movies`.

    Returns
    -------
    dict
        The dictionary returned by `.info()`. A 'certification' key is added
        only when the releases contain at least one entry whose
        'iso_3166_1' is 'US'.

    Notes
    -----
    The releases may list several US entries, and some of them can carry an
    empty certification. The last non-empty US certification is kept; the
    empty string is used only when every US entry is empty. Exceptions raised
    by the API calls are not caught here.
    """
    # Get movie object for the current id
    movie = tmdb.Movies(movie_id)

    # Save the .info and .releases dictionaries
    movie_info = movie.info()
    releases = movie.releases()

    us_certification = None
    for country in releases['countries']:
        if country['iso_3166_1'] != 'US':
            continue
        certification = country['certification']
        # Do not let an empty value overwrite a rating that was already found
        if certification or us_certification is None:
            us_certification = certification

    if us_certification is not None:
        movie_info['certification'] = us_certification
    return movie_info

## API Calls

In [4]:
# Load TMDB API credentials from the current user's home directory instead of
# a machine-specific absolute path, so the notebook also runs for other
# accounts that keep the file at ~/.secret/tmbd_api.json.
credentials_path = os.path.join(os.path.expanduser('~'), '.secret', 'tmbd_api.json')
with open(credentials_path, 'r') as f:
    login = json.load(f)

# Check login keys (only the key names are displayed, not their values)
login.keys()

dict_keys(['client-id', 'api-key'])

In [5]:
# Hand the key to tmdbsimple: its module-level API_KEY attribute is set to the
# 'api-key' entry of the credentials dictionary loaded in the previous cell.
tmdb.API_KEY = login['api-key']

In [6]:
# Smoke-test get_movie_with_rating on a single IMDb id and display the
# dictionary it returns.
test = get_movie_with_rating('tt0848228')
test

{'adult': False,
 'backdrop_path': '/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg',
 'belongs_to_collection': {'id': 86311,
  'name': 'The Avengers Collection',
  'poster_path': '/yFSIUVTCvgYrpalUktulvk3Gi5Y.jpg',
  'backdrop_path': '/zuW6fOiusv4X9nnW3paHGfXcSll.jpg'},
 'budget': 220000000,
 'genres': [{'id': 878, 'name': 'Science Fiction'},
  {'id': 28, 'name': 'Action'},
  {'id': 12, 'name': 'Adventure'}],
 'homepage': 'https://www.marvel.com/movies/the-avengers',
 'id': 24428,
 'imdb_id': 'tt0848228',
 'original_language': 'en',
 'original_title': 'The Avengers',
 'overview': 'When an unexpected enemy emerges and threatens global safety and security, Nick Fury, director of the international peacekeeping agency known as S.H.I.E.L.D., finds himself in need of a team to pull the world back from the brink of disaster. Spanning the globe, a daring recruitment effort begins!',
 'popularity': 99.992,
 'poster_path': '/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg',
 'production_companies': [{'id': 420,
   'logo_path'

In [7]:
# Try the helper on a short list of ids, keeping successes and failures apart
test_ids = ["tt0848228", "tt0115937","tt0848228","tt0332280"]
results, errors = [], []
for movie_id in test_ids:
    try:
        movie_info = get_movie_with_rating(movie_id)
    except Exception as exc:
        # Record the id together with the exception for later inspection
        errors.append([movie_id, exc])
    else:
        results.append(movie_info)

# Display the successful lookups as a dataframe
pd.DataFrame(results)

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,False,/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg,"{'id': 86311, 'name': 'The Avengers Collection...",220000000,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",https://www.marvel.com/movies/the-avengers,24428,tt0848228,en,The Avengers,...,1518815515,143,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Some assembly required.,The Avengers,False,7.708,28578,PG-13
1,False,/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg,"{'id': 86311, 'name': 'The Avengers Collection...",220000000,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",https://www.marvel.com/movies/the-avengers,24428,tt0848228,en,The Avengers,...,1518815515,143,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Some assembly required.,The Avengers,False,7.708,28578,PG-13
2,False,/qom1SZSENdmHFNZBXbtJAU0WTlC.jpg,,29000000,"[{'id': 10749, 'name': 'Romance'}, {'id': 18, ...",http://www.newline.com/properties/notebookthe....,11036,tt0332280,en,The Notebook,...,115603229,123,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Behind every great love is a great story.,The Notebook,False,7.879,10324,PG-13


In [8]:
# Report how many ids failed, then display the [movie_id, exception] pairs
# collected by the test loop in the previous cell.
print(f'Number of errors: {len(errors)}')
errors

Number of errors: 1


[['tt0115937',
  requests.exceptions.HTTPError('404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/tt0115937?api_key=<REDACTED>')]]

In [9]:
# Designate folder for API call data (relative to the working directory)
folder = 'Data/'
# Create the folder if it is missing; exist_ok avoids an error on re-runs
os.makedirs(folder, exist_ok=True)
# Display what the folder currently contains
os.listdir(folder)

['final_tmdb_data2000.csv.gz', 'final_tmdb_data2001.csv.gz']

In [10]:
# Load the title basics table into a dataframe.
# NOTE(review): this path is absolute ('/Data/...'), unlike the relative
# `folder` ('Data/') defined in the previous cell - confirm that is intended.
basics = pd.read_csv('/Data/title_basics.csv.gz')
# Print the column summary, then display the first rows
basics.info()
basics.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81670 entries, 0 to 81669
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          81670 non-null  object 
 1   titletype       81670 non-null  object 
 2   primarytitle    81670 non-null  object 
 3   originaltitle   81670 non-null  object 
 4   isadult         81670 non-null  int64  
 5   startyear       81670 non-null  float64
 6   endyear         81670 non-null  object 
 7   runtimeminutes  81670 non-null  int64  
 8   genres          81670 non-null  object 
dtypes: float64(1), int64(2), object(6)
memory usage: 5.6+ MB


Unnamed: 0,tconst,titletype,primarytitle,originaltitle,isadult,startyear,endyear,runtimeminutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,np.nan,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,np.nan,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,np.nan,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,np.nan,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,np.nan,126,Drama


In [16]:
# Years to request from the API: 2000 through 2022 inclusive
years_to_get = list(range(2000, 2023))
type(years_to_get)

list

In [12]:
# Start the full run with an empty error log; the download loop appends a
# [movie_id, exception] pair for every id that fails.
errors = []

In [13]:
# Outer loop: one pass per year in years_to_get
for year in tqdm_notebook(years_to_get, desc='years', position=0):

    # Per-year JSON file that accumulates the raw API results
    json_file = f'{folder}tmdb_api_results{year}.json'

    # On first use, seed the file with a placeholder record so it always
    # holds a JSON list whose entries have an 'imdb_id' field
    file_exists = os.path.isfile(json_file)
    if not file_exists:
        with open(json_file, 'w') as f:
            json.dump([{'imdb_id':0}], f)

    # Rows of `basics` whose start year is the current year, and their ids
    df = basics.loc[basics['startyear'] == year].copy()
    movie_ids = df['tconst'].copy()

    # Skip every id that is already stored in the json file
    previous_df = pd.read_json(json_file)
    already_saved = movie_ids.isin(previous_df['imdb_id'])
    movie_ids_to_get = movie_ids[~already_saved]

    # Inner loop: one lookup per remaining id
    for movie_id in tqdm_notebook(movie_ids_to_get, desc=f'Movies from {year}',
                                  position=1, leave=True):
        try:
            # Fetch the record and append it to the year's file right away
            temp = get_movie_with_rating(movie_id)
            write_json(temp, json_file)
            # 20ms pause to prevent overwhelming server
            time.sleep(0.02)
        except Exception as exc:
            # Log the failure and carry on with the next id
            errors.append([movie_id, exc])

    # Re-read the year's json and save it as a gzip-compressed csv
    final_year_df = pd.read_json(json_file)
    final_year_df.to_csv(f'{folder}final_tmdb_data{year}.csv.gz',
                         compression='gzip', index=False)

years:   0%|          | 0/23 [00:00<?, ?it/s]

Movies from 2000:   0%|          | 0/1446 [00:00<?, ?it/s]

Movies from 2001:   0%|          | 0/1570 [00:00<?, ?it/s]

Movies from 2002:   0%|          | 0/1559 [00:00<?, ?it/s]

Movies from 2003:   0%|          | 0/1679 [00:00<?, ?it/s]

Movies from 2004:   0%|          | 0/1896 [00:00<?, ?it/s]

Movies from 2005:   0%|          | 0/2179 [00:00<?, ?it/s]

Movies from 2006:   0%|          | 0/2426 [00:00<?, ?it/s]

Movies from 2007:   0%|          | 0/2569 [00:00<?, ?it/s]

Movies from 2008:   0%|          | 0/2904 [00:00<?, ?it/s]

Movies from 2009:   0%|          | 0/3546 [00:00<?, ?it/s]

Movies from 2010:   0%|          | 0/3848 [00:00<?, ?it/s]

Movies from 2011:   0%|          | 0/4224 [00:00<?, ?it/s]

Movies from 2012:   0%|          | 0/4510 [00:00<?, ?it/s]

Movies from 2013:   0%|          | 0/4702 [00:00<?, ?it/s]

Movies from 2014:   0%|          | 0/4902 [00:00<?, ?it/s]

Movies from 2015:   0%|          | 0/5055 [00:00<?, ?it/s]

Movies from 2016:   0%|          | 0/5251 [00:00<?, ?it/s]

Movies from 2017:   0%|          | 0/5644 [00:00<?, ?it/s]

Movies from 2018:   0%|          | 0/5772 [00:00<?, ?it/s]

Movies from 2019:   0%|          | 0/5873 [00:00<?, ?it/s]

Movies from 2020:   0%|          | 0/4997 [00:00<?, ?it/s]

Movies from 2021:   0%|          | 0/5118 [00:00<?, ?it/s]

Movies from 2022: 0it [00:00, ?it/s]

In [14]:
# Number of [movie_id, exception] pairs logged by the download loop
print(f'[i] Total errors: {len(errors)}')

[i] Total errors: 18899


In [15]:
# Preview the dataframe left over from the loop's final iteration, i.e. the
# last year in years_to_get.
final_year_df.head()

Unnamed: 0,imdb_id
0,0
