# Imports

In [1]:
from tqdm.notebook import tqdm_notebook
import time
import tmdbsimple as tmdb
import pandas as pd
import numpy as np

In [2]:
import os
# Root folder for this project's data files; the per-year JSON and CSV outputs
# produced later in the notebook are written under it.
FOLDER = "Data/MovieProject/"
# Create the folder when it is missing; exist_ok=True makes re-running a no-op.
os.makedirs(FOLDER, exist_ok=True)
# Display what is already stored in the folder.
os.listdir(FOLDER)

['.ipynb_checkpoints',
 'akas.csv.gz',
 'basics.csv.gz',
 'Data',
 'final_tmdb_data_2000.csv.gz',
 'final_tmdb_data_2001.csv.gz',
 'ratings.csv.gz',
 'tmdb_2000_2001_results_combined.csv.gz',
 'tmdb_api_results_2000.json',
 'tmdb_api_results_2001.json']

In [3]:
import json
import os

# Locate the credentials file relative to the current user's home directory
# rather than a hard-coded absolute Windows path, so the notebook runs under
# any account that keeps its key in ~/.secret/tmdb_api.json.
SECRET_FILE = os.path.join(os.path.expanduser("~"), ".secret", "tmdb_api.json")
with open(SECRET_FILE) as f:
    login = json.load(f)
# Show only the key names so the secret value never lands in the saved output.
login.keys()

dict_keys(['api-key'])

# API Key

In [4]:
# Give tmdbsimple the key read from the secrets file loaded above.
tmdb.API_KEY = login['api-key']

# Functions

In [5]:
def get_movie_with_rating(movie_id):
    """Fetch a movie's TMDB details and attach its US certification.

    Args:
        movie_id: identifier passed straight to ``tmdb.Movies``.

    Returns:
        The dict returned by ``Movies.info()``. A ``'certification'`` key is
        added only when the releases list contains an entry whose
        ``iso_3166_1`` is ``'US'``; when several US entries exist, the last
        one in the list is used.
    """
    film = tmdb.Movies(movie_id)
    details = film.info()
    # Collect the certification of every US release entry, in list order.
    us_certifications = [
        entry['certification']
        for entry in film.releases()['countries']
        if entry['iso_3166_1'] == 'US'
    ]
    if us_certifications:
        details['certification'] = us_certifications[-1]
    return details

In [6]:
def write_json(new_data, filename):
    """Add ``new_data`` to the JSON list stored in ``filename``.

    Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/

    Args:
        new_data: a list, whose items are added one by one, or any other
            JSON-serialisable value, which is added as a single element.
        filename: path of an existing file whose top-level JSON value is a list.

    Raises:
        FileNotFoundError: if ``filename`` does not exist ('r+' never creates it).
        json.JSONDecodeError: if the file does not hold valid JSON.
        AttributeError: if the stored top-level value has no ``append`` method.
    """
    with open(filename, 'r+') as file:
        # Load the existing records first.
        file_data = json.load(file)
        # Extend when both sides are lists; otherwise add one element.
        if isinstance(new_data, list) and isinstance(file_data, list):
            file_data.extend(new_data)
        else:
            file_data.append(new_data)
        # Rewind and overwrite the file from the start.
        file.seek(0)
        json.dump(file_data, file)
        # Drop any bytes left over from the previous, longer content; without
        # this, a shorter rewrite would leave trailing garbage and invalid JSON.
        file.truncate()

In [7]:
# Years to pull from TMDB: 2002 through 2021 inclusive (range excludes its stop value).
YEARS_TO_GET = list(range(2002, 2022))

In [8]:
# Load the basics title table from the project data folder; the extraction loop
# later filters it by 'startYear' and reads its 'tconst' ids.
# low_memory=False has pandas parse the file in one pass instead of in chunks,
# which avoids mixed-dtype columns.
basics = pd.read_csv("Data//MovieProject//basics.csv.gz", low_memory = False)
# Preview the first rows.
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
4,tt0093119,movie,Grizzly II: Revenge,Grizzly II: The Predator,0,2020,,74,"Horror,Music,Thriller"


# Extract years 2002 - 2021

In [9]:
for YEAR in tqdm_notebook(YEARS_TO_GET, desc='YEARS', position=0):
    # JSON file that accumulates the API results for this year
    JSON_FILE = f'{FOLDER}tmdb_api_results_{YEAR}.json'

    # First run for this year: seed the file with a one-element list holding a
    # placeholder record, so the read below always finds an 'imdb_id' column.
    # NOTE(review): the placeholder (imdb_id 0) is also saved into the yearly
    # CSV; drop it downstream if it is not wanted.
    file_exists = os.path.isfile(JSON_FILE)
    if not file_exists:
        with open(JSON_FILE, 'w') as f:
            json.dump([{'imdb_id': 0}], f)

    # Titles from `basics` whose startYear is the current year
    df = basics.loc[basics['startYear'] == YEAR].copy()
    # Their ids, as a pandas Series
    movie_ids = df['tconst'].copy()

    # Load what has already been saved and skip those ids, so an interrupted
    # run resumes where it stopped instead of re-downloading everything.
    previous_df = pd.read_json(JSON_FILE)
    movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]

    for movie_id in tqdm_notebook(movie_ids_to_get,
                                  desc=f'Movies from {YEAR}',
                                  position=1,
                                  leave=True):
        try:
            # Retrieve the movie and append it to the year's JSON file
            temp = get_movie_with_rating(movie_id)
            write_json(temp, JSON_FILE)
        except Exception:
            # Best effort: record a stub so this id counts as processed and is
            # not requested again on the next run.
            write_json({'budget': 0, 'revenue': 0, 'imdb_id': movie_id,
                        'certification': 'None'}, JSON_FILE)
        # Short 20 ms pause after every attempt, successful or not, so that a
        # run of failed lookups does not hit the server back-to-back.
        time.sleep(0.02)

    # Save the year's accumulated results as a compressed CSV
    final_year_df = pd.read_json(JSON_FILE)
    final_year_df.to_csv(f"{FOLDER}final_tmdb_data_{YEAR}.csv.gz", compression="gzip", index=False)

YEARS:   0%|          | 0/20 [00:00<?, ?it/s]

Movies from 2002:   0%|          | 0/1505 [00:00<?, ?it/s]

Movies from 2003:   0%|          | 0/1631 [00:00<?, ?it/s]

Movies from 2004:   0%|          | 0/1833 [00:00<?, ?it/s]

Movies from 2005:   0%|          | 0/2124 [00:00<?, ?it/s]

Movies from 2006:   0%|          | 0/2347 [00:00<?, ?it/s]

Movies from 2007:   0%|          | 0/2483 [00:00<?, ?it/s]

Movies from 2008:   0%|          | 0/2824 [00:00<?, ?it/s]

Movies from 2009:   0%|          | 0/3451 [00:00<?, ?it/s]

Movies from 2010:   0%|          | 0/3749 [00:00<?, ?it/s]

Movies from 2011:   0%|          | 0/4143 [00:00<?, ?it/s]

Movies from 2012:   0%|          | 0/4425 [00:00<?, ?it/s]

Movies from 2013:   0%|          | 0/4621 [00:00<?, ?it/s]

Movies from 2014:   0%|          | 0/4771 [00:00<?, ?it/s]

Movies from 2015:   0%|          | 0/4935 [00:00<?, ?it/s]

Movies from 2016:   0%|          | 0/5146 [00:00<?, ?it/s]

Movies from 2017:   0%|          | 0/5509 [00:00<?, ?it/s]

Movies from 2018:   0%|          | 0/5633 [00:00<?, ?it/s]

Movies from 2019:   0%|          | 0/5677 [00:00<?, ?it/s]

Movies from 2020:   0%|          | 0/4769 [00:00<?, ?it/s]

Movies from 2021:   0%|          | 0/4716 [00:00<?, ?it/s]

In [10]:
# Load the per-year result files for 2002-2021 with one parameterised call
# instead of twenty copy-pasted ones. The individual df_YYYY names are kept
# because the following cells refer to them; the number of names on the left
# must match the number of years in the range (20).
(
    df_2002, df_2003, df_2004, df_2005, df_2006,
    df_2007, df_2008, df_2009, df_2010, df_2011,
    df_2012, df_2013, df_2014, df_2015, df_2016,
    df_2017, df_2018, df_2019, df_2020, df_2021,
) = (
    pd.read_csv(f"Data/MovieProject/final_tmdb_data_{year}.csv.gz", low_memory = False)
    for year in range(2002, 2022)
)

In [11]:
# Stack consecutive years pairwise into two-year frames.
(
    df_2002_2003, df_2004_2005, df_2006_2007, df_2008_2009, df_2010_2011,
    df_2012_2013, df_2014_2015, df_2016_2017, df_2018_2019, df_2020_2021,
) = (
    pd.concat([earlier, later])
    for earlier, later in (
        (df_2002, df_2003),
        (df_2004, df_2005),
        (df_2006, df_2007),
        (df_2008, df_2009),
        (df_2010, df_2011),
        (df_2012, df_2013),
        (df_2014, df_2015),
        (df_2016, df_2017),
        (df_2018, df_2019),
        (df_2020, df_2021),
    )
)

In [12]:
# Stack the two-year frames pairwise into four-year frames.
(
    df_2002_2005, df_2006_2009, df_2010_2013, df_2014_2017, df_2018_2021,
) = (
    pd.concat([earlier, later])
    for earlier, later in (
        (df_2002_2003, df_2004_2005),
        (df_2006_2007, df_2008_2009),
        (df_2010_2011, df_2012_2013),
        (df_2014_2015, df_2016_2017),
        (df_2018_2019, df_2020_2021),
    )
)

In [13]:
# Stack the four-year frames pairwise into eight-year frames.
df_2002_2009, df_2010_2017 = (
    pd.concat([earlier, later])
    for earlier, later in (
        (df_2002_2005, df_2006_2009),
        (df_2010_2013, df_2014_2017),
    )
)

In [14]:
# Stack the two eight-year frames into a single 2002-2017 frame.
df_2002_2017 = pd.concat([df_2002_2009, df_2010_2017])

In [15]:
# Add the 2018-2021 frame to obtain the full 2002-2021 frame.
df_2002_2021 = pd.concat([df_2002_2017, df_2018_2021])

In [16]:
# Save the combined 2002-2021 results as a gzip-compressed CSV; index=False
# keeps the row index out of the file.
df_2002_2021.to_csv("Data/MovieProject/tmdb_2002_2021_results_combined.csv.gz", compression = "gzip", index = False)

In [17]:
# Load the combined 2000-2001 results file that already exists in the data folder.
df1 = pd.read_csv("Data/MovieProject/tmdb_2000_2001_results_combined.csv.gz")
# Preview the first three rows.
df1.head(3)

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt0113026,0.0,/vMFs7nw6P0bIV1jDsQpxAieAVnH.jpg,,10000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '...",,62127.0,en,The Fantasticks,...,0.0,86.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Try to remember the first time magic happened,The Fantasticks,0.0,5.4,21.0,
2,tt0113092,0.0,,,0.0,"[{'id': 878, 'name': 'Science Fiction'}]",,110977.0,en,For the Cause,...,0.0,100.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,The ultimate showdown on a forbidden planet.,For the Cause,0.0,4.4,7.0,


In [18]:
# Reload the combined 2002-2021 results file from disk.
df2 = pd.read_csv("Data/MovieProject/tmdb_2002_2021_results_combined.csv.gz")
# Preview the first three rows.
df2.head(3)

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt0096056,0.0,/95U3MUDXu4xSCmVLtWgargRipDi.jpg,,0.0,"[{'id': 18, 'name': 'Drama'}]",,109809.0,en,Crime and Punishment,...,0.0,126.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,Crime and Punishment,0.0,5.5,11.0,
2,tt0118926,0.0,/p3BzCgX1gDIPdWfuFqRHIe52Ynf.jpg,,0.0,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",,20689.0,en,The Dancer Upstairs,...,5227348.0,132.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,"An honest man caught in a world of intrigue, p...",The Dancer Upstairs,0.0,6.3,50.0,R


In [19]:
# Stack the 2000-2001 results (df1) on top of the 2002-2021 results (df2).
df_ALL = pd.concat([df1, df2])

In [20]:
# Save the combined results for all years as a gzip-compressed CSV, without the row index.
df_ALL.to_csv("Data/MovieProject/tmdb_ALL_results_combined.csv.gz", compression = "gzip", index = False)

In [21]:
# Read the all-years file back from disk to check what was written.
df_final = pd.read_csv("Data/MovieProject/tmdb_ALL_results_combined.csv.gz")
# Preview the first rows.
df_final.head()

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt0113026,0.0,/vMFs7nw6P0bIV1jDsQpxAieAVnH.jpg,,10000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '...",,62127.0,en,The Fantasticks,...,0.0,86.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Try to remember the first time magic happened,The Fantasticks,0.0,5.4,21.0,
2,tt0113092,0.0,,,0.0,"[{'id': 878, 'name': 'Science Fiction'}]",,110977.0,en,For the Cause,...,0.0,100.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,The ultimate showdown on a forbidden planet.,For the Cause,0.0,4.4,7.0,
3,tt0115937,,,,0.0,,,,,,...,0.0,,,,,,,,,
4,tt0116391,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",,442869.0,hi,Gang,...,0.0,152.0,"[{'english_name': 'Hindi', 'iso_639_1': 'hi', ...",Released,,Gang,0.0,0.0,0.0,


In [22]:
# Summarise the combined table: row count, column dtypes and non-null counts.
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79234 entries, 0 to 79233
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_id                79234 non-null  object 
 1   adult                  60067 non-null  float64
 2   backdrop_path          36928 non-null  object 
 3   belongs_to_collection  3732 non-null   object 
 4   budget                 79212 non-null  float64
 5   genres                 60067 non-null  object 
 6   homepage               14763 non-null  object 
 7   id                     60067 non-null  float64
 8   original_language      60067 non-null  object 
 9   original_title         60067 non-null  object 
 10  overview               58691 non-null  object 
 11  popularity             60067 non-null  float64
 12  poster_path            54292 non-null  object 
 13  production_companies   60067 non-null  object 
 14  production_countries   60067 non-null  object 
 15  re