#### Import Dataset

In [66]:
#import libraries
import pandas as pd
import numpy as np

In [67]:
# import basics dataset and convert to cvs
basics_url="https://datasets.imdbws.com/title.basics.tsv.gz"
basics = pd.read_csv(basics_url, sep='\t', low_memory=False)
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10122745 entries, 0 to 10122744
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 695.1+ MB


In [None]:
# import rating dataset and convert to cvs
akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"
akas = pd.read_csv(akas_url, sep='\t', low_memory=False)
akas.info()

In [None]:
# import rating dataset and convert to cvs
rating_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"
rating = pd.read_csv(rating_url, sep='\t', low_memory=False)
rating.info()

In [None]:
#change all \\N to np.nan
basics = basics.replace({'\\N':np.nan})
akas = akas.replace({'\\N':np.nan})
rating = rating.replace({'\\N':np.nan})

### Basics
- Keep only US movies
- Replace "\N" with np.nan
- Eliminate movies that are null for runtimeMinutes
- Eliminate movies that are null for genre
- Keep only titleType==Movie
- Convert the startYear column to float data type.
- Filter the dataframe using startYear. Keep years between 2000-2021 (Including 2000 and 2021)
- Eliminate movies that include "Documentary" in the genre

### US only movies

In [6]:

# Filter the basics table down to only include the US by using the filter akas dataframe
keepers =basics['tconst'].isin(akas['titleId'])
keepers

0            True
1            True
2            True
3            True
4            True
            ...  
10118043     True
10118044     True
10118045     True
10118046     True
10118047    False
Name: tconst, Length: 10118048, dtype: bool

In [62]:
#implament filter
basics = basics[keepers]
#verify changes
basics.info()

  basics = basics[keepers]


<class 'pandas.core.frame.DataFrame'>
Int64Index: 291250 entries, 0 to 292725
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          291250 non-null  object 
 1   titleType       291250 non-null  object 
 2   primaryTitle    291250 non-null  object 
 3   originalTitle   291250 non-null  object 
 4   isAdult         291250 non-null  int64  
 5   startYear       291250 non-null  bool   
 6   endYear         0 non-null       float64
 7   runtimeMinutes  291250 non-null  int64  
 8   genres          291250 non-null  object 
dtypes: bool(1), float64(1), int64(2), object(5)
memory usage: 20.3+ MB


2. Include only movies that were released 2000 - 2021 (include 2000 and 2021)

In [8]:
# coerce all to numeric to be able to filter out the years by numbers
basics['startYear']= pd.to_numeric(basics['startYear'], errors= 'coerce')

In [9]:
# keep anything greater than or equal to 2000
basics['startYear'] = basics['startYear'] >= 2000

In [10]:
# Keep anything less than or equal to 2021
basics['startYear'] = basics['startYear'] <= 2021

3. Include only full-length movies (titleType = "movie").


In [11]:
basics = basics.loc[basics['titleType']== "movie"]

4. Exclude any movie with missing values for genre or runtime

In [12]:
basics = basics.dropna(subset = ['runtimeMinutes', 'genres'])

5. Include only fictional movies (not from the Documentary genre)


In [13]:
# Exclude movies that are included in the documentary category.
is_documentary = basics['genres'].str.contains('Documentary',case=False)
basics = basics[~is_documentary]

In [14]:
# save all new dfs
basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)
akas.to_csv("Data/title_akas.csv.gz",compression='gzip',index=False)

In [15]:
rating.to_csv("Data/title_rating.csv.gz",compression='gzip',index=False)

In [61]:
akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37035964 entries, 0 to 37035963
Data columns (total 8 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   titleId          object
 1   ordering         int64 
 2   title            object
 3   region           object
 4   language         object
 5   types            object
 6   attributes       object
 7   isOriginalTitle  object
dtypes: int64(1), object(7)
memory usage: 2.2+ GB


## API

### Specifications 

In [16]:
import os, time,json
import tmdbsimple as tmdb 
FOLDER = "Data/"
os.makedirs(FOLDER, exist_ok=True)
os.listdir(FOLDER)

['tmdb_api_results_2001.json',
 'title_basics.csv.gz',
 'title_rating.csv.gz',
 'results_in_progress_mov.json',
 'title_akas.csv.gz']

In [17]:
# Standard Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Additional Imports
import os, json, math, time
from tqdm.notebook import tqdm_notebook

In [18]:
def write_json(new_data, filename): 
    """Appends a list of records (new_data) to a json file (filename). 
    Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/"""  
    
    with open(filename,'r+') as file:
        # First we load existing data into a dict.
        file_data = json.load(file)
        ## Choose extend or append
        if (type(new_data) == list) & (type(file_data) == list):
            file_data.extend(new_data)
        else:
             file_data.append(new_data)
        # Sets file's current position at offset.
        file.seek(0)
        # convert back to json.
        json.dump(file_data, file)

In [19]:
# Load in the dataframe from project part 1 as basics:
basics = pd.read_csv('/Users/kass/Documents/GitHub/Project-3/Data/title_basics.csv.gz')
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000009,movie,Miss Jerry,Miss Jerry,0,True,,45,Romance
1,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,True,,70,"Action,Adventure,Biography"
2,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,True,,90,Drama
3,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,True,,120,"Adventure,Fantasy"
4,tt0000941,movie,Locura de amor,Locura de amor,0,True,,45,Drama
...,...,...,...,...,...,...,...,...,...
292721,tt9916190,movie,Safeguard,Safeguard,0,True,,95,"Action,Adventure,Thriller"
292722,tt9916270,movie,Il talento del calabrone,Il talento del calabrone,0,True,,84,Thriller
292723,tt9916362,movie,Coven,Akelarre,0,True,,92,"Drama,History"
292724,tt9916538,movie,Kuambil Lagi Hatiku,Kuambil Lagi Hatiku,0,True,,123,Drama


In [20]:
# we want years 2000 and 2001
YEARS_TO_GET = [2000,2001]
# a place to hold all the errors
errors = [ ]

In [46]:
for YEAR in tqdm_notebook(YEARS_TO_GET, desc='YEARS', position=0):
#Defining the JSON file to store results for year
    JSON_FILE = f'{FOLDER}tmdb_api_results_{YEAR}.json'

YEARS:   0%|          | 0/2 [00:00<?, ?it/s]

In [47]:
# Check if file exists
file_exists = os.path.isfile(JSON_FILE)
# If it does not exist: create it
if file_exists == False:
# save an empty dict with just "imdb_id" to the new json file.
    with open(JSON_FILE,'w') as f:
        json.dump([{'imdb_id':0}],f)

In [48]:
#Saving new year as the current df
df = basics.loc[ basics['startYear']==YEAR].copy()
# saving movie ids to list
movie_ids = df['tconst'].copy()

In [49]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 0 entries
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          0 non-null      object 
 1   titleType       0 non-null      object 
 2   primaryTitle    0 non-null      object 
 3   originalTitle   0 non-null      object 
 4   isAdult         0 non-null      int64  
 5   startYear       0 non-null      bool   
 6   endYear         0 non-null      float64
 7   runtimeMinutes  0 non-null      int64  
 8   genres          0 non-null      object 
dtypes: bool(1), float64(1), int64(2), object(5)
memory usage: 0.0+ bytes


In [58]:
# Load existing data from json into a dataframe called "previous_df"
previous_df = pd.read_json(JSON_FILE)


# filter out any ids that are already in the JSON_FILE
movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]

    #Get index and movie id from list
    # INNER Loop
for movie_id in tqdm_notebook(movie_ids_to_get,
                                  desc=f'Movies from {YEAR}',
                                  position=1,
                                  leave=True):
        try:
            # Retrieve then data for the movie id
            temp = get_movie_with_rating(movie_id)  
            # Append/extend results to existing file using a pre-made function
            write_json(temp,JSON_FILE)
            # Short 20 ms sleep to prevent overwhelming server
            time.sleep(0.02)
            
        except Exception as e:
            errors.append([movie_id, e])

Movies from 2001: 0it [00:00, ?it/s]

In [None]:
final_year_df = pd.read_json(JSON_FILE)
final_year_df.to_csv(f"{FOLDER}final_tmdb_data_{YEAR}.csv.gz", compression="gzip", index=False)

In [None]:
print(f"- Total errors: {len(errors)}")

### Exploratory analysis

In [23]:
!pip install tmdbsimple
import tmdbsimple as tmdb
tmdb.API_KEY =  login['api-key']



In [22]:
# Load API Credentials
with open('/Users/kass/secret/mov_API.json') as f:   #use your path here!
    login = json.load(f)
## Display the keys of the loaded dict
login.keys()

dict_keys(['api-key'])

In [24]:
# Specifying JSON_FILE filename (can include a folder)
# include the search terms in the filename
JSON_FILE = 'Data/results_in_progress_mov.json'
JSON_FILE

'Data/results_in_progress_mov.json'

In [25]:
## Check if JSON_FILE exists
file_exists = os.path.isfile(JSON_FILE)
## If it does not exist: 
if file_exists == False:
    
    ## CREATE ANY NEEDED FOLDERS
    # Get the Folder Name only
    folder = os.path.dirname(JSON_FILE)
    ## If JSON_FILE included a folder:
    if len(folder)>0:
        # create the folder
        os.makedirs(folder,exist_ok=True)
        
        
    ## INFORM USER AND SAVE EMPTY LIST
    print(f'[i] {JSON_FILE} not found. Saving empty list to file.')
    
    
    # save an empty list
    with open(JSON_FILE,'w') as f:
        json.dump([],f)  
# If it exists, inform user
else:
    print(f"[i] {JSON_FILE} already exists.")

[i] Data/results_in_progress_mov.json already exists.


In [26]:
## Load previous results and use len of results for offset
with open(JSON_FILE,'r') as f:
    previous_results = json.load(f)
    
## set offset based on previous results
n_results = len(previous_results)
print(f'- {n_results} previous results found.')

- 0 previous results found.


In [30]:
def get_movie_with_rating(movie_id):

# Get the movie object for the current id
    movie = tmdb.Movies(movie_id)
# save the .info .releases dictionaries
    info = movie.info()

    releases = movie.releases()
# Loop through countries in releases
for c in releases['countries']:
    # if the country abbreviation==US
    if c['iso_3166_1' ] =='US':
        ## save a "certification" key in the info dict with the certification
       info['certification'] = c['certification']

return info

NameError: name 'releases' is not defined

In [32]:
## testing our function by looping through a list of ids
import pandas as pd
test_ids = ["tt0848228", "tt0115937","tt0848228","tt0332280"]
results = []
for movie_id in test_ids:
    
    try:
        movie_info = get_movie_with_rating(movie_id)
        results.append(movie_info)
        
    except: 
        pass
    
pd.DataFrame(results)

Unnamed: 0,0
0,
1,
2,
