# IMDB Project 3
- Juliana Sahagun
- 08/17/22

In [1]:
# Import libraries
import pandas as pd
import numpy as np


In [2]:
# Download files
basics_url= "https://datasets.imdbws.com/title.basics.tsv.gz"
ratings_url= "https://datasets.imdbws.com/title.basics.tsv.gz"
akas_url= "https://datasets.imdbws.com/title.akas.tsv.gz"

In [12]:
# Loading the data
df_basics = pd.read_csv(basics_url,sep='\t', low_memory=False)
df_akas= pd.read_csv(akas_url, sep='\t', low_memory=False)
df_ratings= pd.read_csv(ratings_url,sep='\t', low_memory=False)

ParserError: Error tokenizing data. C error: out of memory

In [4]:
df_basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [5]:
df_ratings.head()

NameError: name 'df_ratings' is not defined

In [6]:
df_akas.head()

NameError: name 'df_akas' is not defined

## Cleaning/Filtering

Basics Preprocessing

In [7]:
# Replace "\N" with np.nan
df_basics = df_basics.replace({'\\N':np.nan})

In [8]:
# Remove movies having missing values for runtimeMinutes and genre
df_basics= df_basics.dropna(subset=['runtimeMinutes','genres','startYear'])

startYear also has null values that should be removed because it interferes with keep all movies with the start year 2000-2022

In [9]:
# Include only full-length movies (titleType = "movie")
df_basics = df_basics.loc[df_basics['titleType']=='movie']

In [10]:
# Eliminate movies that include "Documentary" in genre
doc= df_basics['genres'].str.contains('documentary', case=False)
df_basics = df_basics[~doc]

In [11]:
# Keep startYear 2000-2022
df_basics= df_basics.loc[(df_basics['startYear'] >=2000) & (df_basics['startYear'] <=2021)]

TypeError: '>=' not supported between instances of 'str' and 'int'

In [None]:
df_basics.info()

Akas Preprocessing

In [None]:
#Keep only movies that were released in the United States
df_akas = df_akas.loc[df_akas['region']=='US']

In [None]:
# Replace "\N" with np.nan
df_akas=df_akas.replace({'\\N': np.nan})

In [None]:
df_akas.info()

Ratings Preprocessing

In [None]:
# Replace "\N" with np.nan
df_ratings = df_ratings.replace({'\\N':np.nan})

In [None]:
#Filtering one dataframe based on another
keepers = df_basics['tconst'].isin(df_akas['titleId'])
df_basics =df_basics[keepers]
df_basics

In [None]:
# Make folders for data
import os
os.makedirs('Data/',exist_ok=True) 

# Confirm folder created
os.listdir("Data/")


In [None]:
# Save current dataframe to file
df_basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)
df_akas.to_csv("Data/title_akas.csv.gz",compression='gzip',index=False)
df_ratings.to_csv("Data/title_ratings.csv.gz",compression='gzip',index=False)

In [None]:
# Open saved file and preview again
df_basics = pd.read_csv("Data/title_basics.csv.gz", low_memory = False)
df_basics.head()



In [None]:
df_ratings=pd.read_csv("Data/title_ratings.csv.gz", low_memory =False)
df_ratings.head()

In [None]:
df_akas=pd.read_csv("Data/title_akas.csv.gz",low_memory=False)
df_akas.head()

PART TWO

In [None]:
# Open and read file
with open('/Users/julia/.secret/tmdb_api.json', 'r') as f:
    login = json.load(f)
login.keys()

In [None]:
tmdb.API_KEY =  login['api-key']

In [None]:
# Code adapted from the LP
def get_movie_and_rating(movie_id):    
    
    movie = tmdb.Movies(movie_id)
    movie_info = movie.info()
    releases = movie.releases()

    for c in releases['countries']:
        if c['iso_3166_1' ] =='US':
            movie_info['certification'] = c['certification']
    return movie_info

In [None]:
test = get_movie_and_rating("tt0848228")
test

In [None]:
FOLDER = "Data/"
os.makedirs(FOLDER, exist_ok=True)
os.listdir(FOLDER)

In [None]:
YEARS_TO_GET =[2000,2001]

In [None]:
def write_json(new_data, filename): 
    """Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/"""    
    
    with open(filename,'r+') as file:
        # First we load existing data into a dict.
        file_data = json.load(file)
        ## Choose extend or append
        if (type(new_data) == list) & (type(file_data) == list):
            file_data.extend(new_data)
        else:
             file_data.append(new_data)
        # Sets file's current position at offset.
        file.seek(0)
        # convert back to json.
        json.dump(file_data, file)

In [None]:
def read_and_fix_json(JSON_FILE):
    """Attempts to read in json file of records and fixes the final character
    to end with a ] if it errors.
    
    Args:
        JSON_FILE (str): filepath of JSON file
        
    Returns:
        DataFrame: the corrected data from the bad json file
    """
    try: 
        previous_df =  pd.read_json(JSON_FILE)
    
    ## If read_json throws an error
    except:
        
        ## manually open the json file
        with open(JSON_FILE,'r+') as f:
            ## Read in the file as a STRING
            bad_json = f.read()
            
            ## if the final character doesn't match first, select the right bracket
            first_char = bad_json[0]
            final_brackets = {'[':']', 
                           "{":"}"}
            ## Select expected final brakcet
            final_char = final_brackets[first_char]
            
            ## if the last character in file doen't match the first char, add it
            if bad_json[-1] != final_char:
                good_json = bad_json[:-1]
                good_json+=final_char
            else:
                raise Exception('ERROR is not due to mismatched final bracket.')
            
            ## Rewind to start of file and write new good_json to disk
            f.seek(0)
            f.write(good_json)
           
        ## Load the json file again now that its fixed
        previous_df =  pd.read_json(JSON_FILE)
        
    return previous_df

In [None]:
    ## Load the json file again now that its fixed
        previous_df =  pd.read_json(JSON_FILE)
        
    return previous_df
for YEAR in tqdm_notebook(YEARS_TO_GET,desc='YEARS',position=0):
    
    JSON_FILE = f'{FOLDER}tmdb_api_results_{YEAR}.json'
    
    file_exists = os.path.isfile(JSON_FILE)
    
    if file_exists == False:
            with open(JSON_FILE,'w') as f:
                json.dump([{'imdb_id':0}],f)
                basics = pd.read_csv("Data/title_basics.csv.gz", low_memory = False)

                df = basics.loc[ basics['startYear']==YEAR].copy()

                movie_ids = df['tconst'].copy()

                previous_df = read_and_fix_json(JSON_FILE)

                movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]

                for movie_id in tqdm_notebook(movie_ids_to_get,
                                      desc=f'Movies from {YEAR}',
                                      position=1,
                                      leave=True):
            # Attempt to retrieve then data for the movie id
                    try:
                        temp = get_movie_and_rating(movie_id)  #This uses your pre-made function
                        # Append/extend results to existing file using a pre-made function
                        write_json(temp,JSON_FILE)
                        # Short 20 ms sleep to prevent overwhelming server
                        time.sleep(0.02)

                    # If it fails,  make a dict with just the id and None for certification.
                    except Exception as e:
                        continue

                    final_year_df = pd.read_json(JSON_FILE)

                    final_year_df.to_csv(f"{FOLDER}final_tmdb_data_{YEAR}.csv.gz",
                                         compression="gzip", index=False)
    final_year_df = pd.read_json(JSON_FILE)
    final_year_df.to_csv(f"{FOLDER}final_tmdb_data_{YEAR}.csv.gz", compression="gzip", index=False)