# Creating a Movies Database

## Initialize and Load

In [1]:
# Load libraries
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
pd.set_option('display.max_columns',50)

In [2]:
# Making new folder, "Data", with os
import os
os.makedirs('Data/',exist_ok=True) 

In [3]:
# Confirm folder was created and files added successfully
os.listdir("Data/")

['title.basics.tsv.gz',
 'title.ratings.tsv.gz',
 'title-akas-us-only.csv',
 '.ipynb_checkpoints']

In [4]:
# Load the akas file
akas = pd.read_csv('Data/title-akas-us-only.csv', low_memory=False)
akas

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,\N,imdbDisplay,\N,0
1,tt0000002,7,The Clown and His Dogs,US,\N,\N,literal English title,0
2,tt0000005,10,Blacksmith Scene,US,\N,imdbDisplay,\N,0
3,tt0000005,1,Blacksmithing Scene,US,\N,alternative,\N,0
4,tt0000005,6,Blacksmith Scene #1,US,\N,alternative,\N,0
...,...,...,...,...,...,...,...,...
1452559,tt9916560,1,March of Dimes Presents: Once Upon a Dime,US,\N,imdbDisplay,\N,0
1452560,tt9916620,1,The Copeland Case,US,\N,imdbDisplay,\N,0
1452561,tt9916702,1,Loving London: The Playground,US,\N,\N,\N,0
1452562,tt9916756,1,Pretty Pretty Black Girl,US,\N,imdbDisplay,\N,0


In [5]:
# Load the title basics file
basics = pd.read_csv('Data/title.basics.tsv.gz', sep='\t', low_memory=False)
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...,...
10017006,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0,2009,\N,\N,"Action,Drama,Family"
10017007,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0,2010,\N,\N,"Action,Drama,Family"
10017008,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0,2010,\N,\N,"Action,Drama,Family"
10017009,tt9916856,short,The Wind,The Wind,0,2015,\N,27,Short


In [6]:
# Load the title ratings file
ratings = pd.read_csv('Data/title.ratings.tsv.gz', sep='\t', low_memory=False)
ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1988
1,tt0000002,5.8,265
2,tt0000003,6.5,1849
3,tt0000004,5.5,178
4,tt0000005,6.2,2632
...,...,...,...
1331487,tt9916730,8.3,10
1331488,tt9916766,7.0,21
1331489,tt9916778,7.2,36
1331490,tt9916840,7.5,7


# Pre-processing

### Pre-processing the title basics table

In [7]:
# Filter the basics table to only include the US movies using the filter akas dataframe
filter_us_titles = basics['tconst'].isin(akas['titleId'])
basics = basics[filter_us_titles]

In [8]:
# Convert placeholder "\N" values in the basics table back to true null values to identify missing genres and runtimes
basics = basics.replace("\\N", np.nan)

In [9]:
# Drop rows with null values in the runtimeMinutes or genres columns
basics = basics.dropna(subset = ['runtimeMinutes', 'genres'])

In [10]:
# Filter to keep only full-length movies
filter_flength = basics['titleType'] == 'movie'
basics = basics[filter_flength]

In [11]:
# Convert startYear to a float dtype
basics['startYear'] = basics['startYear'].astype(float)

In [12]:
# Filter to keep movies with startYear that are >=2000 and <=2022
filter_startYear = (basics['startYear'] >= 2000) & (basics['startYear'] <= 2022)
basics = basics[filter_startYear]

In [13]:
# Eliminate movies that include "Documentary" in genre
filter_documentaries = basics['genres'].str.contains('Documentary')
# Exclude movies in the documentary category.
basics = basics[~filter_documentaries]

In [14]:
# Display a final preview of the filtered title basics
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34802,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
61114,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
67666,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
86793,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
93930,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama


In [15]:
# Display a final preview of the filtered title basics
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 86979 entries, 34802 to 10016777
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          86979 non-null  object 
 1   titleType       86979 non-null  object 
 2   primaryTitle    86979 non-null  object 
 3   originalTitle   86979 non-null  object 
 4   isAdult         86979 non-null  object 
 5   startYear       86979 non-null  float64
 6   endYear         0 non-null      object 
 7   runtimeMinutes  86979 non-null  object 
 8   genres          86979 non-null  object 
dtypes: float64(1), object(8)
memory usage: 6.6+ MB


In [16]:
# Save the preprocessed title basics as a csv file
basics.to_csv('Data/preprocessed-title-basics.csv', index = False)

### Pre-processing the title ratings table

In [17]:
# Load and filter the title ratings file to keep only movies that are included in your final title basics dataframe
filter_basics = ratings['tconst'].isin(basics['tconst'])

In [18]:
# Replace "\N" with np.nan
ratings = ratings.replace(r'\n', np.nan)

In [19]:
# Display a final preview of the filtered title ratings
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1988
1,tt0000002,5.8,265
2,tt0000003,6.5,1849
3,tt0000004,5.5,178
4,tt0000005,6.2,2632


In [20]:
# Display a final preview of the filtered title ratings
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1331492 entries, 0 to 1331491
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1331492 non-null  object 
 1   averageRating  1331492 non-null  float64
 2   numVotes       1331492 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 30.5+ MB


In [21]:
# Save the preprocessed title ratings as a csv file
ratings.to_csv('Data/preprocessed-title-ratings.csv', index = False)