# IMDB Project 3
- Juliana Sahagun
- 08/17/22

In [1]:
# Import libraries
import pandas as pd
import numpy as np


In [2]:
# Download files
basics_url= "https://datasets.imdbws.com/title.basics.tsv.gz"
ratings_url= "https://datasets.imdbws.com/title.basics.tsv.gz"
akas_url= "https://datasets.imdbws.com/title.akas.tsv.gz"

In [3]:
# Loading the data
df_basics = pd.read_csv(basics_url,sep='\t', low_memory=False)
df_akas= pd.read_csv(akas_url, sep='\t', low_memory=False)
df_ratings= pd.read_csv(ratings_url,sep='\t', low_memory=False)

In [4]:
df_basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [5]:
df_ratings.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [6]:
df_akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


## Cleaning/Filtering

Basics Preprocessing

In [34]:
# Replace "\N" with np.nan
df_basics = df_basics.replace({'\\N':np.nan})

In [35]:
# Remove movies having missing values for runtimeMinutes and genre
df_basics= df_basics.dropna(subset=['runtimeMinutes','genres','startYear'])

startYear also has null values that should be removed because it interferes with keep all movies with the start year 2000-2022

In [36]:
# Include only full-length movies (titleType = "movie")
df_basics = df_basics.loc[df_basics['titleType']=='movie']

In [37]:
# Eliminate movies that include "Documentary" in genre
doc= df_basics['genres'].str.contains('documentary', case=False)
df_basics = df_basics[~doc]

In [38]:
# Keep startYear 2000-2022
df_basics= df_basics.loc[(df_basics['startYear'] >=2000) & (df_basics['startYear'] <=2022)]

In [39]:
df_basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 82110 entries, 0 to 82109
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          82110 non-null  object 
 1   titleType       82110 non-null  object 
 2   primaryTitle    82110 non-null  object 
 3   originalTitle   82110 non-null  object 
 4   isAdult         82110 non-null  int64  
 5   startYear       82110 non-null  int64  
 6   endYear         0 non-null      float64
 7   runtimeMinutes  82110 non-null  int64  
 8   genres          82110 non-null  object 
dtypes: float64(1), int64(3), object(5)
memory usage: 6.3+ MB


Akas Preprocessing

In [40]:
#Keep only movies that were released in the United States
df_akas = df_akas.loc[df_akas['region']=='US']

In [41]:
# Replace "\N" with np.nan
df_akas=df_akas.replace({'\\N': np.nan})

In [42]:
df_akas.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1342706 entries, 0 to 1342705
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   titleId          1342706 non-null  object 
 1   ordering         1342706 non-null  int64  
 2   title            1342706 non-null  object 
 3   region           1342706 non-null  object 
 4   language         3681 non-null     object 
 5   types            963269 non-null   object 
 6   attributes       44738 non-null    object 
 7   isOriginalTitle  1341331 non-null  float64
dtypes: float64(1), int64(1), object(6)
memory usage: 92.2+ MB


Ratings Preprocessing

In [43]:
# Replace "\N" with np.nan
df_ratings = df_ratings.replace({'\\N':np.nan})

In [45]:
#Filtering one dataframe based on another
keepers = df_basics['tconst'].isin(df_akas['titleId'])
df_basics =df_basics[keepers]
df_basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
4,tt0094859,movie,Chief Zabu,Chief Zabu,0,2016,,74,Comedy
...,...,...,...,...,...,...,...,...,...
82105,tt9914942,movie,Life Without Sara Amat,La vida sense la Sara Amat,0,2019,,74,Drama
82106,tt9915872,movie,The Last White Witch,My Girlfriend is a Wizard,0,2019,,97,"Comedy,Drama,Fantasy"
82107,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,,51,Drama
82108,tt9916190,movie,Safeguard,Safeguard,0,2020,,95,"Action,Adventure,Thriller"


In [46]:
# Make folders for data
import os
os.makedirs('Data/',exist_ok=True) 

# Confirm folder created
os.listdir("Data/")


['title_akas.csv.gz', 'title_basics.csv.gz', 'title_ratings.csv.gz']

In [27]:
# Save current dataframe to file
df_basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)
df_akas.to_csv("Data/title_akas.csv.gz",compression='gzip',index=False)
df_ratings.to_csv("Data/title_ratings.csv.gz",compression='gzip',index=False)

In [29]:
# Open saved file and preview again
df_basics = pd.read_csv("Data/title_basics.csv.gz", low_memory = False)
df_basics.head()



Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
4,tt0094859,movie,Chief Zabu,Chief Zabu,0,2016,,74,Comedy


In [30]:
df_ratings=pd.read_csv("Data/title_ratings.csv.gz", low_memory =False)
df_ratings.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0.0,1894.0,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0.0,1892.0,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0.0,1892.0,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0.0,1892.0,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0.0,1893.0,,1,"Comedy,Short"


In [31]:
df_akas=pd.read_csv("Data/title_akas.csv.gz",low_memory=False)
df_akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,,imdbDisplay,,0.0
1,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0.0
2,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0.0
3,tt0000005,1,Blacksmithing Scene,US,,alternative,,0.0
4,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0.0
