# Imports & Data Loading

In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# Locations of the three gzip-compressed TSV files used in this notebook,
# listed in the order they are loaded below.
basic_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"
ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"

In [3]:
# All three files are tab-separated. low_memory=False turns off pandas'
# chunked parsing so each column's dtype is inferred from the whole file.
tsv_options = {"sep": "\t", "low_memory": False}
basics = pd.read_csv(basic_url, **tsv_options)
akas = pd.read_csv(akas_url, **tsv_options)
ratings = pd.read_csv(ratings_url, **tsv_options)

# Data Cleaning

In [4]:
# Turn the literal \N placeholder into NaN so pandas recognises those cells
# as missing. The frames are modified in place.
missing_marker = {'\\N': np.nan}
for table in (basics, akas, ratings):
    table.replace(missing_marker, inplace=True)

In [5]:
# Print the column names, dtypes and memory usage of the basics table.
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9151875 entries, 0 to 9151874
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 628.4+ MB


In [6]:
# Discard every title that has no runtime or no genre recorded.
required_columns = ["runtimeMinutes", "genres"]
basics = basics.dropna(subset=required_columns)
# Count the remaining rows per title type.
basics["titleType"].value_counts()

tvEpisode       1061865
short            573313
movie            366361
video            174813
tvMovie           88223
tvSeries          86401
tvSpecial         16441
tvMiniSeries      16016
tvShort            9372
videoGame           294
Name: titleType, dtype: int64

In [7]:
# Keep only the rows whose title type is exactly "movie".
basics = basics.loc[basics["titleType"] == "movie"]


In [8]:
# Confirm that "movie" is the only title type left after the filter.
basics["titleType"].value_counts()

movie    366361
Name: titleType, dtype: int64

In [9]:
# Keep only titles released from 2000 through 2022 (both ends included).
# startYear is held as text here, so it is converted to numbers for the
# comparison rather than compared as strings (which orders values
# alphabetically). Missing or non-numeric years become NaN and are dropped.
# The column itself is left unchanged. Comparing numerically also keeps this
# cell re-runnable after basics is reloaded from CSV with an integer startYear.
start_year = pd.to_numeric(basics["startYear"], errors="coerce")
basics = basics[start_year.between(2000, 2022)]

# Count the remaining titles per release year.
basics["startYear"].value_counts()

2017    14181
2018    14122
2016    13811
2019    13790
2015    13320
2014    12980
2013    12264
2021    11693
2012    11534
2020    11243
2011    10674
2010    10115
2009     9261
2008     8067
2022     7429
2007     6878
2006     6427
2005     5760
2004     5127
2003     4526
2002     4084
2001     3810
2000     3591
Name: startYear, dtype: int64

In [10]:
# Flag every title whose genre text mentions "documentary" (any letter case),
# then keep only the titles that were not flagged.
is_documentary = basics["genres"].str.contains("documentary", case=False)
basics = basics.loc[~is_documentary]

In [11]:
# Summarise the akas table before it is narrowed down.
akas.info()
# Keep only the rows whose region is "US".
us_rows = akas["region"] == "US"
akas = akas.loc[us_rows]
# Count rows per region to confirm only "US" remains.
akas["region"].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32854693 entries, 0 to 32854692
Data columns (total 8 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   titleId          object
 1   ordering         int64 
 2   title            object
 3   region           object
 4   language         object
 5   types            object
 6   attributes       object
 7   isOriginalTitle  object
dtypes: int64(1), object(7)
memory usage: 2.0+ GB


US    1342445
Name: region, dtype: int64

In [12]:
# Boolean mask over basics: True where the row's tconst also appears as a
# titleId in the akas table.
akas_title_ids = akas['titleId']
keepers = basics['tconst'].isin(akas_title_ids)
keepers

34790       True
61089       True
67634       True
77928      False
86765       True
           ...  
9151547     True
9151556     True
9151595    False
9151640     True
9151724    False
Name: tconst, Length: 142096, dtype: bool

In [13]:
# Keep only the basics rows selected by the keepers mask, then display them.
basics = basics.loc[keepers]
basics



Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34790,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61089,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
67634,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
86765,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
92730,tt0094859,movie,Chief Zabu,Chief Zabu,0,2016,,74,Comedy
...,...,...,...,...,...,...,...,...,...
9151011,tt9914942,movie,Life Without Sara Amat,La vida sense la Sara Amat,0,2019,,74,Drama
9151407,tt9915872,movie,The Last White Witch,My Girlfriend is a Wizard,0,2019,,97,"Comedy,Drama,Fantasy"
9151547,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,,51,Drama
9151556,tt9916190,movie,Safeguard,Safeguard,0,2020,,95,"Action,Adventure,Thriller"


In [14]:
# Create the output folder; exist_ok=True means no error if it already exists.
# (os is imported with the other libraries at the top of the notebook.)
os.makedirs('Data/',exist_ok=True)
# List the folder's current contents to confirm it is there.
os.listdir("Data/")


['title_basics.csv.gz', 'title_akas.csv.gz', 'title_ratings.csv.gz']

In [15]:
# Write the filtered basics table as a gzip-compressed CSV, without the index.
basics.to_csv(
    "Data/title_basics.csv.gz",
    compression='gzip',
    index=False,
)



In [16]:
# Read the saved basics file back in and show its first rows as a check.
saved_basics_path = "Data/title_basics.csv.gz"
basics = pd.read_csv(saved_basics_path, low_memory=False)
basics.head()



Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
4,tt0094859,movie,Chief Zabu,Chief Zabu,0,2016,,74,Comedy


In [17]:
# Save the akas and ratings tables as gzip-compressed CSVs, without the index.
for table, destination in (
    (akas, "Data/title_akas.csv.gz"),
    (ratings, "Data/title_ratings.csv.gz"),
):
    table.to_csv(destination, compression='gzip', index=False)


In [18]:
# Show the dtypes and non-null counts of basics as read back from the saved CSV.
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82108 entries, 0 to 82107
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          82108 non-null  object 
 1   titleType       82108 non-null  object 
 2   primaryTitle    82108 non-null  object 
 3   originalTitle   82108 non-null  object 
 4   isAdult         82108 non-null  int64  
 5   startYear       82108 non-null  int64  
 6   endYear         0 non-null      float64
 7   runtimeMinutes  82108 non-null  int64  
 8   genres          82108 non-null  object 
dtypes: float64(1), int64(3), object(5)
memory usage: 5.6+ MB


In [19]:
# Show the dtypes and non-null counts of the US-only akas table.
akas.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1342445 entries, 5 to 32854437
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1342445 non-null  object
 1   ordering         1342445 non-null  int64 
 2   title            1342445 non-null  object
 3   region           1342445 non-null  object
 4   language         3680 non-null     object
 5   types            963286 non-null   object
 6   attributes       44724 non-null    object
 7   isOriginalTitle  1341070 non-null  object
dtypes: int64(1), object(7)
memory usage: 92.2+ MB


In [20]:
# Show the dtypes and non-null counts of the ratings table.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1263317 entries, 0 to 1263316
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1263317 non-null  object 
 1   averageRating  1263317 non-null  float64
 2   numVotes       1263317 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 28.9+ MB
