# Loading the data from IMDB 

In [1]:
import pandas as pd
import numpy as np

## The Data 

In [2]:
basics_url ="https://datasets.imdbws.com/title.basics.tsv.gz"
ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"
akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"

## Loading TSV's with Pandas

In [3]:
basics = pd.read_csv(basics_url, sep='\t', low_memory=False)
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...,...
10265839,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0,2009,\N,\N,"Action,Drama,Family"
10265840,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0,2010,\N,\N,"Action,Drama,Family"
10265841,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0,2010,\N,\N,"Action,Drama,Family"
10265842,tt9916856,short,The Wind,The Wind,0,2015,\N,27,Short


In [4]:
ratings = pd.read_csv(ratings_url, sep='\t', low_memory=False)
ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2003
1,tt0000002,5.8,269
2,tt0000003,6.5,1898
3,tt0000004,5.5,178
4,tt0000005,6.2,2683
...,...,...,...
1363560,tt9916730,7.6,11
1363561,tt9916766,7.0,22
1363562,tt9916778,7.2,36
1363563,tt9916840,8.8,6


In [5]:
akas = pd.read_csv(akas_url,sep='\t', low_memory=False)
akas

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0
...,...,...,...,...,...,...,...,...
37595399,tt9916852,5,Episódio #3.20,PT,pt,\N,\N,0
37595400,tt9916852,6,Episodio #3.20,IT,it,\N,\N,0
37595401,tt9916852,7,एपिसोड #3.20,IN,hi,\N,\N,0
37595402,tt9916856,1,The Wind,DE,\N,imdbDisplay,\N,0


# Specifications
# Required Preprocessing - Details

# Filtering/Cleaning Steps:

### Title Basics:

**Replace "\N" with np.nan**


In [6]:
basics.replace({'\\N':np.nan}, inplace=True)

In [7]:
for col in basics:
    print('Column {} has {} missing values'.format(col,basics[col].isnull().sum()))

Column tconst has 0 missing values
Column titleType has 0 missing values
Column primaryTitle has 17 missing values
Column originalTitle has 17 missing values
Column isAdult has 1 missing values
Column startYear has 1373989 missing values
Column endYear has 10151818 missing values
Column runtimeMinutes has 7171328 missing values
Column genres has 459016 missing values


* **Eliminate movies that are null for runtimeMinutes**

In [8]:
basics = basics[basics['runtimeMinutes'].notna()]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...,...
10265794,tt9916754,movie,Chico Albuquerque - Revelações,Chico Albuquerque - Revelações,0,2013,,49,Documentary
10265800,tt9916766,tvEpisode,Episode #10.15,Episode #10.15,0,2019,,43,"Family,Game-Show,Reality-TV"
10265835,tt9916840,tvEpisode,Horrid Henry's Comic Caper,Horrid Henry's Comic Caper,0,2014,,11,"Adventure,Animation,Comedy"
10265842,tt9916856,short,The Wind,The Wind,0,2015,,27,Short


In [9]:
basics['runtimeMinutes'].isna().sum()

0

* **Eliminate movies that are null for genre**

In [10]:
basics['genres'].isnull().sum()

80832

In [11]:
basics = basics[basics['genres'].notna()]

In [12]:
basics['genres'].isnull().sum()

0

* **keep only titleType==Movie**

In [14]:
basics = basics[basics['titleType'] == 'movie']
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,,45,Romance
144,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,,100,"Documentary,News,Sport"
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,,90,Drama
672,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,1908,,120,"Adventure,Fantasy"
...,...,...,...,...,...,...,...,...,...
10265694,tt9916538,movie,Kuambil Lagi Hatiku,Kuambil Lagi Hatiku,0,2019,,123,Drama
10265735,tt9916622,movie,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,0,2015,,57,Documentary
10265762,tt9916680,movie,De la ilusión al desconcierto: cine colombiano...,De la ilusión al desconcierto: cine colombiano...,0,2007,,100,Documentary
10265784,tt9916730,movie,6 Gunn,6 Gunn,0,2017,,116,Drama


* **keep startYear 2000-2022**

In [15]:
basics = basics[(basics['startYear'] > '2000') & (basics['startYear'] < '2022')]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
13081,tt0013274,movie,Istoriya grazhdanskoy voyny,Istoriya grazhdanskoy voyny,0,2021,,94,Documentary
34800,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61111,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
67485,tt0068865,movie,Lives of Performers,Lives of Performers,0,2016,,90,Drama
67663,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
...,...,...,...,...,...,...,...,...,...
10265694,tt9916538,movie,Kuambil Lagi Hatiku,Kuambil Lagi Hatiku,0,2019,,123,Drama
10265735,tt9916622,movie,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,0,2015,,57,Documentary
10265762,tt9916680,movie,De la ilusión al desconcierto: cine colombiano...,De la ilusión al desconcierto: cine colombiano...,0,2007,,100,Documentary
10265784,tt9916730,movie,6 Gunn,6 Gunn,0,2017,,116,Drama


* **Eliminate movies that include "Documentary" in genre**

In [16]:
is_documentary = basics['genres'].str.contains('documentary',case=False)
basics = basics[~is_documentary]

In [17]:
(basics['genres'] == 'documentary').sum()

0

In [19]:
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34800,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61111,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
67485,tt0068865,movie,Lives of Performers,Lives of Performers,0,2016,,90,Drama
67663,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
80548,tt0082328,movie,Embodiment of Evil,Encarnação do Demônio,0,2008,,94,Horror
...,...,...,...,...,...,...,...,...,...
10265526,tt9916190,movie,Safeguard,Safeguard,0,2020,,95,"Action,Adventure,Thriller"
10265565,tt9916270,movie,Il talento del calabrone,Il talento del calabrone,0,2020,,84,Thriller
10265610,tt9916362,movie,Coven,Akelarre,0,2020,,92,"Drama,History"
10265694,tt9916538,movie,Kuambil Lagi Hatiku,Kuambil Lagi Hatiku,0,2019,,123,Drama


* **Keep only US movies**

In [20]:
akas = akas[akas['region'] == 'US']
akas

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,\N,imdbDisplay,\N,0
14,tt0000002,7,The Clown and His Dogs,US,\N,\N,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,\N,imdbDisplay,\N,0
36,tt0000005,1,Blacksmithing Scene,US,\N,alternative,\N,0
41,tt0000005,6,Blacksmith Scene #1,US,\N,alternative,\N,0
...,...,...,...,...,...,...,...,...
37594930,tt9916560,1,March of Dimes Presents: Once Upon a Dime,US,\N,imdbDisplay,\N,0
37595000,tt9916620,1,The Copeland Case,US,\N,imdbDisplay,\N,0
37595089,tt9916702,1,Loving London: The Playground,US,\N,\N,\N,0
37595132,tt9916756,1,Pretty Pretty Black Girl,US,\N,imdbDisplay,\N,0


* **Replace "\N" with np.nan**

In [21]:
akas.replace({'\\N':np.nan}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  akas.replace({'\\N':np.nan}, inplace=True)


## Filtering one dataframe based on another

Next, we will filter the basics df to only include the movies that are present in the filter akas dataframe. This is how we will ultimately be able to filter the movies by the region being in the US.

In [24]:
# Filter the basics table down to only include the US by using the filter akas dataframe
keepers = basics['tconst'].isin(akas['titleId'])
keepers


34800        True
61111        True
67485        True
67663        True
80548        True
            ...  
10265526     True
10265565    False
10265610     True
10265694    False
10265784    False
Name: tconst, Length: 136521, dtype: bool

In [25]:
basics = basics[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34800,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61111,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
67485,tt0068865,movie,Lives of Performers,Lives of Performers,0,2016,,90,Drama
67663,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
80548,tt0082328,movie,Embodiment of Evil,Encarnação do Demônio,0,2008,,94,Horror
...,...,...,...,...,...,...,...,...,...
10264983,tt9914942,movie,Life Without Sara Amat,La vida sense la Sara Amat,0,2019,,74,Drama
10265377,tt9915872,movie,The Last White Witch,Boku no kanojo wa mahoutsukai,0,2019,,97,"Comedy,Drama,Fantasy"
10265517,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,,51,Drama
10265526,tt9916190,movie,Safeguard,Safeguard,0,2020,,95,"Action,Adventure,Thriller"


### Ratings:
* **Replace "\N" with np.nan (if any)** 

In [26]:
ratings.replace({'\\N':np.nan}, inplace=True)

In [27]:
for col in ratings:
    print('Column {} has {} missing values'.format(col,ratings[col].isnull().sum()))

Column tconst has 0 missing values
Column averageRating has 0 missing values
Column numVotes has 0 missing values


**Keep only US movies (Use AKAs table, see "Filtering one dataframe based on another" section below)**

In [28]:
keepers = ratings['tconst'].isin(akas['titleId'])
keepers

0           True
1           True
2          False
3          False
4           True
           ...  
1363560    False
1363561    False
1363562    False
1363563    False
1363564    False
Name: tconst, Length: 1363565, dtype: bool

In [29]:
ratings = ratings[keepers]
ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2003
1,tt0000002,5.8,269
4,tt0000005,6.2,2683
5,tt0000006,5.0,183
6,tt0000007,5.4,839
...,...,...,...
1363527,tt9916200,8.1,238
1363528,tt9916204,8.2,275
1363535,tt9916348,8.3,18
1363536,tt9916362,6.4,5583


## Creating a "Data" folder.

In [30]:
# example making new folder with os
import os
os.makedirs('Data/',exist_ok=True) 
# Confirm folder created
os.listdir("Data/")

[]

## Saving Compressed .csv.gz Files

In [31]:
## Save current dataframe to file.
basics.to_csv("Data/basics.csv.gz",compression='gzip',index=False)

In [32]:
# Open saved file and preview again
basics_ = pd.read_csv("Data/basics.csv.gz", low_memory = False)
basics_.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
2,tt0068865,movie,Lives of Performers,Lives of Performers,0,2016,,90,Drama
3,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
4,tt0082328,movie,Embodiment of Evil,Encarnação do Demônio,0,2008,,94,Horror


In [33]:
ratings.to_csv("Data/ratings.csv.gz",compression='gzip',index=False)

In [34]:
ratings_ = pd.read_csv("Data/ratings.csv.gz", low_memory = False)
ratings_.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2003
1,tt0000002,5.8,269
2,tt0000005,6.2,2683
3,tt0000006,5.0,183
4,tt0000007,5.4,839


In [35]:
akas.to_csv("Data/akas.csv.gz",compression='gzip',index=False)

In [36]:
akas_ = pd.read_csv("Data/akas.csv.gz", low_memory = False)
akas_.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,,imdbDisplay,,0.0
1,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0.0
2,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0.0
3,tt0000005,1,Blacksmithing Scene,US,,alternative,,0.0
4,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0.0


## Deliverable

In [37]:
basics_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80699 entries, 0 to 80698
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          80699 non-null  object 
 1   titleType       80699 non-null  object 
 2   primaryTitle    80698 non-null  object 
 3   originalTitle   80698 non-null  object 
 4   isAdult         80699 non-null  int64  
 5   startYear       80699 non-null  int64  
 6   endYear         0 non-null      float64
 7   runtimeMinutes  80699 non-null  int64  
 8   genres          80699 non-null  object 
dtypes: float64(1), int64(3), object(5)
memory usage: 5.5+ MB


In [38]:
ratings_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 512735 entries, 0 to 512734
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         512735 non-null  object 
 1   averageRating  512735 non-null  float64
 2   numVotes       512735 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 11.7+ MB


In [39]:
akas_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1473807 entries, 0 to 1473806
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   titleId          1473807 non-null  object 
 1   ordering         1473807 non-null  int64  
 2   title            1473805 non-null  object 
 3   region           1473807 non-null  object 
 4   language         4182 non-null     object 
 5   types            984902 non-null   object 
 6   attributes       47663 non-null    object 
 7   isOriginalTitle  1472466 non-null  float64
dtypes: float64(1), int64(1), object(6)
memory usage: 90.0+ MB
