# **Data Processing**

_John Andrew Dixon_

---

##### **Imports**

In [78]:
# Imports 
import pandas as pd
import numpy as np

##### **Data Load**

In [79]:
# Remote URL to the IMDB data
basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
# Loading the remote data
basics_df = pd.read_csv(basics_url, sep="\t", low_memory=False)
# Verify
basics_df

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...,...
9811396,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0,2010,\N,\N,"Action,Drama,Family"
9811397,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0,2010,\N,\N,"Action,Drama,Family"
9811398,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0,2010,\N,\N,"Action,Drama,Family"
9811399,tt9916856,short,The Wind,The Wind,0,2015,\N,27,Short


In [80]:
# Remote URL to the IMDB data
akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"
# Loading the remote data
akas_df = pd.read_csv(akas_url, sep="\t", low_memory=False)
# Verify
akas_df

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0
...,...,...,...,...,...,...,...,...
35744236,tt9916852,5,Episódio #3.20,PT,pt,\N,\N,0
35744237,tt9916852,6,Episodio #3.20,IT,it,\N,\N,0
35744238,tt9916852,7,एपिसोड #3.20,IN,hi,\N,\N,0
35744239,tt9916856,1,The Wind,DE,\N,imdbDisplay,\N,0


In [81]:
# Remote URL to the IMDB data
ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"
# Loading the remote data
ratings_df = pd.read_csv(ratings_url, sep="\t", low_memory=False)
# Verify
ratings_df

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1966
1,tt0000002,5.8,263
2,tt0000003,6.5,1808
3,tt0000004,5.6,178
4,tt0000005,6.2,2607
...,...,...,...
1306411,tt9916730,8.3,10
1306412,tt9916766,7.0,21
1306413,tt9916778,7.2,36
1306414,tt9916840,8.8,6


---

## **Processing**

##### _AKAs_

In [82]:
# Replace "\N" with np.nan
akas_df.replace({"\\N": np.nan}, inplace=True)

##### _Ratings_

In [84]:
# Replace "\N" with np.nan
ratings_df.replace({"\\N": np.nan}, inplace=True)

##### _Basics_

In [86]:
# Replace "\N" with np.nan
basics_df.replace({"\\N": np.nan}, inplace=True)

In [87]:
# Eliminate movies that are null for runtimeMinutes
basics_df.dropna(axis=0, subset=["runtimeMinutes"], inplace=True)
basics_df

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...,...
9811351,tt9916754,movie,Chico Albuquerque - Revelações,Chico Albuquerque - Revelações,0,2013,,49,Documentary
9811357,tt9916766,tvEpisode,Episode #10.15,Episode #10.15,0,2019,,43,"Family,Game-Show,Reality-TV"
9811392,tt9916840,tvEpisode,Horrid Henry's Comic Caper,Horrid Henry's Comic Caper,0,2014,,11,"Adventure,Animation,Comedy"
9811399,tt9916856,short,The Wind,The Wind,0,2015,,27,Short


In [88]:
# Eliminate movies that are null for genre
basics_df.dropna(axis=0, subset=["genres"], inplace=True)
basics_df

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...,...
9811351,tt9916754,movie,Chico Albuquerque - Revelações,Chico Albuquerque - Revelações,0,2013,,49,Documentary
9811357,tt9916766,tvEpisode,Episode #10.15,Episode #10.15,0,2019,,43,"Family,Game-Show,Reality-TV"
9811392,tt9916840,tvEpisode,Horrid Henry's Comic Caper,Horrid Henry's Comic Caper,0,2014,,11,"Adventure,Animation,Comedy"
9811399,tt9916856,short,The Wind,The Wind,0,2015,,27,Short


In [89]:
# Keep only titleType==Movie

# Create a filter than gets titleType = Movie for all rows
is_movie = basics_df["titleType"] == "movie"

# Apply the filter
basics_df = basics_df.loc[is_movie, :]
basics_df

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,,45,Romance
144,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,,100,"Documentary,News,Sport"
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,,90,Drama
672,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,1908,,120,"Adventure,Fantasy"
...,...,...,...,...,...,...,...,...,...
9811167,tt9916362,movie,Coven,Akelarre,0,2020,,92,"Drama,History"
9811251,tt9916538,movie,Kuambil Lagi Hatiku,Kuambil Lagi Hatiku,0,2019,,123,Drama
9811292,tt9916622,movie,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,0,2015,,57,Documentary
9811319,tt9916680,movie,De la ilusión al desconcierto: cine colombiano...,De la ilusión al desconcierto: cine colombiano...,0,2007,,100,Documentary


In [90]:
basics_df["startYear"] = basics_df["startYear"].astype(float)
basics_df["endYear"] = basics_df["endYear"].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  basics_df["startYear"] = basics_df["startYear"].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  basics_df["endYear"] = basics_df["endYear"].astype(float)


In [91]:
# Create a filter that selects all movies with a startYear between 2000 and 2022
between_2000_2022 = (basics_df["startYear"] >= 2000) & (basics_df["startYear"] <= 2022)

# Apply the filter
basics_df = basics_df.loc[between_2000_2022, :]
basics_df

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
13082,tt0013274,movie,Istoriya grazhdanskoy voyny,Istoriya grazhdanskoy voyny,0,2021.0,,94,Documentary
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
61116,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
67669,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
76059,tt0077684,movie,Histórias de Combóios em Portugal,Histórias de Combóios em Portugal,0,2022.0,,46,Documentary
...,...,...,...,...,...,...,...,...,...
9811167,tt9916362,movie,Coven,Akelarre,0,2020.0,,92,"Drama,History"
9811251,tt9916538,movie,Kuambil Lagi Hatiku,Kuambil Lagi Hatiku,0,2019.0,,123,Drama
9811292,tt9916622,movie,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,0,2015.0,,57,Documentary
9811319,tt9916680,movie,De la ilusión al desconcierto: cine colombiano...,De la ilusión al desconcierto: cine colombiano...,0,2007.0,,100,Documentary


In [92]:
# Eliminate movies that include "Documentary" in genre

# Create a filter that gets all non-documentary movies
is_not_documentary = ~basics_df["genres"].str.contains("documentary", case=False)

# Apply the filter
basics_df = basics_df.loc[is_not_documentary, :]
basics_df

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
61116,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
67669,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
77964,tt0079644,movie,November 1828,November 1828,0,2001.0,,140,"Drama,War"
86801,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
...,...,...,...,...,...,...,...,...,...
9811074,tt9916170,movie,The Rehearsal,O Ensaio,0,2019.0,,51,Drama
9811083,tt9916190,movie,Safeguard,Safeguard,0,2020.0,,95,"Action,Adventure,Thriller"
9811122,tt9916270,movie,Il talento del calabrone,Il talento del calabrone,0,2020.0,,84,Thriller
9811167,tt9916362,movie,Coven,Akelarre,0,2020.0,,92,"Drama,History"


##### _Keep Only US Movies_

In [94]:
# Keep only US movies.
akas_df = akas_df.loc[akas_df["region"] == "US", :]
akas_df

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0
...,...,...,...,...,...,...,...,...
35743767,tt9916560,1,March of Dimes Presents: Once Upon a Dime,US,,imdbDisplay,,0
35743837,tt9916620,1,The Copeland Case,US,,imdbDisplay,,0
35743926,tt9916702,1,Loving London: The Playground,US,,,,0
35743969,tt9916756,1,Pretty Pretty Black Girl,US,,imdbDisplay,,0


In [95]:
# Basics: Keep only US movies

# Get the ratings that have tconst within the basics titleId
only_US = basics_df["tconst"].isin(akas_df["titleId"])

# Apply the filter
basics_df = basics_df.loc[only_US, :]
basics_df

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
61116,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
67669,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
86801,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
93938,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama
...,...,...,...,...,...,...,...,...,...
9810539,tt9914942,movie,Life Without Sara Amat,La vida sense la Sara Amat,0,2019.0,,74,Drama
9810934,tt9915872,movie,The Last White Witch,My Girlfriend is a Wizard,0,2019.0,,97,"Comedy,Drama,Fantasy"
9811074,tt9916170,movie,The Rehearsal,O Ensaio,0,2019.0,,51,Drama
9811083,tt9916190,movie,Safeguard,Safeguard,0,2020.0,,95,"Action,Adventure,Thriller"


In [96]:
# Keep only US movies

# Get the ratings that have tconst within the akas titleId
only_US = ratings_df["tconst"].isin(akas_df["titleId"])

# Apply the filter
ratings_df = ratings_df.loc[only_US, :]
ratings_df

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1966
1,tt0000002,5.8,263
4,tt0000005,6.2,2607
5,tt0000006,5.2,181
6,tt0000007,5.4,816
...,...,...,...
1306377,tt9916200,8.1,229
1306378,tt9916204,8.1,262
1306385,tt9916348,8.1,18
1306386,tt9916362,6.4,5307


## **Pre-save check**

_Before saving, run a final .info() for each of the dataframes to show a summary of how many movies remain and the datatypes of each feature._

In [97]:
basics_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 86563 entries, 34803 to 9811167
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          86563 non-null  object 
 1   titleType       86563 non-null  object 
 2   primaryTitle    86563 non-null  object 
 3   originalTitle   86563 non-null  object 
 4   isAdult         86563 non-null  object 
 5   startYear       86563 non-null  float64
 6   endYear         0 non-null      float64
 7   runtimeMinutes  86563 non-null  object 
 8   genres          86563 non-null  object 
dtypes: float64(2), object(7)
memory usage: 6.6+ MB


In [98]:
akas_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1433173 entries, 5 to 35743985
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1433173 non-null  object
 1   ordering         1433173 non-null  int64 
 2   title            1433173 non-null  object
 3   region           1433173 non-null  object
 4   language         3893 non-null     object
 5   types            978227 non-null   object
 6   attributes       46470 non-null    object
 7   isOriginalTitle  1431828 non-null  object
dtypes: int64(1), object(7)
memory usage: 98.4+ MB


In [99]:
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 496798 entries, 0 to 1306391
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         496798 non-null  object 
 1   averageRating  496798 non-null  float64
 2   numVotes       496798 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 15.2+ MB


## **Saving Results**

In [100]:
# Save each DataFrame to compressed files
basics_df.to_csv("Data/title_basics.csv.gz", compression='gzip',index=False)
ratings_df.to_csv("Data/title_ratings.csv.gz", compression='gzip',index=False)
akas_df.to_csv("Data/title_akas.csv.gz", compression='gzip',index=False)

In [101]:
# Open saved file and preview again
basics_df = pd.read_csv("Data/title_basics.csv.gz", low_memory=False)
basics_df

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama
...,...,...,...,...,...,...,...,...,...
86558,tt9914942,movie,Life Without Sara Amat,La vida sense la Sara Amat,0,2019.0,,74,Drama
86559,tt9915872,movie,The Last White Witch,My Girlfriend is a Wizard,0,2019.0,,97,"Comedy,Drama,Fantasy"
86560,tt9916170,movie,The Rehearsal,O Ensaio,0,2019.0,,51,Drama
86561,tt9916190,movie,Safeguard,Safeguard,0,2020.0,,95,"Action,Adventure,Thriller"


In [102]:
# Open saved file and preview again
ratings_df = pd.read_csv("Data/title_ratings.csv.gz", low_memory=False)
ratings_df

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1966
1,tt0000002,5.8,263
2,tt0000005,6.2,2607
3,tt0000006,5.2,181
4,tt0000007,5.4,816
...,...,...,...
496793,tt9916200,8.1,229
496794,tt9916204,8.1,262
496795,tt9916348,8.1,18
496796,tt9916362,6.4,5307


In [103]:
# Open saved file and preview again
akas_df = pd.read_csv("Data/title_akas.csv.gz", low_memory=False)
akas_df

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,,imdbDisplay,,0.0
1,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0.0
2,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0.0
3,tt0000005,1,Blacksmithing Scene,US,,alternative,,0.0
4,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0.0
...,...,...,...,...,...,...,...,...
1433168,tt9916560,1,March of Dimes Presents: Once Upon a Dime,US,,imdbDisplay,,0.0
1433169,tt9916620,1,The Copeland Case,US,,imdbDisplay,,0.0
1433170,tt9916702,1,Loving London: The Playground,US,,,,0.0
1433171,tt9916756,1,Pretty Pretty Black Girl,US,,imdbDisplay,,0.0
