### Loading Libraries

In [71]:
import pandas as pd
import numpy as np
import os

### Loading Data

In [16]:
# links from IMDB
basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
titles_url = "https://datasets.imdbws.com/title.akas.tsv.gz"
ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"

In [None]:
# creating dataframes from the links
basics = pd.read_csv(basics_url, sep='\t', low_memory = False)
titles = pd.read_csv(titles_url, sep='\t', low_memory = False)
ratings = pd.read_csv(ratings_url, sep='\t', low_memory = False)

In [25]:
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [24]:
titles.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


### Cleaning Dataframes

In [26]:
# removing \N and replacing with nan
basics = basics.replace({'\\N':np.nan})
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"


In [27]:
# removing \N and replacing with nan
titles = titles.replace({'\\N':np.nan})
titles.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,,imdbDisplay,,0
1,tt0000001,2,Carmencita,DE,,,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,,imdbDisplay,,0
3,tt0000001,4,Καρμενσίτα,GR,,imdbDisplay,,0
4,tt0000001,5,Карменсита,RU,,imdbDisplay,,0


In [28]:
# removing \N and replacing with nan
ratings = ratings.replace({'\\N':np.nan})
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000002,5.9,254
1,tt0000003,6.5,1692
2,tt0000004,5.7,166
3,tt0000005,6.2,2507
4,tt0000006,5.1,171


In [31]:
# removing rows that are nan for runtimeMinutes
basics = basics[basics['runtimeMinutes'].notnull()]

In [34]:
# removing rows that are nan for genre
basics = basics[basics['genres'].notnull()]

In [50]:
# filtering only titletype with category movie
basics = basics[basics['titleType'] == 'movie']

In [62]:
# filtering out movies older than 2000
basics.drop(basics[(basics['startYear'] < '2000') | (basics['startYear'] > '2022')].index, inplace = True)

In [64]:
# filtering movies that are documentary
documentary_movies = basics['genres'].str.contains('documentary', case = False)
basics = basics[~documentary_movies]

In [67]:
# filtering only US movies from titles dataframe
titles = titles[titles['region'] == 'US']

In [70]:
# filtering basics to only include US movies using the titles dataframe
US_movie_filter = basics['tconst'].isin(titles['titleId'])

# using filter on basics dataframe
basics = basics[US_movie_filter]

### Saving updated dataframes

In [72]:
# creating a new folder to hold the dataframes
os.makedirs('Data/',exist_ok=True) 
# checking that the folder has been created
os.listdir("Data/")

[]

In [73]:
# saving dataframes to newly created folder
basics.to_csv("Data/basics.csv.gz",compression='gzip',index=False)
titles.to_csv("Data/titles.csv.gz",compression='gzip',index=False)
ratings.to_csv("Data/ratings.csv.gz",compression='gzip',index=False)

### Dataframe summaries

In [74]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 83870 entries, 34785 to 9080156
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   tconst          83870 non-null  object
 1   titleType       83870 non-null  object
 2   primaryTitle    83870 non-null  object
 3   originalTitle   83870 non-null  object
 4   isAdult         83870 non-null  object
 5   startYear       81720 non-null  object
 6   endYear         0 non-null      object
 7   runtimeMinutes  83870 non-null  object
 8   genres          83870 non-null  object
dtypes: object(9)
memory usage: 6.4+ MB


In [75]:
titles.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1336074 entries, 5 to 32657507
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1336074 non-null  object
 1   ordering         1336074 non-null  int64 
 2   title            1336074 non-null  object
 3   region           1336074 non-null  object
 4   language         3622 non-null     object
 5   types            1026471 non-null  object
 6   attributes       44450 non-null    object
 7   isOriginalTitle  1334699 non-null  object
dtypes: int64(1), object(7)
memory usage: 91.7+ MB


In [76]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1219066 entries, 0 to 1219065
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1219066 non-null  object 
 1   averageRating  1219066 non-null  float64
 2   numVotes       1219066 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 27.9+ MB
