In [11]:
# Imports
import pandas as pd
import numpy as np
import gzip

In [9]:
basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"
akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"

basics = pd.read_csv(basics_url, sep='\t', low_memory=False)
ratings = pd.read_csv(ratings_url, sep='\t', low_memory=False)
akas = pd.read_csv(akas_url, sep='\t', low_memory=False)





# Preprocessing

## AKAs Cleaning

- Keep only US movies
- Replace "\N" with np.nan

In [15]:
# Filter US movies
akas = akas[akas['region'] == 'US']
print(akas['region'].value_counts())

# Replace "\N" with np.nan
akas.replace("\\N", np.nan, inplace=True)

US    1460235
Name: region, dtype: int64


In [16]:
# Check for NaNs
akas.isna().sum()

titleId                  0
ordering                 0
title                    0
region                   0
language           1456113
types               477381
attributes         1412906
isOriginalTitle       1342
dtype: int64

## Titles Cleaning
- Keep only US movies (Use AKAs table, see "Filtering one dataframe based on another" section below)
- Replace "\N" with np.nan
- Eliminate movies that are null for runtimeMinutes
- Eliminate movies that are null for genre
- keep only titleType==Movie
- Convert the startYear column to float data type.
- Filter the dataframe using startYear. Keep years between 2000-2021 (Including 2000 and 2021)
- Eliminate movies that include "Documentary" in the genre (see tip below).

In [19]:
# Filter the basics table down to only include the US by using the filter akas dataframe
keepers = basics['tconst'].isin(akas['titleId'])



False    8719154
True     1372925
Name: tconst, dtype: int64

In [22]:
# Filter Basics
basics = basics[keepers]
basics



  basics = basics[keepers]


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
5,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894,\N,1,Short
6,tt0000007,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894,\N,1,"Short,Sport"
...,...,...,...,...,...,...,...,...,...
10091940,tt9916560,tvMovie,March of Dimes Presents: Once Upon a Dime,March of Dimes Presents: Once Upon a Dime,0,1963,\N,58,Family
10091969,tt9916620,movie,The Copeland Case,The Copeland Case,0,\N,\N,\N,Drama
10092007,tt9916702,short,Loving London: The Playground,Loving London: The Playground,0,\N,\N,\N,"Drama,Short"
10092030,tt9916756,short,Pretty Pretty Black Girl,Pretty Pretty Black Girl,0,2019,\N,\N,Short


## Ratings Cleaning