In [67]:
#import libraries
import os

import numpy as np
import pandas as pd

In [68]:
# Load the IMDb title.basics table (tab-separated, gzipped) directly from its URL.
basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
basics = pd.read_csv(
    basics_url,
    sep="\t",
    low_memory=False,
)
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10100795 entries, 0 to 10100794
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 693.6+ MB


In [69]:
# Load the akas table from a local CSV file.
# NOTE(review): this absolute path only resolves on the original author's machine —
# consider a path relative to a configurable data directory.
# NOTE(review): the file name suggests the rows are already restricted to the US,
# but nothing in this notebook checks the `region` column — confirm.
akas_url = "/Users/kass/Downloads/title-akas-us-only.csv"
# The frame has to be read before it is used; without this assignment `akas`
# is undefined on a fresh kernel and the next line raises NameError.
# Assumes the file is comma-separated (pandas default) — TODO confirm.
akas = pd.read_csv(akas_url, low_memory=False)
akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36951101 entries, 0 to 36951100
Data columns (total 8 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   titleId          object
 1   ordering         int64 
 2   title            object
 3   region           object
 4   language         object
 5   types            object
 6   attributes       object
 7   isOriginalTitle  object
dtypes: int64(1), object(7)
memory usage: 2.2+ GB


In [70]:
# Load the IMDb title.ratings table (tab-separated, gzipped) directly from its URL.
rating_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"
rating = pd.read_csv(
    rating_url,
    sep="\t",
    low_memory=False,
)
rating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1340671 entries, 0 to 1340670
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1340671 non-null  object 
 1   averageRating  1340671 non-null  float64
 2   numVotes       1340671 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 30.7+ MB


In [71]:
# Replace every literal '\N' placeholder with NaN so pandas treats it as missing.
null_marker = {'\\N': np.nan}
basics = basics.replace(null_marker)
akas = akas.replace(null_marker)
rating = rating.replace(null_marker)

1. Include only movies that were released in the United States. The akas table was already restricted to US releases when we imported it; below we use it to filter the basics table.

In [72]:
# Build a boolean mask over basics: True where the title's id also appears in the akas table.
akas_title_ids = akas['titleId']
keepers = basics['tconst'].isin(akas_title_ids)
keepers

0            True
1            True
2            True
3            True
4            True
            ...  
10100790     True
10100791     True
10100792     True
10100793     True
10100794    False
Name: tconst, Length: 10100795, dtype: bool

In [None]:
# Keep only the rows flagged by the `keepers` mask.
# Take an explicit copy so that column assignments made on this frame afterwards
# operate on an independent DataFrame rather than a slice (avoids SettingWithCopyWarning).
basics = basics[keepers].copy()
basics

2. Include only movies that were released between 2000 and 2021 (inclusive of both 2000 and 2021)

In [None]:
# Coerce startYear to numeric so it can be compared against year bounds;
# values that cannot be parsed become NaN. `assign` returns a new frame, so the
# column is replaced without writing into a slice of the earlier DataFrame.
start_year = pd.to_numeric(basics['startYear'], errors='coerce')
basics = basics.assign(startYear=start_year)

# Keep only rows released from 2000 through 2021, both ends included.
# The comparison must be used as a row mask: assigning it back to the column
# would replace the years with booleans and remove no rows at all.
# `between` is inclusive on both ends by default and is False for NaN years.
released_in_range = basics['startYear'].between(2000, 2021)
basics = basics[released_in_range]

3. Include only full-length movies (titleType = "movie").


In [None]:
# Retain only the rows whose titleType is exactly "movie".
is_movie = basics['titleType'] == "movie"
basics = basics.loc[is_movie]

4. Exclude any movie with missing values for genre or runtime

In [None]:
# Drop any row that is missing a runtime or a genre value.
required_columns = ['runtimeMinutes', 'genres']
basics = basics.dropna(subset=required_columns)

5. Include only fictional movies (not from the Documentary genre)


In [None]:
# Flag every title whose genres string mentions "Documentary" (case-insensitive),
# then keep only the rows that are not flagged.
is_documentary = basics['genres'].str.contains('Documentary', case=False)
basics = basics.loc[~is_documentary]

In [None]:
# Persist the processed tables as gzip-compressed CSV files under Data/.
# to_csv does not create missing directories, so make sure the target exists first.
os.makedirs("Data", exist_ok=True)
basics.to_csv("Data/title_basics.csv.gz", compression='gzip', index=False)
akas.to_csv("Data/title_akas.csv.gz", compression='gzip', index=False)
# The ratings frame is bound to the name `rating` (not `ratings`) where it is loaded.
rating.to_csv("Data/title_rating.csv.gz", compression='gzip', index=False)