# IMDB DATA ANALYSIS

In [1]:
import pandas as pd
import numpy as np
import os

## The Data

In [2]:
#Basics
basics_url="https://datasets.imdbws.com/title.basics.tsv.gz"
basics = pd.read_csv(basics_url, sep='\t', low_memory=False)

#Akas
akas_url="https://datasets.imdbws.com/title.akas.tsv.gz"
akas = pd.read_csv(akas_url, sep='\t', low_memory=False)

#Ratings
ratings_url="https://datasets.imdbws.com/title.ratings.tsv.gz"
ratings = pd.read_csv(ratings_url, sep='\t', low_memory=False)

## Preprocessing

### Title Basics

In [3]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9864435 entries, 0 to 9864434
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 677.3+ MB


In [4]:
#Replace "\N" with np.nan
basics.replace({'\\N': np.nan}, inplace=True)

In [5]:
#Eliminate movies that are null for runtimeMinutes
basics.dropna (subset = ["runtimeMinutes"], inplace = True)

In [6]:
#Eliminate movies that are null for genres
basics.dropna (subset = ["genres"], inplace = True)

In [7]:
#keep only titleType==Movie
basics=basics[(basics['titleType'] == 'movie')]

In [8]:
#keep startYear 2000-2022
##from .info, its observed its an object, so ill change dtype to int 
basics.dropna (subset = ["startYear"], inplace = True)
basics['startYear'] = basics['startYear'].astype(int)

##apply two filters to make startYear between 2000 to 2022
basics = basics[(basics['startYear'] >= 2000) & (basics['startYear'] <= 2022)]
basics['startYear'].value_counts()

2017    14375
2018    14336
2019    14076
2016    13962
2015    13481
2014    13115
2022    12844
2013    12388
2021    12373
2012    11637
2020    11576
2011    10781
2010    10208
2009     9361
2008     8158
2007     6964
2006     6523
2005     5838
2004     5213
2003     4592
2002     4129
2001     3869
2000     3641
Name: startYear, dtype: int64

In [9]:
#Eliminate movies that include "Documentary" in genre 
is_documentary = basics['genres'].str.contains('documentary',case=False)
basics = basics[~is_documentary]

In [10]:
#Keep only US movies
##Filter the basics table down to only include the US by using the filter akas dataframe
basic_keepers =basics['tconst'].isin(akas['titleId'])

##filter basics
basics = basics[basic_keepers]

### AKAs

In [11]:
akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35949831 entries, 0 to 35949830
Data columns (total 8 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   titleId          object
 1   ordering         int64 
 2   title            object
 3   region           object
 4   language         object
 5   types            object
 6   attributes       object
 7   isOriginalTitle  object
dtypes: int64(1), object(7)
memory usage: 2.1+ GB


In [12]:
#Replace "\N" with np.nan
akas.replace({'\\N': np.nan}, inplace=True)

In [13]:
#keep only US movies
##drop nulls
akas.dropna (subset = ['region'], inplace = True)
##apply filter
akas = akas[akas['region'] == 'US']
akas['region'].value_counts()

US    1438079
Name: region, dtype: int64

### Ratings

In [14]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1313744 entries, 0 to 1313743
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1313744 non-null  object 
 1   averageRating  1313744 non-null  float64
 2   numVotes       1313744 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 30.1+ MB


In [15]:
#Replace "\N" with np.nan
ratings.replace({'\\N': np.nan}, inplace=True)

In [17]:
#keep only US movies
##Filter the ratings table down to only include the US by using the filter akas dataframe
ratings_keepers =ratings['tconst'].isin(akas['titleId'])

##filter ratings
ratings = ratings[ratings_keepers]

## Overview

In [18]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 147576 entries, 34803 to 9864285
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          147576 non-null  object
 1   titleType       147576 non-null  object
 2   primaryTitle    147576 non-null  object
 3   originalTitle   147576 non-null  object
 4   isAdult         147576 non-null  object
 5   startYear       147576 non-null  int64 
 6   endYear         0 non-null       object
 7   runtimeMinutes  147576 non-null  object
 8   genres          147576 non-null  object
dtypes: int64(1), object(8)
memory usage: 15.3+ MB


In [19]:
akas.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1438079 entries, 5 to 35949575
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1438079 non-null  object
 1   ordering         1438079 non-null  int64 
 2   title            1438079 non-null  object
 3   region           1438079 non-null  object
 4   language         3933 non-null     object
 5   types            978972 non-null   object
 6   attributes       46596 non-null    object
 7   isOriginalTitle  1436734 non-null  object
dtypes: int64(1), object(7)
memory usage: 98.7+ MB


In [20]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 499625 entries, 0 to 1313719
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         499625 non-null  object 
 1   averageRating  499625 non-null  float64
 2   numVotes       499625 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 15.2+ MB


## Save

In [21]:
os.makedirs('Data/',exist_ok=True) 
# Confirm folder created
os.listdir("Data/")

[]

In [22]:
## Save current dataframe to file.
basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)

# Open saved file and preview again
basics = pd.read_csv("Data/title_basics.csv.gz", low_memory = False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126,Drama


In [23]:
## Save current dataframe to file.
akas.to_csv("Data/title_akas.csv.gz",compression='gzip',index=False)

# Open saved file and preview again
akas = pd.read_csv("Data/title_akas.csv.gz", low_memory = False)
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,,imdbDisplay,,0.0
1,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0.0
2,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0.0
3,tt0000005,1,Blacksmithing Scene,US,,alternative,,0.0
4,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0.0


In [24]:
## Save current dataframe to file.
ratings.to_csv("Data/title_ratings.csv.gz",compression='gzip',index=False)

# Open saved file and preview again
ratings = pd.read_csv("Data/title_ratings.csv.gz", low_memory = False)
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1974
1,tt0000002,5.8,264
2,tt0000005,6.2,2617
3,tt0000006,5.1,182
4,tt0000007,5.4,820
