# IMDB MySQL Database Part One
* James Belk
* 3/22/2023

## Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## Load Data

In [2]:
# basics_url = 'https://datasets.imdbws.com/title.basics.tsv.gz'
# akas_url = 'https://datasets.imdbws.com/title.akas.tsv.gz'
# ratings_url = 'https://datasets.imdbws.com/title.ratings.tsv.gz'

In [3]:
basics = pd.read_csv('Data/title_basics.csv.gz', low_memory = False)


In [4]:
akas = pd.read_csv('Data/title_akas.csv.gz', low_memory=False)

In [5]:
ratings = pd.read_csv('Data/title_ratings.csv.gz', low_memory=False)

### Save Datasets to 'Data' Folder

In [6]:
basics.to_csv('Data/title_basics.csv.gz',compression='gzip',index=False)

In [7]:
akas.to_csv('Data/title_akas.csv.gz', compression='gzip', index = False)

In [8]:
ratings.to_csv('Data/title_ratings.csv.gz', compression='gzip', index = False)

# Clean Data

## Title Basics

In [9]:
display(basics.head())
display(basics.info())

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892.0,,5,"Animation,Short"
1,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893.0,,1,"Comedy,Short"
2,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894.0,,1,Short
3,tt0000007,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894.0,,1,"Short,Sport"
4,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894.0,,45,Romance


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 714643 entries, 0 to 714642
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          714643 non-null  object 
 1   titleType       714643 non-null  object 
 2   primaryTitle    714643 non-null  object 
 3   originalTitle   714643 non-null  object 
 4   isAdult         714643 non-null  int64  
 5   startYear       707871 non-null  float64
 6   endYear         17538 non-null   float64
 7   runtimeMinutes  714643 non-null  int64  
 8   genres          714643 non-null  object 
dtypes: float64(2), int64(2), object(5)
memory usage: 49.1+ MB


None

### Replace '\N'

In [10]:
basics = basics.replace({'\\N': np.nan})
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892.0,,5,"Animation,Short"
1,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893.0,,1,"Comedy,Short"
2,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894.0,,1,Short
3,tt0000007,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894.0,,1,"Short,Sport"
4,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894.0,,45,Romance


### Eliminate Movies That Are Null For 'runtimeMinutes'

In [11]:
runtime_drop = basics.dropna(subset=['runtimeMinutes'], inplace = True)

### Eliminate Movies That Are Null For Genre

In [12]:
genre_null = basics.dropna(subset=['genres'], inplace = True)

### Keep Only 'titleType' Movie

In [13]:
movie_type = basics['titleType']=='Movie'

### Keep 'startYear' 2000-2002

In [14]:
date_range = basics['startYear']==('2000', '2001', '2002')

  return op(a, b)


### Eliminate Movies That Include 'Documentary' in Genre

In [15]:
is_documentary = basics['genres'].str.contains('documentary', case = False)
basics = basics[~is_documentary]

### Keep Only US Movies

In [16]:
keepers = basics['tconst'].isin(akas['titleId'])
keepers

0         True
1         True
2         True
3         True
4         True
          ... 
714638    True
714639    True
714640    True
714641    True
714642    True
Name: tconst, Length: 714643, dtype: bool

In [17]:
basics = basics[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892.0,,5,"Animation,Short"
1,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893.0,,1,"Comedy,Short"
2,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894.0,,1,Short
3,tt0000007,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894.0,,1,"Short,Sport"
4,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894.0,,45,Romance
...,...,...,...,...,...,...,...,...,...
714638,tt9916214,short,Drown the Clown,Drown the Clown,0,2019.0,,8,"Drama,Short"
714639,tt9916254,video,Big Tit Cream Pie 32,Big Tit Cream Pie 32,1,2015.0,,226,Adult
714640,tt9916348,video,Ancient World Exposed,Ancient World Exposed,0,2019.0,,67,History
714641,tt9916362,movie,Coven,Akelarre,0,2020.0,,92,"Drama,History"


In [18]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714643 entries, 0 to 714642
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          714643 non-null  object 
 1   titleType       714643 non-null  object 
 2   primaryTitle    714643 non-null  object 
 3   originalTitle   714643 non-null  object 
 4   isAdult         714643 non-null  int64  
 5   startYear       707871 non-null  float64
 6   endYear         17538 non-null   float64
 7   runtimeMinutes  714643 non-null  int64  
 8   genres          714643 non-null  object 
dtypes: float64(2), int64(2), object(5)
memory usage: 54.5+ MB


## AKAs

In [19]:
display(akas.head())
akas.info()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,,imdbDisplay,,0.0
1,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0.0
2,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0.0
3,tt0000005,1,Blacksmithing Scene,US,,alternative,,0.0
4,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1424064 entries, 0 to 1424063
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   titleId          1424064 non-null  object 
 1   ordering         1424064 non-null  int64  
 2   title            1424064 non-null  object 
 3   region           1424064 non-null  object 
 4   language         3866 non-null     object 
 5   types            976227 non-null   object 
 6   attributes       46241 non-null    object 
 7   isOriginalTitle  1422719 non-null  float64
dtypes: float64(1), int64(1), object(6)
memory usage: 86.9+ MB


In [20]:
akas['region'].isin(['US']).any()

True

In [21]:
usa_filter = akas['region']=='US'
akas = akas.loc[usa_filter, :]
akas['region'].value_counts(dropna = False)

US    1424064
Name: region, dtype: int64

In [22]:
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,,imdbDisplay,,0.0
1,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0.0
2,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0.0
3,tt0000005,1,Blacksmithing Scene,US,,alternative,,0.0
4,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0.0


In [23]:
akas = akas.replace({'\\N': np.nan})
akas

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,,imdbDisplay,,0.0
1,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0.0
2,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0.0
3,tt0000005,1,Blacksmithing Scene,US,,alternative,,0.0
4,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0.0
...,...,...,...,...,...,...,...,...
1424059,tt9916560,1,March of Dimes Presents: Once Upon a Dime,US,,imdbDisplay,,0.0
1424060,tt9916620,1,The Copeland Case,US,,imdbDisplay,,0.0
1424061,tt9916702,1,Loving London: The Playground,US,,,,0.0
1424062,tt9916756,1,Pretty Pretty Black Girl,US,,imdbDisplay,,0.0


In [24]:
akas.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1424064 entries, 0 to 1424063
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   titleId          1424064 non-null  object 
 1   ordering         1424064 non-null  int64  
 2   title            1424064 non-null  object 
 3   region           1424064 non-null  object 
 4   language         3866 non-null     object 
 5   types            976227 non-null   object 
 6   attributes       46241 non-null    object 
 7   isOriginalTitle  1422719 non-null  float64
dtypes: float64(1), int64(1), object(6)
memory usage: 97.8+ MB


## Ratings

In [25]:
display(ratings.head())
ratings.info()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1961
1,tt0000002,5.8,263
2,tt0000005,6.2,2600
3,tt0000006,5.1,178
4,tt0000007,5.4,817


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 492984 entries, 0 to 492983
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         492984 non-null  object 
 1   averageRating  492984 non-null  float64
 2   numVotes       492984 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 11.3+ MB


In [26]:
usa = ratings['tconst'].isin(akas['titleId'])
usa

0         True
1         True
2         True
3         True
4         True
          ... 
492979    True
492980    True
492981    True
492982    True
492983    True
Name: tconst, Length: 492984, dtype: bool

In [27]:
ratings = ratings[usa]
ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1961
1,tt0000002,5.8,263
2,tt0000005,6.2,2600
3,tt0000006,5.1,178
4,tt0000007,5.4,817
...,...,...,...
492979,tt9916200,8.2,224
492980,tt9916204,8.2,256
492981,tt9916348,8.3,18
492982,tt9916362,6.4,5242


In [28]:
ratings = ratings.replace({'\\N': np.nan})
ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1961
1,tt0000002,5.8,263
2,tt0000005,6.2,2600
3,tt0000006,5.1,178
4,tt0000007,5.4,817
...,...,...,...
492979,tt9916200,8.2,224
492980,tt9916204,8.2,256
492981,tt9916348,8.3,18
492982,tt9916362,6.4,5242


In [29]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 492984 entries, 0 to 492983
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         492984 non-null  object 
 1   averageRating  492984 non-null  float64
 2   numVotes       492984 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 15.0+ MB
