In [3]:
import pandas as pd

### Main IMDB Dataset of All Movies ###

In [16]:
# convert imdb tsv files to pandas dataframe and assign to a variable
imdb = pd.read_csv('imdbtitle_akas.tsv', delimiter='\t', dtype=str)

In [5]:
imdb.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
1,tt0000001,2,Καρμενσίτα,GR,\N,\N,\N,0
2,tt0000001,3,Карменсита,RU,\N,\N,\N,0
3,tt0000001,4,Carmencita,US,\N,\N,\N,0
4,tt0000001,5,Carmencita,\N,\N,original,\N,1


In [6]:
imdb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3645877 entries, 0 to 3645876
Data columns (total 8 columns):
titleId            object
ordering           object
title              object
region             object
language           object
types              object
attributes         object
isOriginalTitle    object
dtypes: object(8)
memory usage: 222.5+ MB


In [7]:
# drop duplicates in dataframe based on titleid
imdb_nodup = imdb.drop_duplicates(subset=['titleId'])
imdb_nodup.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
5,tt0000002,1,Le clown et ses chiens,\N,\N,original,\N,1
11,tt0000003,1,Sarmanul Pierrot,RO,\N,imdbDisplay,\N,0
17,tt0000004,1,Un bon bock,\N,\N,original,\N,1
23,tt0000005,1,Blacksmithing Scene,US,\N,alternative,\N,0


In [8]:
imdb_nodup.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2072736 entries, 0 to 3645875
Data columns (total 8 columns):
titleId            object
ordering           object
title              object
region             object
language           object
types              object
attributes         object
isOriginalTitle    object
dtypes: object(8)
memory usage: 142.3+ MB


In [46]:
# remove unneeded columns from dataframe
imdb_reduced = imdb_nodup.drop(['isOriginalTitle', 'attributes', 'language', 'ordering', 'types'], axis=1)
imdb_reduced.head()

Unnamed: 0,titleId,title,region
0,tt0000001,Carmencita - spanyol tánc,HU
5,tt0000002,Le clown et ses chiens,\N
11,tt0000003,Sarmanul Pierrot,RO
17,tt0000004,Un bon bock,\N
23,tt0000005,Blacksmithing Scene,US


In [51]:
imdb_reduced = imdb_reduced.rename(columns = {'titleId':'tconst'})

In [53]:
imdb_reduced.head()

Unnamed: 0,tconst,title,region
0,tt0000001,Carmencita - spanyol tánc,HU
5,tt0000002,Le clown et ses chiens,\N
11,tt0000003,Sarmanul Pierrot,RO
17,tt0000004,Un bon bock,\N
23,tt0000005,Blacksmithing Scene,US


In [49]:
# only movies with region noted as US
imdb_us = imdb_reduced[imdb_nodup['region'] == 'US']

In [50]:
imdb_us.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 771709 entries, 23 to 3645859
Data columns (total 3 columns):
titleId    771709 non-null object
title      771709 non-null object
region     771709 non-null object
dtypes: object(3)
memory usage: 23.6+ MB


In [32]:
# list of IMDB movie titleID - US only
imdb_us_titleID = imdb_reduced['titleId'].values.tolist()

In [33]:
imdb_us_titleID[:10]

['tt0000001',
 'tt0000002',
 'tt0000003',
 'tt0000004',
 'tt0000005',
 'tt0000006',
 'tt0000007',
 'tt0000008',
 'tt0000009',
 'tt0000010']

In [34]:
# list of IMDB movie titleid
imdb_titleID = imdb_reduced['titleId'].values.tolist()

In [35]:
imdb_titleID[:10]

['tt0000001',
 'tt0000002',
 'tt0000003',
 'tt0000004',
 'tt0000005',
 'tt0000006',
 'tt0000007',
 'tt0000008',
 'tt0000009',
 'tt0000010']

### IMDB Dataset of Movie Ratings ###

In [17]:
imdb_ratings = pd.read_csv('./imdbtitle_ratings.tsv', delimiter='\t')

In [18]:
imdb_ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.8,1482
1,tt0000002,6.4,177
2,tt0000003,6.6,1112
3,tt0000004,6.5,108
4,tt0000005,6.2,1816


In [19]:
imdb_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 919378 entries, 0 to 919377
Data columns (total 3 columns):
tconst           919378 non-null object
averageRating    919378 non-null float64
numVotes         919378 non-null int64
dtypes: float64(1), int64(1), object(1)
memory usage: 21.0+ MB


### IMDB Dataset of Movie Genres ###

In [20]:
imdb_genres = pd.read_csv('./imdbtitle_basics.tsv', delimiter='\t')

In [21]:
imdb_genres.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,\N,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [41]:
imdb_genres.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5743271 entries, 0 to 5743270
Data columns (total 9 columns):
tconst            object
titleType         object
primaryTitle      object
originalTitle     object
isAdult           int64
startYear         object
endYear           object
runtimeMinutes    object
genres            object
dtypes: int64(1), object(8)
memory usage: 394.4+ MB


In [43]:
genres_reduced = imdb_genres.drop(['titleType', 'originalTitle', 'isAdult', 'endYear', 'runtimeMinutes'], axis=1)

In [44]:
genres_reduced.head()

Unnamed: 0,tconst,primaryTitle,startYear,genres
0,tt0000001,Carmencita,1894,"Documentary,Short"
1,tt0000002,Le clown et ses chiens,1892,"Animation,Short"
2,tt0000003,Pauvre Pierrot,1892,"Animation,Comedy,Romance"
3,tt0000004,Un bon bock,1892,"Animation,Short"
4,tt0000005,Blacksmith Scene,1893,"Comedy,Short"


### IMDB Dataset of Directors for Each Movie ###

In [23]:
imdb_crew = pd.read_csv('./imdbtitle_crew.tsv', delimiter='\t')

In [24]:
imdb_crew.head()

Unnamed: 0,tconst,directors,writers
0,tt0000001,nm0005690,\N
1,tt0000002,nm0721526,\N
2,tt0000003,nm0721526,\N
3,tt0000004,nm0721526,\N
4,tt0000005,nm0005690,\N


In [26]:
imdb_crew.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5743271 entries, 0 to 5743270
Data columns (total 3 columns):
tconst       object
directors    object
writers      object
dtypes: object(3)
memory usage: 131.5+ MB


### IMDB Dataset of Actors and Directors Details ###

In [28]:
imdb_team = pd.read_csv('./imdbname_basics.tsv', delimiter='\t')

In [36]:
imdb_team.head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0043044,tt0072308,tt0053137,tt0050419"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack","tt0037382,tt0117057,tt0071877,tt0038355"
2,nm0000003,Brigitte Bardot,1934,\N,"actress,soundtrack,producer","tt0059956,tt0049189,tt0057345,tt0054452"
3,nm0000004,John Belushi,1949,1982,"actor,writer,soundtrack","tt0080455,tt0078723,tt0072562,tt0077975"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0050976,tt0050986,tt0083922,tt0069467"


In [37]:
imdb_team.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9220668 entries, 0 to 9220667
Data columns (total 6 columns):
nconst               object
primaryName          object
birthYear            object
deathYear            object
primaryProfession    object
knownForTitles       object
dtypes: object(6)
memory usage: 422.1+ MB


### Join IMDB Datasets ###

In [56]:
imdb_details = imdb_reduced.merge(genres_reduced, on="tconst")

In [57]:
imdb_details.head()

Unnamed: 0,tconst,title,region,primaryTitle,startYear,genres
0,tt0000001,Carmencita - spanyol tánc,HU,Carmencita,1894,"Documentary,Short"
1,tt0000002,Le clown et ses chiens,\N,Le clown et ses chiens,1892,"Animation,Short"
2,tt0000003,Sarmanul Pierrot,RO,Pauvre Pierrot,1892,"Animation,Comedy,Romance"
3,tt0000004,Un bon bock,\N,Un bon bock,1892,"Animation,Short"
4,tt0000005,Blacksmithing Scene,US,Blacksmith Scene,1893,"Comedy,Short"


In [58]:
imdb_details.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2068181 entries, 0 to 2068180
Data columns (total 6 columns):
tconst          object
title           object
region          object
primaryTitle    object
startYear       object
genres          object
dtypes: object(6)
memory usage: 110.5+ MB


In [59]:
imdb_details.tail()

Unnamed: 0,tconst,title,region,primaryTitle,startYear,genres
2068176,tt9915592,A Husband to Rent,XWW,A Husband to Rent,1974,Comedy
2068177,tt9916170,O Ensaio,BR,The Rehearsal,2019,Drama
2068178,tt9916174,Kleiderglück,\N,Kleiderglück,2018,Short
2068179,tt9916192,Danielle Darrieux: Filmstar mit dem frechen Etwas,DE,Danielle Darrieux: Il est poli d'être gai!,2019,Biography
2068180,tt9916460,Ροζ Ταξί,GR,Pink Taxi,2019,Comedy


In [60]:
imdb_details.isnull().values.any()

True

In [61]:
imdb_details.isnull().sum()

tconst           0
title            0
region          53
primaryTitle     0
startYear        0
genres           1
dtype: int64