![image.png](attachment:image.png)

# How to Make a Movie Successful

- Kevin Barnett
> Data Dictionary: https://www.imdb.com/interfaces/


# Part 1

## Import Libraries

In [1]:
import pandas as pd
import json
import tmdbsimple as tmdb

## Creating DataFrames

In [2]:
# URLs for data
title_basics = 'https://datasets.imdbws.com/title.basics.tsv.gz'
title_akas = 'https://datasets.imdbws.com/title.akas.tsv.gz'
title_rating = 'https://datasets.imdbws.com/title.ratings.tsv.gz'

In [3]:
# Pandas dataframes
basics = pd.read_csv(title_basics, sep='\t', low_memory=False)
basics.info()
basics.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9823612 entries, 0 to 9823611
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 674.5+ MB


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [4]:
# Change column titles to lowercase
basics.columns = basics.columns.str.lower()
basics.head()

Unnamed: 0,tconst,titletype,primarytitle,originaltitle,isadult,startyear,endyear,runtimeminutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [5]:
akas = pd.read_csv(title_akas, sep='\t', low_memory=False)
akas.info()
akas.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35794725 entries, 0 to 35794724
Data columns (total 8 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   titleId          object
 1   ordering         int64 
 2   title            object
 3   region           object
 4   language         object
 5   types            object
 6   attributes       object
 7   isOriginalTitle  object
dtypes: int64(1), object(7)
memory usage: 2.1+ GB


Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [6]:
# Change column titles to lowercase
akas.columns = akas.columns.str.lower()
akas.head()

Unnamed: 0,titleid,ordering,title,region,language,types,attributes,isoriginaltitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [7]:
rating = pd.read_csv(title_rating, sep='\t', low_memory=False)
rating.info()
rating.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1306793 entries, 0 to 1306792
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1306793 non-null  object 
 1   averageRating  1306793 non-null  float64
 2   numVotes       1306793 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 29.9+ MB


Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1967
1,tt0000002,5.8,264
2,tt0000003,6.5,1811
3,tt0000004,5.6,178
4,tt0000005,6.2,2610


In [8]:
# Change column titles to lowercase
rating.columns = rating.columns.str.lower()
rating.head()

Unnamed: 0,tconst,averagerating,numvotes
0,tt0000001,5.7,1967
1,tt0000002,5.8,264
2,tt0000003,6.5,1811
3,tt0000004,5.6,178
4,tt0000005,6.2,2610


## Data Cleaning

In [9]:
# Basics df
# Replace '\N with np.nan'
basics.replace({'\\N':'np.nan'}, inplace=True)
basics.head()

Unnamed: 0,tconst,titletype,primarytitle,originaltitle,isadult,startyear,endyear,runtimeminutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,np.nan,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,np.nan,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,np.nan,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,np.nan,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,np.nan,1,"Comedy,Short"


In [10]:
# Eliminate movies that are null for runtimeminutes
(basics['runtimeminutes'] == 'np.nan').sum()

6928148

In [11]:
basics.drop(basics[basics['runtimeminutes'] == 'np.nan'].index, inplace = True)
(basics['runtimeminutes'] == 'np.nan').sum()

0

In [12]:
# Eliminate movies that are null for genre
(basics['genres'] == 'np.nan').sum()

76610

In [13]:
basics.drop(basics[basics['genres'] == 'np.nan'].index, inplace = True)
(basics['genres'] == 'np.nan').sum()

0

In [14]:
# Keep only titletype == movie
basics['titletype'].value_counts()

tvEpisode       1430585
short            600029
movie            381794
video            180291
tvMovie           91487
tvSeries          90305
tvSpecial         18085
tvMiniSeries      17149
tvShort            8807
videoGame           322
Name: titletype, dtype: int64

In [15]:
basics.drop(basics[basics['titletype'] != 'movie'].index, inplace = True)
basics['titletype'].value_counts()

movie    381794
Name: titletype, dtype: int64

In [16]:
# Keep startyear == 2000-2022
basics['startyear'].value_counts()

2017    14365
2018    14324
2019    14057
2016    13953
2015    13478
        ...  
1904        1
1897        1
1896        1
2026        1
1894        1
Name: startyear, Length: 131, dtype: int64

In [17]:
basics.drop(basics[basics['startyear'] < '2000'].index, inplace = True)
basics.drop(basics[basics['startyear'] >= '2022'].index, inplace = True)
basics['startyear'].value_counts()

2017    14365
2018    14324
2019    14057
2016    13953
2015    13478
2014    13102
2013    12384
2021    12332
2012    11628
2020    11564
2011    10777
2010    10203
2009     9353
2008     8152
2007     6963
2006     6515
2005     5831
2004     5203
2003     4588
2002     4130
2001     3866
2000     3641
Name: startyear, dtype: int64

In [18]:
# Convert to 'startyear' to float64
basics['startyear'] = basics['startyear'].astype('float64')
basics.dtypes

tconst             object
titletype          object
primarytitle       object
originaltitle      object
isadult            object
startyear         float64
endyear            object
runtimeminutes     object
genres             object
dtype: object

In [19]:
# Eliminate movies that have 'documentary' in the genre
documentary = basics['genres'].str.contains('documentary', case = False)
basics = basics[~documentary]
(basics['genres'].str.contains('documetary')).sum()

0

In [20]:
# Keep only movies in the US
akas['region'].value_counts()

DE    4286251
FR    4282042
JP    4280756
IN    4221782
ES    4202201
       ...   
JE          2
NU          1
TV          1
PW          1
NR          1
Name: region, Length: 248, dtype: int64

In [21]:
akas.drop(akas[akas['region'] != 'US'].index, inplace = True)
akas.value_counts()

titleid     ordering  title                          region  language  types        attributes  isoriginaltitle
tt0000001   6         Carmencita                     US      \N        imdbDisplay  \N          0                  1
tt26685237  1         Return                         US      \N        \N           \N          0                  1
tt26685264  1         Adóptame                       US      \N        \N           \N          0                  1
tt26685263  1         Mt St Hell                     US      \N        \N           \N          0                  1
tt26685260  1         Desolate                       US      \N        \N           \N          0                  1
                                                                                                                  ..
tt11674072  1         Monica and Friends: Lessons    US      \N        \N           new title   0                  1
tt11674020  2         Charles Ponzi the Documentary  US      \N      

In [22]:
# Filter basics df to only include movies from the US
US_basics = basics['tconst'].isin(akas['titleid'])
US_basics

34803       True
61116       True
67669       True
77964      False
86801       True
           ...  
9823285     True
9823294     True
9823333    False
9823378     True
9823462    False
Name: tconst, Length: 138414, dtype: bool

In [23]:
basics = basics[US_basics]
basics

Unnamed: 0,tconst,titletype,primarytitle,originaltitle,isadult,startyear,endyear,runtimeminutes,genres
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,np.nan,118,"Comedy,Fantasy,Romance"
61116,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,np.nan,70,Drama
67669,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,np.nan,122,Drama
86801,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,np.nan,100,"Comedy,Horror,Sci-Fi"
93938,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,np.nan,126,Drama
...,...,...,...,...,...,...,...,...,...
9822750,tt9914942,movie,Life Without Sara Amat,La vida sense la Sara Amat,0,2019.0,np.nan,74,Drama
9823145,tt9915872,movie,The Last White Witch,My Girlfriend is a Wizard,0,2019.0,np.nan,97,"Comedy,Drama,Fantasy"
9823285,tt9916170,movie,The Rehearsal,O Ensaio,0,2019.0,np.nan,51,Drama
9823294,tt9916190,movie,Safeguard,Safeguard,0,2020.0,np.nan,95,"Action,Adventure,Thriller"


In [24]:
# Replace '\N' with np.nan in akas df
akas.replace({'\\N':'np.nan'}, inplace = True)
akas.head()

Unnamed: 0,titleid,ordering,title,region,language,types,attributes,isoriginaltitle
5,tt0000001,6,Carmencita,US,np.nan,imdbDisplay,np.nan,0
14,tt0000002,7,The Clown and His Dogs,US,np.nan,np.nan,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,np.nan,imdbDisplay,np.nan,0
36,tt0000005,1,Blacksmithing Scene,US,np.nan,alternative,np.nan,0
41,tt0000005,6,Blacksmith Scene #1,US,np.nan,alternative,np.nan,0


In [25]:
# Replace '\N' with np.nan in rating df if applicable
(rating == '\\N').sum()

tconst           0
averagerating    0
numvotes         0
dtype: int64

In [26]:
# Filter ratings to only include US movies
US_rating = rating['tconst'].isin(akas['titleid'])
rating = rating[US_rating]
rating

Unnamed: 0,tconst,averagerating,numvotes
0,tt0000001,5.7,1967
1,tt0000002,5.8,264
4,tt0000005,6.2,2610
5,tt0000006,5.2,181
6,tt0000007,5.4,816
...,...,...,...
1306754,tt9916200,8.1,229
1306755,tt9916204,8.1,262
1306762,tt9916348,8.3,18
1306763,tt9916362,6.4,5312


## Saving Compressed .csv.gz Files

In [28]:
import os
os.makedirs('/Data')
os.listdir('/Data')

[]

In [29]:
basics.to_csv('/Data/title_basics.csv.gz',compression='gzip',index=False)
akas.to_csv('/Data/title_akas.csv.gz',compression='gzip',index=False)
rating.to_csv('/Data/title_rating.csv.gz',compression='gzip',index=False)

In [30]:
basics = pd.read_csv('/Data/title_basics.csv.gz', low_memory=False)
basics.info()
basics.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81670 entries, 0 to 81669
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          81670 non-null  object 
 1   titletype       81670 non-null  object 
 2   primarytitle    81670 non-null  object 
 3   originaltitle   81670 non-null  object 
 4   isadult         81670 non-null  int64  
 5   startyear       81670 non-null  float64
 6   endyear         81670 non-null  object 
 7   runtimeminutes  81670 non-null  int64  
 8   genres          81670 non-null  object 
dtypes: float64(1), int64(2), object(6)
memory usage: 5.6+ MB


Unnamed: 0,tconst,titletype,primarytitle,originaltitle,isadult,startyear,endyear,runtimeminutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,np.nan,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,np.nan,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,np.nan,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,np.nan,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,np.nan,126,Drama


In [31]:
akas = pd.read_csv('/Data/title_akas.csv.gz', low_memory=False)
akas.info()
akas.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1434625 entries, 0 to 1434624
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleid          1434625 non-null  object
 1   ordering         1434625 non-null  int64 
 2   title            1434625 non-null  object
 3   region           1434625 non-null  object
 4   language         1434625 non-null  object
 5   types            1434625 non-null  object
 6   attributes       1434625 non-null  object
 7   isoriginaltitle  1434625 non-null  object
dtypes: int64(1), object(7)
memory usage: 87.6+ MB


Unnamed: 0,titleid,ordering,title,region,language,types,attributes,isoriginaltitle
0,tt0000001,6,Carmencita,US,np.nan,imdbDisplay,np.nan,0
1,tt0000002,7,The Clown and His Dogs,US,np.nan,np.nan,literal English title,0
2,tt0000005,10,Blacksmith Scene,US,np.nan,imdbDisplay,np.nan,0
3,tt0000005,1,Blacksmithing Scene,US,np.nan,alternative,np.nan,0
4,tt0000005,6,Blacksmith Scene #1,US,np.nan,alternative,np.nan,0


In [32]:
rating = pd.read_csv('/Data/title_rating.csv.gz', low_memory=False)
rating.info()
rating.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 497338 entries, 0 to 497337
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         497338 non-null  object 
 1   averagerating  497338 non-null  float64
 2   numvotes       497338 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 11.4+ MB


Unnamed: 0,tconst,averagerating,numvotes
0,tt0000001,5.7,1967
1,tt0000002,5.8,264
2,tt0000005,6.2,2610
3,tt0000006,5.2,181
4,tt0000007,5.4,816
