In [1]:
# https://ipython.readthedocs.io/en/stable/config/extensions/autoreload.html?highlight=autoreload
%load_ext autoreload

%autoreload 2

In [10]:
import gc
import numpy as np
import os
import pandas as pd
import sqlalchemy

# Local imports
import imdb
import transform

# Useful resources

1. [How to Work with BIG Datasets on Kaggle Kernels (16G RAM)](https://www.kaggle.com/yuliagm/how-to-work-with-big-datasets-on-16g-ram-dask)

In [3]:
data_folder = './imdb'

if not os.path.exists(data_folder):
    os.mkdir(data_folder)

def save_csv(df, file):
    filename = os.path.join(data_folder, file)
    df.to_csv(filename, index=False)

In [32]:
engine = sqlalchemy.create_engine('mysql+pymysql://imdb:imdb@localhost:3306/imdb')

def df_to_mysql(df, table_name, delete_before=True):
    
    if delete_before:
        # Delete table before adding new rows
        connection = engine.connect()
        trans = connection.begin()
        connection.execute('SET FOREIGN_KEY_CHECKS = 0;')
        stmt = 'TRUNCATE {};'.format(table_name)
        print(stmt)
        connection.execute(stmt)
        connection.execute('SET FOREIGN_KEY_CHECKS = 1;')
        trans.commit()
        connection.close()
        print('Table {} deleted'.format(table_name))
        
    df.to_sql(table_name, con=engine, if_exists='append', index=False, chunksize=10**4)

# Name_basics

In [None]:
name_basics = imdb.name_basics_df()

In [12]:
name_basics_pre = name_basics.copy()

# nconst to int
name_basics_pre['nconst'] = transform.nconst_to_float(name_basics_pre['nconst'])

name_basics_pre.head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,1.0,Fred Astaire,1899.0,1987.0,"soundtrack,actor,miscellaneous","tt0043044,tt0050419,tt0053137,tt0072308"
1,2.0,Lauren Bacall,1924.0,2014.0,"actress,soundtrack","tt0037382,tt0038355,tt0117057,tt0071877"
2,3.0,Brigitte Bardot,1934.0,,"actress,soundtrack,producer","tt0059956,tt0049189,tt0054452,tt0057345"
3,4.0,John Belushi,1949.0,1982.0,"actor,writer,soundtrack","tt0078723,tt0080455,tt0077975,tt0072562"
4,5.0,Ingmar Bergman,1918.0,2007.0,"writer,director,actor","tt0050986,tt0083922,tt0050976,tt0060827"


In [65]:
%%timeit -n 1 -r 1

df_to_mysql(name_basics_pre, 'name_basics')

#save_csv(name_basics_pre, 'name_basics.csv')

Table name_basics deleted
5min 3s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


# title_basics

In [18]:
title_basics = imdb.title_basics_df()

In [20]:
title_basics_pre = title_basics.copy()

title_basics_pre['tconst'] = transform.tconst_to_float(title_basics_pre['tconst'])

# Preserve tconst for future filterings
title_basics_tconst = title_basics_pre['tconst'].copy()

title_basics_pre.info()
title_basics_pre.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5465783 entries, 0 to 5465782
Data columns (total 9 columns):
tconst            float64
titleType         object
primaryTitle      object
originalTitle     object
isAdult           int64
startYear         float64
endYear           float64
runtimeMinutes    float64
genres            object
dtypes: float64(4), int64(1), object(4)
memory usage: 375.3+ MB


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,1.0,short,Carmencita,Carmencita,0,1894.0,,1.0,"Documentary,Short"
1,2.0,short,Le clown et ses chiens,Le clown et ses chiens,0,1892.0,,5.0,"Animation,Short"
2,3.0,short,Pauvre Pierrot,Pauvre Pierrot,0,1892.0,,4.0,"Animation,Comedy,Romance"
3,4.0,short,Un bon bock,Un bon bock,0,1892.0,,,"Animation,Short"
4,5.0,short,Blacksmith Scene,Blacksmith Scene,0,1893.0,,1.0,"Comedy,Short"


In [69]:
%%timeit -n 1 -r 1

df_to_mysql(title_basics_pre, 'title_basics')

# save_csv(title_basics_pre, 'title_basics.csv')

Table title_basics deleted
4min 11s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [21]:
del title_basics
del title_basics_pre
gc.collect()

48

# title_akas

In [7]:
title_akas = imdb.title_akas_df()
title_akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Carmencita - spanyol tánc,HU,,imdbDisplay,,0.0
1,tt0000001,2,Карменсита,RU,,,,0.0
2,tt0000001,3,Carmencita,US,,,,0.0
3,tt0000001,4,Carmencita,,,original,,1.0
4,tt0000002,1,Le clown et ses chiens,,,original,,1.0


In [8]:
title_akas_pre = title_akas.copy()
title_akas_pre['titleId'] = transform.tconst_to_float(title_akas_pre['titleId'])
print('Shape', title_akas_pre.shape)

# Remove title_akas for non-existing# title_basic
title_akas_pre = title_akas_pre[title_akas_pre['titleId'].isin(title_basics_pre['tconst'])]

print('Shape', title_akas_pre.shape)
title_akas_pre.head()

Shape (3699803, 8)
Shape (3695781, 8)


Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,1.0,1,Carmencita - spanyol tánc,HU,,imdbDisplay,,0.0
1,1.0,2,Карменсита,RU,,,,0.0
2,1.0,3,Carmencita,US,,,,0.0
3,1.0,4,Carmencita,,,original,,1.0
4,2.0,1,Le clown et ses chiens,,,original,,1.0


In [13]:
%%timeit -n 1 -r 1

df_to_mysql(title_akas_pre, 'title_akas')

#save_csv(title_akas_pre, 'title_akas.csv')

TRUNCATE title_akas;
Table title_akas deleted
TRUNCATE title_akas;
Table title_akas deleted
TRUNCATE title_akas;
Table title_akas deleted
2min 23s ± 0 ns per loop (mean ± std. dev. of 1 run, 3 loops each)


# title_crew

In [18]:
title_crew = imdb.title_crew_df()

In [22]:
title_crew_pre = title_crew.copy()

# Expand rows based on directors and writers list
title_crew_pre['directors'] = title_crew_pre['directors'].astype('str')
title_crew_pre = transform.expand_rows_using_repeat(title_crew_pre, 'directors', ',')

title_crew_pre['writers'] = title_crew_pre['writers'].astype('str')
title_crew_pre = transform.expand_rows_using_repeat(title_crew_pre, 'writers', ',')

title_crew_pre.rename(index=str, columns={"directors": "director", "writers": "writer"}, inplace=True)

# Transform identifiers
title_crew_pre['tconst'] = transform.tconst_to_float(title_crew_pre['tconst'])
title_crew_pre['director'] = transform.nconst_to_float(title_crew_pre['director'])
title_crew_pre['writer'] = transform.nconst_to_float(title_crew_pre['writer'])

# Remove rows for non-existing titles or names
title_crew_pre = title_crew_pre[(title_crew_pre['director'].isin(name_basics_pre['nconst'])) | (title_crew_pre['director'].isna())]
title_crew_pre = title_crew_pre[(title_crew_pre['writer'].isin(name_basics_pre['nconst'])) | (title_crew_pre['writer'].isna())]

title_crew_pre.info()
title_crew_pre.head(10)

<class 'pandas.core.frame.DataFrame'>
Index: 16443637 entries, 0 to 16443636
Data columns (total 3 columns):
tconst      float64
director    float64
writer      float64
dtypes: float64(3)
memory usage: 501.8+ MB
<class 'pandas.core.frame.DataFrame'>
Index: 16443508 entries, 0 to 16443636
Data columns (total 3 columns):
tconst      float64
director    float64
writer      float64
dtypes: float64(3)
memory usage: 501.8+ MB


Unnamed: 0,tconst,director,writer
0,1.0,5690.0,
1,2.0,721526.0,
2,3.0,721526.0,
3,4.0,721526.0,
4,5.0,5690.0,
5,6.0,5690.0,
6,7.0,5690.0,
7,7.0,374658.0,
8,8.0,5690.0,
9,9.0,85156.0,85156.0


In [26]:
title_crew_pre.isna().sum() / len(title_crew_pre)

tconst      0.000000
director    0.159349
writer      0.172583
dtype: float64

In [23]:
%%timeit -n 1 -r 1

df_to_mysql(title_crew_pre, 'title_crew')

#save_csv(title_crew_pre, 'title_crew.csv')

TRUNCATE title_crew;
Table title_crew deleted


KeyboardInterrupt: 

# title_episode

In [15]:
title_episode = imdb.title_episode_df()

In [29]:
title_episode_pre = title_episode.copy()

# Transform identifiers
title_episode_pre['tconst'] = transform.tconst_to_float(title_episode_pre['tconst'])
title_episode_pre['parentTconst'] = transform.tconst_to_float(title_episode_pre['parentTconst'])

# Remove rows for non-existing titles
title_episode_pre = title_episode_pre[(title_episode_pre['tconst'].isin(title_basics_tconst))]
title_episode_pre = title_episode_pre[title_episode_pre['parentTconst'].isin(title_basics_tconst)]

title_episode_pre.info()
title_episode_pre.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3735207 entries, 0 to 3735227
Data columns (total 4 columns):
tconst           float64
parentTconst     float64
seasonNumber     float64
episodeNumber    float64
dtypes: float64(4)
memory usage: 142.5 MB


Unnamed: 0,tconst,parentTconst,seasonNumber,episodeNumber
0,41951.0,41038.0,1.0,9.0
1,42816.0,989125.0,1.0,17.0
2,42889.0,989125.0,,
3,43426.0,40051.0,3.0,42.0
4,43631.0,989125.0,2.0,16.0


In [33]:
%%timeit -n 1 -r 1

df_to_mysql(title_episode_pre, 'title_episode')

#save_csv(title_episode_pre, 'title_episode.csv')

TRUNCATE title_episode;
Table title_episode deleted
6min 27s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [34]:
del title_episode
del title_episode_pre
gc.collect()

114

# title_principals

In [35]:
title_principals = imdb.title_principals_df()

In [37]:
title_principals_pre = title_principals.copy()

# Transform identifiers
title_principals_pre['tconst'] = transform.tconst_to_float(title_principals_pre['tconst'])
title_principals_pre['nconst'] = transform.nconst_to_float(title_principals_pre['nconst'])

# Remove rows for non-existing titles
title_principals_pre.info()
title_episode_pre = title_episode_pre[(title_episode_pre['tconst'].isin(title_basics_tconst))]
title_episode_pre = title_episode_pre[title_episode_pre['parentTconst'].isin(title_basics_tconst)]


title_principals_pre.info()
title_principals_pre.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31211515 entries, 0 to 31211514
Data columns (total 6 columns):
tconst        float64
ordering      int64
nconst        float64
category      object
job           object
characters    object
dtypes: float64(2), int64(1), object(3)
memory usage: 1.4+ GB


Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,1.0,1,1588970.0,self,,"[""Herself""]"
1,1.0,2,5690.0,director,,
2,1.0,3,374658.0,cinematographer,director of photography,
3,2.0,1,721526.0,director,,
4,2.0,2,1335271.0,composer,,


In [15]:
%%timeit -n 1 -r 1

df_to_mysql(title_principals_pre, 'title_principals')

#save_csv(title_principals_pre, 'title_principals.csv')

In [None]:
del title_principals
del title_principals_pre
gc.collect()

# title_ratings

In [7]:
title_ratings = title_ratings_df()

print('Shape', title_ratings.shape)
title_ratings.head()

Shape (897434, 3)


Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.8,1443
1,tt0000002,6.4,174
2,tt0000003,6.6,1045
3,tt0000004,6.4,104
4,tt0000005,6.2,1741


In [18]:
title_ratings_pre = title_ratings.copy()

# Transform identifiers
title_ratings_pre['tconst'] = transform.tconst_to_float(title_ratings_pre['tconst'])

print('Shape', title_ratings_pre.shape)
title_ratings_pre.head()

Shape (897434, 3)


Unnamed: 0,tconst,averageRating,numVotes
0,1.0,5.8,1443
1,2.0,6.4,174
2,3.0,6.6,1045
3,4.0,6.4,104
4,5.0,6.2,1741


In [20]:
save_csv(title_ratings_pre, 'title_ratings.csv')