In [1]:
# https://ipython.readthedocs.io/en/stable/config/extensions/autoreload.html?highlight=autoreload
%load_ext autoreload

%autoreload 2

In [2]:
import numpy as np
import os
import pandas as pd
import sqlalchemy

# Local imports
import imdb
import transform

# Download the data from the IMDb datasets page and load it into pandas DataFrames

In [12]:
# Folder (relative to the notebook's working directory) that receives the CSV exports.
data_folder = './imdb'

# makedirs(..., exist_ok=True) is idempotent: it creates the folder (and any missing
# parents) on the first run and is a no-op afterwards, without the check-then-create
# race of a separate os.path.exists() test.
os.makedirs(data_folder, exist_ok=True)

def save_csv(df, file):
    '''
    Write a DataFrame as a CSV file inside the data folder, leaving out the index.

    df: object exposing ``to_csv`` (a pandas DataFrame in this notebook).
    file: name of the CSV file, joined onto the module-level ``data_folder``.
    '''
    df.to_csv(os.path.join(data_folder, file), index=False)

In [13]:
# Connection URL of the target MySQL database. It can be overridden with the
# IMDB_DATABASE_URL environment variable so that real credentials never have to be
# written into the notebook; the fallback is the original local development URL.
engine = sqlalchemy.create_engine(
    os.environ.get('IMDB_DATABASE_URL', 'mysql+pymysql://imdb:imdb@localhost:3306/imdb')
)

def df_to_mysql(df, table_name, delete_before=True):
    '''
    Append the rows of a DataFrame to a MySQL table, optionally emptying the table first.

    df: DataFrame whose rows are inserted (the index is not written).
    table_name: name of the destination table.
    delete_before: when True (default), all existing rows of the table are deleted
        before the new rows are appended.

    Any database error raised by the DELETE or by the insert propagates to the caller.
    '''
    if delete_before:
        # engine.begin() commits the DELETE when the block succeeds, rolls it back if
        # execute() raises, and always releases the connection, so nothing leaks on error.
        with engine.begin() as connection:
            connection.execute(sqlalchemy.delete(sqlalchemy.table(table_name)))
        print('Table {} deleted'.format(table_name))

    df.to_sql(table_name, con=engine, if_exists='append', index=False, chunksize=1024)

# name_basics

In [46]:
# NOTE(review): get_imdb_dataframe is neither defined nor imported in this notebook
# (the same call in the title_crew section fails with NameError). Other sections load
# their tables through the local imdb module, e.g. imdb.title_basics_df() - confirm
# which loader is intended here before relying on a fresh-kernel run.
name_basics = get_imdb_dataframe('https://datasets.imdbws.com/name.basics.tsv.gz')
name_basics.head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899.0,1987.0,"soundtrack,actor,miscellaneous","tt0043044,tt0050419,tt0053137,tt0072308"
1,nm0000002,Lauren Bacall,1924.0,2014.0,"actress,soundtrack","tt0037382,tt0038355,tt0117057,tt0071877"
2,nm0000003,Brigitte Bardot,1934.0,,"actress,soundtrack,producer","tt0059956,tt0049189,tt0054452,tt0057345"
3,nm0000004,John Belushi,1949.0,1982.0,"actor,writer,soundtrack","tt0078723,tt0080455,tt0077975,tt0072562"
4,nm0000005,Ingmar Bergman,1918.0,2007.0,"writer,director,actor","tt0050986,tt0083922,tt0050976,tt0060827"


In [27]:
# Build the preprocessed frame as a copy of the raw one, with the nconst column
# replaced by the output of transform.nconst_to_float.
name_basics_pre = name_basics.assign(
    nconst=transform.nconst_to_float(name_basics['nconst'])
)

name_basics_pre.head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,1.0,Fred Astaire,1899.0,1987.0,"soundtrack,actor,miscellaneous","tt0043044,tt0050419,tt0053137,tt0072308"
1,2.0,Lauren Bacall,1924.0,2014.0,"actress,soundtrack","tt0037382,tt0038355,tt0117057,tt0071877"
2,3.0,Brigitte Bardot,1934.0,,"actress,soundtrack,producer","tt0059956,tt0049189,tt0054452,tt0057345"
3,4.0,John Belushi,1949.0,1982.0,"actor,writer,soundtrack","tt0078723,tt0080455,tt0077975,tt0072562"
4,5.0,Ingmar Bergman,1918.0,2007.0,"writer,director,actor","tt0050986,tt0083922,tt0050976,tt0060827"


In [65]:
%%timeit -n 1 -r 1

# Reload the name_basics MySQL table: df_to_mysql deletes the existing rows
# (delete_before defaults to True) and then appends the preprocessed frame.
df_to_mysql(name_basics_pre, 'name_basics')

# Alternative CSV export, currently disabled:
#save_csv(name_basics_pre, 'name_basics.csv')

Table name_basics deleted
5min 3s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


# title_basics

In [8]:
# Load the title_basics table with the loader from the local imdb module and preview it.
title_basics = imdb.title_basics_df()
title_basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894.0,,1.0,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892.0,,5.0,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892.0,,4.0,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892.0,,,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893.0,,1.0,"Comedy,Short"


In [9]:
# Build the preprocessed frame as a copy of the raw one; the callable receives that
# copy and its tconst column is replaced by the output of transform.tconst_to_float.
title_basics_pre = title_basics.assign(
    tconst=lambda frame: transform.tconst_to_float(frame['tconst'])
)

title_basics_pre.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,1.0,short,Carmencita,Carmencita,0,1894.0,,1.0,"Documentary,Short"
1,2.0,short,Le clown et ses chiens,Le clown et ses chiens,0,1892.0,,5.0,"Animation,Short"
2,3.0,short,Pauvre Pierrot,Pauvre Pierrot,0,1892.0,,4.0,"Animation,Comedy,Romance"
3,4.0,short,Un bon bock,Un bon bock,0,1892.0,,,"Animation,Short"
4,5.0,short,Blacksmith Scene,Blacksmith Scene,0,1893.0,,1.0,"Comedy,Short"


In [69]:
%%timeit -n 1 -r 1

# Reload the title_basics MySQL table: df_to_mysql deletes the existing rows
# (delete_before defaults to True) and then appends the preprocessed frame.
df_to_mysql(title_basics_pre, 'title_basics')

# Alternative CSV export, currently disabled:
# save_csv(title_basics_pre, 'title_basics.csv')

Table title_basics deleted
4min 11s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


# title_akas

In [4]:
# Load the title_akas table with the loader from the local imdb module and preview it.
title_akas = imdb.title_akas_df()
title_akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Carmencita - spanyol tánc,HU,,imdbDisplay,,0.0
1,tt0000001,2,Карменсита,RU,,,,0.0
2,tt0000001,3,Carmencita,US,,,,0.0
3,tt0000001,4,Carmencita,,,original,,1.0
4,tt0000002,1,Le clown et ses chiens,,,original,,1.0


In [10]:
# Copy of the raw frame whose titleId column is replaced by the output of
# transform.tconst_to_float (the callable receives the copy).
title_akas_pre = title_akas.assign(
    titleId=lambda frame: transform.tconst_to_float(frame['titleId'])
)
print('Shape', title_akas_pre.shape)

# Keep only the title_akas rows whose titleId exists in title_basics
title_akas_pre = title_akas_pre.loc[
    title_akas_pre['titleId'].isin(title_basics_pre['tconst'])
]

print('Shape', title_akas_pre.shape)
title_akas_pre.head()

Shape (3699803, 8)
Shape (3695781, 8)


Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,1.0,1,Carmencita - spanyol tánc,HU,,imdbDisplay,,0.0
1,1.0,2,Карменсита,RU,,,,0.0
2,1.0,3,Carmencita,US,,,,0.0
3,1.0,4,Carmencita,,,original,,1.0
4,2.0,1,Le clown et ses chiens,,,original,,1.0


In [41]:
%%timeit -n 1 -r 1

# Reload the title_akas MySQL table: df_to_mysql deletes the existing rows
# (delete_before defaults to True) and then appends the preprocessed frame.
df_to_mysql(title_akas_pre, 'title_akas')

# Alternative CSV export, currently disabled:
#save_csv(title_akas_pre, 'title_akas.csv')

Table title_akas deleted
3min 12s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


# title_crew

In [42]:
# NOTE(review): get_imdb_dataframe is neither defined nor imported in this notebook, so
# this cell raises NameError on a fresh kernel. Other sections load their tables through
# the local imdb module, e.g. imdb.title_basics_df() - confirm which loader is intended.
title_crew = get_imdb_dataframe('https://datasets.imdbws.com/title.crew.tsv.gz')
print("Shape", title_crew.shape)
title_crew.head(10)

NameError: name 'get_imdb_dataframe' is not defined

In [98]:
def expand_rows_using_repeat(df, target_column, separator):
    '''
    Expand the rows of a DataFrame splitting values of the target column using numpy repeat.

    Each row is repeated once per item obtained by splitting its ``target_column``
    value on ``separator``; every repeated row carries one of those items in
    ``target_column``.

    df: input DataFrame; it is not modified.
    target_column: name of the column holding the delimited strings.
    separator: delimiter passed to ``Series.str.split``.

    Returns a new DataFrame with the same columns in the same order and a fresh
    0..n-1 index. The other columns keep their dtypes. Cells that cannot be split
    (e.g. NaN) are carried over unchanged as a single row, and a frame without rows
    yields a result without rows.
    '''
    target = df[target_column]
    split_values = target.str.split(separator)

    # .str.split yields NaN for anything that is not a string; wrap the original
    # value in a one-item list so the row is kept instead of failing in len().
    pieces = [
        parts if isinstance(parts, (list, tuple, np.ndarray)) else [original]
        for original, parts in zip(target, split_values)
    ]

    # An explicit integer dtype keeps np.repeat happy when there are no rows at all.
    counts = np.fromiter((len(parts) for parts in pieces), dtype=np.intp, count=len(pieces))

    # Repeat row positions instead of stacking raw value matrices: this preserves
    # each column's dtype and the original column order.
    positions = np.repeat(np.arange(len(df)), counts)
    final_df = df.iloc[positions].reset_index(drop=True)

    # Put each target element in a row
    final_df[target_column] = [item for parts in pieces for item in parts]

    return final_df

In [101]:
title_crew_pre = title_crew.copy()

# Expand rows based on directors and writers list.
# The columns are cast to str before splitting - presumably so that missing values
# become the literal string 'nan' and every cell can be split.
title_crew_pre['directors'] = title_crew_pre['directors'].astype('str')
title_crew_pre = expand_rows_using_repeat(title_crew_pre, 'directors', ',')

title_crew_pre['writers'] = title_crew_pre['writers'].astype('str')
title_crew_pre = expand_rows_using_repeat(title_crew_pre, 'writers', ',')

# Transform identifiers.
# tconst_transform / nconst_transform are not defined anywhere in this notebook; use the
# helpers of the imported transform module, as the name_basics and title_basics sections do.
# NOTE(review): confirm transform.nconst_to_float accepts the 'nan' strings produced above.
title_crew_pre['tconst'] = transform.tconst_to_float(title_crew_pre['tconst'])
title_crew_pre['directors'] = transform.nconst_to_float(title_crew_pre['directors'])
title_crew_pre['writers'] = transform.nconst_to_float(title_crew_pre['writers'])

print('Shape', title_crew_pre.shape)
title_crew_pre.head(10)

Shape (16443637, 3)


Unnamed: 0,tconst,directors,writers
0,1.0,5690.0,
1,2.0,721526.0,
2,3.0,721526.0,
3,4.0,721526.0,
4,5.0,5690.0,
5,6.0,5690.0,
6,7.0,5690.0,
7,7.0,374658.0,
8,8.0,5690.0,
9,9.0,85156.0,85156.0


In [104]:
# Export the expanded crew frame to title_crew.csv inside the data folder (index omitted).
save_csv(title_crew_pre, 'title_crew.csv')

# title_episode

In [None]:
# The loader is reached through the imdb module (the notebook only does `import imdb`),
# matching the title_basics and title_akas sections; the bare name is undefined.
title_episode = imdb.title_episode_df()
print('Shape', title_episode.shape)
title_episode.head()

In [None]:
title_episode_pre = title_episode.copy()

# Transform identifiers.
# tconst_transform is not defined in this notebook; use the helper of the imported
# transform module, as the title_basics and title_ratings sections do.
title_episode_pre['tconst'] = transform.tconst_to_float(title_episode_pre['tconst'])
title_episode_pre['parentTconst'] = transform.tconst_to_float(title_episode_pre['parentTconst'])

print('Shape', title_episode_pre.shape)
title_episode_pre.head()

In [110]:
# Export the preprocessed episode frame to title_episode.csv inside the data folder (index omitted).
save_csv(title_episode_pre, 'title_episode.csv')

# title_principals

In [7]:
# The loader is reached through the imdb module (the notebook only does `import imdb`),
# matching the title_basics and title_akas sections; the bare name is undefined.
title_principals = imdb.title_principals_df()

print('Shape', title_principals.shape)
title_principals.head()

Shape (31211515, 6)


Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0000001,1,nm1588970,self,,"[""Herself""]"
1,tt0000001,2,nm0005690,director,,
2,tt0000001,3,nm0374658,cinematographer,director of photography,
3,tt0000002,1,nm0721526,director,,
4,tt0000002,2,nm1335271,composer,,


In [13]:
title_principals_pre = title_principals.copy()

# Transform identifiers.
# tconst_transform / nconst_transform are not defined in this notebook; use the helpers
# of the imported transform module, as the name_basics and title_basics sections do.
title_principals_pre['tconst'] = transform.tconst_to_float(title_principals_pre['tconst'])
title_principals_pre['nconst'] = transform.nconst_to_float(title_principals_pre['nconst'])

print('Shape', title_principals_pre.shape)
title_principals_pre.head()

Shape (31211515, 6)


Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,1.0,1,1588970.0,self,,"[""Herself""]"
1,1.0,2,5690.0,director,,
2,1.0,3,374658.0,cinematographer,director of photography,
3,2.0,1,721526.0,director,,
4,2.0,2,1335271.0,composer,,


In [15]:
# Export the preprocessed principals frame to title_principals.csv inside the data folder (index omitted).
save_csv(title_principals_pre, 'title_principals.csv')

# title_ratings

In [7]:
# The loader is reached through the imdb module (the notebook only does `import imdb`),
# matching the title_basics and title_akas sections; the bare name is undefined.
title_ratings = imdb.title_ratings_df()

print('Shape', title_ratings.shape)
title_ratings.head()

Shape (897434, 3)


Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.8,1443
1,tt0000002,6.4,174
2,tt0000003,6.6,1045
3,tt0000004,6.4,104
4,tt0000005,6.2,1741


In [18]:
# Transform identifiers on a copy of the raw ratings: the callable receives the copy
# and its tconst column is replaced by the output of transform.tconst_to_float.
title_ratings_pre = title_ratings.assign(
    tconst=lambda frame: transform.tconst_to_float(frame['tconst'])
)

print('Shape', title_ratings_pre.shape)
title_ratings_pre.head()

Shape (897434, 3)


Unnamed: 0,tconst,averageRating,numVotes
0,1.0,5.8,1443
1,2.0,6.4,174
2,3.0,6.6,1045
3,4.0,6.4,104
4,5.0,6.2,1741


In [20]:
# Export the preprocessed ratings frame to title_ratings.csv inside the data folder (index omitted).
save_csv(title_ratings_pre, 'title_ratings.csv')