In [17]:
import csv
import math
import pandas as pd
import os
import requests

from tqdm.auto import tqdm

# Download the data from the page and load to Pandas DataFrame

In [24]:
data_folder = './imdb'

if not os.path.exists(data_folder):
    os.mkdir(data_folder)

def fetch_or_resume(url, filename):
    block_size = 1024
    wrote = 0

    # Connecto to server
    headers = {}
    response = requests.get(url, headers=headers, stream=True)
    total_size = int(response.headers.get('content-length'))

    if os.path.exists(filename):
        print("File {} already exists".format(filename))

        # Check file size
        filename_size = os.path.getsize(filename)
        print(filename_size, total_size)
        if filename_size == total_size:
            print("Warning, No downloading, the file {} has the required size.".format(filename))
            return

    # Download file
    with open(filename, 'wb') as file:
        for data in tqdm(iterable = response.iter_content(chunk_size = block_size),
                                  total = total_size//block_size,
                                  desc = os.path.basename(url),
                                  unit = 'KB'):
            wrote = wrote  + len(data)
            file.write(data)

        if total_size != 0 and wrote != total_size:
            print("ERROR, something went wrong")

def get_imdb_dataframe(url, download=False, low_memory=True):
    base = os.path.basename(url)
    filename = os.path.join(data_folder, base)
    if download:
        fetch_or_resume(url, filename)

    return pd.read_csv(filename, sep='\t', low_memory=low_memory,
                        na_values={'\\N'}, quoting=csv.QUOTE_NONE)


def get_title_episode():
    return get_imdb_dataframe('https://datasets.imdbws.com/title.episode.tsv.gz')

def get_title_principals():
    return get_imdb_dataframe('https://datasets.imdbws.com/title.principals.tsv.gz')

def get_title_ratings():
    return get_imdb_dataframe('https://datasets.imdbws.com/title.ratings.tsv.gz')

In [25]:
def save_csv(df, file):
    filename = os.path.join(data_folder, file)
    df.to_csv(filename, index=False)

# Name_basics

In [19]:
name_basics = get_imdb_dataframe('https://datasets.imdbws.com/name.basics.tsv.gz')
name_basics.head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899.0,1987.0,"soundtrack,actor,miscellaneous","tt0043044,tt0050419,tt0053137,tt0072308"
1,nm0000002,Lauren Bacall,1924.0,2014.0,"actress,soundtrack","tt0037382,tt0038355,tt0117057,tt0071877"
2,nm0000003,Brigitte Bardot,1934.0,,"actress,soundtrack,producer","tt0059956,tt0049189,tt0054452,tt0057345"
3,nm0000004,John Belushi,1949.0,1982.0,"actor,writer,soundtrack","tt0078723,tt0080455,tt0077975,tt0072562"
4,nm0000005,Ingmar Bergman,1918.0,2007.0,"writer,director,actor","tt0050986,tt0083922,tt0050976,tt0060827"


In [38]:
def nconst_transform(df):
    return df.str.replace('nm', '').astype(int)

name_basics_pre = name_basics.copy()
# nconst to int
name_basics_pre['nconst'] = nconst_transform(name_basics['nconst'])

name_basics_pre.head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,1,Fred Astaire,1899.0,1987.0,"soundtrack,actor,miscellaneous","tt0043044,tt0050419,tt0053137,tt0072308"
1,2,Lauren Bacall,1924.0,2014.0,"actress,soundtrack","tt0037382,tt0038355,tt0117057,tt0071877"
2,3,Brigitte Bardot,1934.0,,"actress,soundtrack,producer","tt0059956,tt0049189,tt0054452,tt0057345"
3,4,John Belushi,1949.0,1982.0,"actor,writer,soundtrack","tt0078723,tt0080455,tt0077975,tt0072562"
4,5,Ingmar Bergman,1918.0,2007.0,"writer,director,actor","tt0050986,tt0083922,tt0050976,tt0060827"


In [None]:
save_csv(name_basics_pre, 'name_basics.csv')

# title_basics

In [26]:
title_basics = get_imdb_dataframe('https://datasets.imdbws.com/title.basics.tsv.gz')
title_basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894.0,,1.0,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892.0,,5.0,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892.0,,4.0,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892.0,,,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893.0,,1.0,"Comedy,Short"


In [28]:
def tconst_transform(df):
    return df.str.replace('tt', '').astype(int)

title_basics_pre = title_basics.copy()
title_basics_pre['tconst'] = tconst_transform(title_basics_pre['tconst'])

title_basics_pre.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,1,short,Carmencita,Carmencita,0,1894.0,,1.0,"Documentary,Short"
1,2,short,Le clown et ses chiens,Le clown et ses chiens,0,1892.0,,5.0,"Animation,Short"
2,3,short,Pauvre Pierrot,Pauvre Pierrot,0,1892.0,,4.0,"Animation,Comedy,Romance"
3,4,short,Un bon bock,Un bon bock,0,1892.0,,,"Animation,Short"
4,5,short,Blacksmith Scene,Blacksmith Scene,0,1893.0,,1.0,"Comedy,Short"


In [None]:
save_csv(title_basics_pre, 'title_basics.csv')

# title_akas

In [29]:
title_akas = get_imdb_dataframe('https://datasets.imdbws.com/title.akas.tsv.gz')
title_akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Carmencita - spanyol tánc,HU,,imdbDisplay,,0.0
1,tt0000001,2,Карменсита,RU,,,,0.0
2,tt0000001,3,Carmencita,US,,,,0.0
3,tt0000001,4,Carmencita,,,original,,1.0
4,tt0000002,1,Le clown et ses chiens,,,original,,1.0


In [34]:
title_akas_pre = title_akas.copy()
title_akas_pre['titleId'] = tconst_transform(title_akas_pre['titleId'])
print('Shape', title_akas_pre.shape)
# Remove title_akas for non-existing# title_basic
title_akas_pre = title_akas_pre[title_akas_pre['titleId'].isin(title_basics_pre['tconst'])]

print('Shape', title_akas_pre.shape)
title_akas_pre.head()

Shape (3699803, 8)
Shape (3695781, 8)


Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,1,1,Carmencita - spanyol tánc,HU,,imdbDisplay,,0.0
1,1,2,Карменсита,RU,,,,0.0
2,1,3,Carmencita,US,,,,0.0
3,1,4,Carmencita,,,original,,1.0
4,2,1,Le clown et ses chiens,,,original,,1.0


In [35]:
save_csv(title_akas_pre, 'title_akas.csv')

# title_crew

In [36]:
title_crew = get_imdb_dataframe('https://datasets.imdbws.com/title.crew.tsv.gz')
title_crew.head()

Unnamed: 0,tconst,directors,writers
0,tt0000001,nm0005690,
1,tt0000002,nm0721526,
2,tt0000003,nm0721526,
3,tt0000004,nm0721526,
4,tt0000005,nm0005690,


In [53]:
def splitDataFrameList(df, target_column, separator):
    ''' df = dataframe to split,
    target_column = the column containing the values to split
    separator = the symbol used to perform the split
    returns: a dataframe with each entry for the target column separated, with each element moved into a new row. 
    The values in the other columns are duplicated across the newly divided rows.
    https://gist.github.com/jlln/338b4b0b55bd6984f883
    '''
    row_accumulator = []

    def splitListToRows(row, separator):
        split_row = row[target_column].split(separator)
        for s in split_row:
            new_row = row.to_dict()
            new_row[target_column] = s
            row_accumulator.append(new_row)

    df.apply(splitListToRows, axis=1, args = (separator, ))
    new_df = pd.DataFrame(row_accumulator)
    return new_df


In [54]:
title_crew_pre = title_crew.copy()
title_crew_pre['tconst'] = tconst_transform(title_crew_pre['tconst'])

print('Shape', title_crew_pre.shape)

print(title_crew_pre.info())
title_crew_pre['directors'] = title_crew_pre['directors'].astype('str')
title_crew_pre = splitDataFrameList(title_crew_pre, 'directors', ',')
#title_crew_pre['directors'] = nconst_transform(title_crew_pre['directors'])
#title_crew_pre['writers'] = nconst_transform(title_crew_pre['writers'])

print('Shape', title_crew_pre.shape)

# Remove title_crew for non-existing# directors or writers
title_crew_pre = title_crew_pre[title_crew_pre['tconst'].isin(title_basics_pre['tconst'])]

# Remove title_crew for non-existing# title_basic
title_crew_pre = title_crew_pre[title_crew_pre['tconst'].isin(title_basics_pre['tconst'])]

print('Shape', title_crew_pre.shape)
title_crew_pre.head()

Shape (5465783, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5465783 entries, 0 to 5465782
Data columns (total 3 columns):
tconst       int64
directors    object
writers      object
dtypes: int64(1), object(2)
memory usage: 125.1+ MB
None
Shape (6383306, 3)
Shape (6383306, 3)


Unnamed: 0,directors,tconst,writers
0,nm0005690,1,
1,nm0721526,2,
2,nm0721526,3,
3,nm0721526,4,
4,nm0005690,5,


In [8]:
title_episode = get_data.get_title_episode()
title_episode.head()

File ./data/title.episode.tsv.gz already exists
20425199 20425199


Unnamed: 0,tconst,parentTconst,seasonNumber,episodeNumber
0,tt0041951,tt0041038,1.0,9.0
1,tt0042816,tt0989125,1.0,17.0
2,tt0042889,tt0989125,,
3,tt0043426,tt0040051,3.0,42.0
4,tt0043631,tt0989125,2.0,16.0


In [5]:
title_principals = get_data.get_title_principals()
title_principals.head()

File ./imdb/title.principals.tsv.gz already exists
268372349 268372349


Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0000001,1,nm1588970,self,,"[""Herself""]"
1,tt0000001,2,nm0005690,director,,
2,tt0000001,3,nm0374658,cinematographer,director of photography,
3,tt0000002,1,nm0721526,director,,
4,tt0000002,2,nm1335271,composer,,


In [9]:
title_ratings = get_data.get_title_ratings()
title_ratings.head()

File ./imdb/title.ratings.tsv.gz already exists
4426510 4426510


Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.8,1443
1,tt0000002,6.4,174
2,tt0000003,6.6,1045
3,tt0000004,6.4,104
4,tt0000005,6.2,1741
