Download the IMDb dataset files from https://datasets.imdbws.com/ and load each one into a pandas DataFrame.

In [1]:
import csv
import math
import os

import pandas as pd
import requests
from tqdm.auto import tqdm

In [2]:
# Local directory in which the downloaded dataset files are stored.
data_folder = './imdb'

# exist_ok=True keeps the cell safe to re-run and removes the gap between
# "check that it exists" and "create it"; makedirs also creates any missing
# parent directories should the path ever become nested.
os.makedirs(data_folder, exist_ok=True)

def fetch_or_resume(url, filename):
    """Download ``url`` to ``filename`` unless a complete copy already exists.

    The size announced by the server's ``content-length`` header is compared
    with the size of the local file, and the download is skipped only when
    the two match. Despite the name, a partial local file is not resumed: it
    is overwritten by a fresh, complete download.

    Parameters
    ----------
    url : str
        Address of the file to download.
    filename : str
        Local path the response body is written to.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status, so that an error page is
        never saved in place of the data.
    """
    block_size = 1024
    wrote = 0

    # stream=True postpones fetching the body until iter_content() is
    # consumed; the context manager releases the connection on every path,
    # including the early return below.
    with requests.get(url, stream=True) as response:
        response.raise_for_status()

        # The header may be missing (e.g. chunked transfer); 0 means "unknown".
        total_size = int(response.headers.get('content-length', 0))

        if os.path.exists(filename):
            print("File {} already exists".format(filename))

            # Check file size
            filename_size = os.path.getsize(filename)
            print(filename_size, total_size)
            # An unknown remote size can never confirm that the copy is complete.
            if total_size != 0 and filename_size == total_size:
                print("Warning, No downloading, the file {} has the required size.".format(filename))
                return

        # Download file, reporting progress in bytes so that the bar stays
        # accurate even when chunks are shorter than block_size.
        with open(filename, 'wb') as file, tqdm(total=total_size or None,
                                                desc=os.path.basename(url),
                                                unit='B',
                                                unit_scale=True,
                                                unit_divisor=1024) as progress:
            for data in response.iter_content(chunk_size=block_size):
                file.write(data)
                wrote += len(data)
                progress.update(len(data))

    if total_size != 0 and wrote != total_size:
        print("ERROR, something went wrong")
            
def get_imdb_file_dataframe(url, low_memory=True, dtype=None):
    """Fetch a tab-separated dataset file if needed and load it as a DataFrame.

    The file is stored in ``data_folder`` under the last path component of
    ``url``; ``fetch_or_resume`` decides whether a download is required.

    Parameters
    ----------
    url : str
        Address of the dataset file.
    low_memory : bool, default True
        Forwarded unchanged to ``pandas.read_csv``.
    dtype : optional
        Forwarded unchanged to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed file, with the ``\\N`` marker read as a missing value.
    """
    filename = os.path.join(data_folder, os.path.basename(url))
    fetch_or_resume(url, filename)
    return pd.read_csv(
        filename,
        sep='\t',
        low_memory=low_memory,
        dtype=dtype,
        na_values=['\\N'],
        # Treat '"' as an ordinary character. With the default quoting, a field
        # that starts with an unbalanced double quote (titles may contain
        # them) makes the parser swallow the following rows into one value.
        quoting=csv.QUOTE_NONE,
    )

In [3]:
def get_name_basics():
    """Return the ``name.basics`` dataset as a DataFrame."""
    source_url = 'https://datasets.imdbws.com/name.basics.tsv.gz'
    return get_imdb_file_dataframe(source_url)

def get_title_akas():
    """Return the ``title.akas`` dataset as a DataFrame."""
    source_url = 'https://datasets.imdbws.com/title.akas.tsv.gz'
    return get_imdb_file_dataframe(source_url)

def get_title_basics():
    """Return the ``title.basics`` dataset as a DataFrame."""
    source_url = 'https://datasets.imdbws.com/title.basics.tsv.gz'
    return get_imdb_file_dataframe(source_url)
    
def get_title_crew():
    """Return the ``title.crew`` dataset as a DataFrame."""
    source_url = 'https://datasets.imdbws.com/title.crew.tsv.gz'
    return get_imdb_file_dataframe(source_url)
    
def get_title_episode():
    """Return the ``title.episode`` dataset as a DataFrame."""
    source_url = 'https://datasets.imdbws.com/title.episode.tsv.gz'
    return get_imdb_file_dataframe(source_url)
    
def get_title_principals():
    """Return the ``title.principals`` dataset as a DataFrame."""
    source_url = 'https://datasets.imdbws.com/title.principals.tsv.gz'
    return get_imdb_file_dataframe(source_url)
    
def get_title_ratings():
    """Return the ``title.ratings`` dataset as a DataFrame."""
    source_url = 'https://datasets.imdbws.com/title.ratings.tsv.gz'
    return get_imdb_file_dataframe(source_url)

In [4]:
# Load the name.basics dataset and preview its first rows.
name_basics = get_name_basics()
name_basics.head()

File ./data/name.basics.tsv.gz already exists
176729080 176729080


Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899.0,1987.0,"soundtrack,actor,miscellaneous","tt0043044,tt0050419,tt0053137,tt0072308"
1,nm0000002,Lauren Bacall,1924.0,2014.0,"actress,soundtrack","tt0037382,tt0038355,tt0117057,tt0071877"
2,nm0000003,Brigitte Bardot,1934.0,,"actress,soundtrack,producer","tt0059956,tt0049189,tt0054452,tt0057345"
3,nm0000004,John Belushi,1949.0,1982.0,"actor,writer,soundtrack","tt0078723,tt0080455,tt0077975,tt0072562"
4,nm0000005,Ingmar Bergman,1918.0,2007.0,"writer,director,actor","tt0050986,tt0083922,tt0050976,tt0060827"


In [5]:
# Load the title.akas dataset and preview its first rows.
title_akas = get_title_akas()
title_akas.head()

File ./data/title.akas.tsv.gz already exists
54491835 54491835


Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Carmencita - spanyol tánc,HU,,imdbDisplay,,0.0
1,tt0000001,2,Карменсита,RU,,,,0.0
2,tt0000001,3,Carmencita,US,,,,0.0
3,tt0000001,4,Carmencita,,,original,,1.0
4,tt0000002,1,Le clown et ses chiens,,,original,,1.0


In [6]:
# Load the title.basics dataset and preview its first rows.
title_basics = get_title_basics()
title_basics.head()

File ./data/title.basics.tsv.gz already exists
98294787 98294787


  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894.0,,1.0,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892.0,,5.0,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892.0,,4.0,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892.0,,,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893.0,,1.0,"Comedy,Short"


In [7]:
# Load the title.crew dataset and preview its first rows.
title_crew = get_title_crew()
title_crew.head()

File ./data/title.crew.tsv.gz already exists
39982560 39982560


Unnamed: 0,tconst,directors,writers
0,tt0000001,nm0005690,
1,tt0000002,nm0721526,
2,tt0000003,nm0721526,
3,tt0000004,nm0721526,
4,tt0000005,nm0005690,


In [8]:
# Load the title.episode dataset and preview its first rows.
title_episode = get_title_episode()
title_episode.head()

File ./data/title.episode.tsv.gz already exists
20425199 20425199


Unnamed: 0,tconst,parentTconst,seasonNumber,episodeNumber
0,tt0041951,tt0041038,1.0,9.0
1,tt0042816,tt0989125,1.0,17.0
2,tt0042889,tt0989125,,
3,tt0043426,tt0040051,3.0,42.0
4,tt0043631,tt0989125,2.0,16.0


In [9]:
# Load the title.principals dataset and preview its first rows.
title_principals = get_title_principals()
title_principals.head()

File ./data/title.principals.tsv.gz already exists
116683776 268372349


HBox(children=(IntProgress(value=0, description='title.principals.tsv.gz', max=262082, style=ProgressStyle(des…




Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0000001,1,nm1588970,self,,"[""Herself""]"
1,tt0000001,2,nm0005690,director,,
2,tt0000001,3,nm0374658,cinematographer,director of photography,
3,tt0000002,1,nm0721526,director,,
4,tt0000002,2,nm1335271,composer,,


In [10]:
# Load the title.ratings dataset and preview its first rows.
title_ratings = get_title_ratings()
title_ratings.head()

HBox(children=(IntProgress(value=0, description='title.ratings.tsv.gz', max=4322, style=ProgressStyle(descript…




Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.8,1443
1,tt0000002,6.4,174
2,tt0000003,6.6,1045
3,tt0000004,6.4,104
4,tt0000005,6.2,1741
