Download the IMDb dataset files and load each one into a pandas DataFrame

In [1]:
import math
import pandas as pd
import os
import requests

from tqdm.auto import tqdm

In [8]:
# Local directory where the downloaded dataset files are stored.
data_folder = './data'

# Create the directory (and any missing parents) in one race-free call;
# exist_ok=True makes re-running this cell a no-op.
os.makedirs(data_folder, exist_ok=True)

def fetch_or_resume(url, filename):
    """Download ``url`` to ``filename`` unless a complete copy is already on disk.

    The local file is considered complete when its size equals the
    ``content-length`` reported by the server. Note that, despite the name,
    a partial file is NOT resumed: it is overwritten from the beginning.

    Parameters
    ----------
    url : str
        Address of the file to download.
    filename : str
        Local path the file is written to.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status, so that an error page is
        never saved in place of the requested file.
    """
    block_size = 1024
    wrote = 0

    # Connect to the server. stream=True defers the body download, and the
    # context manager releases the connection on every exit path (including
    # the early return below).
    with requests.get(url, stream=True) as response:
        response.raise_for_status()

        # The header may be missing (e.g. chunked transfer encoding); treat the
        # size as unknown instead of crashing on int(None).
        content_length = response.headers.get('content-length')
        total_size = int(content_length) if content_length is not None else None

        if os.path.exists(filename):
            print("File {} already exists".format(filename))

            # Check file size
            filename_size = os.path.getsize(filename)
            print(filename_size, total_size)
            if filename_size == total_size:
                print("Warning, No downloading, the file {} has the required size.".format(filename))
                return

        # Round up so the progress bar also accounts for the final partial
        # block; None lets tqdm run without a known total.
        total_blocks = math.ceil(total_size / block_size) if total_size else None

        # Download file
        with open(filename, 'wb') as file:
            for data in tqdm(iterable=response.iter_content(chunk_size=block_size),
                             total=total_blocks,
                             desc=os.path.basename(url),
                             unit='KB'):
                wrote += len(data)
                file.write(data)

    # Only verifiable when the server announced a (non-zero) size.
    if total_size and wrote != total_size:
        print("ERROR, something went wrong")
            
def get_imdb_file_dataframe(url, low_memory=True):
    """Fetch the file at ``url`` into ``data_folder`` and parse it as a DataFrame.

    The local file name is the last path component of ``url``. The file is
    read as tab-separated text, with the literal ``\\N`` treated as a missing
    value.

    Parameters
    ----------
    url : str
        Address of the tab-separated dataset file.
    low_memory : bool, default True
        Forwarded unchanged to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the downloaded file.
    """
    local_path = os.path.join(data_folder, os.path.basename(url))

    # Make sure a local copy is present before parsing it.
    fetch_or_resume(url, local_path)

    frame = pd.read_csv(
        local_path,
        sep='\t',
        na_values={'\\N'},
        low_memory=low_memory,
    )
    return frame

In [9]:
def get_name_basics():
    """Return the ``name.basics`` dataset as a pandas DataFrame."""
    url = 'https://datasets.imdbws.com/name.basics.tsv.gz'
    return get_imdb_file_dataframe(url)

def get_title_akas():
    """Return the ``title.akas`` dataset as a pandas DataFrame."""
    url = 'https://datasets.imdbws.com/title.akas.tsv.gz'
    return get_imdb_file_dataframe(url)

def get_title_basics():
    """Return the ``title.basics`` dataset as a pandas DataFrame."""
    url = 'https://datasets.imdbws.com/title.basics.tsv.gz'
    return get_imdb_file_dataframe(url)
    
def get_title_crew():
    """Return the ``title.crew`` dataset as a pandas DataFrame."""
    url = 'https://datasets.imdbws.com/title.crew.tsv.gz'
    return get_imdb_file_dataframe(url)
    
def get_title_episode():
    """Return the ``title.episode`` dataset as a pandas DataFrame."""
    url = 'https://datasets.imdbws.com/title.episode.tsv.gz'
    return get_imdb_file_dataframe(url)
    
def get_title_principals():
    """Return the ``title.principals`` dataset as a pandas DataFrame."""
    url = 'https://datasets.imdbws.com/title.principals.tsv.gz'
    return get_imdb_file_dataframe(url)
    
def get_title_ratings():
    """Return the ``title.ratings`` dataset as a pandas DataFrame."""
    url = 'https://datasets.imdbws.com/title.ratings.tsv.gz'
    return get_imdb_file_dataframe(url)

In [10]:
# Load name.basics (downloaded into the data folder first unless a full-size
# copy is already there) and preview the first rows.
name_basics = get_name_basics()
name_basics.head()

File ./data/name.basics.tsv.gz already exists
176729080 176729080


Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899.0,1987.0,"soundtrack,actor,miscellaneous","tt0043044,tt0050419,tt0053137,tt0072308"
1,nm0000002,Lauren Bacall,1924.0,2014.0,"actress,soundtrack","tt0037382,tt0038355,tt0117057,tt0071877"
2,nm0000003,Brigitte Bardot,1934.0,,"actress,soundtrack,producer","tt0059956,tt0049189,tt0054452,tt0057345"
3,nm0000004,John Belushi,1949.0,1982.0,"actor,writer,soundtrack","tt0078723,tt0080455,tt0077975,tt0072562"
4,nm0000005,Ingmar Bergman,1918.0,2007.0,"writer,director,actor","tt0050986,tt0083922,tt0050976,tt0060827"


In [11]:
# Load title.akas (downloaded into the data folder first unless a full-size
# copy is already there) and preview the first rows.
title_akas = get_title_akas()
title_akas.head()

File ./data/title.akas.tsv.gz already exists
54491835 54491835


Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Carmencita - spanyol tánc,HU,,imdbDisplay,,0.0
1,tt0000001,2,Карменсита,RU,,,,0.0
2,tt0000001,3,Carmencita,US,,,,0.0
3,tt0000001,4,Carmencita,,,original,,1.0
4,tt0000002,1,Le clown et ses chiens,,,original,,1.0


In [13]:
# Load title.basics (downloaded into the data folder first unless a full-size
# copy is already there) and preview the first rows.
title_basics = get_title_basics()
title_basics.head()

File ./data/title.basics.tsv.gz already exists
14221312 98294787


HBox(children=(IntProgress(value=0, description='title.basics.tsv.gz', max=95991, style=ProgressStyle(descript…

KeyboardInterrupt: 

In [None]:
# Load title.crew (downloaded into the data folder first unless a full-size
# copy is already there) and preview the first rows.
title_crew = get_title_crew()
title_crew.head()

In [None]:
# Load title.episode (downloaded into the data folder first unless a full-size
# copy is already there) and preview the first rows.
title_episode = get_title_episode()
title_episode.head()

In [None]:
# Load title.principals (downloaded into the data folder first unless a
# full-size copy is already there) and preview the first rows.
title_principals = get_title_principals()
title_principals.head()

In [None]:
# Load title.ratings (downloaded into the data folder first unless a full-size
# copy is already there) and preview the first rows.
title_ratings = get_title_ratings()
title_ratings.head()