In [10]:
# Shell out to the imdb-sqlite command. Per the captured log below, it downloads
# the IMDb *.tsv.gz dumps into downloads/ and populates the SQLite file imdb.db.
! imdb-sqlite

2020-10-25 14:33:18,565 GET https://datasets.imdbws.com/name.basics.tsv.gz -> downloads/name.basics.tsv.gz
2020-10-25 14:33:40,136 GET https://datasets.imdbws.com/title.basics.tsv.gz -> downloads/title.basics.tsv.gz
2020-10-25 14:33:53,880 GET https://datasets.imdbws.com/title.akas.tsv.gz -> downloads/title.akas.tsv.gz
2020-10-25 14:34:15,406 GET https://datasets.imdbws.com/title.principals.tsv.gz -> downloads/title.principals.tsv.gz
2020-10-25 14:34:51,065 GET https://datasets.imdbws.com/title.episode.tsv.gz -> downloads/title.episode.tsv.gz
2020-10-25 14:34:54,831 GET https://datasets.imdbws.com/title.ratings.tsv.gz -> downloads/title.ratings.tsv.gz
2020-10-25 14:34:55,681 Populating database: imdb.db
2020-10-25 14:34:55,682 Applying schema
2020-10-25 14:34:55,686 Importing file: downloads/name.basics.tsv.gz
2020-10-25 14:34:55,686 Reading number of rows ...
2020-10-25 14:34:59,323 Inserting rows into table: people
100%|████████████████████████| 10455333/10455333 [01:15<00:00, 137734

In [1]:
import sqlite3
import pandas as pd

In [2]:
# Connection and cursor for imdb.db (the file the imdb-sqlite log reports
# populating). The cells shown here only run SELECT statements through `cur`.
db = sqlite3.connect("imdb.db")
cur = db.cursor()

In [3]:
# Connection and cursor for imdb_short.db, the target database that the
# CREATE TABLE / INSERT / to_sql cells below write into.
db_short = sqlite3.connect("imdb_short.db")
cur_short = db_short.cursor()

In [7]:
# Give every distinct crew category a 1-based integer id, numbered in the
# order the query returns them (ORDER BY category).
cur.execute("SELECT distinct category FROM crew ORDER BY category")
role_categories = {
    row[0]: number for number, row in enumerate(cur.fetchall(), start=1)
}

In [21]:
# Display the (category name, integer id) pairs held in role_categories.
role_categories.items()

dict_items([('actor', 1), ('actress', 2), ('archive_footage', 3), ('archive_sound', 4), ('cinematographer', 5), ('composer', 6), ('director', 7), ('editor', 8), ('producer', 9), ('production_designer', 10), ('self', 11), ('writer', 12)])

In [28]:
# Persist the category lookup as table `role_categories` (columns: id,
# role_type) in the short database, replacing any previous copy of the table.
pd.DataFrame(
    [(category_id, category_name) for category_name, category_id in role_categories.items()],
    columns=["id", "role_type"],
).to_sql("role_categories", con=db_short, if_exists="replace", index=False)

In [4]:
# (Re)create the compact `crew` table: integer title/person ids plus an
# integer `category` column.
#
# The table is dropped first so this cell can be re-run: a bare CREATE TABLE
# raises sqlite3.OperationalError once imdb_short.db already contains `crew`,
# and copying into a surviving table would duplicate every row. This matches
# the if_exists="replace" behaviour used for the lookup tables.
cur_short.execute("DROP TABLE IF EXISTS crew")
cur_short.execute("CREATE TABLE crew (title_id INT, person_id INT, category INT)")
db_short.commit()

In [8]:
# Copy `crew` from imdb.db into imdb_short.db in batches of 100000 rows.
# The SQL drops the first two characters of each id and casts the remainder to
# an integer; the category text is swapped for its id from role_categories.
cur.execute("""
SELECT 
    CAST(substr(title_id, 3) AS INT) as title_id, 
    CAST(substr(person_id, 3) AS INT) as person_id,
    category 
FROM crew""")
# iter(callable, sentinel) keeps calling fetchmany until it returns an empty list.
for crew_batch in iter(lambda: cur.fetchmany(100000), []):
    encoded_rows = [
        (title_id, person_id, role_categories.get(category))
        for title_id, person_id, category in crew_batch
    ]
    cur_short.executemany("INSERT INTO crew VALUES (?, ?, ?)", encoded_rows)
    # Commit once per batch.
    db_short.commit()

In [9]:
# (Re)create the compact `people` table: integer person id, name, and the
# born/died columns stored as integers.
#
# The table is dropped first so this cell can be re-run: a bare CREATE TABLE
# raises sqlite3.OperationalError once imdb_short.db already contains
# `people`, and copying into a surviving table would duplicate every row.
cur_short.execute("DROP TABLE IF EXISTS people")
cur_short.execute("CREATE TABLE people (person_id INT, name TEXT, born INT, died INT)")
db_short.commit()

In [11]:
# Copy `people` from imdb.db into imdb_short.db in batches of 100000 rows.
# The SQL drops the first two characters of person_id and casts the remainder
# to an integer; name, born and died are passed through unchanged.
cur.execute("""
SELECT 
    CAST(substr(person_id, 3) AS INT) as person_id, 
    name,
    born,
    died
FROM people""")
# iter(callable, sentinel) keeps calling fetchmany until it returns an empty list.
for people_batch in iter(lambda: cur.fetchmany(100000), []):
    cur_short.executemany("INSERT INTO people VALUES (?, ?, ?, ?)", people_batch)
    # Commit once per batch.
    db_short.commit()

In [12]:
# (Re)create the compact `rating` table: integer title id, rating, vote count.
#
# The table is dropped first so this cell can be re-run: a bare CREATE TABLE
# raises sqlite3.OperationalError once imdb_short.db already contains
# `rating`, and copying into a surviving table would duplicate every row.
cur_short.execute("DROP TABLE IF EXISTS rating")
cur_short.execute("CREATE TABLE rating (title_id INT, rating REAL, votes INT)")
db_short.commit()

In [14]:
# Copy the source `ratings` table into the short database's `rating` table in
# batches of 100000 rows. The SQL drops the first two characters of title_id
# and casts the remainder to an integer.
cur.execute("""
SELECT 
    CAST(substr(title_id, 3) AS INT) as title_id, 
    rating, votes
FROM ratings""")
# iter(callable, sentinel) keeps calling fetchmany until it returns an empty list.
for rating_batch in iter(lambda: cur.fetchmany(100000), []):
    cur_short.executemany("INSERT INTO rating VALUES (?, ?, ?)", rating_batch)
    # Commit once per batch.
    db_short.commit()

In [15]:
# (Re)create the compact `titles` table. `type` is an integer column (the copy
# cell fills it from the film_types mapping) and genres are not stored here.
#
# The table is dropped first so this cell can be re-run: a bare CREATE TABLE
# raises sqlite3.OperationalError once imdb_short.db already contains
# `titles`, and copying into a surviving table would duplicate every row.
cur_short.execute("DROP TABLE IF EXISTS titles")
cur_short.execute("CREATE TABLE titles (title_id INT, type INT, title TEXT, is_adult INT, premiered INT, ended INT, runtime_min INT)")
db_short.commit()

In [16]:
# Give every distinct title type a 1-based integer id, numbered in the order
# the query returns them (ORDER BY type).
cur.execute("SELECT distinct type FROM titles ORDER BY type")
film_types = {
    row[0]: number for number, row in enumerate(cur.fetchall(), start=1)
}

In [17]:
# Display the title-type name -> integer id mapping.
film_types

{'movie': 1,
 'short': 2,
 'tvEpisode': 3,
 'tvMiniSeries': 4,
 'tvMovie': 5,
 'tvSeries': 6,
 'tvShort': 7,
 'tvSpecial': 8,
 'video': 9,
 'videoGame': 10}

In [27]:
# Collect every individual genre name. The `genres` column holds
# comma-separated lists, so each distinct list is split and its parts pooled
# into one set. Rows whose genres equal the literal \N are filtered out in SQL.
cur.execute("SELECT distinct genres FROM titles WHERE genres != '\\N' ORDER BY genres ")
genres = {
    genre_name
    for genre_row in cur.fetchall()
    for genre_name in genre_row[0].split(",")
}

In [30]:
# Persist the title-type lookup as table `film_types` (columns: id,
# film_type) in the short database, replacing any previous copy of the table.
pd.DataFrame(
    [(type_id, type_name) for type_name, type_id in film_types.items()],
    columns=["id", "film_type"],
).to_sql("film_types", con=db_short, if_exists="replace", index=False)

In [19]:
# (Re)create `film_genres`, the link table that pairs a title id with a genre id.
#
# The table is dropped first so this cell can be re-run: a bare CREATE TABLE
# raises sqlite3.OperationalError once imdb_short.db already contains
# `film_genres`, and copying into a surviving table would duplicate every row.
cur_short.execute("DROP TABLE IF EXISTS film_genres")
cur_short.execute("CREATE TABLE film_genres (title_id INT, genre_id INT)")
db_short.commit()

In [32]:
# Rebind `genres` from a collection of names to a name -> id mapping, with
# ids assigned 1, 2, 3, ... in sorted name order.
genres = dict(zip(sorted(genres), range(1, len(genres) + 1)))

In [33]:
# Persist the genre lookup as table `genre_types` (columns: id, genre_name)
# in the short database, replacing any previous copy of the table.
pd.DataFrame(
    [(genre_id, genre_name) for genre_name, genre_id in genres.items()],
    columns=["id", "genre_name"],
).to_sql("genre_types", con=db_short, if_exists="replace", index=False)

In [43]:
# Copy `titles` from imdb.db into imdb_short.db in batches of 100000 rows,
# writing two tables per batch:
#   * titles      - one row per title, with the type text replaced by its
#                   integer id from film_types;
#   * film_genres - one (title_id, genre_id) row per genre in the title's
#                   comma-separated `genres` value.
# The SQL drops the first two characters of title_id and casts the remainder
# to an integer.
cur.execute("""
SELECT 
    CAST(substr(title_id, 3) AS INT) as title_id, 
    type,
    primary_title,
    is_adult,
    premiered,
    ended,
    runtime_minutes,
    genres 
FROM titles""")
# iter(callable, sentinel) keeps calling fetchmany until it returns an empty list.
for title_batch in iter(lambda: cur.fetchmany(100000), []):
    title_rows = [
        (title_id, film_types.get(type_name), title, is_adult, premiered, ended, runtime)
        for title_id, type_name, title, is_adult, premiered, ended, runtime, _genre_list in title_batch
    ]
    genre_rows = []
    for row in title_batch:
        genre_list = row[-1]
        # Skip titles without genres. Besides the literal "\N" marker this
        # also covers a NULL column (None has no .split, so it would raise
        # AttributeError) and an empty string (which would otherwise insert a
        # row with a NULL genre_id).
        if not genre_list or genre_list == "\\N":
            continue
        for genre_name in genre_list.split(","):
            genre_rows.append((row[0], genres.get(genre_name)))
    cur_short.executemany("INSERT INTO titles VALUES (?, ?, ?, ?, ?, ?, ?)", title_rows)
    cur_short.executemany("INSERT INTO film_genres VALUES (?, ?)", genre_rows)
    # Commit once per batch so both tables advance together.
    db_short.commit()