下載 & 解壓 IMDb 資料

In [None]:
import urllib.request
import gzip
import shutil

def download_and_extract(url, output_filename):
    """Download a gzip archive from ``url`` and decompress it to ``output_filename``.

    The archive is saved as ``output_filename + '.gz'`` and is left on disk
    after extraction. A line is printed after the download and another after
    the extraction. Returns ``None``.
    """
    archive_path = output_filename + '.gz'
    urllib.request.urlretrieve(url, archive_path)
    print(f"Downloaded: {archive_path}")

    # Copy the decompressed stream straight into the target file.
    with gzip.open(archive_path, 'rb') as compressed, open(output_filename, 'wb') as target:
        shutil.copyfileobj(compressed, target)
    print(f"Extracted: {output_filename}")

# Fetch and unpack each IMDb dataset, in this order: (source URL, extracted file name).
for source_url, target_file in (
    ("https://datasets.imdbws.com/title.basics.tsv.gz", "title.basics.tsv"),
    ("https://datasets.imdbws.com/title.principals.tsv.gz", "title.principals.tsv"),
    ("https://datasets.imdbws.com/name.basics.tsv.gz", "name.basics.tsv"),
):
    download_and_extract(source_url, target_file)

In [15]:
# Shell alternative to the Python download cell: fetch the same three IMDb
# archives with wget and unpack them with gunzip.
#
# -N (timestamping) makes the cell safe to re-run: an archive that already
# exists locally is overwritten only when the remote copy is newer, instead of
# being saved again under a ".1" suffix.
!wget -N https://datasets.imdbws.com/title.basics.tsv.gz
!wget -N https://datasets.imdbws.com/title.principals.tsv.gz
!wget -N https://datasets.imdbws.com/name.basics.tsv.gz

# -f overwrites an already-extracted .tsv; without it gunzip refuses to replace
# an existing output file and the cell fails on a second run.
!gunzip -f title.basics.tsv.gz
!gunzip -f title.principals.tsv.gz
!gunzip -f name.basics.tsv.gz

--2025-05-19 18:33:03--  https://datasets.imdbws.com/title.basics.tsv.gz
Resolving datasets.imdbws.com (datasets.imdbws.com)... 3.169.121.31, 3.169.121.125, 3.169.121.105, ...
Connecting to datasets.imdbws.com (datasets.imdbws.com)|3.169.121.31|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 206037556 (196M) [binary/octet-stream]
Saving to: ‘title.basics.tsv.gz’


2025-05-19 18:33:05 (103 MB/s) - ‘title.basics.tsv.gz’ saved [206037556/206037556]

--2025-05-19 18:33:05--  https://datasets.imdbws.com/title.principals.tsv.gz
Resolving datasets.imdbws.com (datasets.imdbws.com)... 3.169.121.31, 3.169.121.105, 3.169.121.100, ...
Connecting to datasets.imdbws.com (datasets.imdbws.com)|3.169.121.31|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 716117860 (683M) [binary/octet-stream]
Saving to: ‘title.principals.tsv.gz’


2025-05-19 18:33:37 (22.2 MB/s) - ‘title.principals.tsv.gz’ saved [716117860/716117860]

--2025-05-19 18:33:37--  https://

PostgreSQL 資料表

In [6]:
import os

import psycopg2

# Connect to the database.
# Every setting can be overridden with the standard PostgreSQL environment
# variables; the fallbacks are the values this notebook has always used, so it
# behaves the same when none of them are set.
# NOTE(review): prefer exporting PGPASSWORD over relying on the fallback below,
# so the password does not have to live in the notebook.
conn = psycopg2.connect(
    dbname=os.environ.get("PGDATABASE", "autosteer_db"),
    user=os.environ.get("PGUSER", "autosteer"),
    password=os.environ.get("PGPASSWORD", "autosteer_database"),
    host=os.environ.get("PGHOST", "localhost"),
    port=os.environ.get("PGPORT", "5432"),
)

# One CREATE TABLE statement per IMDb dataset; IF NOT EXISTS keeps the cell
# re-runnable.
TABLE_DDL = (
    """
    CREATE TABLE IF NOT EXISTS title_basics (
        tconst TEXT PRIMARY KEY,
        titleType TEXT,
        primaryTitle TEXT,
        originalTitle TEXT,
        isAdult BOOLEAN,
        startYear INTEGER,
        endYear INTEGER,
        runtimeMinutes INTEGER,
        genres TEXT
    );
""",
    """
    CREATE TABLE IF NOT EXISTS title_principals (
        tconst TEXT,
        ordering INTEGER,
        nconst TEXT,
        category TEXT,
        job TEXT,
        characters TEXT
    );
""",
    """
    CREATE TABLE IF NOT EXISTS name_basics (
        nconst TEXT PRIMARY KEY,
        primaryName TEXT,
        birthYear INTEGER,
        deathYear INTEGER,
        primaryProfession TEXT,
        knownForTitles TEXT
    );
""",
)

# Create the tables.
try:
    # `with conn` commits when the body succeeds and rolls back when it raises;
    # `with conn.cursor()` closes the cursor. Neither closes the connection,
    # hence the `finally` below, which also runs when a statement fails.
    with conn, conn.cursor() as cur:
        for ddl in TABLE_DDL:
            cur.execute(ddl)
finally:
    conn.close()

print("✅ Tables created successfully.")

✅ Tables created successfully.


IMDb 資料匯入

In [8]:
import os
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine

# Only this many leading rows of each TSV are loaded. The limit is passed to
# read_csv as `nrows`, so the full files (hundreds of megabytes even when
# compressed) are not parsed just to keep their first rows.
SAMPLE_ROWS = 500


def _read_imdb_sample(path, numeric_columns=()):
    """Read the first ``SAMPLE_ROWS`` rows of a tab-separated IMDb file.

    Every column is read as a string and the IMDb missing-value marker
    (backslash-N) becomes NaN. Columns listed in ``numeric_columns`` are then
    converted with ``pd.to_numeric(errors="coerce")``, so values that cannot be
    parsed as numbers also become NaN.

    Args:
        path: Path of the ``.tsv`` file to read.
        numeric_columns: Names of the columns to convert to numbers.

    Returns:
        A ``pandas.DataFrame`` with at most ``SAMPLE_ROWS`` rows.
    """
    frame = pd.read_csv(path, sep="\t", dtype=str, na_values="\\N", nrows=SAMPLE_ROWS)
    for column in numeric_columns:
        frame[column] = pd.to_numeric(frame[column], errors="coerce")
    return frame


# Connection settings can be overridden with the standard PostgreSQL
# environment variables; the fallbacks reproduce the URL this notebook has
# always used. User and password are URL-quoted so special characters in an
# overridden value cannot corrupt the connection URL.
# NOTE(review): prefer exporting PGPASSWORD over relying on the fallback below.
_db_user = quote_plus(os.environ.get("PGUSER", "autosteer"))
_db_password = quote_plus(os.environ.get("PGPASSWORD", "autosteer_database"))
_db_host = os.environ.get("PGHOST", "localhost")
_db_port = os.environ.get("PGPORT", "5432")
_db_name = os.environ.get("PGDATABASE", "autosteer_db")

engine = create_engine(
    f"postgresql+psycopg2://{_db_user}:{_db_password}@{_db_host}:{_db_port}/{_db_name}"
)

# NOTE(review): if_exists="replace" drops and recreates each table, so a schema
# created beforehand (primary keys, column types) is discarded by this cell.
try:
    # --- Import title.basics.tsv ---
    print("📥 匯入 title.basics.tsv ...")
    df_title = _read_imdb_sample(
        "title.basics.tsv",
        numeric_columns=("startYear", "endYear", "runtimeMinutes"),
    )
    # The column holds the strings "0"/"1". astype(bool) would turn every
    # non-empty string, including "0", into True, so compare explicitly.
    # Missing values become False.
    df_title["isAdult"] = df_title["isAdult"] == "1"
    df_title.to_sql("title_basics", engine, if_exists="replace", index=False)
    # The message text names 500 explicitly; keep it in sync with SAMPLE_ROWS.
    print("✅ 匯入完成：title_basics (前 500 筆)")

    # --- Import title.principals.tsv ---
    print("📥 匯入 title.principals.tsv ...")
    df_principals = _read_imdb_sample("title.principals.tsv", numeric_columns=("ordering",))
    df_principals.to_sql("title_principals", engine, if_exists="replace", index=False)
    print("✅ 匯入完成：title_principals")

    # --- Import name.basics.tsv ---
    print("📥 匯入 name.basics.tsv ...")
    df_names = _read_imdb_sample("name.basics.tsv", numeric_columns=("birthYear", "deathYear"))
    df_names.to_sql("name_basics", engine, if_exists="replace", index=False)
    print("✅ 匯入完成：name_basics")
finally:
    # Release the engine's pooled connections even when an import step fails.
    # This cell owns only `engine`; it no longer touches the psycopg2
    # `cur`/`conn` names, which belong to a different cell and are not defined
    # when this cell runs on a fresh kernel.
    engine.dispose()

📥 匯入 title.basics.tsv ...
✅ 匯入完成：title_basics (前 500 筆)
📥 匯入 title.principals.tsv ...


In [None]:
# Linux / macOS / Git Bash
# chmod +x import_all.sh
# ./import_all.sh

In [None]:
# Windows PowerShell
# ./import_all.ps1