# BERDL Lakehouse — Ingest: {TENANT}_{DATASET}

This notebook takes a directory of source files and ingests them into the BERDL Lakehouse.

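**Supported source layouts:**
- `<dir>/<name>.db` (or `.sqlite` / `.sqlite3`) + `<dir>/<name>.sql` — SQLite database with an optional schema file
- `<dir>/*.tsv` + `<dir>/*.sql` — pre-exported TSV files with an optional schema file
- `<dir>/*.csv` + `<dir>/*.sql` — pre-exported CSV files with an optional schema file

The layouts are checked in this order and the first one found is used. Without a `.sql` schema file, every column is ingested as `STRING`.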

Set `DATA_DIR`, `TENANT`, `DATASET`, and `MODE` in the **Configuration** cell, then run all cells.

### Configuration

In [None]:
from pathlib import Path

# ── USER CONFIGURATION ───────────────────────────────────────────────────────
# Directory holding the source files (SQLite database, TSV or CSV, plus an optional .sql schema).
DATA_DIR = Path("{DATA_DIR}")
# Lakehouse tenant name.
TENANT = "{TENANT}"
# Dataset name; leave it as None (or empty) to fall back to the name of DATA_DIR.
DATASET = "{DATASET}"
# Bucket that holds both the bronze and the silver area.
BUCKET = "cdm-lake"
# Write mode applied to every table: "overwrite" or "append".
MODE = "{MODE}"
# ─────────────────────────────────────────────────────────────────────────────

# Everything below is derived from the settings above.
if not DATASET:
    DATASET = DATA_DIR.name
BRONZE_PREFIX = f"tenant-general-warehouse/{TENANT}/datasets/{DATASET}"
CONFIG_KEY = f"{BRONZE_PREFIX}/{DATASET}.json"

# Echo the resolved settings so that a wrong path or tenant is visible before any upload happens.
print(
    "\n".join(
        (
            f"Tenant  : {TENANT}",
            f"Dataset : {DATASET}",
            f"Mode    : {MODE}",
            f"Source  : {DATA_DIR.resolve()}",
            f"Bronze  : s3a://{BUCKET}/{BRONZE_PREFIX}/",
            f"Silver  : s3a://{BUCKET}/tenant-sql-warehouse/{TENANT}/{TENANT}_{DATASET}.db",
        )
    )
)

### Imports and `berdl_notebook_utils` stubs

The installed `berdl_notebook_utils` package imports JupyterHub-only dependencies at module
load time. We replace every submodule with a lightweight stub **before** importing
`data_lakehouse_ingest`, then wire in real implementations once clients are built.

In [None]:
import csv
import io
import json
import logging
import re
import sqlite3
import sys
from types import ModuleType

# Fully-qualified names of every `berdl_notebook_utils` module that is replaced by an empty stub.
# Parents are listed before their children so that each stub can be attached to its parent below.
_STUB_MODULES = [
    "berdl_notebook_utils",
    "berdl_notebook_utils.berdl_settings",
    "berdl_notebook_utils.clients",
    "berdl_notebook_utils.setup_spark_session",
    "berdl_notebook_utils.spark",
    "berdl_notebook_utils.spark.database",
    "berdl_notebook_utils.spark.cluster",
    "berdl_notebook_utils.spark.dataframe",
    "berdl_notebook_utils.minio_governance",
]
for _name in _STUB_MODULES:
    _stub = ModuleType(_name)
    sys.modules[_name] = _stub
    # A module served straight from the sys.modules cache is not bound on its parent package
    # automatically. Attach each stub by hand, otherwise attribute access such as
    # `berdl_notebook_utils.spark.database` raises AttributeError.
    _parent_name, _, _child_name = _name.rpartition(".")
    if _parent_name in sys.modules:
        setattr(sys.modules[_parent_name], _child_name, _stub)

def _create_namespace_if_not_exists(spark, namespace=None, append_target=True, tenant_name=None):
    """Create the Spark database for *namespace* if it is missing and return its name.

    Lightweight stand-in that is installed on the stubbed
    `berdl_notebook_utils.spark.database` module.

    Args:
        spark: Object exposing ``sql(str)``; the statement is executed through it.
        namespace: Base database name.
        append_target: Accepted for signature compatibility only; this stub ignores it.
        tenant_name: When given, the database is named ``<tenant_name>_<namespace>`` and is
            located under that tenant's directory in the SQL warehouse. When omitted, the
            database keeps the bare *namespace* name and no explicit LOCATION is set, so the
            path never contains a literal ``None`` segment.

    Returns:
        The name of the database that was created or already existed.
    """
    if tenant_name:
        ns = f"{tenant_name}_{namespace}"
        location = f"s3a://cdm-lake/tenant-sql-warehouse/{tenant_name}/{ns}.db"
        spark.sql(f"CREATE DATABASE IF NOT EXISTS `{ns}` LOCATION '{location}'")
        print(f"Namespace {ns} ready at {location}")
    else:
        # No tenant means there is no tenant directory to point at; let Spark pick the location.
        ns = namespace
        spark.sql(f"CREATE DATABASE IF NOT EXISTS `{ns}`")
        print(f"Namespace {ns} ready at the default warehouse location")
    return ns

# Install the local namespace helper on the stubbed `spark.database` module.
sys.modules["berdl_notebook_utils.spark.database"].create_namespace_if_not_exists = (
    _create_namespace_if_not_exists
)
# Placeholders only: the attributes must exist before the imports below run. They are rebound
# to real factories in the client-initialisation cell once Spark and MinIO have been built.
sys.modules["berdl_notebook_utils.setup_spark_session"].get_spark_session = None
sys.modules["berdl_notebook_utils.clients"].get_minio_client = None

# These imports deliberately come AFTER the stubs are registered in sys.modules.
from minio import Minio
from data_lakehouse_ingest import ingest
from get_spark_session import get_spark_session

# INFO level makes messages logged through the `logging` module visible in the notebook.
logging.basicConfig(level=logging.INFO)
print("Imports OK.")

### Initialize Spark and MinIO clients

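MinIO credentials are read from the `berdl-minio` alias in `~/.mc/config.json`, and MinIO requests are sent through the local HTTP proxy (pproxy) at `127.0.0.1:8123`. Spark connects via pproxy on port 8123.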

In [None]:
import urllib3

from urllib.parse import urlsplit

# Credentials come from the MinIO client (`mc`) configuration; the `berdl-minio` alias must exist.
_mc_cfg_path = Path.home().joinpath(".mc/config.json")
_mc_cfg = json.loads(_mc_cfg_path.read_text(encoding="utf-8"))
try:
    _berdl = _mc_cfg["aliases"]["berdl-minio"]
except KeyError as err:
    raise KeyError(f"No 'berdl-minio' alias found in {_mc_cfg_path}") from err

# Minio() expects a bare `host[:port]` endpoint. Parsing the URL (instead of stripping the scheme
# by string replacement) also drops a trailing slash or path. A URL without a scheme is treated
# as a plain-HTTP host, as before.
_url = _berdl["url"]
_url_parts = urlsplit(_url if "://" in _url else f"//{_url}")

minio_client = Minio(
    endpoint=_url_parts.netloc,
    access_key=_berdl["accessKey"],
    secret_key=_berdl["secretKey"],
    secure=_url_parts.scheme == "https",
    # All MinIO traffic goes through the local HTTP proxy.
    http_client=urllib3.ProxyManager("http://127.0.0.1:8123"),
)

spark = get_spark_session()

# Replace the placeholder factories on the stub modules so that code importing them from
# `berdl_notebook_utils` receives the clients built in this cell.
sys.modules["berdl_notebook_utils.setup_spark_session"].get_spark_session = lambda **kw: spark
sys.modules["berdl_notebook_utils.clients"].get_minio_client = lambda **kw: minio_client

print("Spark and MinIO clients ready.")

### Detect source format

In [None]:
if not DATA_DIR.exists():
    raise FileNotFoundError(f"DATA_DIR not found: {DATA_DIR}")
if not DATA_DIR.is_dir():
    # Globbing a regular file silently yields nothing, which would hide the real problem.
    raise NotADirectoryError(f"DATA_DIR is not a directory: {DATA_DIR}")

# Each extension is listed in sorted order; extensions keep the precedence .db, .sqlite, .sqlite3.
db_files = [
    f
    for pattern in ("*.db", "*.sqlite", "*.sqlite3")
    for f in sorted(DATA_DIR.glob(pattern))
]
sql_files = sorted(DATA_DIR.glob("*.sql"))
tsv_files = sorted(DATA_DIR.glob("*.tsv"))
csv_files = sorted(DATA_DIR.glob("*.csv"))

print(f"SQLite databases : {[f.name for f in db_files]}")
print(f"SQL schema files : {[f.name for f in sql_files]}")
print(f"TSV files        : {[f.name for f in tsv_files]}")
print(f"CSV files        : {[f.name for f in csv_files]}")

# Precedence: SQLite database, then TSV files, then CSV files.
if db_files:
    SOURCE_MODE = "sqlite"
    SOURCE_DB   = db_files[0]
    if len(db_files) > 1:
        print(f"WARNING: {len(db_files)} SQLite databases found; only {SOURCE_DB.name} is ingested")
    print(f"\nMode: SQLite → TSV  (source: {SOURCE_DB.name})")
elif tsv_files:
    SOURCE_MODE = "tsv"
    print(f"\nMode: TSV files ({len(tsv_files)} found)")
elif csv_files:
    SOURCE_MODE = "csv"
    print(f"\nMode: CSV files ({len(csv_files)} found)")
else:
    raise ValueError(f"No recognised source files found in {DATA_DIR}")

# Only one schema file is used. For a SQLite source, prefer the `<name>.sql` that sits next to
# `<name>.db`; otherwise fall back to the first .sql file in sorted order.
SQL_SCHEMA = None
if sql_files:
    _matching = [f for f in sql_files if SOURCE_MODE == "sqlite" and f.stem == SOURCE_DB.stem]
    SQL_SCHEMA = (_matching or sql_files)[0]
    if len(sql_files) > 1:
        print(f"WARNING: {len(sql_files)} .sql files found; using {SQL_SCHEMA.name}")
print(f"Schema file      : {SQL_SCHEMA.name if SQL_SCHEMA else 'none — all columns default to STRING'}")

### Parse schema from SQL file

Extracts `CREATE TABLE` statements and maps SQL column types to Spark SQL types.
Columns whose type is not recognised, and tables that have no `CREATE TABLE` statement, default to `STRING`.

In [None]:
# SQL column type (upper-cased, length/precision stripped) -> Spark SQL type.
# Anything missing from this table is treated as STRING.
_TYPE_MAP = {
    "TEXT": "STRING",    "VARCHAR": "STRING",  "CHAR": "STRING",   "CLOB": "STRING",
    "INTEGER": "INT",    "INT": "INT",          "SMALLINT": "INT",  "TINYINT": "INT",
    "MEDIUMINT": "INT",  "BIGINT": "BIGINT",
    "REAL": "DOUBLE",    "FLOAT": "DOUBLE",     "DOUBLE": "DOUBLE",
    "NUMERIC": "DOUBLE", "DECIMAL": "DOUBLE",   "NUMBER": "DOUBLE",
    "BLOB": "BINARY",    "BOOLEAN": "BOOLEAN",  "BOOL": "BOOLEAN",
}

# `CREATE TABLE [IF NOT EXISTS] name ( ...body... ) [table options];`
# The trailing `[^;()]*` admits table options such as `WITHOUT ROWID` or `ENGINE=InnoDB`.
_CREATE_TABLE_RE = re.compile(
    r'CREATE\s+TABLE\s+(?:IF\s+NOT\s+EXISTS\s+)?[`"\[]?(\w+)[`"\]]?\s*\(([^;]+?)\)[^;()]*;',
    re.IGNORECASE | re.DOTALL,
)
# Entries of the column list that declare a table-level constraint rather than a column.
_TABLE_CONSTRAINT_RE = re.compile(
    r'(PRIMARY\s+KEY|UNIQUE|INDEX|FOREIGN\s+KEY|CHECK|CONSTRAINT)\b', re.IGNORECASE
)
# `-- line` and `/* block */` comments. A `--` inside a string literal is not recognised
# as such and would be stripped as well.
_SQL_COMMENT_RE = re.compile(r'--[^\n]*|/\*.*?\*/', re.DOTALL)


def _split_top_level(body):
    """Split *body* on the commas that sit outside parentheses and quotes."""
    parts, current = [], []
    depth, quote = 0, None
    for ch in body:
        if quote:
            if ch == quote:
                quote = None
        elif ch in "'\"`":
            quote = ch
        elif ch == "(":
            depth += 1
        elif ch == ")":
            depth -= 1
        elif ch == "," and depth == 0:
            parts.append("".join(current))
            current = []
            continue
        current.append(ch)
    parts.append("".join(current))
    return parts


def parse_sql_schema(sql_path):
    """Return ``{table_name: spark_schema_sql}`` parsed from CREATE TABLE statements.

    Column definitions are separated on top-level commas, so single-line statements and
    types such as ``DECIMAL(10,2)`` are handled. SQL comments are ignored. A column that
    declares no type, or a type missing from ``_TYPE_MAP``, becomes ``STRING``.

    Args:
        sql_path: Path-like object with a ``read_text`` method, pointing at the .sql file.

    Returns:
        Dict mapping each table name to a ``"col TYPE, col TYPE, ..."`` string. Tables for
        which no column could be extracted are omitted.
    """
    raw = sql_path.read_text(encoding="utf-8", errors="replace")
    text = _SQL_COMMENT_RE.sub(" ", raw)
    schemas = {}
    for m in _CREATE_TABLE_RE.finditer(text):
        cols = []
        for definition in _split_top_level(m.group(2)):
            definition = definition.strip()
            if not definition or _TABLE_CONSTRAINT_RE.match(definition):
                continue
            tokens = definition.split(None, 2)
            col_name = re.sub(r'[`"\[\]]', '', tokens[0])
            # Drop a length/precision suffix such as `(255)` before the lookup.
            raw_type = re.sub(r'\(.*', '', tokens[1]).upper() if len(tokens) > 1 else ""
            cols.append(f"{col_name} {_TYPE_MAP.get(raw_type, 'STRING')}")
        if cols:
            schemas[m.group(1)] = ", ".join(cols)
    return schemas


# SCHEMAS maps a table name to its Spark schema string.
if not SQL_SCHEMA:
    SCHEMAS = {}
    print("No .sql schema file — all columns will default to STRING")
else:
    SCHEMAS = parse_sql_schema(SQL_SCHEMA)
    print(f"Parsed {len(SCHEMAS)} tables from {SQL_SCHEMA.name}:")
    for name, schema in SCHEMAS.items():
        # Long schemas are cut at 100 characters to keep the listing readable.
        suffix = '…' if len(schema) > 100 else ''
        print(f"  {name}: {schema[:100]}{suffix}")

### Prepare data files

- **SQLite**: exports each table to TSV in `/tmp/<dataset>_tsv/`. Embedded tabs and newlines
  in text fields are replaced with a space to prevent TSV corruption.
- **TSV / CSV**: files are used directly from `DATA_DIR`; no conversion needed.

In [None]:
if SOURCE_MODE == "sqlite":
    # Every user table of the SQLite database is exported to a TSV file in a scratch directory.
    WORK_DIR  = Path(f"/tmp/{DATASET}_tsv")
    FILE_EXT  = ".tsv"
    DELIMITER = "\t"
    WORK_DIR.mkdir(parents=True, exist_ok=True)

    def _clean(v):
        """Make a SQLite value safe for a one-record-per-line TSV file."""
        if v is None:
            return ""
        if isinstance(v, str):
            # Embedded tabs and line breaks would corrupt the TSV layout.
            return v.replace("\t", " ").replace("\n", " ").replace("\r", " ")
        return v

    def _quote_ident(name):
        """Return *name* as a double-quoted SQL identifier with embedded quotes escaped."""
        return '"' + name.replace('"', '""') + '"'

    conn = sqlite3.connect(SOURCE_DB)
    try:
        cur = conn.cursor()
        # Names starting with `sqlite_` are SQLite's own bookkeeping tables
        # (e.g. sqlite_sequence, sqlite_stat1) and must not be ingested as data.
        cur.execute(
            "SELECT name FROM sqlite_master "
            "WHERE type='table' AND name NOT LIKE 'sqlite_%'"
        )
        TABLES = [r[0] for r in cur.fetchall()]

        for table in TABLES:
            out = WORK_DIR / f"{table}.tsv"
            cur.execute(f"SELECT * FROM {_quote_ident(table)}")
            cols = [d[0] for d in cur.description]
            if table not in SCHEMAS:
                # No schema was parsed for this table: treat every column as STRING.
                SCHEMAS[table] = ", ".join(f"{c} STRING" for c in cols)
            rows = 0
            with open(out, "w", newline="", encoding="utf-8") as fh:
                w = csv.writer(fh, delimiter="\t", quoting=csv.QUOTE_MINIMAL)
                w.writerow(cols)
                for row in cur:
                    w.writerow([_clean(v) for v in row])
                    rows += 1  # counted while writing, so no second full-table scan is needed
            print(f"  {table:30s}: {rows:>9,} rows  {out.stat().st_size / 1e6:.1f} MB")
    finally:
        # Release the database handle even when an export fails half-way.
        conn.close()

elif SOURCE_MODE in ("tsv", "csv"):
    # Pre-exported files are uploaded as they are; only missing schemas have to be derived.
    source_files = tsv_files if SOURCE_MODE == "tsv" else csv_files
    WORK_DIR  = DATA_DIR
    FILE_EXT  = ".tsv" if SOURCE_MODE == "tsv" else ".csv"
    DELIMITER = "\t"  if SOURCE_MODE == "tsv" else ","
    TABLES    = [f.stem for f in source_files]

    for f in source_files:
        if f.stem not in SCHEMAS:
            # Read only the header row. `utf-8-sig` strips a leading byte-order mark so that it
            # does not end up inside the first column name; undecodable bytes are replaced
            # rather than aborting the run.
            with open(f, newline="", encoding="utf-8-sig", errors="replace") as fh:
                cols = next(csv.reader(fh, delimiter=DELIMITER), None)
            if not cols:
                raise ValueError(f"{f.name} is empty — cannot derive column names from a header row")
            SCHEMAS[f.stem] = ", ".join(f"{c} STRING" for c in cols)

    print(f"{len(TABLES)} {SOURCE_MODE.upper()} files ready:")
    for f in source_files:
        print(f"  {f.name:35s}: {f.stat().st_size / 1e6:.1f} MB")

print(f"\nTables ({len(TABLES)}): {TABLES}")

### Build ingestion config

In [None]:
# Ingestion config consumed by the ingest pipeline: one entry per table, all sharing the same
# write mode and CSV reader defaults.
config = {
    "tenant":   TENANT,
    "dataset":  DATASET,
    "is_tenant": True,
    "paths": {
        "data_plane":  f"s3a://{BUCKET}/tenant-general-warehouse/{TENANT}/",
        "bronze_base": f"s3a://{BUCKET}/{BRONZE_PREFIX}/",
        # The silver database is named `<tenant>_<dataset>`: this matches the location printed
        # in the configuration cell, the namespace helper, and the verification queries.
        "silver_base": f"s3a://{BUCKET}/tenant-sql-warehouse/{TENANT}/{TENANT}_{DATASET}.db",
    },
    "defaults": {
        # Explicit schemas are always supplied, so schema inference stays off.
        "csv": {"header": True, "delimiter": DELIMITER, "inferSchema": False}
    },
    "tables": [
        {
            "name":        table,
            "enabled":     True,
            "schema_sql":  SCHEMAS.get(table, ""),
            "partition_by": None,
            "mode":        MODE,
            "bronze_path": f"s3a://{BUCKET}/{BRONZE_PREFIX}/{table}{FILE_EXT}",
        }
        for table in TABLES
    ],
}

print(json.dumps(config, indent=2))

### Upload config and data files to MinIO

In [None]:
# Publish the config first: the ingest step is pointed at this object, not at a local file.
config_bytes = json.dumps(config, indent=2).encode("utf-8")
config_stream = io.BytesIO(config_bytes)
minio_client.put_object(
    BUCKET,
    CONFIG_KEY,
    config_stream,
    len(config_bytes),
    content_type="application/json",
)
print(f"Config → s3a://{BUCKET}/{CONFIG_KEY}")

# Then copy one data file per table into the bronze prefix.
for table in TABLES:
    file_name = f"{table}{FILE_EXT}"
    src = WORK_DIR / file_name
    key = f"{BRONZE_PREFIX}/{file_name}"
    minio_client.fput_object(BUCKET, key, str(src))
    size_mb = src.stat().st_size / 1e6
    print(f"  {table:30s}: {size_mb:.1f} MB → s3a://{BUCKET}/{key}")

print("\nAll files uploaded.")

### Run ingestion pipeline

In [None]:
# Point the pipeline at the config object in the bucket and hand it the Spark session and
# MinIO client that were built earlier in the notebook.
cfg_path = f"s3a://{BUCKET}/{CONFIG_KEY}"
report = ingest(cfg_path, spark=spark, minio_client=minio_client)
# Bare expression: as the last statement of the cell, the notebook displays the returned report.
report

### Verify

In [None]:
# The silver database is named `<tenant>_<dataset>`.
namespace = f"{TENANT}_{DATASET}"
# Identifiers are backtick-quoted (as in the CREATE DATABASE statement) so that names containing
# characters such as `-` — e.g. tables named after file stems — do not break the queries.
spark.sql(f"SHOW TABLES IN `{namespace}`").show()

print("Row counts:")
for table in TABLES:
    count = spark.sql(f"SELECT count(*) FROM `{namespace}`.`{table}`").collect()[0][0]
    print(f"  {table:30s}: {count:>9,}")

In [None]:
# Shut the Spark session down once verification is finished.
spark.stop()