# Bootstrap Lakehouse: microbialdiscoveryforge_observatory

One-time setup notebook. Creates the `microbialdiscoveryforge_observatory` database and
registry tables in the BERDL lakehouse under the `microbialdiscoveryforge` tenant.

**Run this on BERDL JupyterHub before using the upload notebook (`upload_to_lakehouse.ipynb`).**

In [None]:
from get_spark_session import get_spark_session
# Obtain the Spark session that every later cell uses via the `spark` name.
# Reading the version right away confirms the session object is usable.
spark = get_spark_session()
spark_version = spark.version
print(f"Spark version: {spark_version}")

## 1. Create Database

In [None]:
DATABASE = "microbialdiscoveryforge_observatory"

# IF NOT EXISTS keeps this cell safe to re-run: an existing database is left
# untouched instead of raising an error.
spark.sql(f"CREATE DATABASE IF NOT EXISTS {DATABASE}")
print(f"Database '{DATABASE}' created.")

# Verify it appears in the database list.
# The name is read positionally because SHOW DATABASES returns a single name
# column whose label differs between Spark releases (`databaseName` in the
# legacy schema, `namespace` in the current one).
dbs = [row[0] for row in spark.sql("SHOW DATABASES").collect()]
# Raise explicitly instead of using `assert`, which is skipped entirely when
# Python runs with optimizations (-O) enabled.
if DATABASE not in dbs:
    raise RuntimeError(f"{DATABASE} not found in database list!")
print(f"Verified: {DATABASE} exists in SHOW DATABASES.")

## 2. Create Registry Tables

In [None]:
# Project registry table: holds one row per uploaded project.
# The DDL is bound to a name first so the exact statement can be inspected
# before it is submitted; IF NOT EXISTS leaves an existing table untouched.
project_registry_ddl = f"""
    CREATE TABLE IF NOT EXISTS {DATABASE}.project_registry (
        project_id STRING,
        title STRING,
        status STRING,
        authors STRING,
        git_repo STRING,
        git_branch STRING,
        git_commit STRING,
        upload_date TIMESTAMP,
        file_manifest STRING
    )
    USING DELTA
"""
spark.sql(project_registry_ddl)
print(f"Table '{DATABASE}.project_registry' created.")

In [None]:
# Project files table: stores the non-tabular files of a project
# (notebooks, figures, markdown, etc.).
# The DDL is bound to a name first so the exact statement can be inspected
# before it is submitted; IF NOT EXISTS leaves an existing table untouched.
project_files_ddl = f"""
    CREATE TABLE IF NOT EXISTS {DATABASE}.project_files (
        project_id STRING,
        file_path STRING,
        file_type STRING,
        content STRING,
        size_bytes LONG,
        modified_date TIMESTAMP,
        upload_date TIMESTAMP
    )
    USING DELTA
"""
spark.sql(project_files_ddl)
print(f"Table '{DATABASE}.project_files' created.")

## 3. Validate Write Access

In [None]:
from pyspark.sql import Row
from datetime import datetime

# Round-trip a throwaway marker row through project_registry to confirm that
# this session can INSERT into, SELECT from, and DELETE from the table.
registry_table = f"{DATABASE}.project_registry"
delete_test_rows = f"""
    DELETE FROM {DATABASE}.project_registry
    WHERE project_id = '_bootstrap_test'
"""

# Remove any marker row left behind by an earlier run that stopped before its
# clean-up step; otherwise the exactly-one-row check below could never pass.
spark.sql(delete_test_rows)

# Test write to project_registry
test_row = Row(
    project_id="_bootstrap_test",
    title="Bootstrap Test",
    status="test",
    authors="bootstrap",
    git_repo="test",
    git_branch="test",
    git_commit="test",
    upload_date=datetime.now(),
    file_manifest="[]",
)
df = spark.createDataFrame([test_row])
# insertInto() resolves columns by position, not by name, so put the
# DataFrame's columns into the table's own column order before writing.
df = df.select(*spark.table(registry_table).columns)
df.write.format("delta").mode("append").insertInto(registry_table)
print("Write to project_registry: OK")

try:
    # Verify we can read it back
    result = spark.sql(f"""
    SELECT * FROM {DATABASE}.project_registry
    WHERE project_id = '_bootstrap_test'
""").collect()
    # Raise explicitly instead of using `assert`, which is skipped entirely
    # when Python runs with optimizations (-O) enabled.
    if len(result) != 1:
        raise RuntimeError("Failed to read back test row!")
    print("Read from project_registry: OK")
finally:
    # Clean up test row even when the read-back check fails, so the marker
    # row never lingers in the registry.
    spark.sql(delete_test_rows)
print("Delete from project_registry: OK")
print()
print("All write access tests passed!")

## 4. Summary

In [None]:
# Final report: list every table now present in the bootstrap database.
table_names = [
    table_row["tableName"]
    for table_row in spark.sql(f"SHOW TABLES IN {DATABASE}").collect()
]
print(f"Database: {DATABASE}")
print(f"Tables: {len(table_names)}")
for table_name in table_names:
    print(f"  - {table_name}")
print()
print("Bootstrap complete. You can now run upload_to_lakehouse.ipynb.")