# Upload Projects to Lakehouse

Upload BERIL Observatory project data and files to the `microbialdiscoveryforge_observatory`
lakehouse collection.

**Prerequisites**: Run `bootstrap_lakehouse.ipynb` first to create the database.

## Parameters

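Set `PROJECT_ID` below:
- A specific project name (e.g., `"metal_fitness_atlas"`) to upload one project
- `"all"` to upload all projects (bulk backfill)

The same cell also defines `OVERWRITE` (whether to overwrite existing tables) and
`BASE_PATH` (path to the BERIL-research-observatory root).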

In [None]:
# --- PARAMETERS ---
# Project to upload: change this, or set to "all" for bulk upload.
PROJECT_ID = "metal_fitness_atlas"

# Overwrite existing tables if they exist.
OVERWRITE = True

# Path to BERIL-research-observatory root.
BASE_PATH = "."

In [None]:
import sys
from pathlib import Path

# Make the repository's tools/ directory importable so that the
# lakehouse_upload module can be loaded from this notebook.
base = Path(BASE_PATH).resolve()
tools_dir = base / "tools"
tools_path = str(tools_dir)
if tools_path not in sys.path:
    # Put it first so it is searched before other sys.path entries.
    sys.path.insert(0, tools_path)

import lakehouse_upload

print(f"Base path: {base}")
print(f"Database: {lakehouse_upload.DATABASE}")

In [None]:
from get_spark_session import get_spark_session

# Obtain the Spark session used by every upload and verification cell below.
spark = get_spark_session()
spark_version = spark.version
print(f"Spark version: {spark_version}")

## Preview: Files to Upload

Scan the project directory and show what will be uploaded.

In [None]:
import os

projects_dir = base / "projects"

# Resolve which project names to preview: every non-hidden sub-directory of
# projects/ for a bulk run, otherwise just the one named in PROJECT_ID.
if PROJECT_ID != "all":
    project_ids = [PROJECT_ID]
else:
    project_ids = sorted(
        entry.name
        for entry in projects_dir.iterdir()
        if entry.is_dir() and not entry.name.startswith(".")
    )

print(f"Projects to upload: {len(project_ids)}\n")

# Print a one-line summary per project based on its upload manifest.
for pid in project_ids:
    project_path = projects_dir / pid
    if project_path.exists():
        manifest = lakehouse_upload.get_upload_manifest(project_path)
        # Only tabular entries flagged as in the data dir count as data tables;
        # everything else in the manifest is reported as a plain file.
        tabular = len([
            item for item in manifest
            if item['classification'] == 'tabular' and item['in_data_dir']
        ])
        other = len(manifest) - tabular
        total_bytes = sum(item['size_bytes'] for item in manifest)
        total_mb = total_bytes / 1024 / 1024
        print(f"  {pid}: {tabular} data tables, {other} files, {total_mb:.1f} MB")
    else:
        print(f"  WARNING: {pid} not found")

## Upload

In [None]:
# Run the upload. `results` always ends up as a list so the verification
# cells can treat the single-project and bulk cases the same way.
if PROJECT_ID != "all":
    result = lakehouse_upload.upload_project(spark, PROJECT_ID, str(base), overwrite=OVERWRITE)
    # A falsy result yields an empty list.
    results = []
    if result:
        results.append(result)
else:
    results = lakehouse_upload.upload_all_projects(spark, str(base), overwrite=OVERWRITE)

## Verify Upload

In [None]:
# Print a banner, then list the projects registered in the lakehouse.
registry_banner = "=== Project Registry ==="
print(registry_banner)
lakehouse_upload.list_uploaded_projects(spark)

In [None]:
# List the tables of every project uploaded in this run.
# filter(None, ...) skips falsy entries in `results`.
for uploaded in filter(None, results):
    print()
    lakehouse_upload.list_project_tables(spark, uploaded['project_id'])

In [None]:
# Quick sanity check: peek at five rows of the first table that was
# uploaded for the first project in `results`.
first_result = results[0] if results else None
if first_result and first_result['tables']:
    first_table = first_result['tables'][0]['table']
    print(f"Sample from {first_table}:")
    sample_df = spark.sql(f"SELECT * FROM {first_table} LIMIT 5")
    sample_df.show(truncate=40)

In [None]:
# Show non-tabular files uploaded
# Guard against both an empty `results` list and a falsy first entry, matching
# the checks in the other verification cells; indexing a falsy entry such as
# None with ['project_id'] would raise TypeError.
if results and results[0]:
    pid = results[0]['project_id']
    print(f"Files uploaded for {pid}:")
    # Query the project_files table for the rows belonging to this project.
    spark.sql(f"""
        SELECT file_path, file_type, size_bytes
        FROM {lakehouse_upload.DATABASE}.project_files
        WHERE project_id = '{pid}'
        ORDER BY file_path
    """).show(truncate=60)