In [1]:
# ---------------------------------------------------------------------------
# Imports
# Everything this cell needs is imported up front so that a fresh kernel
# fails immediately on a missing dependency instead of part-way through setup.
# ---------------------------------------------------------------------------
import boto3
import yaml

from config.config_loader import ConfigLoader
from src.logger import setup_logger
from src.minio_uploader import upload_to_minio
from src.spark_session import start_spark_session

# ---------------------------------------------------------------------------
# Run identity: these three values are passed to the logger and show up as
# the "pipeline", "schema" and "table" fields of every log record.
# ---------------------------------------------------------------------------
pipeline_name = "pangenome"
target_table = "genome"
schema = "pangenome"

logger = setup_logger(
    pipeline_name=pipeline_name,
    target_table=target_table,
    schema=schema,
)

logger.info("Notebook started")

# ---------------------------------------------------------------------------
# Pipeline config
# NOTE: despite the variable name, this is a short config name rather than a
# full path. In the recorded run ConfigLoader resolved "genome" to
# s3a://cdm-lake/config-json/genome.json.
# ---------------------------------------------------------------------------
config_path = "genome"

loader = ConfigLoader(config_path, logger)
loader.load_and_validate()

# Sanity check: echo the target table the loaded config points at.
print(loader.get_target_table())

# ---------------------------------------------------------------------------
# Spark session used by the validation cell that follows.
# ---------------------------------------------------------------------------
spark = start_spark_session(logger=logger)

{"time": "2025-07-17 21:11:37,179", "pipeline": "pangenome", "schema": "pangenome", "table": "genome", "level": "INFO", "module": "2161914337", "msg": "Notebook started"}
{"time": "2025-07-17 21:11:37,195", "pipeline": "pangenome", "schema": "pangenome", "table": "genome", "level": "INFO", "module": "config_loader", "msg": "ConfigLoader initialized for target table: genome"}
{"time": "2025-07-17 21:11:37,196", "pipeline": "pangenome", "schema": "pangenome", "table": "genome", "level": "INFO", "module": "config_loader", "msg": "Resolved config path: s3a://cdm-lake/config-json/genome.json"}
{"time": "2025-07-17 21:11:37,197", "pipeline": "pangenome", "schema": "pangenome", "table": "genome", "level": "INFO", "module": "config_loader", "msg": "Loading config from MinIO: bucket=cdm-lake, key=config-json/genome.json"}
{"time": "2025-07-17 21:11:37,318", "pipeline": "pangenome", "schema": "pangenome", "table": "genome", "level": "INFO", "module": "config_loader", "msg": "Config loaded succes

In [2]:
from src.validate_schema_against_delta import validate_schema_against_delta

# Compare the columns declared in the schema file with the columns of the
# Delta table. Keep the report under a name and leave it as the cell's last
# expression so it is displayed as the cell output.
schema_report = validate_schema_against_delta(loader, spark, logger)
schema_report

{"time": "2025-07-17 21:12:50,471", "pipeline": "pangenome", "schema": "pangenome", "table": "genome", "level": "INFO", "module": "validate_schema_against_delta", "msg": "Starting schema vs. Delta table column validation"}
{"time": "2025-07-17 21:12:50,471", "pipeline": "pangenome", "schema": "pangenome", "table": "genome", "level": "DEBUG", "module": "config_loader", "msg": "Schema file path: s3a://cdm-lake/schemas/pangenome-schema.yaml"}
{"time": "2025-07-17 21:12:50,472", "pipeline": "pangenome", "schema": "pangenome", "table": "genome", "level": "INFO", "module": "validate_schema_against_delta", "msg": "Schema file: s3a://cdm-lake/schemas/pangenome-schema.yaml"}
{"time": "2025-07-17 21:12:50,498", "pipeline": "pangenome", "schema": "pangenome", "table": "genome", "level": "DEBUG", "module": "config_loader", "msg": "Target table: pangenome.genome"}
{"time": "2025-07-17 21:12:50,499", "pipeline": "pangenome", "schema": "pangenome", "table": "genome", "level": "INFO", "module": "valid

{'delta_columns': ['genome_id',
  'gtdb_species_clade_id',
  'gtdb_taxonomy_id',
  'ncbi_biosample_id',
  'fna_file_path_nersc',
  'faa_file_path_nersc'],
 'schema_columns': ['genome_id',
  'gtdb_taxonomy_id',
  'gtdb_species_clade_id',
  'ncbi_biosample_id',
  'fna_file_path_nersc',
  'faa_file_path_nersc'],
 'missing_in_table': [],
 'extra_in_table': []}

In [3]:
# Ship this run's log file to MinIO; the path comes from the logger object.
run_log_path = logger.log_file_path
upload_to_minio(run_log_path)

✅ Uploaded log to MinIO at: s3://cdm-lake/logs/pangenome/pipeline_run_20250717_211137.log
