In [1]:
import boto3
import yaml
from src.logger import setup_logger
from src.minio_uploader import upload_to_minio

# Identifiers for this run; all three are handed to the logger factory below.
schema = "pangenome"
target_table = "genome"
pipeline_name = "pangenome"

# Build the pipeline logger and record the start of the run.
logger = setup_logger(schema=schema, target_table=target_table, pipeline_name=pipeline_name)
logger.info("Notebook started")

# Pipeline steps, imported together so the run sequence below reads top to bottom.
from config.config_loader import ConfigLoader
from src.input_file_validator import validate_input_files
from src.run_great_expectations_validations import run_great_expectations_validation
from src.run_referential_integrity_checks import run_referential_integrity_checks
from src.run_validations import run_validations_from_config
from src.spark_session import start_spark_session
from src.validate_schema import validate_schema_against_file

# Config identifier passed to ConfigLoader (a name, not a full path).
config_path = "genome"

# Everything after logger creation runs inside try/finally so that the log
# file is uploaded to MinIO even when a step raises; previously a failure in
# any step skipped the upload and the run log was lost.
try:
    # Load and validate the config
    loader = ConfigLoader(config_path, logger)
    loader.load_and_validate()

    # (Optional) Print something
    print(loader.get_target_table())

    # Start Spark
    spark = start_spark_session(logger=logger)

    # Validate input files
    validate_input_files(loader, logger)

    # Check schema headers
    validate_schema_against_file(loader, logger)

    # Run validations
    run_validations_from_config(loader, logger)

    # Referential integrity
    # NOTE(review): a failed check here appears to be logged rather than
    # raised, so the Great Expectations step still runs afterwards - confirm
    # that continuing is intended.
    run_referential_integrity_checks(spark, loader, logger)

    # Great Expectations
    run_great_expectations_validation(spark, loader, logger)
except Exception:
    # Record the failure in the log file before it is uploaded, then let the
    # error propagate so the notebook still shows the traceback.
    logger.exception("Pipeline run failed")
    raise
finally:
    # Upload log file to MinIO
    upload_to_minio(logger.log_file_path)


{"time": "2025-07-07 18:37:18,041", "pipeline": "pangenome", "schema": "pangenome", "table": "genome", "level": "INFO", "module": "1616997403", "msg": "Notebook started"}
{"time": "2025-07-07 18:37:18,053", "pipeline": "pangenome", "schema": "pangenome", "table": "genome", "level": "INFO", "module": "config_loader", "msg": "ConfigLoader initialized for target table: genome"}
{"time": "2025-07-07 18:37:18,053", "pipeline": "pangenome", "schema": "pangenome", "table": "genome", "level": "INFO", "module": "config_loader", "msg": "Resolved config path: s3a://cdm-lake/config-json/genome.json"}
{"time": "2025-07-07 18:37:18,055", "pipeline": "pangenome", "schema": "pangenome", "table": "genome", "level": "INFO", "module": "config_loader", "msg": "Loading config from MinIO: bucket=cdm-lake, key=config-json/genome.json"}
{"time": "2025-07-07 18:37:18,258", "pipeline": "pangenome", "schema": "pangenome", "table": "genome", "level": "INFO", "module": "config_loader", "msg": "Config loaded succes

{"time": "2025-07-07 18:38:24,018", "pipeline": "pangenome", "schema": "pangenome", "table": "genome", "level": "DEBUG", "module": "validate_schema", "msg": "Schema columns: ['genome_id', 'gtdb_taxonomy_id', 'gtdb_species_clade_id', 'ncbi_biosample_id', 'fna_file_path_nersc', 'faa_file_path_nersc']"}
{"time": "2025-07-07 18:38:24,018", "pipeline": "pangenome", "schema": "pangenome", "table": "genome", "level": "INFO", "module": "validate_schema", "msg": "Comparison between schema and file header completed"}
{"time": "2025-07-07 18:38:24,019", "pipeline": "pangenome", "schema": "pangenome", "table": "genome", "level": "INFO", "module": "validate_schema", "msg": "All schema columns are present in the file"}
{"time": "2025-07-07 18:38:24,020", "pipeline": "pangenome", "schema": "pangenome", "table": "genome", "level": "INFO", "module": "validate_schema", "msg": "No extra columns in the file"}
🔹 Header columns from file:
['genome_id', 'gtdb_species_clade_id', 'gtdb_taxonomy_id', 'ncbi_bios

{"time": "2025-07-07 18:38:48,126", "pipeline": "pangenome", "schema": "pangenome", "table": "genome", "level": "INFO", "module": "run_referential_integrity_checks", "msg": "Violations logged to Delta table: pangenome.genome_errors at s3a://cdm-lake/logs/errors/genome"}
 → Violations logged to: pangenome.genome_errors
{"time": "2025-07-07 18:38:48,127", "pipeline": "pangenome", "schema": "pangenome", "table": "genome", "level": "ERROR", "module": "run_referential_integrity_checks", "msg": "Referential integrity check failed on FK gtdb_taxonomy_id. Stopping further checks."}
 ❌ Stopping further checks due to failed RI check.
{"time": "2025-07-07 18:38:50,693", "pipeline": "pangenome", "schema": "pangenome", "table": "genome", "level": "DEBUG", "module": "config_loader", "msg": "Target table: pangenome.genome"}
{"time": "2025-07-07 18:38:50,695", "pipeline": "pangenome", "schema": "pangenome", "table": "genome", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Sta

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{"time": "2025-07-07 18:38:53,669", "pipeline": "pangenome", "schema": "pangenome", "table": "genome", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Expectations saved to suite."}


Calculating Metrics:   0%|          | 0/20 [00:00<?, ?it/s]

{"time": "2025-07-07 18:38:54,289", "pipeline": "pangenome", "schema": "pangenome", "table": "genome", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Validation run completed."}
{"time": "2025-07-07 18:38:54,292", "pipeline": "pangenome", "schema": "pangenome", "table": "genome", "level": "DEBUG", "module": "run_great_expectations_validations", "msg": "Validation result:
{
  "success": true,
  "results": [
    {
      "success": true,
      "expectation_config": {
        "expectation_type": "expect_column_values_to_not_be_null",
        "kwargs": {
          "column": "genome_id",
          "batch_id": "88d891994727c49a497dadf2198dd057"
        },
        "meta": {}
      },
      "result": {
        "element_count": 293059,
        "unexpected_count": 0,
        "unexpected_percent": 0.0,
        "partial_unexpected_list": []
      },
      "meta": {},
      "exception_info": {
        "raised_exception": false,
        "exception_traceback": null,
        "

Calculating Metrics:   0%|          | 0/28 [00:00<?, ?it/s]

{"time": "2025-07-07 18:38:56,913", "pipeline": "pangenome", "schema": "pangenome", "table": "genome", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Checkpoint executed."}
{"time": "2025-07-07 18:38:57,835", "pipeline": "pangenome", "schema": "pangenome", "table": "genome", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Data Docs rebuilt with checkpoint results."}
✅ GE validation and checkpoint complete. Data Docs generated.
✅ Uploaded log to MinIO at: s3://cdm-lake/logs/pangenome/pipeline_run_20250707_183718.log
