In [1]:
import boto3
import yaml
from src.logger import setup_logger
from src.minio_uploader import upload_to_minio

# Run identity. These three values are passed to setup_logger and appear as the
# "pipeline", "table" and "schema" fields of every JSON log record.
pipeline_name = "pangenome"
target_table = "gene_cluster"
schema = "pangenome"

logger = setup_logger(
    pipeline_name=pipeline_name,
    target_table=target_table,
    schema=schema,
)

logger.info("Notebook started")

from config.config_loader import ConfigLoader

# ConfigLoader is given the config *name* (the target table), not a full path:
# it resolves the MinIO location itself (the run log shows
# s3a://cdm-lake/config-json/<name>.json). Derive it from target_table so the
# logger context and the loaded config cannot drift apart.
config_path = target_table

# Load the config and check that its required fields are present
loader = ConfigLoader(config_path, logger)
loader.load_and_validate()

# Sanity check: show which table the loaded config targets
print(loader.get_target_table())

# Start Spark (imported here, after logging/config are ready, as in the original flow)
from src.spark_session import start_spark_session
spark = start_spark_session(logger=logger)

{"time": "2025-07-15 03:42:30,474", "pipeline": "pangenome", "schema": "pangenome", "table": "gene_cluster", "level": "INFO", "module": "4268986019", "msg": "Notebook started"}
{"time": "2025-07-15 03:42:30,488", "pipeline": "pangenome", "schema": "pangenome", "table": "gene_cluster", "level": "INFO", "module": "config_loader", "msg": "ConfigLoader initialized for target table: gene_cluster"}
{"time": "2025-07-15 03:42:30,489", "pipeline": "pangenome", "schema": "pangenome", "table": "gene_cluster", "level": "INFO", "module": "config_loader", "msg": "Resolved config path: s3a://cdm-lake/config-json/gene_cluster.json"}
{"time": "2025-07-15 03:42:30,489", "pipeline": "pangenome", "schema": "pangenome", "table": "gene_cluster", "level": "INFO", "module": "config_loader", "msg": "Loading config from MinIO: bucket=cdm-lake, key=config-json/gene_cluster.json"}
{"time": "2025-07-15 03:42:30,577", "pipeline": "pangenome", "schema": "pangenome", "table": "gene_cluster", "level": "INFO", "module

In [2]:
# Check schema headers: compare the columns declared in the config schema with
# the header of the source file.
from src.validate_schema import validate_schema_against_file

schema_check = validate_schema_against_file(loader, logger)

# The comparison result was previously only displayed, so a header mismatch
# never reached the pipeline log. Record it explicitly; this does not stop the
# run, it only makes the mismatch visible in the uploaded log file.
if isinstance(schema_check, dict):
    missing_columns = schema_check.get("missing_in_file") or []
    extra_columns = schema_check.get("extra_in_file") or []
    if missing_columns or extra_columns:
        # NOTE(review): assumes the project logger exposes the standard
        # `warning` method alongside `info` — confirm against src.logger.
        logger.warning(
            f"Schema/file column mismatch: missing_in_file={missing_columns}, "
            f"extra_in_file={extra_columns}"
        )

# Keep the full comparison as the cell's displayed output
schema_check

{"time": "2025-07-15 03:42:36,234", "pipeline": "pangenome", "schema": "pangenome", "table": "gene_cluster", "level": "INFO", "module": "validate_schema", "msg": "Starting schema vs. file validation"}
{"time": "2025-07-15 03:42:36,240", "pipeline": "pangenome", "schema": "pangenome", "table": "gene_cluster", "level": "DEBUG", "module": "config_loader", "msg": "Output file: {'output_file_path': 's3a://cdm-lake/pangenome-source/table_gene_cluster_V1.0.tsv', 'file_type': 'tsv', 'delimiter': '\t', 'ignore_first_line': 'no'}"}
{"time": "2025-07-15 03:42:36,240", "pipeline": "pangenome", "schema": "pangenome", "table": "gene_cluster", "level": "INFO", "module": "validate_schema", "msg": "Output file: s3a://cdm-lake/pangenome-source/table_gene_cluster_V1.0.tsv"}
{"time": "2025-07-15 03:42:36,241", "pipeline": "pangenome", "schema": "pangenome", "table": "gene_cluster", "level": "DEBUG", "module": "validate_schema", "msg": "Delimiter: 	, Ignore first line: False"}
{"time": "2025-07-15 03:42:36

{'file_columns': ['gene_cluster_id',
  'gtdb_species_clade_id',
  'is_core',
  'is_auxiliary',
  'is_singleton',
  'likelihood'],
 'schema_columns': ['gene_cluster_id',
  'gtdb_species_clade_id',
  'is_core',
  'is_accessory',
  'is_singleton'],
 'missing_in_file': ['is_accessory'],
 'extra_in_file': ['is_auxiliary', 'likelihood']}

In [3]:
# Run the config-driven validations with the PySpark runner.
# The non-Spark runner is kept below for reference but is disabled:
#from src.run_validations import run_validations_from_config
#run_validations_from_config(loader, logger)

from src.run_validations_pyspark import run_validations_from_config_spark

# Bind the report to a name, then display it as the cell output
validation_report = run_validations_from_config_spark(spark, loader, logger)
validation_report

{"time": "2025-07-15 03:42:40,364", "pipeline": "pangenome", "schema": "pangenome", "table": "gene_cluster", "level": "DEBUG", "module": "config_loader", "msg": "Output file: {'output_file_path': 's3a://cdm-lake/pangenome-source/table_gene_cluster_V1.0.tsv', 'file_type': 'tsv', 'delimiter': '\t', 'ignore_first_line': 'no'}"}
{"time": "2025-07-15 03:42:40,365", "pipeline": "pangenome", "schema": "pangenome", "table": "gene_cluster", "level": "INFO", "module": "run_validations_pyspark", "msg": "Starting validation on file: s3a://cdm-lake/pangenome-source/table_gene_cluster_V1.0.tsv"}
{"time": "2025-07-15 03:42:40,365", "pipeline": "pangenome", "schema": "pangenome", "table": "gene_cluster", "level": "DEBUG", "module": "run_validations_pyspark", "msg": "Delimiter: 	"}
{"time": "2025-07-15 03:42:40,366", "pipeline": "pangenome", "schema": "pangenome", "table": "gene_cluster", "level": "DEBUG", "module": "config_loader", "msg": "Validations: [{'column': 'gene_cluster_id', 'validation_type':

{'file_path': 's3a://cdm-lake/pangenome-source/table_gene_cluster_V1.0.tsv',
 'validation_errors': [],
 'success': True}

In [4]:
# Referential integrity: run the foreign-key rules listed in the config
from src.run_referential_integrity_checks import run_referential_integrity_checks

# Bind the outcome to a name, then display it as the cell output
integrity_result = run_referential_integrity_checks(spark, loader, logger)
integrity_result

{"time": "2025-07-15 03:44:12,752", "pipeline": "pangenome", "schema": "pangenome", "table": "gene_cluster", "level": "INFO", "module": "config_loader", "msg": "Loading config from MinIO: bucket=cdm-lake, key=config-json/gene_cluster.json"}
{"time": "2025-07-15 03:44:12,927", "pipeline": "pangenome", "schema": "pangenome", "table": "gene_cluster", "level": "INFO", "module": "config_loader", "msg": "Config loaded successfully from MinIO"}
{"time": "2025-07-15 03:44:12,936", "pipeline": "pangenome", "schema": "pangenome", "table": "gene_cluster", "level": "INFO", "module": "config_loader", "msg": "All required fields are present in the config"}
{"time": "2025-07-15 03:44:12,945", "pipeline": "pangenome", "schema": "pangenome", "table": "gene_cluster", "level": "DEBUG", "module": "config_loader", "msg": "Referential integrity rules: [{'foreign_key': 'gtdb_species_clade_id', 'reference_table': 'gtdb_species_clade', 'reference_column': 'gtdb_species_clade_id', 'database': 'pangenome', 'acti

True

In [5]:

# Check TSV format (disabled: the input-file validator is not run in this notebook)
#from src.input_file_validator import validate_input_files
#validate_input_files(loader)






# Development aid (disabled): re-import the Great Expectations validation module
#import importlib
#import src.run_great_expectations_validations

#importlib.reload(src.run_great_expectations_validations)




In [6]:
# Great Expectations
from src.run_great_expectations_validations import run_great_expectations_validation

try:
    run_great_expectations_validation(spark, loader, logger)
finally:
    # Upload log file to MinIO.
    # Done in `finally` so the run's log is still shipped when the validation
    # raises; the exception then propagates and fails the cell as before.
    upload_to_minio(logger.log_file_path)

{"time": "2025-07-15 03:45:11,237", "pipeline": "pangenome", "schema": "pangenome", "table": "gene_cluster", "level": "DEBUG", "module": "config_loader", "msg": "Target table: pangenome.gene_cluster"}
{"time": "2025-07-15 03:45:11,245", "pipeline": "pangenome", "schema": "pangenome", "table": "gene_cluster", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Starting Great Expectations validation for table: pangenome.gene_cluster with suite: default_suite"}
{"time": "2025-07-15 03:45:12,312", "pipeline": "pangenome", "schema": "pangenome", "table": "gene_cluster", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Loaded Spark table: pangenome.gene_cluster"}
{"time": "2025-07-15 03:45:20,210", "pipeline": "pangenome", "schema": "pangenome", "table": "gene_cluster", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Great Expectations context initialized."}
{"time": "2025-07-15 03:45:21,362", "pipeline": "pangenome", "sch

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{"time": "2025-07-15 03:46:23,242", "pipeline": "pangenome", "schema": "pangenome", "table": "gene_cluster", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Applied GE expectation: expect_column_values_to_not_be_null with args {'column': 'gene_cluster_id'} → result: True"}


Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

{"time": "2025-07-15 03:46:46,710", "pipeline": "pangenome", "schema": "pangenome", "table": "gene_cluster", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Applied GE expectation: expect_column_values_to_match_regex with args {'column': 'gene_cluster_id', 'regex': '^[A-Za-z0-9_.\\-]+$'} → result: True"}


Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

{"time": "2025-07-15 03:46:48,710", "pipeline": "pangenome", "schema": "pangenome", "table": "gene_cluster", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Applied GE expectation: expect_column_values_to_be_in_set with args {'column': 'is_core', 'value_set': ['true', 'false']} → result: False"}


Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

{"time": "2025-07-15 03:46:50,668", "pipeline": "pangenome", "schema": "pangenome", "table": "gene_cluster", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Applied GE expectation: expect_column_values_to_be_in_set with args {'column': 'is_auxiliary', 'value_set': ['true', 'false']} → result: False"}


Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

{"time": "2025-07-15 03:46:52,493", "pipeline": "pangenome", "schema": "pangenome", "table": "gene_cluster", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Applied GE expectation: expect_column_values_to_be_in_set with args {'column': 'is_singleton', 'value_set': ['true', 'false']} → result: False"}
{"time": "2025-07-15 03:46:53,415", "pipeline": "pangenome", "schema": "pangenome", "table": "gene_cluster", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Expectations saved to suite."}


Calculating Metrics:   0%|          | 0/33 [00:00<?, ?it/s]

{"time": "2025-07-15 03:47:20,255", "pipeline": "pangenome", "schema": "pangenome", "table": "gene_cluster", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Validation run completed."}
{"time": "2025-07-15 03:47:20,265", "pipeline": "pangenome", "schema": "pangenome", "table": "gene_cluster", "level": "DEBUG", "module": "run_great_expectations_validations", "msg": "Validation summary: {'success': False, 'successful_expectations': 2, 'unsuccessful_expectations': 3, 'success_percent': 40.0}"}
{"time": "2025-07-15 03:49:02,530", "pipeline": "pangenome", "schema": "pangenome", "table": "gene_cluster", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Data Docs built after validation."}


Calculating Metrics:   0%|          | 0/43 [00:00<?, ?it/s]

{"time": "2025-07-15 03:50:27,762", "pipeline": "pangenome", "schema": "pangenome", "table": "gene_cluster", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Checkpoint executed."}
{"time": "2025-07-15 03:51:36,093", "pipeline": "pangenome", "schema": "pangenome", "table": "gene_cluster", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Data Docs rebuilt with checkpoint results."}
✅ GE validation and checkpoint complete. Data Docs generated.
✅ Uploaded log to MinIO at: s3://cdm-lake/logs/pangenome/pipeline_run_20250715_034230.log
