In [1]:
# Setup cell: imports, structured logger, table config, and Spark session.
# Every later cell relies on the `logger`, `loader`, and `spark` names created here.

# Third-party imports
import boto3
import yaml

# Project-local imports (kept together so a fresh kernel fails fast on a missing module)
from config.config_loader import ConfigLoader
from src.logger import setup_logger
from src.minio_uploader import upload_to_minio
from src.spark_session import start_spark_session

# Run identity; these values appear as the "pipeline" / "table" / "schema"
# fields of each JSON log record emitted below.
pipeline_name = "pangenome"
target_table = "gtdb_species_clade"
schema = "pangenome"

logger = setup_logger(
    pipeline_name=pipeline_name,
    target_table=target_table,
    schema=schema,
)

logger.info("Notebook started")

# ConfigLoader is given the table's config name, not a full object path: its log
# output for this run reports resolving it to
# s3a://cdm-lake/config-json/gtdb_species_clade.json.
# Derived from target_table so the two values cannot drift apart.
config_path = target_table

# Load config
loader = ConfigLoader(config_path, logger)
loader.load_and_validate()

# (Optional) Sanity check: echo the target table the loader reports
print(loader.get_target_table())

# Start Spark
spark = start_spark_session(logger=logger)

{"time": "2025-07-09 21:45:00,842", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_species_clade", "level": "INFO", "module": "927176468", "msg": "Notebook started"}
{"time": "2025-07-09 21:45:00,854", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_species_clade", "level": "INFO", "module": "config_loader", "msg": "ConfigLoader initialized for target table: gtdb_species_clade"}
{"time": "2025-07-09 21:45:00,854", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_species_clade", "level": "INFO", "module": "config_loader", "msg": "Resolved config path: s3a://cdm-lake/config-json/gtdb_species_clade.json"}
{"time": "2025-07-09 21:45:00,858", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_species_clade", "level": "INFO", "module": "config_loader", "msg": "Loading config from MinIO: bucket=cdm-lake, key=config-json/gtdb_species_clade.json"}
{"time": "2025-07-09 21:45:01,025", "pipeline": "pangenome", "schema": "pangenome", "table":

In [2]:
# Compare the source file's column headers with the columns declared in the config.
# The returned report lists file_columns, schema_columns, missing_in_file and extra_in_file.
from src.validate_schema import validate_schema_against_file

schema_report = validate_schema_against_file(loader, logger)
schema_report

{"time": "2025-07-09 21:45:06,384", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_species_clade", "level": "INFO", "module": "validate_schema", "msg": "Starting schema vs. file validation"}
{"time": "2025-07-09 21:45:06,390", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_species_clade", "level": "DEBUG", "module": "config_loader", "msg": "Output file: {'output_file_path': 's3a://cdm-lake/pangenome-source/table_gtdb_species_clade_V1.1.tsv', 'file_type': 'tsv', 'delimiter': '\t', 'ignore_first_line': 'no'}"}
{"time": "2025-07-09 21:45:06,391", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_species_clade", "level": "INFO", "module": "validate_schema", "msg": "Output file: s3a://cdm-lake/pangenome-source/table_gtdb_species_clade_V1.1.tsv"}
{"time": "2025-07-09 21:45:06,392", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_species_clade", "level": "DEBUG", "module": "validate_schema", "msg": "Delimiter: 	, Ignore first line: F

{'file_columns': ['gtdb_species_clade_id',
  'representative_genome_id',
  'GTDB_species',
  'GTDB_taxonomy',
  'ANI_circumscription_radius',
  'mean_intra_species_ANI',
  'min_intra_species_ANI',
  'mean_intra_species_AF',
  'min_intra_species_AF',
  'no_clustered_genomes_unfiltered',
  'no_clustered_genomes_filtered'],
 'schema_columns': ['gtdb_species_clade_id',
  'representative_genome_id',
  'GTDB_species',
  'GTDB_taxonomy',
  'ANI_circumscription_radius',
  'mean_intra_species_ANI',
  'min_intra_species_ANI',
  'mean_intra_species_AF',
  'min_intra_species_AF',
  'no_clustered_genomes_unfiltered',
  'no_clustered_genomes_filtered'],
 'missing_in_file': [],
 'extra_in_file': []}

In [3]:
# Run the validations listed in the loaded config against the source file.
# The returned report carries file_path, validation_errors and a success flag.
from src.run_validations import run_validations_from_config

validation_report = run_validations_from_config(loader, logger)
validation_report

{"time": "2025-07-09 21:45:06,482", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_species_clade", "level": "DEBUG", "module": "config_loader", "msg": "Output file: {'output_file_path': 's3a://cdm-lake/pangenome-source/table_gtdb_species_clade_V1.1.tsv', 'file_type': 'tsv', 'delimiter': '\t', 'ignore_first_line': 'no'}"}
{"time": "2025-07-09 21:45:06,483", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_species_clade", "level": "INFO", "module": "run_validations", "msg": "Starting validation on file: s3a://cdm-lake/pangenome-source/table_gtdb_species_clade_V1.1.tsv"}
{"time": "2025-07-09 21:45:06,484", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_species_clade", "level": "DEBUG", "module": "run_validations", "msg": "Delimiter: 	, Ignore first line: False"}
{"time": "2025-07-09 21:45:06,484", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_species_clade", "level": "DEBUG", "module": "config_loader", "msg": "Validations: [{'

{'file_path': 's3a://cdm-lake/pangenome-source/table_gtdb_species_clade_V1.1.tsv',
 'validation_errors': [],
 'success': True}

In [4]:
# Referential integrity: check the config's foreign-key rules via Spark.
from src.run_referential_integrity_checks import run_referential_integrity_checks

ri_passed = run_referential_integrity_checks(spark, loader, logger)

# Previously the return value was only echoed, so a falsy result was easy to miss
# and the notebook carried on silently. Surface it in the run log as well.
# NOTE(review): assumes a falsy return means the checks did not pass - confirm
# against run_referential_integrity_checks before promoting this to a hard failure.
if not ri_passed:
    logger.warning(
        "Referential integrity checks returned a falsy result; "
        "review the log before relying on downstream validation"
    )

# Keep the result as the cell's displayed output
ri_passed

{"time": "2025-07-09 21:45:06,736", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_species_clade", "level": "INFO", "module": "config_loader", "msg": "Loading config from MinIO: bucket=cdm-lake, key=config-json/gtdb_species_clade.json"}
{"time": "2025-07-09 21:45:06,746", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_species_clade", "level": "INFO", "module": "config_loader", "msg": "Config loaded successfully from MinIO"}
{"time": "2025-07-09 21:45:06,747", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_species_clade", "level": "INFO", "module": "config_loader", "msg": "All required fields are present in the config"}
{"time": "2025-07-09 21:45:06,747", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_species_clade", "level": "DEBUG", "module": "config_loader", "msg": "Referential integrity rules: [{'foreign_key': 'representative_genome_id', 'reference_table': 'genome', 'reference_column': 'genome_id', 'database': 'pangenom

False

In [5]:

# Check TSV format (disabled: the two lines below are commented out and do not run)
#from src.input_file_validator import validate_input_files
#validate_input_files(loader)







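# Development helper (disabled): reload the Great Expectations validation module
# after editing it, without restarting the kernel.
#import importlib
#import src.run_great_expectations_validations

#importlib.reload(src.run_great_expectations_validations)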




In [6]:
# Great Expectations validation, followed by uploading this run's log file to MinIO.
from src.run_great_expectations_validations import run_great_expectations_validation

try:
    run_great_expectations_validation(spark, loader, logger)
finally:
    # Upload log file to MinIO even when validation raises: the log of a failed
    # run is the one most needed, and previously it was never uploaded because
    # the exception stopped the cell before this call.
    upload_to_minio(logger.log_file_path)

{"time": "2025-07-09 21:45:25,115", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_species_clade", "level": "DEBUG", "module": "config_loader", "msg": "Target table: pangenome.gtdb_species_clade"}
{"time": "2025-07-09 21:45:25,116", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_species_clade", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Starting Great Expectations validation for table: pangenome.gtdb_species_clade with suite: default_suite"}
{"time": "2025-07-09 21:45:25,169", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_species_clade", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Loaded Spark table: pangenome.gtdb_species_clade"}
{"time": "2025-07-09 21:45:25,326", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_species_clade", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Great Expectations context initialized."}
{"time": "2025-07-09 21

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{"time": "2025-07-09 21:45:26,441", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_species_clade", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Applied GE expectation: expect_column_values_to_not_be_null with args {'column': 'gtdb_species_clade_id'} → result: True"}


Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

{"time": "2025-07-09 21:45:26,712", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_species_clade", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Applied GE expectation: expect_column_values_to_match_regex with args {'column': 'gtdb_species_clade_id', 'regex': '^[A-Za-z0-9_.-]+$'} → result: True"}


Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

{"time": "2025-07-09 21:45:26,954", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_species_clade", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Applied GE expectation: expect_column_values_to_be_between with args {'column': 'ANI_circumscription_radius', 'min_value': 0, 'max_value': 100} → result: True"}


Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

{"time": "2025-07-09 21:45:27,173", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_species_clade", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Applied GE expectation: expect_column_values_to_be_between with args {'column': 'mean_intra_species_ANI', 'min_value': 0, 'max_value': 100} → result: True"}


Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

{"time": "2025-07-09 21:45:27,376", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_species_clade", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Applied GE expectation: expect_column_values_to_be_between with args {'column': 'min_intra_species_ANI', 'min_value': 0, 'max_value': 100} → result: True"}
{"time": "2025-07-09 21:45:27,402", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_species_clade", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Expectations saved to suite."}


Calculating Metrics:   0%|          | 0/33 [00:00<?, ?it/s]

{"time": "2025-07-09 21:45:27,968", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_species_clade", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Validation run completed."}
{"time": "2025-07-09 21:45:27,972", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_species_clade", "level": "DEBUG", "module": "run_great_expectations_validations", "msg": "Validation summary: {'success': True, 'successful_expectations': 5, 'unsuccessful_expectations': 0, 'success_percent': 100.0}"}
{"time": "2025-07-09 21:45:29,460", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_species_clade", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Data Docs built after validation."}


Calculating Metrics:   0%|          | 0/43 [00:00<?, ?it/s]

{"time": "2025-07-09 21:45:31,143", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_species_clade", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Checkpoint executed."}
{"time": "2025-07-09 21:45:32,747", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_species_clade", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Data Docs rebuilt with checkpoint results."}
✅ GE validation and checkpoint complete. Data Docs generated.
✅ Uploaded log to MinIO at: s3://cdm-lake/logs/pangenome/pipeline_run_20250709_214500.log
