In [1]:
import boto3
import yaml

from config.config_loader import ConfigLoader
from src.logger import setup_logger
from src.minio_uploader import upload_to_minio
from src.run_great_expectations_validations import run_great_expectations_validation
from src.run_referential_integrity_checks import run_referential_integrity_checks
from src.run_validations import run_validations_from_config
from src.spark_session import start_spark_session
from src.validate_schema import validate_schema_against_file

# Run identifiers passed to the logger; in the log output they appear as the
# "pipeline", "table" and "schema" fields of every record.
pipeline_name = "pangenome"
target_table = "gtdb_metadata"
schema = "pangenome"

logger = setup_logger(
    pipeline_name=pipeline_name,
    target_table=target_table,
    schema=schema,
)

logger.info("Notebook started")

# Despite the variable name, this is a config *name*, not a full path:
# ConfigLoader logs the path it resolves from it (a JSON object in MinIO).
config_path = "gtdb_metadata"

# Load the config and validate it before any check in this notebook uses it.
loader = ConfigLoader(config_path, logger)
loader.load_and_validate()

# Sanity check: show which target table the loaded config points at.
print(loader.get_target_table())

# Start the Spark session used by the Spark-based checks in later cells.
spark = start_spark_session(logger=logger)

{"time": "2025-07-10 05:19:23,244", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_metadata", "level": "INFO", "module": "2742075585", "msg": "Notebook started"}
{"time": "2025-07-10 05:19:23,256", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_metadata", "level": "INFO", "module": "config_loader", "msg": "ConfigLoader initialized for target table: gtdb_metadata"}
{"time": "2025-07-10 05:19:23,257", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_metadata", "level": "INFO", "module": "config_loader", "msg": "Resolved config path: s3a://cdm-lake/config-json/gtdb_metadata.json"}
{"time": "2025-07-10 05:19:23,258", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_metadata", "level": "INFO", "module": "config_loader", "msg": "Loading config from MinIO: bucket=cdm-lake, key=config-json/gtdb_metadata.json"}
{"time": "2025-07-10 05:19:23,415", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_metadata", "level": "INFO",

In [2]:
# Schema vs. file validation: compares the configured schema with the source
# file. The returned report is the last expression, so it is displayed as the
# cell output.
validate_schema_against_file(loader, logger)

{"time": "2025-07-10 05:19:28,033", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_metadata", "level": "INFO", "module": "validate_schema", "msg": "Starting schema vs. file validation"}
{"time": "2025-07-10 05:19:28,039", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_metadata", "level": "DEBUG", "module": "config_loader", "msg": "Output file: {'output_file_path': 's3a://cdm-lake/pangenome-source/table_gtdb_metadata_V1.1.tsv', 'file_type': 'tsv', 'delimiter': '\t', 'ignore_first_line': 'no'}"}
{"time": "2025-07-10 05:19:28,040", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_metadata", "level": "INFO", "module": "validate_schema", "msg": "Output file: s3a://cdm-lake/pangenome-source/table_gtdb_metadata_V1.1.tsv"}
{"time": "2025-07-10 05:19:28,040", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_metadata", "level": "DEBUG", "module": "validate_schema", "msg": "Delimiter: 	, Ignore first line: False"}
{"time": "2025-07-10 05

{'file_columns': ['accession',
  'ambiguous_bases',
  'checkm_completeness',
  'checkm_contamination',
  'checkm_marker_count',
  'checkm_marker_lineage',
  'checkm_marker_set_count',
  'checkm_strain_heterogeneity',
  'coding_bases',
  'coding_density',
  'contig_count',
  'gc_count',
  'gc_percentage',
  'genome_size',
  'gtdb_genome_representative',
  'gtdb_representative',
  'gtdb_taxonomy',
  'gtdb_type_designation_ncbi_taxa',
  'gtdb_type_designation_ncbi_taxa_sources',
  'gtdb_type_species_of_genus',
  'l50_contigs',
  'l50_scaffolds',
  'longest_contig',
  'longest_scaffold',
  'lsu_23s_contig_len',
  'lsu_23s_count',
  'lsu_23s_length',
  'lsu_23s_query_id',
  'lsu_5s_contig_len',
  'lsu_5s_count',
  'lsu_5s_length',
  'lsu_5s_query_id',
  'lsu_silva_23s_blast_align_len',
  'lsu_silva_23s_blast_bitscore',
  'lsu_silva_23s_blast_evalue',
  'lsu_silva_23s_blast_perc_identity',
  'lsu_silva_23s_blast_subject_id',
  'lsu_silva_23s_taxonomy',
  'mean_contig_length',
  'mean_scaffol

In [3]:
# Run the validations listed in the config against the source file.
# The returned result dict is displayed as the cell output.
run_validations_from_config(loader, logger)

{"time": "2025-07-10 05:19:31,267", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_metadata", "level": "DEBUG", "module": "config_loader", "msg": "Output file: {'output_file_path': 's3a://cdm-lake/pangenome-source/table_gtdb_metadata_V1.1.tsv', 'file_type': 'tsv', 'delimiter': '\t', 'ignore_first_line': 'no'}"}
{"time": "2025-07-10 05:19:31,268", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_metadata", "level": "INFO", "module": "run_validations", "msg": "Starting validation on file: s3a://cdm-lake/pangenome-source/table_gtdb_metadata_V1.1.tsv"}
{"time": "2025-07-10 05:19:31,269", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_metadata", "level": "DEBUG", "module": "run_validations", "msg": "Delimiter: 	, Ignore first line: False"}
{"time": "2025-07-10 05:19:31,270", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_metadata", "level": "DEBUG", "module": "config_loader", "msg": "Validations: [{'column': 'accession', 'validat

{'file_path': 's3a://cdm-lake/pangenome-source/table_gtdb_metadata_V1.1.tsv',
 'validation_errors': [],
 'success': True}

In [4]:
# Referential integrity checks, driven by the rules in the loaded config.
# The return value is displayed as the cell output but not acted on here.
# NOTE(review): confirm what a False return means (failed check vs. nothing to
# report) and whether later cells should be gated on it.
run_referential_integrity_checks(spark, loader, logger)

{"time": "2025-07-10 05:19:43,270", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_metadata", "level": "INFO", "module": "config_loader", "msg": "Loading config from MinIO: bucket=cdm-lake, key=config-json/gtdb_metadata.json"}
{"time": "2025-07-10 05:19:43,289", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_metadata", "level": "INFO", "module": "config_loader", "msg": "Config loaded successfully from MinIO"}
{"time": "2025-07-10 05:19:43,290", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_metadata", "level": "INFO", "module": "config_loader", "msg": "All required fields are present in the config"}
{"time": "2025-07-10 05:19:43,290", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_metadata", "level": "DEBUG", "module": "config_loader", "msg": "Referential integrity rules: [{'foreign_key': 'sample_id', 'reference_table': 'sample', 'reference_column': 'sample_id', 'database': 'pangenome', 'action': 'log'}, {'foreign_key': 'n

False

In [5]:

# Check TSV format (step currently disabled; uncomment the two lines below to run it)
#from src.input_file_validator import validate_input_files
#validate_input_files(loader)







# Development helper (disabled): reload the Great Expectations module after editing it.
#import importlib
#import src.run_great_expectations_validations

#importlib.reload(src.run_great_expectations_validations)




In [6]:
# Great Expectations validation of the target table named in the config.
# NOTE(review): the function's closing message reports that validation and the
# checkpoint completed, not that they passed; check the logged
# "Validation summary" for failed expectations.
run_great_expectations_validation(spark, loader, logger)



{"time": "2025-07-10 05:19:52,911", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_metadata", "level": "DEBUG", "module": "config_loader", "msg": "Target table: pangenome.gtdb_metadata"}
{"time": "2025-07-10 05:19:52,912", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_metadata", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Starting Great Expectations validation for table: pangenome.gtdb_metadata with suite: default_suite"}
{"time": "2025-07-10 05:19:52,998", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_metadata", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Loaded Spark table: pangenome.gtdb_metadata"}
{"time": "2025-07-10 05:19:53,184", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_metadata", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Great Expectations context initialized."}
{"time": "2025-07-10 05:19:53,207", "pipeline": "pangenome

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{"time": "2025-07-10 05:20:02,216", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_metadata", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Applied GE expectation: expect_column_values_to_not_be_null with args {'column': 'accession'} → result: True"}


Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

{"time": "2025-07-10 05:20:02,661", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_metadata", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Applied GE expectation: expect_column_values_to_match_regex with args {'column': 'accession', 'regex': '^[A-Za-z0-9_.-]+$'} → result: True"}


Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

{"time": "2025-07-10 05:20:03,066", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_metadata", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Applied GE expectation: expect_column_values_to_be_between with args {'column': 'checkm_completeness', 'min_value': 0, 'max_value': 100} → result: True"}


Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

{"time": "2025-07-10 05:20:03,344", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_metadata", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Applied GE expectation: expect_column_values_to_be_between with args {'column': 'checkm_contamination', 'min_value': 0, 'max_value': 100} → result: True"}


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

{"time": "2025-07-10 05:20:03,354", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_metadata", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Applied GE expectation: expect_column_to_exist with args {'column': 'sample_id'} → result: False"}
{"time": "2025-07-10 05:20:03,383", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_metadata", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Expectations saved to suite."}


Calculating Metrics:   0%|          | 0/26 [00:00<?, ?it/s]

{"time": "2025-07-10 05:20:04,128", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_metadata", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Validation run completed."}
{"time": "2025-07-10 05:20:04,132", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_metadata", "level": "DEBUG", "module": "run_great_expectations_validations", "msg": "Validation summary: {'success': False, 'successful_expectations': 4, 'unsuccessful_expectations': 1, 'success_percent': 80.0}"}
{"time": "2025-07-10 05:20:05,786", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_metadata", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Data Docs built after validation."}


Calculating Metrics:   0%|          | 0/34 [00:00<?, ?it/s]

{"time": "2025-07-10 05:20:07,629", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_metadata", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Checkpoint executed."}
{"time": "2025-07-10 05:20:09,246", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_metadata", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Data Docs rebuilt with checkpoint results."}
✅ GE validation and checkpoint complete. Data Docs generated.


In [7]:
# Send this run's log file to MinIO.
log_file_path = logger.log_file_path
upload_to_minio(log_file_path)

✅ Uploaded log to MinIO at: s3://cdm-lake/logs/pangenome/pipeline_run_20250710_051923.log
