In [1]:
import boto3
import yaml
from src.logger import setup_logger
from src.minio_uploader import upload_to_minio

# Run identifiers. They are handed to the logger and appear on every log
# record as the "pipeline", "schema" and "table" fields.
pipeline_name = "pangenome"
schema = "pangenome"
target_table = "ncbi_env"

# Build the pipeline logger from the identifiers above.
logger_settings = {
    "pipeline_name": pipeline_name,
    "target_table": target_table,
    "schema": schema,
}
logger = setup_logger(**logger_settings)

# First record of the run.
logger.info("Notebook started")

from config.config_loader import ConfigLoader

# Config key for this run. Despite the variable name this is not a literal
# path: ConfigLoader resolves it to the JSON config stored in MinIO (this
# run's log shows s3a://cdm-lake/config-json/ncbi_env.json).
# It is derived from target_table so the table named in the logger labels and
# the config that is actually loaded cannot drift apart when the notebook is
# pointed at another table.
config_path = target_table

# Load and validate the config before any later step reads from it.
loader = ConfigLoader(config_path, logger)
loader.load_and_validate()

# Show which target table the loaded config reports.
print(loader.get_target_table())

# Start the Spark session. `spark` is passed to the referential-integrity and
# Great Expectations steps in later cells; the pipeline logger is handed in so
# session start-up is recorded in the same log.
from src.spark_session import start_spark_session
spark = start_spark_session(logger=logger)

{"time": "2025-07-11 19:11:37,380", "pipeline": "pangenome", "schema": "pangenome", "table": "ncbi_env", "level": "INFO", "module": "1035436919", "msg": "Notebook started"}
{"time": "2025-07-11 19:11:37,392", "pipeline": "pangenome", "schema": "pangenome", "table": "ncbi_env", "level": "INFO", "module": "config_loader", "msg": "ConfigLoader initialized for target table: ncbi_env"}
{"time": "2025-07-11 19:11:37,393", "pipeline": "pangenome", "schema": "pangenome", "table": "ncbi_env", "level": "INFO", "module": "config_loader", "msg": "Resolved config path: s3a://cdm-lake/config-json/ncbi_env.json"}
{"time": "2025-07-11 19:11:37,394", "pipeline": "pangenome", "schema": "pangenome", "table": "ncbi_env", "level": "INFO", "module": "config_loader", "msg": "Loading config from MinIO: bucket=cdm-lake, key=config-json/ncbi_env.json"}
{"time": "2025-07-11 19:11:37,654", "pipeline": "pangenome", "schema": "pangenome", "table": "ncbi_env", "level": "INFO", "module": "config_loader", "msg": "Conf

In [2]:
# Compare the columns found in the data file with the columns declared in the
# config schema. The report lists both column sets plus `missing_in_file` and
# `extra_in_file`.
from src.validate_schema import validate_schema_against_file

schema_report = validate_schema_against_file(loader, logger)
schema_report  # last expression of the cell, so the report is displayed

{"time": "2025-07-11 19:11:45,374", "pipeline": "pangenome", "schema": "pangenome", "table": "ncbi_env", "level": "INFO", "module": "validate_schema", "msg": "Starting schema vs. file validation"}
{"time": "2025-07-11 19:11:45,379", "pipeline": "pangenome", "schema": "pangenome", "table": "ncbi_env", "level": "DEBUG", "module": "config_loader", "msg": "Output file: {'output_file_path': 's3a://cdm-lake/pangenome-source/table_ncbi_env_V1.1.tsv', 'file_type': 'tsv', 'delimiter': '\t', 'ignore_first_line': 'no'}"}
{"time": "2025-07-11 19:11:45,380", "pipeline": "pangenome", "schema": "pangenome", "table": "ncbi_env", "level": "INFO", "module": "validate_schema", "msg": "Output file: s3a://cdm-lake/pangenome-source/table_ncbi_env_V1.1.tsv"}
{"time": "2025-07-11 19:11:45,380", "pipeline": "pangenome", "schema": "pangenome", "table": "ncbi_env", "level": "DEBUG", "module": "validate_schema", "msg": "Delimiter: 	, Ignore first line: False"}
{"time": "2025-07-11 19:11:47,672", "pipeline": "pang

{'file_columns': ['accession',
  'attribute_name',
  'content',
  'display_name',
  'harmonized_name',
  'id',
  'package_content'],
 'schema_columns': ['accession',
  'attribute_name',
  'content',
  'display_name',
  'harmonized_name',
  'id',
  'package_content'],
 'missing_in_file': [],
 'extra_in_file': []}

In [3]:
# Run the validations listed in the config against the data file. The report
# carries `file_path`, `validation_errors` and an overall `success` flag.
from src.run_validations import run_validations_from_config

validation_report = run_validations_from_config(loader, logger)
validation_report  # last expression of the cell, so the report is displayed

{"time": "2025-07-11 19:11:52,525", "pipeline": "pangenome", "schema": "pangenome", "table": "ncbi_env", "level": "DEBUG", "module": "config_loader", "msg": "Output file: {'output_file_path': 's3a://cdm-lake/pangenome-source/table_ncbi_env_V1.1.tsv', 'file_type': 'tsv', 'delimiter': '\t', 'ignore_first_line': 'no'}"}
{"time": "2025-07-11 19:11:52,526", "pipeline": "pangenome", "schema": "pangenome", "table": "ncbi_env", "level": "INFO", "module": "run_validations", "msg": "Starting validation on file: s3a://cdm-lake/pangenome-source/table_ncbi_env_V1.1.tsv"}
{"time": "2025-07-11 19:11:52,527", "pipeline": "pangenome", "schema": "pangenome", "table": "ncbi_env", "level": "DEBUG", "module": "run_validations", "msg": "Delimiter: 	, Ignore first line: False"}
{"time": "2025-07-11 19:11:52,527", "pipeline": "pangenome", "schema": "pangenome", "table": "ncbi_env", "level": "DEBUG", "module": "config_loader", "msg": "Validations: [{'column': 'accession', 'validation_type': 'not_null', 'error_

{'file_path': 's3a://cdm-lake/pangenome-source/table_ncbi_env_V1.1.tsv',
 'validation_errors': [],
 'success': True}

In [4]:
# Run the referential integrity checks on the Spark session.
# NOTE(review): in the recorded run the config listed no referential integrity
# rules and this call displayed False. Confirm whether False means "a check
# failed" or "nothing to check" before relying on the value.
from src.run_referential_integrity_checks import run_referential_integrity_checks

integrity_result = run_referential_integrity_checks(spark, loader, logger)
integrity_result  # last expression of the cell, so the result is displayed

{"time": "2025-07-11 19:12:15,849", "pipeline": "pangenome", "schema": "pangenome", "table": "ncbi_env", "level": "INFO", "module": "config_loader", "msg": "Loading config from MinIO: bucket=cdm-lake, key=config-json/ncbi_env.json"}
{"time": "2025-07-11 19:12:15,872", "pipeline": "pangenome", "schema": "pangenome", "table": "ncbi_env", "level": "INFO", "module": "config_loader", "msg": "Config loaded successfully from MinIO"}
{"time": "2025-07-11 19:12:15,873", "pipeline": "pangenome", "schema": "pangenome", "table": "ncbi_env", "level": "INFO", "module": "config_loader", "msg": "All required fields are present in the config"}
{"time": "2025-07-11 19:12:15,874", "pipeline": "pangenome", "schema": "pangenome", "table": "ncbi_env", "level": "DEBUG", "module": "config_loader", "msg": "Referential integrity rules: []"}
{"time": "2025-07-11 19:12:15,875", "pipeline": "pangenome", "schema": "pangenome", "table": "ncbi_env", "level": "DEBUG", "module": "config_loader", "msg": "Target table: p

False

In [None]:

# Check TSV format (currently disabled: uncomment the two lines below to run it)
#from src.input_file_validator import validate_input_files
#validate_input_files(loader)







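# Development helper (disabled): reload the Great Expectations validation
# module so code edits are picked up without restarting the kernel.
#import importlib
#import src.run_great_expectations_validations

#importlib.reload(src.run_great_expectations_validations)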




In [5]:
# Great Expectations
# Runs the Great Expectations validation for the configured target table on
# the Spark session, then ships the run's log file to MinIO.
from src.run_great_expectations_validations import run_great_expectations_validation

try:
    run_great_expectations_validation(spark, loader, logger)
finally:
    # Upload log file to MinIO.
    # Done in `finally` so the log is still uploaded when validation raises:
    # a failed run is exactly when the log is needed. The original exception
    # still propagates after the upload.
    upload_to_minio(logger.log_file_path)

{"time": "2025-07-11 19:12:30,132", "pipeline": "pangenome", "schema": "pangenome", "table": "ncbi_env", "level": "DEBUG", "module": "config_loader", "msg": "Target table: pangenome.ncbi_env"}
{"time": "2025-07-11 19:12:30,133", "pipeline": "pangenome", "schema": "pangenome", "table": "ncbi_env", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Starting Great Expectations validation for table: pangenome.ncbi_env with suite: default_suite"}
{"time": "2025-07-11 19:12:37,662", "pipeline": "pangenome", "schema": "pangenome", "table": "ncbi_env", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Loaded Spark table: pangenome.ncbi_env"}
{"time": "2025-07-11 19:12:37,817", "pipeline": "pangenome", "schema": "pangenome", "table": "ncbi_env", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Great Expectations context initialized."}
{"time": "2025-07-11 19:12:37,844", "pipeline": "pangenome", "schema": "pangenome", "table": 

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{"time": "2025-07-11 19:12:44,669", "pipeline": "pangenome", "schema": "pangenome", "table": "ncbi_env", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Applied GE expectation: expect_column_values_to_not_be_null with args {'column': 'accession'} → result: True"}


Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

{"time": "2025-07-11 19:12:45,595", "pipeline": "pangenome", "schema": "pangenome", "table": "ncbi_env", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Applied GE expectation: expect_column_values_to_match_regex with args {'column': 'accession', 'regex': '^[A-Za-z0-9_.\\-]+$'} → result: True"}
{"time": "2025-07-11 19:12:45,632", "pipeline": "pangenome", "schema": "pangenome", "table": "ncbi_env", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Expectations saved to suite."}


Calculating Metrics:   0%|          | 0/12 [00:00<?, ?it/s]

{"time": "2025-07-11 19:12:46,447", "pipeline": "pangenome", "schema": "pangenome", "table": "ncbi_env", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Validation run completed."}
{"time": "2025-07-11 19:12:46,450", "pipeline": "pangenome", "schema": "pangenome", "table": "ncbi_env", "level": "DEBUG", "module": "run_great_expectations_validations", "msg": "Validation summary: {'success': True, 'successful_expectations': 2, 'unsuccessful_expectations': 0, 'success_percent': 100.0}"}
{"time": "2025-07-11 19:12:48,440", "pipeline": "pangenome", "schema": "pangenome", "table": "ncbi_env", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Data Docs built after validation."}


Calculating Metrics:   0%|          | 0/16 [00:00<?, ?it/s]

{"time": "2025-07-11 19:12:50,726", "pipeline": "pangenome", "schema": "pangenome", "table": "ncbi_env", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Checkpoint executed."}
{"time": "2025-07-11 19:12:52,989", "pipeline": "pangenome", "schema": "pangenome", "table": "ncbi_env", "level": "INFO", "module": "run_great_expectations_validations", "msg": "Data Docs rebuilt with checkpoint results."}
✅ GE validation and checkpoint complete. Data Docs generated.
✅ Uploaded log to MinIO at: s3://cdm-lake/logs/pangenome/pipeline_run_20250711_191137.log
