In [1]:
# Third-party imports.
# NOTE(review): boto3 and yaml are not referenced in any visible cell —
# confirm nothing relies on them before removing.
import boto3
import yaml

# Project-local imports, gathered at the top of the cell so every dependency
# of this cell is visible up front instead of being scattered between steps.
from config.config_loader import ConfigLoader
from src.input_file_validator import validate_input_files
from src.logger import setup_logger
from src.minio_uploader import upload_to_minio
from src.spark_session import start_spark_session
from src.validate_schema import validate_schema_against_file

# Identify this run. The three values are passed to setup_logger and show up
# as the "pipeline", "table" and "schema" fields of the recorded log lines.
pipeline_name = "pangenome"
target_table = "genome"
schema = "pangenome"

logger = setup_logger(
    pipeline_name=pipeline_name,
    target_table=target_table,
    schema=schema,
)

logger.info("Notebook started")

# Config identifier handed to ConfigLoader. In the recorded run the loader
# resolved it to s3a://cdm-lake/config-json/genome.json.
config_path = "genome"

try:
    # Load and validate the pipeline configuration.
    loader = ConfigLoader(config_path, logger)
    loader.load_and_validate()

    # Show which target table the loaded config points at.
    print(loader.get_target_table())

    # Start Spark; `spark` is used by later cells.
    spark = start_spark_session(logger=logger)

    # Validate input files.
    validate_input_files(loader, logger)

    # Check schema headers against the input file.
    validate_schema_against_file(loader, logger)
finally:
    # Upload the log file to MinIO even when one of the steps above raises.
    # Previously the upload was the last plain statement of the cell, so a
    # failing validation meant the log describing that failure was never
    # uploaded. The original exception still propagates after this runs.
    upload_to_minio(logger.log_file_path)

{"time": "2025-07-07 04:50:59,138", "pipeline": "pangenome", "schema": "pangenome", "table": "genome", "level": "INFO", "module": "3421723309", "msg": "Notebook started"}
{"time": "2025-07-07 04:50:59,149", "pipeline": "pangenome", "schema": "pangenome", "table": "genome", "level": "INFO", "module": "config_loader", "msg": "ConfigLoader initialized for target table: genome"}
{"time": "2025-07-07 04:50:59,150", "pipeline": "pangenome", "schema": "pangenome", "table": "genome", "level": "INFO", "module": "config_loader", "msg": "Resolved config path: s3a://cdm-lake/config-json/genome.json"}
{"time": "2025-07-07 04:50:59,151", "pipeline": "pangenome", "schema": "pangenome", "table": "genome", "level": "INFO", "module": "config_loader", "msg": "Loading config from MinIO: bucket=cdm-lake, key=config-json/genome.json"}
{"time": "2025-07-07 04:50:59,335", "pipeline": "pangenome", "schema": "pangenome", "table": "genome", "level": "INFO", "module": "config_loader", "msg": "Config loaded succes

{"time": "2025-07-07 04:52:02,997", "pipeline": "pangenome", "schema": "pangenome", "table": "genome", "level": "DEBUG", "module": "validate_schema", "msg": "Schema columns: ['genome_id', 'gtdb_taxonomy_id', 'gtdb_species_clade_id', 'ncbi_biosample_id', 'fna_file_path_nersc', 'faa_file_path_nersc']"}
{"time": "2025-07-07 04:52:02,997", "pipeline": "pangenome", "schema": "pangenome", "table": "genome", "level": "INFO", "module": "validate_schema", "msg": "Comparison between schema and file header completed"}
{"time": "2025-07-07 04:52:02,998", "pipeline": "pangenome", "schema": "pangenome", "table": "genome", "level": "INFO", "module": "validate_schema", "msg": "All schema columns are present in the file"}
{"time": "2025-07-07 04:52:02,998", "pipeline": "pangenome", "schema": "pangenome", "table": "genome", "level": "INFO", "module": "validate_schema", "msg": "No extra columns in the file"}
🔹 Header columns from file:
['genome_id', 'gtdb_species_clade_id', 'gtdb_taxonomy_id', 'ncbi_bios

In [None]:
# Check TSV format
# Requires `loader` and `logger` from the first cell. The logger is passed so
# this call uses the same (loader, logger) form as the call in the first cell,
# which is the form shown to work in the recorded run.
from src.input_file_validator import validate_input_files
validate_input_files(loader, logger)

In [None]:
# Run validations
# Calls run_validations_from_config with `loader`, the ConfigLoader instance
# created in the first cell; that cell must have been run in this kernel.
# NOTE(review): no logger is passed here, unlike the validator calls in the
# first cell — confirm against the function's signature that this is intended.
from src.run_validations import run_validations_from_config
run_validations_from_config(loader)

In [None]:
# Referential integrity
# Calls run_referential_integrity_checks with `spark` and `loader`, both of
# which are created in the first cell; that cell must have been run first.
# NOTE(review): no logger is passed here, unlike the validator calls in the
# first cell — confirm against the function's signature that this is intended.
from src.run_referential_integrity_checks import run_referential_integrity_checks
run_referential_integrity_checks(spark, loader)

In [None]:
# Development aid: re-execute src.run_great_expectations_validations so that
# edits made to the module's source are picked up without restarting the kernel.
# This must run before the `from ... import` in the following cell, otherwise
# that cell would bind the function object from the previously loaded module.
# NOTE(review): on a fresh Restart & Run All this reload is redundant —
# consider removing it (or using %autoreload) once the module is stable.
import importlib
import src.run_great_expectations_validations

importlib.reload(src.run_great_expectations_validations)

In [None]:
# Great Expectations
# Calls run_great_expectations_validation with `spark` and `loader`, both of
# which are created in the first cell; that cell must have been run first.
# NOTE(review): no logger is passed here, unlike the validator calls in the
# first cell — confirm against the function's signature that this is intended.
from src.run_great_expectations_validations import run_great_expectations_validation
run_great_expectations_validation(spark, loader)
