In [1]:
# --- Imports -----------------------------------------------------------------
# All imports for this cell are grouped up front so that a missing dependency
# fails immediately, before any logger/config/Spark side effects happen.
# NOTE(review): boto3 and yaml are not referenced in this cell; they are kept
# in case other cells or interactive use rely on them — confirm before removing.
import boto3
import yaml
from src.logger import setup_logger
from src.minio_uploader import upload_to_minio
from config.config_loader import ConfigLoader
from src.spark_session import start_spark_session

# --- Pipeline identity -------------------------------------------------------
# These three values are passed to the logger and appear as the "pipeline",
# "table" and "schema" fields of every structured log record it emits.
pipeline_name = "pangenome"
target_table = "gtdb_species_clade"
schema = "pangenome"

# Structured logger shared by every later cell in this notebook.
logger = setup_logger(
    pipeline_name=pipeline_name,
    target_table=target_table,
    schema=schema
)

logger.info("Notebook started")

# --- Configuration -----------------------------------------------------------
# Despite its name, this is not a literal path: it is the config/table name
# that ConfigLoader resolves to a JSON object in MinIO (the loader logs the
# resolved location, e.g. s3a://cdm-lake/config-json/<name>.json).
# Derived from target_table so the two identifiers cannot drift apart.
config_path = target_table

# Fetch the config and validate it; later cells read everything from `loader`.
loader = ConfigLoader(config_path, logger)
loader.load_and_validate()

# Sanity check: show which target table the loaded config declares.
print(loader.get_target_table())

# --- Spark -------------------------------------------------------------------
# Start the Spark session used by the integrity and Great Expectations cells.
spark = start_spark_session(logger=logger)

{"time": "2025-07-09 03:53:08,384", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_species_clade", "level": "INFO", "module": "927176468", "msg": "Notebook started"}
{"time": "2025-07-09 03:53:08,397", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_species_clade", "level": "INFO", "module": "config_loader", "msg": "ConfigLoader initialized for target table: gtdb_species_clade"}
{"time": "2025-07-09 03:53:08,398", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_species_clade", "level": "INFO", "module": "config_loader", "msg": "Resolved config path: s3a://cdm-lake/config-json/gtdb_species_clade.json"}
{"time": "2025-07-09 03:53:08,399", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_species_clade", "level": "INFO", "module": "config_loader", "msg": "Loading config from MinIO: bucket=cdm-lake, key=config-json/gtdb_species_clade.json"}
{"time": "2025-07-09 03:53:08,626", "pipeline": "pangenome", "schema": "pangenome", "table":

In [None]:



# Validation stage: runs every pre-load check against the loaded config.
# Requires `loader`, `logger` and `spark` from the setup cell.
# Imports are grouped first so an unavailable validator module is reported
# before any check has started, rather than midway through the stage.
from src.validate_schema import validate_schema_against_file
from src.run_validations import run_validations_from_config
from src.run_referential_integrity_checks import run_referential_integrity_checks

# Check schema headers
validate_schema_against_file(loader, logger)

# Check TSV format
# NOTE(review): this check is currently disabled; uncomment both lines to
# re-enable it — confirm whether it should be part of the standard run.
#from src.input_file_validator import validate_input_files
#validate_input_files(loader)

# Run validations
run_validations_from_config(loader, logger)

# Referential integrity
run_referential_integrity_checks(spark, loader, logger)




{"time": "2025-07-09 03:53:17,737", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_species_clade", "level": "DEBUG", "module": "config_loader", "msg": "Input files: [{'source': 'GTDB', 'file_path': 's3a://cdm-lake/bronze/gtdb/sp_clusters_r214.tsv', 'file_type': 'tsv', 'delimiter': '\t', 'ignore_first_line': 'no'}, {'source': 'NCBI', 'file_path': 's3a://cdm-lake/bronze/ncbi/assembly_summary_genbank.txt', 'file_type': 'tsv', 'delimiter': '\t', 'ignore_first_line': 'yes'}]"}
{"time": "2025-07-09 03:53:17,737", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_species_clade", "level": "INFO", "module": "input_file_validator", "msg": "Starting input file validation..."}
Validating input files...

{"time": "2025-07-09 03:53:17,743", "pipeline": "pangenome", "schema": "pangenome", "table": "gtdb_species_clade", "level": "INFO", "module": "input_file_validator", "msg": "Checking file: s3a://cdm-lake/bronze/gtdb/sp_clusters_r214.tsv"}
🔍 Checking: s3a://cdm-lake/bron

In [None]:
# Great Expectations
# Requires `spark`, `loader`, `logger` and `upload_to_minio` from the setup cell.
from src.run_great_expectations_validations import run_great_expectations_validation

try:
    run_great_expectations_validation(spark, loader, logger)
finally:
    # Upload log file to MinIO
    # Done in `finally` so the log is uploaded even when validation raises:
    # a failed run is exactly when the log is most needed. The original
    # exception still propagates after the upload.
    upload_to_minio(logger.log_file_path)