In [None]:
import os
import shutil
import subprocess
import tempfile
import pytest
from pathlib import Path
import hashlib
import json
from importlib import import_module

# Try to import directly from the installed package first.
try:
    from vcfstash.initializer import DatabaseInitializer
    from vcfstash.base import VCFDatabase
except ImportError:
    # If running tests from the repo without installing: put the parent of
    # this file's directory on sys.path and import the modules by bare name.
    import sys
    # ``__file__`` is not defined when this code is executed as a notebook
    # cell, so look it up defensively. With an empty dirname the joined path
    # is '..', which abspath resolves relative to the current working directory.
    _module_dir = os.path.dirname(globals().get('__file__', ''))
    sys.path.append(os.path.abspath(os.path.join(_module_dir, '..')))
    from initializer import DatabaseInitializer
    from base import VCFDatabase

# Test data paths - adjust as needed.
# ``__file__`` is not defined when this code is executed as a notebook cell,
# so it is looked up defensively: an empty dirname makes abspath() resolve to
# the current working directory instead of raising NameError.
TEST_ROOT = os.path.abspath(os.path.dirname(globals().get('__file__', '')))
TEST_DATA = os.path.join(TEST_ROOT, 'data')
NODATA_DIR = os.path.join(TEST_DATA, 'nodata')
TEST_CONFIG = os.path.join(TEST_ROOT, 'config', 'nextflow_test.config')
TEST_VCF = os.path.join(NODATA_DIR, 'crayz_db.bcf')

In [None]:
@pytest.fixture
def temp_output_dir():
    """Yield a fresh temporary directory for test outputs.

    The directory is created with the ``vcfstash_test_`` prefix. After the
    test finishes it is deleted, unless the ``VCFSTASH_TEST_DEBUG``
    environment variable is set to a non-empty value, in which case its
    location is printed and the directory is left in place.
    """
    workdir = tempfile.mkdtemp(prefix="vcfstash_test_")
    yield workdir

    # Teardown: keep the outputs only when debug mode is requested.
    keep_outputs = os.environ.get('VCFSTASH_TEST_DEBUG')
    if keep_outputs:
        print(f"\nTest output preserved at: {workdir}")
    else:
        shutil.rmtree(workdir)

@pytest.fixture
def stash_initializer(temp_output_dir):
    """Build a DatabaseInitializer wired to the test VCF and test config.

    The instance reads ``TEST_VCF``, writes into the ``temp_output_dir``
    fixture directory, uses ``TEST_CONFIG`` as its configuration file, and is
    constructed with ``force=True`` and ``verbose=2``.
    """
    settings = {
        "input_file": TEST_VCF,
        "output_dir": temp_output_dir,
        "config_file": TEST_CONFIG,
        "force": True,
        "verbose": 2,
    }
    return DatabaseInitializer(**settings)

def compute_md5(file_path):
    """Return the hexadecimal MD5 digest of the file at ``file_path``.

    The file is opened in binary mode and consumed in 4096-byte reads, so the
    whole file is never held in memory at once.
    """
    digest = hashlib.md5()
    with open(file_path, 'rb') as stream:
        while True:
            block = stream.read(4096)
            if block == b'':
                # An empty read signals end of file.
                break
            digest.update(block)
    return digest.hexdigest()

In [None]:
def test_stash_init_creates_directory_structure(stash_initializer, temp_output_dir):
    """Test that stash-init creates the expected directory structure."""
    # Run the initialization
    stash_initializer.initialize()

    # Check that the expected directories were created
    assert os.path.exists(os.path.join(temp_output_dir, 'blueprint')), "Blueprint directory was not created"
    assert os.path.exists(os.path.join(temp_output_dir, 'stash')), "Annotations directory was not created"
    assert os.path.exists(os.path.join(temp_output_dir, 'workflow')), "Workflow directory was not created"

    # Check for expected workflow files
    assert os.path.exists(os.path.join(temp_output_dir, 'workflow', 'main.nf')), "main.nf was not created"
    assert os.path.exists(os.path.join(temp_output_dir, 'workflow', 'init_nextflow.config')), "nextflow.config was not created"
    assert os.path.exists(os.path.join(temp_output_dir, 'workflow', 'modules')), "modules directory was not created"

    # Check for expected blueprint files
    assert os.path.exists(os.path.join(temp_output_dir, 'blueprint', 'vcfstash.bcf')), "Normalized BCF was not created"
    assert os.path.exists(os.path.join(temp_output_dir, 'blueprint', 'vcfstash.bcf.csi')), "BCF index was not create

In [None]:
def test_stash_init_normalizes_vcf(stash_initializer, temp_output_dir):
    """Test that the input VCF is properly normalized."""
    # Run the initialization
    stash_initializer.initialize()

    # Path to the normalized BCF
    normalized_bcf = os.path.join(temp_output_dir, 'blueprint', 'vcfstash.bcf')

    # Ensure the file exists
    assert os.path.exists(normalized_bcf), "Normalized BCF file not created"

    # Check file size is reasonable (not empty)
    assert os.path.getsize(normalized_bcf) > 1000, "Normalized BCF file seems too small"

    # Validate the BCF using bcftools
    result = subprocess.run(
        ["bcftools", "view", "--header-only", normalized_bcf],
        capture_output=True,
        text=True
    )
    assert result.returncode == 0, f"BCF validation failed: {result.stderr}"

    # Check that the header contains expected information
    header = result.stdout
    assert "##fileformat=VCFv" in header, "Missing VCF format in header"

    # Optional: Check if genotypes were properly removed (for stash-init)
    result = subprocess.run(
        ["bcftools", "view", "-H", normalized_bcf],
        capture_output=True,
        text=True
    )
    sample_line = result.stdout.split('\n')[0] if result.stdout else ""
    # The last fields should be FORMAT and possibly sample IDs
    fields = sample_line.split('\t')
    # For stash-init, we expect the GT field to be removed
    assert len(fields) <= 8, "Genotype information was not removed during

In [None]:
def test_stash_init_workflow_execution(stash_initializer, temp_output_dir):
    """Test that the NextFlow workflow executes successfully."""
    # Run the initialization
    stash_initializer.initialize()

    # Check for the existence of nextflow output files that indicate successful execution
    report_file = os.path.join(temp_output_dir, 'blueprint', 'init_report.html')
    trace_file = os.path.join(temp_output_dir, 'blueprint', 'init_trace.txt')

    assert os.path.exists(report_file), "NextFlow report file was not created"
    assert os.path.exists(trace_file), "NextFlow trace file was not created"

    # Check the log file for errors
    workflow_log = os.path.join(temp_output_dir, 'workflow', 'workflow.log')
    if os.path.exists(workflow_log):
        with open(workflow_log, 'r') as f:
            log_content = f.read()
            assert "ERROR" not in log_content, f"Workflow log contains errors: {log_content}"

    # Alternative: check the vcfdb.log
    vcfdb_log = os.path.join(temp_output_dir, 'vcfdb.log')
    if os.path.exists(vcfdb_log):
        with open(vcfdb_log, 'r') as f:
            log_content = f.read()
            assert "ERROR" not in log_content, f"VCF database log contains errors: {log_content}"

In [None]:
def test_stash_init_cli(temp_output_dir):
    """Test the stash-init function through the command-line interface."""
    # Use subprocess to call the actual CLI command
    cmd = [
        "python",
        os.path.join(os.path.dirname(TEST_ROOT), "vcfstash.py"),
        "stash-init",
        "--vcf", TEST_VCF,
        "--output", temp_output_dir,
        "-c", TEST_CONFIG,
        "-f"  # Force mode
    ]

    result = subprocess.run(cmd, capture_output=True, text=True)

    # Check that the command executed successfully
    assert result.returncode == 0, f"CLI command failed: {result.stderr}"

    # Check output for expected messages
    assert f"Creating stash structure: {temp_output_dir}" in result.stdout, "Expected output message not found"

    # Verify the directory structure
    assert os.path.exists(os.path.join(temp_output_dir, 'blueprint')), "