## Explore GNOMIX

- In this example, we use the demo VCF file provided in the repository.

In [1]:
import os
from collections import Counter
import logging
import sys
import shutil

# Load environment variables from .env file
from dotenv import load_dotenv
# Locate the project's .env file. The notebook normally runs from a
# sub-directory of the project root, so the parent of the current working
# directory is tried first.
notebook_dir = os.getcwd()
project_root = os.path.dirname(notebook_dir)
env_path = os.path.join(project_root, '.env')

# A later cell calls os.chdir() into the project working directory. If this
# cell is re-run after that, the parent directory no longer holds the .env
# file, so fall back to the current directory when the file is found there.
if not os.path.isfile(env_path) and os.path.isfile(os.path.join(notebook_dir, '.env')):
    project_root = notebook_dir
    env_path = os.path.join(project_root, '.env')

# load_dotenv() returns False when no variable was set (for example when the
# file is missing); report that instead of failing silently further down.
if not load_dotenv(env_path, override=True):
    print(f"WARNING: no environment variables were loaded from {env_path}")

print(env_path)

/home/lakishadavid/computational_genetic_genealogy/.env


In [2]:
# Read the project directory layout from environment variables.
# os.getenv() returns None for a variable that is not set.
working_directory = os.getenv('PROJECT_WORKING_DIR')
data_directory = os.getenv('PROJECT_DATA_DIR')
references_directory = os.getenv('PROJECT_REFERENCES_DIR')
results_directory = os.getenv('PROJECT_RESULTS_DIR')
utils_directory = os.getenv('PROJECT_UTILS_DIR')

print(f"Working Directory: {working_directory}")
print(f"Data Directory: {data_directory}")
print(f"References Directory: {references_directory}")
print(f"Results Directory: {results_directory}")
print(f"Utils Directory: {utils_directory}")

# Report unset variables up front: later cells pass these values to path
# functions, which fail with an unhelpful TypeError when given None.
missing_vars = [
    name
    for name, value in (
        ('PROJECT_DATA_DIR', data_directory),
        ('PROJECT_REFERENCES_DIR', references_directory),
        ('PROJECT_RESULTS_DIR', results_directory),
        ('PROJECT_UTILS_DIR', utils_directory),
    )
    if not value
]
if missing_vars:
    print(f"WARNING: environment variables not set: {', '.join(missing_vars)}")

# The working directory is required immediately, so fail with a clear message.
if not working_directory:
    raise RuntimeError(
        "PROJECT_WORKING_DIR is not set; check that the .env file was found and loaded."
    )

os.chdir(working_directory)
print(f"The current directory is {os.getcwd()}")

Working Directory: /home/lakishadavid/computational_genetic_genealogy
Data Directory: /home/lakishadavid/computational_genetic_genealogy/data
References Directory: /home/lakishadavid/computational_genetic_genealogy/references
Results Directory: /home/lakishadavid/computational_genetic_genealogy/results
Utils Directory: /home/lakishadavid/computational_genetic_genealogy/utils
The current directory is /home/lakishadavid/computational_genetic_genealogy


In [3]:
def _resolve_log_level(level, default=logging.INFO):
    """Translate a level name or number into a numeric logging level.

    Args:
        level (str | int): A level name such as "debug" (case-insensitive) or
            a numeric level such as logging.DEBUG.
        default (int): Level returned when `level` is not recognised.

    Returns:
        int: The numeric logging level.
    """
    if isinstance(level, bool):
        # bool is a subclass of int but is never a meaningful level.
        return default
    if isinstance(level, int):
        return level
    resolved = getattr(logging, str(level).upper(), None)
    # The logging module also exposes non-level upper-case attributes
    # (e.g. BASIC_FORMAT, a string); accept only genuine numeric levels.
    if isinstance(resolved, int) and not isinstance(resolved, bool):
        return resolved
    return default


def configure_logging(log_filename, log_file_debug_level="INFO", console_debug_level="INFO"):
    """
    Configure logging for both file and console handlers.

    A file handler and a console (stdout) handler are added to the root
    logger. Handlers already attached to the root logger are left in place,
    so remove them first if this is called more than once.

    Args:
        log_filename (str): Path to the log file where logs will be written.
        log_file_debug_level (str | int): Logging level for the file handler,
            as a name ("INFO", "debug", ...) or a numeric level.
            Unrecognised values fall back to INFO.
        console_debug_level (str | int): Logging level for the console
            handler; same accepted values and fallback.
    """
    # Create a root logger
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)  # Capture all messages at the root level

    # Convert level names (or numbers) to numeric levels
    file_level = _resolve_log_level(log_file_debug_level)
    console_level = _resolve_log_level(console_debug_level)

    # File handler: Logs messages at file_level and above to the file
    file_handler = logging.FileHandler(log_filename)
    file_handler.setLevel(file_level)
    file_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    file_handler.setFormatter(file_formatter)

    # Console handler: Logs messages at console_level and above to the console
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setLevel(console_level)
    console_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    console_handler.setFormatter(console_formatter)

    # Add handlers to the root logger
    logger.addHandler(file_handler)
    logger.addHandler(console_handler)
    
def clear_logger():
    """Remove all handlers from the root logger and close them.

    Closing matters for file handlers: removing a handler without closing it
    leaves its log file open for the lifetime of the kernel.
    """
    logger = logging.getLogger()
    # Iterate over a copy because removeHandler() mutates logger.handlers.
    for handler in logger.handlers[:]:
        logger.removeHandler(handler)
        handler.close()

In [5]:
log_filename = os.path.join(results_directory, "lab6.log")
print(f"The Lab 6 log file is located at {log_filename}.")

# Ensure the results_directory exists. exist_ok=True does the check and the
# creation in one call, so the cell is safe to re-run.
os.makedirs(results_directory, exist_ok=True)

# Make sure the log file exists. Append mode creates the file when it is
# missing and never truncates a log that is already there.
with open(log_filename, 'a'):
    pass

The Lab 6 log file is located at /home/lakishadavid/computational_genetic_genealogy/results/lab6.log.


In [6]:
# Remove every handler currently attached to the root logger (for example
# handlers added by an earlier run of this cell), then attach a file handler
# writing to log_filename and a console handler, both at INFO level.
clear_logger() # Clear the logger before reconfiguring it
configure_logging(log_filename, log_file_debug_level="INFO", console_debug_level="INFO")

Here is a demo notebook from the Gnomix developers: https://github.com/AI-sandbox/gnomix/blob/main/demo.ipynb

The supervised machine learning model can learn the distinctive allele-frequency patterns and linkage structures that are characteristic of each population.

In [11]:
%%bash -s "$utils_directory"

utils_directory=$1

# Stop early when no target directory was passed in; an empty value would
# make the paths below resolve to "/gnomix".
if [ -z "$utils_directory" ]; then
    echo "ERROR: utils_directory is empty; run the configuration cells first." >&2
    exit 1
fi

# All paths are quoted so that directories containing spaces work.
gnomix_dir="$utils_directory/gnomix"

# Create subdirectories in the utils directory
mkdir -p "$gnomix_dir"

# Clone the gnomix repository, unless an earlier run of this cell already did
# (git refuses to clone into a directory that is not empty).
if [ -d "$gnomix_dir/.git" ]; then
    echo "gnomix is already cloned in $gnomix_dir; skipping git clone."
else
    git clone https://github.com/AI-sandbox/gnomix "$gnomix_dir" || exit 1
fi

# Without this guard a failed cd would make the loop below read whatever
# requirements.txt happens to be in the current directory.
cd "$gnomix_dir" || exit 1

# Install dependencies from requirements.txt using Poetry.
# This loop reads each line of the requirements file and adds the dependency.
# A dependency that cannot be added does not stop the loop; failures are
# collected and summarised once at the end.
failed_deps=()
while IFS= read -r dep || [ -n "$dep" ]; do
    # Skip empty lines or lines that start with '#' (comments)
    if [ -n "$dep" ] && [[ ! "$dep" =~ ^# ]]; then
        echo "Adding dependency: $dep"
        # stdin is redirected so poetry cannot consume lines of
        # requirements.txt that the loop has not read yet.
        if ! poetry add -D "$dep" < /dev/null; then
            failed_deps+=("$dep")
        fi
    fi
done < requirements.txt

if [ "${#failed_deps[@]}" -gt 0 ]; then
    echo "WARNING: ${#failed_deps[@]} dependencies could not be added: ${failed_deps[*]}" >&2
fi

Cloning into '/home/lakishadavid/computational_genetic_genealogy/utils/gnomix'...


Adding dependency: matplotlib==3.3.4

Updating dependencies
Resolving dependencies...



Incompatible constraints in requirements of bagg-analysis (0.1.0):
matplotlib (>=3.4,<4.0)
matplotlib (==3.3.4)


Adding dependency: numpy==1.20.3

Updating dependencies
Resolving dependencies...



Incompatible constraints in requirements of bagg-analysis (0.1.0):
numpy (>=1.23)
numpy (==1.20.3)


Adding dependency: pandas==1.3.5

Updating dependencies
Resolving dependencies...



Incompatible constraints in requirements of bagg-analysis (0.1.0):
pandas (>=2.2.3,<3.0.0)
pandas (==1.3.5)


Adding dependency: PyYAML==5.1.2

Updating dependencies
Resolving dependencies...



Because no versions of cfn-lint match >1.22.2,<1.22.3 || >1.22.3,<1.22.4 || >1.22.4,<1.22.5 || >1.22.5,<1.22.6 || >1.22.6,<1.22.7 || >1.22.7,<1.23.0 || >1.23.0,<1.23.1 || >1.23.1,<1.24.0 || >1.24.0,<2.0.0
 and cfn-lint (1.22.3) depends on pyyaml (>5.4), cfn-lint (>1.22.2,<1.22.4 || >1.22.4,<1.22.5 || >1.22.5,<1.22.6 || >1.22.6,<1.22.7 || >1.22.7,<1.23.0 || >1.23.0,<1.23.1 || >1.23.1,<1.24.0 || >1.24.0,<2.0.0) requires pyyaml (>5.4).
And because cfn-lint (1.22.4) depends on pyyaml (>5.4)
 and cfn-lint (1.22.5) depends on pyyaml (>5.4), cfn-lint (>1.22.2,<1.22.6 || >1.22.6,<1.22.7 || >1.22.7,<1.23.0 || >1.23.0,<1.23.1 || >1.23.1,<1.24.0 || >1.24.0,<2.0.0) requires pyyaml (>5.4).
And because cfn-lint (1.22.6) depends on pyyaml (>5.4)
 and cfn-lint (1.22.7) depends on pyyaml (>5.4), cfn-lint (>1.22.2,<1.23.0 || >1.23.0,<1.23.1 || >1.23.1,<1.24.0 || >1.24.0,<2.0.0) requires pyyaml (>5.4).
And because cfn-lint (1.23.0) depends on pyyaml (>5.4)
 and cfn-lint (1.23.1) depends on pyyaml (>5.4)

Adding dependency: scikit-allel==1.3.1

Updating dependencies
Resolving dependencies...

Package operations: 10 installs, 0 updates, 0 removals

  - Installing locket (1.0.0)
  - Installing toolz (1.0.0)
  - Installing zipp (3.21.0)
  - Installing click (8.1.8)
  - Installing cloudpickle (3.1.1)
  - Installing fsspec (2025.2.0)
  - Installing importlib-metadata (8.6.1)
  - Installing partd (1.4.2)
  - Installing dask (2025.1.0)
  - Installing scikit-allel (1.3.1)

PEP517 build of a dependency failed

Backend subprocess exited when trying to invoke build_wheel

    | Command '['/tmp/tmpy6492k4b/.venv/bin/python', '/home/lakishadavid/.local/share/pypoetry/venv/lib/python3.10/site-packages/pyproject_hooks/_in_process/_in_process.py', 'build_wheel', '/tmp/tmp8jj6zpq5']' returned non-zero exit status 1.
    | 
    | [scikit-allel] setup extensions without cython
    | Traceback (most recent call last):
    |   File "/tmp/tmpy6492k4b/.venv/lib/python3.10/site-packages/setuptools_scm/_integra


Incompatible constraints in requirements of bagg-analysis (0.1.0):
scipy (>=1.14.1,<2.0.0)
scipy (==1.5.3)


Adding dependency: seaborn==0.11.2

Updating dependencies
Resolving dependencies...



Incompatible constraints in requirements of bagg-analysis (0.1.0):
seaborn (>=0.13.2,<0.14.0)
seaborn (==0.11.2)


Adding dependency: sklearn-crfsuite==0.3.6

Updating dependencies
Resolving dependencies...

Package operations: 3 installs, 0 updates, 0 removals

  - Installing python-crfsuite (0.9.11)
  - Installing tabulate (0.9.0)
  - Installing sklearn-crfsuite (0.3.6)

Writing lock file
Adding dependency: tqdm==4.62.3

Updating dependencies
Resolving dependencies...



Incompatible constraints in requirements of bagg-analysis (0.1.0):
tqdm (>=4.67.1,<5.0.0)
tqdm (==4.62.3)


Adding dependency: uncertainty-calibration==0.0.7

Updating dependencies
Resolving dependencies...

Package operations: 3 installs, 0 updates, 0 removals

  - Installing parameterized (0.9.0)
  - Installing sklearn (0.0.post12)

PEP517 build of a dependency failed

Backend subprocess exited when trying to invoke get_requires_for_build_wheel

    | Command '['/tmp/tmp224leml2/.venv/bin/python', '/home/lakishadavid/.local/share/pypoetry/venv/lib/python3.10/site-packages/pyproject_hooks/_in_process/_in_process.py', 'get_requires_for_build_wheel', '/tmp/tmp7coa2dkx']' returned non-zero exit status 1.
    | 
    | The 'sklearn' PyPI package is deprecated, use 'scikit-learn'
    | rather than 'sklearn' for pip commands. 
    | 
    | Here is how to fix this error in the main use cases:
    | - use 'pip install scikit-learn' rather than 'pip install sklearn'
    | - replace 'sklearn' by 'scikit-learn' in your pip requirements files
    |   (requirements.txt, setup.py, setup.cfg, Pipfile, etc 

In [12]:
import numpy as np
import os
import pandas as pd

In [None]:
# Arguments ## REVISE
# The demo data paths are built from utils_directory, the directory gnomix is
# cloned into. An earlier cell changes the working directory to the project
# working directory, so a relative "../utils/..." path would be resolved from
# there and point outside the project.
# The trailing "" keeps a path separator at the end of data_path, so the file
# names below can still be appended by plain string concatenation.
data_path        = os.path.join(utils_directory, "gnomix", "demo", "data", "")
query_file       = data_path + "ALL.chr22.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz"
genetic_map_file = data_path + "allchrs.b37.gmap"
reference_file   = data_path + "reference_1000g.vcf"
sample_map_file  = data_path + "1000g.smap"
chm              = "22"
# Kept as a string; presumably forwarded as a command-line argument -- TODO confirm.
phase            = "False"
# NOTE(review): still relative to the current working directory -- confirm the intended location.
output_basename  = "./demo/output"

In [None]:
# Download pre-trained models by running the helper shell script with `sh`.
# NOTE(review): the script name is resolved against the notebook's current
# working directory. Presumably the script lives in the cloned gnomix
# checkout -- confirm the working directory before running this cell.
!sh download_pretrained_models.sh

In [None]:
# Third cell - Import required libraries and set up paths
import os
import subprocess
from pathlib import Path

# --- Run settings: edit these before executing the cells that follow ---
# Location of the query VCF file; the value below is only a placeholder.
VCF_INPUT = "path/to/your/input.vcf.gz"
# Chromosome to analyse, written as a string.
CHROMOSOME = "22"
# Phase-correction switch, spelled as the string "True" or "False".
USE_PHASE_CORRECTION = "True"
# Directory that will receive the results.
OUTPUT_DIR = "gnomix_results"

# Make sure the results directory is present; nothing happens if it already exists.
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

In [None]:
# Fourth cell - Run Gnomix using pre-trained model
def run_gnomix(vcf_input, output_dir, chr_nr, phase, gnomix_dir=None):
    """Run gnomix.py with a pre-trained model in a subprocess.

    Args:
        vcf_input: Path to the query VCF file.
        output_dir: Output location passed to gnomix.py.
        chr_nr: Chromosome identifier (str or int). It is passed to gnomix.py
            and selects the model file
            pretrained_gnomix_models/chr<chr_nr>/default_model.pkl.
        phase: Phase-correction flag forwarded to gnomix.py; non-string
            values are converted with str() (True -> "True").
        gnomix_dir: Optional directory to run the command in. The script name
            "gnomix.py" and the model path are relative, so they are resolved
            against this directory. When given, vcf_input and output_dir are
            made absolute first so they keep referring to the caller's
            working directory. None (the default) runs in the current
            working directory and passes the paths through unchanged.

    Returns:
        tuple[str, str]: The captured stdout and stderr of the process. A
        non-zero exit status is additionally logged as an error.
    """
    # Get path to pre-trained model for specified chromosome
    model_path = f"pretrained_gnomix_models/chr{chr_nr}/default_model.pkl"

    if gnomix_dir is not None:
        # The child runs in a different directory; pin the caller's paths.
        vcf_input = os.path.abspath(vcf_input)
        output_dir = os.path.abspath(output_dir)

    # Construct command. The interpreter running this notebook is used, so
    # the script sees the same installed packages as the kernel; "python3"
    # from PATH is only a fallback. subprocess requires string arguments,
    # hence the str() conversions.
    cmd = [
        sys.executable or "python3",
        "gnomix.py",
        str(vcf_input),
        str(output_dir),
        str(chr_nr),
        str(phase),
        model_path,
    ]

    # Run command
    process = subprocess.run(
        cmd,
        capture_output=True,
        text=True,
        cwd=gnomix_dir,
    )

    # Surface failures: without this a failed run looks like a successful one
    # unless the caller reads stderr.
    if process.returncode != 0:
        logging.error("gnomix.py exited with status %d", process.returncode)

    return process.stdout, process.stderr

# Launch the Gnomix run with the configured settings and keep both output streams.
stdout, stderr = run_gnomix(
    vcf_input=VCF_INPUT,
    output_dir=OUTPUT_DIR,
    chr_nr=CHROMOSOME,
    phase=USE_PHASE_CORRECTION,
)

# Echo what the subprocess wrote so that problems are visible in the notebook.
print("STDOUT:", stdout)
print("STDERR:", stderr)

In [None]:
# Fifth cell - Load and examine results
import pandas as pd

def _read_gnomix_table(path):
    """Read a tab-separated results file into a DataFrame, skipping its first line."""
    return pd.read_csv(path, sep='\t', skiprows=1)


# MSP file (single ancestry estimates): read it and show the first rows.
msp_file = os.path.join(OUTPUT_DIR, "query_results.msp")
msp_results = _read_gnomix_table(msp_file)
print("MSP Results Preview:")
print(msp_results.head())

# FB file (probability estimates): read it and show the first rows.
fb_file = os.path.join(OUTPUT_DIR, "query_results.fb")
fb_results = _read_gnomix_table(fb_file)
print("\nFB Results Preview:")
print(fb_results.head())

In [None]:
# Sixth cell - Optional: Visualize results if enabled in config
# Note: This requires setting visualize_inference: True in config.yaml

# Load and modify config
import yaml

# Read the current settings. safe_load() returns None for an empty file, so
# fall back to an empty mapping in that case.
with open('config.yaml', 'r') as f:
    config = yaml.safe_load(f) or {}

# Enable visualization (the 'inference' section is created if it is absent)
config.setdefault('inference', {})['visualize_inference'] = True

# Save modified config. sort_keys=False keeps the keys in the order they were
# read; the default would rewrite the file with keys sorted alphabetically.
# Note: comments in the original file are not preserved by this round trip.
with open('config.yaml', 'w') as f:
    yaml.dump(config, f, sort_keys=False)

# Re-run Gnomix with visualization enabled
stdout, stderr = run_gnomix(
    VCF_INPUT,
    OUTPUT_DIR,
    CHROMOSOME,
    USE_PHASE_CORRECTION
)