In [1]:
import os
from collections import Counter
import logging
import sys
from pathlib import Path
import subprocess
import os
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML
import IPython
import pandas as pd
from tqdm import tqdm
from pathlib import Path
from dotenv import load_dotenv
import numpy as np
from sklearn.metrics import precision_recall_curve, average_precision_score, roc_curve, auc
from intervaltree import IntervalTree
import shutil

In [2]:
def find_comp_gen_dir():
    """Find the computational_genetic_genealogy directory by searching up from current directory."""
    current = Path.cwd()
    
    # Search up through parent directories
    while current != current.parent:
        # Check if target directory exists in current path
        target = current / 'computational_genetic_genealogy'
        if target.is_dir():
            return target
        # Move up one directory
        current = current.parent
    
    raise FileNotFoundError("Could not find computational_genetic_genealogy directory")

def load_env_file():
    """Find and load the .env file from the computational_genetic_genealogy directory."""
    try:
        # Find the computational_genetic_genealogy directory
        comp_gen_dir = find_comp_gen_dir()
        
        # Look for .env file
        env_path = comp_gen_dir / '.env'
        if not env_path.exists():
            print(f"Warning: No .env file found in {comp_gen_dir}")
            return None
        
        # Load the .env file
        load_dotenv(env_path, override=True)
        print(f"Loaded environment variables from: {env_path}")
        return env_path
        
    except FileNotFoundError as e:
        print(f"Error: {e}")
        return None

# Use the function
env_path = load_env_file()

working_directory = os.getenv('PROJECT_WORKING_DIR', default=None)
data_directory = os.getenv('PROJECT_DATA_DIR', default=None)
references_directory = os.getenv('PROJECT_REFERENCES_DIR', default=None)
results_directory = os.getenv('PROJECT_RESULTS_DIR', default=None)
utils_directory = os.getenv('PROJECT_UTILS_DIR', default=None)

os.environ["WORKING_DIRECTORY"] = working_directory
os.environ["DATA_DIRECTORY"] = data_directory
os.environ["REFERENCES_DIRECTORY"] = references_directory
os.environ["RESULTS_DIRECTORY"] = results_directory
os.environ["UTILS_DIRECTORY"] = utils_directory

print(f"Working Directory: {working_directory}")
print(f"Data Directory: {data_directory}")
print(f"References Directory: {references_directory}")
print(f"Results Directory: {results_directory}")
print(f"Utils Directory: {utils_directory}")

os.chdir(working_directory)
print(f"The current directory is {os.getcwd()}")

Loaded environment variables from: /home/lakishadavid/computational_genetic_genealogy/.env
Working Directory: /home/lakishadavid/computational_genetic_genealogy
Data Directory: /home/lakishadavid/computational_genetic_genealogy/data
References Directory: /home/lakishadavid/computational_genetic_genealogy/references
Results Directory: /home/lakishadavid/computational_genetic_genealogy/results
Utils Directory: /home/lakishadavid/computational_genetic_genealogy/utils
The current directory is /home/lakishadavid/computational_genetic_genealogy


In [3]:
def configure_logging(log_filename, log_file_debug_level="INFO", console_debug_level="INFO"):
    """
    Configure logging for both file and console handlers.

    Args:
        log_filename (str): Path to the log file where logs will be written.
        log_file_debug_level (str): Logging level for the file handler.
        console_debug_level (str): Logging level for the console handler.
    """
    # Create a root logger
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)  # Capture all messages at the root level

    # Convert level names to numeric levels
    file_level = getattr(logging, log_file_debug_level.upper(), logging.INFO)
    console_level = getattr(logging, console_debug_level.upper(), logging.INFO)

    # File handler: Logs messages at file_level and above to the file
    file_handler = logging.FileHandler(log_filename)
    file_handler.setLevel(file_level)
    file_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    file_handler.setFormatter(file_formatter)

    # Console handler: Logs messages at console_level and above to the console
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setLevel(console_level)
    console_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    console_handler.setFormatter(console_formatter)

    # Add handlers to the root logger
    logger.addHandler(file_handler)
    logger.addHandler(console_handler)
    
def clear_logger():
    """Remove all handlers from the root logger."""
    logger = logging.getLogger()
    for handler in logger.handlers[:]:
        logger.removeHandler(handler)
        
log_filename = os.path.join(results_directory, "lab8_log.txt")
print(f"The Lab 8 log file is located at {log_filename}.")

# Ensure the results_directory exists
if not os.path.exists(results_directory):
    os.makedirs(results_directory)

# Check if the file exists; if not, create it
if not os.path.exists(log_filename):
    with open(log_filename, 'w') as file:
        pass  # The file is now created.
    
clear_logger() # Clear the logger before reconfiguring it
configure_logging(log_filename, log_file_debug_level="INFO", console_debug_level="INFO")

The Lab 8 log file is located at /home/lakishadavid/computational_genetic_genealogy/results/lab8_log.txt.


### Prune the input vcf (real data)

In [None]:
%%bash

# Define the input and output VCF files
input_vcf=${DATA_DIRECTORY}/class_data/merged_opensnps_data_autosomes.vcf.gz
output_vcf=${RESULTS_DIRECTORY}/merged_opensnps_data_autosomes_pruned

# Create directory in results directory - lab8_output
mkdir -p ${RESULTS_DIRECTORY}/lab8_output

echo "Input sample size:" $(bcftools query -l ${input_vcf} | wc -l)

# Step 1: Convert the VCF file to PLINK binary format (BED/BIM/FAM)
plink2 --vcf ${input_vcf} --make-bed --out dataset

# Step 2: Remove close relatives using a KING cutoff of 0.125.
# The threshold 0.125 corresponds roughly to the expected kinship coefficient for first cousins.
plink2 --bfile dataset --king-cutoff 0.125 --make-bed --out ${RESULTS_DIRECTORY}/lab8_output/dataset_unrelated

# Step 3: Convert the filtered, unrelated dataset back to VCF format
plink2 --bfile ${RESULTS_DIRECTORY}/lab8_output/dataset_unrelated --export vcf --out ${output_vcf}

# Step 4: Compress the VCF file using bgzip and index it using tabix
bgzip -c ${output_vcf}.vcf > ${output_vcf}.vcf.gz
tabix -p vcf ${output_vcf}.vcf.gz
rm ${output_vcf}.vcf

# Step 5: Use bcftools to report the sample size in both the output VCF files.
echo "######################################################################"
echo "Output sample size:" $(bcftools query -l ${output_vcf}.vcf.gz | wc -l)
echo "######################################################################"

# Remove all the intermediate files
rm ${RESULTS_DIRECTORY}/lab8_output/dataset_unrelated.bed
rm ${RESULTS_DIRECTORY}/lab8_output/dataset_unrelated.bim
rm ${RESULTS_DIRECTORY}/lab8_output/dataset_unrelated.fam
rm ${RESULTS_DIRECTORY}/lab8_output/dataset_unrelated.log
rm ${RESULTS_DIRECTORY}/lab8_output/dataset_unrelated.king.cutoff.in.id
rm ${RESULTS_DIRECTORY}/lab8_output/dataset_unrelated.king.cutoff.out.id

${RESULTS_DIRECTORY}/lab8_output/dataset.log

# make directory for the results
mkdir -p ${RESULTS_DIRECTORY}/ped_sim_run2_autosomes/unphased_samples

for chr in {1..22}; do
    
    plink2 --vcf ${output_vcf}.vcf.gz \
           --chr ${chr} \
           --export vcf bgz \
           --out ${RESULTS_DIRECTORY}/ped_sim_run2_autosomes/unphased_samples/merged_opensnps_data_pruned_chr${chr}
done

### Phasing

In [None]:
%%bash -s "$results_directory"

results_directory=$1

sample_file="merged_opensnps"
beagle="${UTILS_DIRECTORY}/beagle.17Dec24.224.jar"

# Create directories
mkdir -p "${results_directory}/ped_sim_run2_autosomes/phased_samples"

# Phase chromosomes using Beagle
for chr in {1..22}; do
    echo "Processing chromosome $chr"

    INPUT_VCF="${RESULTS_DIRECTORY}/ped_sim_run2_autosomes/unphased_samples/merged_opensnps_data_pruned_chr${chr}.vcf.gz"
    REF_VCF="${REFERENCES_DIRECTORY}/onethousandgenomes_genotype_no_chr_prefix/subset_chr${chr}.vcf.gz"
    MAP_FILE="${REFERENCES_DIRECTORY}/genetic_maps/beagle_genetic_maps/plink.chr${chr}.GRCh38.map"
    OUTPUT_PREFIX="${results_directory}/ped_sim_run2_autosomes/phased_samples/merged_opensnps_data_pruned_chr${chr}"
    PHASED_VCF="${OUTPUT_PREFIX}.vcf.gz"
    TEMP_VCF="${results_directory}/ped_sim_run2_autosomes/phased_samples/temp_pruned_chr${chr}.vcf.gz"
    SORTED_VCF="${results_directory}/ped_sim_run2_autosomes/phased_samples/merged_opensnps_data_pruned_phased_chr${chr}_sorted.vcf.gz"

    # Check if input VCF exists
    if [ ! -f "$INPUT_VCF" ]; then
        echo "Input VCF file not found for chromosome $chr. Skipping."
        echo "$INPUT_VCF"
        continue
    fi

    # Run Beagle phasing
    if [ -f "$REF_VCF" ]; then
        echo "Running Beagle with reference panel for chromosome $chr"
        echo "$INPUT_VCF"
        java -jar ${beagle} \
            gt="$INPUT_VCF" \
            ref="$REF_VCF" \
            map="$MAP_FILE" \
            out="$OUTPUT_PREFIX" || {
                echo "Beagle failed for chromosome $chr. Skipping."
                continue
            }
    else
        echo "Running Beagle without reference panel for chromosome $chr"
        java -jar ${beagle} \
            gt="$INPUT_VCF" \
            map="$MAP_FILE" \
            out="$OUTPUT_PREFIX" || {
                echo "Beagle failed for chromosome $chr. Skipping."
                continue
            }
    fi

    if [ ! -f "$PHASED_VCF" ]; then
        echo "Phasing failed for chromosome $chr. Output file not found. Skipping."
        continue
    fi

    # Index the file
    tabix -f -p vcf "$PHASED_VCF"
    
    # Add INFO field definition and sort
    echo "Sorting VCF for chromosome $chr"
    bcftools annotate --header-lines <(echo '##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant">') "$PHASED_VCF" | \
    bcftools sort -Oz -o "$SORTED_VCF" || {
        echo "Sorting failed for chromosome $chr"
        continue
    }

    # Index the sorted file
    tabix -f -p vcf "$SORTED_VCF"
    
    # If the sorted vcf and index exists, remove phased vcf and index
    if [ -f "$SORTED_VCF" ] && [ -f "$SORTED_VCF.tbi" ]; then
        rm -f "$PHASED_VCF"
        rm -f "$PHASED_VCF.tbi"
    fi
done

bcftools concat -Oz -o "${results_directory}/ped_sim_run2_autosomes/phased_samples/merged_opensnps_data_pruned_phased_all_sorted.vcf.gz" "${results_directory}/ped_sim_run2_autosomes/phased_samples/merged_opensnps_data_pruned_phased_chr"{1..22}"_sorted.vcf.gz"
tabix -f -p vcf "${results_directory}/ped_sim_run2_autosomes/phased_samples/merged_opensnps_data_pruned_phased_all_sorted.vcf.gz"

In [None]:
%%bash

# Set up the references directory
references_directory="${REFERENCES_DIRECTORY}"
mkdir -p "$references_directory"

# Download the genetic map
wget https://github.com/cbherer/Bherer_etal_SexualDimorphismRecombination/raw/master/Refined_genetic_map_b37.tar.gz -P $references_directory
tar xvzf $references_directory/Refined_genetic_map_b37.tar.gz -C $references_directory

# Create the combined map file
printf "#chr\tpos\tmale_cM\tfemale_cM\n" > $references_directory/refined_mf_b37.simmap

# Process each chromosome
for chr in {1..22}; do
  paste $references_directory/Refined_genetic_map_b37/male_chr$chr.txt $references_directory/Refined_genetic_map_b37/female_chr$chr.txt \
    | awk -v OFS="\t" 'NR > 1 && $2 == $6 {print $1,$2,$4,$8}' \
    | sed 's/^chr//' >> $references_directory/refined_mf_b37.simmap
done

# Clean up the downloaded files
rm $references_directory/Refined_genetic_map_b37.tar.gz
rm -r $references_directory/Refined_genetic_map_b37

# Download the chain file for liftOver
wget -O "${references_directory}/hg19ToHg38.over.chain.gz" \
  "https://hgdownload.soe.ucsc.edu/goldenPath/hg19/liftOver/hg19ToHg38.over.chain.gz"

# Create a BED file from the build 37 simmap
awk 'NR>1 {print "chr"$1, $2-1, $2, $3, $4}' OFS="\t" $references_directory/refined_mf_b37.simmap > $references_directory/refined_mf_b37.bed

# Run liftOver to convert coordinates
liftOver $references_directory/refined_mf_b37.bed \
         $references_directory/hg19ToHg38.over.chain.gz \
         $references_directory/refined_mf_b38.bed \
         $references_directory/refined_mf_b38.unmapped

# Clean up temporary files
rm $references_directory/refined_mf_b37.bed

echo "✅ Genetic map successfully created at: $references_directory/refined_mf_b38.bed"
echo "This file contains the sex-specific genetic map in build 38 coordinates, ready for ped-sim."

### Simulate the seg and vcf data for the given genetic family tree

In [None]:
def simulate_ground_truth():
    """Run ped-sim to simulate IBD segments based on pedigree definition"""
    vcf_file = f"{results_directory}/ped_sim_run2_autosomes/phased_samples/merged_opensnps_data_pruned_phased_all_sorted.vcf.gz"
    pedigree_def_file = f"{data_directory}/class_data/pedigree.def"
    refined_map = f"{references_directory}/refined_mf_b38.simmap"
    ped_sim_exec = f"{utils_directory}/ped-sim/ped-sim"
    interfere_file = f"{utils_directory}/ped-sim/interfere/nu_p_campbell.tsv"
    ped_sim_basename = "merged_opensnps_autosomes_ped_sim"
    output_prefix = f"{results_directory}/{ped_sim_basename}"
    
    # Run ped-sim
    print(f"Running ped-sim with input VCF {vcf_file}")
    cmd = [
        ped_sim_exec,
        "-d", pedigree_def_file,
        "-m", refined_map,
        "-o", output_prefix,
        "-i", vcf_file,
        "--intf", interfere_file,
        "--seed", "1234",
        "--fam",
        "--mrca"
    ]
    
    subprocess.run(cmd, check=True)
    
    # Check results
    if os.path.exists(f"{output_prefix}.seg") and os.path.exists(f"{output_prefix}.vcf.gz"):
        print(f"Truth segments file: {output_prefix}.seg")
        print(f"Simulated VCF file: {output_prefix}.vcf.gz")
        return f"{output_prefix}.vcf.gz", f"{output_prefix}.seg"

    else:
        print("ERROR: Ped-sim failed to generate output files")
        return None
    
simulated_vcf, simulated_seg = simulate_ground_truth()

# Preparing Data for IBD Detection Algorithms

## Overview
To accurately evaluate the performance of different IBD detection methods, we need to prepare the data in formats compatible with each algorithm. Each tool has specific input requirements that must be met.

## Algorithm Requirements
- **RefinedIBD**: Requires phased data, processes by chromosome
- **HapIBD**: Requires phased data, processes by chromosome
- **IBIS**: Can work with unphased data, can process all chromosomes together

## Preparation Steps

1. **Split by Chromosome**: The simulated VCF file contains data for all autosomes. We'll extract each chromosome separately to allow chromosome-by-chromosome processing.

2. **Quality Control**: For each chromosome, we apply standard QC filters:
   - Keep only biallelic SNPs
   - Remove exact duplicate variants
   - Filter variants on minor allele frequency (MAF > 0.05)
   - Remove variants with high missing data rates (< 5%)

3. **Phasing**: For RefinedIBD and HapIBD, we'll phase each chromosome separately using Beagle:
   - Use reference panels when available for improved phasing accuracy
   - Incorporate genetic maps for accurate recombination estimation
   - Add necessary INFO fields required by downstream tools

4. **Merging**: For IBIS, we'll merge all phased chromosomes back together into a single file

The output of this process includes:
- Chromosome-specific phased VCF files for RefinedIBD and HapIBD
- A merged phased VCF for IBIS
- File paths organized in a dictionary for subsequent analysis steps

To run the data preparation function:

```python
data_paths = prepare_data_for_ibd_detection(simulated_prefix)
```

### Check Genetic Maps

In [None]:
def prepare_genetic_maps():
    """Prepare genetic maps for all IBD detection algorithms"""
    print("Preparing genetic maps for IBD detection algorithms...")
    
    # Create genetic maps directories
    beagle_map_dir = os.path.join(references_directory, "genetic_maps/beagle_genetic_maps")
    ibis_map_dir = os.path.join(references_directory, "genetic_maps/ibis_genetic_maps")
    os.makedirs(beagle_map_dir, exist_ok=True)
    os.makedirs(ibis_map_dir, exist_ok=True)
    
    # Check if we have Beagle maps already
    beagle_maps_exist = any(f.endswith(".map") for f in os.listdir(beagle_map_dir)) if os.path.exists(beagle_map_dir) else False
    
    # Download Beagle maps if needed
    if not beagle_maps_exist:
        print("Downloading Beagle genetic maps...")
        subprocess.run(
            f"poetry run python -m scripts_support.genetic_maps_download --data-source BEAGLE --assembly GRCh38",
            shell=True, check=True
        )
    
    # Convert Beagle maps to IBIS format
    print("Converting Beagle maps to IBIS format...")
    for map_file in os.listdir(beagle_map_dir):
        if map_file.endswith(".map"):
            beagle_map_filename = os.path.join(beagle_map_dir, map_file)
            ibis_map_filename = os.path.join(ibis_map_dir, map_file)
            
            # For IBIS maps, we need: CHR POSITION GENETIC_POSITION [RATE]
            # From Beagle maps which are: CHR . GENETIC_POSITION PHYSICAL_POSITION
            subprocess.run(
                f"awk '{{print $1, $4, $3, 0}}' {beagle_map_filename} > {ibis_map_filename}",
                shell=True, check=True
            )
    
    print("Genetic maps preparation complete.")
    return beagle_map_dir, ibis_map_dir

In [4]:
output_directory_unphased=f"{results_directory}/ped_sim_run2_autosomes/unphased_samples"
output_directory_phased=f"{results_directory}/ped_sim_run2_autosomes/phased_samples"

In [None]:
# # debugging
# simulated_seg = "/home/lakishadavid/computational_genetic_genealogy/results/merged_opensnps_autosomes_ped_sim.seg"
# simulated_vcf = "/home/lakishadavid/computational_genetic_genealogy/results/merged_opensnps_autosomes_ped_sim.vcf.gz"
# simulated_vcf

'/home/lakishadavid/computational_genetic_genealogy/results/merged_opensnps_autosomes_ped_sim.vcf.gz'

### Quality Control

In [13]:
%%bash -s "$simulated_vcf" "$output_directory_unphased" "$output_directory_phased"

simulated_vcf="$1"
echo "simulated_vcf: ${simulated_vcf}"
output_directory_unphased="$2"
output_directory_phased="$3"

# Create a properly BGZF-compressed file from scratch
gunzip -c "$simulated_vcf" > "${simulated_vcf%.gz}.temp"
bgzip -c "${simulated_vcf%.gz}.temp" > "${simulated_vcf%.gz}.proper.gz"
rm "${simulated_vcf%.gz}.temp"  # Clean up the temp file
rm "${simulated_vcf}"
mv "${simulated_vcf%.gz}.proper.gz" "${simulated_vcf}"
# Now index the properly compressed file
tabix -p vcf "${simulated_vcf}"


# Get the base name of the VCF file
filename_wo_ext="${simulated_vcf%.vcf.gz}"
file_basename=$(basename "$filename_wo_ext")

# Run Quality Control (QC) on the VCF file
for chromosome in {1..22}; do
    echo "Processing chromosome $chromosome..."
    
    output_vcf="$output_directory_unphased/${file_basename}_qcfinished_chr${chromosome}.vcf.gz"
    
    # Extended QC pipeline:
    # 1. Select autosomal chromosome
    # 2. Keep only biallelic SNPs
    # -m2 keeps only variants with at least 2 alleles
    # -M2 keeps only variants with at most 2 alleles
    # could add: -i 'strlen(REF)=1 && strlen(ALT)=1' | \
    # 3. Remove exact duplicate variants
    # 4. Filter on MAF and missing data
    # 5. Sort variants
    bcftools view "$simulated_vcf" \
        --regions "${chromosome}" \
        --types snps \
        -m2 -M2 \
        -i 'strlen(REF)=1 && strlen(ALT)=1' | \
    bcftools norm --rm-dup exact | \
    bcftools view \
        -q 0.05:minor \
        -i 'F_MISSING < 0.05' | \
    bcftools sort -Oz -o "$output_vcf"
    
    # Index the final VCF with force flag
    bcftools index -f "$output_vcf"
    
    # Report number of variants
    # echo "Number of variants in chromosome $chromosome after QC:"
    bcftools index -n "$output_vcf"
    echo
done

beagle="${UTILS_DIRECTORY}/beagle.17Dec24.224.jar"

# Phase chromosomes using Beagle
for chr in {1..22}; do
    echo "Processing chromosome ${chr}"

    INPUT_VCF="${output_directory_unphased}/${file_basename}_qcfinished_chr${chr}.vcf.gz"
    REF_VCF="${REFERENCES_DIRECTORY}/onethousandgenomes_genotype/onethousandgenomes_genotyped_phased.chr${chr}.vcf.gz"
    MAP_FILE="${REFERENCES_DIRECTORY}/genetic_maps/beagle_genetic_maps/plink.chr${chr}.GRCh38.map"
    OUTPUT_PREFIX="${output_directory_phased}/${file_basename}_phased_chr${chr}_temp"
    PHASED_VCF="${OUTPUT_PREFIX}.vcf.gz"
    TEMP_VCF="${output_directory_phased}/temp_chr${chr}.vcf.gz"
    SORTED_VCF="${output_directory_phased}/${file_basename}_phased_chr${chr}.vcf.gz"

    # Check if input VCF exists
    if [ ! -f "${INPUT_VCF}" ]; then
        echo "Input VCF file not found for chromosome ${chr}. Skipping."
        echo "${INPUT_VCF}"
        continue
    fi

    # Run Beagle phasing
    if [ -f "${REF_VCF}" ]; then
        echo "Running Beagle with reference panel for chromosome ${chr}"
        java -jar ${beagle} \
            gt="${INPUT_VCF}" \
            ref="${REF_VCF}" \
            map="${MAP_FILE}" \
            out="${OUTPUT_PREFIX}" || {
                echo "Beagle failed for chromosome ${chr}. Skipping."
                continue
            }
    else
        echo "Running Beagle without reference panel for chromosome ${chr}"
        java -jar ${beagle} \
            gt="${INPUT_VCF}" \
            map="${MAP_FILE}" \
            out="${OUTPUT_PREFIX}" || {
                echo "Beagle failed for chromosome ${chr}. Skipping."
                continue
            }
    fi

    if [ ! -f "${PHASED_VCF}" ]; then
        echo "Phasing failed for chromosome ${chr}. Output file not found. Skipping."
        continue
    fi

    # Index the file
    tabix -f -p vcf "${PHASED_VCF}"
    
    # Add INFO field definition and sort
    echo "Sorting VCF for chromosome $CHR"
    bcftools annotate --header-lines <(echo '##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant">') "${PHASED_VCF}" | \
    bcftools sort -Oz -o "${SORTED_VCF}" || {
        echo "Sorting failed for chromosome $CHR"
        continue
    }

    # Index the sorted file
    tabix -f -p vcf "${SORTED_VCF}"
    
    # If the sorted vcf and index exists, remove phased vcf and index
    if [ -f "${SORTED_VCF}" ] && [ -f "${SORTED_VCF}.tbi" ]; then
        rm -f "${PHASED_VCF}"
        rm -f "${PHASED_VCF}.tbi"
        rm -f "${PHASED_VCF}.log"
    fi
done

simulated_vcf: /home/lakishadavid/computational_genetic_genealogy/results/merged_opensnps_autosomes_ped_sim.vcf.gz
Processing chromosome 1...


Writing to /tmp/bcftools.DMRpUq
Lines   total/split/joined/realigned/skipped:	29944/0/0/0/0
Merging 1 temporary files
Cleaning
Done


29710

Processing chromosome 2...


Writing to /tmp/bcftools.LcOvE3
Lines   total/split/joined/realigned/skipped:	29929/0/0/0/0
Merging 1 temporary files
Cleaning
Done


29674

Processing chromosome 3...


Writing to /tmp/bcftools.toijRC
Lines   total/split/joined/realigned/skipped:	25047/0/0/0/0
Merging 1 temporary files
Cleaning
Done


24874

Processing chromosome 4...


Writing to /tmp/bcftools.zEjPkk
Lines   total/split/joined/realigned/skipped:	22237/0/0/0/0
Merging 1 temporary files
Cleaning
Done


22058

Processing chromosome 5...


Writing to /tmp/bcftools.HKxxNH
Lines   total/split/joined/realigned/skipped:	23127/0/0/0/0
Merging 1 temporary files
Cleaning
Done


22953

Processing chromosome 6...


Writing to /tmp/bcftools.1yTxvm
Lines   total/split/joined/realigned/skipped:	23719/0/0/0/0
Merging 1 temporary files
Cleaning
Done


23506

Processing chromosome 7...


Writing to /tmp/bcftools.LN4KOe
Lines   total/split/joined/realigned/skipped:	20559/0/0/0/0
Merging 1 temporary files
Cleaning
Done


20392

Processing chromosome 8...


Writing to /tmp/bcftools.EmxU0b
Lines   total/split/joined/realigned/skipped:	20385/0/0/0/0
Merging 1 temporary files
Cleaning
Done


20230

Processing chromosome 9...


Writing to /tmp/bcftools.SkHJnQ
Lines   total/split/joined/realigned/skipped:	18381/0/0/0/0
Merging 1 temporary files
Cleaning
Done


18249

Processing chromosome 10...


Writing to /tmp/bcftools.6jMZSP
Lines   total/split/joined/realigned/skipped:	20435/0/0/0/0
Merging 1 temporary files
Cleaning
Done


20270

Processing chromosome 11...


Writing to /tmp/bcftools.5HrhzL
Lines   total/split/joined/realigned/skipped:	18581/0/0/0/0
Merging 1 temporary files
Cleaning
Done


18435

Processing chromosome 12...


Writing to /tmp/bcftools.zS9woc
Lines   total/split/joined/realigned/skipped:	18940/0/0/0/0
Merging 1 temporary files
Cleaning
Done


18789

Processing chromosome 13...


Writing to /tmp/bcftools.cpngrF
Lines   total/split/joined/realigned/skipped:	14141/0/0/0/0
Merging 1 temporary files
Cleaning
Done


14046

Processing chromosome 14...


Writing to /tmp/bcftools.XWB2bW
Lines   total/split/joined/realigned/skipped:	12537/0/0/0/0
Merging 1 temporary files
Cleaning
Done


12432

Processing chromosome 15...


Writing to /tmp/bcftools.FXDWLd
Lines   total/split/joined/realigned/skipped:	11925/0/0/0/0
Merging 1 temporary files
Cleaning
Done


11850

Processing chromosome 16...


Writing to /tmp/bcftools.Oybmix
Lines   total/split/joined/realigned/skipped:	12832/0/0/0/0
Merging 1 temporary files
Cleaning
Done


12747

Processing chromosome 17...


Writing to /tmp/bcftools.1Yz0ko
Lines   total/split/joined/realigned/skipped:	11059/0/0/0/0
Merging 1 temporary files
Cleaning
Done


10978

Processing chromosome 18...


Writing to /tmp/bcftools.F7cMhX
Lines   total/split/joined/realigned/skipped:	11825/0/0/0/0
Merging 1 temporary files
Cleaning
Done


11737

Processing chromosome 19...


Writing to /tmp/bcftools.aULzI2
Lines   total/split/joined/realigned/skipped:	7840/0/0/0/0
Merging 1 temporary files
Cleaning
Done


7781

Processing chromosome 20...


Writing to /tmp/bcftools.lZlGgD
Lines   total/split/joined/realigned/skipped:	10415/0/0/0/0
Merging 1 temporary files
Cleaning
Done


10352

Processing chromosome 21...


Writing to /tmp/bcftools.s8M8xs
Lines   total/split/joined/realigned/skipped:	5862/0/0/0/0
Merging 1 temporary files
Cleaning
Done


5813

Processing chromosome 22...


Writing to /tmp/bcftools.1mwOK5
Lines   total/split/joined/realigned/skipped:	5764/0/0/0/0
Merging 1 temporary files
Cleaning
Done


5729

Processing chromosome 1
Running Beagle without reference panel for chromosome 1
beagle.17Dec24.224.jar (version 5.5)
Copyright (C) 2014-2024 Brian L. Browning
Enter "java -jar beagle.17Dec24.224.jar" to list command line argument
Start time: 07:28 PM CDT on 27 Apr 2025

Command line: java -Xmx1948m -jar beagle.17Dec24.224.jar
  gt=/home/lakishadavid/computational_genetic_genealogy/results/ped_sim_run2_autosomes/unphased_samples/merged_opensnps_autosomes_ped_sim_qcfinished_chr1.vcf.gz
  map=/home/lakishadavid/computational_genetic_genealogy/references/genetic_maps/beagle_genetic_maps/plink.chr1.GRCh38.map
  out=/home/lakishadavid/computational_genetic_genealogy/results/ped_sim_run2_autosomes/phased_samples/merged_opensnps_autosomes_ped_sim_phased_chr1_temp
  nthreads=12

Reference samples:                    0
Study     samples:                  520

Window 1 [1:817341-19301382]
Study     markers:                3,289

Burnin  iteration 1:           8 seconds
Burnin  iteration 2: 

Writing to /tmp/bcftools.1D8YCD
Merging 1 temporary files
Cleaning
Done


Processing chromosome 2
Running Beagle without reference panel for chromosome 2
beagle.17Dec24.224.jar (version 5.5)
Copyright (C) 2014-2024 Brian L. Browning
Enter "java -jar beagle.17Dec24.224.jar" to list command line argument
Start time: 07:33 PM CDT on 27 Apr 2025

Command line: java -Xmx1948m -jar beagle.17Dec24.224.jar
  gt=/home/lakishadavid/computational_genetic_genealogy/results/ped_sim_run2_autosomes/unphased_samples/merged_opensnps_autosomes_ped_sim_qcfinished_chr2.vcf.gz
  map=/home/lakishadavid/computational_genetic_genealogy/references/genetic_maps/beagle_genetic_maps/plink.chr2.GRCh38.map
  out=/home/lakishadavid/computational_genetic_genealogy/results/ped_sim_run2_autosomes/phased_samples/merged_opensnps_autosomes_ped_sim_phased_chr2_temp
  nthreads=12

Reference samples:                    0
Study     samples:                  520

Window 1 [2:130357-18028498]
Study     markers:                3,265

Burnin  iteration 1:           8 seconds
Burnin  iteration 2:       

Writing to /tmp/bcftools.Ybno1R
Merging 1 temporary files
Cleaning
Done


Processing chromosome 3
Running Beagle without reference panel for chromosome 3
beagle.17Dec24.224.jar (version 5.5)
Copyright (C) 2014-2024 Brian L. Browning
Enter "java -jar beagle.17Dec24.224.jar" to list command line argument
Start time: 07:38 PM CDT on 27 Apr 2025

Command line: java -Xmx1948m -jar beagle.17Dec24.224.jar
  gt=/home/lakishadavid/computational_genetic_genealogy/results/ped_sim_run2_autosomes/unphased_samples/merged_opensnps_autosomes_ped_sim_qcfinished_chr3.vcf.gz
  map=/home/lakishadavid/computational_genetic_genealogy/references/genetic_maps/beagle_genetic_maps/plink.chr3.GRCh38.map
  out=/home/lakishadavid/computational_genetic_genealogy/results/ped_sim_run2_autosomes/phased_samples/merged_opensnps_autosomes_ped_sim_phased_chr3_temp
  nthreads=12

Reference samples:                    0
Study     samples:                  520

Window 1 [3:67310-21595904]
Study     markers:                4,198

Burnin  iteration 1:           11 seconds
Burnin  iteration 2:       

Writing to /tmp/bcftools.oqla5U
Merging 1 temporary files
Cleaning
Done


Processing chromosome 4
Running Beagle without reference panel for chromosome 4
beagle.17Dec24.224.jar (version 5.5)
Copyright (C) 2014-2024 Brian L. Browning
Enter "java -jar beagle.17Dec24.224.jar" to list command line argument
Start time: 07:42 PM CDT on 27 Apr 2025

Command line: java -Xmx1948m -jar beagle.17Dec24.224.jar
  gt=/home/lakishadavid/computational_genetic_genealogy/results/ped_sim_run2_autosomes/unphased_samples/merged_opensnps_autosomes_ped_sim_qcfinished_chr4.vcf.gz
  map=/home/lakishadavid/computational_genetic_genealogy/references/genetic_maps/beagle_genetic_maps/plink.chr4.GRCh38.map
  out=/home/lakishadavid/computational_genetic_genealogy/results/ped_sim_run2_autosomes/phased_samples/merged_opensnps_autosomes_ped_sim_phased_chr4_temp
  nthreads=12

Reference samples:                    0
Study     samples:                  520

Window 1 [4:187687-22822398]
Study     markers:                3,433

Burnin  iteration 1:           9 seconds
Burnin  iteration 2:       

Writing to /tmp/bcftools.pLivfr
Merging 1 temporary files
Cleaning
Done


Processing chromosome 5
Running Beagle without reference panel for chromosome 5
beagle.17Dec24.224.jar (version 5.5)
Copyright (C) 2014-2024 Brian L. Browning
Enter "java -jar beagle.17Dec24.224.jar" to list command line argument
Start time: 07:46 PM CDT on 27 Apr 2025

Command line: java -Xmx1948m -jar beagle.17Dec24.224.jar
  gt=/home/lakishadavid/computational_genetic_genealogy/results/ped_sim_run2_autosomes/unphased_samples/merged_opensnps_autosomes_ped_sim_qcfinished_chr5.vcf.gz
  map=/home/lakishadavid/computational_genetic_genealogy/references/genetic_maps/beagle_genetic_maps/plink.chr5.GRCh38.map
  out=/home/lakishadavid/computational_genetic_genealogy/results/ped_sim_run2_autosomes/phased_samples/merged_opensnps_autosomes_ped_sim_phased_chr5_temp
  nthreads=12

Reference samples:                    0
Study     samples:                  520

Window 1 [5:163654-24851390]
Study     markers:                3,974

Burnin  iteration 1:           11 seconds
Burnin  iteration 2:      

Writing to /tmp/bcftools.41vFsz
Merging 1 temporary files
Cleaning
Done


Processing chromosome 6
Running Beagle without reference panel for chromosome 6
beagle.17Dec24.224.jar (version 5.5)
Copyright (C) 2014-2024 Brian L. Browning
Enter "java -jar beagle.17Dec24.224.jar" to list command line argument
Start time: 07:50 PM CDT on 27 Apr 2025

Command line: java -Xmx1948m -jar beagle.17Dec24.224.jar
  gt=/home/lakishadavid/computational_genetic_genealogy/results/ped_sim_run2_autosomes/unphased_samples/merged_opensnps_autosomes_ped_sim_qcfinished_chr6.vcf.gz
  map=/home/lakishadavid/computational_genetic_genealogy/references/genetic_maps/beagle_genetic_maps/plink.chr6.GRCh38.map
  out=/home/lakishadavid/computational_genetic_genealogy/results/ped_sim_run2_autosomes/phased_samples/merged_opensnps_autosomes_ped_sim_phased_chr6_temp
  nthreads=12

Reference samples:                    0
Study     samples:                  520

Window 1 [6:203878-21502446]
Study     markers:                4,247

Burnin  iteration 1:           13 seconds
Burnin  iteration 2:      

Writing to /tmp/bcftools.bpJRCY
Merging 1 temporary files
Cleaning
Done


Processing chromosome 7
Running Beagle without reference panel for chromosome 7
beagle.17Dec24.224.jar (version 5.5)
Copyright (C) 2014-2024 Brian L. Browning
Enter "java -jar beagle.17Dec24.224.jar" to list command line argument
Start time: 07:54 PM CDT on 27 Apr 2025

Command line: java -Xmx1948m -jar beagle.17Dec24.224.jar
  gt=/home/lakishadavid/computational_genetic_genealogy/results/ped_sim_run2_autosomes/unphased_samples/merged_opensnps_autosomes_ped_sim_qcfinished_chr7.vcf.gz
  map=/home/lakishadavid/computational_genetic_genealogy/references/genetic_maps/beagle_genetic_maps/plink.chr7.GRCh38.map
  out=/home/lakishadavid/computational_genetic_genealogy/results/ped_sim_run2_autosomes/phased_samples/merged_opensnps_autosomes_ped_sim_phased_chr7_temp
  nthreads=12

Reference samples:                    0
Study     samples:                  520

Window 1 [7:46239-22452771]
Study     markers:                4,204

Burnin  iteration 1:           11 seconds
Burnin  iteration 2:       

Writing to /tmp/bcftools.aHE5Ek
Merging 1 temporary files
Cleaning
Done


Processing chromosome 8
Running Beagle without reference panel for chromosome 8
beagle.17Dec24.224.jar (version 5.5)
Copyright (C) 2014-2024 Brian L. Browning
Enter "java -jar beagle.17Dec24.224.jar" to list command line argument
Start time: 07:58 PM CDT on 27 Apr 2025

Command line: java -Xmx1948m -jar beagle.17Dec24.224.jar
  gt=/home/lakishadavid/computational_genetic_genealogy/results/ped_sim_run2_autosomes/unphased_samples/merged_opensnps_autosomes_ped_sim_qcfinished_chr8.vcf.gz
  map=/home/lakishadavid/computational_genetic_genealogy/references/genetic_maps/beagle_genetic_maps/plink.chr8.GRCh38.map
  out=/home/lakishadavid/computational_genetic_genealogy/results/ped_sim_run2_autosomes/phased_samples/merged_opensnps_autosomes_ped_sim_phased_chr8_temp
  nthreads=12

Reference samples:                    0
Study     samples:                  520

Window 1 [8:214984-19079523]
Study     markers:                4,716

Burnin  iteration 1:           12 seconds
Burnin  iteration 2:      

Writing to /tmp/bcftools.c2NaAQ
Merging 1 temporary files
Cleaning
Done


Processing chromosome 9
Running Beagle without reference panel for chromosome 9
beagle.17Dec24.224.jar (version 5.5)
Copyright (C) 2014-2024 Brian L. Browning
Enter "java -jar beagle.17Dec24.224.jar" to list command line argument
Start time: 08:01 PM CDT on 27 Apr 2025

Command line: java -Xmx1948m -jar beagle.17Dec24.224.jar
  gt=/home/lakishadavid/computational_genetic_genealogy/results/ped_sim_run2_autosomes/unphased_samples/merged_opensnps_autosomes_ped_sim_qcfinished_chr9.vcf.gz
  map=/home/lakishadavid/computational_genetic_genealogy/references/genetic_maps/beagle_genetic_maps/plink.chr9.GRCh38.map
  out=/home/lakishadavid/computational_genetic_genealogy/results/ped_sim_run2_autosomes/phased_samples/merged_opensnps_autosomes_ped_sim_phased_chr9_temp
  nthreads=12

Reference samples:                    0
Study     samples:                  520

Window 1 [9:213149-19771892]
Study     markers:                4,668

Burnin  iteration 1:           15 seconds
Burnin  iteration 2:      

Writing to /tmp/bcftools.IHzWbV
Merging 1 temporary files
Cleaning
Done


Processing chromosome 10
Running Beagle without reference panel for chromosome 10
beagle.17Dec24.224.jar (version 5.5)
Copyright (C) 2014-2024 Brian L. Browning
Enter "java -jar beagle.17Dec24.224.jar" to list command line argument
Start time: 08:04 PM CDT on 27 Apr 2025

Command line: java -Xmx1948m -jar beagle.17Dec24.224.jar
  gt=/home/lakishadavid/computational_genetic_genealogy/results/ped_sim_run2_autosomes/unphased_samples/merged_opensnps_autosomes_ped_sim_qcfinished_chr10.vcf.gz
  map=/home/lakishadavid/computational_genetic_genealogy/references/genetic_maps/beagle_genetic_maps/plink.chr10.GRCh38.map
  out=/home/lakishadavid/computational_genetic_genealogy/results/ped_sim_run2_autosomes/phased_samples/merged_opensnps_autosomes_ped_sim_phased_chr10_temp
  nthreads=12

Reference samples:                    0
Study     samples:                  520

Window 1 [10:113006-18269869]
Study     markers:                4,214

Burnin  iteration 1:           14 seconds
Burnin  iteration 2:

Writing to /tmp/bcftools.iSBHeB
Merging 1 temporary files
Cleaning
Done


Processing chromosome 11
Running Beagle without reference panel for chromosome 11
beagle.17Dec24.224.jar (version 5.5)
Copyright (C) 2014-2024 Brian L. Browning
Enter "java -jar beagle.17Dec24.224.jar" to list command line argument
Start time: 08:08 PM CDT on 27 Apr 2025

Command line: java -Xmx1948m -jar beagle.17Dec24.224.jar
  gt=/home/lakishadavid/computational_genetic_genealogy/results/ped_sim_run2_autosomes/unphased_samples/merged_opensnps_autosomes_ped_sim_qcfinished_chr11.vcf.gz
  map=/home/lakishadavid/computational_genetic_genealogy/references/genetic_maps/beagle_genetic_maps/plink.chr11.GRCh38.map
  out=/home/lakishadavid/computational_genetic_genealogy/results/ped_sim_run2_autosomes/phased_samples/merged_opensnps_autosomes_ped_sim_phased_chr11_temp
  nthreads=12

Reference samples:                    0
Study     samples:                  520

Window 1 [11:242112-22689598]
Study     markers:                4,226

Burnin  iteration 1:           13 seconds
Burnin  iteration 2:

Writing to /tmp/bcftools.TRiE82
Merging 1 temporary files
Cleaning
Done


Processing chromosome 12
Running Beagle without reference panel for chromosome 12
beagle.17Dec24.224.jar (version 5.5)
Copyright (C) 2014-2024 Brian L. Browning
Enter "java -jar beagle.17Dec24.224.jar" to list command line argument
Start time: 08:11 PM CDT on 27 Apr 2025

Command line: java -Xmx1948m -jar beagle.17Dec24.224.jar
  gt=/home/lakishadavid/computational_genetic_genealogy/results/ped_sim_run2_autosomes/unphased_samples/merged_opensnps_autosomes_ped_sim_qcfinished_chr12.vcf.gz
  map=/home/lakishadavid/computational_genetic_genealogy/references/genetic_maps/beagle_genetic_maps/plink.chr12.GRCh38.map
  out=/home/lakishadavid/computational_genetic_genealogy/results/ped_sim_run2_autosomes/phased_samples/merged_opensnps_autosomes_ped_sim_phased_chr12_temp
  nthreads=12

Reference samples:                    0
Study     samples:                  520

Window 1 [12:82453-21998639]
Study     markers:                3,916

Burnin  iteration 1:           9 seconds
Burnin  iteration 2:  

Writing to /tmp/bcftools.Qjd4T7
Merging 1 temporary files
Cleaning
Done


Processing chromosome 13
Running Beagle without reference panel for chromosome 13
beagle.17Dec24.224.jar (version 5.5)
Copyright (C) 2014-2024 Brian L. Browning
Enter "java -jar beagle.17Dec24.224.jar" to list command line argument
Start time: 08:14 PM CDT on 27 Apr 2025

Command line: java -Xmx1948m -jar beagle.17Dec24.224.jar
  gt=/home/lakishadavid/computational_genetic_genealogy/results/ped_sim_run2_autosomes/unphased_samples/merged_opensnps_autosomes_ped_sim_qcfinished_chr13.vcf.gz
  map=/home/lakishadavid/computational_genetic_genealogy/references/genetic_maps/beagle_genetic_maps/plink.chr13.GRCh38.map
  out=/home/lakishadavid/computational_genetic_genealogy/results/ped_sim_run2_autosomes/phased_samples/merged_opensnps_autosomes_ped_sim_phased_chr13_temp
  nthreads=12

Reference samples:                    0
Study     samples:                  520

Window 1 [13:18706400-40350391]
Study     markers:                3,950

Burnin  iteration 1:           9 seconds
Burnin  iteration 2

Writing to /tmp/bcftools.CKPXzE
Merging 1 temporary files
Cleaning
Done


Processing chromosome 14
Running Beagle without reference panel for chromosome 14
beagle.17Dec24.224.jar (version 5.5)
Copyright (C) 2014-2024 Brian L. Browning
Enter "java -jar beagle.17Dec24.224.jar" to list command line argument
Start time: 08:17 PM CDT on 27 Apr 2025

Command line: java -Xmx1948m -jar beagle.17Dec24.224.jar
  gt=/home/lakishadavid/computational_genetic_genealogy/results/ped_sim_run2_autosomes/unphased_samples/merged_opensnps_autosomes_ped_sim_qcfinished_chr14.vcf.gz
  map=/home/lakishadavid/computational_genetic_genealogy/references/genetic_maps/beagle_genetic_maps/plink.chr14.GRCh38.map
  out=/home/lakishadavid/computational_genetic_genealogy/results/ped_sim_run2_autosomes/phased_samples/merged_opensnps_autosomes_ped_sim_phased_chr14_temp
  nthreads=12

Reference samples:                    0
Study     samples:                  520

Window 1 [14:18551346-41751717]
Study     markers:                3,438

Burnin  iteration 1:           8 seconds
Burnin  iteration 2

Writing to /tmp/bcftools.Nt4qqa
Merging 1 temporary files
Cleaning
Done


Processing chromosome 15
Running Beagle without reference panel for chromosome 15
beagle.17Dec24.224.jar (version 5.5)
Copyright (C) 2014-2024 Brian L. Browning
Enter "java -jar beagle.17Dec24.224.jar" to list command line argument
Start time: 08:19 PM CDT on 27 Apr 2025

Command line: java -Xmx1948m -jar beagle.17Dec24.224.jar
  gt=/home/lakishadavid/computational_genetic_genealogy/results/ped_sim_run2_autosomes/unphased_samples/merged_opensnps_autosomes_ped_sim_qcfinished_chr15.vcf.gz
  map=/home/lakishadavid/computational_genetic_genealogy/references/genetic_maps/beagle_genetic_maps/plink.chr15.GRCh38.map
  out=/home/lakishadavid/computational_genetic_genealogy/results/ped_sim_run2_autosomes/phased_samples/merged_opensnps_autosomes_ped_sim_phased_chr15_temp
  nthreads=12

Reference samples:                    0
Study     samples:                  520

Window 1 [15:19956119-32327926]
Study     markers:                1,435

Burnin  iteration 1:           6 seconds
Burnin  iteration 2

Writing to /tmp/bcftools.x1BNHe
Merging 1 temporary files
Cleaning
Done


Processing chromosome 16
Running Beagle without reference panel for chromosome 16
beagle.17Dec24.224.jar (version 5.5)
Copyright (C) 2014-2024 Brian L. Browning
Enter "java -jar beagle.17Dec24.224.jar" to list command line argument
Start time: 08:21 PM CDT on 27 Apr 2025

Command line: java -Xmx1948m -jar beagle.17Dec24.224.jar
  gt=/home/lakishadavid/computational_genetic_genealogy/results/ped_sim_run2_autosomes/unphased_samples/merged_opensnps_autosomes_ped_sim_qcfinished_chr16.vcf.gz
  map=/home/lakishadavid/computational_genetic_genealogy/references/genetic_maps/beagle_genetic_maps/plink.chr16.GRCh38.map
  out=/home/lakishadavid/computational_genetic_genealogy/results/ped_sim_run2_autosomes/phased_samples/merged_opensnps_autosomes_ped_sim_phased_chr16_temp
  nthreads=12

Reference samples:                    0
Study     samples:                  520

Window 1 [16:102222-19261586]
Study     markers:                3,586

Burnin  iteration 1:           11 seconds
Burnin  iteration 2:

Writing to /tmp/bcftools.Cytill
Merging 1 temporary files
Cleaning
Done


Processing chromosome 17
Running Beagle without reference panel for chromosome 17
beagle.17Dec24.224.jar (version 5.5)
Copyright (C) 2014-2024 Brian L. Browning
Enter "java -jar beagle.17Dec24.224.jar" to list command line argument
Start time: 08:24 PM CDT on 27 Apr 2025

Command line: java -Xmx1948m -jar beagle.17Dec24.224.jar
  gt=/home/lakishadavid/computational_genetic_genealogy/results/ped_sim_run2_autosomes/unphased_samples/merged_opensnps_autosomes_ped_sim_qcfinished_chr17.vcf.gz
  map=/home/lakishadavid/computational_genetic_genealogy/references/genetic_maps/beagle_genetic_maps/plink.chr17.GRCh38.map
  out=/home/lakishadavid/computational_genetic_genealogy/results/ped_sim_run2_autosomes/phased_samples/merged_opensnps_autosomes_ped_sim_phased_chr17_temp
  nthreads=12

Reference samples:                    0
Study     samples:                  520

Window 1 [17:163755-15106933]
Study     markers:                3,260

Burnin  iteration 1:           8 seconds
Burnin  iteration 2: 

Writing to /tmp/bcftools.8PKD8p
Merging 1 temporary files
Cleaning
Done


Processing chromosome 18
Running Beagle without reference panel for chromosome 18
beagle.17Dec24.224.jar (version 5.5)
Copyright (C) 2014-2024 Brian L. Browning
Enter "java -jar beagle.17Dec24.224.jar" to list command line argument
Start time: 08:26 PM CDT on 27 Apr 2025

Command line: java -Xmx1948m -jar beagle.17Dec24.224.jar
  gt=/home/lakishadavid/computational_genetic_genealogy/results/ped_sim_run2_autosomes/unphased_samples/merged_opensnps_autosomes_ped_sim_qcfinished_chr18.vcf.gz
  map=/home/lakishadavid/computational_genetic_genealogy/references/genetic_maps/beagle_genetic_maps/plink.chr18.GRCh38.map
  out=/home/lakishadavid/computational_genetic_genealogy/results/ped_sim_run2_autosomes/phased_samples/merged_opensnps_autosomes_ped_sim_phased_chr18_temp
  nthreads=12

Reference samples:                    0
Study     samples:                  520

Window 1 [18:112535-21568311]
Study     markers:                3,168

Burnin  iteration 1:           8 seconds
Burnin  iteration 2: 

Writing to /tmp/bcftools.eyjZ60
Merging 1 temporary files
Cleaning
Done


Processing chromosome 19
Running Beagle without reference panel for chromosome 19
beagle.17Dec24.224.jar (version 5.5)
Copyright (C) 2014-2024 Brian L. Browning
Enter "java -jar beagle.17Dec24.224.jar" to list command line argument
Start time: 08:29 PM CDT on 27 Apr 2025

Command line: java -Xmx1948m -jar beagle.17Dec24.224.jar
  gt=/home/lakishadavid/computational_genetic_genealogy/results/ped_sim_run2_autosomes/unphased_samples/merged_opensnps_autosomes_ped_sim_qcfinished_chr19.vcf.gz
  map=/home/lakishadavid/computational_genetic_genealogy/references/genetic_maps/beagle_genetic_maps/plink.chr19.GRCh38.map
  out=/home/lakishadavid/computational_genetic_genealogy/results/ped_sim_run2_autosomes/phased_samples/merged_opensnps_autosomes_ped_sim_phased_chr19_temp
  nthreads=12

Reference samples:                    0
Study     samples:                  520

Window 1 [19:267039-16089568]
Study     markers:                2,569

Burnin  iteration 1:           8 seconds
Burnin  iteration 2: 

Writing to /tmp/bcftools.VX9qzl
Merging 1 temporary files
Cleaning
Done


Processing chromosome 20
Running Beagle without reference panel for chromosome 20
beagle.17Dec24.224.jar (version 5.5)
Copyright (C) 2014-2024 Brian L. Browning
Enter "java -jar beagle.17Dec24.224.jar" to list command line argument
Start time: 08:30 PM CDT on 27 Apr 2025

Command line: java -Xmx1948m -jar beagle.17Dec24.224.jar
  gt=/home/lakishadavid/computational_genetic_genealogy/results/ped_sim_run2_autosomes/unphased_samples/merged_opensnps_autosomes_ped_sim_qcfinished_chr20.vcf.gz
  map=/home/lakishadavid/computational_genetic_genealogy/references/genetic_maps/beagle_genetic_maps/plink.chr20.GRCh38.map
  out=/home/lakishadavid/computational_genetic_genealogy/results/ped_sim_run2_autosomes/phased_samples/merged_opensnps_autosomes_ped_sim_phased_chr20_temp
  nthreads=12

Reference samples:                    0
Study     samples:                  520

Window 1 [20:88453-17059314]
Study     markers:                3,660

Burnin  iteration 1:           10 seconds
Burnin  iteration 2: 

Writing to /tmp/bcftools.86L7kM
Merging 1 temporary files
Cleaning
Done


Processing chromosome 21
Running Beagle without reference panel for chromosome 21
beagle.17Dec24.224.jar (version 5.5)
Copyright (C) 2014-2024 Brian L. Browning
Enter "java -jar beagle.17Dec24.224.jar" to list command line argument
Start time: 08:32 PM CDT on 27 Apr 2025

Command line: java -Xmx1948m -jar beagle.17Dec24.224.jar
  gt=/home/lakishadavid/computational_genetic_genealogy/results/ped_sim_run2_autosomes/unphased_samples/merged_opensnps_autosomes_ped_sim_qcfinished_chr21.vcf.gz
  map=/home/lakishadavid/computational_genetic_genealogy/references/genetic_maps/beagle_genetic_maps/plink.chr21.GRCh38.map
  out=/home/lakishadavid/computational_genetic_genealogy/results/ped_sim_run2_autosomes/phased_samples/merged_opensnps_autosomes_ped_sim_phased_chr21_temp
  nthreads=12

Reference samples:                    0
Study     samples:                  520

Window 1 [21:13297610-36615618]
Study     markers:                3,712

Burnin  iteration 1:           9 seconds
Burnin  iteration 2

Writing to /tmp/bcftools.UZOwUG
Merging 1 temporary files
Cleaning
Done


Processing chromosome 22
Running Beagle without reference panel for chromosome 22
beagle.17Dec24.224.jar (version 5.5)
Copyright (C) 2014-2024 Brian L. Browning
Enter "java -jar beagle.17Dec24.224.jar" to list command line argument
Start time: 08:33 PM CDT on 27 Apr 2025

Command line: java -Xmx1948m -jar beagle.17Dec24.224.jar
  gt=/home/lakishadavid/computational_genetic_genealogy/results/ped_sim_run2_autosomes/unphased_samples/merged_opensnps_autosomes_ped_sim_qcfinished_chr22.vcf.gz
  map=/home/lakishadavid/computational_genetic_genealogy/references/genetic_maps/beagle_genetic_maps/plink.chr22.GRCh38.map
  out=/home/lakishadavid/computational_genetic_genealogy/results/ped_sim_run2_autosomes/phased_samples/merged_opensnps_autosomes_ped_sim_phased_chr22_temp
  nthreads=12

Reference samples:                    0
Study     samples:                  520

Window 1 [22:16613859-35645603]
Study     markers:                3,049

Burnin  iteration 1:           8 seconds
Burnin  iteration 2

Writing to /tmp/bcftools.e8qOhq
Merging 1 temporary files
Cleaning
Done


### Refined IBD

In [None]:
%%bash -s "$simulated_vcf" "$output_directory_unphased" "$output_directory_phased"

simulated_vcf="$1"
output_directory_unphased="$2"
output_directory_phased="$3"
echo "output_directory_phased: ${output_directory_phased}"

# Get the base name of the VCF file
filename_wo_ext="${simulated_vcf%.vcf.gz}"
file_basename=$(basename "$filename_wo_ext")

# Define the Refined-IBD executable paths
refined_ibd="${UTILS_DIRECTORY}/refined-ibd.17Jan20.102.jar"
merge_ibd_segments="${UTILS_DIRECTORY}/merge-ibd-segments.17Jan20.102.jar"

# Create directory for segments
mkdir -p "${RESULTS_DIRECTORY}/segments"

# Loop for multiple runs to enhance sensitivity
for run in {1..3}; do
    for chr in {1..22}; do
        phased_file="${output_directory_phased}/${file_basename}_phased_chr${chr}.vcf.gz"
        
        # Check if file exists
        if [[ ! -f "${phased_file}" ]]; then
            echo "Phased file not found for chr${chr}: ${phased_file}"
            continue
        fi
        
        echo "Processing chromosome ${chr}, run ${run} with RefinedIBD"
        
        # Run RefinedIBD
        java -jar "${refined_ibd}" \
            gt="${phased_file}" \
            map="${REFERENCES_DIRECTORY}/genetic_maps/beagle_genetic_maps/plink.chr${chr}.GRCh38.map" \
            lod=4 \
            length=3 \
            out="${RESULTS_DIRECTORY}/segments/temp_${file_basename}_refinedibd_chr${chr}_run${run}.seg" \
            nthreads=4
    done
done

gap_threshold=0.6
discord_threshold=1

for chr in {1..22}; do
    echo "🔀 Merging IBD segments for chr${chr}"
    phased_file="${output_phased}/${file_basename}_phased_chr${chr}.vcf.gz"
    map_file="${REFERENCES_DIRECTORY}/genetic_maps/beagle_genetic_maps/plink.chr${chr}.GRCh38.map"
    merged_out="${RESULTS_DIRECTORY}/segments/${file_basename}_refinedibd_chr${chr}_merged.seg"

    # collect the .ibd.gz outputs from each run
    ibd_files=()
    for run in {1..3}; do
        prefix="${RESULTS_DIRECTORY}/segments/${file_basename}_refinedibd_chr${chr}_run${run}"
        f="${prefix}.ibd.gz"
        if [[ -f "$f" ]]; then
        ibd_files+=("$f")
        else
        echo "   ⚠️  Missing IBD file: $f"
        fi
    done

    if (( ${#ibd_files[@]} == 0 )); then
        echo "   ❌  No runs to merge for chr${chr}, skipping."
        continue
    fi

    # decompress & merge
    echo "   📦  Found ${#ibd_files[@]} runs; merging..."
    # process substitution avoids combining into one long string
    {
        for f in "${ibd_files[@]}"; do
        zcat "$f"
        done
    } | java -jar "${merge_ibd}" \
            "${phased_file}" \
            "${map_file}" \
            "${gap_threshold}" \
            "${discord_threshold}" \
        > "${merged_out}"

    echo "   ✅  Merged output written to ${merged_out}"
done

# Combine all per‐chromosome merge files into one final output
final_output="${RESULTS_DIRECTORY}/${file_basename}_refinedibd.seg"
> "${final_output}"   # truncate or create

for chr in {1..22}; do
    chr_merged="${RESULTS_DIRECTORY}/segments/${file_basename}_refinedibd_chr${chr}_merged.seg"
    if [[ -f "${chr_merged}" ]]; then
        cat "${chr_merged}" >> "${final_output}"
    else
        echo "⚠️  Missing merged file for chr${chr}: ${chr_merged}"
    fi
done

echo "✅  All chromosomes merged into: ${final_output}"

# 4. Clean up intermediate files
for chr in {1..22}; do
    for run in {1..3}; do
        rm -f "${RESULTS_DIRECTORY}/segments/${file_basename}_refinedibd_chr${chr}_run${run}.ibd.gz"
        rm -f "${RESULTS_DIRECTORY}/segments/${file_basename}_refinedibd_chr${chr}_run${run}.hbd.gz"
        rm -f "${RESULTS_DIRECTORY}/segments/${file_basename}_refinedibd_chr${chr}_run${run}.log"
    done
    rm -f "${RESULTS_DIRECTORY}/segments/${file_basename}_refinedibd_chr${chr}_merged.seg"
done

echo "🧹  Cleaned up all temporary segment files."

output_directory_phased: /home/lakishadavid/computational_genetic_genealogy/results/ped_sim_run2_autosomes/phased_samples
Processing chromosome 1, run 1 with RefinedIBD
refined-ibd.17Jan20.102.jar
Copyright (C) 2014-2017 Brian L. Browning
Enter java -jar refined-ibd.17Jan20.102.jar to list command line argument
Start time: 08:35 PM CDT on 27 Apr 2025

Command line: java -Xmx1948m -jar refined-ibd.17Jan20.102.jar
  gt=/home/lakishadavid/computational_genetic_genealogy/results/ped_sim_run2_autosomes/phased_samples/merged_opensnps_autosomes_ped_sim_phased_chr1.vcf.gz
  map=/home/lakishadavid/computational_genetic_genealogy/references/genetic_maps/beagle_genetic_maps/plink.chr1.GRCh38.map
  lod=4
  length=3
  out=/home/lakishadavid/computational_genetic_genealogy/results/segments/temp_merged_opensnps_autosomes_ped_sim_refinedibd_chr1_run1.seg
  nthreads=4

Samples:                 520        
Model scale:             2.28       

Window 1 (1:817341-19301382)
Markers:                 3,289 

In [None]:
# Check RefinedIBD output
refined_ibd_output = os.path.join(results_directory, "merged_opensnps_autosomes_ped_sim_refinedibd.seg")
if os.path.exists(refined_ibd_output):
    print(f"RefinedIBD output found: {refined_ibd_output}")
    print(f"File size: {os.path.getsize(refined_ibd_output)} bytes")
else:
    print(f"RefinedIBD output not found: {refined_ibd_output}")

### HAP-IBD

In [None]:
%%bash -s "$simulated_vcf" "$output_directory_unphased" "$output_directory_phased"

simulated_vcf="$1"
output_directory_unphased="$2"
output_directory_phased="$3"
echo "output_directory_phased: ${output_directory_phased}"

# Get the base name of the VCF file
filename_wo_ext="${simulated_vcf%.vcf.gz}"
file_basename=$(basename "$filename_wo_ext")

# Define the Hap-IBD executable paths
hap_ibd="${UTILS_DIRECTORY}/hap-ibd.jar"

# Create directory for segments
mkdir -p "${RESULTS_DIRECTORY}/segments"

    for chr in {1..22}; do
        phased_file="${output_directory_phased}/${file_basename}_phased_chr${chr}.vcf.gz"
        
        echo "Processing chromosome ${chr} with Hap IBD"
        
        # Run Hap IBD
        java -jar "${hap_ibd}" \
            gt="${phased_file}" \
            map="${REFERENCES_DIRECTORY}/genetic_maps/beagle_genetic_maps/plink.chr${chr}.GRCh38.map" \
            lod=4 \
            length=3 \
            out="${RESULTS_DIRECTORY}/segments/${file_basename}_hapibd_chr${chr}.seg" \
            nthreads=4
    done



# Combine all per‐chromosome merge files into one final output
final_output="${RESULTS_DIRECTORY}/${file_basename}_hapibd.seg"
> "${final_output}"   # truncate or create

for chr in {1..22}; do
    chr_merged="${RESULTS_DIRECTORY}/segments/${file_basename}_hapibd_chr${chr}.seg"
    if [[ -f "${chr_merged}" ]]; then
        cat "${chr_merged}" >> "${final_output}"
    else
        echo "⚠️  Missing merged file for chr${chr}: ${chr_merged}"
    fi
done

echo "✅  All chromosomes merged into: ${final_output}"

# 4. Clean up intermediate files
for run in {1..3}; do
    rm -f "${RESULTS_DIRECTORY}/segments/${file_basename}_hapibd_chr${chr}.ibd.gz"
    rm -f "${RESULTS_DIRECTORY}/segments/${file_basename}_hapibd_chr${chr}.hbd.gz"
    rm -f "${RESULTS_DIRECTORY}/segments/${file_basename}_hapibd_chr${chr}.log"
done

echo "🧹  Cleaned up all temporary segment files."

In [None]:
# Check HapIBD output
hap_ibd_output = os.path.join(results_directory, "merged_opensnps_autosomes_ped_sim_hapibd.seg")
if os.path.exists(hap_ibd_output):
    print(f"HapIBD output found: {hap_ibd_output}")
    print(f"File size: {os.path.getsize(hap_ibd_output)} bytes")
else:
    print(f"HapIBD output not found: {hap_ibd_output}")

In [None]:
# Attempt to convert this notebook to PDF using the Poetry environment
# Note: The notebook's CWD should be the project's base directory now.
print(f"Current directory for conversion: {os.getcwd()}")
notebook_path = "labs/Lab8_Evaluate_IBD_Detection.ipynb"

if Path(notebook_path).exists():
    print(f"Converting {notebook_path} to PDF...")
    !poetry run jupyter nbconvert --to pdf "{notebook_path}"
else:
    print(f"Error: Notebook not found at relative path {notebook_path} from CWD {os.getcwd()}")