In [None]:
import os
from collections import Counter
import logging
import sys
from pathlib import Path
import subprocess
import os
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML
import IPython
import pandas as pd
import boto3

from dotenv import load_dotenv

In [None]:
def find_comp_gen_dir():
    """Find the computational_genetic_genealogy directory by searching up from current directory."""
    current = Path.cwd()
    
    # Search up through parent directories
    while current != current.parent:
        # Check if target directory exists in current path
        target = current / 'computational_genetic_genealogy'
        if target.is_dir():
            return target
        # Move up one directory
        current = current.parent
    
    raise FileNotFoundError("Could not find computational_genetic_genealogy directory")

def load_env_file():
    """Find and load the .env file from the computational_genetic_genealogy directory."""
    try:
        # Find the computational_genetic_genealogy directory
        comp_gen_dir = find_comp_gen_dir()
        
        # Look for .env file
        env_path = comp_gen_dir / '.env'
        if not env_path.exists():
            print(f"Warning: No .env file found in {comp_gen_dir}")
            return None
        
        # Load the .env file
        load_dotenv(env_path, override=True)
        print(f"Loaded environment variables from: {env_path}")
        return env_path
        
    except FileNotFoundError as e:
        print(f"Error: {e}")
        return None

# Use the function
env_path = load_env_file()

working_directory = os.getenv('PROJECT_WORKING_DIR', default=None)
data_directory = os.getenv('PROJECT_DATA_DIR', default=None)
references_directory = os.getenv('PROJECT_REFERENCES_DIR', default=None)
results_directory = os.getenv('PROJECT_RESULTS_DIR', default=None)
utils_directory = os.getenv('PROJECT_UTILS_DIR', default=None)

os.environ["WORKING_DIRECTORY"] = working_directory
os.environ["DATA_DIRECTORY"] = data_directory
os.environ["REFERENCES_DIRECTORY"] = references_directory
os.environ["RESULTS_DIRECTORY"] = results_directory
os.environ["UTILS_DIRECTORY"] = utils_directory

print(f"Working Directory: {working_directory}")
print(f"Data Directory: {data_directory}")
print(f"References Directory: {references_directory}")
print(f"Results Directory: {results_directory}")
print(f"Utils Directory: {utils_directory}")

os.chdir(working_directory)
print(f"The current directory is {os.getcwd()}")

In [None]:
def configure_logging(log_filename, log_file_debug_level="INFO", console_debug_level="INFO"):
    """
    Configure logging for both file and console handlers.

    Args:
        log_filename (str): Path to the log file where logs will be written.
        log_file_debug_level (str): Logging level for the file handler.
        console_debug_level (str): Logging level for the console handler.
    """
    # Create a root logger
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)  # Capture all messages at the root level

    # Convert level names to numeric levels
    file_level = getattr(logging, log_file_debug_level.upper(), logging.INFO)
    console_level = getattr(logging, console_debug_level.upper(), logging.INFO)

    # File handler: Logs messages at file_level and above to the file
    file_handler = logging.FileHandler(log_filename)
    file_handler.setLevel(file_level)
    file_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    file_handler.setFormatter(file_formatter)

    # Console handler: Logs messages at console_level and above to the console
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setLevel(console_level)
    console_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    console_handler.setFormatter(console_formatter)

    # Add handlers to the root logger
    logger.addHandler(file_handler)
    logger.addHandler(console_handler)
    
def clear_logger():
    """Remove all handlers from the root logger."""
    logger = logging.getLogger()
    for handler in logger.handlers[:]:
        logger.removeHandler(handler)

In [None]:
log_filename = os.path.join(results_directory, "lab6.log")
print(f"The Lab 6 log file is located at {log_filename}.")

# Ensure the results_directory exists
if not os.path.exists(results_directory):
    os.makedirs(results_directory)

# Check if the file exists; if not, create it
if not os.path.exists(log_filename):
    with open(log_filename, 'w') as file:
        pass  # The file is now created.
    
clear_logger() # Clear the logger before reconfiguring it
configure_logging(log_filename, log_file_debug_level="INFO", console_debug_level="INFO")

### Run the hap-IBD Detection Algorithm

**Select your VCF file**

In the next cell, uncomment the file you want to use.

In [24]:
# vcf_file = os.path.join(results_directory, "merged_sample_autosomes_unphased.vcf.gz")
# vcf_directory = os.path.join(results_directory, "real_data_autosomes")

vcf_file = os.path.join(results_directory, "ped_sim_run2_autosomes.vcf.gz")
vcf_directory = os.path.join(results_directory, "ped_sim_run2_autosomes")

In [25]:
%%bash -s "$vcf_file" "$vcf_directory"

vcf_file="$1"
vcf_directory="$2"

# Define the hap-IBD executable path
hap_ibd="${UTILS_DIRECTORY}/hap-ibd.jar"

# Ensure the hap-IBD executable exists
if [[ ! -f "${hap_ibd}" ]]; then
    echo "Error: Hap-IBD executable not found: ${hap_ibd}" >&2
fi

# Extract the file prefix (removing .vcf.gz extension)
output_prefix="${vcf_file%.vcf.gz}"
# Get base name of the VCF file
base_name=$(basename "$output_prefix")
# base_name remove "_autosomes" suffixif present
base_name=${base_name/_autosomes/}


# Create the output directories
mkdir -p "${vcf_directory}/segments"
mkdir -p "${vcf_directory}/phased_samples"

# Run hap-IBD analysis in loop by chromosome
for chr in {1..22}; do
    data_file="${vcf_directory}/phased_samples/${base_name}_phased_chr${chr}.vcf.gz"
    echo "data_file: ${data_file}"
    if [[ ! -f "${data_file}" ]]; then
        echo "No matching VCF file found" >&2
        exit 1
    fi

    java -jar "${hap_ibd}" gt="${data_file}" \
        map="${REFERENCES_DIRECTORY}/genetic_maps/beagle_genetic_maps/plink.chr${chr}.GRCh38.map" \
        out="${vcf_directory}/segments/${base_name}_hapibd_chr${chr}.seg" \
        nthreads=4
done


if [[ $? -eq 0 ]]; then
    echo "hap-IBD analysis completed successfully."
else
    echo "Error running hap-IBD analysis." >&2
fi


data_file: /home/lakishadavid/computational_genetic_genealogy/results/ped_sim_run2_autosomes/phased_samples/ped_sim_run2_phased_chr1.vcf.gz
Copyright (C) 2019-2023 Brian L. Browning
Enter "java -jar hap-ibd.jar" to print a list of command line arguments

Program            :  hap-ibd.jar  [ version 1.0, 15Jun23.92f ]
Start Time         :  09:18 AM CST on 05 Mar 2025
Max Memory         :  1948 MB

Parameters
  gt               :  /home/lakishadavid/computational_genetic_genealogy/results/ped_sim_run2_autosomes/phased_samples/ped_sim_run2_phased_chr1.vcf.gz
  map              :  /home/lakishadavid/computational_genetic_genealogy/references/genetic_maps/beagle_genetic_maps/plink.chr1.GRCh38.map
  out              :  /home/lakishadavid/computational_genetic_genealogy/results/ped_sim_run2_autosomes/segments/ped_sim_run2_hapibd_chr1.seg
  min-seed         :  2.0
  max-gap          :  1000
  min-extend       :  1.0
  min-output       :  2.0
  min-markers      :  100
  min-mac          :  2
  

In [26]:
%%bash -s "$vcf_file" "$vcf_directory"

vcf_file="$1"
vcf_directory="$2"

# Define the hap-IBD executable path
hap_ibd="${UTILS_DIRECTORY}/hap-ibd.jar"

# Ensure the hap-IBD executable exists
if [[ ! -f "${hap_ibd}" ]]; then
    echo "Error: Hap-IBD executable not found: ${hap_ibd}" >&2
fi

# Extract the file prefix (removing .vcf.gz extension)
output_prefix="${vcf_file%.vcf.gz}"
# Get base name of the VCF file
base_name=$(basename "$output_prefix")
# base_name remove "_autosomes" suffixif present
base_name=${base_name/_autosomes/}

# Create the output directories
mkdir -p "${vcf_directory}/segments"
mkdir -p "${vcf_directory}/phased_samples"

# Create or empty the merged output file
: > "${vcf_directory}/segments/${base_name}_hapibd.seg"

for chr in {1..22}; do
    file1="${vcf_directory}/segments/${base_name}_hapibd_chr${chr}.seg.ibd.gz"
    if [[ -f "${file1}" ]]; then
        zcat "${file1}" >> "${vcf_directory}/segments/${base_name}_hapibd.seg"
    else
        echo "Warning: File for chromosome ${chr} not found during concatenation." >&2
    fi
done

# Remove temporary files using a loop
for chr in {1..22}; do
    rm -f "${vcf_directory}/segments/${base_name}_hapibd_chr${chr}.seg.ibd.gz"
    rm -f "${vcf_directory}/segments/${base_name}_hapibd_chr${chr}.seg.hbd.gz"
    rm -f "${vcf_directory}/segments/${base_name}_hapibd_chr${chr}.seg.log"
done

# note, this still leaves the hbd and log files

### Explore The Segments Results

In [30]:
prefix = vcf_file.split(".vcf.gz")[0]
print(prefix)
base_name = os.path.basename(prefix)
print(base_name)

/home/lakishadavid/computational_genetic_genealogy/results/ped_sim_run2_autosomes
ped_sim_run2_autosomes


In [33]:
# Path to segments file
segments = os.path.join(results_directory, f"{base_name}/segments/{base_name}_hapibd.seg")
print(segments)

segments_temp = pd.read_csv(segments, sep="\t", header=None)
segments_temp.columns = [
    "id1", "sample1_haplotype", "id2", "sample2_haplotype",
    "chrom", "phys_start_pos", "phys_end_pos", 
    "genetic_length"
    ]
segments = segments_temp.sort_values(
    by=["chrom", "phys_start_pos", "phys_end_pos"],
    ascending=[True, True, True]
)
segments = segments.reset_index(drop=True)
output_file = os.path.join(results_directory, "merged_opensnps_autosomes_hapibd.csv")
segments.to_csv(output_file, sep="\t", index=False, header=False)
segments.info()

/home/lakishadavid/computational_genetic_genealogy/results/ped_sim_run2_autosomes/segments/ped_sim_run2_autosomes_hapibd.seg


EmptyDataError: No columns to parse from file

In [None]:
segments.head() # You can enter a number greater than 5 to view more rows

In [None]:
segments["genetic_length"].describe()

In [None]:
# filter segments on min_length=7, min_markers=436, max_error_density=0.004,

filtered_segments_7cM = segments[segments["genetic_length"] >= 7].copy()

filtered_segments_7cM["genetic_length"].describe()

In [None]:
filtered_segments_20cM = segments[segments["genetic_length"] >= 20].copy()
filtered_segments_20cM["genetic_length"].describe()

Let's aggregate the data by pairs instead of looking at it by segment.

In [None]:
import pandas as pd
import numpy as np

# First ensure id1 and id2 are consistently ordered
filtered_segments_7cM[["id1", "id2"]] = filtered_segments_7cM.apply(
    lambda row: pd.Series((row["id1"], row["id2"])) if row["id1"] < row["id2"] 
    else pd.Series((row["id2"], row["id1"])), axis=1
)

pair_counts = filtered_segments_7cM.groupby(["id1", "id2"]).size().reset_index(name="pair_count")
pair_count_distribution = pair_counts["pair_count"].value_counts().reset_index()
pair_count_distribution.columns = ["Number of Segments", "Number of Pairs"]
pair_count_distribution = pair_count_distribution.reset_index(drop=True)
display(pair_count_distribution.style.hide(axis="index"))

In [None]:
# Group by id pairs and calculate all metrics at once
aggregated_segments = filtered_segments_7cM.groupby(["id1", "id2"]).agg(
    total_genetic_length=("genetic_length", "sum"),
    num_segments=("genetic_length", "count"),
    largest_segment=("genetic_length", "max")
).reset_index()

# Check distribution of values
display(aggregated_segments.describe())

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Define columns to plot
columns = ["total_genetic_length", "num_segments", "largest_segment"]

# Create subplots
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Histogram for each metric
for i, col in enumerate(columns):
    sns.histplot(aggregated_segments[col], bins=30, kde=True, ax=axes[i], edgecolor="black")
    axes[i].set_title(f"Distribution of {col.replace('_', ' ').title()}")
    axes[i].set_xlabel(col.replace('_', ' ').title())
    axes[i].set_ylabel("Frequency")

plt.tight_layout()
plt.show()

# Box Plot
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for i, col in enumerate(columns):
    sns.boxplot(y=aggregated_segments[col], ax=axes[i])
    axes[i].set_title(f"Box Plot of {col.replace('_', ' ').title()}")
    axes[i].set_ylabel(col.replace('_', ' ').title())

plt.tight_layout()
plt.show()
