In [1]:
import os
from collections import Counter
import logging
import sys
from pathlib import Path
import subprocess
import os
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML
import IPython
import pandas as pd
import boto3

from dotenv import load_dotenv

In [2]:
def find_comp_gen_dir():
    """Locate the ``computational_genetic_genealogy`` directory.

    Starting at the current working directory and moving up through every
    ancestor, return the first ``<ancestor>/computational_genetic_genealogy``
    that is a directory.

    Returns:
        pathlib.Path: Path of the directory that was found.

    Raises:
        FileNotFoundError: If neither the current directory nor any of its
            ancestors contains such a directory.
    """
    start = Path.cwd()

    # ``Path.parents`` runs all the way to the filesystem root, so the root is
    # checked as well. A ``while current != current.parent`` loop stops one
    # level early and never looks directly under the root.
    for ancestor in (start, *start.parents):
        target = ancestor / 'computational_genetic_genealogy'
        if target.is_dir():
            return target

    raise FileNotFoundError("Could not find computational_genetic_genealogy directory")

def load_env_file():
    """Load the ``.env`` file that lives in the computational_genetic_genealogy directory.

    Returns:
        The path of the ``.env`` file that was loaded, or ``None`` when the
        project directory or the ``.env`` file could not be found. A message
        is printed in every case.
    """
    try:
        project_dir = find_comp_gen_dir()
        dotenv_file = project_dir / '.env'

        if dotenv_file.exists():
            # override=True: values from the file replace variables already set
            load_dotenv(dotenv_file, override=True)
            print(f"Loaded environment variables from: {dotenv_file}")
            return dotenv_file

        print(f"Warning: No .env file found in {project_dir}")
        return None

    except FileNotFoundError as e:
        print(f"Error: {e}")
        return None

# Load the project's .env file and keep the returned path for reference
env_path = load_env_file()

Loaded environment variables from: /home/lakishadavid/computational_genetic_genealogy/.env


In [3]:
# Project locations are read from environment variables.
working_directory = os.getenv('PROJECT_WORKING_DIR')
data_directory = os.getenv('PROJECT_DATA_DIR')
references_directory = os.getenv('PROJECT_REFERENCES_DIR')
results_directory = os.getenv('PROJECT_RESULTS_DIR')
utils_directory = os.getenv('PROJECT_UTILS_DIR')

# Fail fast with a readable message. Otherwise an unset variable only shows up
# later as a TypeError from os.chdir(None) or os.path.join(None, ...).
_missing_env_vars = [
    env_name
    for env_name, env_value in (
        ('PROJECT_WORKING_DIR', working_directory),
        ('PROJECT_DATA_DIR', data_directory),
        ('PROJECT_REFERENCES_DIR', references_directory),
        ('PROJECT_RESULTS_DIR', results_directory),
        ('PROJECT_UTILS_DIR', utils_directory),
    )
    if not env_value
]
if _missing_env_vars:
    raise EnvironmentError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_env_vars)
        + ". Check that the .env file was found and defines them."
    )

print(f"Working Directory: {working_directory}")
print(f"Data Directory: {data_directory}")
print(f"References Directory: {references_directory}")
print(f"Results Directory: {results_directory}")
print(f"Utils Directory: {utils_directory}")

# Later cells build paths from the directories above; the notebook itself runs
# from the working directory.
os.chdir(working_directory)
print(f"The current directory is {os.getcwd()}")

Working Directory: /home/lakishadavid/computational_genetic_genealogy
Data Directory: /home/lakishadavid/computational_genetic_genealogy/data
References Directory: /home/lakishadavid/computational_genetic_genealogy/references
Results Directory: /home/lakishadavid/computational_genetic_genealogy/results
Utils Directory: /home/lakishadavid/computational_genetic_genealogy/utils
The current directory is /home/lakishadavid/computational_genetic_genealogy


In [4]:
def configure_logging(log_filename, log_file_debug_level="INFO", console_debug_level="INFO"):
    """
    Configure logging for both file and console handlers.

    The root logger is set to DEBUG so that each handler's own level decides
    what it emits. Every call adds a new file handler and a new console
    handler to the root logger; remove existing handlers first if the function
    is called more than once.

    Args:
        log_filename (str): Path to the log file where logs will be written.
        log_file_debug_level (str | int): Logging level for the file handler,
            as a level name (case-insensitive) or a numeric level.
        console_debug_level (str | int): Logging level for the console handler,
            as a level name (case-insensitive) or a numeric level.

    Unrecognised level names fall back to ``logging.INFO``.
    """
    def _resolve_level(level):
        """Translate a level name or number into a numeric logging level."""
        if isinstance(level, int):
            return level
        candidate = getattr(logging, str(level).upper(), None)
        # The logging module also exposes non-level upper-case attributes
        # (e.g. BASIC_FORMAT is a string); only accept genuine numeric levels.
        return candidate if isinstance(candidate, int) else logging.INFO

    # Create a root logger
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)  # Capture all messages at the root level

    # Convert level names to numeric levels
    file_level = _resolve_level(log_file_debug_level)
    console_level = _resolve_level(console_debug_level)

    log_format = '%(asctime)s - %(levelname)s - %(message)s'

    # File handler: Logs messages at file_level and above to the file
    file_handler = logging.FileHandler(log_filename)
    file_handler.setLevel(file_level)
    file_handler.setFormatter(logging.Formatter(log_format))

    # Console handler: Logs messages at console_level and above to the console
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setLevel(console_level)
    console_handler.setFormatter(logging.Formatter(log_format))

    # Add handlers to the root logger
    logger.addHandler(file_handler)
    logger.addHandler(console_handler)
    
def clear_logger():
    """Remove and close all handlers attached to the root logger.

    Closing matters for file handlers: a handler that is only detached keeps
    its log file open, so re-running the setup cell repeatedly would leak one
    open file handle per run.
    """
    logger = logging.getLogger()
    # Iterate over a copy because removeHandler mutates logger.handlers
    for handler in logger.handlers[:]:
        logger.removeHandler(handler)
        handler.close()

In [5]:
log_filename = os.path.join(results_directory, "lab6.log")
print(f"The Lab 6 log file is located at {log_filename}.")

# Ensure the results_directory exists. exist_ok=True avoids the race between
# checking for the directory and creating it.
os.makedirs(results_directory, exist_ok=True)

# Make sure the log file exists. Append mode creates it when missing and never
# truncates a log left by a previous run.
with open(log_filename, 'a'):
    pass

clear_logger() # Clear the logger before reconfiguring it
configure_logging(log_filename, log_file_debug_level="INFO", console_debug_level="INFO")

The Lab 6 log file is located at /home/lakishadavid/computational_genetic_genealogy/results/lab6.log.


### Run the hap-IBD Detection Algorithm

In [None]:
%%bash -s "$data_directory" "$results_directory" "$utils_directory" "$references_directory"

# Run hap-IBD once per autosome (chr1..chr22) on the phased, sorted VCFs.
# The cell stops with a non-zero exit status at the first problem so that a
# failure on any chromosome is not hidden by later successful runs.

data_directory="$1"
results_directory="$2"
utils_directory="$3"
references_directory="$4"

touch "${results_directory}/merged_opensnps_autosomes_hapibd.seg"

# Define the hap-IBD executable path
hap_ibd="${utils_directory}/hap-ibd.jar"

# Ensure the hap-IBD executable exists; every run below needs it, so stop here
if [[ ! -f "${hap_ibd}" ]]; then
    echo "Error: Hap-IBD executable not found: ${hap_ibd}" >&2
    exit 1
fi

# Run hap-IBD analysis in loop by chromosome
for chr in {1..22}; do
    phased_samples="${results_directory}/phased_samples/merged_opensnps_phased_chr${chr}_sorted.vcf.gz"
    if [[ ! -f "${phased_samples}" ]]; then
        echo "No matching VCF file found: ${phased_samples}" >&2
        exit 1
    fi

    # Check each run individually: testing $? after the loop would only
    # report the status of the last chromosome.
    if ! java -jar "${hap_ibd}" gt="${phased_samples}" \
        map="${references_directory}/genetic_maps/beagle_genetic_maps/plink.chr${chr}.GRCh38.map" \
        out="${results_directory}/merged_opensnps_autosomes_hapibd_chr${chr}.seg" \
        nthreads=4; then
        echo "Error running hap-IBD analysis (chromosome ${chr})." >&2
        exit 1
    fi
done

echo "hap-IBD analysis completed successfully."


Copyright (C) 2019-2023 Brian L. Browning
Enter "java -jar hap-ibd.jar" to print a list of command line arguments

Program            :  hap-ibd.jar  [ version 1.0, 15Jun23.92f ]
Start Time         :  09:36 PM CST on 02 Mar 2025
Max Memory         :  1948 MB

Parameters
  gt               :  /home/lakishadavid/computational_genetic_genealogy/results/phased_samples/merged_opensnps_phased_chr1_sorted.vcf.gz
  map              :  /home/lakishadavid/computational_genetic_genealogy/references/genetic_maps/beagle_genetic_maps/plink.chr1.GRCh38.map
  out              :  /home/lakishadavid/computational_genetic_genealogy/results/merged_opensnps_autosomes_hapibd_chr1.seg
  min-seed         :  2.0
  max-gap          :  1000
  min-extend       :  1.0
  min-output       :  2.0
  min-markers      :  100
  min-mac          :  2
  nthreads         :  4

Statistics
  samples          :  295
  markers          :  82850
  IBD segments     :  26329
  IBD segs/sample  :  89.3
  HBD segments     :  45
  HB

In [None]:
%%bash -s "$data_directory" "$results_directory" "$utils_directory" "$references_directory"

# Concatenate the per-chromosome hap-IBD segment files into one merged file,
# then delete the per-chromosome outputs (.ibd.gz, .hbd.gz and .log).

data_directory="$1"
results_directory="$2"
utils_directory="$3"
references_directory="$4"

merged="${results_directory}/merged_opensnps_autosomes_hapibd.seg"
merged_tmp="${merged}.tmp"

# Build the merged output in a temporary file. The real file is replaced only
# when there is something to merge, so re-running this cell after the
# per-chromosome files were cleaned up does not wipe the existing result.
: > "${merged_tmp}"
found=0

for chr in {1..22}; do
    file1="${results_directory}/merged_opensnps_autosomes_hapibd_chr${chr}.seg.ibd.gz"
    if [[ -f "${file1}" ]]; then
        if ! zcat "${file1}" >> "${merged_tmp}"; then
            # Keep every input in place so the problem can be investigated
            echo "Error: could not decompress ${file1}" >&2
            rm -f "${merged_tmp}"
            exit 1
        fi
        found=$((found + 1))
    else
        echo "Warning: File for chromosome ${chr} not found during concatenation." >&2
    fi
done

if [[ ${found} -gt 0 ]]; then
    mv "${merged_tmp}" "${merged}"
else
    echo "Warning: No per-chromosome segment files found; leaving ${merged} unchanged." >&2
    rm -f "${merged_tmp}"
    # Still make sure the merged file exists
    [[ -e "${merged}" ]] || : > "${merged}"
fi

# Remove temporary files using a loop (segments, HBD output and logs)
for chr in {1..22}; do
    rm -f "${results_directory}/merged_opensnps_autosomes_hapibd_chr${chr}.seg.ibd.gz"
    rm -f "${results_directory}/merged_opensnps_autosomes_hapibd_chr${chr}.seg.hbd.gz"
    rm -f "${results_directory}/merged_opensnps_autosomes_hapibd_chr${chr}.seg.log"
done

# note: the loop above removes the hbd and log files as well as the ibd files

### Explore the Segment Results

In [8]:
# Read the merged hap-IBD output (tab-separated, no header row) and name its columns
segments_path = os.path.join(results_directory, "merged_opensnps_autosomes_hapibd.seg")
segment_columns = [
    "id1", "sample1_haplotype",
    "id2", "sample2_haplotype",
    "chrom", "phys_start_pos", "phys_end_pos",
    "genetic_length",
]

segments_temp = pd.read_csv(segments_path, sep="\t", header=None)
segments_temp.columns = segment_columns

# Order by chromosome, then start position, then end position (all ascending)
segments = (
    segments_temp
    .sort_values(by=["chrom", "phys_start_pos", "phys_end_pos"], ascending=True)
    .reset_index(drop=True)
)

# Despite the .csv extension, the file is written tab-separated without a header
output_file = os.path.join(results_directory, "merged_opensnps_autosomes_hapibd.csv")
segments.to_csv(output_file, sep="\t", index=False, header=False)
segments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63215 entries, 0 to 63214
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id1                63215 non-null  object 
 1   sample1_haplotype  63215 non-null  int64  
 2   id2                63215 non-null  object 
 3   sample2_haplotype  63215 non-null  int64  
 4   chrom              63215 non-null  int64  
 5   phys_start_pos     63215 non-null  int64  
 6   phys_end_pos       63215 non-null  int64  
 7   genetic_length     63215 non-null  float64
dtypes: float64(1), int64(5), object(2)
memory usage: 3.9+ MB


In [9]:
segments.head(5)  # Pass a larger number to preview more rows

Unnamed: 0,id1,sample1_haplotype,id2,sample2_haplotype,chrom,phys_start_pos,phys_end_pos,genetic_length
0,user2097,2,user4742,2,1,115746,1027226,2.459
1,user3624,2,user4742,2,1,115746,1027226,2.459
2,user4742,2,user4748,2,1,115746,1027226,2.459
3,user4742,2,user5624,1,1,115746,1027226,2.459
4,user4742,2,user5709,2,1,115746,1027226,2.459


In [10]:
# Summary statistics for the genetic_length column
segments.loc[:, "genetic_length"].describe()

count    63215.000000
mean         3.568240
std          2.049659
min          2.000000
25%          2.265000
50%          2.661000
75%          5.442000
max         78.091000
Name: genetic_length, dtype: float64

In [11]:
# Keep only segments with genetic_length >= 7. Length is the only criterion
# applied here: the min_markers=436 and max_error_density=0.004 thresholds
# mentioned in the earlier version of this note are NOT applied by this cell.
min_length_mask = segments["genetic_length"] >= 7
filtered_segments_7cM = segments.loc[min_length_mask].copy()  # copy so later edits do not touch `segments`

filtered_segments_7cM["genetic_length"].describe()

count    887.000000
mean      12.846432
std        7.589422
min        7.005000
25%        8.405500
50%       10.381000
75%       14.724000
max       78.091000
Name: genetic_length, dtype: float64

In [12]:
# Same length-only filter with a stricter threshold of 20
long_segment_mask = segments["genetic_length"].ge(20)
filtered_segments_20cM = segments.loc[long_segment_mask].copy()
filtered_segments_20cM["genetic_length"].describe()

count    97.000000
mean     29.194835
std      11.610397
min      20.192000
25%      21.831000
50%      25.070000
75%      31.415000
max      78.091000
Name: genetic_length, dtype: float64

Let's aggregate the data by pairs instead of looking at it by segment.

In [13]:
import pandas as pd
import numpy as np

# Put every pair in a canonical order (id1 <= id2) so that (A, B) and (B, A)
# fall into the same group. The haplotype columns are swapped together with the
# ids; swapping only the ids would leave sample1_haplotype / sample2_haplotype
# attached to the wrong sample. Column-wise `where` replaces the row-by-row
# `apply`, and the result is a new frame, so re-running the cell is safe.
needs_swap = filtered_segments_7cM["id1"] > filtered_segments_7cM["id2"]
filtered_segments_7cM = filtered_segments_7cM.assign(
    id1=filtered_segments_7cM["id1"].where(~needs_swap, filtered_segments_7cM["id2"]),
    id2=filtered_segments_7cM["id2"].where(~needs_swap, filtered_segments_7cM["id1"]),
    sample1_haplotype=filtered_segments_7cM["sample1_haplotype"].where(
        ~needs_swap, filtered_segments_7cM["sample2_haplotype"]
    ),
    sample2_haplotype=filtered_segments_7cM["sample2_haplotype"].where(
        ~needs_swap, filtered_segments_7cM["sample1_haplotype"]
    ),
)

# Number of segments shared by each pair
pair_counts = filtered_segments_7cM.groupby(["id1", "id2"]).size().reset_index(name="pair_count")

# How many pairs share exactly k segments
pair_count_distribution = pair_counts["pair_count"].value_counts().reset_index()
pair_count_distribution.columns = ["Number of Segments", "Number of Pairs"]
display(pair_count_distribution.style.hide(axis="index"))

Number of Segments,Number of Pairs
1,222
2,13
84,1
170,1
6,1
283,1
93,1
3,1


In [14]:
# Group by id pairs and calculate all metrics at once
aggregated_segments = filtered_segments_7cM.groupby(["id1", "id2"]).agg(
    total_genetic_length=("genetic_length", "sum"),
    num_segments=("genetic_length", "count"),
    largest_segment=("genetic_length", "max")
).reset_index()

# Check distribution of values
display(aggregated_segments.describe())

Unnamed: 0,total_genetic_length,num_segments,largest_segment
count,241.0,241.0,241.0
mean,47.281266,3.680498,10.530502
std,302.768077,22.522988,7.819096
min,7.005,1.0,7.005
25%,7.699,1.0,7.699
50%,9.106,1.0,8.94
75%,11.028,1.0,10.635
max,3285.958,283.0,78.091


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Per-pair metrics to plot, with a human-readable label for each
columns = ["total_genetic_length", "num_segments", "largest_segment"]
labels = {col: col.replace('_', ' ').title() for col in columns}

# Row of histograms (with KDE overlay), one panel per metric
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
for ax, col in zip(axes, columns):
    sns.histplot(aggregated_segments[col], bins=30, kde=True, ax=ax, edgecolor="black")
    ax.set(
        title=f"Distribution of {labels[col]}",
        xlabel=labels[col],
        ylabel="Frequency",
    )

plt.tight_layout()
plt.show()

# Row of box plots for the same metrics
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
for ax, col in zip(axes, columns):
    sns.boxplot(y=aggregated_segments[col], ax=ax)
    ax.set(
        title=f"Box Plot of {labels[col]}",
        ylabel=labels[col],
    )

plt.tight_layout()
plt.show()
