In [None]:
import os
from collections import Counter
import logging
import sys
from pathlib import Path
import subprocess
import os
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML
import IPython
import pandas as pd
import boto3

from dotenv import load_dotenv

In [None]:
def find_comp_gen_dir():
    """Locate the ``computational_genetic_genealogy`` directory.

    Starting at the current working directory, every directory on the path up
    to and including the filesystem root is checked for a child directory
    named ``computational_genetic_genealogy``.

    Returns:
        Path: The first match found, i.e. the one closest to the cwd.

    Raises:
        FileNotFoundError: If no directory on the path contains the target.
    """
    start = Path.cwd()

    # ``Path.parents`` ends with the filesystem root, so the root itself is
    # searched too (a plain "stop when current == parent" loop skips it).
    for candidate in (start, *start.parents):
        target = candidate / 'computational_genetic_genealogy'
        if target.is_dir():
            return target

    raise FileNotFoundError("Could not find computational_genetic_genealogy directory")

def load_env_file():
    """Load ``.env`` from the computational_genetic_genealogy directory.

    Values from the file override variables already present in the
    environment (``override=True``).

    Returns:
        The path of the loaded ``.env`` file, or ``None`` when the project
        directory or the ``.env`` file cannot be found. A message is printed
        in every case.
    """
    try:
        project_dir = find_comp_gen_dir()
        dotenv_file = project_dir / '.env'

        if dotenv_file.exists():
            load_dotenv(dotenv_file, override=True)
            print(f"Loaded environment variables from: {dotenv_file}")
            return dotenv_file

        print(f"Warning: No .env file found in {project_dir}")
        return None

    except FileNotFoundError as e:
        # Raised by find_comp_gen_dir() when the project directory is absent.
        print(f"Error: {e}")
        return None

# Load the project's .env file (if one can be found) before reading settings.
env_path = load_env_file()

working_directory = os.getenv('PROJECT_WORKING_DIR', default=None)
data_directory = os.getenv('PROJECT_DATA_DIR', default=None)
references_directory = os.getenv('PROJECT_REFERENCES_DIR', default=None)
results_directory = os.getenv('PROJECT_RESULTS_DIR', default=None)
utils_directory = os.getenv('PROJECT_UTILS_DIR', default=None)

# Every setting is required below (os.environ only accepts strings and
# os.chdir needs a real path), so fail early with a message that names the
# missing variables instead of an opaque TypeError.
_project_settings = {
    'PROJECT_WORKING_DIR': working_directory,
    'PROJECT_DATA_DIR': data_directory,
    'PROJECT_REFERENCES_DIR': references_directory,
    'PROJECT_RESULTS_DIR': results_directory,
    'PROJECT_UTILS_DIR': utils_directory,
}
_missing_settings = [name for name, value in _project_settings.items() if not value]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_settings)
        + ". Check that the .env file was found and defines them."
    )

# Re-export under the names that the %%bash cells read.
os.environ["WORKING_DIRECTORY"] = working_directory
os.environ["DATA_DIRECTORY"] = data_directory
os.environ["REFERENCES_DIRECTORY"] = references_directory
os.environ["RESULTS_DIRECTORY"] = results_directory
os.environ["UTILS_DIRECTORY"] = utils_directory

print(f"Working Directory: {working_directory}")
print(f"Data Directory: {data_directory}")
print(f"References Directory: {references_directory}")
print(f"Results Directory: {results_directory}")
print(f"Utils Directory: {utils_directory}")

os.chdir(working_directory)
print(f"The current directory is {os.getcwd()}")

In [None]:
def configure_logging(log_filename, log_file_debug_level="INFO", console_debug_level="INFO"):
    """
    Attach a file handler and a stdout handler to the root logger.

    The root logger itself is set to DEBUG so that each handler's own level
    decides what it emits. Level names are matched case-insensitively; a name
    that is not an attribute of the ``logging`` module falls back to INFO.

    Args:
        log_filename (str): Path to the log file where logs will be written.
        log_file_debug_level (str): Logging level for the file handler.
        console_debug_level (str): Logging level for the console handler.
    """
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.DEBUG)

    # Resolve both level names before any handler (and thus the file) is created.
    resolved_levels = [
        getattr(logging, level_name.upper(), logging.INFO)
        for level_name in (log_file_debug_level, console_debug_level)
    ]

    # File handler first, console handler second -- the same order in which
    # they are attached to the root logger.
    handlers = []
    for make_handler, level in zip(
        (lambda: logging.FileHandler(log_filename),
         lambda: logging.StreamHandler(sys.stdout)),
        resolved_levels,
    ):
        handler = make_handler()
        handler.setLevel(level)
        handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
        handlers.append(handler)

    for handler in handlers:
        root_logger.addHandler(handler)
    
def clear_logger():
    """Remove all handlers from the root logger and close each one.

    Closing matters for file handlers: detaching a ``FileHandler`` without
    closing it leaves its log file open, so every reconfiguration would leak
    one file handle.
    """
    logger = logging.getLogger()
    # Iterate over a copy because removeHandler() mutates logger.handlers.
    for handler in logger.handlers[:]:
        logger.removeHandler(handler)
        handler.close()
        
log_filename = os.path.join(results_directory, "lab6.log")
print(f"The Lab 6 log file is located at {log_filename}.")

# Create the results directory when it is not there yet
results_directory_present = os.path.exists(results_directory)
if not results_directory_present:
    os.makedirs(results_directory)

# Touch the log file so it exists before logging is configured
log_file_present = os.path.exists(log_filename)
if not log_file_present:
    with open(log_filename, 'w'):
        pass

# Drop handlers left over from an earlier run of this cell, then attach fresh ones
clear_logger()
configure_logging(
    log_filename,
    log_file_debug_level="INFO",
    console_debug_level="INFO",
)

If you are returning to this point after running subsequent labs, you need the results from the ped-sim notebook and quality control. Alternatively (e.g., if you don't have those results), run the following cell to copy prepared results from the instructor's run of the ped-sim notebook.

In [None]:
# Copy the prepared class_data files and directories into the results directory
import os
import shutil

# Individual files to copy
files_to_copy = [
    "ped_sim_run2-everyone.fam",
    "ped_sim_run2.seg",
    "ped_sim_run2.seg_dict.txt",
    "pedigree.fam",
    "merged_opensnps_data.vcf.gz",
    "merged_opensnps_data.vcf.gz.tbi",
    "merged_opensnps_data_autosomes.vcf.gz",
    "merged_opensnps_data_autosomes.vcf.gz.tbi",
]

# Whole directories to copy (merged into the destination if it already exists)
directories_to_copy = [
    "merged_opensnps_data_autosomes",
]

# Files: copy2 also carries over file metadata such as timestamps
for file in files_to_copy:
    source = os.path.join(data_directory, "class_data", file)
    destination = os.path.join(results_directory, file)
    shutil.copy2(source, destination)
    print(f"Copied {file} to {results_directory}")

# Directories
for directory in directories_to_copy:
    source = os.path.join(data_directory, "class_data", directory)
    destination = os.path.join(results_directory, directory)
    shutil.copytree(source, destination, dirs_exist_ok=True)
    print(f"Copied {directory} to {results_directory}")

**Select your VCF file**

In the next cell, uncomment the file you want to use.

In [None]:
# Exactly one vcf_file / vcf_directory pair below should be uncommented.

# vcf_file = os.path.join(results_directory, "merged_sample_autosomes_unphased.vcf.gz")
# vcf_directory = os.path.join(results_directory, "real_data_autosomes")

# vcf_file = os.path.join(results_directory, "ped_sim_run2_autosomes.vcf.gz")
# vcf_directory = os.path.join(results_directory, "ped_sim_run2_autosomes")

vcf_file = os.path.join(results_directory, "merged_opensnps_data_autosomes.vcf.gz")
vcf_directory = os.path.join(results_directory, "merged_opensnps_data_autosomes")

# Report whether the selected VCF file is present
if os.path.exists(vcf_file):
    print(f"VCF file found: {vcf_file}")
else:
    print(f"VCF file not found: {vcf_file}")

# Report whether the matching VCF directory is present
if os.path.exists(vcf_directory):
    print(f"VCF directory found: {vcf_directory}")
else:
    print(f"VCF directory not found: {vcf_directory}")

### Run the Refined-IBD Detection Algorithm

In [None]:
%%bash -s "$vcf_file" "$vcf_directory" "$results_directory"

# Run Refined-IBD on the per-chromosome phased VCFs, merge the segments of the
# repeated runs per chromosome, and write one combined segment file into the
# results directory.
#
# Positional arguments (supplied on the cell magic line above):
#   $1  path to the selected .vcf.gz file
#   $2  directory that holds phased_samples/ and receives segments/
#   $3  results directory that receives the final combined segment file
# Environment: UTILS_DIRECTORY (jar files), REFERENCES_DIRECTORY (genetic maps).

# Stop on the first failing command, unset variable, or failed pipeline stage
# so a partial run cannot silently leave an incomplete segment file behind.
set -euo pipefail

vcf_file="$1"
vcf_directory="$2"
results_directory="$3"

# Extract the file prefix (removing .vcf.gz extension)
output_prefix="${vcf_file%.vcf.gz}"
# Base name of the VCF file, e.g. "<name>_autosomes"
vcf_base_name=$(basename "${output_prefix}")
# Remove the "_autosomes" suffix if present (suffix only, not any occurrence);
# the phased per-chromosome files below are named without it.
base_name="${vcf_base_name%_autosomes}"

segments_directory="${vcf_directory}/segments"
# Single definition of the final output, used for truncating, appending,
# checking and reporting. For inputs ending in "_autosomes" this equals
# "${results_directory}/${base_name}_autosomes_refinedibd.seg".
final_output="${results_directory}/${vcf_base_name}_refinedibd.seg"

# Define the Refined-IBD executable paths
refined_ibd="${UTILS_DIRECTORY}/refined-ibd.17Jan20.102.jar"
merge_ibd_segments="${UTILS_DIRECTORY}/merge-ibd-segments.17Jan20.102.jar"

# Ensure the Refined-IBD executable exists
if [[ ! -f "${refined_ibd}" ]]; then
    echo "Error: Refined-IBD executable not found: ${refined_ibd}" >&2
    exit 1
fi

# Ensure the Merge-IBD-Segments executable exists
if [[ ! -f "${merge_ibd_segments}" ]]; then
    echo "Error: Merge-IBD-Segments executable not found: ${merge_ibd_segments}" >&2
    exit 1
fi

# Create the directories that receive output files
mkdir -p "${segments_directory}" "${results_directory}"

# Run Refined-IBD analysis in loop by chromosome
# NOTE(review): all three runs use identical arguments. If the runs are meant
# to differ (e.g. by random seed), confirm the option name in the Refined-IBD
# documentation and pass it here.
for run in {1..3}; do
    for chr in {1..22}; do
        phased_samples="${vcf_directory}/phased_samples/${base_name}_phased_chr${chr}.vcf.gz"
        if [[ ! -f "${phased_samples}" ]]; then
            echo "No matching VCF file found: ${phased_samples}" >&2
            exit 1
        fi

        java -jar "${refined_ibd}" gt="${phased_samples}" \
            map="${REFERENCES_DIRECTORY}/genetic_maps/beagle_genetic_maps/plink.chr${chr}.GRCh38.map" \
            lod=4 \
            length=3 \
            out="${segments_directory}/temp_${base_name}_refinedibd_chr${chr}_run${run}.seg" \
            nthreads=4
    done
done

# Merge IBD segments for each chromosome
gap_threshold=0.6  # Adjust as needed
discord_threshold=1  # Adjust as needed

for chr in {1..22}; do
    echo "Processing chromosome ${chr}"
    phased_samples="${vcf_directory}/phased_samples/${base_name}_phased_chr${chr}.vcf.gz"
    genetic_map="${REFERENCES_DIRECTORY}/genetic_maps/beagle_genetic_maps/plink.chr${chr}.GRCh38.map"
    merged_output="${segments_directory}/temp_${base_name}_refinedibd_chr${chr}.seg"

    # Concatenate all three runs of Refined IBD for this chromosome
    zcat "${segments_directory}/temp_${base_name}_refinedibd_chr${chr}_run1.seg.ibd.gz" \
        "${segments_directory}/temp_${base_name}_refinedibd_chr${chr}_run2.seg.ibd.gz" \
        "${segments_directory}/temp_${base_name}_refinedibd_chr${chr}_run3.seg.ibd.gz" | \
    java -jar "${merge_ibd_segments}" "${phased_samples}" "${genetic_map}" "${gap_threshold}" "${discord_threshold}" > "${merged_output}"
done

# Create or empty the final merged output file. This must be the same file
# that is appended to below; otherwise re-running the cell would keep adding
# duplicate segments to it.
: > "${final_output}"

# Concatenate all merged chromosome-specific IBD files
for chr in {1..22}; do
    merged_file="${segments_directory}/temp_${base_name}_refinedibd_chr${chr}.seg"
    if [[ -f "${merged_file}" ]]; then
        cat "${merged_file}" >> "${final_output}"
    else
        echo "Warning: File for chromosome ${chr} not found during final merging." >&2
    fi
done

# Remove intermediate per-run and per-chromosome files once the final file exists
if [[ -f "${final_output}" ]]; then
    for chr in {1..22}; do
        rm -f "${segments_directory}/temp_${base_name}_refinedibd_chr${chr}"*.seg.ibd.gz
        rm -f "${segments_directory}/temp_${base_name}_refinedibd_chr${chr}"*.seg.hbd.gz
        rm -f "${segments_directory}/temp_${base_name}_refinedibd_chr${chr}"*.seg.log
        rm -f "${segments_directory}/temp_${base_name}_refinedibd_chr${chr}.seg"
    done
fi

echo "Final merged IBD file: ${final_output}"

### Explore The Segments Results

In [None]:
# Everything before the first ".vcf.gz" (the whole path if it does not occur)
prefix = vcf_file.partition(".vcf.gz")[0]
print(prefix)

# File-name component of the prefix, used to build the segments file name
base_name = os.path.split(prefix)[1]
print(base_name)

In [None]:
# Path to the merged Refined-IBD segments file for the selected VCF.
# A separate name is used for the path so that `segments` is only ever a DataFrame.
segments_path = os.path.join(results_directory, f"{base_name}_refinedibd.seg")
print(f"The segments file is located at {segments_path}")

# Later cells need the `segments` DataFrame, so stop here with a clear error
# rather than letting them fail on a leftover string.
if not os.path.exists(segments_path):
    raise FileNotFoundError(f"Cannot proceed without segments file: {segments_path}")

# The file has no header row; name the nine tab-separated columns explicitly.
segments_temp = pd.read_csv(segments_path, sep="\t", header=None)
segments_temp.columns = [
    "id1", "sample1_haplotype", "id2", "sample2_haplotype",
    "chrom", "phys_start_pos", "phys_end_pos",
    "lod_score", "genetic_length"
]

# Order by position and renumber the rows.
segments = (
    segments_temp
    .sort_values(by=["chrom", "phys_start_pos", "phys_end_pos"], ascending=[True, True, True])
    .reset_index(drop=True)
)

# Save the sorted table next to the input, named after the selected data set
# (tab-separated, no header, same layout as the input).
output_file = os.path.join(results_directory, f"{base_name}_refinedibd.csv")
segments.to_csv(output_file, sep="\t", index=False, header=False)
segments.info()

In [None]:
# Shows the first 5 rows; pass a larger number to view more rows
segments.head(5)

In [None]:
# Summary statistics of the genetic length over all detected segments
segments.loc[:, "genetic_length"].describe()

In [None]:
# Keep only segments with a genetic length of at least 7.
# Only this length threshold is applied here; no marker-count or
# error-density filter is used in this cell.
is_long_enough = segments["genetic_length"].ge(7)
filtered_segments_7cM = segments.loc[is_long_enough].copy()

filtered_segments_7cM["genetic_length"].describe()

In [None]:
import pandas as pd
import numpy as np

# Put every pair in a canonical order (id1 <= id2) so that (A, B) and (B, A)
# are counted as the same pair. This is done with one vectorized assignment
# instead of a row-wise apply, which is much slower on large tables and fails
# on an empty one.
needs_swap = filtered_segments_7cM["id1"] > filtered_segments_7cM["id2"]

# Swap the ids and, together with them, the haplotype columns, so that
# sample1_haplotype still belongs to id1 and sample2_haplotype to id2.
# .to_numpy() drops the column labels; without it pandas would align on
# column names and undo the swap.
for first_col, second_col in (("id1", "id2"), ("sample1_haplotype", "sample2_haplotype")):
    filtered_segments_7cM.loc[needs_swap, [first_col, second_col]] = (
        filtered_segments_7cM.loc[needs_swap, [second_col, first_col]].to_numpy()
    )

# Number of segments per pair, then how many pairs share each segment count.
pair_counts = filtered_segments_7cM.groupby(["id1", "id2"]).size().reset_index(name="pair_count")
pair_count_distribution = pair_counts["pair_count"].value_counts().reset_index()
pair_count_distribution.columns = ["Number of Segments", "Number of Pairs"]
pair_count_distribution = pair_count_distribution.reset_index(drop=True)
display(pair_count_distribution.style.hide(axis="index"))

In [None]:
# One row per (id1, id2) pair: summed length, segment count and longest segment
lengths_by_pair = filtered_segments_7cM.groupby(["id1", "id2"])["genetic_length"]
aggregated_segments = lengths_by_pair.agg(
    total_genetic_length="sum",
    num_segments="count",
    largest_segment="max",
).reset_index()

# Summary statistics of the per-pair metrics
display(aggregated_segments.describe())

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Per-pair metrics to plot, one panel each
columns = ["total_genetic_length", "num_segments", "largest_segment"]

# Human-readable label per metric, e.g. "Total Genetic Length"
metric_labels = [col.replace('_', ' ').title() for col in columns]

# Histograms (with KDE overlay), one panel per metric
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
for ax, col, label in zip(axes, columns, metric_labels):
    sns.histplot(aggregated_segments[col], bins=30, kde=True, ax=ax, edgecolor="black")
    ax.set_title(f"Distribution of {label}")
    ax.set_xlabel(label)
    ax.set_ylabel("Frequency")

plt.tight_layout()
plt.show()

# Box plots, one panel per metric
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
for ax, col, label in zip(axes, columns, metric_labels):
    sns.boxplot(y=aggregated_segments[col], ax=ax)
    ax.set_title(f"Box Plot of {label}")
    ax.set_ylabel(label)

plt.tight_layout()
plt.show()