In [None]:
import os
from collections import Counter
import logging
import sys
from pathlib import Path
import subprocess
import os
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML
import IPython
import pandas as pd
import boto3

from dotenv import load_dotenv

In [None]:
def find_comp_gen_dir():
    """Locate the ``computational_genetic_genealogy`` directory.

    Starting at the current working directory, every directory on the way up
    to *and including* the filesystem root is checked for a child directory
    named ``computational_genetic_genealogy``.

    Returns:
        pathlib.Path: The matching directory closest to the current working
        directory.

    Raises:
        FileNotFoundError: If no directory between the current working
            directory and the filesystem root contains it.
    """
    start = Path.cwd()

    # ``start.parents`` ends with the filesystem root, so the root itself is
    # examined as well. This matters when the project sits directly under the
    # root (e.g. in a container) and the notebook is started from inside it.
    for base in (start, *start.parents):
        target = base / 'computational_genetic_genealogy'
        if target.is_dir():
            return target

    raise FileNotFoundError("Could not find computational_genetic_genealogy directory")

def load_env_file():
    """Load ``.env`` from the computational_genetic_genealogy directory.

    Returns:
        pathlib.Path | None: Path of the ``.env`` file that was loaded, or
        ``None`` when the project directory or the ``.env`` file is missing
        (a message is printed in either case).
    """
    try:
        project_dir = find_comp_gen_dir()
        dotenv_file = project_dir / '.env'

        if dotenv_file.exists():
            # override=True: values from the file replace variables that are
            # already present in the process environment.
            load_dotenv(dotenv_file, override=True)
            print(f"Loaded environment variables from: {dotenv_file}")
            return dotenv_file

        print(f"Warning: No .env file found in {project_dir}")
    except FileNotFoundError as err:
        print(f"Error: {err}")

    return None

# Load the project's .env file when this cell runs. env_path holds the path of
# the file that was loaded, or None when load_env_file() could not find the
# project directory or the .env file.
env_path = load_env_file()

In [None]:
# Project locations are read from environment variables; each one is None
# when the corresponding variable is not set.
working_directory = os.getenv('PROJECT_WORKING_DIR', default=None)
data_directory = os.getenv('PROJECT_DATA_DIR', default=None)
references_directory = os.getenv('PROJECT_REFERENCES_DIR', default=None)
results_directory = os.getenv('PROJECT_RESULTS_DIR', default=None)
utils_directory = os.getenv('PROJECT_UTILS_DIR', default=None)

print(f"Working Directory: {working_directory}")
print(f"Data Directory: {data_directory}")
print(f"References Directory: {references_directory}")
print(f"Results Directory: {results_directory}")
print(f"Utils Directory: {utils_directory}")

# Report unset variables up front instead of letting later cells fail with an
# opaque TypeError when they receive None.
_unset_project_vars = [
    name
    for name, value in (
        ('PROJECT_WORKING_DIR', working_directory),
        ('PROJECT_DATA_DIR', data_directory),
        ('PROJECT_REFERENCES_DIR', references_directory),
        ('PROJECT_RESULTS_DIR', results_directory),
        ('PROJECT_UTILS_DIR', utils_directory),
    )
    if not value
]
if _unset_project_vars:
    print(f"Warning: environment variables not set: {', '.join(_unset_project_vars)}")

# os.chdir() cannot work without a working directory, so stop here with an
# actionable message rather than a bare TypeError from os.chdir(None).
if not working_directory:
    raise RuntimeError(
        "PROJECT_WORKING_DIR is not set; make sure the .env file was found and "
        "loaded before running this cell."
    )

os.chdir(working_directory)
print(f"The current directory is {os.getcwd()}")

In [None]:
def configure_logging(log_filename, log_file_debug_level="INFO", console_debug_level="INFO"):
    """
    Attach a file handler and a console (stdout) handler to the root logger.

    The root logger itself is set to DEBUG so that each handler's own level
    decides what is emitted.

    Args:
        log_filename (str): Path to the log file where logs will be written.
        log_file_debug_level (str): Level name for the file handler; names
            that ``logging`` does not define fall back to INFO.
        console_debug_level (str): Level name for the console handler; names
            that ``logging`` does not define fall back to INFO.
    """
    record_format = '%(asctime)s - %(levelname)s - %(message)s'

    def _level_from_name(level_name):
        # Look the upper-cased name up on the logging module, defaulting to INFO.
        return getattr(logging, level_name.upper(), logging.INFO)

    root = logging.getLogger()
    root.setLevel(logging.DEBUG)

    file_level = _level_from_name(log_file_debug_level)
    console_level = _level_from_name(console_debug_level)

    # Build the file handler first, then the console handler; both share the
    # same record layout.
    prepared = []
    for make_handler, level in (
        (lambda: logging.FileHandler(log_filename), file_level),
        (lambda: logging.StreamHandler(sys.stdout), console_level),
    ):
        handler = make_handler()
        handler.setLevel(level)
        handler.setFormatter(logging.Formatter(record_format))
        prepared.append(handler)

    # Register the handlers only after both were configured.
    for handler in prepared:
        root.addHandler(handler)
    
def clear_logger():
    """Remove and close every handler attached to the root logger.

    Closing matters for file handlers: a handler that is only detached keeps
    its log file open, so reconfiguring logging repeatedly would leak one file
    handle per call.
    """
    logger = logging.getLogger()
    # Iterate over a copy because removeHandler() mutates logger.handlers.
    for handler in logger.handlers[:]:
        logger.removeHandler(handler)
        handler.close()

In [None]:
# Lab 7 keeps its log file inside the results directory.
log_filename = os.path.join(results_directory, "lab7.log")
print(f"The Lab 7 log file is located at {log_filename}.")

# Create the results directory when it is not there yet.
if not os.path.exists(results_directory):
    os.makedirs(results_directory)

# Create an empty log file when none exists; an existing file is left untouched.
if not os.path.exists(log_filename):
    open(log_filename, 'w').close()

# Remove whatever handlers are on the root logger, then attach the file and
# console handlers (both at INFO).
clear_logger()
configure_logging(log_filename, "INFO", "INFO")

### Run the Refined-IBD Detection Algorithm

In [None]:
%%bash -s "$data_directory" "$results_directory" "$utils_directory" "$references_directory"

# Refined-IBD pipeline:
#   1. run Refined-IBD three times on each autosome (chr1-chr22),
#   2. merge the segments of the three runs per chromosome with merge-ibd-segments,
#   3. concatenate the per-chromosome results into one final file,
#   4. delete the intermediate per-chromosome/per-run files.
#
# Positional arguments (from the %%bash -s line above):
#   $1 data_directory  $2 results_directory  $3 utils_directory  $4 references_directory

# Stop at the first failing command, unset variable, or failing pipeline stage,
# so a failed java/zcat step can never be followed by the cleanup at the end.
set -euo pipefail

data_directory="$1"
results_directory="$2"
utils_directory="$3"
references_directory="$4"

final_output="${results_directory}/final_merged_opensnps_autosomes_refinedibd.seg"

# Create or empty the final merged output file
: > "${final_output}"

# Define the Refined-IBD and merge-ibd-segments executable paths
refined_ibd="${utils_directory}/refined-ibd.17Jan20.102.jar"
merge_ibd_segments="${utils_directory}/merge-ibd-segments.17Jan20.102.jar"

# Both jars are required; abort before doing any work if either is missing.
if [[ ! -f "${refined_ibd}" ]]; then
    echo "Error: Refined-IBD executable not found: ${refined_ibd}" >&2
    exit 1
fi
if [[ ! -f "${merge_ibd_segments}" ]]; then
    echo "Error: merge-ibd-segments executable not found: ${merge_ibd_segments}" >&2
    exit 1
fi

# Run Refined-IBD analysis in loop by chromosome
# NOTE(review): the three runs are invoked with identical arguments; confirm
# that repeating the run this way is what the analysis intends.
for run in {1..3}; do
    for chr in {1..22}; do
        phased_samples="${results_directory}/phased_samples/merged_opensnps_phased_chr${chr}_sorted.vcf.gz"
        if [[ ! -f "${phased_samples}" ]]; then
            echo "No matching VCF file found: ${phased_samples}" >&2
            exit 1
        fi

        java -jar "${refined_ibd}" gt="${phased_samples}" \
            map="${references_directory}/genetic_maps/beagle_genetic_maps/plink.chr${chr}.GRCh38.map" \
            out="${results_directory}/merged_opensnps_autosomes_refinedibd_chr${chr}_run${run}.seg" \
            nthreads=4
    done
done

# Merge IBD segments for each chromosome
gap_threshold=0.6  # Adjust as needed
discord_threshold=1  # Adjust as needed

for chr in {1..22}; do
    phased_samples="${results_directory}/phased_samples/merged_opensnps_phased_chr${chr}_sorted.vcf.gz"
    genetic_map="${references_directory}/genetic_maps/beagle_genetic_maps/plink.chr${chr}.GRCh38.map"
    merged_output="${results_directory}/merged_opensnps_autosomes_refinedibd_chr${chr}.seg"

    # Concatenate all three runs of Refined IBD for this chromosome and feed
    # them to merge-ibd-segments on stdin.
    zcat "${results_directory}/merged_opensnps_autosomes_refinedibd_chr${chr}_run1.seg.ibd.gz" \
        "${results_directory}/merged_opensnps_autosomes_refinedibd_chr${chr}_run2.seg.ibd.gz" \
        "${results_directory}/merged_opensnps_autosomes_refinedibd_chr${chr}_run3.seg.ibd.gz" | \
    java -jar "${merge_ibd_segments}" "${phased_samples}" "${genetic_map}" "${gap_threshold}" "${discord_threshold}" > "${merged_output}"
done

# Concatenate all merged chromosome-specific IBD files
for chr in {1..22}; do
    merged_file="${results_directory}/merged_opensnps_autosomes_refinedibd_chr${chr}.seg"
    if [[ -f "${merged_file}" ]]; then
        cat "${merged_file}" >> "${final_output}"
    else
        echo "Warning: File for chromosome ${chr} not found during final merging." >&2
    fi
done

# Remove intermediate per-run and per-chromosome files. This point is only
# reached when every step above succeeded (see `set -e`).
if [[ -f "${final_output}" ]]; then
    for chr in {1..22}; do
        rm -f "${results_directory}/merged_opensnps_autosomes_refinedibd_chr${chr}"*.seg.ibd.gz
        rm -f "${results_directory}/merged_opensnps_autosomes_refinedibd_chr${chr}"*.seg.hbd.gz
        rm -f "${results_directory}/merged_opensnps_autosomes_refinedibd_chr${chr}"*.seg.log
        rm -f "${results_directory}/merged_opensnps_autosomes_refinedibd_chr${chr}.seg"
    done
fi

echo "Final merged IBD file: ${final_output}"

### Explore the Segment Results

In [None]:
# Read the concatenated Refined-IBD output: tab-separated, no header row.
segments_path = os.path.join(results_directory, "final_merged_opensnps_autosomes_refinedibd.seg")
segments_temp = pd.read_csv(segments_path, sep="\t", header=None)

# Name the nine columns of the segment table.
segments_temp.columns = [
    "id1", "sample1_haplotype",
    "id2", "sample2_haplotype",
    "chrom", "phys_start_pos", "phys_end_pos",
    "lod_score", "genetic_length",
]

# Order by chromosome, then start and end position (all ascending), and
# renumber the rows.
sort_keys = ["chrom", "phys_start_pos", "phys_end_pos"]
segments = segments_temp.sort_values(by=sort_keys).reset_index(drop=True)

# Save the sorted table (tab-separated, without index or header) and print a
# summary of its columns.
output_file = os.path.join(results_directory, "merged_opensnps_autosomes_refinedibd.csv")
segments.to_csv(output_file, sep="\t", index=False, header=False)
segments.info()

In [None]:
segments.head() # Shows the first 5 rows by default; pass a larger n, e.g. segments.head(20), to view more

In [None]:
# Summary statistics of the genetic_length column across all detected segments.
segments["genetic_length"].describe()

In [None]:
# Keep only segments whose genetic_length is at least 7; this length threshold
# is the only filter applied in this cell.
long_segment_mask = segments["genetic_length"].ge(7)
filtered_segments_7cM = segments.loc[long_segment_mask].copy()

filtered_segments_7cM["genetic_length"].describe()

In [None]:
import pandas as pd
import numpy as np

# Put every pair into a canonical order (id1 <= id2) so that (A, B) and (B, A)
# are counted as the same pair. This is a vectorized, in-place swap of the
# rows that are out of order; it also works when the filtered table is empty,
# where a row-wise apply() would fail on assignment.
# NOTE(review): only the id columns are swapped; sample1_haplotype and
# sample2_haplotype keep their original positions. Confirm that is acceptable
# for any later use of the haplotype columns.
needs_swap = filtered_segments_7cM["id1"] > filtered_segments_7cM["id2"]
if needs_swap.any():
    filtered_segments_7cM.loc[needs_swap, ["id1", "id2"]] = (
        filtered_segments_7cM.loc[needs_swap, ["id2", "id1"]].to_numpy()
    )

# Number of (>= 7 genetic-length) segments shared by each pair.
pair_counts = filtered_segments_7cM.groupby(["id1", "id2"]).size().reset_index(name="pair_count")

# How many pairs share 1 segment, 2 segments, ... (most common count first).
pair_count_distribution = pair_counts["pair_count"].value_counts().reset_index()
pair_count_distribution.columns = ["Number of Segments", "Number of Pairs"]
display(pair_count_distribution.style.hide(axis="index"))

In [None]:
# Group by id pairs and calculate all metrics at once
aggregated_segments = filtered_segments_7cM.groupby(["id1", "id2"]).agg(
    total_genetic_length=("genetic_length", "sum"),
    num_segments=("genetic_length", "count"),
    largest_segment=("genetic_length", "max")
).reset_index()

# Check distribution of values
display(aggregated_segments.describe())

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Per-pair metrics to visualize, with a human-readable label for each
# (e.g. "total_genetic_length" -> "Total Genetic Length").
columns = ["total_genetic_length", "num_segments", "largest_segment"]
labels = [name.replace('_', ' ').title() for name in columns]

# Row of histograms (with KDE overlay), one panel per metric.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
for ax, col, label in zip(axes, columns, labels):
    sns.histplot(aggregated_segments[col], bins=30, kde=True, ax=ax, edgecolor="black")
    ax.set_title(f"Distribution of {label}")
    ax.set_xlabel(label)
    ax.set_ylabel("Frequency")

plt.tight_layout()
plt.show()

# Row of box plots, one panel per metric.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
for ax, col, label in zip(axes, columns, labels):
    sns.boxplot(y=aggregated_segments[col], ax=ax)
    ax.set_title(f"Box Plot of {label}")
    ax.set_ylabel(label)

plt.tight_layout()
plt.show()