In [None]:
import os
from collections import Counter
import logging
import sys
from pathlib import Path
import subprocess
import os
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML
import IPython
import pandas as pd
import boto3

from dotenv import load_dotenv

In [None]:
# Environment setup for cross-compatibility.
# Both helpers come from the project's scripts_support package; is_jupyterlite
# is imported here but not called in this cell.
from scripts_support.lab_cross_compatibility import setup_environment, is_jupyterlite

# Set up environment-specific paths.
# NOTE(review): presumably both values are directory paths (str or Path) —
# confirm in scripts_support/lab_cross_compatibility.py.
DATA_DIR, RESULTS_DIR = setup_environment()

# Now you can use DATA_DIR and RESULTS_DIR consistently across environments


In [None]:
# Directory names used by the remaining cells of this notebook.
# setup_environment() already ran in the previous cell, so it is not repeated here.

# The later Python cells refer to the results folder as `results_directory`.
results_directory = RESULTS_DIR

# The %%bash cells read REFERENCES_DIRECTORY from the environment, and the
# Python cells must point at the same folder so both sides see the same maps.
# NOTE(review): assumes REFERENCES_DIRECTORY is exported before this cell runs
# (by the shell, a .env loader, or setup_environment()) — confirm against the
# course environment setup.
references_directory = os.environ.get("REFERENCES_DIRECTORY")
if not references_directory:
    raise EnvironmentError(
        "REFERENCES_DIRECTORY is not set; it is needed to locate the genetic maps."
    )


In [None]:
def configure_logging(log_filename, log_file_debug_level="INFO", console_debug_level="INFO"):
    """
    Attach a file handler and a stdout handler to the root logger.

    The root logger itself is set to DEBUG so that each handler can apply its
    own threshold independently.

    Args:
        log_filename (str): Path to the log file where logs will be written.
        log_file_debug_level (str): Level name (case-insensitive) for the file
            handler; names that `logging` does not define fall back to INFO.
        console_debug_level (str): Level name (case-insensitive) for the console
            handler; names that `logging` does not define fall back to INFO.
    """
    def resolve(level_name):
        # e.g. "debug" -> logging.DEBUG; unknown names -> logging.INFO
        return getattr(logging, level_name.upper(), logging.INFO)

    root = logging.getLogger()
    root.setLevel(logging.DEBUG)  # let the handlers do the filtering

    # Resolve both thresholds before any handler (and therefore the file) is created
    file_threshold = resolve(log_file_debug_level)
    console_threshold = resolve(console_debug_level)

    record_layout = '%(asctime)s - %(levelname)s - %(message)s'
    targets = (
        (logging.FileHandler(log_filename), file_threshold),
        (logging.StreamHandler(sys.stdout), console_threshold),
    )

    # Configure every handler first, then register them in file -> console order
    for handler, threshold in targets:
        handler.setLevel(threshold)
        handler.setFormatter(logging.Formatter(record_layout))
    for handler, _ in targets:
        root.addHandler(handler)
    
def clear_logger():
    """
    Remove all handlers from the root logger and close them.

    Closing matters for file handlers: without it, every re-run of the logging
    setup cell would leave another open handle on the log file.
    """
    logger = logging.getLogger()
    # Iterate over a copy because removeHandler mutates logger.handlers
    for handler in logger.handlers[:]:
        logger.removeHandler(handler)
        handler.close()

In [None]:
log_filename = os.path.join(results_directory, "lab5.log")

# Create the results directory up front; exist_ok avoids a check-then-create race
os.makedirs(results_directory, exist_ok=True)
print(f"The Lab 5 log file is located at {log_filename}.")

# Start from a clean root logger so re-running this cell does not duplicate handlers
clear_logger()

# configure_logging opens the log file through logging.FileHandler, which
# creates the file when it does not exist, so no separate creation step is needed
configure_logging(log_filename, log_file_debug_level="INFO", console_debug_level="INFO")

### Genetic Map

Use your existing Beagle genetic maps to create the genetic maps for IBIS. If you already have these genetic maps, you do not need to rerun these cells to download the genetic maps again.

In [None]:
def preprocess_ibis_map(references_dir=None):
    """
    Convert every Beagle genetic map (*.map) into the column order used for IBIS.

    Input lines are read as  `CHR . GENETIC_POSITION PHYSICAL_POSITION`
    and written as           `CHR PHYSICAL_POSITION GENETIC_POSITION 0`
    (space separated), one output file per input file, with the same file name,
    under `<references>/genetic_maps/ibis_genetic_maps`.

    The conversion is done in Python rather than through a shell `awk` command,
    so it also works when file paths contain spaces and in environments without
    a shell.

    Args:
        references_dir (str | None): Folder containing
            `genetic_maps/beagle_genetic_maps`. Defaults to the notebook-level
            `references_directory` variable.

    Raises:
        ValueError: If a non-blank line has fewer than four whitespace-separated
            fields. The partially written output file is left in place.
    """
    if references_dir is None:
        # Same source the original cell used
        references_dir = references_directory

    beagle_map_dir = os.path.join(references_dir, "genetic_maps/beagle_genetic_maps")
    ibis_map_dir = os.path.join(references_dir, "genetic_maps/ibis_genetic_maps")
    os.makedirs(ibis_map_dir, exist_ok=True)

    # sorted() only makes the progress output deterministic
    for map_file in sorted(os.listdir(beagle_map_dir)):
        if not map_file.endswith(".map"):
            continue

        beagle_map_filename = os.path.join(beagle_map_dir, map_file)
        ibis_map_filename = os.path.join(ibis_map_dir, map_file)
        print(f"Processing {beagle_map_filename} to create IBIS map...")

        with open(beagle_map_filename) as source, open(ibis_map_filename, "w") as target:
            for line_number, line in enumerate(source, start=1):
                fields = line.split()
                if not fields:
                    continue  # blank line: nothing to convert
                if len(fields) < 4:
                    raise ValueError(
                        f"{beagle_map_filename}, line {line_number}: "
                        f"expected at least 4 fields, found {len(fields)}"
                    )
                chrom, _, genetic_position, physical_position = fields[:4]
                # Swap columns 3 and 4; the trailing 0 fills the rate column
                target.write(f"{chrom} {physical_position} {genetic_position} 0\n")

    print("All Beagle genetic maps converted to IBIS format.")

In [None]:
# Build the output directory tree; creating the nested folder also creates its parent
genetic_maps_dir = os.path.join(references_directory, "genetic_maps")
ibis_genetic_maps = os.path.join(genetic_maps_dir, "ibis_genetic_maps")
os.makedirs(ibis_genetic_maps, exist_ok=True)

# Reference build label; recorded here but not passed to preprocess_ibis_map()
assembly = "GRCh38"

# Convert the Beagle maps into the IBIS layout
preprocess_ibis_map()

# # Alternative source
# plink2_genetic_map_url="https://alkesgroup.broadinstitute.org/Eagle/downloads/tables/genetic_map_hg38_withX.txt.gz"

**Visual Inspection**

The above code should have created a set of genetic map files in the format that IBIS uses. Look in your `references/genetic_maps` directory and check for the `ibis_genetic_maps` subdirectory and the individual per-chromosome files within `ibis_genetic_maps`. Open the chromosome 1 file of both `ibis_genetic_maps` and `beagle_genetic_maps`. Compare them visually. How are they similar? How are they different?

### Change the format of the data files from VCF to BED.

**Select your VCF file**

In the next cell, uncomment the file you want to use.

In [None]:
# Option 1 - real data. Uncomment both lines (and comment out Option 2) to use it.
# vcf_file = os.path.join(results_directory, "merged_sample_autosomes_unphased.vcf.gz")
# vcf_directory = os.path.join(results_directory, "real_data_autosomes")

# Option 2 - simulated pedigree data (currently active).
# vcf_file is the input VCF; vcf_directory is where this dataset's outputs are written.
vcf_file = os.path.join(results_directory, "ped_sim_run2_autosomes.vcf.gz")
vcf_directory = os.path.join(results_directory, "ped_sim_run2_autosomes")

In [None]:
%%bash -s "$vcf_file" "$vcf_directory"

# Convert the selected VCF to PLINK binary format (.bed/.bim/.fam).
# Arguments: $1 = path to the .vcf.gz file, $2 = output directory for this dataset.
# Environment: UTILS_DIRECTORY is consulted as a fallback location for plink2.
vcf_file="$1"
vcf_directory="$2"

# Extract the file prefix (removing .vcf.gz extension)
output_prefix="${vcf_file%.vcf.gz}"
# Get base name of the VCF file
base_name=$(basename "$output_prefix")
echo "base_name: ${base_name}"

# Create the output directories if they do not exist
mkdir -p "${vcf_directory}/segments"
mkdir -p "${vcf_directory}/unphased_samples"

# Locate PLINK2: keep using the one on PATH when present, otherwise fall back to
# the copy under UTILS_DIRECTORY. Stop the cell if neither is available.
if command -v plink2 >/dev/null 2>&1; then
    plink2_bin="plink2"
elif [[ -x "${UTILS_DIRECTORY}/plink2" ]]; then
    plink2_bin="${UTILS_DIRECTORY}/plink2"
else
    echo "Error: PLINK2 executable not found: ${UTILS_DIRECTORY}/plink2" >&2
    exit 1
fi

# Stop early if the input VCF is missing instead of letting PLINK2 fail later
if [[ ! -f "${vcf_file}" ]]; then
    echo "File not found: ${vcf_file}" >&2
    exit 1
fi

# Convert the VCF file to PLINK format; a non-zero exit makes the cell fail visibly
if "${plink2_bin}" --vcf "${vcf_file}" --autosome --make-bed \
        --out "${vcf_directory}/unphased_samples/${base_name}"; then
    echo "PLINK2 successfully processed: ${vcf_file}"
else
    echo "Error processing ${vcf_file}" >&2
    exit 1
fi

Explanation:
- `vcf_file` and `vcf_directory` are passed to the script as arguments; `UTILS_DIRECTORY` is read from the environment.
- The script derives `base_name` from the VCF file name by removing the `.vcf.gz` extension.
- It creates the `segments` and `unphased_samples` subdirectories inside `vcf_directory` if they do not exist.
- It checks that the PLINK2 executable and the VCF file exist and prints a message if either is missing.
- It uses PLINK2 to convert the autosomal variants in the `.vcf.gz` file to PLINK binary format (`.bed`, `.bim`, `.fam`).
- It reports whether PLINK2 finished successfully.

### Add Genetic Map to Bim File
(as per the IBIS developer)

In [None]:
%%bash -s "$vcf_file" "$vcf_directory"

# Add genetic-map positions to the .bim file produced by the VCF -> BED step.
# Arguments: $1 = path to the .vcf.gz file, $2 = output directory for this dataset.
# Environment: REFERENCES_DIRECTORY must point at the references folder.

# Stop at the first failing command so a failed step is not hidden by later output
set -euo pipefail

vcf_file="$1"
vcf_directory="$2"

# Fail with a clear message instead of silently looking in /genetic_maps/...
: "${REFERENCES_DIRECTORY:?REFERENCES_DIRECTORY must be set to the references folder}"

# Extract the file prefix (removing .vcf.gz extension)
output_prefix="${vcf_file%.vcf.gz}"
# Get base name of the VCF file
base_name=$(basename "$output_prefix")
echo "base_name: ${base_name}"

bim_in="${vcf_directory}/unphased_samples/${base_name}.bim"
bim_out="${vcf_directory}/unphased_samples/${base_name}_withgm.bim"

# Preview the input .bim
head -n 10 "${bim_in}"

# Positional arguments: input .bim, directory of IBIS-format maps, output .bim
python scripts_support/add_genetic_map.py \
  "${bim_in}" \
  "${REFERENCES_DIRECTORY}/genetic_maps/ibis_genetic_maps" \
  "${bim_out}"

# Preview the output .bim for comparison
head -n 10 "${bim_out}"

Explanation:
- `vcf_file` and `vcf_directory` are passed to the script as arguments; `REFERENCES_DIRECTORY` is read from the environment.
- The script derives `base_name` from the VCF file name by removing the `.vcf.gz` extension.
- It prints the first 10 lines of the original `.bim` file.
- It runs `scripts_support/add_genetic_map.py`, giving it the original `.bim` file, the `ibis_genetic_maps` directory, and the name of the output file.
- The new `.bim` file is saved with a `_withgm.bim` suffix.
- It prints the first 10 lines of the new `.bim` file so that you can compare the two.

In [None]:
%%bash -s "$vcf_file" "$vcf_directory"

# Build the "_withgm" PLINK fileset and filter it for IBIS.
# Arguments: $1 = path to the .vcf.gz file, $2 = output directory for this dataset.

# Stop at the first failing command (a failed cp must not be followed by plink2)
set -euo pipefail

vcf_file="$1"
vcf_directory="$2"

# Extract the file prefix (removing .vcf.gz extension)
output_prefix="${vcf_file%.vcf.gz}"
# Get base name of the VCF file
base_name=$(basename "$output_prefix")
echo "base_name: ${base_name}"

# Create the output directories if they do not exist
mkdir -p "${vcf_directory}/segments"
mkdir -p "${vcf_directory}/unphased_samples"

sample_prefix="${vcf_directory}/unphased_samples/${base_name}"

# The _withgm.bim file comes from the "Add Genetic Map to Bim File" step
if [[ ! -f "${sample_prefix}_withgm.bim" ]]; then
    echo "File not found: ${sample_prefix}_withgm.bim" >&2
    exit 1
fi

# PLINK expects .bed/.bim/.fam to share one prefix, so copy the original
# .bed and .fam files next to the _withgm.bim file
cp "${sample_prefix}.bed" "${sample_prefix}_withgm.bed"
cp "${sample_prefix}.fam" "${sample_prefix}_withgm.fam"

plink2 --bfile "${sample_prefix}_withgm" \
      --snps-only just-acgt \
      --max-alleles 2 \
      --min-alleles 2 \
      --rm-dup force-first \
      --make-bed \
      --out "${sample_prefix}_withgm_filtered"

### Run the IBD Detection Algorithm

In [None]:
%%bash -s "$vcf_file" "$vcf_directory"

# Run IBIS on the filtered PLINK fileset.
# Arguments: $1 = path to the .vcf.gz file, $2 = output directory for this dataset.
# Environment: UTILS_DIRECTORY must contain ibis/ibis.
vcf_file="$1"
vcf_directory="$2"

# Extract the file prefix (removing .vcf.gz extension)
output_prefix="${vcf_file%.vcf.gz}"
# Get base name of the VCF file
base_name=$(basename "$output_prefix")
echo "base_name: ${base_name}"

# Create the output directories if they do not exist
mkdir -p "${vcf_directory}/segments"
mkdir -p "${vcf_directory}/unphased_samples"

# Define the IBIS executable path
ibis="${UTILS_DIRECTORY}/ibis/ibis"

# Stop the cell if the IBIS executable is missing
if [[ ! -f "${ibis}" ]]; then
    echo "Error: IBIS executable not found: ${ibis}" >&2
    exit 1
fi

filtered_prefix="${vcf_directory}/unphased_samples/${base_name}_withgm_filtered"
bed_file="${filtered_prefix}.bed"
bim_file="${filtered_prefix}.bim"
fam_file="${filtered_prefix}.fam"

# All three inputs come from the filtering step; report a missing one by name
for input_file in "${bed_file}" "${bim_file}" "${fam_file}"; do
    if [[ ! -f "${input_file}" ]]; then
        echo "File not found: ${input_file}" >&2
        exit 1
    fi
done

# NOTE(review): option meanings (-min_l, -mt, -er, -ibd2, -printCoef, -noFamID)
# are defined in the IBIS README - confirm the chosen values there.
if "${ibis}" "${bed_file}" "${bim_file}" "${fam_file}" -min_l 7 -mt 500 -er .004 \
    -ibd2 \
    -o "${vcf_directory}/segments/${base_name}_ibis" \
    -printCoef -noFamID; then
    echo "IBIS analysis completed successfully."
else
    echo "Error running IBIS analysis." >&2
    exit 1
fi

# IBIS Output File Descriptions

## IBIS
This file contains detailed information about identity-by-descent (IBD) segments shared between pairs of individuals.

### Columns:
- **sample1, sample2**: IDs of the two individuals being compared for shared genetic segments.
- **chrom**: Chromosome number where the IBD segment is located.
- **phys_start_pos, phys_end_pos**: Start and end positions of the IBD segment in base pairs (physical positions).
- **IBD_type**: Type of IBD segment (e.g., IBD1 for sharing one parental haplotype or IBD2 for sharing both parental haplotypes).
- **genetic_start_pos, genetic_end_pos**: Start and end positions of the segment in genetic map units (centiMorgans).
- **genetic_seg_length**: Length of the IBD segment in centiMorgans (genetic distance).
- **marker_count**: Number of genetic markers (SNPs) within the segment.
- **error_count**: Total number of mismatches or genotyping errors detected in the segment.
- **error_density**: Average error rate per marker in the segment (error_count divided by marker_count).

---

## Coef
This file provides information about pairwise kinship coefficients and degrees of relatedness.

### Columns:
- **sample1, sample2**: IDs of the two individuals being compared.
- **kinship_coefficient**: A measure of genetic similarity between the individuals, ranging from 0 (no relation) to higher values for close relatives.
- **IBD2_fraction**: Proportion of the genome where both parental haplotypes are shared between the individuals.
- **segment_count**: Total number of IBD segments identified between the individuals.
- **degree_of_relatedness**: Classification of the relationship based on kinship (e.g., siblings, cousins).

---

## IBD2
Represents segments where two individuals share both parental haplotypes.  
IBD2 is particularly useful in identifying siblings or individuals with close familial ties, as these segments indicate inheritance from both sides of the family.

---

## HBD (Runs of Homozygosity)
Indicates segments where an individual has matching haplotypes on both chromosomes, likely due to inheritance from a common ancestor.  
This is a measure of inbreeding or autozygosity (when an individual inherits identical haplotypes from both parents).

### Columns:
- **sample_id**: ID of the individual being analyzed for HBD segments.
- **chrom**: Chromosome number where the HBD segment is located.
- **phys_start_pos, phys_end_pos**: Start and end positions of the HBD segment in base pairs.
- **HBD_type**: Type or classification of the HBD segment.
- **genetic_start_pos, genetic_end_pos**: Start and end positions of the segment in genetic map units (centiMorgans).
- **genetic_seg_length**: Length of the HBD segment in centiMorgans.
- **marker_count**: Number of genetic markers (SNPs) in the segment.
- **error_count**: Total number of mismatches or genotyping errors detected in the segment.
- **error_density**: Average error rate per marker in the segment.

---

## Incoef
Provides inbreeding coefficients for individuals, based on HBD analysis.

### Columns:
- **sample_id**: ID of the individual being analyzed.
- **inbreeding_coefficient**: A measure of inbreeding for the individual, reflecting the proportion of the genome covered by HBD segments.
- **segment_count**: Total number of HBD segments identified in the individual's genome.

In [None]:
# Derive base_name the same way the %%bash cells do: the VCF file name with the
# ".vcf.gz" suffix removed. The IBIS outputs are named "<base_name>_ibis.*", so
# using the directory name instead only works when it happens to match the file name.
vcf_suffix = ".vcf.gz"
base_name = os.path.basename(vcf_file)
if base_name.endswith(vcf_suffix):
    base_name = base_name[: -len(vcf_suffix)]
print(base_name)

### Explore The Coefficients Results

In [None]:
def explore_coefficients(
    coefficients, 
    focus_on_related=True, 
    save_plots=True, 
    show_plots=True,
    output_dir=None
    ):
    """
    Summarize and plot a coefficients table.

    Prints info/descriptive statistics, writes CSV copies of the full table and
    (optionally) of the related-only subset, writes an HTML table of pair counts
    per degree, and draws a set of distribution, scatter and correlation plots
    for each analysed subset.

    Parameters:
        coefficients (pd.DataFrame): Coefficients table. Must contain a 'Degree'
            column; 'Kinship_Coefficient', 'IBD2_Fraction' and 'Segment_Count'
            are plotted when present.
        focus_on_related (bool): If True, also analyses the subset with Degree > 0.
        save_plots (bool): If True, saves plots to the output directory.
        show_plots (bool): If True, displays each plot.
        output_dir (str | None): Directory for all outputs (created if missing).
            Defaults to "<vcf_directory>/segments", resolved when the function
            is called rather than when it is defined.

    Returns:
        None

    Raises:
        KeyError: If `coefficients` has no 'Degree' column.
    """
    if output_dir is None:
        # Resolved at call time so a later change of vcf_directory is honoured
        output_dir = os.path.join(vcf_directory, "segments")
    os.makedirs(output_dir, exist_ok=True)

    if 'Degree' not in coefficients.columns:
        raise KeyError("The coefficients table has no 'Degree' column.")

    def save_or_show_plot(fig, prefix, filename):
        # File names are prefixed with the lower-cased dataset name ("full_" / "filtered_")
        if save_plots:
            fig.savefig(os.path.join(output_dir, f"{prefix.lower()}_{filename}"))
        if show_plots:
            plt.show()
        plt.close(fig)

    # Work on a copy so the caller's DataFrame is never modified
    full_data = coefficients.copy()
    filtered_data = None

    if focus_on_related:
        print("\nFocusing on related individuals (Degree > 0).")
        filtered_data = full_data[full_data['Degree'] > 0]
        print("Filtered DataFrame Info (Degree > 0):")
        filtered_data.info()
        print("\n=== Descriptive Statistics (Filtered) ===")
        print(filtered_data.describe())
        print("\n")
        filtered_file_path = os.path.join(output_dir, "filtered_coefficients.csv")
        filtered_data.to_csv(filtered_file_path, index=False)
        print(f"Filtered coefficients saved to: {filtered_file_path}")

    # Save and print the full data
    print("\nFull DataFrame Info:")
    full_data.info()
    print("\n=== Descriptive Statistics (Full) ===")
    print(full_data.describe())
    print("\n")
    full_file_path = os.path.join(output_dir, "full_coefficients.csv")
    full_data.to_csv(full_file_path, index=False)
    print(f"Full coefficients saved to: {full_file_path}")

    # Analyze the full data, plus the related-only subset when requested
    datasets = {"Full": full_data}
    if focus_on_related:
        datasets["Filtered"] = filtered_data

    for name, data in datasets.items():
        print(f"\n=== Analyzing {name} Data ===")

        if data.empty:
            # Nothing to tabulate or plot (e.g. no related pairs were found)
            print(f"No rows in the {name} data; skipping tables and plots.")
            continue

        # Counts by Degree
        degree_grouped_counts = data['Degree'].value_counts().sort_index()
        degree_grouped_counts_df = degree_grouped_counts.reset_index(name='Count')
        # The index column name differs between pandas versions, so set both names explicitly
        degree_grouped_counts_df.columns = ['Degree', 'Count']
        print(f"=== Counts by Degree ({name}) ===")
        print(degree_grouped_counts_df)

        # Save HTML table
        html_table = degree_grouped_counts_df.to_html(index=False)
        html_file_path = os.path.join(output_dir, f"{name.lower()}_degree_counts.html")
        with open(html_file_path, "w") as f:
            f.write(html_table)
        print(f"HTML table for {name} data saved to: {html_file_path}")

        # Display in Jupyter if available
        if hasattr(IPython, 'get_ipython') and IPython.get_ipython() is not None:
            display(HTML(html_table))

        # Degree distribution
        fig, ax = plt.subplots(figsize=(8, 5))
        sns.histplot(data['Degree'], bins=10, kde=False, ax=ax)
        ax.set_title(f'Degree Distribution ({name})')
        ax.set_xlabel('Degree')
        ax.set_ylabel('Frequency')
        save_or_show_plot(fig, name, "degree_distribution.png")

        # Other plots, each drawn only when its column(s) exist
        if 'Kinship_Coefficient' in data.columns:
            fig, ax = plt.subplots(figsize=(8, 5))
            sns.histplot(data['Kinship_Coefficient'], bins=30, kde=True, ax=ax)
            ax.set_title(f'Kinship Coefficient Distribution ({name})')
            ax.set_xlabel('Kinship Coefficient')
            ax.set_ylabel('Frequency')
            save_or_show_plot(fig, name, "kinship_coefficient_distribution.png")

        if 'IBD2_Fraction' in data.columns:
            fig, ax = plt.subplots(figsize=(8, 5))
            sns.histplot(data['IBD2_Fraction'], bins=30, kde=True, ax=ax)
            ax.set_title(f'IBD2 Fraction Distribution ({name})')
            ax.set_xlabel('IBD2 Fraction')
            ax.set_ylabel('Frequency')
            save_or_show_plot(fig, name, "ibd2_fraction_distribution.png")

        if all(col in data.columns for col in ['Kinship_Coefficient', 'IBD2_Fraction']):
            fig, ax = plt.subplots(figsize=(8, 5))
            sns.scatterplot(
                data=data,
                x='Kinship_Coefficient',
                y='IBD2_Fraction',
                hue='Degree', palette='viridis', ax=ax
            )
            ax.set_title(f'Kinship vs. IBD2 Fraction ({name})')
            ax.set_xlabel('Kinship Coefficient')
            ax.set_ylabel('IBD2 Fraction')
            # Use the axes object rather than pyplot's "current axes" state
            ax.legend(title='Degree')
            save_or_show_plot(fig, name, "kinship_vs_ibd2_fraction.png")

        # Correlation matrix
        numeric_cols = ['Kinship_Coefficient', 'IBD2_Fraction', 'Segment_Count']
        existing_cols = [col for col in numeric_cols if col in data.columns]
        if existing_cols:
            fig, ax = plt.subplots(figsize=(6, 5))
            corr = data[existing_cols].corr()
            sns.heatmap(corr, annot=True, cmap='Blues', square=True, ax=ax)
            ax.set_title(f'Correlation Matrix ({name})')
            save_or_show_plot(fig, name, "correlation_matrix.png")

    print("\nAnalysis completed.")
    return

In [None]:
# Locate the IBIS coefficients file for the selected dataset
file_path = os.path.join(vcf_directory, "segments", f"{base_name}_ibis.coef")
print(f"File path: {file_path}")

# Fail early, with the full path in the message, when the IBIS step has not produced the file
coef_file_found = os.path.exists(file_path)
if not coef_file_found:
    raise FileNotFoundError(f"The file {file_path} does not exist.")

# Load the tab-separated table
coefficients = pd.read_csv(file_path, sep="\t", low_memory=False)

# Summarize and plot it; outputs go next to the IBIS results
coefficient_options = dict(
    focus_on_related=True,
    save_plots=True,
    show_plots=True,
    output_dir=os.path.join(vcf_directory, "segments"),
)
explore_coefficients(coefficients=coefficients, **coefficient_options)

### New Results

Look in your `results/segments` directory. You should see several new files. The image files are the ones with the `.png` extension. Look at the 5 image files that start with `filtered_`. What do you think these mean?

### Explore The Segments Results

In [None]:
seg_file = os.path.join(vcf_directory, "segments", f"{base_name}_ibis.seg")

# The .seg file has no header row, so the column names are supplied here
seg_column_names = [
    "sample1",
    "sample2",
    "chrom",
    "phys_start_pos",
    "phys_end_pos",
    "IBD_type",
    "genetic_start_pos",
    "genetic_end_pos",
    "genetic_seg_length",
    "marker_count",
    "error_count",
    "error_density",
]
seg_data_temp = pd.read_csv(seg_file, sep="\t", header=None)
seg_data_temp.columns = seg_column_names

# Order segments by position (all keys ascending), then by IBD type
seg_sort_keys = ["chrom", "phys_start_pos", "phys_end_pos", "IBD_type"]
seg_data = seg_data_temp.sort_values(by=seg_sort_keys, ascending=True)

# Written tab-separated and without a header row, despite the .csv extension
output_file = os.path.join(vcf_directory, f"{base_name}_ibis.csv")
seg_data.to_csv(output_file, sep="\t", index=False, header=False)
print(f"Saved {output_file}")

Notice the new file extension of `.csv`.

Let's create some data visualizations.

In [None]:
def explore_segments_ibis(
        segments,
        min_length=7, 
        min_markers=436, 
        max_error_density=0.004,
        save_plots=True,
        show_plots=True,
        output_dir=None
):
    """
    Explore a segments DataFrame, filter it, and plot before/after distributions.

    Rows whose numeric columns cannot be parsed are written to
    "nan_segments_ibis.csv" and dropped. The remaining rows are written to
    "unfiltered_segments_ibis.csv"; rows passing all three thresholds are
    written to "filtered_segments_ibis.csv". All files are tab separated.
    The caller's DataFrame is not modified.

    Parameters:
        segments (pd.DataFrame): Segment table with at least the columns
            'genetic_length', 'marker_count', 'error_density' and 'chromosome'.
        min_length (float): Minimum genetic length threshold for filtering.
        min_markers (int): Minimum marker count threshold for filtering.
        max_error_density (float): Maximum error density threshold for filtering.
        save_plots (bool): If True, save plots to the output directory.
        show_plots (bool): If True, display each plot.
        output_dir (str | None): Directory for outputs and plots (created if
            missing). Defaults to "<vcf_directory>/segments", resolved when the
            function is called rather than when it is defined.

    Returns:
        None

    Raises:
        KeyError: If any of the required columns is missing from `segments`.
    """
    if output_dir is None:
        # Resolved at call time so a later change of vcf_directory is honoured
        output_dir = os.path.join(vcf_directory, "segments")
    os.makedirs(output_dir, exist_ok=True)

    numeric_columns = ["genetic_length", "marker_count", "error_density", "chromosome"]
    missing_columns = [col for col in numeric_columns if col not in segments.columns]
    if missing_columns:
        raise KeyError(f"segments is missing required column(s): {missing_columns}")

    # Step 1: Parse the numeric columns on a copy so the caller's frame is untouched
    segments = segments.copy()
    for col in numeric_columns:
        segments[col] = pd.to_numeric(segments[col], errors='coerce')

    # Set aside rows that failed to parse, then drop them
    nan_rows = segments[segments[numeric_columns].isnull().any(axis=1)]
    if not nan_rows.empty:
        nan_file_path = os.path.join(output_dir, "nan_segments_ibis.csv")
        nan_rows.to_csv(nan_file_path, sep="\t", index=False)
        print(f"Rows with NaN values saved to: {nan_file_path}")
    segments = segments.dropna(subset=numeric_columns).reset_index(drop=True)

    # Step 2: Basic info and descriptive statistics
    print("=== Segments DataFrame Info ===")
    segments.info()
    print("\n=== Descriptive Statistics ===")
    print(segments[['genetic_length', 'marker_count', 'error_density']].describe())
    print("\n")

    # Save the unfiltered data
    unfiltered_file_path = os.path.join(output_dir, "unfiltered_segments_ibis.csv")
    segments.to_csv(unfiltered_file_path, sep="\t", index=False)
    print(f"Unfiltered segments saved to: {unfiltered_file_path}")
    print()

    # Step 3: Apply the three thresholds
    filtered_segments = segments[
        (segments['genetic_length'] >= min_length) &
        (segments['marker_count'] >= min_markers) &
        (segments['error_density'] <= max_error_density)
    ].copy()

    print("=== Filtered Segments Info ===")
    filtered_segments.info()
    print("\n=== Descriptive Statistics (Filtered) ===")
    print(filtered_segments[['genetic_length', 'marker_count', 'error_density']].describe())
    print("\n")

    # Save filtered segments to a new file
    filtered_filename = "filtered_segments_ibis.csv"
    filtered_file_path = os.path.join(output_dir, filtered_filename)
    filtered_segments.to_csv(filtered_file_path, sep="\t", index=False)
    print(f"Filtered segments saved to: {filtered_file_path}")

    print("\nSummary:")
    print(f"Total segments: {len(segments)}")
    print(f"Filtered segments: {len(filtered_segments)}")
    if not nan_rows.empty:
        print(f"Rows with NaN values: {len(nan_rows)} (saved to: {nan_file_path})")

    # Step 4: Visualizations
    def save_or_show_plot(fig, filename):
        if save_plots:
            fig.savefig(os.path.join(output_dir, filename))
        if show_plots:
            plt.show()
        plt.close(fig)

    def plot_distribution(data, column, title, xlabel, ylabel, filename, bins=30, kde=True):
        if data.empty:
            # e.g. no segment passed the thresholds
            print(f"No rows to plot; skipped {filename}.")
            return
        fig, ax = plt.subplots(figsize=(8, 5))
        sns.histplot(data[column], bins=bins, kde=kde, ax=ax)
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        save_or_show_plot(fig, filename)

    def plot_boxplot(data, x_col, y_col, title, xlabel, ylabel, filename):
        if data.empty:
            print(f"No rows to plot; skipped {filename}.")
            return
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.boxplot(x=x_col, y=y_col, data=data, ax=ax)
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        ax.tick_params(axis='x', labelrotation=45)
        fig.tight_layout()
        save_or_show_plot(fig, filename)

    # Visualize genetic_length distribution
    plot_distribution(
        segments, "genetic_length", "Distribution of Genetic Length", 
        "Genetic Length (cM)", "Frequency", "genetic_length_distribution_unfiltered.png"
    )
    plot_distribution(
        filtered_segments, "genetic_length", "Distribution of Genetic Length (Filtered)", 
        "Genetic Length (cM)", "Frequency", "genetic_length_distribution_filtered.png"
    )

    # Visualize marker_count distribution
    plot_distribution(
        segments, "marker_count", "Distribution of Marker Count", 
        "Marker Count", "Frequency", "marker_count_distribution_unfiltered.png"
    )
    plot_distribution(
        filtered_segments, "marker_count", "Distribution of Marker Count (Filtered)", 
        "Marker Count", "Frequency", "marker_count_distribution_filtered.png"
    )

    # Boxplot of genetic_length by chromosome
    plot_boxplot(
        segments, "chromosome", "genetic_length", 
        "Distribution of Genetic Length by Chromosome", 
        "Chromosome", "Genetic Length (cM)", "genetic_length_by_chromosome_unfiltered.png"
    )
    plot_boxplot(
        filtered_segments, "chromosome", "genetic_length", 
        "Distribution of Genetic Length by Chromosome (Filtered)", 
        "Chromosome", "Genetic Length (cM)", "genetic_length_by_chromosome_filtered.png"
    )

    print("\nAnalysis completed.")
    return

In [None]:
# Load the raw IBIS segment table (the file has no header row)
file_path = os.path.join(vcf_directory, f"segments/{base_name}_ibis.seg")
segments = pd.read_csv(file_path, sep="\t", header=None)

# Name the twelve columns by position
segments.columns = [
    "id1",
    "id2",
    "chromosome",
    "physical_position_start",
    "physical_position_end",
    "IBD_type",
    "genetic_position_start",
    "genetic_position_end",
    "genetic_length",
    "marker_count",
    "error_count",
    "error_density",
]

# Thresholds used for the filtered view
segment_thresholds = dict(min_length=7, min_markers=436, max_error_density=0.004)

explore_segments_ibis(
    segments,
    **segment_thresholds,
    save_plots=True,
    show_plots=True,
    output_dir=os.path.join(vcf_directory, "segments"),
)

The plots were saved in your `results/segments` directory.

Take a look at the plots. What information are they communicating? Some are meaningful. Some less so. What other information do you think should be communicated?

### Continue to explore the segment data.

The following cell takes the segment data output from the IBIS IBD detection algorithm and processes it using `pandas`.

In [None]:
# Re-read the raw IBIS segments so this section starts from the file on disk
file_path = os.path.join(vcf_directory, f"segments/{base_name}_ibis.seg")
segments = pd.read_csv(file_path, sep="\t", header=None)
segments.columns = [
    "id1",
    "id2",
    "chromosome",
    "physical_position_start",
    "physical_position_end",
    "IBD_type",
    "genetic_position_start",
    "genetic_position_end",
    "genetic_length",
    "marker_count",
    "error_count",
    "error_density",
]

# Coerce the numeric columns (unparseable values become NaN) ...
numeric_columns = ["genetic_length", "marker_count", "error_density", "chromosome"]
segments[numeric_columns] = segments[numeric_columns].apply(pd.to_numeric, errors='coerce')

# ... then discard rows where any of them is missing and renumber the index
segments = segments.dropna(subset=numeric_columns).reset_index(drop=True)

The next cells summarize the data and then show the first five rows. Notice that the data now has column headers. The IBIS output file does not come with column headers. We have to look at the developers' GitHub page or other documentation to determine the names of the columns.

In [None]:
segments.info()

In [None]:
segments.describe()

In [None]:
segments.head() # You can enter a number greater than 5 to view more rows

Let's look at the IBD types

In [None]:
segments["IBD_type"].value_counts()

In [None]:
segments_ibd1 = segments[segments["IBD_type"] == "IBD1"]
segments_ibd2 = segments[segments["IBD_type"] == "IBD2"]

display(segments_ibd1.head())
display(segments_ibd2.head())

In [None]:
# Show all the segments that the first pair of individuals share in IBD1 segments.
# The pair is read from the data itself, so the cell works for whichever VCF was
# selected above, and the mask is built from segments_ibd1 so its index matches.
first_pair = segments_ibd1[["id1", "id2"]].head(1)
first_pair_mask = (
    segments_ibd1["id1"].isin(first_pair["id1"])
    & segments_ibd1["id2"].isin(first_pair["id2"])
)
segments_ibd1[first_pair_mask]

In [None]:
segments_ibd1[["genetic_length", "marker_count", "error_count", "error_density"]].describe()

In [None]:
# Keep the IBD1 segments with genetic_length >= 7, marker_count >= 436 and
# error_density <= 0.004.
# The mask is built from segments_ibd1 itself: a mask taken from the full
# `segments` frame has a different index and has to be re-aligned by pandas.
filtered_segments = segments_ibd1[
    (segments_ibd1["genetic_length"] >= 7) & 
    (segments_ibd1["marker_count"] >= 436) & 
    (segments_ibd1["error_density"] <= 0.004)
].copy()

filtered_segments[["genetic_length", "marker_count", "error_count", "error_density"]].describe()

In [None]:
filtered_segments_20cM = segments_ibd1[segments_ibd1["genetic_length"] >= 20].copy()
filtered_segments_20cM[["genetic_length", "marker_count", "error_count", "error_density"]].describe()

Let's aggregate the data by pairs instead of looking at it by segment.

In [None]:
len(segments_ibd1)

In [None]:
import pandas as pd
import numpy as np

# Put every pair in one canonical order (smaller ID first) so that (A, B) and
# (B, A) are counted as the same pair. This is done column-wise on
# filtered_segments instead of building one Series per row of segments_ibd1.
ids_already_ordered = filtered_segments["id1"] < filtered_segments["id2"]
filtered_segments = filtered_segments.assign(
    id1=filtered_segments["id1"].where(ids_already_ordered, filtered_segments["id2"]),
    id2=filtered_segments["id2"].where(ids_already_ordered, filtered_segments["id1"]),
)

# Number of filtered segments per pair ...
pair_counts = filtered_segments.groupby(["id1", "id2"]).size().reset_index(name="pair_count")

# ... and how many pairs share each number of segments
pair_count_distribution = pair_counts["pair_count"].value_counts().reset_index()
pair_count_distribution.columns = ["Number of Segments", "Number of Pairs"]
pair_count_distribution = pair_count_distribution.reset_index(drop=True)
display(pair_count_distribution.style.hide(axis="index"))

In [None]:
# One row per (id1, id2) pair, with three summaries of the pair's IBD1 segment lengths
pair_lengths = segments_ibd1.groupby(["id1", "id2"])["genetic_length"]
aggregated_segments = pair_lengths.agg(
    total_genetic_length="sum",
    num_segments="count",
    largest_segment="max",
).reset_index()

# Check distribution of values
display(aggregated_segments.describe())

In [None]:
filtered_segments.head()

In [None]:
filtered_segments[filtered_segments["IBD_type"] == "IBD1"].shape[0]

In [None]:
filtered_segments[filtered_segments["IBD_type"] == "IBD2"].shape[0]

In [None]:
filtered_segments[filtered_segments["id1"] == filtered_segments["id2"]].shape[0]

In [None]:
# One row per (id1, id2, IBD_type), summarising the filtered segment lengths
typed_pair_lengths = filtered_segments.groupby(["id1", "id2", "IBD_type"])["genetic_length"]
aggregated_segments_by_type = typed_pair_lengths.agg(
    total_genetic_length="sum",
    num_segments="count",
    largest_segment="max",
).reset_index()

# Show floats with six decimals from here on (this is a notebook-wide display setting)
pd.options.display.float_format = '{:.6f}'.format

# Describe each IBD type separately
ibd_type_of_row = aggregated_segments_by_type["IBD_type"]

ibd1_summary = aggregated_segments_by_type[ibd_type_of_row == "IBD1"].describe()
display(ibd1_summary)

ibd2_summary = aggregated_segments_by_type[ibd_type_of_row == "IBD2"].describe()
display(ibd2_summary)

In [None]:
aggregated_segments[["total_genetic_length", "num_segments", "largest_segment"]].describe()

In [None]:
# Filter the pairs that meet the criteria
filtered_pairs1 = aggregated_segments[aggregated_segments["total_genetic_length"] >= 3000]

display(filtered_pairs1[["id1", "id2", "total_genetic_length"]])

In [None]:
# Filter the pairs that meet the criteria
filtered_pairs2 = aggregated_segments[aggregated_segments["total_genetic_length"] >= 1000]

display(filtered_pairs2[["id1", "id2", "total_genetic_length"]])

In [None]:
display(aggregated_segments)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Per-pair metrics to visualise, one panel each
columns = ["total_genetic_length", "num_segments", "largest_segment"]

# Human-readable label for every metric, e.g. "Total Genetic Length"
metric_labels = {metric: metric.replace('_', ' ').title() for metric in columns}

# Row of histograms (with KDE overlay)
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
for panel, metric in zip(axes, columns):
    sns.histplot(aggregated_segments[metric], bins=30, kde=True, ax=panel, edgecolor="black")
    panel.set_title(f"Distribution of {metric_labels[metric]}")
    panel.set_xlabel(metric_labels[metric])
    panel.set_ylabel("Frequency")

plt.tight_layout()
plt.show()

# Row of box plots for the same metrics
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
for panel, metric in zip(axes, columns):
    sns.boxplot(y=aggregated_segments[metric], ax=panel)
    panel.set_title(f"Box Plot of {metric_labels[metric]}")
    panel.set_ylabel(metric_labels[metric])

plt.tight_layout()
plt.show()
