In [1]:
import os
from collections import Counter
import logging
import sys
from pathlib import Path
import subprocess
from dotenv import load_dotenv

In [2]:
def find_comp_gen_dir():
    """Find the computational_genetic_genealogy directory by searching up from current directory.

    The current working directory and each of its ancestors, including the
    filesystem root, are checked for a child directory named
    ``computational_genetic_genealogy``. The nearest match is returned.

    Returns:
        pathlib.Path: Path to the first matching directory found.

    Raises:
        FileNotFoundError: If no directory on the way up contains the target.
    """
    start = Path.cwd()

    # Path.parents ends with the filesystem root, so a project that sits directly
    # under the root is found as well.
    for directory in (start, *start.parents):
        target = directory / 'computational_genetic_genealogy'
        if target.is_dir():
            return target

    raise FileNotFoundError("Could not find computational_genetic_genealogy directory")

def load_env_file():
    """Load the project's .env file into the process environment.

    The .env file is looked up directly inside the directory returned by
    find_comp_gen_dir(). Values from the file override variables that are
    already set.

    Returns:
        pathlib.Path | None: Path of the loaded .env file, or None when the
        project directory or the .env file could not be found (a message is
        printed in either case).
    """
    try:
        project_root = find_comp_gen_dir()
        dotenv_file = project_root / '.env'

        if dotenv_file.exists():
            load_dotenv(dotenv_file, override=True)
            print(f"Loaded environment variables from: {dotenv_file}")
            return dotenv_file

        print(f"Warning: No .env file found in {project_root}")
    except FileNotFoundError as err:
        print(f"Error: {err}")

    # Reached when the .env file is absent or a FileNotFoundError was reported.
    return None

# Use the function: env_path is the path of the loaded .env file, or None when
# the project directory or the .env file could not be found.
env_path = load_env_file()

Loaded environment variables from: /home/lakishadavid/computational_genetic_genealogy/.env


In [3]:
# Project directory layout, read from environment variables. os.getenv returns
# None for any variable that is not set.
working_directory = os.getenv('PROJECT_WORKING_DIR', default=None)
data_directory = os.getenv('PROJECT_DATA_DIR', default=None)
references_directory = os.getenv('PROJECT_REFERENCES_DIR', default=None)
results_directory = os.getenv('PROJECT_RESULTS_DIR', default=None)
utils_directory = os.getenv('PROJECT_UTILS_DIR', default=None)

print(f"Working Directory: {working_directory}")
print(f"Data Directory: {data_directory}")
print(f"References Directory: {references_directory}")
print(f"Results Directory: {results_directory}")
print(f"Utils Directory: {utils_directory}")

# os.chdir(None) raises an uninformative TypeError, so fail with an actionable
# message when the variable is missing.
if working_directory is None:
    raise RuntimeError(
        "PROJECT_WORKING_DIR is not set; make sure the .env file was loaded "
        "before running this cell."
    )

os.chdir(working_directory)
print(f"The current directory is {os.getcwd()}")

Working Directory: /home/lakishadavid/computational_genetic_genealogy
Data Directory: /home/lakishadavid/computational_genetic_genealogy/data
References Directory: /home/lakishadavid/computational_genetic_genealogy/references
Results Directory: /home/lakishadavid/computational_genetic_genealogy/results
Utils Directory: /home/lakishadavid/computational_genetic_genealogy/utils
The current directory is /home/lakishadavid/computational_genetic_genealogy


In [4]:
def configure_logging(log_filename, log_file_debug_level="INFO", console_debug_level="INFO"):
    """
    Configure logging for both file and console handlers.

    A file handler and a stdout handler are added to the root logger, whose own
    level is set to DEBUG so that each handler's level decides what it emits.
    Handlers are added on top of any that are already attached; remove the old
    ones first if this is called more than once, otherwise messages are
    emitted once per attached handler.

    Args:
        log_filename (str): Path to the log file where logs will be written.
        log_file_debug_level (str | int): Logging level for the file handler,
            as a level name (case-insensitive) or a numeric level.
        console_debug_level (str | int): Logging level for the console handler,
            as a level name (case-insensitive) or a numeric level.

    Unknown level names fall back to INFO.
    """
    def _resolve_level(level):
        """Return a numeric logging level, falling back to logging.INFO."""
        if isinstance(level, int):
            return level
        # A plain getattr can return attributes that are not levels (for
        # example logging.BASIC_FORMAT is a string), which setLevel rejects.
        candidate = getattr(logging, str(level).strip().upper(), None)
        return candidate if isinstance(candidate, int) else logging.INFO

    # Create a root logger
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)  # Capture all messages at the root level

    # Convert level names to numeric levels
    file_level = _resolve_level(log_file_debug_level)
    console_level = _resolve_level(console_debug_level)

    # Both handlers share the same record layout
    log_format = '%(asctime)s - %(levelname)s - %(message)s'

    # File handler: Logs messages at file_level and above to the file
    file_handler = logging.FileHandler(log_filename)
    file_handler.setLevel(file_level)
    file_handler.setFormatter(logging.Formatter(log_format))

    # Console handler: Logs messages at console_level and above to the console
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setLevel(console_level)
    console_handler.setFormatter(logging.Formatter(log_format))

    # Add handlers to the root logger
    logger.addHandler(file_handler)
    logger.addHandler(console_handler)
    
def clear_logger():
    """Remove all handlers from the root logger and close them.

    Closing matters for file handlers: a handler that is only detached keeps
    its log file open until it is garbage collected.
    """
    logger = logging.getLogger()
    # Iterate over a copy because removeHandler mutates logger.handlers.
    for handler in logger.handlers[:]:
        logger.removeHandler(handler)
        handler.close()

In [5]:
log_filename = os.path.join(results_directory, "lab5.log")
print(f"The Lab 5 log file is located at {log_filename}.")

# Ensure the results_directory exists. exist_ok=True makes this a no-op when the
# directory is already there, without a separate check-then-create step.
os.makedirs(results_directory, exist_ok=True)

# Create an empty log file only when it is missing, so an existing log is
# neither truncated nor touched.
if not os.path.exists(log_filename):
    Path(log_filename).touch()

The Lab 5 log file is located at /home/lakishadavid/computational_genetic_genealogy/results/lab5.log.


In [6]:
# Detach whatever handlers the root logger already has (for example from an
# earlier run of this cell), then attach file and console handlers that both
# log at INFO and above.
clear_logger() # Clear the logger before reconfiguring it
configure_logging(log_filename, log_file_debug_level="INFO", console_debug_level="INFO")

### Genetic Map

Use your existing Beagle genetic maps to create the genetic maps for IBIS. If you already have the IBIS genetic maps, you do not need to rerun these cells to convert them again.

In [7]:
def preprocess_ibis_map(references_dir=None):
    """Convert every Beagle genetic map (*.map) into an IBIS genetic map.

    Each line of a Beagle map is rewritten by awk as: field 1, field 4,
    field 2 (or 0 when field 2 is "."), field 3, separated by single spaces.
    The converted file keeps the name of its source file.
    (Presumably this reorders chromosome / ID / cM / bp into the layout the
    IBIS add-map script reads; confirm against the IBIS documentation.)

    Args:
        references_dir (str | None): Directory that contains
            ``genetic_maps/beagle_genetic_maps``. Defaults to the notebook-level
            ``references_directory`` variable when omitted.

    Raises:
        subprocess.CalledProcessError: If awk exits with a non-zero status.
        FileNotFoundError: If the Beagle map directory does not exist.
    """
    if references_dir is None:
        references_dir = references_directory

    beagle_map_dir = os.path.join(references_dir, "genetic_maps/beagle_genetic_maps")
    ibis_map_dir = os.path.join(references_dir, "genetic_maps/ibis_genetic_maps")
    os.makedirs(ibis_map_dir, exist_ok=True)

    awk_program = '{print $1, $4, ($2 == "." ? 0 : $2), $3}'

    # sorted() gives a stable processing order; os.listdir order is arbitrary.
    for map_file in sorted(os.listdir(beagle_map_dir)):
        if not map_file.endswith(".map"):
            continue
        beagle_map_filename = os.path.join(beagle_map_dir, map_file)
        ibis_map_filename = os.path.join(ibis_map_dir, map_file)
        print(f"Processing {beagle_map_filename} to create IBIS map...")
        # Pass awk an argument list and redirect stdout in Python rather than
        # building a shell string: paths containing spaces or shell
        # metacharacters are then handled safely.
        with open(ibis_map_filename, "w") as ibis_map:
            subprocess.run(
                ["awk", awk_program, beagle_map_filename],
                stdout=ibis_map,
                check=True,
            )
    print("All Beagle genetic maps converted to IBIS format.")

In [8]:
# Output locations for the genetic maps: the shared genetic_maps directory and,
# inside it, the directory that receives the IBIS-format maps.
genetic_maps_dir = os.path.join(references_directory, "genetic_maps")
ibis_genetic_maps = os.path.join(genetic_maps_dir, "ibis_genetic_maps")

# Create both directories (parent first); existing directories are left alone.
for map_output_dir in (genetic_maps_dir, ibis_genetic_maps):
    os.makedirs(map_output_dir, exist_ok=True)

assembly = "GRCh38"

# Convert the Beagle genetic maps into IBIS-format maps.
preprocess_ibis_map()

# # Alternative source
# plink2_genetic_map_url="https://alkesgroup.broadinstitute.org/Eagle/downloads/tables/genetic_map_hg38_withX.txt.gz"

Processing /home/lakishadavid/computational_genetic_genealogy/references/genetic_maps/beagle_genetic_maps/plink.chrX_par1.GRCh38.map to create IBIS map...
Processing /home/lakishadavid/computational_genetic_genealogy/references/genetic_maps/beagle_genetic_maps/plink.chr14.GRCh38.map to create IBIS map...
Processing /home/lakishadavid/computational_genetic_genealogy/references/genetic_maps/beagle_genetic_maps/plink.chr12.GRCh38.map to create IBIS map...
Processing /home/lakishadavid/computational_genetic_genealogy/references/genetic_maps/beagle_genetic_maps/plink.chr11.GRCh38.map to create IBIS map...
Processing /home/lakishadavid/computational_genetic_genealogy/references/genetic_maps/beagle_genetic_maps/plink.chr18.GRCh38.map to create IBIS map...
Processing /home/lakishadavid/computational_genetic_genealogy/references/genetic_maps/beagle_genetic_maps/plink.chrX_par2.GRCh38.map to create IBIS map...
Processing /home/lakishadavid/computational_genetic_genealogy/references/genetic_maps/

**Visual Inspection**

The above code should have created a set of genetic map files in the format that IBIS uses. Look in your `references/genetic_maps` directory and check for the `ibis_genetic_maps` subdirectory and the individual per-chromosome files within `ibis_genetic_maps`. Open the chromosome 1 file of both `ibis_genetic_maps` and `beagle_genetic_maps`. Compare them visually. How are they similar? How are they different?

### Concatenate phased VCF files

In [7]:
%%bash -s "$data_directory" "$utils_directory" "$results_directory"

# Concatenate the per-chromosome phased VCF files (chromosomes 1-22) into one
# sorted, compressed, and indexed VCF, write bcftools stats for it, and then
# remove the per-chromosome inputs and intermediate files.
# Arguments: 1 = data directory, 2 = utils directory, 3 = results directory.

data_directory="$1"
utils_directory="$2"
results_directory="$3"

# Define the directory containing phased VCF files
phased_samples_dir="${results_directory}/phased_samples"

# Concatenate phased VCF files
echo "Creating list of phased VCF files..."
PHASED_FILE_LIST="${phased_samples_dir}/phased_file_list_sample.txt"

# Empty the file list if it already exists
> "$PHASED_FILE_LIST"

for CHR in {1..22}; do
    PHASED_VCF="${phased_samples_dir}/merged_opensnps_phased_chr${CHR}.vcf.gz"
    if [ -f "$PHASED_VCF" ]; then
        echo "$PHASED_VCF" >> "$PHASED_FILE_LIST"
    else
        echo "Phased VCF missing for chromosome $CHR"
    fi
done

CONCATENATED_VCF="${phased_samples_dir}/merged_opensnps_autosomes.vcf"
SORTED_VCF="${phased_samples_dir}/merged_opensnps_autosomes_sorted.vcf.gz"
STATS_OUTPUT="${phased_samples_dir}/merged_opensnps_autosomes_sorted_stats.vchk"

# Concatenate VCF files. If bcftools fails, remove whatever it left behind (or a
# stale file from an earlier run) so the existence check below cannot pass on a
# truncated or outdated file.
if ! bcftools concat -o "$CONCATENATED_VCF" --file-list "$PHASED_FILE_LIST"; then
    rm -f "$CONCATENATED_VCF"
fi

if [ -f "$CONCATENATED_VCF" ]; then
    # Sort and compress the concatenated VCF. The per-chromosome inputs are only
    # deleted further down, so stop here if the sorted file was not produced.
    if ! bcftools sort -Oz -o "$SORTED_VCF" "$CONCATENATED_VCF"; then
        echo "Error: sorting ${CONCATENATED_VCF} failed; per-chromosome files were kept." >&2
        exit 1
    fi

    # Index the sorted VCF
    if ! bcftools index --tbi -f "$SORTED_VCF"; then
        echo "Error: indexing ${SORTED_VCF} failed; per-chromosome files were kept." >&2
        exit 1
    fi

    # Generate stats (a failure here does not invalidate the sorted VCF)
    if ! bcftools stats -s - "$SORTED_VCF" > "$STATS_OUTPUT"; then
        echo "Warning: could not generate stats for ${SORTED_VCF}" >&2
    fi

    # The * must sit outside the quotes: a quoted * is taken literally and
    # matches nothing.
    rm -f "${results_directory}/merged_opensnps_autosomes_step1"*
    rm -f "${results_directory}/merged_opensnps_autosomes_step2"*

    echo "Phasing, cleanup, and concatenation completed successfully."

    # Remove individual phased VCF files
    echo "Removing individual phased VCF files..."
    for CHR in {1..22}; do
        PHASED_VCF="${phased_samples_dir}/merged_opensnps_phased_chr${CHR}.vcf.gz"
        if [ -f "${PHASED_VCF}" ]; then
            rm -f "${PHASED_VCF}"
            rm -f "${PHASED_VCF}.tbi"
            rm -f "${phased_samples_dir}/merged_opensnps_phased_chr${CHR}.log"
            rm -f "${phased_samples_dir}/merged_opensnps_phased_chr${CHR}_stats.vchk"
            echo "Removed $PHASED_VCF and its index."
        fi
    done
    # The uncompressed concatenated VCF is no longer needed once sorted.
    rm -f "${phased_samples_dir}/merged_opensnps_autosomes.vcf"
else
    echo "Concatenated VCF file missing. Pipeline aborted."
    exit 1
fi

Creating list of phased VCF files...


Checking the headers and starting positions of 22 files
Concatenating /home/lakishadavid/computational_genetic_genealogy/results/phased_samples/merged_opensnps_phased_chr1.vcf.gz	0.033714 seconds
Concatenating /home/lakishadavid/computational_genetic_genealogy/results/phased_samples/merged_opensnps_phased_chr2.vcf.gz	0.035030 seconds
Concatenating /home/lakishadavid/computational_genetic_genealogy/results/phased_samples/merged_opensnps_phased_chr3.vcf.gz	0.028771 seconds
Concatenating /home/lakishadavid/computational_genetic_genealogy/results/phased_samples/merged_opensnps_phased_chr4.vcf.gz	0.026724 seconds
Concatenating /home/lakishadavid/computational_genetic_genealogy/results/phased_samples/merged_opensnps_phased_chr5.vcf.gz	0.027721 seconds
Concatenating /home/lakishadavid/computational_genetic_genealogy/results/phased_samples/merged_opensnps_phased_chr6.vcf.gz	0.029012 seconds
Concatenating /home/lakishadavid/computational_genetic_genealogy/results/phased_samples/merged_opensnps_

Phasing, cleanup, and concatenation completed successfully.
Removing individual phased VCF files...
Removed /home/lakishadavid/computational_genetic_genealogy/results/phased_samples/merged_opensnps_phased_chr1.vcf.gz and its index.
Removed /home/lakishadavid/computational_genetic_genealogy/results/phased_samples/merged_opensnps_phased_chr2.vcf.gz and its index.
Removed /home/lakishadavid/computational_genetic_genealogy/results/phased_samples/merged_opensnps_phased_chr3.vcf.gz and its index.
Removed /home/lakishadavid/computational_genetic_genealogy/results/phased_samples/merged_opensnps_phased_chr4.vcf.gz and its index.
Removed /home/lakishadavid/computational_genetic_genealogy/results/phased_samples/merged_opensnps_phased_chr5.vcf.gz and its index.
Removed /home/lakishadavid/computational_genetic_genealogy/results/phased_samples/merged_opensnps_phased_chr6.vcf.gz and its index.
Removed /home/lakishadavid/computational_genetic_genealogy/results/phased_samples/merged_opensnps_phased_chr

### Change the format of the data files from VCF to BED.

In [8]:
%%bash -s "$data_directory" "$utils_directory" "$results_directory"

# Convert the concatenated, sorted VCF into PLINK binary format with PLINK2.
# Arguments: 1 = data directory, 2 = utils directory, 3 = results directory.

data_directory="$1"
utils_directory="$2"
results_directory="$3"

# Define
phased_samples_dir="${results_directory}/phased_samples"
vcf_file="${phased_samples_dir}/merged_opensnps_autosomes_sorted.vcf.gz"

# Ensure the PLINK2 executable exists
if [[ ! -f "${utils_directory}/plink2" ]]; then
    echo "Error: PLINK2 executable not found: ${utils_directory}/plink2" >&2
    exit 1
fi

# Ensure the phased samples directory exists
if [[ ! -d "${phased_samples_dir}" ]]; then
    echo "Error: Phased samples directory not found: ${phased_samples_dir}" >&2
    exit 1
fi

# Check if the file exists
if [[ ! -f "$vcf_file" ]]; then
    echo "No matching VCF file found in $phased_samples_dir" >&2
    exit 1
fi

# Extract the file prefix (removing .vcf.gz extension)
output_prefix="${vcf_file%.vcf.gz}"

# Convert the VCF file to PLINK format. The executable path is quoted so a
# utils directory containing spaces still works, and a failure exits non-zero
# so the notebook cell is reported as failed.
if "${utils_directory}/plink2" --vcf "$vcf_file" --autosome --make-bed --out "$output_prefix"; then
    echo "PLINK2 successfully processed: ${vcf_file}"
else
    echo "Error processing ${vcf_file}" >&2
    exit 1
fi

PLINK v2.0.0-a.6.4LM 64-bit Intel (6 Dec 2024)     cog-genomics.org/plink/2.0/
(C) 2005-2024 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to /home/lakishadavid/computational_genetic_genealogy/results/phased_samples/merged_opensnps_autosomes_sorted.log.
Options in effect:
  --autosome
  --make-bed
  --out /home/lakishadavid/computational_genetic_genealogy/results/phased_samples/merged_opensnps_autosomes_sorted
  --vcf /home/lakishadavid/computational_genetic_genealogy/results/phased_samples/merged_opensnps_autosomes_sorted.vcf.gz

Start time: Mon Feb 17 00:36:59 2025
7789 MiB RAM detected, ~4754 available; reserving 3894 MiB for main workspace.
Using up to 12 threads (change this with --threads).
--vcf: 400582 variants scanned.
--vcf: 393k variants converted. 
/home/lakishadavid/computational_genetic_genealogy/results/phased_samples/merged_opensnps_autosomes_sorted-temporary.pgen
+
/home/lakishadavid/computational_genetic_genealogy/results/phased_samples/merg

Explanation:
- `data_directory`, `utils_directory`, and `results_directory` are passed as arguments and assigned.
- The script verifies that plink2 exists and that the phased_samples_dir is a valid directory.
- It checks that the concatenated, sorted VCF file (`merged_opensnps_autosomes_sorted.vcf.gz`) exists before processing.
- Uses PLINK2 to convert that .vcf.gz file to PLINK binary format (.bed, .bim, .fam), keeping only autosomal variants (`--autosome`).
- Handles errors and prints appropriate messages.

### Add Genetic Map to Bim File
(as per the IBIS developer)

In [9]:
%%bash -s "$results_directory" "$references_directory" "$utils_directory"

# Add genetic map positions to the merged .bim file with IBIS's
# add-map-plink.pl, writing the result to a new .bim.gm file.
# Arguments: 1 = results directory, 2 = references directory, 3 = utils directory.

results_directory="$1"
references_directory="$2"
utils_directory="$3"

# Define the script for adding the genetic map
add_map_script="${utils_directory}/ibis/add-map-plink.pl"
if [[ ! -f "${add_map_script}" ]]; then
    echo "Error: Add-map script not found: $add_map_script" >&2
    exit 1
fi

bim_file="${results_directory}/phased_samples/merged_opensnps_autosomes_sorted.bim"
if [[ ! -f "${bim_file}" ]]; then
    echo "NOT FOUND: ${bim_file}" >&2
    exit 1
fi

map_directory="${references_directory}/genetic_maps/ibis_genetic_maps"
output_bim="${results_directory}/phased_samples/merged_opensnps_autosomes_sorted.bim.gm"

# Run the add-map script. Paths are quoted so directories containing spaces
# work; {1..22} stays outside the quotes so brace expansion still produces one
# map file argument per chromosome.
if "${add_map_script}" "${bim_file}" "${map_directory}"/plink.chr{1..22}.GRCh38.map > "${output_bim}"; then
    echo "Genetic map added to: ${bim_file}"
else
    echo "Error adding genetic map to: ${bim_file}" >&2
    # Do not leave a partial .bim.gm behind for later steps to pick up.
    rm -f "${output_bim}"
    exit 1
fi

Reading map file /home/lakishadavid/computational_genetic_genealogy/references/genetic_maps/ibis_genetic_maps/plink.chr1.GRCh38.map... done
Reading map file /home/lakishadavid/computational_genetic_genealogy/references/genetic_maps/ibis_genetic_maps/plink.chr2.GRCh38.map... done
Reading map file /home/lakishadavid/computational_genetic_genealogy/references/genetic_maps/ibis_genetic_maps/plink.chr3.GRCh38.map... done
Reading map file /home/lakishadavid/computational_genetic_genealogy/references/genetic_maps/ibis_genetic_maps/plink.chr4.GRCh38.map... done
Reading map file /home/lakishadavid/computational_genetic_genealogy/references/genetic_maps/ibis_genetic_maps/plink.chr5.GRCh38.map... done
Reading map file /home/lakishadavid/computational_genetic_genealogy/references/genetic_maps/ibis_genetic_maps/plink.chr6.GRCh38.map... done
Reading map file /home/lakishadavid/computational_genetic_genealogy/references/genetic_maps/ibis_genetic_maps/plink.chr7.GRCh38.map... done
Reading map file /ho

Genetic map added to: /home/lakishadavid/computational_genetic_genealogy/results/phased_samples/merged_opensnps_autosomes_sorted.bim


Explanation:
- The script assigns its arguments to results_directory, references_directory, and utils_directory.
- It verifies the existence of the add-map-plink.pl script.
- It checks that the merged .bim file (`merged_opensnps_autosomes_sorted.bim`) exists in the `phased_samples` subdirectory of results_directory.
- It passes the per-chromosome IBIS genetic map files for chromosomes 1–22 (`plink.chr{1..22}.GRCh38.map`) to the script.
- The genetic map files are read from `genetic_maps/ibis_genetic_maps` inside references_directory.
- If the necessary files exist, it runs the Perl script to add the genetic map.
- The new .bim file is saved with a `.bim.gm` suffix.
- Errors are handled with messages and exit codes.

### Run the IBD Detection Algorithm

In [10]:
%%bash -s "$data_directory" "$results_directory" "$utils_directory"

# Run IBIS on the merged PLINK files (.bed, .bim.gm with genetic map, .fam).
# Arguments: 1 = data directory, 2 = results directory, 3 = utils directory.

data_directory="$1"
results_directory="$2"
utils_directory="$3"

# Define the IBIS executable path
ibis="${utils_directory}/ibis/ibis"

# Ensure the IBIS executable exists; stop here instead of trying to run a
# missing program.
if [[ ! -f "${ibis}" ]]; then
    echo "Error: IBIS executable not found: ${ibis}" >&2
    exit 1
fi

bed_file="${results_directory}/phased_samples/merged_opensnps_autosomes_sorted.bed"
bim_file="${results_directory}/phased_samples/merged_opensnps_autosomes_sorted.bim.gm"
fam_file="${results_directory}/phased_samples/merged_opensnps_autosomes_sorted.fam"

# Options: IBD2 detection enabled (-ibd2); IBD1 minimum length, marker
# threshold, and error rate (-min_l, -mt, -er); the IBD2 equivalents
# (-min_l2, -mt2, -er2); output prefix (-o); coefficient file (-printCoef);
# input has no family ID (-noFamID).
# Paths are quoted so directories containing spaces work, and a failure exits
# non-zero so the notebook cell is reported as failed.
if "${ibis}" "${bed_file}" "${bim_file}" "${fam_file}" -ibd2 -min_l 7 -mt 500 -er .004 \
    -min_l2 2 -mt2 186 -er2 .008 \
    -o "${results_directory}/merged_opensnps_autosomes_ibis" \
    -printCoef -noFamID; then
    echo "IBIS analysis completed successfully."
else
    echo "Error running IBIS analysis." >&2
    exit 1
fi

IBIS Segment Caller!  v1.20.9    (Released December 7, 2020)

Viewing arguments...
-ibd2 - running with IBD2 detection enabled
-min_l - running with minimum IBD1 length 7.000000
-mt - running with minimum IBD1 marker threshold of 499.000000
-er - running with error rate 0.004000
-min_l2 - running with minimum IBD2 length 2.000000
-mt2 - running with minimum IBD2 marker threshold of 185.000000
-er2 - running with IBD2 error rate 0.008000
-o - setting output file /home/lakishadavid/computational_genetic_genealogy/results/merged_opensnps_autosomes_ibis.seg
-printCoef - printing coefficient file
-noFamID - assuming no Family ID in input
No -b or -bfile - Running with input files: /home/lakishadavid/computational_genetic_genealogy/results/phased_samples/merged_opensnps_autosomes_sorted.bed, /home/lakishadavid/computational_genetic_genealogy/results/phased_samples/merged_opensnps_autosomes_sorted.bim.gm, /home/lakishadavid/computational_genetic_genealogy/results/phased_samples/merged_opensnp

Omitting marker rs28699618: missing physical position
Omitting marker rs6565704: missing physical position
Omitting marker rs6565705: missing physical position
Omitting marker rs7502403: missing physical position
Omitting marker rs8064924: missing physical position
Omitting marker rs8075072: missing physical position
Omitting marker rs7221348: missing physical position


Parsing SNP file... done.
Parsing individual file... done.
Parsing genotype file... initiated, to complete later.
Chromosome map shorter than 6 units of genetic distance.
 Morgan input detected - Converting to centimorgans. (Prevent this by running with -noConvert argument)
Defining Windows... done.
Total Genetic Length in use:3536.001953
Organizing genotype data for analysis... done.
Beginning segment detection with 1 thread(s)...Total Segments Found: 108
done.
IBIS analysis completed successfully.


# IBIS Output File Descriptions

## IBIS
This file contains detailed information about identity-by-descent (IBD) segments shared between pairs of individuals.

### Columns:
- **sample1, sample2**: IDs of the two individuals being compared for shared genetic segments.
- **chrom**: Chromosome number where the IBD segment is located.
- **phys_start_pos, phys_end_pos**: Start and end positions of the IBD segment in base pairs (physical positions).
- **IBD_type**: Type of IBD segment (e.g., IBD1 for sharing one parental haplotype or IBD2 for sharing both parental haplotypes).
- **genetic_start_pos, genetic_end_pos**: Start and end positions of the segment in genetic map units (centiMorgans).
- **genetic_seg_length**: Length of the IBD segment in centiMorgans (genetic distance).
- **marker_count**: Number of genetic markers (SNPs) within the segment.
- **error_count**: Total number of mismatches or genotyping errors detected in the segment.
- **error_density**: Average error rate per marker in the segment (error_count divided by marker_count).

---

## Coef
This file provides information about pairwise kinship coefficients and degrees of relatedness.

### Columns:
- **sample1, sample2**: IDs of the two individuals being compared.
- **kinship_coefficient**: A measure of genetic similarity between the individuals, ranging from 0 (no relation) to higher values for close relatives.
- **IBD2_fraction**: Proportion of the genome where both parental haplotypes are shared between the individuals.
- **segment_count**: Total number of IBD segments identified between the individuals.
- **degree_of_relatedness**: Classification of the relationship based on kinship (e.g., siblings, cousins).

---

## IBD2
Represents segments where two individuals share both parental haplotypes.  
IBD2 is particularly useful in identifying siblings or individuals with close familial ties, as these segments indicate inheritance from both sides of the family.

---

## HBD (Runs of Homozygosity)
Indicates segments where an individual has matching haplotypes on both chromosomes, likely due to inheritance from a common ancestor.  
This is a measure of inbreeding or autozygosity (when an individual inherits identical haplotypes from both parents).

### Columns:
- **sample_id**: ID of the individual being analyzed for HBD segments.
- **chrom**: Chromosome number where the HBD segment is located.
- **phys_start_pos, phys_end_pos**: Start and end positions of the HBD segment in base pairs.
- **HBD_type**: Type or classification of the HBD segment.
- **genetic_start_pos, genetic_end_pos**: Start and end positions of the segment in genetic map units (centiMorgans).
- **genetic_seg_length**: Length of the HBD segment in centiMorgans.
- **marker_count**: Number of genetic markers (SNPs) in the segment.
- **error_count**: Total number of mismatches or genotyping errors detected in the segment.
- **error_density**: Average error rate per marker in the segment.

---

## Incoef
Provides inbreeding coefficients for individuals, based on HBD analysis.

### Columns:
- **sample_id**: ID of the individual being analyzed.
- **inbreeding_coefficient**: A measure of inbreeding for the individual, reflecting the proportion of the genome covered by HBD segments.
- **segment_count**: Total number of HBD segments identified in the individual's genome.

In [14]:
import pandas as pd

def explore_coefficients(results_directory, filename="ibis_MergedSamples.coef", focus_on_related=True, save_plots=True, output_subdir="segments"):
    """
    Read the coefficients file, print summaries of it, and save CSV copies.

    The tab-separated coefficients file is read from ``results_directory``.
    Its info, descriptive statistics, and per-Degree counts are printed, and
    the full table (plus the Degree > 0 subset when requested) is written as
    CSV into ``results_directory/output_subdir``.

    Parameters:
        results_directory (str): Directory containing the coefficients file.
        filename (str): Filename of the coefficients file.
        focus_on_related (bool): If True, also summarise and save the rows of
            related individuals (Degree > 0).
        save_plots (bool): Currently unused (no plots are produced); accepted
            so existing calls keep working.
        output_subdir (str): Subdirectory of ``results_directory`` where the
            CSV files are written. Created if it does not exist.

    Returns:
        pd.DataFrame: The Degree > 0 rows when ``focus_on_related`` is True,
        otherwise the full coefficients table.

    Raises:
        KeyError: If the coefficients file has no 'Degree' column.
    """
    # Ensure output directory exists
    output_dir = os.path.join(results_directory, output_subdir)
    os.makedirs(output_dir, exist_ok=True)

    # Step 1: Read the coefficients file
    file_path = os.path.join(results_directory, filename)
    full_data = pd.read_csv(file_path, sep="\t", low_memory=False)

    # Every summary below relies on this column, so fail before writing anything.
    if 'Degree' not in full_data.columns:
        raise KeyError(f"'Degree' column not found in {file_path}")

    filtered_data = None

    if focus_on_related:
        print("\nFocusing on related individuals (Degree > 0).")
        filtered_data = full_data[full_data['Degree'] > 0]
        print("Filtered DataFrame Info (Degree > 0):")
        filtered_data.info()
        print("\n=== Descriptive Statistics (Filtered) ===")
        print(filtered_data.describe())
        print("\n")
        filtered_file_path = os.path.join(output_dir, "filtered_coefficients.csv")
        filtered_data.to_csv(filtered_file_path, index=False)
        print(f"Filtered coefficients saved to: {filtered_file_path}")

    # Save and print the full data
    print("\nFull DataFrame Info:")
    full_data.info()
    print("\n=== Descriptive Statistics (Full) ===")
    print(full_data.describe())
    print("\n")
    full_file_path = os.path.join(output_dir, "full_coefficients.csv")
    full_data.to_csv(full_file_path, index=False)
    print(f"Full coefficients saved to: {full_file_path}")

    # Analyze the full table and, when requested, the filtered one
    datasets = {"Full": full_data}
    if focus_on_related:
        datasets["Filtered"] = filtered_data

    for name, data in datasets.items():
        print(f"\n=== Analyzing {name} Data ===")

        # Counts by Degree, ordered by Degree value
        degree_grouped_counts_df = (
            data['Degree']
            .value_counts()
            .sort_index()
            .rename_axis('Degree')
            .reset_index(name='Count')
        )
        print(f"=== Counts by Degree ({name}) ===")
        print(degree_grouped_counts_df)

    print("\nAnalysis completed.")
    return filtered_data if focus_on_related else full_data

In [16]:
# Summarise the IBIS coefficient file in results_directory, focusing on pairs
# with Degree > 0; CSV copies are written to the "segments" subdirectory.
explore_coefficients(results_directory, filename="merged_opensnps_autosomes_ibis.coef", focus_on_related=True, save_plots=True, output_subdir="segments")


Focusing on related individuals (Degree > 0).
Filtered DataFrame Info (Degree > 0):
<class 'pandas.core.frame.DataFrame'>
Index: 6 entries, 1966 to 4747
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Individual1          6 non-null      object 
 1   Individual2          6 non-null      object 
 2   Kinship_Coefficient  6 non-null      float64
 3   IBD2_Fraction        6 non-null      float64
 4   Segment_Count        6 non-null      int64  
 5   Degree               6 non-null      int64  
dtypes: float64(2), int64(2), object(2)
memory usage: 336.0+ bytes

=== Descriptive Statistics (Filtered) ===
       Kinship_Coefficient  IBD2_Fraction  Segment_Count    Degree
count             6.000000       6.000000       6.000000  6.000000
mean              0.063810       0.000835      10.333333  5.166667
std               0.103334       0.002044      14.500575  2.857738
min               0.002778       0.0

In [None]:
import pandas as pd

# Locations of the IBIS segment (.seg) and coefficient (.coef) outputs.
seg_file = os.path.join(results_directory, "merged_opensnps_autosomes_ibis.seg")
coef_file = os.path.join(results_directory, "merged_opensnps_autosomes_ibis.coef")

# The segment file is read without a header row, so its twelve columns are
# named explicitly after loading.
seg_data_temp = pd.read_csv(seg_file, sep="\t", header=None)
seg_data_temp.columns = [
    "sample1",
    "sample2",
    "chrom",
    "phys_start_pos",
    "phys_end_pos",
    "IBD_type",
    "genetic_start_pos",
    "genetic_end_pos",
    "genetic_seg_length",
    "marker_count",
    "error_count",
    "error_density",
]

# Order segments by chromosome, then physical start, physical end, and IBD
# type, all ascending.
seg_data = seg_data_temp.sort_values(
    by=["chrom", "phys_start_pos", "phys_end_pos", "IBD_type"],
    ascending=True,
)

# Write the sorted segments back out, tab-separated and without a header row.
output_file = os.path.join(results_directory, "merged_opensnps_autosomes_ibis.csv")
seg_data.to_csv(output_file, sep="\t", index=False, header=False)

In [18]:
def explore_segments_ibis(
        results_directory, 
        filename="merged_opensnps_autosomes_ibis.seg",
        min_length=7, 
        min_markers=436, 
        max_error_density=0.004,
        save_plots=True, 
        output_subdir="segments"
):
    """
    Load a segments file, print summary statistics, and save filtered copies.

    The file is read as tab-separated text without a header row and must have
    exactly twelve columns. Rows whose chromosome, genetic length, marker
    count, or error density cannot be parsed as a number are written to
    ``nan_segments_ibis.csv`` and dropped. The remaining rows are written to
    ``unfiltered_segments_ibis.csv``; the subset passing all three thresholds
    is written to ``filtered_segments_ibis.csv``. All outputs are
    tab-separated and placed in ``<results_directory>/<output_subdir>``,
    which is created if it does not exist.

    Parameters:
        results_directory (str): Directory containing the segments file.
        filename (str): Filename of the segments file.
        min_length (float): Keep segments with genetic_length >= this value.
        min_markers (int): Keep segments with marker_count >= this value.
        max_error_density (float): Keep segments with error_density <= this value.
        save_plots (bool): Currently unused; this function produces no plots.
            The parameter is kept so existing calls continue to work.
        output_subdir (str): Sub-directory of results_directory that receives
            the output files.

    Returns:
        None. Results are printed and written to disk.

    Raises:
        ValueError: If the file does not contain exactly twelve columns.
    """
    column_names = [
        "id1", "id2", "chromosome", "physical_position_start",
        "physical_position_end", "IBD_type", "genetic_position_start",
        "genetic_position_end", "genetic_length", "marker_count",
        "error_count", "error_density"
    ]
    # Columns that must parse as numbers for a row to be kept.
    numeric_columns = ["genetic_length", "marker_count", "error_density", "chromosome"]
    # Columns summarised in the descriptive-statistics printouts.
    stat_columns = ['genetic_length', 'marker_count', 'error_density']

    # Ensure output directory exists
    output_dir = os.path.join(results_directory, output_subdir)
    os.makedirs(output_dir, exist_ok=True)

    # Step 1: Read the segments file
    file_path = os.path.join(results_directory, filename)
    segments = pd.read_csv(file_path, sep="\t", header=None)
    if segments.shape[1] != len(column_names):
        # Fail with a message that names the file instead of pandas'
        # generic "Length mismatch" error from the column assignment.
        raise ValueError(
            f"Expected {len(column_names)} tab-separated columns in "
            f"{file_path}, found {segments.shape[1]}"
        )
    segments.columns = column_names

    # Coerce the numeric columns; unparseable values become NaN
    for col in numeric_columns:
        segments[col] = pd.to_numeric(segments[col], errors='coerce')

    # Set aside rows with NaN in any numeric column, then drop them
    nan_rows = segments[segments[numeric_columns].isnull().any(axis=1)]
    nan_file_path = os.path.join(output_dir, "nan_segments_ibis.csv")
    if not nan_rows.empty:
        nan_rows.to_csv(nan_file_path, sep="\t", index=False)
        print(f"Rows with NaN values saved to: {nan_file_path}")
    segments = segments.dropna(subset=numeric_columns).reset_index(drop=True)

    # Step 2: Basic info and descriptive statistics
    print("=== Segments DataFrame Info ===")
    segments.info()
    print("\n=== Descriptive Statistics ===")
    print(segments[stat_columns].describe())
    print("\n")

    # Save the unfiltered data
    unfiltered_file_path = os.path.join(output_dir, "unfiltered_segments_ibis.csv")
    segments.to_csv(unfiltered_file_path, sep="\t", index=False)
    print(f"Unfiltered segments saved to: {unfiltered_file_path}")
    print()

    # Step 3: Keep only segments that satisfy all three thresholds
    passes_thresholds = (
        (segments['genetic_length'] >= min_length) &
        (segments['marker_count'] >= min_markers) &
        (segments['error_density'] <= max_error_density)
    )
    filtered_segments = segments[passes_thresholds].copy()

    print("=== Filtered Segments Info ===")
    filtered_segments.info()
    print("\n=== Descriptive Statistics (Filtered) ===")
    print(filtered_segments[stat_columns].describe())
    print("\n")

    # Save filtered segments to a new file
    filtered_file_path = os.path.join(output_dir, "filtered_segments_ibis.csv")
    filtered_segments.to_csv(filtered_file_path, sep="\t", index=False)
    print(f"Filtered segments saved to: {filtered_file_path}")

    print("\nSummary:")
    print(f"Total segments: {len(segments)}")
    print(f"Filtered segments: {len(filtered_segments)}")
    if not nan_rows.empty:
        print(f"Rows with NaN values: {len(nan_rows)} (saved to: {nan_file_path})")

    print("\nAnalysis completed.")

In [19]:
# Run the segment exploration with every setting passed by keyword. The three
# thresholds are written out explicitly so they are visible next to the output.
explore_segments_ibis(
    results_directory=results_directory,
    filename="merged_opensnps_autosomes_ibis.seg",
    min_length=7,
    min_markers=436,
    max_error_density=0.004,
    save_plots=True,
    output_subdir="segments",
)

=== Segments DataFrame Info ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108 entries, 0 to 107
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id1                      108 non-null    object 
 1   id2                      108 non-null    object 
 2   chromosome               108 non-null    int64  
 3   physical_position_start  108 non-null    int64  
 4   physical_position_end    108 non-null    int64  
 5   IBD_type                 108 non-null    object 
 6   genetic_position_start   108 non-null    float64
 7   genetic_position_end     108 non-null    float64
 8   genetic_length           108 non-null    float64
 9   marker_count             108 non-null    int64  
 10  error_count              108 non-null    int64  
 11  error_density            108 non-null    float64
dtypes: float64(4), int64(5), object(3)
memory usage: 10.2+ KB

=== Descriptive Statistics ===
       genet