In [None]:
import os
from collections import Counter
import logging
import sys
from pathlib import Path
import subprocess
import os
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML
import IPython
import pandas as pd
import boto3

from dotenv import load_dotenv

In [None]:
def find_comp_gen_dir():
    """Locate a ``computational_genetic_genealogy`` folder at or above the working directory.

    Each directory from the current working directory upward is checked for a
    child folder with that name. The filesystem root itself is never inspected.

    Returns:
        Path: The first matching directory found while walking upward.

    Raises:
        FileNotFoundError: If no inspected directory contains the folder.
    """
    start = Path.cwd()

    # cwd followed by its ancestors; the last entry is the filesystem root,
    # which is deliberately left out of the search.
    for folder in (start, *start.parents)[:-1]:
        candidate = folder / 'computational_genetic_genealogy'
        if candidate.is_dir():
            return candidate

    raise FileNotFoundError("Could not find computational_genetic_genealogy directory")

def load_env_file():
    """Load ``.env`` from the computational_genetic_genealogy directory.

    Values from the file override variables already present in the
    environment (``override=True``).

    Returns:
        Path or None: Path of the loaded ``.env`` file, or None when either the
        project directory or the ``.env`` file could not be found (a message is
        printed in both cases).
    """
    try:
        project_root = find_comp_gen_dir()
        dotenv_file = project_root / '.env'

        if dotenv_file.exists():
            load_dotenv(dotenv_file, override=True)
            print(f"Loaded environment variables from: {dotenv_file}")
            return dotenv_file

        print(f"Warning: No .env file found in {project_root}")
        return None

    except FileNotFoundError as e:
        # Raised by find_comp_gen_dir() when the project directory is not found
        print(f"Error: {e}")
        return None

# Use the function
# env_path holds the path of the loaded .env file, or None if nothing was loaded
env_path = load_env_file()

In [None]:
# Project locations are read from environment variables; each is None when unset.
working_directory = os.getenv('PROJECT_WORKING_DIR', default=None)
data_directory = os.getenv('PROJECT_DATA_DIR', default=None)
references_directory = os.getenv('PROJECT_REFERENCES_DIR', default=None)
results_directory = os.getenv('PROJECT_RESULTS_DIR', default=None)
utils_directory = os.getenv('PROJECT_UTILS_DIR', default=None)

print(f"Working Directory: {working_directory}")
print(f"Data Directory: {data_directory}")
print(f"References Directory: {references_directory}")
print(f"Results Directory: {results_directory}")
print(f"Utils Directory: {utils_directory}")

# Later cells build paths from these values, so point out any that are missing now.
unset_variables = [
    variable
    for variable, value in (
        ('PROJECT_DATA_DIR', data_directory),
        ('PROJECT_REFERENCES_DIR', references_directory),
        ('PROJECT_RESULTS_DIR', results_directory),
        ('PROJECT_UTILS_DIR', utils_directory),
    )
    if not value
]
if unset_variables:
    print(f"Warning: these environment variables are not set: {', '.join(unset_variables)}")

# os.chdir(None) would fail with an unhelpful TypeError, so stop with a clear message instead.
if not working_directory:
    raise RuntimeError(
        "PROJECT_WORKING_DIR is not set. Check that the .env file was found and loaded."
    )

os.chdir(working_directory)
print(f"The current directory is {os.getcwd()}")

In [None]:
def configure_logging(log_filename, log_file_debug_level="INFO", console_debug_level="INFO"):
    """
    Attach a file handler and a stdout handler to the root logger.

    The root logger itself is set to DEBUG so that each handler's own level
    decides what it emits. Existing handlers are left in place.

    Args:
        log_filename (str): Path to the log file where logs will be written.
        log_file_debug_level (str): Level name for the file handler.
        console_debug_level (str): Level name for the console handler.
            Unrecognised level names fall back to INFO for either handler.
    """
    root = logging.getLogger()
    root.setLevel(logging.DEBUG)

    # Resolve both level names up front (case-insensitive, INFO as fallback)
    file_level, console_level = (
        getattr(logging, level_name.upper(), logging.INFO)
        for level_name in (log_file_debug_level, console_debug_level)
    )

    line_format = '%(asctime)s - %(levelname)s - %(message)s'
    handlers_with_levels = (
        (logging.FileHandler(log_filename), file_level),
        (logging.StreamHandler(sys.stdout), console_level),
    )

    # File handler is attached first, console handler second
    for handler, level in handlers_with_levels:
        handler.setLevel(level)
        handler.setFormatter(logging.Formatter(line_format))
        root.addHandler(handler)
    
def clear_logger():
    """Remove and close all handlers attached to the root logger.

    Closing matters for file handlers: without it every re-run of the logging
    setup cell leaves another open handle on the log file.
    """
    logger = logging.getLogger()
    # Iterate over a copy because removeHandler mutates logger.handlers
    for handler in list(logger.handlers):
        logger.removeHandler(handler)
        handler.close()

In [None]:
# Ensure the results_directory exists (no separate existence check, so no race)
os.makedirs(results_directory, exist_ok=True)

log_filename = os.path.join(results_directory, "lab5.log")
print(f"The Lab 5 log file is located at {log_filename}.")

# Create the log file if it is missing; append mode never truncates an existing log
with open(log_filename, 'a'):
    pass

clear_logger() # Clear the logger before reconfiguring it
configure_logging(log_filename, log_file_debug_level="INFO", console_debug_level="INFO")

### Get The Data

# 📥 How to Download OpenSNP Data

There are **two ways** to download OpenSNP data. You only need to select **one** method. **The preferred method is using `boto3`.**

---

## **✅ Option 1: Download Using `boto3` (Preferred)**
With `boto3` installed, you can use it to efficiently download files from the OpenSNP public S3 bucket. This method is recommended for bulk downloading and better reliability. However, how to setup boto3 will not be covered until later in the semester. For now, use option 2.

---

## **✅ Option 2: Download Using `requests` (No AWS Setup Required)**
You can use the `requests` library to download the files directly from public S3 URLs. This method is easier to use but may be slower for large downloads.

---

## **📌 Manually Downloading Files**
If you prefer to manually download individual files:
1. Open your web browser and go to:
   ```
   https://opensnpdata.s3.us-east-2.amazonaws.com/[FILENAME]
   ```
   Replace `[FILENAME]` with the exact filename from `opensnp_file_list.txt`.

2. Example:

   https://opensnpdata.s3.us-east-2.amazonaws.com/user1001_file496_yearofbirth_unknown_sex_unknown.ancestry.txt

3. **Right-click → Save As...** to download the file.

4. Move the file to:
   ```
   data_directory/class_data/raw_dna_profiles/
   ```

---

📌 **Choose one method that works best for you.** If unsure, use **Option 2 (`requests`)** for now; switch to **boto3** for better performance once its setup has been covered later in the semester.

🚀 Happy downloading! 🚀

**Option 1: boto3 version**

In [None]:
# Define constants
# Name of the S3 bucket that the boto3 cells list and download from
BUCKET_NAME = "opensnpdata"
# Local folder that receives the downloaded raw DNA profiles
SAVE_DIR = f"{data_directory}/class_data/raw_dna_profiles"

# Ensure the save directory exists
os.makedirs(SAVE_DIR, exist_ok=True)

# Initialize the S3 client with Signature Version 4 request signing.
# NOTE(review): this was previously described as an "anonymous" client, but s3v4
# signing presumably requires AWS credentials; unsigned access would need
# botocore's UNSIGNED signature version instead — confirm which is intended.
s3 = boto3.client("s3", config=boto3.session.Config(signature_version="s3v4"))

def count_files_in_bucket():
    """Count the total number of files in the OpenSNP bucket.

    A single ``list_objects_v2`` response holds at most one page of keys, so the
    listing is walked with a paginator and the per-page counts are summed.

    Returns:
        int: Number of objects listed in ``BUCKET_NAME`` (0 for an empty bucket).
    """
    paginator = s3.get_paginator("list_objects_v2")
    return sum(
        # Pages of an empty bucket carry no "Contents" key
        len(page.get("Contents", []))
        for page in paginator.paginate(Bucket=BUCKET_NAME)
    )

# Report how many objects the bucket listing returned
num_files = count_files_in_bucket()
print(f"Total files in bucket: {num_files}")

In [None]:
def download_files(limit=None):
    """
    Download a specified number of files (or all files) from the OpenSNP bucket.

    The bucket listing is paginated, so "all files" really means every object
    and not just the first page of results. Files are saved into ``SAVE_DIR``
    under the base name of their key.

    Parameters:
        limit (int or None): Number of files to download. If None, downloads all files.
    """
    paginator = s3.get_paginator("list_objects_v2")

    files = []
    for page in paginator.paginate(Bucket=BUCKET_NAME):
        files.extend(page.get("Contents", []))
        # Stop listing as soon as enough keys are known
        if limit is not None and len(files) >= limit:
            break

    if not files:
        print("No files found in the OpenSNP bucket.")
        return

    # Apply limit if specified
    if limit is not None:
        files = files[:limit]

    print(f"Downloading {len(files)} files...")

    for obj in files:
        file_key = obj["Key"]
        base_name = os.path.basename(file_key)
        if not base_name:
            # A key ending in "/" has no file name; its local path would be SAVE_DIR itself
            print(f"Skipping key without a file name: {file_key}")
            continue
        local_path = os.path.join(SAVE_DIR, base_name)

        print(f"Downloading: {file_key} -> {local_path}")
        s3.download_file(BUCKET_NAME, file_key, local_path)

    print("Download completed.")


In [None]:
# Download a certain number of files (e.g., first 5)
# Useful as a quick check before starting the full download in the next cell
download_files(limit=5)

In [None]:
# Download all files
# limit=None removes the cap on how many listed objects are downloaded into SAVE_DIR
download_files(limit=None)

**Option 2: non boto3 version**

In [None]:
import os
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.notebook import tqdm  # Changed to notebook version of tqdm
from tqdm.auto import tqdm as tqdm_auto  # For auto-detection of environment
import signal
import sys
from dataclasses import dataclass
from typing import List, Optional
import logging

@dataclass
class DownloadConfig:
    """Settings consumed by ParallelDownloader."""
    # Base URL; each file is requested as "<bucket_url>/<filename>"
    bucket_url: str
    # Local directory that receives the downloaded files (created if missing)
    save_dir: str
    # Text file listing the filenames to download, one per line
    file_list_path: str
    # Size of the thread pool used for parallel downloads
    max_workers: int = 5
    # Chunk size passed to the streamed response iterator
    chunk_size: int = 8192

class ParallelDownloader:
    """Download the files named in a list file, in parallel, with retries.

    Files are fetched from ``<config.bucket_url>/<filename>`` into
    ``config.save_dir``. A file that already exists locally with non-zero size
    is treated as complete and skipped. Data is first written to
    ``<name>.part`` and renamed only after the transfer finished, so an
    interrupted or failed transfer can never be mistaken for a complete file.

    Attributes:
        config: The DownloadConfig supplied at construction.
        interrupted (bool): Set by the signal handler; checked by workers.
        failed_downloads (list): ``(filename, error)`` tuples for the files that
            failed in the most recent attempt of the most recent run.
    """

    # (connect, read) timeouts in seconds so a stalled connection cannot block a worker forever
    REQUEST_TIMEOUT = (10, 60)

    def __init__(self, config: "DownloadConfig"):
        self.config = config
        self.interrupted = False
        self.failed_downloads = []
        self.setup_logging()
        self.setup_signal_handlers()

    def setup_logging(self):
        """Configure root logging (file + stdout) and create this module's logger.

        ``logging.basicConfig`` has no effect when the root logger already has
        handlers, in which case the existing configuration stays in use.
        """
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler('download_log.txt'),
                logging.StreamHandler(sys.stdout)
            ]
        )
        self.logger = logging.getLogger(__name__)

    def setup_signal_handlers(self):
        """Route SIGINT and SIGTERM to handle_interrupt.

        This replaces the process-wide handlers for both signals.
        """
        signal.signal(signal.SIGINT, self.handle_interrupt)
        signal.signal(signal.SIGTERM, self.handle_interrupt)

    def handle_interrupt(self, signum, frame):
        """Signal handler: flag the interruption so workers stop at the next check."""
        self.logger.warning("Received interrupt signal. Finishing current downloads...")
        self.interrupted = True

    def get_file_list(self, start: Optional[int] = None, end: Optional[int] = None) -> List[str]:
        """Read the file list and optionally slice it.

        Args:
            start: Index of the first entry to return (None = from the beginning).
            end: Index one past the last entry to return (None = to the end).
                Honoured even when ``start`` is None.

        Returns:
            Stripped, non-empty lines of ``config.file_list_path`` within ``[start:end]``.

        Raises:
            FileNotFoundError: If the list file does not exist (logged, then re-raised).
        """
        try:
            with open(self.config.file_list_path, "r") as f:
                # Blank lines are dropped: an empty name would resolve to the save directory itself
                file_list = [name for name in (line.strip() for line in f) if name]
        except FileNotFoundError:
            self.logger.error(f"File list not found: {self.config.file_list_path}")
            raise
        return file_list[start:end]

    @staticmethod
    def _discard(path: str) -> None:
        """Delete ``path`` if it exists; a missing file is not an error."""
        try:
            os.remove(path)
        except FileNotFoundError:
            pass

    def download_file(self, filename: str, overall_pbar) -> bool:
        """Download a single file with progress tracking.

        Args:
            filename: Entry from the file list; appended to ``config.bucket_url``.
            overall_pbar: Progress bar; advanced by one when the file is
                skipped, downloaded, or has failed.

        Returns:
            True if the file is present locally afterwards (downloaded or already
            there), False if the download failed or was interrupted.
        """
        if self.interrupted:
            return False

        file_url = f"{self.config.bucket_url}/{filename}"
        local_path = os.path.join(self.config.save_dir, os.path.basename(filename))
        partial_path = f"{local_path}.part"

        # Skip if file exists and has content
        if os.path.exists(local_path) and os.path.getsize(local_path) > 0:
            self.logger.info(f"Skipping existing file: {filename}")
            overall_pbar.update(1)
            return True

        try:
            aborted = False
            # The context manager releases the connection even when streaming stops early
            with requests.get(file_url, stream=True, timeout=self.REQUEST_TIMEOUT) as response:
                response.raise_for_status()
                with open(partial_path, "wb") as file:
                    for chunk in response.iter_content(chunk_size=self.config.chunk_size):
                        if self.interrupted:
                            aborted = True
                            break
                        if chunk:
                            file.write(chunk)

            if aborted:
                # Never leave a truncated file that a later run would skip as "existing"
                self._discard(partial_path)
                return False

            # Publish the finished file under its real name in one step
            os.replace(partial_path, local_path)
            overall_pbar.update(1)
            return True

        except Exception as e:
            self.logger.error(f"Failed to download {filename}: {str(e)}")
            self.failed_downloads.append((filename, str(e)))
            self._discard(partial_path)
            overall_pbar.update(1)
            return False

    def download_files(self, start: Optional[int] = None, 
                      end: Optional[int] = None, 
                      max_retries: int = 3) -> None:
        """
        Download files in parallel with retry mechanism.

        Each call starts from a clean state: the interrupt flag and the failure
        list left over from a previous run are reset. After every attempt only
        the files that did not succeed are retried, and ``failed_downloads``
        ends up describing the final attempt only.
        
        Args:
            start: Optional starting index for file range
            end: Optional ending index for file range
            max_retries: Maximum number of retry attempts for failed downloads
        """
        os.makedirs(self.config.save_dir, exist_ok=True)

        # A previous interrupted run must not make this run a silent no-op
        self.interrupted = False
        self.failed_downloads = []
        
        file_list = self.get_file_list(start, end)
        total_files = len(file_list)
        
        self.logger.info(f"Starting download of {total_files} files...")
        
        for attempt in range(max_retries + 1):
            if not file_list:
                break

            # Failures of earlier attempts are superseded by this attempt's outcome
            self.failed_downloads = []
                
            with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
                # Create the progress bar
                with tqdm_auto(total=len(file_list), 
                             desc="Downloading files", 
                             unit="file") as pbar:
                    
                    # Submit all downloads
                    future_to_file = {
                        executor.submit(self.download_file, filename, pbar): filename
                        for filename in file_list
                    }
                    
                    successful_downloads = set()
                    
                    # Process completed downloads
                    for future in as_completed(future_to_file):
                        filename = future_to_file[future]
                        try:
                            if future.result():
                                successful_downloads.add(filename)
                        except Exception as e:
                            self.logger.error(f"Error downloading {filename}: {str(e)}")
                        
                        if self.interrupted:
                            self.logger.warning("Download interrupted by user.")
                            return

            # Keep only the files that still need another attempt
            file_list = [f for f in file_list if f not in successful_downloads]
            
            if file_list and attempt < max_retries:
                self.logger.info(f"Retrying {len(file_list)} failed downloads... "
                               f"(Attempt {attempt + 2}/{max_retries + 1})")
        
        if self.failed_downloads:
            self.logger.error("Failed downloads:")
            for filename, error in self.failed_downloads:
                self.logger.error(f"  {filename}: {error}")
            
        # file_list now holds exactly the files that never succeeded
        self.logger.info(f"Download completed. "
                        f"Successfully downloaded: {total_files - len(file_list)}/{total_files}")

In [None]:
# Create config in one cell
# Both the save directory and the file list live under data_directory/class_data
config = DownloadConfig(
    bucket_url="https://opensnpdata.s3.us-east-2.amazonaws.com",
    save_dir=f"{data_directory}/class_data/raw_dna_profiles",
    file_list_path=f"{data_directory}/class_data/opensnp_file_list.txt",
    max_workers=5
)

# Create downloader instance
# Note: constructing it also installs SIGINT/SIGTERM handlers (see setup_signal_handlers)
downloader = ParallelDownloader(config)


# You can run this cell multiple times if needed
# start=5, end=15 selects list entries 5 through 14 (the end index is exclusive)
downloader.download_files(start=5, end=15)  # Start with just 10 files as a test

In [None]:
# Download all files
# Called without start/end, so the whole file list is requested;
# files already present with non-zero size are skipped by download_file
downloader.download_files()

# There are 117 files. It took mine 3 minutes and 33 seconds to download.

### Genetic Map

Use your existing Beagle genetic maps to create the genetic maps for IBIS. If you already have these genetic maps, you do not need to rerun these cells to download the genetic maps again.

In [None]:
def preprocess_ibis_map():
    """Convert every Beagle ``.map`` file into the column layout used for IBIS.

    Reads ``<references_directory>/genetic_maps/beagle_genetic_maps/*.map`` and
    writes a file with the same name into
    ``<references_directory>/genetic_maps/ibis_genetic_maps``.

    Each whitespace-separated input line ``CHR . GENETIC_POSITION PHYSICAL_POSITION``
    becomes ``CHR PHYSICAL_POSITION GENETIC_POSITION 0`` (space separated): input
    columns 1, 4 and 3, followed by a constant 0.

    The conversion is done in Python rather than through a shell ``awk`` command,
    so directory names containing spaces or shell metacharacters are handled
    correctly and no external tool is required.
    """
    beagle_map_dir = os.path.join(references_directory, "genetic_maps/beagle_genetic_maps")
    ibis_map_dir = os.path.join(references_directory, "genetic_maps/ibis_genetic_maps")
    os.makedirs(ibis_map_dir, exist_ok=True)

    # Sorted so that files are processed (and reported) in a stable order
    for map_file in sorted(os.listdir(beagle_map_dir)):
        if not map_file.endswith(".map"):
            continue

        beagle_map_filename = os.path.join(beagle_map_dir, map_file)
        ibis_map_filename = os.path.join(ibis_map_dir, map_file)
        print(f"Processing {beagle_map_filename} to create IBIS map...")

        with open(beagle_map_filename, "r") as beagle_map, open(ibis_map_filename, "w") as ibis_map:
            for line in beagle_map:
                fields = line.split()
                # Pad short lines with empty fields, mirroring how awk treats missing columns
                chrom, _, genetic_pos, physical_pos = (fields + [""] * 4)[:4]
                ibis_map.write(f"{chrom} {physical_pos} {genetic_pos} 0\n")

    print("All Beagle genetic maps converted to IBIS format.")

In [None]:
# Set up output directories
genetic_maps_dir = os.path.join(references_directory, "genetic_maps")
os.makedirs(genetic_maps_dir, exist_ok=True)

ibis_genetic_maps = os.path.join(genetic_maps_dir, "ibis_genetic_maps")
os.makedirs(ibis_genetic_maps, exist_ok=True)

# NOTE: `assembly` is not passed to preprocess_ibis_map(), which takes no arguments
assembly = "GRCh38"
preprocess_ibis_map()

# # Alternative source
# plink2_genetic_map_url="https://alkesgroup.broadinstitute.org/Eagle/downloads/tables/genetic_map_hg38_withX.txt.gz"

**Visual Inspection**

The above code should have created a set of genetic map files in the format that IBIS uses. Look in your `references/genetic_maps` directory and check for the `ibis_genetic_maps` subdirectory and the individual per-chromosome files within `ibis_genetic_maps`. Open the chromosome 1 file in both `ibis_genetic_maps` and `beagle_genetic_maps` and compare them visually. How are they similar? How are they different?

🛑 **STOP**

If you're using the newly downloaded data, you need to run Lab3 and Lab4 to process that data first. 

The following cells start with the files that are in your `results/phased_samples` directory (which is where Lab4 ends). 

For now, as a demonstration of the rest of this code, you can manually move the files out of `class_data/phased_samples` and place them in `results/phased_samples`. These files are from my run of all the files through Labs 3 and 4. To complete the lab, make sure you later delete these files from `results/phased_samples` and process the files you downloaded instead.

### Concatenate phased VCF files

In [None]:
%%bash -s "$data_directory" "$utils_directory" "$results_directory"
# Concatenate the per-chromosome phased VCFs (chr1-22) into one sorted, indexed
# VCF with stats, then remove the per-chromosome inputs.
#
# Stop at the first failing command: the per-chromosome files are deleted at the
# end, so nothing may be removed unless every earlier step succeeded.
set -euo pipefail

data_directory="$1"
utils_directory="$2"
results_directory="$3"

# Define the directory containing phased VCF files
phased_samples_dir="${results_directory}/phased_samples"

# Concatenate phased VCF files
echo "Creating list of phased VCF files..."
PHASED_FILE_LIST="${phased_samples_dir}/phased_file_list_sample.txt"

# Empty the file list if it already exists
: > "$PHASED_FILE_LIST"

for CHR in {1..22}; do
    PHASED_VCF="${phased_samples_dir}/merged_opensnps_phased_chr${CHR}.vcf.gz"
    if [ -f "$PHASED_VCF" ]; then
        echo "$PHASED_VCF" >> "$PHASED_FILE_LIST"
    else
        echo "Phased VCF missing for chromosome $CHR"
    fi
done

# Nothing to concatenate: stop before bcftools is handed an empty list
if [ ! -s "$PHASED_FILE_LIST" ]; then
    echo "No phased VCF files found in ${phased_samples_dir}. Pipeline aborted." >&2
    exit 1
fi

CONCATENATED_VCF="${phased_samples_dir}/merged_opensnps_autosomes.vcf"
SORTED_VCF="${phased_samples_dir}/merged_opensnps_autosomes_sorted.vcf.gz"
STATS_OUTPUT="${phased_samples_dir}/merged_opensnps_autosomes_sorted_stats.vchk"

# Concatenate VCF files; a failed run must not leave a partial file behind
if ! bcftools concat -o "$CONCATENATED_VCF" --file-list "$PHASED_FILE_LIST"; then
    rm -f "$CONCATENATED_VCF"
    echo "Concatenated VCF file missing. Pipeline aborted." >&2
    exit 1
fi

if [ ! -f "$CONCATENATED_VCF" ]; then
    echo "Concatenated VCF file missing. Pipeline aborted." >&2
    exit 1
fi

# Sort and compress the concatenated VCF
bcftools sort -Oz -o "$SORTED_VCF" "$CONCATENATED_VCF"

# Index the sorted VCF
bcftools index --tbi -f "$SORTED_VCF"

# Generate stats
bcftools stats -s - "$SORTED_VCF" > "$STATS_OUTPUT"

# The wildcard sits outside the quotes so the shell expands it;
# inside quotes it would be taken literally and match nothing.
rm -f "${results_directory}"/merged_opensnps_autosomes_step1*
rm -f "${results_directory}"/merged_opensnps_autosomes_step2*

echo "Phasing, cleanup, and concatenation completed successfully."

# Remove individual phased VCF files
echo "Removing individual phased VCF files..."
for CHR in {1..22}; do
    PHASED_VCF="${phased_samples_dir}/merged_opensnps_phased_chr${CHR}.vcf.gz"
    if [ -f "${PHASED_VCF}" ]; then
        rm -f "${PHASED_VCF}"
        rm -f "${PHASED_VCF}.tbi"
        rm -f "${phased_samples_dir}/merged_opensnps_phased_chr${CHR}.log"
        rm -f "${phased_samples_dir}/merged_opensnps_phased_chr${CHR}_stats.vchk"
        echo "Removed $PHASED_VCF and its index."
    fi
done
rm -f "${phased_samples_dir}/merged_opensnps_autosomes.vcf"

### Change the format of the data files from VCF to BED.

In [None]:
%%bash -s "$data_directory" "$utils_directory" "$results_directory"
# Convert the concatenated, sorted VCF into PLINK binary files with PLINK2.
# The output prefix is the VCF path without its .vcf.gz extension.

data_directory="$1"
utils_directory="$2"
results_directory="$3"

# Define
phased_samples_dir="${results_directory}/phased_samples"
vcf_file="${phased_samples_dir}/merged_opensnps_autosomes_sorted.vcf.gz"
plink2="${utils_directory}/plink2"

# Ensure the PLINK2 executable exists
if [[ ! -f "${plink2}" ]]; then
    echo "Error: PLINK2 executable not found: ${plink2}" >&2
    exit 1
fi

# Ensure the phased samples directory exists
if [[ ! -d "${phased_samples_dir}" ]]; then
    echo "Error: Phased samples directory not found: ${phased_samples_dir}" >&2
    exit 1
fi

# Check if the file exists
if [[ ! -f "$vcf_file" ]]; then
    echo "No matching VCF file found in $phased_samples_dir" >&2
    exit 1
fi

# Extract the file prefix (removing .vcf.gz extension)
output_prefix="${vcf_file%.vcf.gz}"

# Convert the VCF file to PLINK format.
# The executable path is quoted so a utils directory containing spaces still works.
if "${plink2}" --vcf "$vcf_file" --autosome --make-bed --out "$output_prefix"; then
    echo "PLINK2 successfully processed: ${vcf_file}"
else
    echo "Error processing ${vcf_file}" >&2
    # Propagate the failure so the notebook cell is reported as failed
    exit 1
fi

Explanation:
- `data_directory`, `utils_directory`, and `results_directory` are passed as arguments and assigned.
- The script verifies that plink2 exists and that the phased_samples_dir is a valid directory.
- It checks that the single concatenated file `merged_opensnps_autosomes_sorted.vcf.gz` exists before processing.
- Uses PLINK2 to convert that .vcf.gz file to PLINK binary format (.bed, .bim, .fam), keeping only the autosomes (`--autosome`).
- Handles errors and prints appropriate messages.

### Add Genetic Map to Bim File
(as per the IBIS developer)

In [None]:
!head -10 /home/lakishadavid/computational_genetic_genealogy/results/phased_samples/merged_opensnps_autosomes_sorted.bim

In [None]:
%%bash -s "$results_directory" "$references_directory" "$utils_directory"
# Run scripts_support/add_genetic_map.py (path relative to the current working
# directory) with three arguments: input .bim, genetic map directory, output file.
# All paths are derived from the directories passed in, not hardcoded.

results_directory="$1"
references_directory="$2"
utils_directory="$3"

bim_file="${results_directory}/phased_samples/merged_opensnps_autosomes_sorted.bim"
ibis_map_dir="${references_directory}/genetic_maps/ibis_genetic_maps"

python scripts_support/add_genetic_map.py \
  "${bim_file}" \
  "${ibis_map_dir}" \
  "${bim_file}.gm"

In [None]:
!head -10 /home/lakishadavid/computational_genetic_genealogy/results/phased_samples/merged_opensnps_autosomes_sorted.bim.gm

Explanation:
- The cell assigns its arguments to `results_directory`, `references_directory`, and `utils_directory`.
- It runs the Python helper script `scripts_support/add_genetic_map.py`.
- The first argument is the input `.bim` file, `merged_opensnps_autosomes_sorted.bim`, in `results/phased_samples`.
- The second argument is the `ibis_genetic_maps` directory that was created earlier in this lab.
- The third argument is the output file: the new file is saved with a `.bim.gm` suffix (`merged_opensnps_autosomes_sorted.bim.gm`).
- The cell itself performs no existence checks; any validation or error handling is left to `add_genetic_map.py`.

### Run the IBD Detection Algorithm

In [None]:
%%bash -s "$data_directory" "$results_directory" "$utils_directory"
# Run IBIS on the PLINK files in results/phased_samples (using the .bim.gm file
# as the bim input) and write outputs with the prefix
# <results_directory>/merged_opensnps_autosomes_ibis.

data_directory="$1"
results_directory="$2"
utils_directory="$3"

# Define the IBIS executable path
ibis="${utils_directory}/ibis/ibis"

# Ensure the IBIS executable exists
if [[ ! -f "${ibis}" ]]; then
    echo "Error: IBIS executable not found: ${ibis}" >&2
    # Stop here instead of trying to run a missing program
    exit 1
fi

bed_file="${results_directory}/phased_samples/merged_opensnps_autosomes_sorted.bed"
bim_file="${results_directory}/phased_samples/merged_opensnps_autosomes_sorted.bim.gm"
fam_file="${results_directory}/phased_samples/merged_opensnps_autosomes_sorted.fam"

# Fail early with a clear message if an input from the previous steps is missing
for input_file in "${bed_file}" "${bim_file}" "${fam_file}"; do
    if [[ ! -f "${input_file}" ]]; then
        echo "Error: IBIS input file not found: ${input_file}" >&2
        exit 1
    fi
done

# Paths are quoted so directories containing spaces are passed as single arguments
if "${ibis}" "${bed_file}" "${bim_file}" "${fam_file}" -ibd2 -min_l 7 -mt 500 -er .004 \
    -o "${results_directory}/merged_opensnps_autosomes_ibis" \
    -printCoef -noFamID; then
    echo "IBIS analysis completed successfully."
else
    echo "Error running IBIS analysis." >&2
    # Propagate the failure so the notebook cell is reported as failed
    exit 1
fi

# IBIS Output File Descriptions

## IBIS
This file contains detailed information about identity-by-descent (IBD) segments shared between pairs of individuals.

### Columns:
- **sample1, sample2**: IDs of the two individuals being compared for shared genetic segments.
- **chrom**: Chromosome number where the IBD segment is located.
- **phys_start_pos, phys_end_pos**: Start and end positions of the IBD segment in base pairs (physical positions).
- **IBD_type**: Type of IBD segment (e.g., IBD1 for sharing one parental haplotype or IBD2 for sharing both parental haplotypes).
- **genetic_start_pos, genetic_end_pos**: Start and end positions of the segment in genetic map units (centiMorgans).
- **genetic_seg_length**: Length of the IBD segment in centiMorgans (genetic distance).
- **marker_count**: Number of genetic markers (SNPs) within the segment.
- **error_count**: Total number of mismatches or genotyping errors detected in the segment.
- **error_density**: Average error rate per marker in the segment (error_count divided by marker_count).

---

## Coef
This file provides information about pairwise kinship coefficients and degrees of relatedness.

### Columns:
- **sample1, sample2**: IDs of the two individuals being compared.
- **kinship_coefficient**: A measure of genetic similarity between the individuals, ranging from 0 (no relation) to higher values for close relatives.
- **IBD2_fraction**: Proportion of the genome where both parental haplotypes are shared between the individuals.
- **segment_count**: Total number of IBD segments identified between the individuals.
- **degree_of_relatedness**: Classification of the relationship based on kinship (e.g., siblings, cousins).

---

## IBD2
Represents segments where two individuals share both parental haplotypes.  
IBD2 is particularly useful in identifying siblings or individuals with close familial ties, as these segments indicate inheritance from both sides of the family.

---

## HBD (Runs of Homozygosity)
Indicates segments where an individual has matching haplotypes on both chromosomes, likely due to inheritance from a common ancestor.  
This is a measure of inbreeding or autozygosity (when an individual inherits identical haplotypes from both parents).

### Columns:
- **sample_id**: ID of the individual being analyzed for HBD segments.
- **chrom**: Chromosome number where the HBD segment is located.
- **phys_start_pos, phys_end_pos**: Start and end positions of the HBD segment in base pairs.
- **HBD_type**: Type or classification of the HBD segment.
- **genetic_start_pos, genetic_end_pos**: Start and end positions of the segment in genetic map units (centiMorgans).
- **genetic_seg_length**: Length of the HBD segment in centiMorgans.
- **marker_count**: Number of genetic markers (SNPs) in the segment.
- **error_count**: Total number of mismatches or genotyping errors detected in the segment.
- **error_density**: Average error rate per marker in the segment.

---

## Incoef
Provides inbreeding coefficients for individuals, based on HBD analysis.

### Columns:
- **sample_id**: ID of the individual being analyzed.
- **inbreeding_coefficient**: A measure of inbreeding for the individual, reflecting the proportion of the genome covered by HBD segments.
- **segment_count**: Total number of HBD segments identified in the individual's genome.

### Explore The Coefficients Results

In [None]:
def explore_coefficients(
    results_directory,
    filename="merged_opensnps_autosomes_ibis.coef",
    focus_on_related=True,
    save_plots=True,
    show_plots=True,
    output_subdir="segments"
    ):
    """
    Read the coefficients file and produce summary tables and plots.

    The full table is always analysed; with ``focus_on_related`` the rows with
    ``Degree > 0`` are analysed as a second, "Filtered" dataset. For each dataset
    the function prints summary information, writes a CSV copy and an HTML table
    of counts per degree, and draws the plots for which the needed columns exist.
    All outputs go to ``<results_directory>/<output_subdir>`` with a
    ``full_`` / ``filtered_`` file name prefix.

    Parameters:
        results_directory (str): Directory containing the result files.
        filename (str): Filename of the tab-separated coefficients file.
        focus_on_related (bool): If True, additionally analyses related
            individuals (Degree > 0).
        save_plots (bool): If True, saves each plot as a PNG in the output directory.
        show_plots (bool): If True, displays each plot with ``plt.show()``.
        output_subdir (str): Sub-directory of ``results_directory`` that receives
            all CSV, HTML and PNG outputs (created if missing).

    Returns:
        None
    """
    # Ensure output directory exists
    output_dir = os.path.join(results_directory, output_subdir)
    os.makedirs(output_dir, exist_ok=True)

    def save_or_show_plot(fig, dataset_name, plot_filename):
        """Save and/or show ``fig`` as requested, then close it to free memory."""
        if save_plots:
            fig.savefig(os.path.join(output_dir, f"{dataset_name.lower()}_{plot_filename}"))
        if show_plots:
            plt.show()
        plt.close(fig)

    def plot_histogram(data, dataset_name, column, label, plot_filename, bins, kde):
        """Draw one histogram of ``column`` for the given dataset."""
        fig, ax = plt.subplots(figsize=(8, 5))
        sns.histplot(data[column], bins=bins, kde=kde, ax=ax)
        ax.set_title(f'{label} Distribution ({dataset_name})')
        ax.set_xlabel(label)
        ax.set_ylabel('Frequency')
        save_or_show_plot(fig, dataset_name, plot_filename)

    # Step 1: Read the coefficients file
    file_path = os.path.join(results_directory, filename)
    full_data = pd.read_csv(file_path, sep="\t", low_memory=False)

    datasets = {"Full": full_data}

    if focus_on_related:
        print("\nFocusing on related individuals (Degree > 0).")
        filtered_data = full_data[full_data['Degree'] > 0]
        print("Filtered DataFrame Info (Degree > 0):")
        filtered_data.info()
        print("\n=== Descriptive Statistics (Filtered) ===")
        print(filtered_data.describe())
        print("\n")
        filtered_file_path = os.path.join(output_dir, "filtered_coefficients.csv")
        filtered_data.to_csv(filtered_file_path, index=False)
        print(f"Filtered coefficients saved to: {filtered_file_path}")
        datasets["Filtered"] = filtered_data

    # Save and print the full data
    print("\nFull DataFrame Info:")
    full_data.info()
    print("\n=== Descriptive Statistics (Full) ===")
    print(full_data.describe())
    print("\n")
    full_file_path = os.path.join(output_dir, "full_coefficients.csv")
    full_data.to_csv(full_file_path, index=False)
    print(f"Full coefficients saved to: {full_file_path}")

    # Analyze each dataset ("Full" first, then "Filtered" when requested)
    for name, data in datasets.items():
        print(f"\n=== Analyzing {name} Data ===")

        # Counts by Degree
        degree_grouped_counts = data['Degree'].value_counts().sort_index()
        degree_grouped_counts_df = degree_grouped_counts.reset_index(name='Count')
        degree_grouped_counts_df.columns = ['Degree', 'Count']
        print(f"=== Counts by Degree ({name}) ===")
        print(degree_grouped_counts_df)

        # Save HTML table
        html_table = degree_grouped_counts_df.to_html(index=False)
        html_file_path = os.path.join(output_dir, f"{name.lower()}_degree_counts.html")
        with open(html_file_path, "w") as f:
            f.write(html_table)
        print(f"HTML table for {name} data saved to: {html_file_path}")

        # Display in Jupyter if available
        if hasattr(IPython, 'get_ipython') and IPython.get_ipython() is not None:
            display(HTML(html_table))

        # Degree distribution
        plot_histogram(data, name, 'Degree', 'Degree',
                       "degree_distribution.png", bins=10, kde=False)

        # Other plots are drawn only when their columns are present
        if 'Kinship_Coefficient' in data.columns:
            plot_histogram(data, name, 'Kinship_Coefficient', 'Kinship Coefficient',
                           "kinship_coefficient_distribution.png", bins=30, kde=True)

        if 'IBD2_Fraction' in data.columns:
            plot_histogram(data, name, 'IBD2_Fraction', 'IBD2 Fraction',
                           "ibd2_fraction_distribution.png", bins=30, kde=True)

        if all(col in data.columns for col in ['Kinship_Coefficient', 'IBD2_Fraction']):
            fig, ax = plt.subplots(figsize=(8, 5))
            sns.scatterplot(
                data=data,
                x='Kinship_Coefficient',
                y='IBD2_Fraction',
                hue='Degree', palette='viridis', ax=ax
            )
            ax.set_title(f'Kinship vs. IBD2 Fraction ({name})')
            ax.set_xlabel('Kinship Coefficient')
            ax.set_ylabel('IBD2 Fraction')
            # Address the axes explicitly rather than relying on pyplot's current axes
            ax.legend(title='Degree')
            save_or_show_plot(fig, name, "kinship_vs_ibd2_fraction.png")

        # Correlation matrix
        numeric_cols = ['Kinship_Coefficient', 'IBD2_Fraction', 'Segment_Count']
        existing_cols = [col for col in numeric_cols if col in data.columns]
        if existing_cols:
            fig, ax = plt.subplots(figsize=(6, 5))
            corr = data[existing_cols].corr()
            sns.heatmap(corr, annot=True, cmap='Blues', square=True, ax=ax)
            ax.set_title(f'Correlation Matrix ({name})')
            save_or_show_plot(fig, name, "correlation_matrix.png")

    print("\nAnalysis completed.")
    return

In [None]:
# Analyse the coefficients file; tables and plots are written to <results_directory>/segments
explore_coefficients(
    results_directory, 
    filename="merged_opensnps_autosomes_ibis.coef", 
    focus_on_related=True, 
    save_plots=True,
    show_plots=True,
    output_subdir="segments")

### New Results

Look in your `results/segments` directory. You should see several new files. The image files are the ones with the `.png` extension. Look at the 5 image files that start with `filtered_`. What do you think these mean?

### Explore The Segments Results

In [None]:
# Segment file written to the results directory
seg_file = os.path.join(results_directory, "merged_opensnps_autosomes_ibis.seg")

# The file is read without a header row, so column names are attached afterwards
seg_data_temp = pd.read_csv(seg_file, sep="\t", header=None)
seg_data_temp.columns = (
    "sample1 sample2 chrom "
    "phys_start_pos phys_end_pos "
    "IBD_type genetic_start_pos "
    "genetic_end_pos genetic_seg_length "
    "marker_count error_count error_density"
).split()

# Order segments by chromosome, then physical start/end position, then IBD type (all ascending)
seg_data = seg_data_temp.sort_values(
    ["chrom", "phys_start_pos", "phys_end_pos", "IBD_type"]
)

# Written tab-separated, without index or header row, under a .csv name
output_file = os.path.join(results_directory, "merged_opensnps_autosomes_ibis.csv")
seg_data.to_csv(output_file, sep="\t", index=False, header=False)

Notice the new file extension of `.csv`.

Let's create some data visualizations.

In [None]:
def explore_segments_ibis(
        results_directory, 
        filename="merged_opensnps_autosomes_ibis.seg",
        min_length=7, 
        min_markers=436, 
        max_error_density=0.004,
        save_plots=True,
        show_plots=True, 
        output_subdir="segments"
):
    """
    Load an IBIS segments file, filter it, and summarize both versions.

    The tab-delimited, header-less segments file is read and its twelve
    columns are labelled. Rows whose numeric fields cannot be parsed are set
    aside, descriptive statistics are printed, the unfiltered and filtered
    tables are written to disk, and distribution plots are drawn for both.

    Parameters:
        results_directory (str): Directory containing the segments file.
        filename (str): Filename of the segments file.
        min_length (float): Minimum genetic length (cM); segments shorter
            than this are excluded from the filtered table.
        min_markers (int): Minimum marker count; segments with fewer markers
            are excluded from the filtered table.
        max_error_density (float): Maximum error density; segments above
            this are excluded from the filtered table.
        save_plots (bool): If True, save each plot as a PNG in the output
            directory.
        show_plots (bool): If True, display each plot.
        output_subdir (str): Sub-directory of ``results_directory`` (created
            if missing) that receives the CSV files and plots.

    Returns:
        None. Results are reported through printed summaries and through the
        files written to the output directory:
        ``unfiltered_segments_ibis.csv``, ``filtered_segments_ibis.csv``,
        ``nan_segments_ibis.csv`` (only when unparseable rows exist) and,
        when ``save_plots`` is True, six PNG plots.
    """
    # Ensure output directory exists
    output_dir = os.path.join(results_directory, output_subdir)
    os.makedirs(output_dir, exist_ok=True)

    # Step 1: Read the segments file (tab-delimited, no header row)
    file_path = os.path.join(results_directory, filename)
    segments = pd.read_csv(file_path, sep="\t", header=None)
    segments.columns = [
        "id1", "id2", "chromosome", "physical_position_start", 
        "physical_position_end", "IBD_type", "genetic_position_start", 
        "genetic_position_end", "genetic_length", "marker_count", 
        "error_count", "error_density"
    ]

    # Coerce the columns used for filtering/plotting to numbers; anything
    # that cannot be parsed becomes NaN so it can be reported and dropped.
    numeric_columns = ["genetic_length", "marker_count", "error_density", "chromosome"]
    for col in numeric_columns:
        segments[col] = pd.to_numeric(segments[col], errors='coerce')

    # Keep a copy of the unparseable rows on disk before dropping them
    nan_rows = segments[segments[numeric_columns].isnull().any(axis=1)]
    if not nan_rows.empty:
        nan_file_path = os.path.join(output_dir, "nan_segments_ibis.csv")
        nan_rows.to_csv(nan_file_path, sep="\t", index=False)
        print(f"Rows with NaN values saved to: {nan_file_path}")
    segments = segments.dropna(subset=numeric_columns).reset_index(drop=True)

    # Step 2: Basic info and descriptive statistics
    stat_columns = ['genetic_length', 'marker_count', 'error_density']
    print("=== Segments DataFrame Info ===")
    segments.info()
    print("\n=== Descriptive Statistics ===")
    print(segments[stat_columns].describe())
    print("\n")

    # Save the unfiltered data
    unfiltered_file_path = os.path.join(output_dir, "unfiltered_segments_ibis.csv")
    segments.to_csv(unfiltered_file_path, sep="\t", index=False)
    print(f"Unfiltered segments saved to: {unfiltered_file_path}")
    print()

    # Step 3: Apply the length / marker-count / error-density thresholds
    filtered_segments = segments[
        (segments['genetic_length'] >= min_length) &
        (segments['marker_count'] >= min_markers) &
        (segments['error_density'] <= max_error_density)
    ].copy()
    
    print("=== Filtered Segments Info ===")
    filtered_segments.info()
    print("\n=== Descriptive Statistics (Filtered) ===")
    print(filtered_segments[stat_columns].describe())
    print("\n")
    
    # Save filtered segments to a new file
    filtered_filename = "filtered_segments_ibis.csv"
    filtered_file_path = os.path.join(output_dir, filtered_filename)
    filtered_segments.to_csv(filtered_file_path, sep="\t", index=False)
    print(f"Filtered segments saved to: {filtered_file_path}")

    print("\nSummary:")
    print(f"Total segments: {len(segments)}")
    print(f"Filtered segments: {len(filtered_segments)}")
    if not nan_rows.empty:
        print(f"Rows with NaN values: {len(nan_rows)} (saved to: {nan_file_path})")


    # Step 4: Visualizations
    def save_or_show_plot(fig, filename):
        """Save and/or display ``fig`` per the enclosing flags, then close it."""
        if save_plots:
            fig.savefig(os.path.join(output_dir, filename))
        if show_plots:
            plt.show()
        # Always close so repeated calls do not accumulate open figures
        plt.close(fig)

    def plot_distribution(data, column, title, xlabel, ylabel, filename, bins=30, kde=True):
        """Histogram (optionally with a KDE curve) of one column of ``data``."""
        fig, ax = plt.subplots(figsize=(8, 5))
        sns.histplot(data[column], bins=bins, kde=kde, ax=ax)
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        save_or_show_plot(fig, filename)

    # Visualize genetic_length distribution
    plot_distribution(
        segments, "genetic_length", "Distribution of Genetic Length", 
        "Genetic Length (cM)", "Frequency", "genetic_length_distribution_unfiltered.png"
    )

    plot_distribution(
        filtered_segments, "genetic_length", "Distribution of Genetic Length (Filtered)", 
        "Genetic Length (cM)", "Frequency", "genetic_length_distribution_filtered.png"
    )

    # Visualize marker_count distribution
    plot_distribution(
        segments, "marker_count", "Distribution of Marker Count", 
        "Marker Count", "Frequency", "marker_count_distribution_unfiltered.png"
    )
    plot_distribution(
        filtered_segments, "marker_count", "Distribution of Marker Count (Filtered)", 
        "Marker Count", "Frequency", "marker_count_distribution_filtered.png"
    )

    # Boxplot of genetic_length by chromosome
    def plot_boxplot(data, x_col, y_col, title, xlabel, ylabel, filename):
        """Box plot of ``y_col`` grouped by ``x_col``."""
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.boxplot(x=x_col, y=y_col, data=data, ax=ax)
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        # Act on this figure/axes explicitly instead of pyplot's "current" one
        ax.tick_params(axis="x", labelrotation=45)
        fig.tight_layout()
        save_or_show_plot(fig, filename)

    plot_boxplot(
        segments, "chromosome", "genetic_length", 
        "Distribution of Genetic Length by Chromosome", 
        "Chromosome", "Genetic Length (cM)", "genetic_length_by_chromosome_unfiltered.png"
    )
    plot_boxplot(
        filtered_segments, "chromosome", "genetic_length", 
        "Distribution of Genetic Length by Chromosome (Filtered)", 
        "Chromosome", "Genetic Length (cM)", "genetic_length_by_chromosome_filtered.png"
    )

    print("\nAnalysis completed.")
    return

In [None]:
# Run the segment exploration, passing the thresholds explicitly
# (min_length=7, min_markers=436, max_error_density=0.004);
# plots are both saved and shown.
explore_segments_ibis(
        results_directory, 
        filename="merged_opensnps_autosomes_ibis.seg",
        min_length=7, 
        min_markers=436, 
        max_error_density=0.004,
        save_plots=True,
        show_plots=True,
        output_subdir="segments"
)

The plots were saved in your `results/segments` directory.

Take a look at the plots. What information are they communicating? Some are meaningful. Some less so. What other information do you think should be communicated?

### Continue to explore the segment data.

The following cell takes the segment data output from the IBIS IBD detection algorithm and processes it using `pandas`.

In [None]:
# Load the raw IBIS segments (tab-delimited, no header row) and name the columns.
file_path = os.path.join(results_directory, "merged_opensnps_autosomes_ibis.seg")
segments = pd.read_csv(file_path, sep="\t", header=None)
segments.columns = [
    "id1", "id2", "chromosome",
    "physical_position_start", "physical_position_end",
    "IBD_type",
    "genetic_position_start", "genetic_position_end", "genetic_length",
    "marker_count", "error_count", "error_density",
]

# Coerce the analysis columns to numbers in one pass; values that cannot be
# parsed become NaN and those rows are then discarded.
numeric_columns = ["genetic_length", "marker_count", "error_density", "chromosome"]
segments[numeric_columns] = segments[numeric_columns].apply(pd.to_numeric, errors='coerce')
segments = segments.dropna(subset=numeric_columns).reset_index(drop=True)

The next cells summarize the table and then show its first five rows. Notice that the data now has column headers. The IBIS output file does not come with column headers, so we have to look at the developers' GitHub page or other documentation to determine the names of the columns.

In [None]:
# Print the column names, dtypes and non-null counts of the cleaned table
segments.info()

In [None]:
# Show the first five rows; pass a number, e.g. segments.head(10), to view more
segments.head() # You can enter a number greater than 5 to view more rows

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the
# per-segment length, marker and error columns
segments[["genetic_length", "marker_count", "error_count", "error_density"]].describe()

In [None]:
# Keep only segments that pass all three thresholds:
# genetic_length >= 7, marker_count >= 436, error_density <= 0.004
filtered_segments = segments.query(
    "genetic_length >= 7 and marker_count >= 436 and error_density <= 0.004"
).copy()

filtered_segments[["genetic_length", "marker_count", "error_count", "error_density"]].describe()

In [None]:
# Narrow the already-filtered segments to those of at least 20 cM
filtered_segments_20cM = filtered_segments.query("genetic_length >= 20").copy()
filtered_segments_20cM[["genetic_length", "marker_count", "error_count", "error_density"]].describe()

Let's aggregate the data by pairs instead of looking at it by segment.

In [None]:
import pandas as pd
import numpy as np

# Put every pair in a canonical order (the smaller ID in id1) so that
# (A, B) and (B, A) fall into the same group when aggregating by pair.
# This is done column-wise: a row-wise apply() builds a new Series per row,
# which is very slow on large segment tables.
first_id = filtered_segments["id1"]
second_id = filtered_segments["id2"]
needs_swap = first_id > second_id

# Compute both reordered columns before writing either one back, so the
# second computation never reads an already-updated column.
ordered_id1 = first_id.where(~needs_swap, second_id)
ordered_id2 = second_id.where(~needs_swap, first_id)
filtered_segments["id1"] = ordered_id1
filtered_segments["id2"] = ordered_id2

filtered_segments.head()

In [None]:
# Number of filtered segments shared by each (id1, id2) pair
pair_counts = (
    filtered_segments
    .groupby(["id1", "id2"])
    .size()
    .reset_index(name="pair_count")
)

# How many pairs share 1 segment, 2 segments, ... (value_counts orders the
# rows from the most common segment count to the least common)
pair_count_distribution = pair_counts["pair_count"].value_counts().reset_index()
pair_count_distribution.columns = ["Number of Segments", "Number of Pairs"]

# Render without the meaningless 0..n-1 row index
display(pair_count_distribution.style.hide(axis="index"))

In [None]:
# Number of filtered segments labelled IBD1
len(filtered_segments.loc[filtered_segments["IBD_type"] == "IBD1"])

In [None]:
# Number of filtered segments labelled IBD2
len(filtered_segments.loc[filtered_segments["IBD_type"] == "IBD2"])

In [None]:
# Sanity check: number of segments whose two IDs are identical
len(filtered_segments.loc[filtered_segments["id1"] == filtered_segments["id2"]])

In [None]:
# Group by id pairs and calculate all metrics at once
aggregated_segments = filtered_segments.groupby(["id1", "id2"]).agg(
    total_genetic_length=("genetic_length", "sum"),
    num_segments=("genetic_length", "count"),
    largest_segment=("genetic_length", "max")
).reset_index()

# Check distribution of values
print(aggregated_segments.describe())

In [None]:
# Same per-pair metrics as before, but split by IBD type so each
# (id1, id2, IBD_type) combination gets its own row.
aggregated_segments_by_type = (
    filtered_segments
    .groupby(["id1", "id2", "IBD_type"])["genetic_length"]
    .agg(
        total_genetic_length="sum",
        num_segments="count",
        largest_segment="max",
    )
    .reset_index()
)

# Show floats with six decimals (this changes the notebook-wide pandas setting)
pd.options.display.float_format = '{:.6f}'.format

# Summary statistics for the IBD1 rows, then for the IBD2 rows
ibd1_summary = aggregated_segments_by_type.query("IBD_type == 'IBD1'").describe()
display(ibd1_summary)

ibd2_summary = aggregated_segments_by_type.query("IBD_type == 'IBD2'").describe()
display(ibd2_summary)

In [None]:
# Summary statistics for the per-pair metrics. Only columns that exist in
# aggregated_segments are selected: it has no "second_largest_segment"
# column, so asking for one raised a KeyError and stopped a full re-run.
aggregated_segments[["total_genetic_length", "num_segments", "largest_segment"]].describe()

In [None]:
# Filter the pairs that meet the criteria
filtered_pairs1 = aggregated_segments[aggregated_segments["total_genetic_length"] >= 3000]

display(filtered_pairs1[["id1", "id2", "total_genetic_length"]])

In [None]:
# Filter the pairs that meet the criteria
filtered_pairs2 = aggregated_segments[aggregated_segments["total_genetic_length"] >= 1000]

display(filtered_pairs2[["id1", "id2", "total_genetic_length"]])

In [None]:
# Show the per-pair aggregate table (one row per id1/id2 pair)
display(aggregated_segments)

In [None]:
# Per-pair metrics to visualise, one panel each
columns = ["total_genetic_length", "num_segments", "largest_segment"]

# Row of histograms (with KDE overlay), one per metric
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
for ax, col in zip(axes, columns):
    label = col.replace('_', ' ').title()
    sns.histplot(aggregated_segments[col], bins=30, kde=True, ax=ax, edgecolor="black")
    ax.set_title(f"Distribution of {label}")
    ax.set_xlabel(label)
    ax.set_ylabel("Frequency")

plt.tight_layout()
plt.show()

# Row of box plots for the same metrics
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
for ax, col in zip(axes, columns):
    label = col.replace('_', ' ').title()
    sns.boxplot(y=aggregated_segments[col], ax=ax)
    ax.set_title(f"Box Plot of {label}")
    ax.set_ylabel(label)

plt.tight_layout()
plt.show()
