In [None]:
import os
from collections import Counter
import logging
import sys
from pathlib import Path
import subprocess
import os
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML
import IPython
import pandas as pd
from tqdm import tqdm
from pathlib import Path
from dotenv import load_dotenv
import numpy as np
from sklearn.metrics import precision_recall_curve, average_precision_score, roc_curve, auc
from intervaltree import IntervalTree

In [2]:
def find_comp_gen_dir(start=None, dir_name='computational_genetic_genealogy'):
    """Locate the project directory by searching upward from a starting directory.

    Every directory from ``start`` up to and including the filesystem root is
    checked for a child directory named ``dir_name``; the first match is returned.

    Args:
        start (str | Path | None): Directory to begin the search from.
            Defaults to the current working directory.
        dir_name (str): Name of the directory to look for.

    Returns:
        Path: Path of the first matching directory.

    Raises:
        FileNotFoundError: If no directory on the way up contains ``dir_name``.
    """
    origin = Path.cwd() if start is None else Path(start).resolve()

    # `parents` runs all the way to the filesystem root, so a project directory
    # sitting directly under the root is probed as well.
    for ancestor in (origin, *origin.parents):
        candidate = ancestor / dir_name
        if candidate.is_dir():
            return candidate

    raise FileNotFoundError(f"Could not find {dir_name} directory")

def load_env_file():
    """Load the project's .env file into the process environment.

    The project directory is located with ``find_comp_gen_dir``; a ``.env``
    file directly inside it is loaded with ``override=True``.

    Returns:
        Path | None: Path of the loaded .env file, or None when either the
        project directory or the .env file could not be found (a message is
        printed in both cases).
    """
    try:
        project_root = find_comp_gen_dir()
        dotenv_file = project_root / '.env'

        if dotenv_file.exists():
            # Values from the file take precedence over anything already set
            load_dotenv(dotenv_file, override=True)
            print(f"Loaded environment variables from: {dotenv_file}")
            return dotenv_file

        print(f"Warning: No .env file found in {project_root}")
        return None

    except FileNotFoundError as e:
        print(f"Error: {e}")
        return None

# Load the .env file (if any) before reading the project directory settings
env_path = load_env_file()

working_directory = os.getenv('PROJECT_WORKING_DIR', default=None)
data_directory = os.getenv('PROJECT_DATA_DIR', default=None)
references_directory = os.getenv('PROJECT_REFERENCES_DIR', default=None)
results_directory = os.getenv('PROJECT_RESULTS_DIR', default=None)
utils_directory = os.getenv('PROJECT_UTILS_DIR', default=None)

# label -> (source variable, exported variable, value)
_project_dirs = {
    "Working": ("PROJECT_WORKING_DIR", "WORKING_DIRECTORY", working_directory),
    "Data": ("PROJECT_DATA_DIR", "DATA_DIRECTORY", data_directory),
    "References": ("PROJECT_REFERENCES_DIR", "REFERENCES_DIRECTORY", references_directory),
    "Results": ("PROJECT_RESULTS_DIR", "RESULTS_DIRECTORY", results_directory),
    "Utils": ("PROJECT_UTILS_DIR", "UTILS_DIRECTORY", utils_directory),
}

# Fail early with a readable message: assigning None into os.environ (or passing
# it to os.chdir) would otherwise raise an opaque TypeError.
_missing_vars = [source for source, _, value in _project_dirs.values() if not value]
if _missing_vars:
    raise EnvironmentError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_vars)
        + ". Define them in the project's .env file or in the shell environment."
    )

for _label, (_, _exported_name, _value) in _project_dirs.items():
    print(f"{_label} Directory: {_value}")
    os.environ[_exported_name] = _value

os.chdir(working_directory)
print(f"The current directory is {os.getcwd()}")

Loaded environment variables from: /home/lakishadavid/computational_genetic_genealogy/.env
Working Directory: /home/lakishadavid/computational_genetic_genealogy
Data Directory: /home/lakishadavid/computational_genetic_genealogy/data
References Directory: /home/lakishadavid/computational_genetic_genealogy/references
Results Directory: /home/lakishadavid/computational_genetic_genealogy/results
Utils Directory: /home/lakishadavid/computational_genetic_genealogy/utils
The current directory is /home/lakishadavid/computational_genetic_genealogy


In [3]:
def configure_logging(log_filename, log_file_debug_level="INFO", console_debug_level="INFO"):
    """
    Attach a file handler and a console (stdout) handler to the root logger.

    The root logger itself is set to DEBUG so that each handler's own level
    decides what is emitted. Level names that are not attributes of the
    ``logging`` module fall back to INFO.

    Args:
        log_filename (str): Path to the log file where logs will be written.
        log_file_debug_level (str): Level name for the file handler.
        console_debug_level (str): Level name for the console handler.
    """
    line_format = '%(asctime)s - %(levelname)s - %(message)s'

    root = logging.getLogger()
    root.setLevel(logging.DEBUG)

    # Translate the level names before any handler is created
    file_threshold = getattr(logging, log_file_debug_level.upper(), logging.INFO)
    console_threshold = getattr(logging, console_debug_level.upper(), logging.INFO)

    to_file = logging.FileHandler(log_filename)
    to_console = logging.StreamHandler(sys.stdout)

    # File handler is registered first, console handler second
    for sink, threshold in ((to_file, file_threshold), (to_console, console_threshold)):
        sink.setLevel(threshold)
        sink.setFormatter(logging.Formatter(line_format))
        root.addHandler(sink)
    
def clear_logger():
    """Remove and close all handlers attached to the root logger.

    Closing matters for file handlers: without it every re-run of the setup
    cell would leave another open handle on the log file.
    """
    logger = logging.getLogger()
    # Iterate over a copy because removeHandler mutates logger.handlers
    for handler in logger.handlers[:]:
        logger.removeHandler(handler)
        # FileHandler.close() releases the file; a plain StreamHandler does not
        # close its underlying stream, so stdout is left untouched.
        handler.close()
        
log_filename = os.path.join(results_directory, "lab9_log.txt")
print(f"The Lab 9 log file is located at {log_filename}.")

# Create the results directory the first time the notebook is run
if not os.path.exists(results_directory):
    os.makedirs(results_directory)

# Create an empty log file if there is none yet
if not os.path.exists(log_filename):
    open(log_filename, 'w').close()

# Drop handlers left over from an earlier run of this cell, then attach fresh ones
clear_logger()
configure_logging(log_filename, log_file_debug_level="INFO", console_debug_level="INFO")

The Lab 9 log file is located at /home/lakishadavid/computational_genetic_genealogy/results/lab9_log.txt.


In [None]:
# Segment files are all read from the results directory
_results_root = Path(results_directory)

segments_refinedibd = _results_root / "merged_opensnps_autosomes_refinedibd.seg"
segments_hapibd = _results_root / "merged_opensnps_autosomes_hapibd.seg"
segments_ibis = _results_root / "merged_opensnps_autosomes_ibis.seg"
segments_pedsim = _results_root / "ped_sim_run2.seg"

In [None]:
def load_refined_ibd(filepath):
    """Load a Refined IBD segment file into a DataFrame.

    The file is read as whitespace-delimited text without a header row, using
    the column layout: sample1 sample2 chrom start end LOD cM.

    Args:
        filepath: Path (or file-like object) accepted by ``pandas.read_csv``.

    Returns:
        pandas.DataFrame: One row per segment with the columns above plus
        ``segment_id`` (0..n-1), ``tool`` ('RefinedIBD') and ``length``
        (``end - start``). If reading fails for any reason, the error is
        printed and an empty DataFrame with the same columns is returned.
    """
    # NOTE(review): the column layout is assumed, not validated. If the file has
    # more columns than names, pandas silently turns the leading columns into
    # the index -- confirm the layout against the file produced upstream.
    cols = ['sample1', 'sample2', 'chrom', 'start', 'end', 'LOD', 'cM']
    try:
        # Raw string: '\s' in a plain string literal is an invalid escape sequence
        df = pd.read_csv(filepath, sep=r'\s+', header=None, names=cols)
        # Create a unique segment ID for each segment
        df['segment_id'] = range(len(df))
        df['tool'] = 'RefinedIBD'
        df['length'] = df['end'] - df['start']
        return df
    except Exception as e:
        # Best effort: report and hand back an empty frame so callers can continue
        print(f"Error loading Refined IBD file: {e}")
        return pd.DataFrame(columns=cols + ['segment_id', 'tool', 'length'])

def load_hap_ibd(filepath):
    """Load a Hap IBD segment file into a DataFrame.

    The file is read as whitespace-delimited text without a header row, using
    the column layout: sample1 sample2 chrom start end cM num_sites LOD.

    Args:
        filepath: Path (or file-like object) accepted by ``pandas.read_csv``.

    Returns:
        pandas.DataFrame: One row per segment with the columns above plus
        ``segment_id`` (0..n-1), ``tool`` ('HapIBD') and ``length``
        (``end - start``). If reading fails for any reason, the error is
        printed and an empty DataFrame with the same columns is returned.
    """
    # NOTE(review): the column layout is assumed, not validated. A file with a
    # different column order would load without error but with mislabeled
    # values -- confirm the layout against the file produced upstream.
    cols = ['sample1', 'sample2', 'chrom', 'start', 'end', 'cM', 'num_sites', 'LOD']
    try:
        # Raw string: '\s' in a plain string literal is an invalid escape sequence
        df = pd.read_csv(filepath, sep=r'\s+', header=None, names=cols)
        df['segment_id'] = range(len(df))
        df['tool'] = 'HapIBD'
        df['length'] = df['end'] - df['start']
        return df
    except Exception as e:
        # Best effort: report and hand back an empty frame so callers can continue
        print(f"Error loading Hap IBD file: {e}")
        return pd.DataFrame(columns=cols + ['segment_id', 'tool', 'length'])

def load_ibis(filepath):
    """Load an IBIS segment file into a DataFrame.

    The file is read as whitespace-delimited text without a header row, using
    the column layout: sample1 sample2 chrom start end cM num_snps.

    Args:
        filepath: Path (or file-like object) accepted by ``pandas.read_csv``.

    Returns:
        pandas.DataFrame: One row per segment with the columns above plus
        ``segment_id`` (0..n-1), ``tool`` ('IBIS'), ``length`` (``end - start``)
        and ``LOD`` (always NaN, so the frame lines up with the other tools).
        If reading fails for any reason, the error is printed and an empty
        DataFrame with the same columns is returned.
    """
    # NOTE(review): the column layout is assumed, not validated -- adjust `cols`
    # to match the IBIS output actually produced upstream.
    cols = ['sample1', 'sample2', 'chrom', 'start', 'end', 'cM', 'num_snps']
    try:
        # Raw string: '\s' in a plain string literal is an invalid escape sequence
        df = pd.read_csv(filepath, sep=r'\s+', header=None, names=cols)
        df['segment_id'] = range(len(df))
        df['tool'] = 'IBIS'
        df['length'] = df['end'] - df['start']
        df['LOD'] = np.nan  # No LOD column is read for IBIS
        return df
    except Exception as e:
        # Best effort: report and hand back an empty frame so callers can continue
        print(f"Error loading IBIS file: {e}")
        return pd.DataFrame(columns=cols + ['segment_id', 'tool', 'length', 'LOD'])

def load_pedsim_truth(filepath):
    """Load ground-truth IBD segments (ped-sim output) into a DataFrame.

    The file is read as whitespace-delimited text without a header row, using
    the column layout: sample1 sample2 chrom start end segment_type.

    Args:
        filepath: Path (or file-like object) accepted by ``pandas.read_csv``.

    Returns:
        pandas.DataFrame: One row per segment with the columns above plus
        ``segment_id`` (0..n-1), ``tool`` ('Truth') and ``length``
        (``end - start``). If reading fails for any reason, the error is
        printed and an empty DataFrame with the same columns is returned.
    """
    # NOTE(review): the column layout is assumed, not validated -- adjust `cols`
    # to match the ped-sim output actually produced upstream.
    cols = ['sample1', 'sample2', 'chrom', 'start', 'end', 'segment_type']
    try:
        # Raw string: '\s' in a plain string literal is an invalid escape sequence
        df = pd.read_csv(filepath, sep=r'\s+', header=None, names=cols)
        df['segment_id'] = range(len(df))
        df['tool'] = 'Truth'
        df['length'] = df['end'] - df['start']
        return df
    except Exception as e:
        # Best effort: report and hand back an empty frame so callers can continue
        print(f"Error loading PedSim truth file: {e}")
        return pd.DataFrame(columns=cols + ['segment_id', 'tool', 'length'])

Now, let's create functions to evaluate IBD detection performance:

In [None]:
def create_interval_tree(truth_df):
    """Index truth segments in interval trees for fast overlap lookups.

    Args:
        truth_df: DataFrame with columns sample1, sample2, chrom, start, end
            and segment_id.

    Returns:
        dict: Maps ``((sampleA, sampleB), chrom)`` -- the sample pair in sorted
        order -- to an IntervalTree whose intervals span ``start``..``end`` and
        carry the segment_id as their data.
    """
    trees = {}
    for _, seg in truth_df.iterrows():
        # Sorting makes the key independent of the order the pair is listed in
        key = (tuple(sorted([seg['sample1'], seg['sample2']])), seg['chrom'])

        tree = trees.get(key)
        if tree is None:
            tree = trees[key] = IntervalTree()

        tree.addi(seg['start'], seg['end'], seg['segment_id'])

    return trees

def calculate_overlap(segment, tree):
    """Find the truth interval that shares the most bases with a segment.

    Args:
        segment: Mapping-like object with 'start' and 'end' entries.
        tree: Object whose ``overlap(start, end)`` returns the intervals
            (with ``begin``, ``end`` and ``data`` attributes) overlapping
            that range.

    Returns:
        tuple: ``(fraction, truth_id)`` where ``fraction`` is the largest
        single-interval overlap divided by the segment's own length and
        ``truth_id`` is that interval's data. ``(0, None)`` when nothing
        overlaps.
    """
    seg_start, seg_end = segment['start'], segment['end']

    hits = tree.overlap(seg_start, seg_end)
    if not hits:
        return 0, None

    # Keep the hit with the strictly largest shared span
    top_len, top_id = 0, None
    for hit in hits:
        shared = min(seg_end, hit.end) - max(seg_start, hit.begin)
        if shared > top_len:
            top_len, top_id = shared, hit.data

    return top_len / (seg_end - seg_start), top_id

def evaluate_tool(tool_df, truth_trees, min_overlap=0.5):
    """Label each called segment of one tool against the truth segments.

    Note: ``tool_df`` is modified in place (three columns are added) and the
    same object is returned.

    Args:
        tool_df: DataFrame of called segments with columns sample1, sample2,
            chrom, start and end.
        truth_trees: Dict as built by ``create_interval_tree``, keyed by
            ``((sampleA, sampleB), chrom)``.
        min_overlap (float): A call is a true positive when the fraction of
            its own length covered by its best-matching truth segment is
            strictly greater than this value. Defaults to 0.5.

    Returns:
        pandas.DataFrame: ``tool_df`` with added columns ``overlap_pct``
        (best overlap fraction, 0.0 when none), ``truth_id`` (segment_id of
        the best-matching truth segment, None when none) and
        ``detected_truth`` (bool).
    """
    # Defaults apply to calls whose sample pair / chromosome has no truth data
    tool_df['detected_truth'] = False
    tool_df['overlap_pct'] = 0.0
    tool_df['truth_id'] = None

    for idx, row in tool_df.iterrows():
        # Same order-independent key as used when the trees were built
        pair_key = tuple(sorted([row['sample1'], row['sample2']]))
        chrom = row['chrom']

        if (pair_key, chrom) in truth_trees:
            overlap_pct, truth_id = calculate_overlap(row, truth_trees[(pair_key, chrom)])
            tool_df.at[idx, 'overlap_pct'] = overlap_pct
            # truth_id is recorded for any overlap, even below the threshold
            tool_df.at[idx, 'truth_id'] = truth_id
            tool_df.at[idx, 'detected_truth'] = (overlap_pct > min_overlap)

    return tool_df

def evaluate_all_tools(refined_df, hap_df, ibis_df, truth_df):
    """Evaluate the Refined IBD, Hap IBD and IBIS calls against one truth set.

    Args:
        refined_df, hap_df, ibis_df: Per-tool DataFrames of called segments.
        truth_df: DataFrame of truth segments.

    Returns:
        tuple: ``(all_results, truth_df)`` where ``all_results`` is the
        concatenation of the three evaluated frames (original row labels are
        kept) and ``truth_df`` is passed through unchanged.
    """
    # One set of interval trees is shared by all three evaluations
    truth_trees = create_interval_tree(truth_df)

    evaluated = [
        evaluate_tool(calls, truth_trees)
        for calls in (refined_df, hap_df, ibis_df)
    ]

    return pd.concat(evaluated), truth_df

Now, let's create functions for visualizing the performance:

In [None]:
def plot_length_distribution(all_results, truth_df):
    """Plot the density of segment lengths for each tool and for the truth set.

    The figure is saved to 'ibd_length_distribution.png' in the current
    working directory and then shown.

    Args:
        all_results: Combined per-tool results with 'length' and 'tool' columns.
        truth_df: Truth segments with 'length' and 'tool' columns.
    """
    plt.figure(figsize=(12, 8))

    wanted = ['length', 'tool']
    frames = [
        all_results.loc[all_results['tool'] == tool_name, wanted]
        for tool_name in ('RefinedIBD', 'HapIBD', 'IBIS')
    ]
    frames.append(truth_df[wanted])

    # The inputs reuse the same row labels; give the combined frame a fresh
    # unique index so plotting never has to align on duplicate labels.
    all_data = pd.concat(frames, ignore_index=True)

    # Plot density
    sns.kdeplot(data=all_data, x='length', hue='tool', fill=True, alpha=0.5)

    plt.title('Distribution of IBD Segment Lengths')
    plt.xlabel('Segment Length (bp)')
    plt.ylabel('Density')
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.savefig('ibd_length_distribution.png')
    plt.show()

def plot_precision_recall(all_results, truth_df):
    """Plot precision-recall curves for each tool.

    Every called segment is one sample: its label is ``detected_truth`` and its
    ranking score is the LOD score when the tool has one, otherwise the segment
    length. The curves are computed over called segments only, so "recall" here
    is relative to a tool's own matching calls. Tools with no segments are
    skipped. The figure is saved to 'ibd_precision_recall.png' and shown.

    Args:
        all_results: Combined per-tool results with 'tool', 'LOD', 'length'
            and 'detected_truth' columns.
        truth_df: Not used by this function; kept for a uniform plotting
            signature.
    """
    plt.figure(figsize=(10, 8))

    for tool_name, color in zip(['RefinedIBD', 'HapIBD', 'IBIS'], ['blue', 'green', 'red']):
        tool_df = all_results[all_results['tool'] == tool_name]

        # A failed load yields an empty frame, which scikit-learn cannot score
        if tool_df.empty:
            print(f"Skipping {tool_name} in precision-recall plot: no segments to evaluate")
            continue

        if 'LOD' in tool_df.columns and not tool_df['LOD'].isna().all():
            score = tool_df['LOD']  # Use LOD score if available
        else:
            score = tool_df['length']  # Otherwise use length as a proxy for confidence

        y_true = tool_df['detected_truth'].astype(int)

        # Calculate precision and recall
        precision, recall, _ = precision_recall_curve(y_true, score)
        avg_precision = average_precision_score(y_true, score)

        # Plot PR curve
        plt.plot(recall, precision, lw=2, color=color,
                 label=f'{tool_name} (AP={avg_precision:.2f})')

    plt.title('Precision-Recall Curves for IBD Detection Tools')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.legend(loc="best")
    plt.tight_layout()
    plt.savefig('ibd_precision_recall.png')
    plt.show()

def plot_roc_curves(all_results, truth_df):
    """Plot ROC curves for each tool.

    Every called segment is one sample: its label is ``detected_truth`` and its
    ranking score is the LOD score when the tool has one, otherwise the segment
    length. Tools with no segments, or whose segments all share one label (ROC
    is undefined then), are skipped. The figure is saved to
    'ibd_roc_curves.png' and shown.

    Args:
        all_results: Combined per-tool results with 'tool', 'LOD', 'length'
            and 'detected_truth' columns.
        truth_df: Not used by this function; kept for a uniform plotting
            signature.
    """
    plt.figure(figsize=(10, 8))

    for tool_name, color in zip(['RefinedIBD', 'HapIBD', 'IBIS'], ['blue', 'green', 'red']):
        tool_df = all_results[all_results['tool'] == tool_name]

        # A failed load yields an empty frame, which scikit-learn cannot score
        if tool_df.empty:
            print(f"Skipping {tool_name} in ROC plot: no segments to evaluate")
            continue

        if 'LOD' in tool_df.columns and not tool_df['LOD'].isna().all():
            score = tool_df['LOD']  # Use LOD score if available
        else:
            score = tool_df['length']  # Otherwise use length as a proxy for confidence

        y_true = tool_df['detected_truth'].astype(int)

        # True/false positive rates need both classes to be present
        if y_true.nunique() < 2:
            print(f"Skipping {tool_name} in ROC plot: all segments have the same label")
            continue

        # Calculate ROC
        fpr, tpr, _ = roc_curve(y_true, score)
        roc_auc = auc(fpr, tpr)

        # Plot ROC curve
        plt.plot(fpr, tpr, lw=2, color=color,
                 label=f'{tool_name} (AUC={roc_auc:.2f})')

    # Diagonal reference line
    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='gray', alpha=0.8)
    plt.title('ROC Curves for IBD Detection Tools')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.legend(loc="lower right")
    plt.tight_layout()
    plt.savefig('ibd_roc_curves.png')
    plt.show()

def plot_overlap_histogram(all_results):
    """Draw overlaid histograms of ``overlap_pct`` for the three tools.

    The figure is saved to 'ibd_overlap_histogram.png' in the current working
    directory and then shown.

    Args:
        all_results: Combined per-tool results with 'tool' and 'overlap_pct'
            columns.
    """
    tool_colors = {'RefinedIBD': 'blue', 'HapIBD': 'green', 'IBIS': 'red'}

    plt.figure(figsize=(12, 8))

    for tool_name, color in tool_colors.items():
        overlaps = all_results.loc[all_results['tool'] == tool_name, 'overlap_pct']
        plt.hist(overlaps, bins=20, alpha=0.5, color=color, label=tool_name)

    plt.title('Distribution of Overlap with Truth Segments')
    plt.xlabel('Overlap Percentage')
    plt.ylabel('Count')
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.legend()
    plt.tight_layout()
    plt.savefig('ibd_overlap_histogram.png')
    plt.show()

def calculate_summary_metrics(all_results, truth_df):
    """Calculate summary statistics for each tool.

    Definitions used:
      * true positive  -- a called segment with ``detected_truth`` set;
      * false positive -- any other called segment;
      * detected truth segment -- a distinct ``truth_id`` matched by at least
        one true-positive call (calls classified as false positives do not
        count as detections, keeping recall consistent with precision);
      * precision = true positives / called segments;
      * recall    = detected truth segments / all truth segments;
      * F1        = harmonic mean of precision and recall.

    Args:
        all_results: Combined per-tool results with 'tool', 'detected_truth'
            and 'truth_id' columns.
        truth_df: Truth segments; only its row count is used.

    Returns:
        pandas.DataFrame: One row per tool (RefinedIBD, HapIBD, IBIS) with the
        columns Tool, Total Segments, True Positives, False Positives,
        Detected Truth Segments, Precision, Recall and F1 Score. Ratios are 0
        when their denominator is 0.
    """
    metrics = []

    # Count total truth segments
    total_truth = len(truth_df)

    for tool_name in ['RefinedIBD', 'HapIBD', 'IBIS']:
        tool_df = all_results[all_results['tool'] == tool_name]
        total_calls = len(tool_df)

        is_true_positive = tool_df['detected_truth'].astype(bool)
        true_positives = int(is_true_positive.sum())
        false_positives = total_calls - true_positives

        # Distinct truth segments hit by a true-positive call; dropna() discards
        # both None and NaN placeholders for "no match".
        detected_truths = int(tool_df.loc[is_true_positive, 'truth_id'].dropna().nunique())

        recall = detected_truths / total_truth if total_truth > 0 else 0
        precision = true_positives / total_calls if total_calls > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        metrics.append({
            'Tool': tool_name,
            'Total Segments': total_calls,
            'True Positives': true_positives,
            'False Positives': false_positives,
            'Detected Truth Segments': detected_truths,
            'Precision': precision,
            'Recall': recall,
            'F1 Score': f1
        })

    return pd.DataFrame(metrics)

def plot_summary_barplot(metrics_df):
    """Draw a grouped bar chart of Precision, Recall and F1 Score per tool.

    The figure is saved to 'ibd_performance_metrics.png' in the current
    working directory and then shown.

    Args:
        metrics_df: DataFrame with a 'Tool' column and 'Precision', 'Recall'
            and 'F1 Score' columns.
    """
    plt.figure(figsize=(14, 10))

    # Long form: one row per (tool, metric) pair, as the grouped bars need
    long_form = metrics_df.melt(
        id_vars=['Tool'],
        value_vars=['Precision', 'Recall', 'F1 Score'],
        var_name='Metric',
        value_name='Value',
    )

    sns.barplot(data=long_form, x='Tool', y='Value', hue='Metric')

    plt.title('Performance Metrics by IBD Detection Tool')
    plt.ylabel('Score')
    plt.ylim(0, 1)
    plt.grid(True, linestyle='--', alpha=0.7, axis='y')
    plt.tight_layout()
    plt.savefig('ibd_performance_metrics.png')
    plt.show()

Finally, let's put everything together with a main function:

In [None]:
def main(refined_path=None, hap_path=None, ibis_path=None, truth_path=None):
    """Run the IBD evaluation: load segments, score each tool, plot, summarize.

    Args:
        refined_path, hap_path, ibis_path, truth_path: Optional input files.
            Each defaults to the corresponding ``segments_*`` path defined
            earlier in this notebook (in the results directory).
    """
    # Fall back to the segment files configured earlier in the notebook rather
    # than to placeholder paths that do not exist.
    refined_path = segments_refinedibd if refined_path is None else refined_path
    hap_path = segments_hapibd if hap_path is None else hap_path
    ibis_path = segments_ibis if ibis_path is None else ibis_path
    truth_path = segments_pedsim if truth_path is None else truth_path

    # Load data
    print("Loading data...")
    refined_df = load_refined_ibd(refined_path)
    hap_df = load_hap_ibd(hap_path)
    ibis_df = load_ibis(ibis_path)
    truth_df = load_pedsim_truth(truth_path)

    # Print data summaries
    print(f"Loaded {len(refined_df)} Refined IBD segments")
    print(f"Loaded {len(hap_df)} Hap IBD segments")
    print(f"Loaded {len(ibis_df)} IBIS segments")
    print(f"Loaded {len(truth_df)} truth segments from ped-sim")

    # Evaluate tools
    print("Evaluating IBD detection tools...")
    all_results, truth_df = evaluate_all_tools(refined_df, hap_df, ibis_df, truth_df)

    # Generate visualizations
    print("Generating visualizations...")
    plot_length_distribution(all_results, truth_df)
    plot_precision_recall(all_results, truth_df)
    plot_roc_curves(all_results, truth_df)
    plot_overlap_histogram(all_results)

    # Calculate and display summary metrics
    print("Calculating summary metrics...")
    metrics_df = calculate_summary_metrics(all_results, truth_df)
    print(metrics_df)
    plot_summary_barplot(metrics_df)

    print("Evaluation complete!")

if __name__ == "__main__":
    main()

In [None]:
def plot_chromosome_performance(all_results, truth_df):
    """Plot precision, recall and F1 per chromosome for each tool.

    Chromosomes are taken from the truth set. For every tool/chromosome pair:
      * precision = true-positive calls / calls on that chromosome;
      * recall    = distinct truth segments matched by a true-positive call /
                    truth segments on that chromosome;
      * F1        = harmonic mean of the two.
    Pairs with no calls keep all three metrics at 0. The grid of bar charts is
    saved to 'ibd_chromosome_performance.png' and shown.

    Args:
        all_results: Combined per-tool results with 'tool', 'chrom',
            'detected_truth' and 'truth_id' columns.
        truth_df: Truth segments with a 'chrom' column.
    """
    tools = ['RefinedIBD', 'HapIBD', 'IBIS']

    # Get unique chromosomes
    all_chroms = sorted(truth_df['chrom'].unique())
    if not all_chroms:
        # Nothing to facet on; an empty frame cannot be laid out as a grid
        print("No truth segments available; skipping chromosome performance plot")
        return

    truth_per_chrom = {chrom: int((truth_df['chrom'] == chrom).sum()) for chrom in all_chroms}

    # One record per (tool, chromosome, metric)
    plot_data = []
    for tool in tools:
        tool_calls = all_results[all_results['tool'] == tool]

        for chrom in all_chroms:
            chrom_calls = tool_calls[tool_calls['chrom'] == chrom]
            total_truth = truth_per_chrom[chrom]

            precision = recall = f1 = 0
            if len(chrom_calls) > 0:
                is_true_positive = chrom_calls['detected_truth'].astype(bool)
                true_positives = int(is_true_positive.sum())

                # Only true-positive calls count as detecting a truth segment
                detected_truths = chrom_calls.loc[is_true_positive, 'truth_id'].dropna().nunique()

                precision = true_positives / len(chrom_calls)
                recall = detected_truths / total_truth if total_truth > 0 else 0
                f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

            for metric, value in (('precision', precision), ('recall', recall), ('f1', f1)):
                plot_data.append({
                    'Tool': tool,
                    'Chromosome': chrom,
                    'Metric': metric.capitalize(),
                    'Value': value
                })

    plot_df = pd.DataFrame(plot_data)

    # FacetGrid creates its own figure, so no separate plt.figure() call is
    # made here (it would only leave an empty extra figure behind).
    g = sns.FacetGrid(plot_df, col='Metric', row='Tool', height=3, aspect=2)
    g.map_dataframe(sns.barplot, x='Chromosome', y='Value')
    g.set_axis_labels('Chromosome', 'Score')
    g.set_titles('{row_name} - {col_name}')

    for ax in g.axes.flat:
        ax.set_ylim(0, 1)
        ax.grid(True, linestyle='--', alpha=0.7, axis='y')

    plt.tight_layout()
    plt.savefig('ibd_chromosome_performance.png')
    plt.show()

In [None]:
def plot_accuracy_by_length(all_results, truth_df):
    """Plot detection rate as a function of truth-segment length.

    Truth segments are grouped into fixed length bins. For each tool the
    detection rate of a bin is the share of its truth segments whose
    ``segment_id`` was matched by at least one true-positive call of that tool
    (0 for empty bins). Two figures are produced: the detection-rate lines
    ('ibd_detection_by_length.png') and a bar chart of truth-segment counts per
    bin ('ibd_truth_segments_by_length.png'). ``truth_df`` is not modified.

    Args:
        all_results: Combined per-tool results with 'tool', 'detected_truth'
            and 'truth_id' columns.
        truth_df: Truth segments with 'length' and 'segment_id' columns.
    """
    # Bin edges in base pairs; intervals are right-inclusive, so a length of
    # exactly 0 falls outside every bin.
    bins = [0, 1e4, 5e4, 1e5, 5e5, 1e6, 5e6, float('inf')]
    bin_labels = ['<10kb', '10-50kb', '50-100kb', '0.1-0.5Mb', '0.5-1Mb', '1-5Mb', '>5Mb']

    # Kept as a separate Series so the caller's truth frame is left untouched
    length_bin = pd.cut(truth_df['length'], bins=bins, labels=bin_labels).rename('length_bin')

    # Count total truth segments per bin (empty bins included)
    truth_counts = truth_df.groupby(length_bin, observed=False).size()

    plt.figure(figsize=(12, 8))

    for tool_name, color in zip(['RefinedIBD', 'HapIBD', 'IBIS'], ['blue', 'green', 'red']):
        tool_df = all_results[all_results['tool'] == tool_name]

        # Truth ids hit by a true-positive call, collected once per tool so that
        # each truth segment is checked with a constant-time set lookup.
        is_true_positive = tool_df['detected_truth'].astype(bool)
        matched_ids = set(tool_df.loc[is_true_positive, 'truth_id'].dropna())
        was_detected = pd.Series(
            [segment_id in matched_ids for segment_id in truth_df['segment_id']],
            index=truth_df.index,
            dtype=bool,
        )

        detection_rates = []
        for bin_label in bin_labels:
            in_bin = (length_bin == bin_label)
            total = int(in_bin.sum())
            detected = int(was_detected[in_bin].sum())
            detection_rates.append(detected / total if total > 0 else 0)

        plt.plot(bin_labels, detection_rates, marker='o', label=tool_name, color=color, linewidth=2)

    plt.title('IBD Detection Rate by Segment Length')
    plt.xlabel('Segment Length')
    plt.ylabel('Detection Rate')
    plt.ylim(0, 1.05)
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.legend()
    plt.tight_layout()
    plt.savefig('ibd_detection_by_length.png')
    plt.show()

    # Also create a bar chart showing segment counts by length
    plt.figure(figsize=(10, 6))
    truth_counts.plot(kind='bar', color='purple')
    plt.title('Number of Truth Segments by Length')
    plt.xlabel('Segment Length')
    plt.ylabel('Count')
    plt.grid(True, linestyle='--', alpha=0.7, axis='y')
    plt.tight_layout()
    plt.savefig('ibd_truth_segments_by_length.png')
    plt.show()

In [None]:
def main(refined_path=None, hap_path=None, ibis_path=None, truth_path=None):
    """Run the full IBD evaluation, including the per-chromosome and per-length plots.

    This definition replaces the earlier ``main`` in the notebook: it repeats
    the load / evaluate / plot / summarize steps and then adds the two extra
    visualizations.

    Args:
        refined_path, hap_path, ibis_path, truth_path: Optional input files.
            Each defaults to the corresponding ``segments_*`` path defined
            earlier in this notebook (in the results directory).
    """
    refined_path = segments_refinedibd if refined_path is None else refined_path
    hap_path = segments_hapibd if hap_path is None else hap_path
    ibis_path = segments_ibis if ibis_path is None else ibis_path
    truth_path = segments_pedsim if truth_path is None else truth_path

    # Load data
    print("Loading data...")
    refined_df = load_refined_ibd(refined_path)
    hap_df = load_hap_ibd(hap_path)
    ibis_df = load_ibis(ibis_path)
    truth_df = load_pedsim_truth(truth_path)

    print(f"Loaded {len(refined_df)} Refined IBD segments")
    print(f"Loaded {len(hap_df)} Hap IBD segments")
    print(f"Loaded {len(ibis_df)} IBIS segments")
    print(f"Loaded {len(truth_df)} truth segments from ped-sim")

    # Evaluate tools; the results are local to this function, so every plot
    # that needs them has to be called from here.
    print("Evaluating IBD detection tools...")
    all_results, truth_df = evaluate_all_tools(refined_df, hap_df, ibis_df, truth_df)

    # Generate visualizations
    print("Generating visualizations...")
    plot_length_distribution(all_results, truth_df)
    plot_precision_recall(all_results, truth_df)
    plot_roc_curves(all_results, truth_df)
    plot_overlap_histogram(all_results)

    # Calculate and display summary metrics
    print("Calculating summary metrics...")
    metrics_df = calculate_summary_metrics(all_results, truth_df)
    print(metrics_df)
    plot_summary_barplot(metrics_df)

    # Additional visualizations
    plot_chromosome_performance(all_results, truth_df)
    plot_accuracy_by_length(all_results, truth_df)

    print("Evaluation complete!")

if __name__ == "__main__":
    main()