# Cross-Species Consensus Peak Pipeline

This notebook creates a unified peak set across species for cross-species accessibility comparison.

**Pipeline steps:**
1. Liftover species consensus peaks → hg38 (human)
2. Merge all lifted peaks into unified human consensus
3. Add peak IDs for tracking across species
4. Liftover unified peaks back to each species genome
5. Create presence/absence matrix

---

In [None]:
import os
import sys
from pathlib import Path

# Add src to path
def resolve_pipeline_dir(cwd):
    """Return the directory that should be put on ``sys.path`` for ``src``.

    If the working directory is itself named ``notebooks`` the result is its
    parent; otherwise the working directory is returned unchanged.

    Only the final path component is inspected. A substring test on the whole
    path would also fire for an unrelated ancestor such as
    ``/data/notebooks_archive/pipeline`` and wrongly select the parent.

    Args:
        cwd: Working directory as a string or ``Path``.

    Returns:
        ``Path`` of the pipeline root.
    """
    cwd = Path(cwd)
    return cwd.parent if cwd.name == "notebooks" else cwd


PIPELINE_DIR = resolve_pipeline_dir(os.getcwd())
sys.path.insert(0, str(PIPELINE_DIR))

from src import (
    cross_species_consensus_pipeline,
    create_peak_matrix,
    liftover_two_step,
    get_chain_file,
    DEFAULT_CHAIN_DIR,
    REVERSE_CHAIN_FILES,
)

# Report which directory was placed on sys.path for the `src` imports
load_banner = f"‚úÖ Pipeline loaded from {PIPELINE_DIR}"
print(load_banner)

In [None]:
# Report, for each entry of REVERSE_CHAIN_FILES, whether the chain file
# exists under DEFAULT_CHAIN_DIR (these are used for lifting back)
print("Available reverse chain files (hg38 ‚Üí species):")
print("=" * 50)
status_marks = {True: "‚úÖ", False: "‚ùå"}
for species_name, chain_name in REVERSE_CHAIN_FILES.items():
    chain_path = os.path.join(DEFAULT_CHAIN_DIR, chain_name)
    mark = status_marks[os.path.exists(chain_path)]
    print(f"{mark} {species_name}: {chain_name}")

## Configuration

Set your input files and output directory below.

In [None]:
# -----------------------------------------------------------------------------
# USER SETTINGS - edit the values below before running the pipeline
# -----------------------------------------------------------------------------

# Directory that receives all results
OUTPUT_DIR = "/path/to/output/cross_species_consensus"

# Consensus peak BED per species, each in that species' own coordinates
SPECIES_BEDS = dict(
    Gorilla="/path/to/Gorilla_consensus_peaks.bed",
    Chimpanzee="/path/to/Chimpanzee_consensus_peaks.bed",
    Bonobo="/path/to/Bonobo_consensus_peaks.bed",
    Macaque="/path/to/Macaque_consensus_peaks.bed",
    Marmoset="/path/to/Marmoset_consensus_peaks.bed",
)

# Directory containing the chain files (defaults to the package setting)
CHAIN_DIR = DEFAULT_CHAIN_DIR

# Path of the liftOver executable on the cluster
LIFTOVER_PATH = "/cluster/project/treutlein/jjans/software/miniforge3/envs/genomes/bin/liftOver"

# Merge distance; 0 means only overlapping peaks are merged
MERGE_DISTANCE = 0

# Prefix used for peak IDs
PEAK_PREFIX = "unified"

print(f"Output directory: {OUTPUT_DIR}")

In [None]:
# Validate input files
def count_bed_records(bed_path):
    """Count the data lines of a BED file.

    Blank lines, ``#`` comments (even when indented) and UCSC ``track`` /
    ``browser`` header lines are skipped, since none of them describes a peak.

    Args:
        bed_path: Path of the BED file to read.

    Returns:
        Number of remaining (data) lines.
    """
    n_records = 0
    with open(bed_path) as handle:
        for raw_line in handle:
            fields = raw_line.split()
            if not fields:
                continue  # blank or whitespace-only line
            if fields[0].startswith('#') or fields[0] in ('track', 'browser'):
                continue  # comment or UCSC header line
            n_records += 1
    return n_records


print("Input files:")
print("=" * 60)
for species, filepath in SPECIES_BEDS.items():
    # isfile (not exists): a directory at this path would make open() fail
    if os.path.isfile(filepath):
        n_peaks = count_bed_records(filepath)
        print(f"‚úÖ {species}: {n_peaks:,} peaks")
    else:
        print(f"‚ùå {species}: NOT FOUND - {filepath}")

## Run Full Pipeline

This runs the complete cross-species consensus pipeline in one go.

In [None]:
# Run the full pipeline
# All arguments are gathered first so this run's settings sit in one place.
pipeline_args = {
    "species_beds": SPECIES_BEDS,
    "output_dir": OUTPUT_DIR,
    "chain_dir": CHAIN_DIR,
    "liftover_path": LIFTOVER_PATH,
    "min_match": 0.95,
    "merge_distance": MERGE_DISTANCE,
    "peak_prefix": PEAK_PREFIX,
    "verbose": True,
}
results = cross_species_consensus_pipeline(**pipeline_args)

# Show the pipeline's own status message
print(f"\n{results['message']}")

## Create Peak Presence Matrix

Creates a matrix showing which peaks are present/absent in each species.

In [None]:
# Create peak presence/absence matrix
if results["status"] != "success":
    print("‚ö†Ô∏è  Pipeline did not complete successfully, skipping matrix creation")
else:
    matrix_file = os.path.join(OUTPUT_DIR, "peak_presence_matrix.tsv")
    pipeline_outputs = results["output_files"]

    # Keep only the species for which the pipeline reported an output file
    species_lifted_beds = {}
    for species_name in SPECIES_BEDS:
        if species_name in pipeline_outputs:
            species_lifted_beds[species_name] = pipeline_outputs[species_name]

    matrix_result = create_peak_matrix(
        unified_human_bed=pipeline_outputs["human_consensus"],
        species_beds=species_lifted_beds,
        output_file=matrix_file,
        verbose=True,
    )

In [None]:
# Preview the matrix
if results["status"] == "success":
    import pandas as pd

    # Load the table at the path given to create_peak_matrix as its output
    matrix_df = pd.read_csv(matrix_file, sep='\t')
    n_rows = len(matrix_df)
    print(f"Matrix shape: {matrix_df.shape}")
    print("\nFirst 10 rows:")
    display(matrix_df.head(10))

    # Every column other than the ID/coordinate fields is treated as a species
    non_species = ('peak_id', 'chr', 'start', 'end')
    species_cols = [name for name in matrix_df.columns if name not in non_species]

    # Per-species column total and its share of all rows
    print("\nüìä Peaks per species:")
    for sp in species_cols:
        count = matrix_df[sp].sum()
        pct = count / n_rows * 100
        print(f"   {sp}: {count:,} ({pct:.1f}%)")

## Conservation Analysis

In [None]:
# Analyze conservation levels
if results["status"] == "success":
    import pandas as pd
    import matplotlib.pyplot as plt

    # Re-read the matrix so this cell does not depend on the preview cell
    matrix_df = pd.read_csv(matrix_file, sep='\t')
    id_and_coord_cols = ('peak_id', 'chr', 'start', 'end')
    species_cols = [name for name in matrix_df.columns if name not in id_and_coord_cols]

    # Row-wise total over the species columns: in how many species each peak is present
    matrix_df['n_species'] = matrix_df[species_cols].sum(axis=1)
    total_peaks = len(matrix_df)

    # Number of peaks for each n_species value, ordered by n_species
    conservation_counts = matrix_df['n_species'].value_counts().sort_index()

    print("üìä Peak conservation distribution:")
    print("=" * 40)
    for n, count in conservation_counts.items():
        pct = count / total_peaks * 100
        bar = "‚ñà" * int(pct / 2)  # one bar character per 2 percentage points
        print(f"{n} species: {count:>6,} ({pct:>5.1f}%) {bar}")

    # Bar chart of the same distribution
    fig, ax = plt.subplots(figsize=(8, 5))
    conservation_counts.plot.bar(ax=ax, color='steelblue', edgecolor='black')
    ax.set(
        xlabel='Number of species with peak',
        ylabel='Number of peaks',
        title='Peak Conservation Across Species',
    )
    fig.tight_layout()

    # Write the figure next to the other outputs, then render it inline
    plot_file = os.path.join(OUTPUT_DIR, "conservation_distribution.png")
    fig.savefig(plot_file, dpi=150)
    print(f"\nüìä Plot saved: {plot_file}")
    plt.show()

## Output Summary

In [None]:
# List all output files
print("\nüìÅ Output files:")
print("=" * 70)

if not os.path.isdir(OUTPUT_DIR):
    # os.walk yields nothing for a missing directory, so say so explicitly
    print(f"Output directory not found: {OUTPUT_DIR}")

for root, dirs, files in os.walk(OUTPUT_DIR):
    dirs.sort()  # in-place sort makes os.walk descend in a stable order
    # Depth relative to OUTPUT_DIR. relpath is unaffected by a trailing
    # separator on OUTPUT_DIR or by its text re-appearing deeper in the path,
    # both of which break a plain string replace.
    rel_root = os.path.relpath(root, OUTPUT_DIR)
    level = 0 if rel_root == os.curdir else rel_root.count(os.sep) + 1
    indent = ' ' * 2 * level
    # normpath drops a trailing separator so the top-level name is not empty
    print(f"{indent}{os.path.basename(os.path.normpath(root))}/")
    subindent = ' ' * 2 * (level + 1)
    for file in sorted(files):
        filepath = os.path.join(root, file)
        size_kb = os.path.getsize(filepath) / 1024
        print(f"{subindent}{file} ({size_kb:.1f} KB)")

---

## Manual Step-by-Step (Alternative)

If you prefer more control, you can run each step individually.

In [None]:
# # STEP 1: Liftover to human (uncomment to run manually)
# from src import liftover_peaks, liftover_two_step, get_chain_file
# 
# lifted_beds = []
# for species, input_bed in SPECIES_BEDS.items():
#     output_bed = f"{OUTPUT_DIR}/01_lifted_to_human/{species}_hg38.bed"
#     os.makedirs(os.path.dirname(output_bed), exist_ok=True)
#     
#     if species == "Marmoset":
#         result = liftover_two_step(
#             input_bed=input_bed,
#             output_bed=output_bed,
#             chain_file_1=get_chain_file("Marmoset_step1"),
#             chain_file_2=get_chain_file("Marmoset_step2"),
#             liftover_path=LIFTOVER_PATH,
#         )
#     else:
#         result = liftover_peaks(
#             input_bed=input_bed,
#             output_bed=output_bed,
#             chain_file=get_chain_file(species),
#             liftover_path=LIFTOVER_PATH,
#         )
#     
#     if result["status"] == "success":
#         lifted_beds.append(output_bed)
#     print(result["message"])

In [None]:
# # STEP 2: Merge peaks (uncomment to run manually)
# from src import merge_bed_files, add_peak_ids
# 
# merged_bed = f"{OUTPUT_DIR}/02_merged_consensus/unified_consensus_hg38.bed"
# merge_bed_files(lifted_beds, merged_bed, merge_distance=0)
# 
# # Add IDs
# merged_with_ids = merged_bed.replace('.bed', '_with_ids.bed')
# add_peak_ids(merged_bed, merged_with_ids, prefix="unified")

In [None]:
# # STEP 3: Liftback to species (uncomment to run manually)
# from src import liftback_peaks
# 
# for species in SPECIES_BEDS.keys():
#     output_bed = f"{OUTPUT_DIR}/03_lifted_back/unified_consensus_{species}.bed"
#     os.makedirs(os.path.dirname(output_bed), exist_ok=True)
#     
#     result = liftback_peaks(
#         input_bed=merged_with_ids,
#         output_bed=output_bed,
#         species=species,
#         chain_dir=CHAIN_DIR,
#         liftover_path=LIFTOVER_PATH,
#     )
#     print(result["message"])