# Cross-Species Consensus Peak Analysis

This notebook creates a unified consensus peak set across all species for comparative ATAC-seq analysis.

## Workflow
1. **Liftover** species consensus peaks → hg38 (pre-computed; Step 1 below loads the already-lifted files)
2. **Merge** all lifted peaks + Human peaks into unified hg38 consensus
3. **Add peak IDs** for cross-species tracking
4. **Liftback** unified peaks to each species genome
5. **Create peak matrix** showing presence/absence across species

In [None]:
# Imports
import glob
import os
import shutil
import sys
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Suppress FutureWarning messages for the rest of the session.
warnings.simplefilter("ignore", category=FutureWarning)

# Put the atac_pipeline checkout first on the module search path so the
# `src.*` imports below resolve against it.
ATAC_PIPELINE_DIR = '/cluster/home/jjanssens/jjans/analysis/adult_intestine/peaks/peak_calling/atac_pipeline'
sys.path.insert(0, ATAC_PIPELINE_DIR)

from src.liftover import liftover_peaks, liftover_two_step, get_chain_file, CHAIN_FILES
from src.cross_species import (
    cross_species_consensus_pipeline,
    merge_bed_files,
    add_peak_ids,
    liftback_peaks,
    create_peak_matrix,
    get_reverse_chain_file,
    REVERSE_CHAIN_FILES,
)

# Confirm the import worked and list the keys of both chain-file registries.
forward_chain_keys = list(CHAIN_FILES.keys())
reverse_chain_keys = list(REVERSE_CHAIN_FILES.keys())

print("‚úÖ atac_pipeline loaded successfully")
print(f"   Available chain files (species ‚Üí hg38): {forward_chain_keys}")
print(f"   Available reverse chains (hg38 ‚Üí species): {reverse_chain_keys}")

## Configuration

In [None]:
# === CONFIGURATION ===

# Root locations on the cluster
BASE_PATH = "/cluster/project/treutlein/USERS/jjans"
CHAIN_DIR = "/cluster/work/treutlein/jjans/data/intestine/nhp_atlas/genomes/chain_files"

# Everything this notebook writes goes beneath OUTPUT_DIR (created if missing)
OUTPUT_DIR = f"{BASE_PATH}/analysis/adult_intestine/peaks/cross_species_consensus"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Species included in the comparison
SPECIES_LIST = ["Bonobo", "Chimpanzee", "Gorilla", "Macaque", "Marmoset", "Human"]

# species -> consensus peak BED. A species whose file is missing is reported
# and left out of the mapping rather than aborting the cell.
SPECIES_BEDS = {}
for species in SPECIES_LIST:
    bed_file = f"{BASE_PATH}/analysis/adult_intestine/peaks/consensus_peak_calling_{species}/Consensus_Peaks_Filtered_500.bed"
    if not os.path.exists(bed_file):
        print(f"‚ùå {species}: NOT FOUND - {bed_file}")
        continue
    SPECIES_BEDS[species] = bed_file
    print(f"‚úÖ {species}: {bed_file}")

print(f"\nüì¶ Found {len(SPECIES_BEDS)} species with consensus peaks")

In [None]:
# Report how many peak records each species' consensus BED contains,
# followed by the grand total.
print("üìä Peak counts per species:")
print("-" * 50)

total_peaks = 0
for species, bed_file in SPECIES_BEDS.items():
    count = 0
    with open(bed_file) as handle:
        for line in handle:
            # Blank lines and '#' comment lines are not peak records
            if not line.strip() or line.startswith('#'):
                continue
            count += 1
    total_peaks += count
    print(f"   {species:<15} {count:>10,} peaks")

print("-" * 50)
print(f"   {'TOTAL':<15} {total_peaks:>10,} peaks")

## Step 1: Load Pre-Lifted Peaks (already lifted to hg38)

In [None]:
# Locate the consensus peaks that were previously lifted to hg38
LIFTED_PEAKS_DIR = f"{BASE_PATH}/analysis/adult_intestine/peaks/lifted_consensus_peaks"

# species -> BED in hg38 coordinates. Human points at its own consensus file,
# the other species at their lifted files.
LIFTED_BEDS = {
    "Bonobo": f"{LIFTED_PEAKS_DIR}/Consensus_Peaks_Filtered_500.hg38_Bonobo.bed",
    "Chimpanzee": f"{LIFTED_PEAKS_DIR}/Consensus_Peaks_Filtered_500.hg38_Chimpanzee.bed",
    "Gorilla": f"{LIFTED_PEAKS_DIR}/Consensus_Peaks_Filtered_500.hg38_Gorilla.bed",
    "Macaque": f"{LIFTED_PEAKS_DIR}/Consensus_Peaks_Filtered_500.hg38_Macaque.bed",
    "Marmoset": f"{LIFTED_PEAKS_DIR}/Consensus_Peaks_Filtered_500.hg38_Marmoset.bed",
    "Human": f"{BASE_PATH}/analysis/adult_intestine/peaks/consensus_peak_calling_Human/Consensus_Peaks_Filtered_500.bed",
}

print("üìÇ Loading pre-lifted peaks (hg38 coordinates):")
print("-" * 60)

# One record per species whose file exists; missing files are only reported.
lift_results = {}
for species, bed_file in LIFTED_BEDS.items():
    if not os.path.exists(bed_file):
        print(f"   ‚ùå {species:<15} NOT FOUND: {bed_file}")
        continue

    count = 0
    with open(bed_file) as handle:
        for line in handle:
            # Count peak records only: skip blank and '#' comment lines
            if line.strip() and not line.startswith('#'):
                count += 1

    # "unmapped" is fixed at 0 here: the liftover itself is not run in this cell
    lift_results[species] = dict(
        status="success",
        lifted=count,
        unmapped=0,
        output_file=bed_file,
    )
    print(f"   ‚úÖ {species:<15} {count:>10,} peaks  ({bed_file})")

print("-" * 60)
print(f"üì¶ Loaded {len(lift_results)} species with lifted peaks")

## Step 2: Merge All Peaks into Unified hg38 Consensus

In [None]:
# Gather, in SPECIES_LIST order, the hg38 BED of every species that loaded
# successfully and whose file is still present on disk.
lifted_beds = []
for species in SPECIES_LIST:
    entry = lift_results.get(species)
    if entry is None or entry["status"] != "success":
        continue
    bed_file = entry["output_file"]
    if not os.path.exists(bed_file):
        continue
    lifted_beds.append(bed_file)
    print(f"‚úÖ Including: {species}")

print(f"\nüì¶ Total files to merge: {len(lifted_beds)}")

In [None]:
# Combine every collected hg38 BED into a single merged consensus file
MERGED_DIR = os.path.join(OUTPUT_DIR, "02_merged_consensus")
os.makedirs(MERGED_DIR, exist_ok=True)
merged_bed = os.path.join(MERGED_DIR, "unified_consensus_hg38_merged.bed")

merge_options = dict(
    input_beds=lifted_beds,
    output_bed=merged_bed,
    merge_distance=0,  # 0 = only merge peaks that overlap
    verbose=True,
)
merge_result = merge_bed_files(**merge_options)

print(f"\n{merge_result['message']}")

## Step 3: Add Peak IDs for Cross-Species Tracking

In [None]:
# Write a copy of the merged consensus with a peak ID added, then preview it
unified_with_ids = os.path.join(MERGED_DIR, "unified_consensus_hg38_with_ids.bed")

id_options = dict(
    input_bed=merged_bed,
    output_bed=unified_with_ids,
    prefix="unified",
    verbose=True,
)
id_result = add_peak_ids(**id_options)

# The file is read as a header-less, tab-separated four-column table
print("\nüìÑ First 10 unified peaks:")
unified_columns = ['Chromosome', 'Start', 'End', 'PeakID']
df_unified = pd.read_csv(unified_with_ids, sep='\t', header=None, names=unified_columns)
print(df_unified.head(10).to_string())

## Step 4: Liftback Unified Peaks to Each Species

In [None]:
# Lift the unified hg38 peaks back into each species' own genome.
# The liftback min_match for a species is its forward (species → hg38) rate
# scaled by LIFTBACK_RATE_SCALE, i.e. slightly more permissive for recovery.

LIFTBACK_DIR = os.path.join(OUTPUT_DIR, "03_lifted_back")
os.makedirs(LIFTBACK_DIR, exist_ok=True)

# Forward rates used during species → hg38 liftover
FORWARD_MATCH_RATES = {
    "Bonobo": 0.9,
    "Chimpanzee": 0.9,
    "Gorilla": 0.9,
    "Macaque": 0.8,
    "Marmoset": 0.6,
    "Human": 1.0,  # Not used
}

# Single source of truth for the scaling factor (also drives the label below)
LIFTBACK_RATE_SCALE = 0.8
LIFTBACK_MATCH_RATES = {
    species: rate * LIFTBACK_RATE_SCALE
    for species, rate in FORWARD_MATCH_RATES.items()
}

print(f"üìä Liftback min_match rates ({LIFTBACK_RATE_SCALE:.0%} of forward rates):")
for species, rate in LIFTBACK_MATCH_RATES.items():
    if species != "Human":
        print(f"   {species}: {rate:.2f}")

# species -> result record with at least status / lifted / unmapped for Human;
# for the other species it is whatever liftback_peaks returns.
liftback_results = {}

for species in SPECIES_LIST:
    output_bed = os.path.join(LIFTBACK_DIR, f"unified_consensus_{species}.bed")

    if species == "Human":
        # Human stays as is (already in hg38): copy instead of lifting
        print(f"\nüß¨ {species}: Copying hg38 consensus (no liftover needed)")
        shutil.copy(unified_with_ids, output_bed)

        # Count peak records the same way as the earlier cells (skip blank and
        # '#' lines) so the number is comparable with the other peak counts.
        with open(output_bed) as f:
            count = sum(1 for line in f if line.strip() and not line.startswith('#'))

        liftback_results[species] = {
            "status": "success",
            "lifted": count,
            "unmapped": 0,
            "output_file": output_bed
        }
        print(f"   ‚úÖ {count:,} peaks")
        continue

    print(f"\nüîô Lifting back to {species} (min_match={LIFTBACK_MATCH_RATES[species]:.2f})...")

    result = liftback_peaks(
        input_bed=unified_with_ids,
        output_bed=output_bed,
        species=species,
        chain_dir=CHAIN_DIR,
        min_match=LIFTBACK_MATCH_RATES[species],
        auto_chr=True,
        verbose=True,
    )

    liftback_results[species] = result
    print(f"   {result['message']}")

In [None]:
# Tabulate, per species, how many unified peaks were lifted back and what
# share of the unified set that represents.
banner = "=" * 70
print("\n" + banner)
print("LIFTBACK TO SPECIES SUMMARY")
print(banner)

total_unified = id_result["peak_count"]
print(f"Unified hg38 peaks: {total_unified:,}\n")

print(f"{'Species':<15} {'Lifted':>10} {'Unmapped':>10} {'Rate':>10}")
print("-" * 50)

for species in SPECIES_LIST:
    if species not in liftback_results:
        continue
    r = liftback_results[species]
    lifted = r.get("lifted", 0)
    # Accept either key name for the unmapped count; default to 0 if neither
    unmapped = r.get("unmapped", r.get("total_unmapped", 0))
    if total_unified > 0:
        rate = lifted / total_unified * 100
    else:
        rate = 0
    print(f"{species:<15} {lifted:>10,} {unmapped:>10,} {rate:>9.1f}%")

## Step 5: Create Peak Presence/Absence Matrix

In [None]:
# Collect the per-species liftback BED files that exist on disk
species_liftback_beds = {}
for species in SPECIES_LIST:
    bed_file = os.path.join(LIFTBACK_DIR, f"unified_consensus_{species}.bed")
    if not os.path.exists(bed_file):
        continue
    species_liftback_beds[species] = bed_file

# Path handed to create_peak_matrix as its output file
matrix_file = os.path.join(OUTPUT_DIR, "peak_presence_matrix.tsv")

matrix_options = dict(
    unified_human_bed=unified_with_ids,
    species_beds=species_liftback_beds,
    output_file=matrix_file,
    verbose=True,
)
matrix_result = create_peak_matrix(**matrix_options)

In [None]:
# Read the tab-separated matrix file back in and preview its first rows
df_matrix = pd.read_csv(matrix_file, delimiter='\t')
print(f"üìä Peak matrix shape: {df_matrix.shape}")
print(df_matrix.head(n=10))

In [None]:
# Conservation statistics: in how many species is each unified peak present?

# Matrix columns that are named after a species in SPECIES_LIST
species_cols = [name for name in df_matrix.columns if name in SPECIES_LIST]

# Row-wise sum over the species columns
df_matrix['n_species'] = df_matrix.loc[:, species_cols].sum(axis=1)

# Number of peaks for each value of n_species, ordered by n_species
conservation_counts = df_matrix['n_species'].value_counts().sort_index()

n_peaks = len(df_matrix)

print("üìä Peak Conservation:")
print("-" * 40)
for n, count in conservation_counts.items():
    pct = count / n_peaks * 100
    print(f"   Present in {n} species: {count:>8,} ({pct:>5.1f}%)")

# Headline numbers
n_in_all = (df_matrix['n_species'] == len(species_cols)).sum()
n_in_one = (df_matrix['n_species'] == 1).sum()

print("\nüìä Summary:")
print(f"   Total unified peaks: {n_peaks:,}")
print(f"   Conserved in ALL {len(species_cols)} species: {n_in_all:,}")
print(f"   Species-specific (1 species only): {n_in_one:,}")

In [None]:
# Visualize conservation in a two-panel figure:
#   left  - number of peaks present in N species
#   right - percentage of the unified peaks present in each species
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: bar plot of conservation levels
ax1 = axes[0]
conservation_counts.plot(kind='bar', ax=ax1, color='steelblue', edgecolor='black')
ax1.set_xlabel('Number of Species')
ax1.set_ylabel('Number of Peaks')
ax1.set_title('Peak Conservation Across Species')
ax1.set_xticklabels([f'{int(x)}' for x in conservation_counts.index], rotation=0)

# Percentage labels above each bar. The gap is given in points rather than
# data units, so it looks the same whether bars hold hundreds or millions
# of peaks.
for i, val in enumerate(conservation_counts.values):
    ax1.annotate(
        f'{val/len(df_matrix)*100:.1f}%',
        xy=(i, val),
        xytext=(0, 3),
        textcoords='offset points',
        ha='center',
        va='bottom',
        fontsize=9,
    )

# Right panel: horizontal bar chart of per-species presence, smallest first
ax2 = axes[1]
species_presence = df_matrix[species_cols].sum() / len(df_matrix) * 100
species_presence = species_presence.sort_values(ascending=True)

# Shade each bar by its percentage (0-100 mapped onto the Blues colormap)
colors = plt.cm.Blues(species_presence / 100)
ax2.barh(species_presence.index, species_presence.values, color=colors, edgecolor='black')
ax2.set_xlabel('% of Unified Peaks Present')
ax2.set_title('Peak Presence by Species')
ax2.set_xlim(0, 100)

# Percentage labels just right of each bar (x axis is in percent here)
for i, (species, val) in enumerate(species_presence.items()):
    ax2.text(val + 1, i, f'{val:.1f}%', va='center', fontsize=9)

plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "conservation_summary.png"), dpi=150, bbox_inches='tight')
plt.show()

## Summary: Output Files

In [None]:
# Final report: where the main outputs live and how many peak records each
# per-species liftback file holds.
banner = "=" * 70
print(banner)
print("CROSS-SPECIES CONSENSUS PIPELINE - OUTPUT FILES")
print(banner)

print(f"\nüìÅ Output directory: {OUTPUT_DIR}")

print("\nüìÑ Main outputs:")
print(f"   Unified hg38 consensus: {unified_with_ids}")
print(f"   Peak presence matrix: {matrix_file}")

print("\nüìÑ Liftback files (peaks in each species' genome):")
for species in SPECIES_LIST:
    bed_file = os.path.join(LIFTBACK_DIR, f"unified_consensus_{species}.bed")
    if not os.path.exists(bed_file):
        continue
    # Count peak records with the same rule as the earlier peak-count cells
    # (blank lines and '#' comment lines are not peaks).
    with open(bed_file) as f:
        count = sum(1 for line in f if line.strip() and not line.startswith('#'))
    print(f"   {species}: {bed_file} ({count:,} peaks)")

print("\n‚úÖ Pipeline complete!")
print(f"   Total unified peaks: {id_result['peak_count']:,}")
print("   Use the peak_id (column 4) to compare accessibility across species")

---

## Step 6: Quantification Over Unified Peaks

Quantify fragment files, bigwigs, or Tn5 insertions over the unified peak set.
- Supports **coverage** (fragment overlap) or **cut-sites** (Tn5 insertion) counting
- Parallel processing by file or by region chunks
- Memory-efficient streaming for very large matrices

In [None]:
# Bring the quantification helpers into scope and print a short cheat sheet
from src.quantification import quantify, quantify_matrix, save_matrix, load_matrix

print(
    "‚úÖ Quantification module loaded",
    "   quantify()        ‚Äì single file over peaks",
    "   quantify_matrix() ‚Äì multiple files ‚Üí peaks √ó samples matrix",
    "   save/load_matrix  ‚Äì feather / parquet / tsv I/O",
    sep="\n",
)

### Configure Input Files

Set up paths and discover input files. All three input types (fragments, bigwig, tn5) use the same `quantify()` / `quantify_matrix()` functions — just change `input_type`.

In [None]:
# === CONFIGURATION: Adjust these paths ===

SPECIES = "Human"  # Or loop over species
QUANT_OUTPUT_DIR = os.path.join(OUTPUT_DIR, "04_quantification")
os.makedirs(QUANT_OUTPUT_DIR, exist_ok=True)

# The peak file must be in the same genome coordinates as the inputs, so it
# is derived from SPECIES instead of being edited by hand:
# - Human: the unified hg38 set (unified_with_ids)
# - any other species: the liftback BED written for it in Step 4
if SPECIES == "Human":
    peak_file_for_quant = unified_with_ids
else:
    peak_file_for_quant = os.path.join(LIFTBACK_DIR, f"unified_consensus_{SPECIES}.bed")

# Discover input files (update the directory and extension as needed)
INPUT_DIR = f"{BASE_PATH}/analysis/adult_intestine/peaks/fragments_{SPECIES}"

input_files = sorted(
    glob.glob(f"{INPUT_DIR}/*.tsv.gz")   # fragments
    # glob.glob(f"{INPUT_DIR}/*.bw")     # bigwigs
    # glob.glob(f"{INPUT_DIR}/*.bed")    # tn5 insertions
)

# Show at most the first five file names, then how many were left out
print(f"Found {len(input_files)} input files in {INPUT_DIR}")
for path in input_files[:5]:
    print(f"   {os.path.basename(path)}")
if len(input_files) > 5:
    print(f"   ... and {len(input_files) - 5} more")

In [None]:
# === SINGLE FILE ===
# Smoke test: quantify only the first discovered input over the peak set.
# Per the original note, quantify() returns a pd.Series.

if not input_files:
    print("‚ö†Ô∏è No input files found. Update INPUT_DIR.")
else:
    first_input = input_files[0]
    result = quantify(
        input_file=first_input,
        peak_file=peak_file_for_quant,
        input_type="fragments",   # one of "fragments", "tn5", "bigwig"
        method="cutsites",        # "coverage" or "cutsites" (fragments only)
        # stat="mean",            # "mean", "sum", "max" or "min" (bigwig only)
        verbose=True,
    )
    print(f"\nüìä Result shape: {result.shape}")
    print(result.head())

### Build Quantification Matrix (multiple files)

In [None]:
# === MULTIPLE FILES → MATRIX ===
# Quantify every discovered input into one peaks × samples matrix.
# The same call serves fragments, tn5 and bigwig inputs; only input_type changes.

if not input_files:
    print("‚ö†Ô∏è No input files found. Update INPUT_DIR.")
else:
    # Output path is passed without a file extension
    matrix_output_base = os.path.join(QUANT_OUTPUT_DIR, f"quantification_{SPECIES}")
    matrix = quantify_matrix(
        input_files=input_files,
        peak_file=peak_file_for_quant,
        input_type="fragments",       # one of "fragments", "tn5", "bigwig"
        method="cutsites",            # "coverage" or "cutsites" (fragments only)
        # stat="mean",                # "mean", "sum", "max" or "min" (bigwig only)
        n_workers=8,
        name_pattern=r"_fragments.*$",  # regex applied to filenames to derive sample names
        name_replacement="",
        # sample_names=["A", "B", ...],  # alternative: explicit sample names
        output_file=matrix_output_base,
        output_format="feather",       # "feather", "parquet" or "tsv"
        # chunk_size=50,               # memory-efficient: process N files at a time
        verbose=True,
    )

In [None]:
# === MEMORY-EFFICIENT MODE ===
# Template only: nothing in this cell runs except the reminder print below.
# For very large numbers of input files, pass chunk_size= to
# quantify_matrix() so the inputs are processed in batches, which is intended
# to avoid holding the full matrix in memory. Uncomment the call and adjust
# chunk_size / output_file to use it.

# quantify_matrix(
#     input_files=input_files,
#     peak_file=peak_file_for_quant,
#     input_type="fragments",
#     method="cutsites",
#     n_workers=8,
#     chunk_size=50,          # process 50 files at a time, write chunks
#     output_file=os.path.join(QUANT_OUTPUT_DIR, f"quantification_large_{SPECIES}"),
#     output_format="parquet",  # parquet recommended for large files
#     verbose=True,
# )

print("üí° Uncomment and adjust chunk_size for large-scale quantification")

### Load and Inspect a Saved Matrix

In [None]:
# Template only: nothing in this cell runs except the reminder print below.
# Once the quantification cell has written its output, uncomment these lines
# to load the saved matrix (per the original note, load_matrix auto-detects
# feather / parquet / tsv from the file extension) and inspect its shape.
# quant_df = load_matrix(os.path.join(QUANT_OUTPUT_DIR, f"quantification_{SPECIES}.feather"))
# print(f"üìä Matrix shape: {quant_df.shape}")
# print(f"   Peaks:   {quant_df.shape[0]:,}")
# print(f"   Samples: {quant_df.shape[1]}")
# print(quant_df.head())

print("üí° Uncomment after running quantification to inspect results")