# Cross-Species Consensus Peak Pipeline (v2)

**Goal:** Build a unified cross-species ATAC-seq consensus peak set across 6 primate species,
with full provenance tracking and gene annotation.

**Pipeline overview:**
1. Lift all non-human species peaks to hg38
2. Merge lifted peaks + human peaks into a unified consensus, tracking which species contributed each peak
3. Identify **human-specific peaks** (human peaks not overlapping any non-human lifted peak)
4. Lift unified consensus back to each species genome
5. Identify **species-specific peaks** (original species peaks not covered by any liftback peak)
6. Annotate all peaks with closest gene (from species-appropriate GTF) and distance

**Outputs:**
- `unified_peak_NNNNNN` -- consensus peaks in hg38 with species-detection annotation
- `human_peak_NNNNNN` -- human-specific peaks (hg38): human peaks not overlapping any lifted non-human peak
- `{species}_peak_NNNNNN` -- species-specific peaks in native coordinates: original species peaks not covered by any liftback peak
- `peak_annotation.tsv` -- master annotation: peak_id, type, species detected, closest gene, distance

## 1. Load Packages and Define Configuration

In [None]:
# Standard library and third-party packages used throughout the notebook.
import sys
import os
import pandas as pd
import numpy as np
from pathlib import Path

# Put the parent of the current working directory at the front of sys.path so
# that the `src` package imported below can be resolved.
# NOTE(review): presumably the notebook is launched from a subdirectory of the
# repository root -- confirm, since the path is relative to the working directory.
sys.path.insert(0, os.path.abspath(".."))

# Project-local pipeline entry point, individual pipeline steps and defaults.
from src.cross_species import (
    cross_species_consensus_pipeline,
    merge_with_species_tracking,
    find_human_specific_peaks,
    find_species_specific_peaks,
    create_peak_annotation,
    extract_gene_bed_from_gtf,
    annotate_with_closest_gene,
    add_peak_ids,
    DEFAULT_GTF_FILES,
    REVERSE_CHAIN_FILES,
)
from src.liftover import DEFAULT_CHAIN_DIR

# Reaching this line means every import above resolved.
print("All imports loaded successfully")

In [None]:
# =============================================================================
# Configuration -- edit paths here
# =============================================================================

# Root folder; it holds one `consensus_peak_calling_<Species>` directory per species.
PEAKS_BASE = "/cluster/project/treutlein/USERS/jjans/analysis/adult_intestine/peaks"

# Every species uses the same file layout below PEAKS_BASE.
_CONSENSUS_BED_TEMPLATE = "{base}/consensus_peak_calling_{species}/Consensus_Peaks_Filtered_500.bed"

# Human consensus peaks (kept apart from the non-human species)
HUMAN_BED = _CONSENSUS_BED_TEMPLATE.format(base=PEAKS_BASE, species="Human")

# Non-human species consensus peaks, keyed by species name
SPECIES_BEDS = {
    name: _CONSENSUS_BED_TEMPLATE.format(base=PEAKS_BASE, species=name)
    for name in ("Bonobo", "Chimpanzee", "Gorilla", "Macaque", "Marmoset")
}

# Where all pipeline results are written
OUTPUT_DIR = PEAKS_BASE + "/cross_species_consensus_v2"

# Location of the liftOver executable
LIFTOVER_PATH = "/cluster/project/treutlein/jjans/software/miniforge3/envs/genomes/bin/liftOver"

# Directory containing the chain files
CHAIN_DIR = DEFAULT_CHAIN_DIR

# GTF files for gene annotation; a copy, so edits made here leave the defaults untouched
GTF_FILES = DEFAULT_GTF_FILES.copy()

# Pipeline parameters
MIN_MATCH = 0.95    # minimum match ratio passed to liftOver
MERGE_DISTANCE = 0  # max distance between peaks to merge (0 = must overlap)
NCPU = 1            # parallel workers for liftover

# Echo the settings so they are recorded in the notebook output.
for _label, _value in (
    ("Output directory:", OUTPUT_DIR),
    ("Chain file dir:  ", CHAIN_DIR),
    ("liftOver binary: ", LIFTOVER_PATH),
    ("Species:         ", list(SPECIES_BEDS)),
):
    print(f"{_label} {_value}")

## 2. Validate Input Files

Check that all input peak files, chain files, and GTF files exist before running the pipeline.

In [None]:
# =============================================================================
# Validate all input files exist
# =============================================================================
from src.liftover import CHAIN_FILES, get_chain_file


def _count_bed_records(path):
    """Return the number of non-blank lines in *path* that do not start with '#'."""
    # The context manager closes the handle; a bare open() inside a generator
    # expression would leave it open until garbage collection.
    with open(path) as handle:
        return sum(1 for line in handle if line.strip() and not line.startswith('#'))


def _check_bed(label, path, missing_text):
    """Report a peak BED file and return True when it exists.

    Prints the record count when *path* exists, otherwise a MISSING line
    that shows *missing_text*.
    """
    if not os.path.exists(path):
        print(f"  MISSING  {missing_text}")
        return False
    print(f"  OK  {label}: {_count_bed_records(path):,} peaks")
    return True


def _check_path(path, display):
    """Print an OK/MISSING status line for *path* and return True when it exists."""
    found = os.path.exists(path)
    print(f"  {'OK' if found else 'MISSING'}  {display}")
    return found


# One boolean per checked input; all of them must be True for validation to pass.
checks = []

# Check human BED
print("--- Human peaks ---")
checks.append(_check_bed("Human", HUMAN_BED, HUMAN_BED))

# Check non-human BED files
print("\n--- Non-human species peaks ---")
for species, bed in SPECIES_BEDS.items():
    checks.append(_check_bed(species, bed, f"{species}: {bed}"))

# Check chain files (forward: species -> hg38)
print("\n--- Forward chain files (species -> hg38) ---")
for species in SPECIES_BEDS:
    # Marmoset is registered under two chain-file keys (step1 and step2);
    # every other species has a single entry under its own name.
    if species == "Marmoset":
        chain_keys = ["Marmoset_step1", "Marmoset_step2"]
    else:
        chain_keys = [species]
    for chain_key in chain_keys:
        chain_name = CHAIN_FILES[chain_key]
        checks.append(
            _check_path(os.path.join(CHAIN_DIR, chain_name), f"{chain_key}: {chain_name}")
        )

# Check reverse chain files (hg38 -> species)
print("\n--- Reverse chain files (hg38 -> species) ---")
for key, chain in REVERSE_CHAIN_FILES.items():
    checks.append(_check_path(os.path.join(CHAIN_DIR, chain), f"{key}: {chain}"))

# Check GTF files
print("\n--- GTF files for gene annotation ---")
for species, gtf in GTF_FILES.items():
    checks.append(_check_path(gtf, f"{species}: {os.path.basename(gtf)}"))

# Check liftOver binary
print("\n--- liftOver binary ---")
checks.append(_check_path(LIFTOVER_PATH, LIFTOVER_PATH))

# Missing inputs are reported, not raised, so the full list is always shown.
all_ok = all(checks)

print(f"\n{'All inputs validated successfully' if all_ok else 'WARNING: Some inputs are missing!'}")

## 3. Run Cross-Species Consensus Pipeline

This executes all 6 steps:
1. **Lift to hg38** -- liftOver each non-human species to human genome
2. **Merge with tracking** -- merge all species (incl. human) with bedtools, recording which species contributed
3. **Human-specific** -- human peaks not overlapping any lifted non-human peak
4. **Lift back** -- lift unified consensus back to each species' genome
5. **Species-specific** -- original species peaks not covered by any liftback peak
6. **Annotate** -- closest gene and distance for every peak

In [None]:
# =============================================================================
# Run the full pipeline
# =============================================================================

# Gather every setting from the configuration cell in one place, then hand
# them to the pipeline entry point as keyword arguments.
_pipeline_kwargs = {
    "species_beds": SPECIES_BEDS,
    "human_bed": HUMAN_BED,
    "output_dir": OUTPUT_DIR,
    "chain_dir": CHAIN_DIR,
    "liftover_path": LIFTOVER_PATH,
    "min_match": MIN_MATCH,
    "merge_distance": MERGE_DISTANCE,
    "peak_prefix": "unified",
    "gtf_files": GTF_FILES,
    "verbose": True,
    "ncpu": NCPU,
}
results = cross_species_consensus_pipeline(**_pipeline_kwargs)

# The returned mapping carries an overall status and a message.
print("\nPipeline status:", results["status"])
print("Message:", results["message"])

## 4. Inspect Output Files

Check what was produced in each output subdirectory.

In [None]:
# =============================================================================
# List all output files with sizes
# =============================================================================
import subprocess


def _format_size(n_bytes):
    """Return *n_bytes* as a short human-readable string (B, KB or MB)."""
    if n_bytes > 1e6:
        return f"{n_bytes / 1e6:.1f} MB"
    if n_bytes >= 1e3:
        return f"{n_bytes / 1e3:.0f} KB"
    # Small files would otherwise be rounded to a misleading "0 KB".
    return f"{n_bytes} B"


def _count_lines(path):
    """Return the number of newline-delimited lines in the file at *path*.

    The file is read in binary mode so that non-text outputs in the tree
    (for example a saved PNG plot) cannot raise a UnicodeDecodeError.
    """
    with open(path, "rb") as handle:
        return sum(1 for _ in handle)


print("Output directory structure:")
print("=" * 70)

for dirpath, dirnames, filenames in os.walk(OUTPUT_DIR):
    # Sorting in place makes os.walk descend into subdirectories in
    # alphabetical order, so each directory is followed by its own children.
    dirnames.sort()

    # Depth below OUTPUT_DIR: 0 for the root itself, 1 for its children, ...
    rel = os.path.relpath(dirpath, OUTPUT_DIR)
    level = 0 if rel == os.curdir else rel.count(os.sep) + 1

    indent = "  " * level
    # normpath drops a trailing separator that would make basename() empty.
    print(f"{indent}{os.path.basename(os.path.normpath(dirpath))}/")

    subindent = "  " * (level + 1)
    for f in sorted(filenames):
        fpath = os.path.join(dirpath, f)
        size_str = _format_size(os.path.getsize(fpath))
        n_lines = _count_lines(fpath)
        print(f"{subindent}{f:<50s} {size_str:>10s}  ({n_lines:,} lines)")

## 5. Explore the Unified Consensus Peaks

The unified BED has columns: `chr, start, end, peak_id, species_detected`. The species_detected column is a comma-separated list.

In [None]:
# =============================================================================
# Load and explore unified consensus peaks
# =============================================================================

# Path of the unified consensus BED as reported by the pipeline
unified_bed = results["output_files"]["unified_consensus"]
unified_columns = ["chr", "start", "end", "peak_id", "species_detected"]
unified_df = pd.read_csv(unified_bed, sep="\t", header=None, names=unified_columns)

print(f"Unified consensus peaks: {len(unified_df):,}")
print("\nFirst 10 peaks:")
print(unified_df.head(10).to_string(index=False))

# One boolean column per species: True when the species name occurs in the
# peak's species_detected string (missing values count as not detected).
all_species = ["Bonobo", "Chimpanzee", "Gorilla", "Human", "Macaque", "Marmoset"]
flag_columns = [f"detected_in_{sp}" for sp in all_species]
for sp, flag in zip(all_species, flag_columns):
    unified_df[flag] = unified_df["species_detected"].str.contains(sp, na=False)

# Number of species in which each peak was detected
unified_df["n_species"] = unified_df[flag_columns].sum(axis=1)

print("\nSpecies detection distribution:")
print(unified_df["n_species"].value_counts().sort_index().to_string())

# Share of unified peaks detected in each species
print("\nPer-species detection:")
n_unified_peaks = len(unified_df)
for sp, flag in zip(all_species, flag_columns):
    hits = unified_df[flag].sum()
    share = hits / n_unified_peaks * 100
    print(f"  {sp:<15s}: {hits:>10,} peaks ({share:5.1f}%)")

## 6. Explore Human-Specific and Species-Specific Peaks

In [None]:
# Column layout shared by the human-specific and species-specific BED files
specific_columns = ["chr", "start", "end", "peak_id"]
rule = "=" * 70

# =============================================================================
# Human-specific peaks
# =============================================================================
hs_bed = results["output_files"]["human_specific"]
hs_df = pd.read_csv(hs_bed, sep="\t", header=None, names=specific_columns)

print(f"Human-specific peaks: {len(hs_df):,}")
print("First 5:")
print(hs_df.head().to_string(index=False))

# =============================================================================
# Species-specific peaks
# =============================================================================
print("\n" + rule)
print("Species-specific peaks:")
print(rule)

# species name -> number of species-specific peaks (only species with a file on disk)
species_specific_counts = {}
for species in SPECIES_BEDS:
    key = f"species_specific_{species}"
    if key not in results["output_files"]:
        continue
    sp_bed = results["output_files"][key]
    if not os.path.exists(sp_bed):
        continue
    sp_df = pd.read_csv(sp_bed, sep="\t", header=None, names=specific_columns)
    species_specific_counts[species] = len(sp_df)
    print(f"\n  {species}: {len(sp_df):,} specific peaks")
    print(f"  Example IDs: {', '.join(sp_df['peak_id'].head(3))}")

# Summary of all peak categories
print("\n" + rule)
print("Summary:")
print(f"  Unified consensus:   {len(unified_df):>10,}")
print(f"  Human-specific:      {len(hs_df):>10,}")
for sp, n in species_specific_counts.items():
    print(f"  {sp}-specific: {n:>10,}")
total = len(unified_df) + len(hs_df) + sum(species_specific_counts.values())
print(f"  {'TOTAL':>22s}: {total:>10,}")

## 7. Explore Peak Annotation File

The master annotation file contains: `peak_id, chr, start, end, peak_type, species_detected, closest_gene, distance_to_gene`.

In [None]:
# =============================================================================
# Load and explore the master peak annotation
# =============================================================================
annotation_file = results["output_files"]["annotation"]
annot_df = pd.read_csv(annotation_file, sep="\t")

print(f"Total annotated peaks: {len(annot_df):,}")
print(f"\nColumns: {list(annot_df.columns)}")
print("\nPeak types:")
print(annot_df["peak_type"].value_counts().to_string())

# Only non-negative distances enter the summary statistics.
print("\nDistance to nearest gene (summary):")
valid_dist = annot_df.loc[annot_df["distance_to_gene"] >= 0, "distance_to_gene"]
print(f"  Median: {valid_dist.median():,.0f} bp")
print(f"  Mean:   {valid_dist.mean():,.0f} bp")
print(f"  At TSS (0 bp): {(valid_dist == 0).sum():,}")
# Cumulative counts below each distance threshold (label, limit in bp)
for label, limit in (
    ("  < 1 kb:  ", 1000),
    ("  < 10 kb: ", 10000),
    ("  < 100 kb: ", 100000),
):
    print(f"{label}{(valid_dist < limit).sum():,}")

# Show the first three rows of every peak type.
sample_columns = ["peak_id", "chr", "start", "end",
                  "species_detected", "closest_gene", "distance_to_gene"]
print("\nSample rows from each peak type:")
for pt in annot_df["peak_type"].unique():
    subset = annot_df.loc[annot_df["peak_type"] == pt].head(3)
    print(f"\n  {pt}:")
    print(subset[sample_columns].to_string(index=False))

## 8. Build Combined BED File

Write a single BED file with all peak categories: unified (hg38), human-specific (hg38), and species-specific (native coords). Also write per-species BED files with liftback unified + species-specific peaks.

In [None]:
# =============================================================================
# Build combined BED file (all peak categories)
# =============================================================================


def _iter_bed4(path):
    """Yield the first four tab-separated fields of each record in *path*.

    Blank lines and lines starting with '#' are skipped.

    Raises:
        ValueError: if a record has fewer than four columns; the message
            names the file and the line number.
    """
    with open(path) as fin:
        for lineno, line in enumerate(fin, start=1):
            if not line.strip() or line.startswith('#'):
                continue
            fields = line.strip().split('\t')
            if len(fields) < 4:
                raise ValueError(
                    f"{path}:{lineno}: expected at least 4 tab-separated "
                    f"columns, found {len(fields)}"
                )
            yield fields[:4]


def _append_records(fout, path, extra_fields=()):
    """Copy the records of *path* to *fout* and return how many were written.

    Each output line holds chr, start, end and peak_id followed by
    *extra_fields*, all tab-separated.
    """
    n_written = 0
    for fields in _iter_bed4(path):
        fout.write("\t".join([*fields, *extra_fields]) + "\n")
        n_written += 1
    return n_written


def _existing_output(key):
    """Return the pipeline output path registered under *key*, or None.

    None is returned when the key is absent from results["output_files"]
    or the registered file does not exist on disk.
    """
    outputs = results["output_files"]
    if key in outputs and os.path.exists(outputs[key]):
        return outputs[key]
    return None


combined_dir = os.path.join(OUTPUT_DIR, "07_combined")
os.makedirs(combined_dir, exist_ok=True)

combined_bed = os.path.join(combined_dir, "all_peaks_combined.bed")

# Genome assembly label written next to each species-specific peak.
# NOTE(review): these names are hard-coded here; confirm they match the
# assemblies the peaks were called on and the chain files were built for.
assembly_map = {
    "Bonobo": "panPan2", "Chimpanzee": "panTro5", "Gorilla": "gorGor4",
    "Macaque": "rheMac10", "Marmoset": "calJac1",
}

with open(combined_bed, 'w') as fout:
    # Header as a comment
    fout.write("#chr\tstart\tend\tpeak_id\tcategory\tgenome_assembly\n")

    # 1. Unified peaks (hg38)
    n_unified = _append_records(
        fout, results["output_files"]["unified_consensus"], ("unified", "hg38")
    )

    # 2. Human-specific peaks (hg38)
    n_human_spec = _append_records(
        fout, results["output_files"]["human_specific"], ("human_specific", "hg38")
    )

    # 3. Species-specific peaks (native coordinates)
    n_sp_spec = 0
    for species in SPECIES_BEDS:
        sp_file = _existing_output(f"species_specific_{species}")
        if sp_file is None:
            continue
        # A species without an entry in assembly_map is labelled by its name.
        assembly = assembly_map.get(species, species)
        n_sp_spec += _append_records(
            fout, sp_file, (f"{species.lower()}_specific", assembly)
        )

print(f"Combined BED written: {combined_bed}")
print(f"  Unified:          {n_unified:,}")
print(f"  Human-specific:   {n_human_spec:,}")
print(f"  Species-specific: {n_sp_spec:,}")
print(f"  Total:            {n_unified + n_human_spec + n_sp_spec:,}")

# =============================================================================
# Per-species BED files (liftback unified + species-specific, in native coords)
# =============================================================================
print("\nPer-species complete peak sets:")

for species in SPECIES_BEDS:
    sp_combined = os.path.join(combined_dir, f"all_peaks_{species}.bed")
    n = 0

    with open(sp_combined, 'w') as fout:
        # Liftback unified peaks first, then the species-specific peaks.
        # Only the first four columns are kept; liftback files may carry more.
        for source_key in (f"liftback_{species}", f"species_specific_{species}"):
            source_file = _existing_output(source_key)
            if source_file is not None:
                n += _append_records(fout, source_file)

    print(f"  {species}: {n:,} peaks -> {sp_combined}")

## 9. Summary Statistics and Validation

In [None]:
# =============================================================================
# Summary statistics and validation
# =============================================================================
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

banner = "=" * 70
print(banner)
print("VALIDATION")
print(banner)

# 1. Check no duplicate peak IDs
all_ids = annot_df["peak_id"].tolist()
n_total = len(all_ids)
n_unique = len(set(all_ids))
print(f"\nPeak ID uniqueness: {n_unique:,} unique / {n_total:,} total", end="")
if n_unique != n_total:
    is_duplicated = annot_df["peak_id"].duplicated(keep=False)
    dup_ids = annot_df.loc[is_duplicated, "peak_id"].unique()
    print(f" -- WARNING: {n_total - n_unique} duplicates!")
    print(f"  Duplicate IDs: {list(dup_ids[:10])}")
else:
    print(" -- OK")

# 2. Liftover success rates -- the same table is printed for both directions
rate_tables = (
    ("\nLiftover success rates (species -> hg38):", "lift_to_human"),
    ("\nLiftback success rates (hg38 -> species):", "lift_back"),
)
for heading, result_key in rate_tables:
    print(heading)
    print(f"  {'Species':<15s} {'Input':>10s} {'Lifted':>10s} {'Rate':>8s}")
    print(f"  {'-'*48}")
    for species, res in results[result_key].items():
        # The input count is read from "total", falling back to "input", then 0.
        inp = res.get("total", res.get("input", 0))
        lifted = res.get("lifted", 0)
        rate = lifted / inp * 100 if inp > 0 else 0
        print(f"  {species:<15s} {inp:>10,} {lifted:>10,} {rate:>7.1f}%")

# 3. Species detection distribution plot
print("\n" + banner)
print("SPECIES DETECTION IN UNIFIED PEAKS")
print(banner)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_detection, ax_category = axes

# Left panel: number of unified peaks per count of detecting species
n_species_counts = unified_df["n_species"].value_counts().sort_index()
ax_detection.bar(n_species_counts.index, n_species_counts.values,
                 color="steelblue", edgecolor="white")
ax_detection.set_xlabel("Number of species detected")
ax_detection.set_ylabel("Number of peaks")
ax_detection.set_title("Distribution of species detection\n(unified consensus)")
for x, y in n_species_counts.items():
    ax_detection.text(x, y, f"{y:,}", ha="center", va="bottom", fontsize=8)

# Right panel: number of peaks in every category
categories = {
    "Unified": len(unified_df),
    "Human-specific": len(hs_df),
    **{f"{sp}-specific": n for sp, n in species_specific_counts.items()},
}

cats = list(categories)
vals = list(categories.values())
colors = ["steelblue", "firebrick"] + ["seagreen"] * len(species_specific_counts)
ax_category.barh(cats, vals, color=colors, edgecolor="white")
ax_category.set_xlabel("Number of peaks")
ax_category.set_title("Peak counts by category")
for y_pos, v in enumerate(vals):
    ax_category.text(v, y_pos, f"  {v:,}", ha="left", va="center", fontsize=8)

plt.tight_layout()
plot_file = os.path.join(OUTPUT_DIR, "peak_summary.png")
plt.savefig(plot_file, dpi=150, bbox_inches="tight")
print(f"\nSaved summary plot: {plot_file}")
plt.show()

# 4. Save summary stats
summary_file = os.path.join(OUTPUT_DIR, "pipeline_summary.txt")
summary_lines = [
    "Cross-Species Consensus Pipeline v2 -- Summary\n",
    "=" * 60 + "\n\n",
    f"Unified consensus peaks: {len(unified_df):,}\n",
    f"Human-specific peaks:    {len(hs_df):,}\n",
]
summary_lines += [f"{sp}-specific peaks:  {n:,}\n" for sp, n in species_specific_counts.items()]
summary_lines.append("\nSpecies detection distribution (unified):\n")
summary_lines += [f"  {n_sp} species: {count:,} peaks\n" for n_sp, count in n_species_counts.items()]
summary_lines.append(f"\nAnnotation file: {annotation_file}\n")
summary_lines.append(f"Combined BED:    {combined_bed}\n")

with open(summary_file, 'w') as f:
    f.writelines(summary_lines)

print(f"Saved summary: {summary_file}")