# Liftover Consensus Peaks to Human (hg38)

Quick notebook to convert existing consensus BED files to human genome coordinates.

---

In [None]:
import os
import sys
from pathlib import Path

# Put the pipeline root (the directory that holds the `src` package) on sys.path.
# Walk upward from the working directory so the notebook works whether the
# kernel was started in the repo root, in notebooks/, or in a nested folder.
_cwd = Path.cwd()
PIPELINE_DIR = next(
    (candidate for candidate in (_cwd, *_cwd.parents) if (candidate / "src").is_dir()),
    # No `src` directory found anywhere above us: fall back to the original
    # heuristic so previously working layouts keep resolving the same way.
    _cwd.parent if "notebooks" in str(_cwd) else _cwd,
)

# Guard the insert so re-running this cell does not stack duplicate entries.
if str(PIPELINE_DIR) not in sys.path:
    sys.path.insert(0, str(PIPELINE_DIR))

from src.liftover import liftover_peaks, print_chain_info, get_chain_file, DEFAULT_CHAIN_DIR

print("✅ Pipeline loaded")

In [None]:
# Show available chain files.
# print_chain_info is imported from src.liftover; what it inspects and how it
# formats its output is defined there (presumably it lists the chain files
# under DEFAULT_CHAIN_DIR -- confirm against src/liftover).
print_chain_info()

## Configuration

Set your input files and species below.

In [None]:
# -----------------------------------------------------------------------------
# User settings: adjust the paths in this cell before running the notebook.
# -----------------------------------------------------------------------------

# Where the liftover chain files live (defaults to the location exported by
# src.liftover as DEFAULT_CHAIN_DIR).
CHAIN_DIR = DEFAULT_CHAIN_DIR

# Species name -> consensus BED file to lift. The species key is what gets
# passed to get_chain_file(), so keep the spelling used by the pipeline.
INPUT_FILES = {
    # "Species": "/path/to/consensus_peaks.bed",
    "Gorilla": "/path/to/Gorilla_consensus_peaks.bed",
    "Chimpanzee": "/path/to/Chimpanzee_consensus_peaks.bed",
    "Bonobo": "/path/to/Bonobo_consensus_peaks.bed",
    "Macaque": "/path/to/Macaque_consensus_peaks.bed",
    # "Marmoset": "/path/to/Marmoset_consensus_peaks.bed",  # Requires 2-step liftover
}

# Destination folder for the lifted (hg38) BED files; created on demand.
OUTPUT_DIR = "/path/to/output/lifted_consensus"
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("Output directory: {}".format(OUTPUT_DIR))

In [None]:
# Validate input files: report, per species, whether the configured BED path
# points at a regular file. A directory at that path is reported as missing,
# because it cannot be used as a BED input.
print("Input files:")
print("=" * 60)
for species, filepath in INPUT_FILES.items():
    is_usable = os.path.isfile(filepath)
    status = "✅" if is_usable else "❌ NOT FOUND"
    print(f"{status} {species}: {filepath}")

## Run Liftover

In [None]:
# Lift every configured species to hg38 and collect one record per species.
# `results` is rebuilt from scratch on each run, so re-executing this cell
# never double-counts; the summary cells read it afterwards.
results = []

# Threshold handed to liftover_peaks(min_match=...). Presumably forwarded to
# UCSC liftOver's -minMatch (minimum ratio of bases that must remap) --
# confirm against src/liftover.
MIN_MATCH = 0.95

BED_SUFFIX = ".bed"

for species, input_file in INPUT_FILES.items():
    # Only regular files are usable inputs; a directory at this path is skipped too.
    if not os.path.isfile(input_file):
        print(f"⏭️ Skipping {species} - file not found")
        continue

    # Get chain file
    chain_file = get_chain_file(species, CHAIN_DIR)

    # Output file: strip only a trailing ".bed" and always append ".hg38.bed".
    # A blanket str.replace would rewrite ".bed" anywhere in the name and, for
    # names without ".bed", would leave the output name equal to the input name.
    basename = os.path.basename(input_file)
    stem = basename[:-len(BED_SUFFIX)] if basename.endswith(BED_SUFFIX) else basename
    output_file = os.path.join(OUTPUT_DIR, f"{stem}.hg38.bed")

    print(f"\n{'='*60}")
    print(f"Species: {species}")
    print(f"Input:   {input_file}")
    print(f"Output:  {output_file}")

    # Run liftover
    result = liftover_peaks(
        input_bed=input_file,
        output_bed=output_file,
        chain_file=chain_file,
        min_match=MIN_MATCH,
    )

    # Keep the species/paths next to whatever statistics liftover_peaks returned.
    results.append({
        'species': species,
        'input': input_file,
        'output': output_file,
        **result
    })

    print(f"\n✅ Lifted: {result['lifted']:,} peaks")
    print(f"❌ Unmapped: {result['unmapped']:,} peaks")

    # Success rate, guarded against an empty run (0 lifted + 0 unmapped).
    total_peaks = result['lifted'] + result['unmapped']
    if total_peaks > 0:
        success_rate = result['lifted'] / total_peaks * 100
        print(f"📊 Success rate: {success_rate:.1f}%")
    else:
        # Nothing was processed: surface whatever diagnostics the result carries.
        print("⚠️  No peaks processed - check input file and error message:")
        print(f"    {result.get('message', 'Unknown error')}")
        if 'command' in result:
            print(f"    Command: {result['command']}")


## Summary

In [None]:
# Fixed-width text report: one row per lifted species, then a grand total.
heavy_rule = "=" * 70
light_rule = "-" * 70

print("\n" + heavy_rule)
print("LIFTOVER SUMMARY")
print(heavy_rule)
print(f"{'Species':<15} {'Lifted':>12} {'Unmapped':>12} {'Success':>10}")
print(light_rule)

# Running totals across all species.
total_lifted = total_unmapped = 0

for entry in results:
    n_ok, n_failed = entry['lifted'], entry['unmapped']
    n_seen = n_ok + n_failed

    # A species with no processed peaks is reported as 0.0% rather than dividing by zero.
    if n_seen > 0:
        share = n_ok / n_seen * 100
    else:
        share = 0.0

    print(f"{entry['species']:<15} {n_ok:>12,} {n_failed:>12,} {share:>9.1f}%")

    total_lifted += n_ok
    total_unmapped += n_failed

print(light_rule)
total_all = total_lifted + total_unmapped
if total_all > 0:
    total_pct = total_lifted / total_all * 100
else:
    total_pct = 0.0
print(f"{'TOTAL':<15} {total_lifted:>12,} {total_unmapped:>12,} {total_pct:>9.1f}%")
print(heavy_rule)


In [None]:
# List output files: show each expected hg38 BED with its size, and call out
# any that are missing so a failed liftover is not silently dropped from the list.
print("\n📁 Output files created:")
if not results:
    print("  (none - no species were processed)")
for r in results:
    out_path = r['output']
    if os.path.exists(out_path):
        size_mb = os.path.getsize(out_path) / 1024 / 1024
        print(f"  ✅ {out_path} ({size_mb:.1f} MB)")
    else:
        print(f"  ❌ {out_path} (missing - liftover did not produce this file)")

---
## Marmoset: Two-Step Liftover

Marmoset peaks need two liftover steps: first from calJac1 to calJac4, then from calJac4 to hg38.

In [None]:
# Marmoset two-step liftover (calJac1 -> calJac4 -> hg38).
# Disabled by default: set RUN_MARMOSET = True and point MARMOSET_INPUT at the
# marmoset consensus BED file to run it.
RUN_MARMOSET = False

MARMOSET_INPUT = "/path/to/Marmoset_consensus_peaks.bed"
MARMOSET_OUTPUT = os.path.join(OUTPUT_DIR, "Marmoset_consensus_peaks.hg38.bed")

if not RUN_MARMOSET:
    print("💡 Set RUN_MARMOSET = True in this cell to run Marmoset two-step liftover")
elif not os.path.exists(MARMOSET_INPUT):
    print("Marmoset input file not found")
else:
    # Local import: tempfile is only needed when this optional step runs.
    import tempfile

    # Chain files
    chain_step1 = os.path.join(CHAIN_DIR, "calJac1ToCalJac4.over.chain")
    chain_step2 = os.path.join(CHAIN_DIR, "calJac4ToHg38.over.chain")

    print("Marmoset: Two-step liftover")
    print("  Step 1: calJac1 → calJac4")
    print("  Step 2: calJac4 → hg38")

    # Reserve a path for the intermediate calJac4 BED. delete=False keeps the
    # file on disk after the handle closes so liftover_peaks can write to it.
    with tempfile.NamedTemporaryFile(suffix='.bed', delete=False) as tmp:
        intermediate_file = tmp.name

    try:
        # Step 1: calJac1 → calJac4
        # NOTE(review): min_match is left at liftover_peaks' default here, as in
        # the original snippet; the main loop passes 0.95 explicitly -- confirm
        # whether the two should agree.
        result1 = liftover_peaks(
            input_bed=MARMOSET_INPUT,
            output_bed=intermediate_file,
            chain_file=chain_step1,
        )
        print(f"  Step 1: {result1['lifted']:,} lifted, {result1['unmapped']:,} unmapped")

        # Step 2: calJac4 → hg38
        result2 = liftover_peaks(
            input_bed=intermediate_file,
            output_bed=MARMOSET_OUTPUT,
            chain_file=chain_step2,
        )
        print(f"  Step 2: {result2['lifted']:,} lifted, {result2['unmapped']:,} unmapped")
    finally:
        # Cleanup runs even if a liftover step raises, so the temp file never leaks.
        if os.path.exists(intermediate_file):
            os.unlink(intermediate_file)

    print(f"\n✅ Final output: {MARMOSET_OUTPUT}")