# Flexible ATAC-seq Peak Calling Workflow

**🎯 Use this notebook for custom analyses with your own files.**

This notebook provides full flexibility to:
- Use your own fragment files from any location
- Specify custom chain files for liftover
- Set individual parameters for each step
- Run only the steps you need

For a more streamlined workflow using `config.yaml`, use `01_peak_calling_workflow.ipynb`.

---

## Setup

In [None]:
import os
import sys
from pathlib import Path
from datetime import datetime

# Add the pipeline root (the directory that holds `src/`) to sys.path so the
# `src.*` imports below resolve.
def find_pipeline_dir(cwd=None):
    """Return the pipeline root for a working directory.

    If the working directory itself is named ``notebooks`` the root is its
    parent; otherwise the working directory is taken to be the root. Only the
    last path component is inspected, so an unrelated ancestor whose name
    merely contains "notebooks" (e.g. ``/home/me/notebooks/pipeline``) no
    longer causes the parent to be picked by mistake.

    Args:
        cwd: Directory to resolve from (str or Path). Defaults to the current
            working directory.

    Returns:
        pathlib.Path of the pipeline root.
    """
    cwd = Path(os.getcwd()) if cwd is None else Path(cwd)
    return cwd.parent if cwd.name == 'notebooks' else cwd


PIPELINE_DIR = find_pipeline_dir()
sys.path.insert(0, str(PIPELINE_DIR))

# Import all pipeline modules
from src.peak_calling import (
    convert_fragments_to_cutsites,
    process_all_fragments,
    run_peak_calling,
    EFFECTIVE_GENOME_SIZES,
    DEFAULT_MACS3_PARAMS,
)
from src.consensus import (
    get_consensus_peaks,
    load_narrowpeaks,
    harmonize_chromosomes,
)
from src.liftover import (
    liftover_peaks,
    print_chain_info,
    get_chain_file,
    CHAIN_FILES,
    DEFAULT_CHAIN_DIR,
)
from src.bigwig import (
    create_bigwig,
    fragments_to_bigwig,
    process_all_fragments_to_bigwig,
)
from src.utils import get_chromsizes, save_parameters, ensure_dir
from src.visualization import plot_peak_distribution, plot_consensus_summary, plot_genome_regions

# Confirm which pipeline checkout was picked up and which interpreter is running.
print(
    f"‚úÖ Pipeline loaded from: {PIPELINE_DIR}",
    f"‚úÖ Python: {sys.version.split()[0]}",
    sep="\n",
)

## 📂 Available Chain Files

Chain files are required for liftover between genome assemblies.

In [None]:
# Print all available chain files (print_chain_info is imported from
# src.liftover in the setup cell).
print_chain_info()

In [None]:
# List the chain files that are actually present in the chain directory.
import subprocess  # not used in this cell; kept in case other cells rely on it
print("\nüìÅ Files in chain directory:")
print(f"   {DEFAULT_CHAIN_DIR}")
print()

# isdir rather than exists: a regular file at this path would pass an
# existence check and then make os.listdir raise.
if os.path.isdir(DEFAULT_CHAIN_DIR):
    chain_files = sorted(
        f for f in os.listdir(DEFAULT_CHAIN_DIR)
        if f.endswith(('.chain', '.chain.gz'))
    )
    for f in chain_files:
        print(f"   - {f}")
    if not chain_files:
        # Say so explicitly instead of printing nothing at all.
        print("   (no .chain or .chain.gz files found)")
else:
    print("   ‚ö†Ô∏è Directory not accessible from this machine")

---
## 🔧 Configuration: Set Your Paths

**Edit the cells below to specify your input/output locations.**

In [None]:
# =============================================================================
# ⚡ USER CONFIGURATION - EDIT THESE VALUES
# =============================================================================
# Every value below is a placeholder or default. The "Check Your Configuration"
# cell at the end of the notebook reports which paths are still placeholders
# or do not exist.

# --- Input Files ---
# Path to your fragment file(s) - can be a single file or directory.
# FRAGMENT_FILE feeds the single-file example calls, FRAGMENT_DIR the
# directory-wide ones; set whichever you intend to use.
FRAGMENT_FILE = "/path/to/your/fragments.tsv.gz"  # Single file
# OR
FRAGMENT_DIR = "/path/to/your/fragment_directory/"  # Directory with multiple files

# --- Species & Genome ---
# Also used below to auto-detect the chain file and to load chromosome sizes.
SPECIES = "Gorilla"  # Options: Human, Gorilla, Chimpanzee, Bonobo, Macaque, Marmoset

# --- Reference Files ---
# Chromosome sizes file
CHROMSIZES_FILE = "/path/to/your/genome.chrom.sizes"

# Chain file for liftover. Leave empty to have the next cell look it up with
# get_chain_file(SPECIES, CHAIN_DIR); for Human no lookup is attempted.
CHAIN_FILE = ""  # Will be auto-detected if empty

# Chain file directory (default: Treutlein lab shared location)
CHAIN_DIR = DEFAULT_CHAIN_DIR

# --- Output Directory ---
# The per-step sub-directories are derived from this in a later cell.
OUTPUT_DIR = str(PIPELINE_DIR / "output" / "custom_analysis")

# --- MACS3 ---
# Passed to run_peak_calling as macs3_path by call_peaks().
MACS3_PATH = "macs3"  # or full path like: "/path/to/macs3"

print("Configuration set. Review the values above and modify as needed.")

In [None]:
# Decide which chain file to use:
#   - Human: nothing to do, liftover is not needed
#   - explicit CHAIN_FILE: keep it
#   - otherwise: look it up for SPECIES in CHAIN_DIR
if SPECIES == "Human":
    print("‚ÑπÔ∏è Human samples don't need liftover")
elif CHAIN_FILE:
    print(f"üîó Using specified chain file: {CHAIN_FILE}")
else:
    CHAIN_FILE = get_chain_file(SPECIES, CHAIN_DIR)
    print(f"üîó Auto-detected chain file: {CHAIN_FILE}")

In [None]:
# One sub-directory of OUTPUT_DIR per pipeline step.
CUTSITES_DIR, PEAKS_DIR, LIFTED_DIR, CONSENSUS_DIR, BIGWIG_DIR = (
    os.path.join(OUTPUT_DIR, step)
    for step in ("cutsites", "peaks", "lifted", "consensus", "bigwigs")
)

# Hand each one to ensure_dir (from src.utils) before any step writes into it.
for d in (CUTSITES_DIR, PEAKS_DIR, LIFTED_DIR, CONSENSUS_DIR, BIGWIG_DIR):
    ensure_dir(d)

print(f"üìÅ Output directories created in: {OUTPUT_DIR}")

---
## 📊 Available Genome Sizes

In [None]:
# Table of the effective genome sizes known to the pipeline, in species-name
# order; the row matching the configured SPECIES gets a pointer marker.
print("Effective genome sizes for MACS3:")
print("=" * 40)
for species, size in sorted(EFFECTIVE_GENOME_SIZES.items()):
    if species == SPECIES:
        marker = "üëâ"
    else:
        marker = "  "
    print(f"{marker} {species}: {size:,}")

---
## 🔄 Step 1: Fragment to Cut-site Conversion

Convert fragment files to Tn5 cut-site BED files.

In [None]:
# Option A: Convert a single file
def convert_single_file(input_file, output_dir):
    """Convert a single fragment file to cut-sites.

    The output is written into ``output_dir`` under the input's base name with
    a trailing ``.tsv.gz`` swapped for ``.cutsites.bed.gz``. If the input does
    not end in ``.tsv.gz`` the cut-site suffix is appended to the full name, so
    the output path can never equal the input path (previously such a file
    kept its name and could be overwritten when ``output_dir`` was its own
    directory).

    Args:
        input_file: Path to the fragment file.
        output_dir: Directory that receives the cut-site file.

    Returns:
        The result returned by ``convert_fragments_to_cutsites``; its
        ``'message'`` entry is printed.
    """
    fragment_suffix = '.tsv.gz'
    base_name = os.path.basename(input_file)
    # Strip the suffix only when it terminates the name; str.replace would also
    # rewrite an occurrence in the middle of the name.
    if base_name.endswith(fragment_suffix):
        stem = base_name[:-len(fragment_suffix)]
    else:
        stem = base_name
    output_file = os.path.join(output_dir, stem + '.cutsites.bed.gz')

    result = convert_fragments_to_cutsites(input_file, output_file)
    print(f"‚úÖ {result['message']}")
    return result

# Example usage (uncomment to run):
# result = convert_single_file(FRAGMENT_FILE, CUTSITES_DIR)

In [None]:
# Option B: Convert all files in a directory
def convert_all_files(input_dir, output_dir, workers=8):
    """Batch-convert the fragment files in ``input_dir`` to cut-sites.

    Delegates to ``process_all_fragments`` and prints how many of the returned
    entries have ``status == 'success'``.

    Args:
        input_dir: Directory holding the fragment files.
        output_dir: Directory that receives the cut-site files.
        workers: Forwarded as ``max_workers``.

    Returns:
        The results collection returned by ``process_all_fragments``.
    """
    results = process_all_fragments(
        input_dir=input_dir, output_dir=output_dir, max_workers=workers
    )

    succeeded = [entry for entry in results if entry['status'] == 'success']
    success = len(succeeded)
    print(f"\n‚úÖ Converted {success}/{len(results)} files")
    return results

# Example usage (uncomment to run):
# results = convert_all_files(FRAGMENT_DIR, CUTSITES_DIR)

---
## 🏔️ Step 2: MACS3 Peak Calling

In [None]:
# MACS3 parameters (customize as needed); call_peaks() forwards this mapping
# to run_peak_calling as `params`.
MACS3_PARAMS = dict(
    format='BED',
    qvalue=0.01,        # FDR threshold
    shift=-73,          # ATAC-seq shift (minus half of extsize)
    extsize=146,        # Extension size
    keep_dup='all',
    min_length=200,
    nomodel=True,
    call_summits=True,
    nolambda=True,
)

# Echo the settings so they are recorded in the notebook output.
print("MACS3 Parameters:")
for k, v in MACS3_PARAMS.items():
    print(f"  {k}: {v}")

In [None]:
# Run peak calling
def call_peaks(input_dir, output_dir, species, workers=15):
    """Call peaks with MACS3 on the cut-site files in ``input_dir``.

    Delegates to ``run_peak_calling`` using the notebook-level ``MACS3_PATH``
    and ``MACS3_PARAMS``, then prints the summed ``peak_count`` of every entry
    whose ``status`` is ``'success'`` (a missing count is treated as 0).

    Args:
        input_dir: Directory of cut-site files (forwarded as ``frag_dir``).
        output_dir: Directory for peak output (forwarded as ``out_dir``).
        species: Species name forwarded to ``run_peak_calling``.
        workers: Forwarded as ``max_workers``.

    Returns:
        The results collection returned by ``run_peak_calling``.
    """
    results = run_peak_calling(
        species=species,
        frag_dir=input_dir,
        out_dir=output_dir,
        macs3_path=MACS3_PATH,
        max_workers=workers,
        params=MACS3_PARAMS,
    )

    total_peaks = 0
    for entry in results:
        if entry['status'] == 'success':
            total_peaks += entry.get('peak_count', 0)

    print(f"\nüìä Total peaks called: {total_peaks:,}")
    return results

# Example usage (uncomment to run):
# peak_results = call_peaks(CUTSITES_DIR, PEAKS_DIR, SPECIES)

---
## 🔗 Step 3: Liftover to hg38

In [None]:
# Liftover a single file
def liftover_single(input_bed, output_bed, chain_file):
    """Lift one BED file over using ``chain_file``.

    Thin wrapper around ``liftover_peaks`` that always requests verbose output.

    Args:
        input_bed: Path of the BED file to lift over.
        output_bed: Path the lifted BED file is written to.
        chain_file: Chain file to use.

    Returns:
        Whatever ``liftover_peaks`` returns.
    """
    return liftover_peaks(
        input_bed=input_bed,
        output_bed=output_bed,
        chain_file=chain_file,
        verbose=True,
    )

# Example usage (uncomment to run):
# result = liftover_single(
#     input_bed="/path/to/peaks.bed",
#     output_bed="/path/to/peaks.hg38.bed",
#     chain_file=CHAIN_FILE,
# )

In [None]:
# Liftover all narrowPeak files
def liftover_all_peaks(peaks_dir, output_dir, chain_file):
    """Liftover all narrowPeak files in a directory.

    Every ``*_peaks.narrowPeak`` file in ``peaks_dir`` is lifted over with
    ``liftover_peaks`` and written to ``output_dir`` under the same name with
    the ``.narrowPeak`` suffix replaced by ``.hg38.bed``. Files are processed
    in sorted name order, and ``output_dir`` is created if it does not exist.

    Args:
        peaks_dir: Directory containing the ``*_peaks.narrowPeak`` files.
        output_dir: Directory that receives the lifted BED files.
        chain_file: Chain file passed to ``liftover_peaks``.

    Returns:
        List of the per-file results from ``liftover_peaks``, in processing
        order. Each result's ``'lifted'`` and ``'unmapped'`` entries are
        printed as a progress line.
    """
    narrowpeak_suffix = ".narrowPeak"

    # glob yields files in filesystem order; sort so the processing order and
    # the order of the returned list are reproducible between runs.
    peak_files = sorted(Path(peaks_dir).glob("*_peaks.narrowPeak"))
    print(f"Found {len(peak_files)} peak files")
    print(f"üîó Using chain file: {chain_file}")

    # Make the function usable with a fresh output location.
    out_root = Path(output_dir)
    out_root.mkdir(parents=True, exist_ok=True)

    results = []
    for pf in peak_files:
        # The glob guarantees the suffix; swap it at the end of the name only.
        out_file = out_root / (pf.name[:-len(narrowpeak_suffix)] + ".hg38.bed")
        result = liftover_peaks(
            input_bed=str(pf),
            output_bed=str(out_file),
            chain_file=chain_file,
            verbose=False,
        )
        print(f"  {pf.name}: {result['lifted']:,} lifted, {result['unmapped']:,} unmapped")
        results.append(result)

    return results

# Example usage (uncomment to run):
# liftover_results = liftover_all_peaks(PEAKS_DIR, LIFTED_DIR, CHAIN_FILE)

---
## 🎯 Step 4: Consensus Peaks

In [None]:
# Consensus parameters; generate_consensus() reads all three keys.
CONSENSUS_PARAMS = dict(
    peak_half_width=250,        # Total width = 500bp
    q_value_threshold=0.05,     # Filter peaks by q-value
    min_peaks_per_sample=5000,  # Minimum peaks to include sample
)

# Echo the settings so they are recorded in the notebook output.
print("Consensus Peak Parameters:")
for k, v in CONSENSUS_PARAMS.items():
    print(f"  {k}: {v}")

In [None]:
# Generate consensus peaks
def generate_consensus(peaks_dir, output_dir, chromsizes_file, params=None, species=None):
    """Generate consensus peaks from narrowPeak files.

    Steps: load the per-sample peaks, load chromosome sizes, harmonize the two,
    build the consensus set and write it to
    ``<output_dir>/consensus_peaks_<2 * peak_half_width>bp.bed``.

    Args:
        peaks_dir: Directory with the narrowPeak files to load.
        output_dir: Directory that receives the consensus BED file.
        chromsizes_file: Chromosome sizes file handed to ``get_chromsizes``.
        params: Mapping with ``'peak_half_width'``, ``'q_value_threshold'`` and
            ``'min_peaks_per_sample'``. Defaults to the notebook-level
            ``CONSENSUS_PARAMS`` *as it is when the function is called*, so
            re-running the parameter cell takes effect without re-defining
            this function.
        species: Species name handed to ``get_chromsizes``. Defaults to the
            notebook-level ``SPECIES`` at call time.

    Returns:
        The consensus peaks object returned by ``get_consensus_peaks``.

    Raises:
        KeyError: If ``params`` lacks one of the three required keys.
    """
    # Resolve defaults late: a default bound in the signature would freeze
    # whichever dict existed when this cell was executed.
    if params is None:
        params = CONSENSUS_PARAMS
    if species is None:
        species = SPECIES

    # Load peaks
    peaks_dict = load_narrowpeaks(
        peak_dir=peaks_dir,
        q_value_threshold=params['q_value_threshold'],
        min_peaks_per_sample=params['min_peaks_per_sample'],
    )
    print(f"Loaded {len(peaks_dict)} samples")

    # Load chromsizes
    chromsizes = get_chromsizes(species, chromsizes_file, as_pyranges=True)

    # Harmonize
    peaks_dict, chromsizes = harmonize_chromosomes(peaks_dict, chromsizes)

    # Generate consensus
    consensus = get_consensus_peaks(
        narrow_peaks_dict=peaks_dict,
        peak_half_width=params['peak_half_width'],
        chromsizes=chromsizes,
    )

    # Save
    output_file = os.path.join(output_dir, f"consensus_peaks_{params['peak_half_width']*2}bp.bed")
    consensus.to_bed(output_file)
    print(f"\n‚úÖ Saved {len(consensus):,} consensus peaks to: {output_file}")

    return consensus

# Example usage (uncomment to run):
# consensus_peaks = generate_consensus(PEAKS_DIR, CONSENSUS_DIR, CHROMSIZES_FILE)

---
## 📈 Step 5: BigWig Generation

In [None]:
# Create a single BigWig file
def make_bigwig(fragments_file, output_file, chromsizes_file, cut_sites=True, normalize=True):
    """Build one bigWig track from a fragment file.

    Thin wrapper around ``create_bigwig`` that maps this notebook's argument
    names onto its keywords and always requests verbose output.

    Args:
        fragments_file: Fragment file (forwarded as ``fragments``).
        output_file: bigWig path to write (forwarded as ``output``).
        chromsizes_file: Chromosome sizes file (forwarded as ``chromsizes``).
        cut_sites: Forwarded unchanged to ``create_bigwig``.
        normalize: Forwarded unchanged to ``create_bigwig``.

    Returns:
        Whatever ``create_bigwig`` returns.
    """
    bigwig_options = dict(
        fragments=fragments_file,
        chromsizes=chromsizes_file,
        output=output_file,
        cut_sites=cut_sites,
        normalize=normalize,
        verbose=True,
    )
    return create_bigwig(**bigwig_options)

# Example usage (uncomment to run):
# result = make_bigwig(
#     fragments_file="/path/to/fragments.tsv.gz",
#     output_file="/path/to/output.bw",
#     chromsizes_file=CHROMSIZES_FILE,
# )

In [None]:
# Create BigWigs for all files in a directory
def make_all_bigwigs(input_dir, output_dir, chromsizes_file, pattern="*.tsv.gz"):
    """Build bigWig tracks for the fragment files in ``input_dir``.

    Thin wrapper around ``process_all_fragments_to_bigwig`` with
    ``cut_sites``, ``normalize`` and ``verbose`` all fixed to ``True``.

    Args:
        input_dir: Directory holding the fragment files.
        output_dir: Directory that receives the bigWig files.
        chromsizes_file: Chromosome sizes file (forwarded as ``chrom_sizes_file``).
        pattern: File pattern forwarded unchanged.

    Returns:
        Whatever ``process_all_fragments_to_bigwig`` returns.
    """
    batch_options = dict(
        input_dir=input_dir,
        output_dir=output_dir,
        chrom_sizes_file=chromsizes_file,
        pattern=pattern,
        cut_sites=True,
        normalize=True,
        verbose=True,
    )
    return process_all_fragments_to_bigwig(**batch_options)

# Example usage (uncomment to run):
# bigwig_results = make_all_bigwigs(FRAGMENT_DIR, BIGWIG_DIR, CHROMSIZES_FILE)

---
## 📋 Quick Reference: One-Liner Examples

In [None]:
# Quick reference for common operations.
# This cell only prints the text below; none of the listed calls is executed.
# The file names and species in the examples are placeholders.
print("""
üìã QUICK REFERENCE
==================

# Convert fragments to cut-sites:
convert_fragments_to_cutsites("input.tsv.gz", "output.bed.gz")

# Run MACS3 on all files:
run_peak_calling("Gorilla", "cutsites_dir", "peaks_dir", max_workers=15)

# Liftover to hg38:
liftover_peaks("peaks.bed", "peaks.hg38.bed", "gorGor4ToHg38.over.chain")

# Generate consensus peaks:
get_consensus_peaks(peaks_dict, peak_half_width=250, chromsizes=chromsizes)

# Create bigWig:
create_bigwig("fragments.tsv.gz", "chromsizes", "output.bw")

# Get chain file for species:
get_chain_file("Gorilla")  # Auto-uses default chain directory

# Print chain file info:
print_chain_info()
""")

---
## 🔍 Check Your Configuration

In [None]:
# Validate your configuration
print("=" * 60)
print("CONFIGURATION CHECK")
print("=" * 60)

# Each entry is (label, configured value, passes?).
# - Input paths pass only if they were changed from the template placeholder
#   AND exist on disk.
# - An empty chain file passes; a non-empty one must exist.
checks = [
    (
        "Fragment file",
        FRAGMENT_FILE,
        FRAGMENT_FILE != "/path/to/your/fragments.tsv.gz" and os.path.exists(FRAGMENT_FILE),
    ),
    (
        "Fragment directory",
        FRAGMENT_DIR,
        FRAGMENT_DIR != "/path/to/your/fragment_directory/" and os.path.exists(FRAGMENT_DIR),
    ),
    (
        "Chromsizes file",
        CHROMSIZES_FILE,
        CHROMSIZES_FILE != "/path/to/your/genome.chrom.sizes" and os.path.exists(CHROMSIZES_FILE),
    ),
    (
        "Chain file",
        CHAIN_FILE,
        not CHAIN_FILE or os.path.exists(CHAIN_FILE),
    ),
    (
        "Output directory",
        OUTPUT_DIR,
        os.path.exists(OUTPUT_DIR),
    ),
]

for name, path, exists in checks:
    if exists:
        status = "‚úÖ"
    else:
        status = "‚ùå"
    # Three lines per entry: status + label, indented value, blank separator.
    print(f"{status} {name}:", f"   {path}", "", sep="\n")

print("\n‚ÑπÔ∏è Edit the USER CONFIGURATION cell above to set your paths.")