In [1]:
import scipy
import pycisTopic
pycisTopic.__version__
from pycisTopic.pseudobulk_peak_calling import peak_calling


  from .autonotebook import tqdm as notebook_tqdm
2026-01-30 17:37:49,449	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [2]:
# Effective genome size for each species, keyed by species name.
effective_genome_sizes = dict(
    Bonobo=2595269547,
    Macaque=2653677440,
    Chimpanzee=2792339170,
    Gorilla=2661668758,
    Marmoset=2597026658,
    Human=2913022398,  # value from macs3 site (deeptools)
)

In [3]:
import os
# Capture the current working directory and print it so the run location
# is visible in the cell output.
out_dir = os.getcwd()
print(out_dir)

import subprocess

def convert_fragments_to_cuts(input_fragments: str, output_bed: str):
    """
    Converts a paired-end fragments file into a single-end BED file of cut sites
    using low-memory bash streams.

    Input:  gzipped text, one fragment per line: chr start end ...
    Output: gzipped BED6 with strands assigned. Every input line yields two
            1-bp records: ``chr start start+1 . . +`` and
            ``chr end-1 end . . -``.

    Args:
        input_fragments: Path of the gzipped fragments file to read.
        output_bed: Path of the gzipped BED file to write (overwritten).

    Raises:
        ValueError: If input and output resolve to the same file; the shell
            redirect would truncate the input before it is read.
        subprocess.CalledProcessError: If any stage of the pipeline
            (decompress, awk, compress) exits non-zero. The partial output
            file is removed before the error is re-raised.
    """
    import shlex  # local import: only needed to quote paths for the shell

    if os.path.realpath(input_fragments) == os.path.realpath(output_bed):
        raise ValueError(
            f"Input and output are the same file: {input_fragments}"
        )

    # The awk command does the heavy lifting:
    # 1. Prints the start coordinate (column 2) with strand +
    # 2. Prints the end coordinate (column 3) with strand -
    # Note: BED is 0-based, so end-1 is necessary for the second cut's start.
    awk_cmd = r"""awk -v OFS='\t' '{print $1, $2, $2+1, ".", ".", "+"; print $1, $3-1, $3, ".", ".", "-"}'"""

    # Build the full pipeline command: decompress -> awk -> gzip > output.
    # * `set -o pipefail` makes the pipeline fail if ANY stage fails; without
    #   it only the final gzip's exit status is checked, so a missing or
    #   corrupt input silently produces an empty "successful" output.
    # * Paths are shell-quoted so spaces/metacharacters cannot break or
    #   alter the command.
    # * `gzip -dc` is the portable spelling of `zcat`.
    cmd = (
        "set -o pipefail; "
        f"gzip -dc {shlex.quote(str(input_fragments))} | {awk_cmd} "
        f"| gzip > {shlex.quote(str(output_bed))}"
    )

    print(f"Generating cut sites from {input_fragments}...")

    # Run it using subprocess (bash is required for pipefail)
    try:
        subprocess.run(cmd, shell=True, check=True, executable='/bin/bash')
        print(f"Done! Saved to {output_bed}")
    except subprocess.CalledProcessError as e:
        print(f"Error processing fragments: {e}")
        # The redirect has already created/truncated the output; do not leave
        # a partial file behind for downstream steps to pick up.
        try:
            os.remove(output_bed)
        except OSError:
            pass  # best-effort cleanup; the original error is what matters
        raise

/cluster/project/treutlein/USERS/jjans/analysis/adult_intestine/peaks


In [6]:
# Species to process; interpolated into the input and output paths used below.
species = 'Human'

In [13]:
import os
import re
from pathlib import Path

# Input directory (per-species pseudobulk files) and output directory.
fragments_fh = f"../atac/consensus_peak_calling_{species}_filter/pseudobulk_bed_files/"
target_dir = Path(f"fragment_files/{species}")

# Create the output directory, including parents; no error if it already exists.
target_dir.mkdir(parents=True, exist_ok=True)

# celltype -> existing input path, and celltype -> output path to be written.
old_dict, new_dict = {}, {}

# os.scandir avoids an extra stat call per entry on large directories.
for entry in os.scandir(fragments_fh):
    if not entry.is_file():
        continue

    file_name = entry.name

    # The cell type is the file name with ".fragments" and everything after it removed.
    celltype = re.sub(r"\.fragments.*", "", file_name)

    # The output keeps the input's file name, just under target_dir.
    old_dict[celltype] = os.path.join(fragments_fh, file_name)
    new_dict[celltype] = str(target_dir / file_name)

In [14]:
# Convert every cell type's fragments file into its cut-site file,
# printing the cell type name before each conversion.
for celltype, fragments_path in old_dict.items():
    print(celltype)
    convert_fragments_to_cuts(
        input_fragments=fragments_path,
        output_bed=new_dict[celltype],
    )

Crypt_Fibroblasts_WNT2B+
Generating cut sites from ../atac/consensus_peak_calling_Human_filter/pseudobulk_bed_files/Crypt_Fibroblasts_WNT2B+.fragments.tsv.gz...
Done! Saved to fragment_files/Human/Crypt_Fibroblasts_WNT2B+.fragments.tsv.gz
Adipocytes
Generating cut sites from ../atac/consensus_peak_calling_Human_filter/pseudobulk_bed_files/Adipocytes.fragments.tsv.gz...
Done! Saved to fragment_files/Human/Adipocytes.fragments.tsv.gz
BEST4+_cells
Generating cut sites from ../atac/consensus_peak_calling_Human_filter/pseudobulk_bed_files/BEST4+_cells.fragments.tsv.gz...
Done! Saved to fragment_files/Human/BEST4+_cells.fragments.tsv.gz
Colonocytes
Generating cut sites from ../atac/consensus_peak_calling_Human_filter/pseudobulk_bed_files/Colonocytes.fragments.tsv.gz...
Done! Saved to fragment_files/Human/Colonocytes.fragments.tsv.gz
Enteric_glia
Generating cut sites from ../atac/consensus_peak_calling_Human_filter/pseudobulk_bed_files/Enteric_glia.fragments.tsv.gz...
Done! Saved to fragment_

In [15]:
import os
import subprocess
import json
from datetime import datetime
from concurrent.futures import ProcessPoolExecutor
from functools import partial

# --- RUN SELECTION ---
SPECIES = "Human"  # Change this to switch species
MAX_WORKERS = 15   # Number of parallel workers (cores)

# --- SPECIES AND GENOME SIZES ---
# Looked up by species name and passed to MACS3 as --gsize.
EFFECTIVE_GENOME_SIZES = {
    'Bonobo': 2595269547,
    'Macaque': 2653677440,
    'Chimpanzee': 2792339170,
    'Gorilla': 2661668758,
    'Marmoset': 2597026658,
    'Human': 2913022398,  # value from macs3 site (deeptools)
}

# --- PATHS ---
BASE_DIR = "/cluster/home/jjanssens/jjans/analysis/adult_intestine/peaks"
# Input files for the selected species.
FRAG_DIR = os.path.join(BASE_DIR, f"fragment_files/{SPECIES}")
# Output goes under the current working directory: consensus_peak_calling_{SPECIES}/
OUT_DIR_BASE = os.path.join(os.getcwd(), f"consensus_peak_calling_{SPECIES}")
MACS3_PATH = "/cluster/project/treutlein/jjans/software/miniforge3/envs/scenicplus/bin/macs3"

# --- MACS3 PARAMETERS (easy to modify) ---
# Full documentation: https://github.com/macs3-project/MACS/blob/master/docs/callpeak.md
MACS3_PARAMS = dict(
    format="BED",         # --format: input format (BED, BAM, SAM, BEDPE, ...)
    qvalue=0.01,          # --qvalue: minimum FDR cutoff for peak detection
    shift=-73,            # --shift: shift reads by this amount (negative for ATAC-seq)
    extsize=146,          # --extsize: extend reads to this fragment size
    keep_dup="all",       # --keep-dup: "auto", "all", or an integer
    min_length=200,       # --min-length: minimum length of a peak region
    # Boolean switches: the matching flag is added only when the value is truthy.
    nomodel=True,         # --nomodel: skip model building, use shift/extsize directly
    call_summits=True,    # --call-summits: call peak summits
    nolambda=True,        # --nolambda: use fixed background lambda
)


def build_macs3_command(sample_name, fragment_path, species, out_dir, macs3_path, params):
    """Assemble the ``macs3 callpeak`` argument list for a single sample.

    Args:
        sample_name: Value passed to ``--name``.
        fragment_path: Value passed to ``--treatment``.
        species: Key into EFFECTIVE_GENOME_SIZES; selects ``--gsize``.
        out_dir: Value passed to ``--outdir``.
        macs3_path: Executable placed first in the command.
        params: Mapping that must contain "format", "qvalue", "shift",
            "extsize", "keep_dup" and "min_length"; the optional truthy keys
            "nomodel", "call_summits" and "nolambda" add the matching flags.

    Returns:
        The command as a list of arguments (not a shell string).

    Raises:
        ValueError: If ``species`` has no entry in EFFECTIVE_GENOME_SIZES.
    """
    gsize = EFFECTIVE_GENOME_SIZES.get(species)
    if gsize is None:
        raise ValueError(f"Unknown species: {species}. Available: {list(EFFECTIVE_GENOME_SIZES.keys())}")

    # Options that take a value, in the order they appear on the command line.
    valued_options = (
        ("--treatment", fragment_path),
        ("--name", sample_name),
        ("--outdir", out_dir),
        ("--format", params["format"]),
        ("--gsize", str(gsize)),
        ("--qvalue", str(params["qvalue"])),
        ("--shift", str(params["shift"])),
        ("--extsize", str(params["extsize"])),
        ("--keep-dup", str(params["keep_dup"])),
        ("--min-length", str(params["min_length"])),
    )
    # Value-less switches, each enabled by a truthy entry in params.
    switches = (
        ("nomodel", "--nomodel"),
        ("call_summits", "--call-summits"),
        ("nolambda", "--nolambda"),
    )

    cmd = [macs3_path, "callpeak"]
    for option, value in valued_options:
        cmd.extend((option, value))
    cmd.extend(switch for key, switch in switches if params.get(key))

    return cmd


def run_macs3_worker(job, species, out_dir, macs3_path, params):
    """Worker function for parallel execution: run MACS3 for one sample.

    Args:
        job: ``(sample_name, fragment_path)`` tuple.
        species, out_dir, macs3_path, params: Forwarded unchanged to
            build_macs3_command; ``out_dir`` is also where the resulting
            ``{sample_name}_peaks.narrowPeak`` file is looked up.

    Returns:
        dict with sample_name, status ("success" or "error"), peak_count
        (number of lines in the narrowPeak file, 0 if it is absent or on
        error) and a human-readable message.

    A failure to run MACS3 is reported through the returned dict rather than
    raised, so one bad sample cannot abort the whole batch. This covers both
    a non-zero exit status and an OSError when launching the process (for
    example a missing or non-executable ``macs3_path``).
    """
    sample_name, fragment_path = job

    cmd = build_macs3_command(sample_name, fragment_path, species, out_dir, macs3_path, params)

    print(f"üöÄ Starting: {sample_name}")
    try:
        # Output is captured so stderr can be reported on failure.
        subprocess.run(cmd, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as e:
        return {
            "sample_name": sample_name,
            "status": "error",
            "peak_count": 0,
            "message": f"‚ùå Error in {sample_name}: {e.stderr}"
        }
    except OSError as e:
        # The process could not be started at all (FileNotFoundError,
        # PermissionError, ...); there is no stderr to show.
        return {
            "sample_name": sample_name,
            "status": "error",
            "peak_count": 0,
            "message": f"‚ùå Error in {sample_name}: {e}"
        }

    # Count peaks from the narrowPeak file (one record per line)
    narrowpeak_file = os.path.join(out_dir, f"{sample_name}_peaks.narrowPeak")
    peak_count = 0
    if os.path.exists(narrowpeak_file):
        with open(narrowpeak_file, 'r') as f:
            peak_count = sum(1 for _ in f)

    return {
        "sample_name": sample_name,
        "status": "success",
        "peak_count": peak_count,
        "message": f"‚úÖ Finished: {sample_name} ({peak_count:,} peaks)"
    }


def run_peak_calling(
    species,
    frag_dir=None,
    out_dir=None,
    macs3_path=MACS3_PATH,
    max_workers=MAX_WORKERS,
    params=None,
    **param_overrides
):
    """
    Run MACS3 peak calling in parallel for all fragment files.

    Every file in ``frag_dir`` whose name ends in ``.fragments.tsv.gz`` becomes
    one job. The sample name is the file name with exactly that suffix
    removed, so names that themselves contain dots are kept intact and two
    files can never map to the same MACS3 output prefix. Files are processed
    in sorted order so the results and report are reproducible.

    Args:
        species: Species name (must be in EFFECTIVE_GENOME_SIZES)
        frag_dir: Directory with fragment files (default: BASE_DIR/fragment_files/{species})
        out_dir: Output directory (default: ./consensus_peak_calling_{species}/)
        macs3_path: Path to macs3 executable
        max_workers: Number of parallel workers/cores (default: MAX_WORKERS)
        params: Full parameter dict (if None, uses MACS3_PARAMS); it is
            copied, never modified
        **param_overrides: Individual parameters to override (e.g., qvalue=0.05)

    Returns:
        List of result dicts containing sample info and peak counts, one per
        fragment file; an empty list if no fragment files are found.

    Side effects:
        Creates ``out_dir`` and writes ``macs3_parameters.json`` (the settings
        used for this run) and ``peak_counts_report.tsv`` into it.

    Output files created by MACS3:
        - {sample}_peaks.narrowPeak: BED6+4 format with peak information
        - {sample}_peaks.xls: Spreadsheet with peak info
        - {sample}_summits.bed: Peak summit positions
        - {sample}_treat_pileup.bdg: Treatment pileup (if --bdg flag)
        - {sample}_control_lambda.bdg: Local lambda (if --bdg flag)
    """
    fragment_suffix = ".fragments.tsv.gz"

    # Set defaults based on species
    if frag_dir is None:
        frag_dir = os.path.join(BASE_DIR, f"fragment_files/{species}")
    if out_dir is None:
        # Save in current working directory
        out_dir = os.path.join(os.getcwd(), f"consensus_peak_calling_{species}")

    # Build final parameters (copy first so the caller's/module's dict is untouched)
    final_params = (params if params is not None else MACS3_PARAMS).copy()
    final_params.update(param_overrides)

    # Ensure output directory exists
    os.makedirs(out_dir, exist_ok=True)

    # Find all fragment files; sorted because os.listdir order is arbitrary
    fragment_files = sorted(
        f for f in os.listdir(frag_dir) if f.endswith(fragment_suffix)
    )

    if not fragment_files:
        print(f"‚ö†Ô∏è No fragment files found in {frag_dir}")
        return []

    print(f"üìÇ Found {len(fragment_files)} fragment files for {species}")
    print(f"üìÅ Output directory: {out_dir}")
    print(f"üß¨ Genome size: {EFFECTIVE_GENOME_SIZES[species]:,}")
    print(f"‚öôÔ∏è Parameters: qvalue={final_params['qvalue']}, shift={final_params['shift']}, extsize={final_params['extsize']}, min_length={final_params['min_length']}")
    print(f"üë∑ Workers: {max_workers}")
    print("-" * 60)

    # Save parameters to file
    params_file = os.path.join(out_dir, "macs3_parameters.json")
    params_to_save = {
        "species": species,
        "genome_size": EFFECTIVE_GENOME_SIZES[species],
        "macs3_path": macs3_path,
        "max_workers": max_workers,
        "frag_dir": frag_dir,
        "out_dir": out_dir,
        "run_date": datetime.now().isoformat(),
        "macs3_params": final_params
    }
    with open(params_file, 'w') as f:
        json.dump(params_to_save, f, indent=2)
    print(f"üíæ Parameters saved to: {params_file}")
    print("-" * 60)

    # Create jobs list: (sample_name, path). Only the known suffix is removed;
    # splitting on the first "." would truncate names that contain dots and
    # could make different files overwrite each other's outputs.
    jobs = [
        (f[:-len(fragment_suffix)], os.path.join(frag_dir, f))
        for f in fragment_files
    ]

    # Create worker with fixed arguments
    worker = partial(
        run_macs3_worker,
        species=species,
        out_dir=out_dir,
        macs3_path=macs3_path,
        params=final_params
    )

    # Run in parallel; executor.map returns results in job order
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        results = list(executor.map(worker, jobs))

    # Generate peak count report
    report_file = os.path.join(out_dir, "peak_counts_report.tsv")
    with open(report_file, 'w') as f:
        f.write("cell_type\tpeak_count\tstatus\n")
        for result in results:
            f.write(f"{result['sample_name']}\t{result['peak_count']}\t{result['status']}\n")
    print(f"\nüìä Peak count report saved to: {report_file}")

    return results


# --- EXECUTION ---
if __name__ == "__main__":
    # MACS3 parameters can be overridden by adding keyword arguments to this
    # call, e.g. qvalue=0.05, min_length=150, shift=-100, extsize=200.
    results = run_peak_calling(species=SPECIES, max_workers=MAX_WORKERS)

    banner = "=" * 60
    print("\n" + banner)
    print("SUMMARY - Peak counts per cell type")
    print(banner)

    # One line per sample, in the order the results were returned.
    for result in results:
        print(result["message"])

    # Tallies: only successful samples contribute to the peak total.
    succeeded = [r for r in results if r["status"] == "success"]
    successful = len(succeeded)
    failed = len(results) - successful
    total_peaks = sum(r["peak_count"] for r in succeeded)

    print("\n" + "-" * 60)
    print(f"Total samples processed: {len(results)}")
    print(f"  ‚úÖ Successful: {successful}")
    print(f"  ‚ùå Failed: {failed}")
    print(f"  üìä Total peaks called: {total_peaks:,}")
    print(f"\nOutput saved to: {os.path.join(os.getcwd(), f'consensus_peak_calling_{SPECIES}')}")

üìÇ Found 32 fragment files for Human
üìÅ Output directory: /cluster/project/treutlein/USERS/jjans/analysis/adult_intestine/peaks/consensus_peak_calling_Human
üß¨ Genome size: 2,913,022,398
‚öôÔ∏è Parameters: qvalue=0.01, shift=-73, extsize=146, min_length=200
üë∑ Workers: 15
------------------------------------------------------------
üíæ Parameters saved to: /cluster/project/treutlein/USERS/jjans/analysis/adult_intestine/peaks/consensus_peak_calling_Human/macs3_parameters.json
------------------------------------------------------------
üöÄ Starting: Crypt_Fibroblasts_WNT2B+üöÄ Starting: AdipocytesüöÄ Starting: ColonocytesüöÄ Starting: BEST4+_cellsüöÄ Starting: ECsüöÄ Starting: Enteric_gliaüöÄ Starting: Specialized_Fibroblasts_KCNN3+üöÄ Starting: Enteric_neurons

üöÄ Starting: EECsüöÄ Starting: Goblet_cellsüöÄ Starting: EnterocytesüöÄ Starting: ICCsüöÄ Starting: ILCs






üöÄ Starting: MUC6+_cells
üöÄ Starting: Lymphatic_ECs




üöÄ Starting: Macrophages
üöÄ St