# Quantification Over Unified Peaks

Quantify fragment files, bigWigs, or Tn5 insertion BEDs over the cross-species
unified peak set (or any BED peak file).

## Workflow
1. **Load** the unified consensus peak set (from `Cross_species_consensus` notebook)
2. **Discover** fragment / bigWig / Tn5 files per species
3. **Quantify** – single file or full matrix
4. **Save / load** results in feather, parquet, or TSV

In [None]:
# Imports
import os
import sys
import glob
import subprocess
import pandas as pd
import numpy as np
import warnings

# Suppress FutureWarning messages for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Ensure bedtools is on PATH (lives in scenicplus conda env).
# Only non-empty parts are joined: with an empty PATH, plain concatenation
# would leave a trailing separator, which is interpreted as "search the
# current working directory" for executables.
bedtools_dir = '/cluster/project/treutlein/jjans/software/miniforge3/envs/scenicplus/bin'
os.environ['PATH'] = os.pathsep.join(
    part for part in (bedtools_dir, os.environ.get('PATH', '')) if part
)

# Add atac_pipeline to path so that `src.quantification` can be imported.
sys.path.insert(0, '/cluster/home/jjanssens/jjans/analysis/adult_intestine/peaks/peak_calling/atac_pipeline')

from src.quantification import quantify, quantify_matrix, save_matrix, load_matrix

print("Quantification module loaded")
print("   quantify()        - single file over peaks")
print("   quantify_matrix() - multiple files -> peaks x samples matrix")
print("   save/load_matrix  - feather / parquet / tsv I/O")

# Report the bedtools version. The command is passed as an argument list, so
# no shell is involved, and a missing or failing executable is reported
# explicitly instead of printing an empty line.
try:
    bt = subprocess.run(['bedtools', '--version'], capture_output=True, text=True)
except OSError as err:
    # Raised when the executable cannot be found or started.
    print(f"   WARNING: could not run bedtools ({err})")
else:
    if bt.returncode == 0:
        print(f"   {bt.stdout.strip()}")
    else:
        print(
            f"   WARNING: 'bedtools --version' exited with code "
            f"{bt.returncode}: {bt.stderr.strip()}"
        )

## Configuration

In [None]:
# === CONFIGURATION ===

# Project root and the cross-species consensus directory below it.
BASE_PATH = "/cluster/project/treutlein/USERS/jjans"
CROSS_SPECIES_DIR = f"{BASE_PATH}/analysis/adult_intestine/peaks/cross_species_consensus"

# Species to process; the cells below iterate over them in this order.
SPECIES_LIST = [
    "Bonobo",
    "Chimpanzee",
    "Gorilla",
    "Macaque",
    "Marmoset",
    "Human",
]

# Quantification results get their own sub-directory, created if missing.
QUANT_OUTPUT_DIR = os.path.join(CROSS_SPECIES_DIR, "04_quantification")
os.makedirs(QUANT_OUTPUT_DIR, exist_ok=True)

print(f"Output directory: {QUANT_OUTPUT_DIR}")

## Load Peak Set

In [None]:
# Unified consensus peak file (hg38) produced by the Cross_species_consensus notebook.
UNIFIED_PEAKS = os.path.join(
    CROSS_SPECIES_DIR, "02_merged_consensus", "unified_consensus_hg38_with_ids.bed"
)

# Directory with the per-species liftback peaks (each in its own coordinates).
LIFTBACK_DIR = os.path.join(CROSS_SPECIES_DIR, "03_lifted_back")

# Maps species name -> peak file path; only species whose file exists are added.
SPECIES_PEAK_FILES = {}
for species in SPECIES_LIST:
    # Human uses the unified hg38 file directly; every other species uses its liftback.
    peak_file = (
        UNIFIED_PEAKS
        if species == "Human"
        else os.path.join(LIFTBACK_DIR, f"unified_consensus_{species}.bed")
    )

    if not os.path.exists(peak_file):
        print(f"{species:<15} NOT FOUND: {peak_file}")
        continue

    # Count the non-blank lines of the file as its number of peaks.
    with open(peak_file) as handle:
        n_peaks = sum(bool(row.strip()) for row in handle)
    SPECIES_PEAK_FILES[species] = peak_file
    print(f"{species:<15} {n_peaks:>8,} peaks  {peak_file}")

print(f"Peak files available for {len(SPECIES_PEAK_FILES)}/{len(SPECIES_LIST)} species")

## Discover Fragment Files

In [None]:
# Locate the fragment files of every species.
# Adjust FRAGMENT_DIR_TEMPLATE if your directory layout differs.

FRAGMENT_DIR_TEMPLATE = f"{BASE_PATH}/analysis/adult_intestine/peaks/fragment_files/{{species}}"
FRAGMENT_PATTERN = "*.tsv.gz"

# Maps species name -> sorted list of fragment file paths (non-empty lists only).
SPECIES_FRAGMENTS = {}

rule = "-" * 70
print("Scanning for fragment files:")
print(rule)

for species in SPECIES_LIST:
    frag_dir = FRAGMENT_DIR_TEMPLATE.format(species=species)

    if os.path.isdir(frag_dir):
        files = sorted(glob.glob(os.path.join(frag_dir, FRAGMENT_PATTERN)))
        if not files:
            print(f"{species:<15} 0 files matching {FRAGMENT_PATTERN} in {frag_dir}")
        else:
            SPECIES_FRAGMENTS[species] = files
            print(f"{species:<15} {len(files):>4} files   ({frag_dir})")
    else:
        print(f"{species:<15} directory not found: {frag_dir}")

print(rule)
total_files = sum(map(len, SPECIES_FRAGMENTS.values()))
print(f"{total_files} total fragment files across {len(SPECIES_FRAGMENTS)} species")

In [None]:
# Show up to ten fragment file names for a single species.
PREVIEW_SPECIES = SPECIES_LIST[0]  # change as needed

if PREVIEW_SPECIES not in SPECIES_FRAGMENTS:
    print(f"No fragments found for {PREVIEW_SPECIES}")
else:
    files = SPECIES_FRAGMENTS[PREVIEW_SPECIES]
    print(f"{PREVIEW_SPECIES} fragment files ({len(files)}):")
    for path in files[:10]:
        print(f"   {os.path.basename(path)}")
    # Summarise whatever did not fit into the ten-line preview.
    if len(files) > 10:
        print(f"   ... and {len(files) - 10} more")

## Single-File Quantification (test)

Try one file to make sure everything works before running the full matrix.

In [None]:
# Use the first species (in SPECIES_LIST order) that has both peaks and fragments.
test_species = None
for candidate in SPECIES_LIST:
    if candidate in SPECIES_FRAGMENTS and candidate in SPECIES_PEAK_FILES:
        test_species = candidate
        break

if not test_species:
    print("No species has both peak file and fragment files. Check paths above.")
else:
    # Quantify only the first fragment file of that species as a smoke test.
    test_file = SPECIES_FRAGMENTS[test_species][0]
    test_peaks = SPECIES_PEAK_FILES[test_species]

    print(f"Testing with {test_species}")
    print(f"   File:  {os.path.basename(test_file)}")
    print(f"   Peaks: {os.path.basename(test_peaks)}")

    result = quantify(
        input_file=test_file,
        peak_file=test_peaks,
        input_type="fragments",   # "fragments" | "tn5" | "bigwig"
        method="cutsites",        # "coverage" | "cutsites"  (fragments only)
        verbose=True,
    )

    # NOTE(review): the `:,` format below needs a scalar, so this assumes
    # `result` is one-dimensional (e.g. a Series) - confirm against quantify().
    n_nonzero = (result > 0).sum()

    print(f"\nResult shape: {result.shape}")
    print(f"   Non-zero peaks: {n_nonzero:,} / {len(result):,}")
    print(result.head(10))

## Build Quantification Matrix

Run quantification for one or all species. The result is a **peaks × samples** matrix.

All input types (`fragments`, `tn5`, `bigwig`) use the same interface — just change `input_type`.

In [None]:
# === QUANTIFY ONE SPECIES ===

SPECIES = "Human"  # Change to run a different species

# Both the fragment files and the peak file must have been found above.
if SPECIES not in SPECIES_FRAGMENTS or SPECIES not in SPECIES_PEAK_FILES:
    print(f"Missing peaks or fragments for {SPECIES}. Check configuration above.")
else:
    input_files = SPECIES_FRAGMENTS[SPECIES]
    peak_file   = SPECIES_PEAK_FILES[SPECIES]

    print(f"Quantifying {SPECIES}: {len(input_files)} files over {peak_file}")

    # Output path is passed without an extension; the format is chosen separately.
    output_prefix = os.path.join(QUANT_OUTPUT_DIR, f"quantification_{SPECIES}")

    matrix = quantify_matrix(
        input_files=input_files,
        peak_file=peak_file,
        input_type="fragments",       # "fragments" | "tn5" | "bigwig"
        method="cutsites",            # "coverage" | "cutsites"  (fragments only)
        # stat="mean",                # "mean"|"sum"|"max"|"min" (bigwig only)
        n_workers=8,
        name_pattern=r"_fragments.*$",  # regex to clean filenames
        name_replacement="",
        output_file=output_prefix,
        output_format="feather",       # "feather" | "parquet" | "tsv"
        verbose=True,
    )

    print(f"\nMatrix shape: {matrix.shape}")
    print(matrix.head())

In [None]:
# === QUANTIFY ALL SPECIES ===
# Loops over every species that has both fragments and peak files.
# Each species gets its own output matrix.
#
# The loop is disabled by default; as shipped, this cell only prints a reminder.
# NOTE: each species is written to QUANT_OUTPUT_DIR/quantification_<species>,
# the same output path the "QUANTIFY ONE SPECIES" cell uses, so running both
# for one species writes the same target twice.

# for species in SPECIES_LIST:
#     if species not in SPECIES_FRAGMENTS or species not in SPECIES_PEAK_FILES:
#         print(f"Skipping {species}: missing data")
#         continue
#
#     input_files = SPECIES_FRAGMENTS[species]
#     peak_file   = SPECIES_PEAK_FILES[species]
#
#     print(f"\n{'='*60}")
#     print(f"{species}: {len(input_files)} files")
#     print(f"{'='*60}")
#
#     quantify_matrix(
#         input_files=input_files,
#         peak_file=peak_file,
#         input_type="fragments",
#         method="cutsites",
#         n_workers=8,
#         name_pattern=r"_fragments.*$",
#         name_replacement="",
#         output_file=os.path.join(QUANT_OUTPUT_DIR, f"quantification_{species}"),
#         output_format="feather",
#         verbose=True,
#     )

print("Uncomment the loop above to quantify all species")

## Memory-Efficient Mode

For very large numbers of files, use `chunk_size` to process in batches and avoid
loading the full matrix in memory.

In [None]:
# === MEMORY-EFFICIENT MODE ===
# Processes N files at a time, writes intermediate chunks.
#
# The call is disabled by default; as shipped, this cell only prints a reminder.
# NOTE: the call relies on `input_files`, `peak_file` and `SPECIES`, which are
# assigned in the "QUANTIFY ONE SPECIES" cell; run that cell first (its
# success branch must execute) or define them here.
# NOTE(review): unlike the other quantify_matrix calls in this notebook, no
# name_pattern / name_replacement is passed here - confirm that the resulting
# sample names are the ones you want.

# quantify_matrix(
#     input_files=input_files,
#     peak_file=peak_file,
#     input_type="fragments",
#     method="cutsites",
#     n_workers=8,
#     chunk_size=50,                # ← process 50 files at a time
#     output_file=os.path.join(QUANT_OUTPUT_DIR, f"quantification_large_{SPECIES}"),
#     output_format="parquet",      # parquet recommended for large files
#     verbose=True,
# )

print("Uncomment and adjust chunk_size for large-scale quantification")

## Load and Inspect Saved Matrix

In [None]:
# Load a saved matrix (auto-detects feather / parquet / tsv from extension)
#
# Disabled by default; as shipped, this cell only prints a reminder.
# Requires `SPECIES` from the "QUANTIFY ONE SPECIES" cell.
# NOTE(review): quantify_matrix is given an output path without extension, so
# this presumes it appends ".feather" for output_format="feather" - confirm
# the actual file name in QUANT_OUTPUT_DIR before loading.
# quant_df = load_matrix(os.path.join(QUANT_OUTPUT_DIR, f"quantification_{SPECIES}.feather"))
# print(f"Matrix shape: {quant_df.shape}")
# print(f"   Peaks:   {quant_df.shape[0]:,}")
# print(f"   Samples: {quant_df.shape[1]}")
# print(quant_df.head())

print("Uncomment after running quantification to inspect results")

In [None]:
# Print every entry of the output directory together with its size in MB.
print(f"Saved quantification files in {QUANT_OUTPUT_DIR}:")
print("-" * 60)

if not os.path.isdir(QUANT_OUTPUT_DIR):
    print("   (no output directory yet)")
else:
    bytes_per_mb = 1024 * 1024
    for entry in sorted(os.listdir(QUANT_OUTPUT_DIR)):
        entry_bytes = os.path.getsize(os.path.join(QUANT_OUTPUT_DIR, entry))
        print(f"   {entry:<50} {entry_bytes / bytes_per_mb:>8.1f} MB")