# Debug: Quantification returning zeros

Test with Bonobo Adipocytes fragments and liftback peaks.

In [9]:
import os, sys, gzip, subprocess
import pandas as pd
import numpy as np

# Add bedtools to PATH (lives in scenicplus conda env)
bedtools_dir = '/cluster/project/treutlein/jjans/software/miniforge3/envs/scenicplus/bin'
os.environ['PATH'] = bedtools_dir + ':' + os.environ.get('PATH', '')

sys.path.insert(0, '/cluster/home/jjanssens/jjans/analysis/adult_intestine/peaks/peak_calling/atac_pipeline')

FRAG_FILE = '/cluster/project/treutlein/USERS/jjans/analysis/adult_intestine/peaks/fragment_files/Bonobo/Adipocytes.fragments.tsv.gz'
PEAK_FILE = '/cluster/project/treutlein/USERS/jjans/analysis/adult_intestine/peaks/cross_species_consensus/03_lifted_back/unified_consensus_Bonobo.bed'

# Verify bedtools
result = subprocess.run('bedtools --version', shell=True, capture_output=True, text=True)
print(result.stdout.strip())

bedtools v2.31.1


In [2]:
# 1. Inspect fragment file: first 20 lines
print('=== FRAGMENT FILE ===')
with gzip.open(FRAG_FILE, 'rt') as f:
    for i, line in enumerate(f):
        if i >= 20:
            break
        print(repr(line.rstrip()))

=== FRAGMENT FILE ===
'1\t769687\t769688\t.\t.\t+'
'1\t769724\t769725\t.\t.\t-'
'1\t770624\t770625\t.\t.\t+'
'1\t770669\t770670\t.\t.\t-'
'1\t773197\t773198\t.\t.\t+'
'1\t773356\t773357\t.\t.\t-'
'1\t776675\t776676\t.\t.\t+'
'1\t776733\t776734\t.\t.\t-'
'1\t777827\t777828\t.\t.\t+'
'1\t778048\t778049\t.\t.\t-'
'1\t778530\t778531\t.\t.\t+'
'1\t778900\t778901\t.\t.\t-'
'1\t778962\t778963\t.\t.\t+'
'1\t779139\t779140\t.\t.\t-'
'1\t779293\t779294\t.\t.\t+'
'1\t779389\t779390\t.\t.\t-'
'1\t779925\t779926\t.\t.\t+'
'1\t780054\t780055\t.\t.\t-'
'1\t785799\t785800\t.\t.\t+'
'1\t785965\t785966\t.\t.\t-'


In [3]:
# 2. Check if fragments are already cut-sites (length == 1)
lengths = []
with gzip.open(FRAG_FILE, 'rt') as f:
    for i, line in enumerate(f):
        if i >= 10000:
            break
        parts = line.strip().split('\t')
        if len(parts) >= 3 and not line.startswith('#'):
            lengths.append(int(parts[2]) - int(parts[1]))

lengths = np.array(lengths)
print(f'Fragment lengths (first {len(lengths)} entries):')
print(f'   min: {lengths.min()}, max: {lengths.max()}, median: {np.median(lengths)}')
print(f'   unique values: {np.unique(lengths)}')
print(f'   all length==1: {(lengths == 1).all()}')
print(f'\nThese fragments are {"ALREADY cut-sites" if (lengths == 1).all() else "paired-end fragments"}')

Fragment lengths (first 10000 entries):
   min: 1, max: 1, median: 1.0
   unique values: [1]
   all length==1: True

These fragments are ALREADY cut-sites


In [4]:
# 3. Inspect peak file: first 10 lines
print('=== PEAK FILE ===')
with open(PEAK_FILE) as f:
    for i, line in enumerate(f):
        if i >= 10:
            break
        print(repr(line.rstrip()))

=== PEAK FILE ===
'chr4\t168323332\t168323405\tunified_000003\tchr1:136132-136205'
'chr12\t55039\t57560\tunified_000008\tchr1:191241-191741'
'chr4\t168307953\t168308072\tunified_000009\tchr1:288851-288970'
'chr17\t83431310\t83431810\tunified_000012\tchr1:593175-593675'
'chrM\t3946\t4446\tunified_000014\tchr1:629696-630196'
'chrM\t4669\t4869\tunified_000015\tchr1:630419-630619'
'chrM\t8024\t8524\tunified_000016\tchr1:633774-634274'
'chr7\t56713159\t56713654\tunified_000017\tchr1:732235-732735'
'chr7\t82525154\t82525323\tunified_000018\tchr1:737317-737486'
'chr11\t50206302\t50206800\tunified_000019\tchr1:770343-770813'


In [6]:
# 4. Compare chromosome naming conventions
frag_chroms = set()
with gzip.open(FRAG_FILE, 'rt') as f:
    for i, line in enumerate(f):
        if i >= 10000:
            break
        parts = line.strip().split('\t')
        if parts and not line.startswith('#'):
            frag_chroms.add(parts[0])

# Peak file has 5 columns (chr, start, end, name, hg38_coords)
peak_df = pd.read_csv(PEAK_FILE, sep='\t', header=None, usecols=[0,1,2,3],
                       names=['Chromosome','Start','End','Name'], dtype={'Chromosome': str})
peak_chroms = set(peak_df['Chromosome'].unique())

print(f'Fragment chroms (sample): {sorted(frag_chroms)[:10]}')
print(f'Peak chroms (sample):     {sorted(peak_chroms)[:10]}')
print(f'\nFragment has chr prefix: {any(c.startswith("chr") for c in frag_chroms)}')
print(f'Peak has chr prefix:     {any(c.startswith("chr") for c in peak_chroms)}')

overlap = frag_chroms & peak_chroms
print(f'\nOverlapping chroms: {len(overlap)}')
if not overlap:
    print('>>> NO OVERLAP - this is why you get zeros! <<<')

Fragment chroms (sample): ['1']
Peak chroms (sample):     ['chr1', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18']

Fragment has chr prefix: False
Peak has chr prefix:     True

Overlapping chroms: 0
>>> NO OVERLAP - this is why you get zeros! <<<


In [7]:
# 5. Check column count in fragments (is there a barcode / score column?)
with gzip.open(FRAG_FILE, 'rt') as f:
    line = next(f)
    n_cols = len(line.strip().split('\t'))
    print(f'Fragment file has {n_cols} columns')
    print(f'Line: {repr(line.rstrip())}')
    print()
    if n_cols == 3:
        print('Format: chr start end  (pure BED3 - cut-sites only, no barcode)')
    elif n_cols == 4:
        print('Format: chr start end barcode')
    elif n_cols >= 5:
        print('Format: chr start end barcode score (standard fragments)')
    elif n_cols == 6:
        print('Format: chr start end name score strand (BED6 - cut-sites with strand)')

Fragment file has 6 columns
Line: '1\t769687\t769688\t.\t.\t+'

Format: chr start end barcode score (standard fragments)


In [10]:
# 6. Direct bedtools test with a small subset
# Subset peaks to first 5000
peaks_subset = peak_df.head(5000).copy()

# If chr mismatch, fix it for this test
frag_has_chr = any(c.startswith('chr') for c in frag_chroms)
peak_has_chr = any(c.startswith('chr') for c in peak_chroms)

if frag_has_chr and not peak_has_chr:
    print('Fixing: adding chr prefix to peaks')
    peaks_subset['Chromosome'] = 'chr' + peaks_subset['Chromosome'].astype(str)
elif not frag_has_chr and peak_has_chr:
    print('Fixing: removing chr prefix from peaks')
    peaks_subset['Chromosome'] = peaks_subset['Chromosome'].astype(str).str.replace(r'^chr', '', regex=True)
else:
    print('Chr naming already matches')

print(f'Peak chroms after fix: {sorted(peaks_subset["Chromosome"].unique())[:5]}')

# Write temp peaks
import tempfile
tmp_peaks = tempfile.mktemp(suffix='.bed')
peaks_subset[['Chromosome','Start','End','Name']].to_csv(tmp_peaks, sep='\t', header=False, index=False)

# Test: simple coverage count (fragments overlapping peaks)
cmd = f'bedtools intersect -a {tmp_peaks} -b {FRAG_FILE} -c | head -20'
print(f'\n=== bedtools intersect -c (coverage count) ===')
result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
print(result.stdout)
if result.stderr:
    print(f'STDERR: {result.stderr}')

# Count non-zero
cmd_all = f'bedtools intersect -a {tmp_peaks} -b {FRAG_FILE} -c'
result_all = subprocess.run(cmd_all, shell=True, capture_output=True, text=True)
counts = [int(line.split('\t')[-1]) for line in result_all.stdout.strip().split('\n') if line]
counts = np.array(counts)
print(f'Total peaks tested: {len(counts)}')
print(f'Non-zero: {(counts > 0).sum()}')
print(f'Sum: {counts.sum()}')
print(f'Max: {counts.max()}')

os.unlink(tmp_peaks)

Fixing: removing chr prefix from peaks
Peak chroms after fix: ['1', '11', '12', '17', '4']

=== bedtools intersect -c (coverage count) ===
4	168323332	168323405	unified_000003	1
12	55039	57560	unified_000008	0
4	168307953	168308072	unified_000009	0
17	83431310	83431810	unified_000012	0
M	3946	4446	unified_000014	0
M	4669	4869	unified_000015	0
M	8024	8524	unified_000016	0
7	56713159	56713654	unified_000017	0
7	82525154	82525323	unified_000018	1
11	50206302	50206800	unified_000019	0
7	57159132	57159977	unified_000021	10
7	56768877	56769376	unified_000022	2
7	56771831	56772314	unified_000024	1
7	56774392	56774900	unified_000027	0
8	348563	349063	unified_000028	0
8	347935	348429	unified_000029	0
Un_NW_014017077v1	25235	26084	unified_000033	0
Un_NW_014017077v1	26185	26797	unified_000034	0
Un_NW_014024175v1	23620	24120	unified_000039	0
Un_NW_014024175v1	22938	23553	unified_000040	0

Total peaks tested: 5000
Non-zero: 1722
Sum: 5291
Max: 30


In [11]:
# 7. Test the quantify function with the FIXED code
# Reload the module to pick up changes
import importlib
import src.quantification
importlib.reload(src.quantification)
from src.quantification import quantify

# Since fragments are already cut-sites, the auto-detect should kick in
# even when method='cutsites' is passed
result_cutsites = quantify(
    input_file=FRAG_FILE,
    peak_file=PEAK_FILE,
    input_type='fragments',
    method='cutsites',
    verbose=True,
)
print(f'\nmethod=cutsites (auto-detected): non-zero = {(result_cutsites > 0).sum()} / {len(result_cutsites)}')
print(f'Sum: {result_cutsites.sum()}')
print(result_cutsites[result_cutsites > 0].head(10))

print('\n' + '='*60)

# Also test explicit coverage
result_coverage = quantify(
    input_file=FRAG_FILE,
    peak_file=PEAK_FILE,
    input_type='fragments',
    method='coverage',
    verbose=True,
)
print(f'\nmethod=coverage: non-zero = {(result_coverage > 0).sum()} / {len(result_coverage)}')
print(f'Sum: {result_coverage.sum()}')
print(result_coverage[result_coverage > 0].head(10))

ðŸ“Š Quantifying Adipocytes.fragments.tsv.gz (fragments) over 840,026 peaks...
  Auto-detected 1bp cut-site fragments, using 'coverage' instead of 'cutsites'

method=cutsites (auto-detected): non-zero = 258726 / 840026
Sum: 719972
unified_000003     1
unified_000018     1
unified_000021    10
unified_000022     2
unified_000024     1
unified_000050     2
unified_000051     2
unified_000052     2
unified_000054     3
unified_000069     1
Name: Adipocytes.fragments.tsv, dtype: int64

ðŸ“Š Quantifying Adipocytes.fragments.tsv.gz (fragments) over 840,026 peaks...

method=coverage: non-zero = 258726 / 840026
Sum: 719972
unified_000003     1
unified_000018     1
unified_000021    10
unified_000022     2
unified_000024     1
unified_000050     2
unified_000051     2
unified_000052     2
unified_000054     3
unified_000069     1
Name: Adipocytes.fragments.tsv, dtype: int64


In [12]:
# 8. Test auto-detection function directly
importlib.reload(src.quantification)
from src.quantification import _are_fragments_already_cutsites, _detect_input_chroms, _harmonize_chr_prefix
from src.utils import load_peaks

is_cutsites = _are_fragments_already_cutsites(FRAG_FILE)
print(f'Are fragments already cut-sites? {is_cutsites}')

# Test chr harmonization
peaks_orig = load_peaks(PEAK_FILE).head(5000)
print(f'\nPeaks before harmonize: {sorted(peaks_orig["Chromosome"].unique())[:5]}')

detected = _detect_input_chroms(FRAG_FILE, 'fragments')
print(f'Detected input chroms:  {sorted(detected)[:5]}')

peaks_fixed = _harmonize_chr_prefix(peaks_orig, FRAG_FILE, 'fragments')
print(f'Peaks after harmonize:  {sorted(peaks_fixed["Chromosome"].unique())[:5]}')
print(f'Was modified: {not peaks_orig["Chromosome"].equals(peaks_fixed["Chromosome"])}')

Are fragments already cut-sites? True

Peaks before harmonize: ['chr1', 'chr11', 'chr12', 'chr17', 'chr4']
Detected input chroms:  ['1']
Peaks after harmonize:  ['1', '11', '12', '17', '4']
Was modified: True
