In [4]:
import os
import re
from collections import defaultdict
directory = "/uufs/chpc.utah.edu/common/HIPAA/u1264408/u1264408/Git/SEMIColon/data/output/CellCut/results/vaf_spectra/txtfiles"
output_tsv = "/uufs/chpc.utah.edu/common/HIPAA/u1264408/u1264408/Git/SEMIColon/data/output/CellCut/results/vaf_spectra/txtfiles/recurrent_sites.tsv"

In [5]:
# Regex to detect "Total:" footer line
total_line = re.compile(r"^#\s*Total:", re.IGNORECASE)

# { (chrom,pos,ref,alt) : set(sample_names) }
site_to_samples = defaultdict(set)

def parse_file(filepath):
    # sample name derived from filename (can adjust if needed)
    sample = os.path.basename(filepath).split()[0]

    with open(filepath) as f:
        for line in f:
            line = line.strip()

            if not line:
                continue
            if line.startswith("#") or line.startswith("CHROM"):
                continue
            if total_line.match(line):
                break

            fields = line.split()
            if len(fields) < 5:
                continue

            chrom, pos, ref, alt = fields[0], fields[1], fields[2], fields[3]
            site_to_samples[(chrom, pos, ref, alt)].add(sample)


# -----------------------------
# Process directory
# -----------------------------
files = [
    os.path.join(directory, f)
    for f in os.listdir(directory)
    if os.path.isfile(os.path.join(directory, f))
]

for fpath in files:
    parse_file(fpath)

# -----------------------------
# Summaries
# -----------------------------
total_sites = len(site_to_samples)
recurrent_sites = {k: v for k, v in site_to_samples.items() if len(v) > 1}

print("====================================")
print(" SNV Recurrence Summary")
print("====================================")
print(f"Files processed:        {len(files)}")
print(f"Total unique sites:     {total_sites}")
print(f"Recurrent sites:        {len(recurrent_sites)}\n")

# Sort by recurrence count
sorted_rec = sorted(recurrent_sites.items(), key=lambda x: len(x[1]), reverse=True)

print("Top recurrent sites:")
for (chrom, pos, ref, alt), samples in sorted_rec[:20]:
    print(f"{chrom}:{pos} {ref}>{alt}  -  {len(samples)} samples  "
          f"({', '.join(sorted(samples))})")

# -----------------------------
# Write output file
# -----------------------------
os.makedirs(os.path.dirname(output_tsv), exist_ok=True)

with open(output_tsv, "w") as out:
    out.write("CHROM\tPOS\tREF\tALT\tCount\tSamples\n")
    for (chrom, pos, ref, alt), samples in sorted_rec:
        out.write(
            f"{chrom}\t{pos}\t{ref}\t{alt}\t"
            f"{len(samples)}\t{','.join(sorted(samples))}\n"
        )

print(f"\nWrote recurrent sites to: {output_tsv}")

 SNV Recurrence Summary
Files processed:        513
Total unique sites:     582516
Recurrent sites:        772

Top recurrent sites:
chr10:109723029 G>A  -  4 samples  (C5_148_TR_snv_count.txt, D17_130_CE_snv_count.txt, D27_074_SI_snv_count.txt, D31_031_DE_snv_count.txt)
chrX:114809556 G>C  -  4 samples  (A5__074_AS_snv_count.txt, B2_116_RE_snv_count.txt, D25_029_RE_snv_count.txt, D31_031_DE_snv_count.txt)
chr2:80093772 T>C  -  3 samples  (029_DC_T_snv_count.txt, B38_180_SI_snv_count.txt, C5_148_TR_snv_count.txt)
chr2:239646364 C>T  -  3 samples  (A6__103_CE_snv_count.txt, C17_93_AS_snv_count.txt, D33_076_DE_snv_count.txt)
chr6:24992398 A>G  -  3 samples  (029_DC_T_snv_count.txt, C17_93_AS_snv_count.txt, PD37513b5_snv_count.txt)
chrX:105217364 G>T  -  3 samples  (029_DC_T_snv_count.txt, C17_93_AS_snv_count.txt, C5_148_TR_snv_count.txt)
chr2:120197542 C>T  -  3 samples  (PD37457f7_snv_count.txt, PD37509b8_snv_count.txt, PD37510b8_snv_count.txt)
chr2:189971856 G>A  -  3 samples  (D25_029