In [1]:
import os
import glob
import re
from collections import Counter
import csv
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:

# The six canonical substitution classes (keys are C- or T-referenced).
# Each class lists itself followed by its strand-complement spelling.
mutation_map = {
    canonical: [canonical, complement]
    for canonical, complement in (
        ("C>A", "G>T"),
        ("C>G", "G>C"),
        ("C>T", "G>A"),
        ("T>A", "A>T"),
        ("T>C", "A>G"),
        ("T>G", "A>C"),
    )
}

# Inverse table: any of the twelve spellings -> its canonical class,
# e.g. "G>A" -> "C>T".
_rev_lookup = {
    spelling.upper(): canonical
    for canonical, spellings in mutation_map.items()
    for spelling in spellings
}

# Row order used for every matrix and plot (dict insertion order).
CANONICAL_ORDER = [*mutation_map]

# Matches a "Total: <digits>" token (case-insensitive) in a header line.
_total_re = re.compile(r"Total\s*[:]\s*(\d+)", re.IGNORECASE)

def assign_class_simple(ref, alt):
    """
    Classify a single-base change into one of the canonical mutation classes.

    How the reference base is chosen:
      - a ref token beginning with 'CpG' (any case) counts as base 'C';
      - a ref token beginning with 'GpC' (any case) counts as base 'G';
      - any other token contributes the first A/C/G/T letter it contains.

    Returns a pair (mut_class, is_cpg). mut_class is None when either
    token is missing/blank, no reference base can be found, alt is not a
    single A/C/G/T, or the change has no entry in the lookup table.
    is_cpg is True only when the ref token carried a CpG/GpC prefix.
    """
    if ref is None or alt is None:
        return None, False

    ref_token = str(ref).strip()
    alt_base = str(alt).strip().upper()
    if not ref_token or not alt_base:
        return None, False

    bases = {"A", "C", "G", "T"}

    # Dinucleotide-context labels are recognised by their first three characters.
    context_base = {"CPG": "C", "GPC": "G"}.get(ref_token.upper()[:3])
    is_cpg = context_base is not None

    if is_cpg:
        ref_base = context_base
    else:
        # Plain token: fall back to the first nucleotide letter found in it.
        hit = re.search(r"[ACGTacgt]", ref_token)
        ref_base = hit.group(0).upper() if hit else None

    if ref_base not in bases:
        return None, is_cpg
    if alt_base not in bases:
        return None, is_cpg

    # Fold the complementary spelling onto its canonical class.
    return _rev_lookup.get(f"{ref_base}>{alt_base}"), is_cpg

def _extract_ref_alt(fields):
    """Pick the (ref, alt) tokens out of one whitespace-split record, or (None, None)."""
    # Preferred layout CHROM POS REF ALT ...: trust it when field 4 is a single base.
    if len(fields) >= 4 and re.fullmatch(r"[ACGTacgt]", fields[3]):
        return fields[2], fields[3]

    # Otherwise scan the first six fields: the first base-like token is the
    # ref, and the next single-base token after it is the alt.
    ref_token = None
    for token in fields[:6]:
        if ref_token is None:
            if re.fullmatch(r"[ACGTacgt]|[ACGTacgt]{3}|CpG|GpC", token):
                ref_token = token
        elif re.fullmatch(r"[ACGTacgt]", token):
            return ref_token, token
    return None, None


def parse_file(path):
    """
    Read one .snv_count.txt file.

    Lines starting with '#' are headers; the first one carrying a
    "Total: N" token supplies the total. Blank lines are skipped, as are
    records in which no ref/alt pair can be identified.

    Returns (total, calls): total is the header value or None when no
    header provided one; calls is a list of (mut_class_or_None, is_cpg)
    tuples, one per record with an identifiable ref/alt pair.
    """
    header_total = None
    calls = []
    with open(path) as handle:
        for raw in handle:
            record = raw.strip()
            if not record:
                continue

            if record.startswith("#"):
                # Only the first header that yields a total is honoured.
                if header_total is None:
                    hit = _total_re.search(record)
                    if hit:
                        try:
                            header_total = int(hit.group(1))
                        except ValueError:
                            pass
                continue

            ref, alt = _extract_ref_alt(re.split(r"\s+", record))
            if ref is None or alt is None:
                continue

            mut_class, is_cpg = assign_class_simple(ref, alt)
            calls.append((mut_class, is_cpg))
    return header_total, calls

def collect_files(dirpath):
    """
    Find every '*.snv_count.txt' file directly inside dirpath.

    Returns a dict mapping sample name -> file path, where the sample
    name is the file name without the '.snv_count.txt' suffix (or without
    its last extension if the suffix does not match exactly).
    """
    suffix = ".snv_count.txt"

    def sample_name(path):
        fname = os.path.basename(path)
        if fname.endswith(suffix):
            return fname[:-len(suffix)]
        return os.path.splitext(fname)[0]

    hits = glob.glob(os.path.join(dirpath, "*.snv_count.txt"))
    return {sample_name(hit): hit for hit in hits}

def _plot_cohort_boxplot(df_long, out_png):
    """
    Draw one box (with jittered points on top) per mutation type and cohort
    and save the figure to out_png.

    df_long must have the columns MutationType, Proportion and Cohort.
    """
    sns.set(style="whitegrid")
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(data=df_long, x="MutationType", y="Proportion", hue="Cohort",
                order=CANONICAL_ORDER, showfliers=False, ax=ax)
    sns.stripplot(data=df_long, x="MutationType", y="Proportion", hue="Cohort",
                  dodge=True, order=CANONICAL_ORDER, jitter=True, size=3, alpha=0.6, ax=ax)

    # The box layer and the strip layer each register a handle per cohort.
    # Calling ax.legend() without explicit handles would list every cohort
    # twice, so keep only the first handle seen for each label.
    handles, labels = ax.get_legend_handles_labels()
    first_by_label = {}
    for handle, label in zip(handles, labels):
        first_by_label.setdefault(label, handle)
    if ax.get_legend() is not None:
        ax.get_legend().remove()
    if first_by_label:
        ax.legend(list(first_by_label.values()), list(first_by_label.keys()),
                  title="Cohort", loc="upper right")

    ax.set_title("Per-sample mutation-type proportions (dir1 vs dir2)")
    ax.set_ylabel("Proportion")
    ax.set_xlabel("Mutation Type")
    plt.setp(ax.get_xticklabels(), rotation=45)
    fig.tight_layout()
    fig.savefig(out_png, dpi=150)
    plt.close(fig)


def main(dir1, dir2, outdir):
    """
    Compare mutation spectra between two directories of *.snv_count.txt files.

    Every file is treated as its own sample; samples are named
    "<sample>_dir1" / "<sample>_dir2" so the same sample name may appear in
    both directories without colliding.

    Parameters
    ----------
    dir1, dir2 : str
        Directories searched (non-recursively) for '*.snv_count.txt' files.
    outdir : str
        Output directory; created if missing, together with a 'per_sample'
        sub-directory.

    Files written under outdir
    --------------------------
    per_sample/<sample>_spectra.tsv        raw count per canonical class
    proportions_matrix.tsv                 classes x samples, columns sum to 1
                                           (all zeros if a sample has no
                                           classified variant)
    proportions_long.tsv                   same data in long form + Cohort
    spectra_proportions_boxplot_by_cohort.png   (skipped when no samples)
    totals_summary.txt                     one-line dir1 vs dir2 totals
    totals_comparison.tsv                  total per sample

    A sample's total is the value from its "Total: N" header line when one
    was found; otherwise it is the number of parsed records, including
    records whose change could not be assigned to a class.
    """
    os.makedirs(outdir, exist_ok=True)
    per_sample = os.path.join(outdir, "per_sample")
    os.makedirs(per_sample, exist_ok=True)

    # Register every file as a separate, cohort-tagged sample.
    sample_paths = {}
    cohort = {}
    for label, dirpath in (("dir1", dir1), ("dir2", dir2)):
        for s, p in collect_files(dirpath).items():
            name = f"{s}_{label}"
            sample_paths[name] = p
            cohort[name] = label
    samples = sorted(sample_paths)

    # Counts matrix: canonical classes (rows) x samples (columns).
    counts_df = pd.DataFrame(0, index=CANONICAL_ORDER, columns=samples, dtype=int)
    totals_by_sample = {}
    cohort_totals = {"dir1": 0, "dir2": 0}

    for s in samples:
        total_line_val, muts = parse_file(sample_paths[s])
        # Prefer the explicit header total; fall back to the parsed record count.
        totals_by_sample[s] = total_line_val if total_line_val is not None else len(muts)
        cohort_totals[cohort[s]] += totals_by_sample[s]

        # Only records that mapped to a canonical class contribute to the spectrum.
        counter = Counter(mut_class for mut_class, _is_cpg in muts if mut_class)
        class_counts = [counter[k] for k in CANONICAL_ORDER]
        counts_df[s] = class_counts

        outps = os.path.join(per_sample, f"{s}_spectra.tsv")
        with open(outps, "w", newline="") as fh:
            w = csv.writer(fh, delimiter="\t")
            w.writerow(["canonical_mutation", "count"])
            w.writerows(zip(CANONICAL_ORDER, class_counts))

    # Per-sample proportions. A column whose sum is zero gets a NaN divisor
    # and is then filled with 0.0 instead of raising/propagating NaN.
    counts_float = counts_df.astype(float)
    col_sums = counts_float.sum(axis=0)
    prop = counts_float.div(col_sums.where(col_sums != 0), axis="columns").fillna(0.0)
    prop.index.name = "MutationType"
    prop.to_csv(os.path.join(outdir, "proportions_matrix.tsv"), sep="\t", float_format="%.6f")

    # Long form is built once and shared by the TSV export and the plot.
    df_long = prop.reset_index().melt(id_vars="MutationType", var_name="Sample", value_name="Proportion")
    df_long["Cohort"] = df_long["Sample"].map(cohort)
    df_long.to_csv(os.path.join(outdir, "proportions_long.tsv"), sep="\t", index=False, float_format="%.6f")

    # Cohort boxplot (two boxes per mutation type); nothing to draw without samples.
    if samples:
        _plot_cohort_boxplot(df_long, os.path.join(outdir, "spectra_proportions_boxplot_by_cohort.png"))
    else:
        print("No *.snv_count.txt files found in either directory; skipping boxplot.")

    # One-line totals summary between directories.
    agg_total_dir1 = cohort_totals["dir1"]
    agg_total_dir2 = cohort_totals["dir2"]
    diff = agg_total_dir2 - agg_total_dir1
    summary_line = f"Total unique SNVs: dir1={agg_total_dir1}  dir2={agg_total_dir2}  difference(dir2-dir1)={diff}"
    print(summary_line)
    with open(os.path.join(outdir, "totals_summary.txt"), "w") as fh:
        fh.write(summary_line + "\n")

    # Totals per sample.
    with open(os.path.join(outdir, "totals_comparison.tsv"), "w", newline="") as fh:
        w = csv.writer(fh, delimiter="\t")
        w.writerow(["sample_with_cohort", "cohort", "total_unique_snvs"])
        for s in samples:
            w.writerow([s, cohort[s], totals_by_sample[s]])

    print("Wrote outputs to:", outdir)
    print(" - proportions_matrix.tsv, proportions_long.tsv")
    if samples:
        print(" - spectra_proportions_boxplot_by_cohort.png")
    print(" - totals_summary.txt and totals_comparison.tsv")
    print(" - per-sample TSVs in", per_sample)



In [3]:
# Root folder holding every CellCut result set compared in this notebook.
RESULTS_ROOT = "/uufs/chpc.utah.edu/common/HIPAA/u1264408/u1264408/Git/SEMIColon/data/output/CellCut/results"

# Earlier comparison, kept as a provenance note:
#   dir1 = Hiatt_continue2Dec22025/txtfiles/   dir2 = Hiatt_okay2Dec82025/txtfiles/

# Current comparison: dir1 = "okay2" run, dir2 = "continueif2unknown" run.
in1 = f"{RESULTS_ROOT}/Hiatt_okay2Dec82025/txtfiles/"
in2 = f"{RESULTS_ROOT}/Hiatt_continueif2unknownDec82025/txtfiles"

out = f"{RESULTS_ROOT}/continueornot/"
main(in1, in2, out)


Total unique SNVs: dir1=247888  dir2=246911  difference(dir2-dir1)=-977
Wrote outputs to: /uufs/chpc.utah.edu/common/HIPAA/u1264408/u1264408/Git/SEMIColon/data/output/CellCut/results/continueornot/
 - proportions_matrix.tsv, proportions_long.tsv
 - spectra_proportions_boxplot_by_cohort.png
 - totals_summary.txt and totals_comparison.tsv
 - per-sample TSVs in /uufs/chpc.utah.edu/common/HIPAA/u1264408/u1264408/Git/SEMIColon/data/output/CellCut/results/continueornot/per_sample
