In [1]:
import glob
import os
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Directory containing the per-sample "*_snv_count.txt" tables read by the cells below.
input_dir = "/uufs/chpc.utah.edu/common/HIPAA/u1264408/u1264408/Git/SEMIColon/data/output/CellCut/results/vaf_spectra/txtfiles/"
# Directory that receives every PNG written by this notebook (created on demand with os.makedirs).
# NOTE(review): both values are absolute, user-specific paths; anyone else running
# this notebook has to edit them first -- consider deriving them from a configurable base directory.
out_png = "/uufs/chpc.utah.edu/common/HIPAA/u1264408/u1264408/Git/SEMIColon/data/output/CellCut/results/vaf_spectra/plots"

In [3]:
# Find every per-sample SNV table in input_dir.
files = glob.glob(os.path.join(input_dir, "*_snv_count.txt"))

# sample name -> VAF column (pandas Series) of that sample's table
sample_data = {}

for file in files:
    df = pd.read_csv(file, sep="\t", comment="#")
    # Only tables that actually carry a VAF column are kept; others are ignored.
    if "VAF" in df.columns:
        # Sample name is the file name with "_snv_count.txt" removed.
        sample = os.path.basename(file).replace("_snv_count.txt", "")
        sample_data[sample] = df["VAF"]

In [12]:
# Write one VAF histogram PNG per loaded sample into out_png.
os.makedirs(out_png, exist_ok=True)

for sample, vafs in sample_data.items():
    fig, ax = plt.subplots(figsize=(8, 3))
    # 30 equal-width bins over the fixed VAF range [0, 1]
    ax.hist(vafs, bins=30, range=(0, 1), alpha=0.7, color="steelblue", edgecolor="black")
    ax.set_title(f"{sample} - Unique SNV VAFs")
    ax.set_xlabel("Variant Allele Frequency (VAF)")
    ax.set_ylabel("Count")
    ax.set_xlim(0, 1)
    fig.tight_layout()

    # The sample name is embedded in the output file name.
    outfile = os.path.join(out_png, f"{sample}_vaf_hist.png")
    fig.savefig(outfile, dpi=300)
    # Close explicitly so figures do not accumulate across the loop.
    plt.close(fig)

In [4]:
## Plot VAF by mutation type
# Each key is a mutation class; its list holds the two REF>ALT spellings that
# are counted in that class (the key itself and its reverse-complement form).
# The key order is also the panel order used by the plotting functions.
mutation_map = {
    "C>A": ["C>A", "G>T"],
    "C>G": ["C>G", "G>C"],
    "C>T": ["C>T", "G>A"],
    "T>A": ["T>A", "A>T"],
    "T>C": ["T>C", "A>G"],
    "T>G": ["T>G", "A>C"],
}

def assign_class(ref, alt):
    """Return the mutation class of a REF/ALT pair, or None if it has none.

    The pair is formatted as "REF>ALT" and looked up in the member lists of
    ``mutation_map``; the key of the first list containing it is returned.
    Anything that is not one of the twelve listed spellings (for example a
    multi-base allele, identical bases or lowercase bases) yields None.
    """
    substitution = f"{ref}>{alt}"
    hits = (
        label
        for label, members in mutation_map.items()
        if substitution in members
    )
    return next(hits, None)

In [5]:
def plot_vaf_by_class(txt_file, out_dir):
    """Plot per-mutation-class VAF histograms for one sample's SNV table.

    Reads the tab-separated table ``txt_file`` (lines starting with "#" are
    ignored), labels each row with ``assign_class(REF, ALT)`` and saves a 2x3
    grid of histograms, one panel per key of ``mutation_map``, to
    ``<out_dir>/<sample>_mutation_classes.png``. ``<sample>`` is the file name
    with "_snv_count.txt" removed.

    Parameters
    ----------
    txt_file : str
        Path to a "*_snv_count.txt" table with REF, ALT and VAF columns.
    out_dir : str
        Directory for the PNG; created if it does not exist.

    Returns
    -------
    None
        A table lacking a REF, ALT or VAF column is skipped with a printed
        message (no figure is written) instead of raising KeyError, so one
        malformed file does not abort a whole directory run.
    """
    sample = os.path.basename(txt_file).replace("_snv_count.txt", "")

    # load SNVs
    df = pd.read_csv(txt_file, sep="\t", comment="#")

    missing = [col for col in ("REF", "ALT", "VAF") if col not in df.columns]
    if missing:
        print(f"Skipping {txt_file}: missing column(s) {', '.join(missing)}")
        return

    # Label rows by iterating the two columns directly: far cheaper than a
    # row-wise DataFrame.apply and well defined for a table with zero rows.
    classes = pd.Series(
        [assign_class(ref, alt) for ref, alt in zip(df["REF"], df["ALT"])],
        index=df.index,
        dtype=object,
    )

    # output path
    os.makedirs(out_dir, exist_ok=True)
    out_file = os.path.join(out_dir, f"{sample}_mutation_classes.png")

    # make 6 subplots that share both axes so the panels are directly comparable
    fig, axes = plt.subplots(2, 3, figsize=(15, 8), sharex=True, sharey=True)
    axes = axes.flatten()

    for ax, mclass in zip(axes, mutation_map.keys()):
        # rows whose class is None never match and are left out of every panel
        vafs = df.loc[classes == mclass, "VAF"]
        ax.hist(vafs, bins=30, range=(0, 1), alpha=0.7,
                color="steelblue", edgecolor="black")
        ax.set_title(f"{mclass}")
        ax.set_xlim(0, 1)
        # pin the lower limit at 0; None leaves the upper limit as auto-scaled
        ax.set_ylim(0, None)

    fig.suptitle(f"{sample} - VAF by Mutation Type", fontsize=14)
    # label only the bottom row (x) and the left column (y) of the grid
    for ax in axes[3:]:
        ax.set_xlabel("Variant Allele Frequency (VAF)")
    for ax in [axes[0], axes[3]]:
        ax.set_ylabel("Count")

    # reserve the top 5% of the figure for the suptitle
    fig.tight_layout(rect=[0, 0, 1, 0.95])
    fig.savefig(out_file, dpi=300)
    # close this specific figure so batch runs do not accumulate open figures
    plt.close(fig)

In [15]:
def process_dir(input_dir, out_dir):
    """Run ``plot_vaf_by_class`` on every "*_snv_count.txt" file in ``input_dir``.

    Each matching path is passed, together with ``out_dir``, to
    ``plot_vaf_by_class``. Returns None.
    """
    pattern = os.path.join(input_dir, "*_snv_count.txt")
    for snv_table in glob.glob(pattern):
        plot_vaf_by_class(snv_table, out_dir)

In [16]:
# Write the per-sample mutation-class figure for every SNV table in input_dir.
process_dir(input_dir=input_dir, out_dir=out_png)

That covers the sample-specific plots. The cells below pool the samples into the two cohorts (Lee and Hiatt) and plot VAF by mutation type for each cohort.

In [6]:
def plot_vaf_by_class_donors(df, cohort_name, out_dir, y_max=50000):
    """Make VAF histogram plots by mutation class for a cohort dataframe.

    Saves a 2x3 grid of histograms, one panel per key of ``mutation_map``, to
    ``<out_dir>/<cohort_name>_mutation_classes.png`` at 300 dpi.

    Parameters
    ----------
    df : pandas.DataFrame
        Pooled SNV table; must provide a "Class" column (matched against the
        keys of ``mutation_map``) and a "VAF" column.
    cohort_name : str
        Used in the figure title and in the output file name.
    out_dir : str
        Directory for the PNG; created if it does not exist.
    y_max : float or None, default 50000
        Upper y-axis limit applied to every panel. Bins taller than this are
        clipped, so raise it for larger cohorts. ``None`` keeps the upper limit
        that matplotlib auto-scaled. The default reproduces the previously
        hard-coded limit.

    Returns
    -------
    None
    """
    os.makedirs(out_dir, exist_ok=True)
    out_file = os.path.join(out_dir, f"{cohort_name}_mutation_classes.png")

    # shared axes keep the six panels directly comparable
    fig, axes = plt.subplots(2, 3, figsize=(15, 8), sharex=True, sharey=True)
    axes = axes.flatten()

    for ax, mclass in zip(axes, mutation_map.keys()):
        vafs = df.loc[df["Class"] == mclass, "VAF"]
        ax.hist(vafs, bins=30, range=(0, 1), alpha=0.7,
                color="steelblue", edgecolor="black")
        ax.set_title(f"{mclass}")
        ax.set_xlim(0, 1)
        # lower limit pinned at 0; upper limit is caller-controlled
        ax.set_ylim(0, y_max)

    fig.suptitle(f"{cohort_name} - VAF by Mutation Type", fontsize=14)
    # label only the bottom row (x) and the left column (y) of the grid
    for ax in axes[3:]:
        ax.set_xlabel("Variant Allele Frequency (VAF)")
    for ax in [axes[0], axes[3]]:
        ax.set_ylabel("Count")

    # reserve the top 5% of the figure for the suptitle
    fig.tight_layout(rect=[0, 0, 1, 0.95])
    fig.savefig(out_file, dpi=300)
    # close this specific figure rather than whichever one is current
    plt.close(fig)


def process_dir_donors(input_dir, out_dir, lee_donors, hiatt_donors):
    """Pool per-sample SNV tables into the Lee and Hiatt cohorts and plot each.

    Every "*_snv_count.txt" file in ``input_dir`` is assigned to a cohort by
    its file name, read, labelled with ``assign_class(REF, ALT)`` in a new
    "Class" column, and concatenated with the other tables of that cohort.
    Each non-empty cohort is then passed to ``plot_vaf_by_class_donors`` under
    the name "Lee_cohort" or "Hiatt_cohort".

    Parameters
    ----------
    input_dir : str
        Directory searched for "*_snv_count.txt" files.
    out_dir : str
        Output directory forwarded to ``plot_vaf_by_class_donors``.
    lee_donors, hiatt_donors : iterable of str
        Donor tags. A file belongs to a cohort when its base name contains any
        of that cohort's tags; Lee is checked first and files matching neither
        cohort are ignored.

    Returns
    -------
    None
        Tables lacking a REF, ALT or VAF column are skipped with a printed
        message instead of raising KeyError, so one malformed file does not
        abort the whole cohort run.
    """
    # NOTE(review): tags are matched as plain substrings anywhere in the file
    # name, so short tags such as "AS" or "PD" can also match unrelated sample
    # names -- confirm against the actual naming scheme.
    required = ("REF", "ALT", "VAF")
    cohort_frames = {"Lee": [], "Hiatt": []}  # insertion order = plotting order

    # sorted(): glob's ordering depends on the file system; sorting keeps the
    # pooled row order reproducible between runs.
    for f in sorted(glob.glob(os.path.join(input_dir, "*_snv_count.txt"))):
        fname = os.path.basename(f)

        # classify file based on donor strings in filename
        if any(tag in fname for tag in lee_donors):
            cohort = "Lee"
        elif any(tag in fname for tag in hiatt_donors):
            cohort = "Hiatt"
        else:
            continue  # skip files not matching either cohort

        df = pd.read_csv(f, sep="\t", comment="#")

        missing = [col for col in required if col not in df.columns]
        if missing:
            print(f"Skipping {f}: missing column(s) {', '.join(missing)}")
            continue

        # Iterate the two columns directly: far cheaper than a row-wise
        # DataFrame.apply and well defined for a table with zero rows.
        df["Class"] = [assign_class(ref, alt) for ref, alt in zip(df["REF"], df["ALT"])]
        cohort_frames[cohort].append(df)

    # concatenate and plot each cohort that received at least one table
    for cohort, frames in cohort_frames.items():
        if frames:
            cohort_df = pd.concat(frames, ignore_index=True)
            plot_vaf_by_class_donors(cohort_df, f"{cohort}_cohort", out_dir)


In [7]:
# Donor tags for each cohort: a file is assigned to a cohort when its name
# contains any of that cohort's tags (the Lee tags are checked first).
lee_donors = ["PD", "HLS"]
hiatt_donors = [
    "AS", "AC", "DC", "DE", "CE", "RE", "TR", "SI", "TC", "RTM", "Laurel",
]

# Pool the per-sample tables by cohort and write one figure per cohort.
process_dir_donors(
    input_dir=input_dir,
    out_dir=out_png,
    lee_donors=lee_donors,
    hiatt_donors=hiatt_donors,
)

