In [1]:
import yaml
import pandas as pd
import os
import re

## First, we deal with our actual dataset. 

In [2]:
# Pipeline configuration: sample/donor lists plus the per-donor lookup tables
# consumed by the cells below.
# NOTE(review): absolute cluster paths are hard-coded throughout this notebook;
# consider hoisting them into a single configurable base directory.
with open("/uufs/chpc.utah.edu/common/HIPAA/u1264408/u1264408/Git/SEMIColon/data/config/snakemake_config/CCconfig.yaml") as f:
    config = yaml.safe_load(f)

# Extract
samples: list[str] = config["samples"]
donors: list[str] = config["donors"]
# The three tables below are iterated with .items() later on, so they must be mappings:
matches: dict = config["matches"]  # donor -> mapping that may hold a "crypt_samples" list
regions: dict = config["regions"]  # donor -> {region -> iterable of crypt samples}
cohorts: dict = config["cohorts"]  # cohort -> iterable of donors

# The cohort file is parsed as YAML as well; note that `config` is re-bound here
# and no longer refers to the pipeline configuration after this point.
with open("/uufs/chpc.utah.edu/common/HIPAA/u1264408/u1264408/Git/SEMIColon/data/output/CellCut/cohort.txt") as f:
    config = yaml.safe_load(f)
# Passed to Series.map() as a donor -> age lookup, so this is a mapping, not a list.
ages: dict = config["ages"]

In [3]:
# Flatten the per-donor "crypt_samples" lists into (donor, crypt_sample) pairs;
# donors without that key contribute no rows.
rows = [
    (donor, sample)
    for donor, info in matches.items()
    for sample in info.get("crypt_samples", [])
]

# One row per crypt sample, with the donor's age looked up from the `ages` mapping
df = pd.DataFrame(rows, columns=["donor", "crypt_sample"]).assign(
    age=lambda frame: frame["donor"].map(ages)
)

In [4]:
# --- Regions → donor/crypt_sample/region ---
# `regions` is walked as donor -> {region -> iterable of crypt samples}.
# The inner loop variable is deliberately NOT called `samples`: that name holds
# the sample list loaded from the config and must not be overwritten here.
rows = []
for donor, region_dict in regions.items():
    for region, region_samples in region_dict.items():
        for sample in region_samples:
            rows.append((donor, sample, region))

df_regions = pd.DataFrame(rows, columns=["donor", "crypt_sample", "region"])

# Merge regions (left join: samples without a region entry keep NaN)
df = df.merge(df_regions, on=["donor", "crypt_sample"], how="left")

In [5]:
# --- Cohorts → donor/cohort ---
# `cohorts` is walked as cohort -> iterable of donors.
# The loop variable is deliberately NOT called `donors`: that name holds the
# donor list loaded from the config and must not be overwritten here.
rows = []
for cohort, cohort_donors in cohorts.items():
    for donor in cohort_donors:
        rows.append((donor, cohort))

df_cohorts = pd.DataFrame(rows, columns=["donor", "cohort"])

# Merge cohorts (left join: donors not listed in any cohort keep NaN)
df = df.merge(df_cohorts, on="donor", how="left")

In [6]:
# --- mosdepth reports → donor/crypt_sample/coverage ---
mosdepth_dir = "/uufs/chpc.utah.edu/common/HIPAA/u1264408/u1264408/Git/SEMIColon/data/output/CellCut/mosdepth"  # top-level directory

# A trace label in the report looks like  "name": "<sample> (<coverage>)".
# The sample group stops at a double quote so a label without a coverage suffix
# can never swallow the following trace.
trace_label = re.compile(r'"name":\s*"([^"]+?)\s+\(([\d\.]+)\)"')

coverage_rows = []

# Iterate over donor subdirectories
for donor in os.listdir(mosdepth_dir):
    donor_path = os.path.join(mosdepth_dir, donor)
    if not os.path.isdir(donor_path):
        continue
    # Look for HTML files
    for file_name in os.listdir(donor_path):
        if not file_name.endswith("_mosdepth_coverage.html"):
            continue
        file_path = os.path.join(donor_path, file_name)
        with open(file_path) as f:
            for line in f:
                if not line.strip().startswith("Plotly.newPlot('plot-div-total'"):
                    continue
                # Record every trace on the line: a report that plots several
                # samples carries one label per sample, not just the first.
                for match in trace_label.finditer(line):
                    coverage_rows.append((donor, match.group(1), float(match.group(2))))
                break  # only first matching line

# Create DataFrame; identical rows (same sample reported by several files) are
# collapsed so the left merge below cannot multiply rows of the main table.
df_coverage = pd.DataFrame(
    coverage_rows, columns=["donor", "crypt_sample", "coverage"]
).drop_duplicates()

df = df.merge(df_coverage, on=["donor", "crypt_sample"], how="left")

In [7]:
# --- SNV count files → donor/crypt_sample/unique_SNVs ---
results_dir = "/uufs/chpc.utah.edu/common/HIPAA/u1264408/u1264408/Git/SEMIColon/data/output/CellCut/results"

snv_rows = []

# Every sub-directory of results_dir is treated as a donor
for donor in os.listdir(results_dir):
    donor_path = os.path.join(results_dir, donor)
    if os.path.isdir(donor_path):
        for file_name in os.listdir(donor_path):
            if not file_name.endswith("_snv_count.txt"):
                continue
            # The sample name is the file name minus the fixed suffix
            sample_name = file_name.replace("_snv_count.txt", "")
            with open(os.path.join(donor_path, file_name)) as f:
                lines = f.readlines()
            if not lines:
                continue  # empty file: nothing to report
            # The total is read from the final line only,
            # expected as "# Total: ##### unique SNVs"
            match = re.search(r"# Total:\s+(\d+)\s+unique SNVs", lines[-1].strip())
            if match:
                snv_rows.append((donor, sample_name, int(match.group(1))))

# Create DataFrame
df_snv = pd.DataFrame(snv_rows, columns=["donor", "crypt_sample", "unique_SNVs"])

# Merge with main df (samples without a parsable total keep NaN)
df = df.merge(df_snv, on=["donor", "crypt_sample"], how="left")

In [8]:
# Donors present in the assembled table; the mutation-spectrum loop scans one
# results sub-folder per entry.
donors_to_process = df["donor"].unique()

# Canonical mutation map (collapsed).
# Each key is a substitution whose reference base is C or T; its list holds the
# substitutions counted under that key: the key itself plus the same change
# with both bases complemented (e.g. G>T is counted as C>A).
mutation_map = {
    "C>A": ["C>A", "G>T"],
    "C>G": ["C>G", "G>C"],
    "C>T": ["C>T", "G>A"],
    "T>A": ["T>A", "A>T"],
    "T>C": ["T>C", "A>G"],
    "T>G": ["T>G", "A>C"],
}

In [9]:
def assign_class_and_cpg(ref, alt):
    """Classify one substitution and report whether it carries a CpG tag.

    Parameters
    ----------
    ref : reference value; converted with ``str()``. A value starting with
        "CpG" is read as reference base C, one starting with "GpC" as
        reference base G; both set the CpG flag. Anything else is used as-is.
    alt : alternate base, formatted into the lookup key "<ref>><alt>".

    Returns
    -------
    tuple
        ``(mut_class, is_cpg)`` where ``mut_class`` is the key of the
        module-level ``mutation_map`` whose list contains the substitution,
        or ``None`` when no entry matches.
    """
    ref_text = str(ref)  # tolerate non-string input (e.g. NaN)

    # Tagged references encode the base in their prefix; "CpG" is tested first.
    for prefix, base in (("CpG", "C"), ("GpC", "G")):
        if ref_text.startswith(prefix):
            ref_base, is_cpg = base, True
            break
    else:
        ref_base, is_cpg = ref_text, False

    change = f"{ref_base}>{alt}"
    # First canonical class (in map order) that lists this change, else None
    mut_class = next(
        (canonical for canonical, variants in mutation_map.items() if change in variants),
        None,
    )
    return mut_class, is_cpg

In [10]:
# Same path as `results_dir`; scanned as <snv_dir>/<donor>/*_snv_count.txt
snv_dir = "/uufs/chpc.utah.edu/common/HIPAA/u1264408/u1264408/Git/SEMIColon/data/output/CellCut/results"
# Accumulator: one dict of per-class counts is appended per SNV file
mutation_counts = []

In [11]:
# --- Per-sample mutation spectrum → one count column per class, plus CpG ---
# Start from an empty accumulator inside this cell, so re-running the cell
# cannot append every sample a second time and duplicate rows in the merge.
mutation_counts = []

for donor in donors_to_process:
    donor_path = os.path.join(snv_dir, donor)
    if not os.path.isdir(donor_path):
        continue
    for file_name in os.listdir(donor_path):
        if not file_name.endswith("_snv_count.txt"):
            continue
        sample_name = file_name.replace("_snv_count.txt", "")
        file_path = os.path.join(donor_path, file_name)

        # Kept under its own name so the per-sample totals table `df_snv`
        # is not overwritten by a per-file variant table.
        variants = pd.read_csv(
            file_path,
            sep="\t",
            comment="#",  # skips '#'-prefixed lines such as the "# Total:" trailer
            usecols=["REF", "ALT", "VAF"]
        )

        # Assign mutation class and CpG flag, one (class, flag) pair per variant.
        # Zipping the two columns avoids a slow row-wise DataFrame.apply.
        classified = [
            assign_class_and_cpg(ref, alt)
            for ref, alt in zip(variants["REF"], variants["ALT"])
        ]
        variants["mut_class"] = pd.Series(
            [mut_class for mut_class, _ in classified], index=variants.index, dtype=object
        )
        variants["is_CpG"] = pd.Series(
            [is_cpg for _, is_cpg in classified], index=variants.index, dtype=bool
        )

        # Count frequencies (variants whose class is None are not counted)
        counts = variants["mut_class"].value_counts().to_dict()
        counts["CpG"] = variants["is_CpG"].sum()
        counts["donor"] = donor
        counts["crypt_sample"] = sample_name

        mutation_counts.append(counts)

# Convert to DataFrame; a class absent from a sample becomes 0 rather than NaN
df_mutation = pd.DataFrame(mutation_counts).fillna(0)

# Merge with your main df
df = df.merge(df_mutation, on=["donor", "crypt_sample"], how="left")

## Next, let's do the Lee-Six data

In [2]:
# Raw Lee-Six cohort table (tab-separated)
leesix = pd.read_csv("/uufs/chpc.utah.edu/common/HIPAA/u1264408/u1264408/Git/SEMIColon/data/output/CellCut/LeeSixcohort.txt", sep="\t")

# Drop the title/file/accession columns, then collapse the rows that have
# become identical once those columns are gone.
accession_columns = ['sample_title', 'sample_accession_id', 'file_name', 'file_accession_id', 'accession_id']
leesix_clean = leesix.drop(columns=accession_columns).drop_duplicates()

In [3]:
# Test configuration. NOTE: this re-binds `config` and `samples`, replacing any
# values loaded from another configuration file earlier in the session.
with open("/uufs/chpc.utah.edu/common/HIPAA/u1264408/u1264408/Git/SEMIColon/data/config/test_config/testconfig.yaml") as f:
    config = yaml.safe_load(f)

# Keep only the rows whose subject_id is listed under "samples" in the config
samples: list[str] = config["samples"]
mask_to_keep = leesix_clean['subject_id'].isin(samples)
leesix_clean = leesix_clean.loc[mask_to_keep]



In [4]:
# Clinical table for the Lee-Six cohort
leesix_clinical = pd.read_csv("/uufs/chpc.utah.edu/common/HIPAA/u1264408/u1264408/Git/SEMIColon/data/output/CellCut/LeeSixclinical.csv")

# Keep only the patients listed under "donors" in the currently loaded `config`
# (the test configuration read in the preceding cell).
donors: list[str] = config["donors"]
mask_to_keep = leesix_clinical['patient'].isin(donors)
leesix_clinical = leesix_clinical.loc[mask_to_keep]

In [5]:
# Outer join keeps rows found in only one of the two tables;
# the columns coming from the other table are NaN for those rows.
leesix_full = leesix_clean.merge(
    leesix_clinical,
    how='outer',
    on=['patient', 'sex', 'age'],
)

In [6]:
# --- mosdepth reports → patient/subject_id/coverage ---
mosdepth_dir = "/uufs/chpc.utah.edu/common/HIPAA/u1264408/u1264408/Git/SEMIColon/data/output/CellCut/mosdepth"  # top-level directory

# A trace label in the report looks like  "name": "<sample> (<coverage>)".
# The sample group stops at a double quote so a label without a coverage suffix
# can never swallow the following trace.
trace_label = re.compile(r'"name":\s*"([^"]+?)\s+\(([\d\.]+)\)"')

coverage_rows = []

# Iterate over donor subdirectories
for donor in os.listdir(mosdepth_dir):
    donor_path = os.path.join(mosdepth_dir, donor)
    if not os.path.isdir(donor_path):
        continue
    # Look for HTML files
    for file_name in os.listdir(donor_path):
        if not file_name.endswith("_mosdepth_coverage.html"):
            continue
        file_path = os.path.join(donor_path, file_name)
        with open(file_path) as f:
            for line in f:
                if not line.strip().startswith("Plotly.newPlot('plot-div-total'"):
                    continue
                # Record every trace on the line: a report that plots several
                # samples carries one label per sample, not just the first.
                for match in trace_label.finditer(line):
                    coverage_rows.append((donor, match.group(1), float(match.group(2))))
                break  # only first matching line

# Create DataFrame; identical rows (same sample reported by several files) are
# collapsed so the left merge below cannot multiply rows of the main table.
df_coverage = pd.DataFrame(
    coverage_rows, columns=["patient", "subject_id", "coverage"]
).drop_duplicates()

leesix_full = leesix_full.merge(df_coverage, on=["patient", "subject_id"], how="left")

In [7]:
# --- SNV count files → patient/subject_id/unique_SNVs ---
results_dir = "/uufs/chpc.utah.edu/common/HIPAA/u1264408/u1264408/Git/SEMIColon/data/output/CellCut/results"

snv_rows = []

# Only the donors held in `donors` are visited (not every folder in results_dir)
for donor in donors:
    donor_path = os.path.join(results_dir, donor)
    if os.path.isdir(donor_path):
        for file_name in os.listdir(donor_path):
            if not file_name.endswith("_snv_count.txt"):
                continue
            # The sample name is the file name minus the fixed suffix
            sample_name = file_name.replace("_snv_count.txt", "")
            with open(os.path.join(donor_path, file_name)) as f:
                lines = f.readlines()
            if not lines:
                continue  # empty file: nothing to report
            # The total is read from the final line only; here the pattern does
            # not require a leading "# " before "Total:".
            # TODO (original author's note): will need to change when the VAFs
            # are updated; currently run for the unvariant called Lee-Six donors.
            match = re.search(r"Total:\s+(\d+)\s+unique SNVs", lines[-1].strip())
            if match:
                snv_rows.append((donor, sample_name, int(match.group(1))))

# Create DataFrame
df_snv = pd.DataFrame(snv_rows, columns=["patient", "subject_id", "unique_SNVs"])

# Merge with main df (samples without a parsable total keep NaN)
leesix_full = leesix_full.merge(df_snv, on=["patient", "subject_id"], how="left")

In [8]:
# Patients present in the Lee-Six table; the mutation-spectrum loop scans one
# results sub-folder per entry.
donors_to_process = leesix_full["patient"].unique()

# Canonical mutation map (collapsed).
# Each key is a substitution whose reference base is C or T; its list holds the
# substitutions counted under that key: the key itself plus the same change
# with both bases complemented (e.g. G>T is counted as C>A).
mutation_map = {
    "C>A": ["C>A", "G>T"],
    "C>G": ["C>G", "G>C"],
    "C>T": ["C>T", "G>A"],
    "T>A": ["T>A", "A>T"],
    "T>C": ["T>C", "A>G"],
    "T>G": ["T>G", "A>C"],
}

def assign_class_and_cpg(ref, alt):
    """Classify one substitution and report whether it carries a CpG tag.

    Parameters
    ----------
    ref : reference value; converted with ``str()``. A value starting with
        "CpG" is read as reference base C, one starting with "GpC" as
        reference base G; both set the CpG flag. Anything else is used as-is.
    alt : alternate base, formatted into the lookup key "<ref>><alt>".

    Returns
    -------
    tuple
        ``(mut_class, is_cpg)`` where ``mut_class`` is the key of the
        module-level ``mutation_map`` whose list contains the substitution,
        or ``None`` when no entry matches.
    """
    ref_text = str(ref)  # tolerate non-string input (e.g. NaN)

    # Tagged references encode the base in their prefix; "CpG" is tested first.
    for prefix, base in (("CpG", "C"), ("GpC", "G")):
        if ref_text.startswith(prefix):
            ref_base, is_cpg = base, True
            break
    else:
        ref_base, is_cpg = ref_text, False

    change = f"{ref_base}>{alt}"
    # First canonical class (in map order) that lists this change, else None
    mut_class = next(
        (canonical for canonical, variants in mutation_map.items() if change in variants),
        None,
    )
    return mut_class, is_cpg

# Same path as `results_dir`; scanned as <snv_dir>/<donor>/*_snv_count.txt
snv_dir = "/uufs/chpc.utah.edu/common/HIPAA/u1264408/u1264408/Git/SEMIColon/data/output/CellCut/results"
# Accumulator: one dict of per-class counts is appended per SNV file
mutation_counts = []

In [9]:
# --- Per-sample mutation spectrum → one count column per class, plus CpG ---
# Start from an empty accumulator inside this cell, so re-running the cell
# cannot append every sample a second time and duplicate rows in the merge.
mutation_counts = []

for donor in donors_to_process:
    donor_path = os.path.join(snv_dir, donor)
    if not os.path.isdir(donor_path):
        continue
    for file_name in os.listdir(donor_path):
        if not file_name.endswith("_snv_count.txt"):
            continue
        sample_name = file_name.replace("_snv_count.txt", "")
        file_path = os.path.join(donor_path, file_name)

        # Kept under its own name so the per-sample totals table `df_snv`
        # is not overwritten by a per-file variant table.
        variants = pd.read_csv(
            file_path,
            sep="\t",
            comment="#",  # skips '#'-prefixed lines
            usecols=["REF", "ALT", "VAF"]
        )

        # Assign mutation class and CpG flag, one (class, flag) pair per variant.
        # Zipping the two columns avoids a slow row-wise DataFrame.apply.
        classified = [
            assign_class_and_cpg(ref, alt)
            for ref, alt in zip(variants["REF"], variants["ALT"])
        ]
        variants["mut_class"] = pd.Series(
            [mut_class for mut_class, _ in classified], index=variants.index, dtype=object
        )
        variants["is_CpG"] = pd.Series(
            [is_cpg for _, is_cpg in classified], index=variants.index, dtype=bool
        )

        # Count frequencies (variants whose class is None are not counted)
        counts = variants["mut_class"].value_counts().to_dict()
        counts["CpG"] = variants["is_CpG"].sum()
        counts["patient"] = donor
        counts["subject_id"] = sample_name

        mutation_counts.append(counts)

# Convert to DataFrame; a class absent from a sample becomes 0 rather than NaN
df_mutation = pd.DataFrame(mutation_counts).fillna(0)

# Merge with your main df
leesix_full = leesix_full.merge(df_mutation, on=["patient", "subject_id"], how="left")

In [None]:
# Write both assembled tables to CSV.
# index=False prevents writing the DataFrame index to the CSV.
for frame, out_path in (
    (df, '/uufs/chpc.utah.edu/common/HIPAA/u1264408/u1264408/Git/SEMIColon/data/output/CellCut/Hiattdataframe.csv'),
    (leesix_full, '/uufs/chpc.utah.edu/common/HIPAA/u1264408/u1264408/Git/SEMIColon/data/output/CellCut/LeeSixdataframe.csv'),
):
    frame.to_csv(out_path, index=False)