In [1]:
import yaml
import pandas as pd
import os
import re

## First, we deal with our actual dataset. 

In [3]:
# Pipeline configuration (YAML): sample/donor lists plus the donor-keyed lookup tables.
with open("/uufs/chpc.utah.edu/common/HIPAA/u1264408/u1264408/Git/SEMIColon/data/config/snakemake_config/CCconfig.yaml") as f:
    config = yaml.safe_load(f)

# Pull out the sections that the later cells iterate over.
samples: list[str] = config["samples"]
donors: list[str] = config["donors"]
matches = config["matches"]
regions = config["regions"]
cohorts = config["cohorts"]

# Lee-Six sample sheet: drop the title/file/accession columns, collapse the rows
# that become identical as a result, then keep only subjects listed in `samples`.
leesix = pd.read_csv("/uufs/chpc.utah.edu/common/HIPAA/u1264408/u1264408/Git/SEMIColon/data/output/CellCut/LeeSixcohort.txt", sep="\t")
leesix_clean = (
    leesix
    .drop(columns=['sample_title', 'sample_accession_id', 'file_name', 'file_accession_id', 'accession_id'])
    .drop_duplicates()
)
mask_to_keep = leesix_clean['subject_id'].isin(samples)
leesix_clean = leesix_clean.loc[mask_to_keep]

# cohort.txt is parsed as YAML too. NOTE: `config` is rebound here, so from this
# point on it holds the cohort file contents, not the Snakemake configuration.
with open("/uufs/chpc.utah.edu/common/HIPAA/u1264408/u1264408/Git/SEMIColon/data/output/CellCut/cohort.txt") as f:
    config = yaml.safe_load(f)
# Passed to Series.map as a donor lookup in the next cell, so presumably a
# donor -> age(s) mapping — verify against cohort.txt.
ages: dict = config["ages"]

In [4]:
# One (donor, crypt_sample) row for every crypt listed under a donor in `matches`;
# donors without a "crypt_samples" entry contribute no rows.
rows = [
    (donor, sample)
    for donor, info in matches.items()
    for sample in info.get("crypt_samples", [])
]

df = pd.DataFrame(rows, columns=["donor", "crypt_sample"])

# Exclude donors that appear as Lee-Six patients. The explicit .copy() makes the
# filtered frame an independent object, so the column assignment below does not
# trigger pandas' SettingWithCopyWarning.
df = df[~df["donor"].isin(leesix_clean["patient"])].copy()

# Add age directly from dict; a single-element list is unwrapped to its only value,
# anything else (scalar, longer list, missing) is kept as returned by the lookup.
df["age"] = (
    df["donor"]
    .map(ages)
    .apply(lambda x: x[0] if isinstance(x, list) and len(x) == 1 else x)
)

In [5]:
# --- Regions → donor/crypt_sample/region ---
# Flatten the nested {donor: {region: [crypt samples]}} structure into one row per
# sample. The innermost list is bound to `region_samples` (not `samples`) so that
# the config-level `samples` list loaded earlier is not overwritten by this cell.
rows = [
    (donor, sample, region)
    for donor, region_dict in regions.items()
    for region, region_samples in region_dict.items()
    for sample in region_samples
]

df_regions = pd.DataFrame(rows, columns=["donor", "crypt_sample", "region"])

# Merge regions; samples without a region entry keep NaN in `region` (left join).
df = df.merge(df_regions, on=["donor", "crypt_sample"], how="left")

In [6]:
# --- Cohorts → donor/cohort ---
# Invert {cohort: [donors]} into one (donor, cohort) row per membership. The list
# is bound to `cohort_donors` (not `donors`) so that the config-level `donors`
# list loaded earlier is not overwritten by this cell.
rows = [
    (donor, cohort)
    for cohort, cohort_donors in cohorts.items()
    for donor in cohort_donors
]

df_cohorts = pd.DataFrame(rows, columns=["donor", "cohort"])

# Merge cohorts; donors that are in no cohort keep NaN in `cohort` (left join).
df = df.merge(df_cohorts, on="donor", how="left")

In [7]:
mosdepth_dir = "/uufs/chpc.utah.edu/common/HIPAA/u1264408/u1264408/Git/SEMIColon/data/output/CellCut/mosdepth"  # top-level directory

# Each report embeds its coverage figure as a Plotly call. On that line, the first
# `"name"` entry of the form "<sample> (<number>)" supplies sample and coverage.
plot_marker = "Plotly.newPlot('plot-div-total'"
trace_name_re = re.compile(r'"name":\s*"(.+?)\s+\(([\d\.]+)\)"')

coverage_rows = []

# Walk the subdirectories of mosdepth_dir (their names are used as donor IDs);
# anything at the top level that is not a directory is ignored.
for donor in os.listdir(mosdepth_dir):
    donor_path = os.path.join(mosdepth_dir, donor)
    if os.path.isdir(donor_path):
        reports = [
            name for name in os.listdir(donor_path)
            if name.endswith("_mosdepth_coverage.html")
        ]
        for report in reports:
            with open(os.path.join(donor_path, report)) as handle:
                # Stop reading at the first line that carries the plot call.
                plot_line = next(
                    (text for text in handle if text.strip().startswith(plot_marker)),
                    None,
                )
            hit = trace_name_re.search(plot_line) if plot_line is not None else None
            if hit:
                coverage_rows.append((donor, hit.group(1), float(hit.group(2))))

# Collect the parsed values and attach them to the main table (left join, so
# samples without a parsed report keep NaN coverage).
df_coverage = pd.DataFrame(coverage_rows, columns=["donor", "crypt_sample", "coverage"])

df = df.merge(df_coverage, on=["donor", "crypt_sample"], how="left")

In [8]:
# SNV tables are read directly from results_dir (flat layout, no donor subfolders);
# each file is expected to be named "<crypt_sample>.snvs_merged.tsv".
results_dir = "/uufs/chpc.utah.edu/common/HIPAA/u1264408/u1264408/Git/SEMIColon/data/output/CellCut/overlap_variants/txtfiles_all/snvs"
snv_suffix = ".snvs_merged.tsv"

snv_rows = []

for file_name in os.listdir(results_dir):
    # Filter on the full suffix. Filtering on ".tsv" alone let unrelated .tsv files
    # through with their whole file name as the sample ID, which can never match
    # a crypt_sample in the merge below.
    if not file_name.endswith(snv_suffix):
        continue
    # Strip only the trailing suffix (str.replace would also remove an occurrence
    # in the middle of the name).
    crypt_sample = file_name[: -len(snv_suffix)]
    file_path = os.path.join(results_dir, file_name)

    with open(file_path) as f:
        lines = [line.strip() for line in f if line.strip()]

    total_snvs = None

    # Prefer the summary written on the last non-blank line, if it is there.
    if lines:
        match = re.search(r"# Total:\s+(\d+)\s+unique SNVs", lines[-1])
        if match:
            total_snvs = int(match.group(1))

    # Fallback: count lines starting with "chr".
    # NOTE(review): a header line whose first column name begins with "chr" would
    # be counted as well — confirm against the file format.
    if total_snvs is None:
        total_snvs = sum(1 for line in lines if line.startswith("chr"))

    snv_rows.append((crypt_sample, total_snvs))

# Create DataFrame
df_snv = pd.DataFrame(snv_rows, columns=["crypt_sample", "unique_SNVs"])

# Merge with main df on crypt_sample only
df = df.merge(df_snv, on="crypt_sample", how="left")


In [9]:
# Distinct donor IDs present in the assembled table.
# NOTE(review): no cell visible in this notebook reads `donors_to_process`
# afterwards — confirm whether it is still needed.
donors_to_process = df["donor"].unique()

# Canonical mutation map (collapsed): each pyrimidine-referenced class lists itself
# followed by its strand complement (e.g. "C>A" also covers "G>T").
_COMPLEMENT = str.maketrans("ACGT", "TGCA")
mutation_map = {
    canonical: [canonical, canonical.translate(_COMPLEMENT)]
    for canonical in ("C>A", "C>G", "C>T", "T>A", "T>C", "T>G")
}

In [10]:
def assign_class_and_cpg(ref, alt):
    """Classify one substitution into a collapsed mutation class.

    Parameters
    ----------
    ref : reference allele; converted with ``str``. A value starting with
        "CpG" is treated as reference base "C" and one starting with "GpC" as
        reference base "G"; both are flagged as CpG. Any other value is used
        verbatim as the reference base.
    alt : alternate allele, interpolated as-is into the "<ref>><alt>" string.

    Returns
    -------
    tuple
        ``(mut_class, is_cpg)``. ``mut_class`` is the first key of the
        module-level ``mutation_map`` whose list contains "<ref>><alt>", or
        ``None`` when no list contains it. ``is_cpg`` is ``True`` only for the
        two prefixed reference forms.
    """
    ref_text = str(ref)  # non-string input (e.g. NaN) simply fails to classify

    # The two prefixes are mutually exclusive, so the first hit decides.
    for prefix, base in (("CpG", "C"), ("GpC", "G")):
        if ref_text.startswith(prefix):
            ref_base, is_cpg = base, True
            break
    else:
        ref_base, is_cpg = ref_text, False

    mut = f"{ref_base}>{alt}"
    mut_class = next(
        (mclass for mclass, members in mutation_map.items() if mut in members),
        None,
    )
    return mut_class, is_cpg

In [11]:
# Accumulator: one dict of per-class counts is appended for every SNV file.
mutation_counts = list()

# Directory holding the SNV tables when they are stored in one flat folder
# rather than in per-donor directories.
snv_dir = "/uufs/chpc.utah.edu/common/HIPAA/u1264408/u1264408/Git/SEMIColon/data/output/CellCut/overlap_variants/txtfiles_all/snvs"

In [12]:
# IF NOT IN DONOR DIRECTORIES
# The SNV tables are read directly from snv_dir; each file is expected to be named
# "<crypt_sample>.snvs_merged.tsv".
snv_suffix = ".snvs_merged.tsv"

# Start from an empty accumulator inside this cell, so re-running the cell cannot
# append every sample a second time (which would duplicate rows in the merge below).
mutation_counts = []

for file_name in os.listdir(snv_dir):
    # Filter on the full suffix: other .tsv files would otherwise be parsed and
    # recorded under their whole file name.
    if not file_name.endswith(snv_suffix):
        continue
    crypt_sample = file_name[: -len(snv_suffix)]
    file_path = os.path.join(snv_dir, file_name)

    # Read SNV table (excluding comment rows). A separate name is used so the
    # per-sample totals table `df_snv` built earlier is not overwritten.
    sample_snvs = pd.read_csv(
        file_path,
        sep="\t",
        comment="#",
        usecols=["REF", "ALT", "VAF_DS"]
    )

    # Assign mutation class + CpG flag per row. A plain list keeps this working
    # for a header-only table, where row-wise DataFrame.apply does not return
    # a Series of tuples.
    classified = [
        assign_class_and_cpg(ref, alt)
        for ref, alt in zip(sample_snvs["REF"], sample_snvs["ALT"])
    ]

    # Count mutations by class; unclassified rows (None) are dropped by value_counts.
    counts = (
        pd.Series([mut_class for mut_class, _ in classified], dtype=object)
        .value_counts()
        .to_dict()
    )
    counts["CpG"] = sum(is_cpg for _, is_cpg in classified)

    # Add sample ID
    counts["crypt_sample"] = crypt_sample

    mutation_counts.append(counts)

# Convert to wide-format DataFrame; a class absent from a sample becomes 0.
df_mutation = pd.DataFrame(mutation_counts).fillna(0)

# Merge with main df on crypt_sample only
df = df.merge(df_mutation, on="crypt_sample", how="left")

## Next, let's do the Lee-Six data

In [16]:
# Clinical table for the Lee-Six cohort (comma-separated, default parser settings).
leesix_clinical_path = "/uufs/chpc.utah.edu/common/HIPAA/u1264408/u1264408/Git/SEMIColon/data/output/CellCut/LeeSixclinical.csv"
leesix_clinical = pd.read_csv(leesix_clinical_path)

In [17]:
# Outer join on the shared keys: rows present on only one side are kept, with NaN
# in the columns that come from the other table.
leesix_full = leesix_clean.merge(
    leesix_clinical,
    how="outer",
    on=["patient", "sex", "age"],
)

In [19]:
mosdepth_dir = "/uufs/chpc.utah.edu/common/HIPAA/u1264408/u1264408/Git/SEMIColon/data/output/CellCut/mosdepth"  # top-level directory

# Each report embeds its coverage figure as a Plotly call. On that line, the first
# `"name"` entry of the form "<sample> (<number>)" supplies sample and coverage.
plot_marker = "Plotly.newPlot('plot-div-total'"
trace_name_re = re.compile(r'"name":\s*"(.+?)\s+\(([\d\.]+)\)"')

coverage_rows = []

# Walk the subdirectories of mosdepth_dir; the directory name itself is not
# recorded here because the Lee-Six table is keyed by sample (subject_id) only.
for donor in os.listdir(mosdepth_dir):
    donor_path = os.path.join(mosdepth_dir, donor)
    if os.path.isdir(donor_path):
        reports = [
            name for name in os.listdir(donor_path)
            if name.endswith("_mosdepth_coverage.html")
        ]
        for report in reports:
            with open(os.path.join(donor_path, report)) as handle:
                # Stop reading at the first line that carries the plot call.
                plot_line = next(
                    (text for text in handle if text.strip().startswith(plot_marker)),
                    None,
                )
            hit = trace_name_re.search(plot_line) if plot_line is not None else None
            if hit:
                coverage_rows.append((hit.group(1), float(hit.group(2))))

# Collect the parsed values and attach them to the Lee-Six table (left join, so
# subjects without a parsed report keep NaN coverage).
df_coverage = pd.DataFrame(coverage_rows, columns=["subject_id", "coverage"])

leesix_full = leesix_full.merge(df_coverage, on=["subject_id"], how="left")

In [20]:
# SNV tables are read directly from results_dir (flat layout, no donor subfolders);
# each file is expected to be named "<subject_id>.snvs_merged.tsv".
results_dir = "/uufs/chpc.utah.edu/common/HIPAA/u1264408/u1264408/Git/SEMIColon/data/output/CellCut/overlap_variants/txtfiles_all/snvs"
snv_suffix = ".snvs_merged.tsv"

snv_rows = []

# Iterate over all files directly in results_dir
for file_name in os.listdir(results_dir):
    # Filter on the full suffix. Filtering on ".tsv" alone let unrelated .tsv files
    # through with their whole file name as the sample ID, which can never match
    # a subject_id in the merge below.
    if not file_name.endswith(snv_suffix):
        continue
    # Strip only the trailing suffix (str.replace would also remove an occurrence
    # in the middle of the name).
    crypt_sample = file_name[: -len(snv_suffix)]
    file_path = os.path.join(results_dir, file_name)

    with open(file_path) as f:
        lines = [line.strip() for line in f if line.strip()]

    total_snvs = None

    # Prefer the summary written on the last non-blank line, if it is there.
    if lines:
        match = re.search(r"# Total:\s+(\d+)\s+unique SNVs", lines[-1])
        if match:
            total_snvs = int(match.group(1))

    # Fallback: count lines starting with "chr".
    # NOTE(review): a header line whose first column name begins with "chr" would
    # be counted as well — confirm against the file format.
    if total_snvs is None:
        total_snvs = sum(1 for line in lines if line.startswith("chr"))

    snv_rows.append((crypt_sample, total_snvs))

# Create DataFrame
df_snv = pd.DataFrame(snv_rows, columns=["subject_id", "unique_SNVs"])

# Merge with main df
leesix_full = leesix_full.merge(df_snv, on=["subject_id"], how="left")

In [21]:
# Distinct patient IDs present in the Lee-Six table.
# NOTE(review): no cell visible in this notebook reads `donors_to_process`
# afterwards — confirm whether it is still needed.
donors_to_process = leesix_full["patient"].unique()

# Canonical mutation map (collapsed): each pyrimidine-referenced class lists itself
# followed by its strand complement (e.g. "C>A" also covers "G>T").
# NOTE(review): this map and the function below repeat definitions made earlier in
# the notebook; consider keeping a single copy.
_COMPLEMENT = str.maketrans("ACGT", "TGCA")
mutation_map = {
    canonical: [canonical, canonical.translate(_COMPLEMENT)]
    for canonical in ("C>A", "C>G", "C>T", "T>A", "T>C", "T>G")
}

def assign_class_and_cpg(ref, alt):
    """Classify one substitution into a collapsed mutation class.

    Parameters
    ----------
    ref : reference allele; converted with ``str``. A value starting with
        "CpG" is treated as reference base "C" and one starting with "GpC" as
        reference base "G"; both are flagged as CpG. Any other value is used
        verbatim as the reference base.
    alt : alternate allele, interpolated as-is into the "<ref>><alt>" string.

    Returns
    -------
    tuple
        ``(mut_class, is_cpg)``. ``mut_class`` is the first key of
        ``mutation_map`` whose list contains "<ref>><alt>", or ``None`` when
        no list contains it. ``is_cpg`` is ``True`` only for the two prefixed
        reference forms.
    """
    ref_text = str(ref)  # non-string input (e.g. NaN) simply fails to classify

    # The two prefixes are mutually exclusive, so the first hit decides.
    for prefix, base in (("CpG", "C"), ("GpC", "G")):
        if ref_text.startswith(prefix):
            ref_base, is_cpg = base, True
            break
    else:
        ref_base, is_cpg = ref_text, False

    mut = f"{ref_base}>{alt}"
    mut_class = next(
        (mclass for mclass, members in mutation_map.items() if mut in members),
        None,
    )
    return mut_class, is_cpg

# Directory holding the SNV tables that are classified in the next cell.
snv_dir = "/uufs/chpc.utah.edu/common/HIPAA/u1264408/u1264408/Git/SEMIColon/data/output/CellCut/overlap_variants/txtfiles_all/snvs"
# Accumulator: one dict of per-class counts is appended for every SNV file.
mutation_counts = list()


In [22]:
# The SNV tables are read directly from snv_dir; each file is expected to be named
# "<subject_id>.snvs_merged.tsv".
snv_suffix = ".snvs_merged.tsv"

# Start from an empty accumulator inside this cell, so re-running the cell cannot
# append every sample a second time (which would duplicate rows in the merge below).
mutation_counts = []

for file_name in os.listdir(snv_dir):
    # Filter on the full suffix: other .tsv files would otherwise be parsed and
    # recorded under their whole file name.
    if not file_name.endswith(snv_suffix):
        continue
    sample_name = file_name[: -len(snv_suffix)]
    file_path = os.path.join(snv_dir, file_name)

    # Read SNV table (excluding comment rows). A separate name is used so the
    # per-sample totals table `df_snv` built earlier is not overwritten.
    sample_snvs = pd.read_csv(
        file_path,
        sep="\t",
        comment="#",
        usecols=["REF", "ALT", "VAF_DS"]
    )

    # Assign mutation class + CpG flag per row. A plain list keeps this working
    # for a header-only table, where row-wise DataFrame.apply does not return
    # a Series of tuples.
    classified = [
        assign_class_and_cpg(ref, alt)
        for ref, alt in zip(sample_snvs["REF"], sample_snvs["ALT"])
    ]

    # Count mutations by class; unclassified rows (None) are dropped by value_counts.
    counts = (
        pd.Series([mut_class for mut_class, _ in classified], dtype=object)
        .value_counts()
        .to_dict()
    )
    counts["CpG"] = sum(is_cpg for _, is_cpg in classified)
    counts["subject_id"] = sample_name

    mutation_counts.append(counts)

# Convert to DataFrame; a class absent from a sample becomes 0.
df_mutation = pd.DataFrame(mutation_counts).fillna(0)

# Merge with the Lee-Six table
leesix_full = leesix_full.merge(df_mutation, on=["subject_id"], how="left")

In [23]:
# Show the assembled Lee-Six table as this cell's output.
leesix_full

Unnamed: 0,subject_id,biological_sex,phenotype,site,sex,age,med_depths,patient,cohort_status,coverage,unique_SNVs,C>T,T>C,C>A,T>A,C>G,T>G,CpG
0,HLS_1C_30_B5,unknown,single cell,Right,female,60,12.0,HLS,transplant_donor_cohort,19.0,2505.0,769.0,530.0,595.0,283.0,153.0,175.0,190.0
1,HLS_1C_30_D5,unknown,single cell,Right,female,60,10.0,HLS,transplant_donor_cohort,16.0,2406.0,678.0,485.0,652.0,268.0,141.0,182.0,165.0
2,HLS_1C_30_G5,unknown,single cell,Right,female,60,12.0,HLS,transplant_donor_cohort,19.0,2236.0,682.0,493.0,528.0,284.0,108.0,141.0,179.0
3,HLS_1C_30_H5,unknown,single cell,Right,female,60,9.0,HLS,transplant_donor_cohort,14.0,1794.0,565.0,363.0,449.0,195.0,115.0,107.0,137.0
4,HLS_2C_30_D6,unknown,single cell,Right,female,60,8.0,HLS,transplant_donor_cohort,13.0,1975.0,584.0,490.0,429.0,226.0,114.0,132.0,139.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
337,PD37590c_lo0032,male,Transverse colon,Transverse,female,55,21.0,PD37590,transplant_donor_cohort,22.0,1064.0,411.0,168.0,228.0,136.0,85.0,36.0,92.0
338,PD37590c_lo0034,male,Transverse colon,Transverse,female,55,28.0,PD37590,transplant_donor_cohort,30.0,940.0,370.0,148.0,176.0,131.0,86.0,29.0,78.0
339,PD37590c_lo0040,male,Transverse colon,Transverse,female,55,26.0,PD37590,transplant_donor_cohort,27.0,920.0,363.0,138.0,186.0,119.0,81.0,33.0,60.0
340,PD37590c_lo0048,male,Transverse colon,Transverse,female,55,23.0,PD37590,transplant_donor_cohort,24.0,907.0,342.0,152.0,185.0,104.0,84.0,40.0,76.0


In [24]:
# Write both summary tables to disk. index=False prevents writing the DataFrame
# index to the CSV.
hiatt_csv_path = '/uufs/chpc.utah.edu/common/HIPAA/u1264408/u1264408/Git/SEMIColon/data/output/CellCut/Hiattdataframe.csv'
leesix_csv_path = '/uufs/chpc.utah.edu/common/HIPAA/u1264408/u1264408/Git/SEMIColon/data/output/CellCut/LeeSixdataframe.csv'

df.to_csv(hiatt_csv_path, index=False)
leesix_full.to_csv(leesix_csv_path, index=False)