# Overview: checking metadata and samplesheet for all RUNS

## Preparation

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import os
import pathlib
import pysam
import pyfaidx
import warnings
import re
import argparse
import sys
import subprocess
from tqdm import tqdm 

tqdm.pandas()
warnings.filterwarnings("ignore")

# Root directory of the pipeline outputs and the UMT distance whose results are inspected.
# NOTE: `main_outptudir` (sic) keeps its original spelling because later cells refer to it.
main_outptudir = "/workdir/outdir/pipeline_output"
umt_distance = 0

rundf = pd.read_excel("../panel_design/All_panel_designs_20250601.xlsx", sheet_name="Runs_tracking")
rundf.columns = ["Run", "SampleID", "UMI", "Panel_version", "Name", "amplicon_name", "serial"]

# .copy() makes umi_rundf an independent frame, so columns added to it in later
# cells are not written into a slice of rundf.
umi_rundf = rundf[(rundf["UMI"] == "Yes") & (rundf["Name"] == "Vi")].copy()

# Sorted so that the row order of `samplesheet` does not depend on filesystem order.
collect_all_sample_sheets = sorted(pathlib.Path("../experiments/").glob("SampleSheet_batch_*.csv"))
if not collect_all_sample_sheets:
    raise FileNotFoundError("No SampleSheet_batch_*.csv file found in ../experiments/")


def _read_sample_sheet(file):
    """Read one sample sheet CSV and tag every row with its batch (file name without '.csv')."""
    sheet = pd.read_csv(file, sep=",")
    sheet["batch"] = file.name.replace(".csv", "")
    return sheet


def _run_from_fastq_path(fastq_path):
    """Return the run ID taken from the FASTQ path.

    The run is the path component at index 5, unless that component is "2025",
    in which case the component at index 6 is used instead.
    """
    parts = fastq_path.split("/")
    return parts[6] if parts[5] == "2025" else parts[5]


# Build the combined sheet with a single concat instead of growing a frame in a loop.
samplesheet = pd.concat(
    [_read_sample_sheet(file) for file in collect_all_sample_sheets],
    ignore_index=True,
)

samplesheet.columns = ["filename", "FASTQ1", "FASTQ2", "batch"]
samplesheet["Run"] = samplesheet["FASTQ1"].apply(_run_from_fastq_path)
# SampleID is the text after the first "-" in the part of `filename` before the first "_".
samplesheet["SampleID"] = samplesheet["filename"].apply(lambda x: x.split("_")[0].split("-")[1])

# Joined outside the f-string: reusing the same quote type inside an f-string
# expression is only valid syntax from Python 3.12 on.
available_runs = ", ".join(sorted(samplesheet["Run"].unique()))
print(f"All available runs: {available_runs}")

All available runs: R7288, R7297, R7312, R7331, R7347, R7353, R7373, R7374, R7393, R7400


## Check whether any FASTQ samples are missing

In [2]:
# SampleIDs listed in the UMI run metadata that do not occur in any sample sheet.
sheet_sample_ids = samplesheet.SampleID.unique()
missing_fastq = [
    sample_id
    for sample_id in umi_rundf.SampleID.unique()
    if sample_id not in sheet_sample_ids
]

# Metadata rows of the samples without FASTQ files (empty when nothing is missing).
umi_rundf[umi_rundf.SampleID.isin(missing_fastq)]


Unnamed: 0,Run,SampleID,UMI,Panel_version,Name,amplicon_name,serial


## Match output paths to main metadata

Since some SampleIDs are duplicated across runs, we combine SampleID (labcode) and Run into a single key (`uniqueID`) so that each record is unique.

In [None]:
# Composite key "<SampleID>_<Run>". Built with column-wise string concatenation
# instead of a row-wise lambda using x[0]/x[1]: positional access on a
# label-indexed Series is deprecated in recent pandas versions.
umi_rundf = umi_rundf.assign(
    uniqueID=umi_rundf["SampleID"].astype(str) + "_" + umi_rundf["Run"].astype(str)
)
samplesheet = samplesheet.assign(
    uniqueID=samplesheet["SampleID"].astype(str) + "_" + samplesheet["Run"].astype(str)
)

# Inner join: metadata rows without a matching sample-sheet entry are dropped.
final_umi_rundf = umi_rundf.merge(samplesheet.drop(columns=["SampleID", "Run"]), on="uniqueID")


def _pipeline_paths(subdir, suffix):
    """Return one pipeline output path per row of `final_umi_rundf`.

    Each path is
    <main_outptudir>/<batch>/UMT_DISTANCE_<umt_distance>/<subdir>/<filename><suffix>.
    A plain list is returned so the assignment also works when the merge is empty.
    """
    umt_dir = f"UMT_DISTANCE_{umt_distance}"
    return [
        os.path.join(main_outptudir, batch, umt_dir, subdir, f"{filename}{suffix}")
        for batch, filename in zip(final_umi_rundf["batch"], final_umi_rundf["filename"])
    ]


final_umi_rundf["UMI_cov_file"] = _pipeline_paths(
    "BISMARK_COV", ".connor_R1_bismark_bt2_pe.bedGraph.gz.bismark.zero.cov"
)
final_umi_rundf["nonUMI_cov_file"] = _pipeline_paths(
    "BISMARK_COV", ".CutAdapt_R1_bismark_bt2_pe.bedGraph.gz.bismark.zero.cov"
)

# Flags telling whether each coverage file is actually present on disk.
final_umi_rundf["check_UMI"] = final_umi_rundf["UMI_cov_file"].apply(os.path.exists)
final_umi_rundf["check_nonUMI"] = final_umi_rundf["nonUMI_cov_file"].apply(os.path.exists)

# NOTE(review): the output recorded further down in this notebook shows an
# unmapped BAM name without the "_R1" part - confirm which suffix the pipeline writes.
final_umi_rundf["unmapped_bam"] = _pipeline_paths(
    "BISMARK_ALIGNMENT_UNMAPPED_BAM", "_R1.UMIprocessed_bismark_bt2_pe.bam"
)
final_umi_rundf["connor_annotated"] = _pipeline_paths(
    "CONNOR_UMI_OUTPUT", ".connor.fully_annotated.bam"
)
final_umi_rundf["connor_fastq"] = _pipeline_paths(
    "CONNOR_UMI_OUTPUT", ".connor_R1.fastq.gz"
)
final_umi_rundf["umi_bam"] = _pipeline_paths(
    "BISMARK_ALIGNMENT", ".connor_R1_bismark_bt2_pe.sorted.bam"
)
final_umi_rundf["non_umi_bam"] = _pipeline_paths(
    "BISMARK_ALIGNMENT", ".CutAdapt_R1_bismark_bt2_pe.sorted.bam"
)

tqdm.pandas()


def _count_from_command(cmd, path, stdin=None):
    """Run `cmd` (argument list, no shell) and return the integer printed on stdout.

    Raises RuntimeError, including the command's stderr, when the command fails
    or does not print a plain number. `path` is only used in the error message.
    """
    result = subprocess.run(cmd, stdin=stdin, capture_output=True, text=True)
    output = result.stdout.strip()
    if result.returncode != 0 or not output.isdigit():
        raise RuntimeError(f"Could not count records in {path}: {result.stderr.strip()}")
    return int(output)


def _get_line_count(x):
    """Return the number of records (BAM) or lines (any other file) in `x`.

    - Missing file: when `x` is a ".sorted.bam" whose unsorted ".bam" sibling
      exists, the sorted file is created first; otherwise 0 is returned.
    - "*.bam":      `samtools view -c`
    - "*.fastq.gz": `zcat | wc -l` (lines, i.e. not divided by 4)
    - other:        `wc -l`

    Commands receive the path as a separate argument rather than through a
    shell string, so paths with spaces or shell metacharacters are safe.
    """
    if not os.path.exists(x):
        x_raw = x.replace(".sorted.bam", ".bam")
        if not os.path.exists(x_raw):
            return 0
        subprocess.run(["samtools", "sort", "-@", "15", "-n", x_raw, "-o", x], check=True)
        # NOTE(review): the BAM above is sorted by read name (-n), and samtools
        # can only index coordinate-sorted files, so this call is expected to
        # fail. Its status is ignored, as before. Confirm the intended sort order.
        subprocess.run(["samtools", "index", x], capture_output=True)

    if ".bam" in x:
        return _count_from_command(["samtools", "view", "-c", x], x)
    if ".fastq.gz" in x:
        with subprocess.Popen(["zcat", x], stdout=subprocess.PIPE) as zcat:
            return _count_from_command(["wc", "-l"], x, stdin=zcat.stdout)
    with open(x, "rb") as handle:
        return _count_from_command(["wc", "-l"], x, stdin=handle)


# "umi_bam" replaces "final_bam": no column named "final_bam" is created in this
# notebook, so the original loop stopped with a KeyError on that entry.
# NOTE(review): confirm that the UMI-collapsed alignment (umi_bam) is the intended file.
for col in ["unmapped_bam", "connor_annotated", "connor_fastq", "umi_bam", "UMI_cov_file"]:
    print(f"working on column: {col}")
    final_umi_rundf[f"count_{col}"] = final_umi_rundf[col].progress_apply(_get_line_count)


working on column: unmapped_bam


  0%|          | 0/314 [00:00<?, ?it/s]

  2%|▏         | 6/314 [00:06<06:45,  1.32s/it]

In [17]:
# Show the unmapped BAM path of the second row as a quick sanity check.
final_umi_rundf["unmapped_bam"].to_numpy()[1]

'/workdir/outdir/pipeline_output/SampleSheet_batch_20250604/UMT_DISTANCE_0/BISMARK_ALIGNMENT_UNMAPPED_BAM/2-TML1S2_S7502-S7702.UMIprocessed_bismark_bt2_pe.bam'

# Helper functions

In [None]:
# Open pyfaidx handles keyed by FASTA path, so that repeated calls (for example
# one call per coverage row) do not reopen the file and its index every time.
_FASTA_HANDLES = {}


def get_refseq(path_to_all_fa, chrom, start, end):
    """Return the upper-cased reference sequence of `chrom` between `start` and `end`.

    Parameters
    ----------
    path_to_all_fa : str
        Directory holding one FASTA file per chromosome, named "<chrom>.fa".
    chrom : str
        Chromosome name; used both as the file stem and as the record name.
    start, end : int
        Coordinates passed unchanged to pyfaidx `get_seq`
        (1-based and inclusive per pyfaidx's documented convention).

    Returns
    -------
    str
        The sequence in upper case.
    """
    fasta_path = os.path.join(path_to_all_fa, "{}.fa".format(chrom))
    if fasta_path not in _FASTA_HANDLES:
        _FASTA_HANDLES[fasta_path] = pyfaidx.Fasta(fasta_path)
    record = _FASTA_HANDLES[fasta_path].get_seq(name="{}".format(chrom), start=start, end=end)
    return str.upper(record.seq)

# Directory with one "<chrom>.fa" file per chromosome, and the column of the
# panel sheets that holds the amplicon coordinates for that genome build.
path_to_all_fa = "/home/hieunguyen/resources/hg19"
genome_version = "hg19"

##### get list of all real cpg for this panel
# all_cpgdf maps a panel sheet name to a frame with columns region / CpG / region_name,
# one row per "CG" dinucleotide found in the reference sequence of each amplicon.
all_cpgdf = dict()
for panel_name in final_umi_rundf.Panel_version.unique():
    panel_name = panel_name.replace(" ", "_")
    paneldf = pd.read_excel("../panel_design/All_panel_designs_20250601.xlsx", sheet_name=panel_name)

    region_frames = []
    # dropna(): an empty coordinate cell would otherwise fail on .split below.
    for region in paneldf[genome_version].dropna().unique():
        region_name = paneldf.loc[paneldf[genome_version] == region, "Amplicon"].values[0]
        # Normalise "Chr1" / "chr1" / "1" to "chr1".
        chrom = "chr" + region.split(":")[0].replace("Chr", "chr").replace("chr", "")
        start = int(region.split(":")[1].split("-")[0])
        end = int(region.split(":")[1].split("-")[1])
        refseq_at_region = get_refseq(path_to_all_fa, chrom, start, end)

        # m.start(0) is the offset of the C within the fetched sequence, so
        # offset + start is the position of that C in the region's coordinates.
        all_cpg_in_cluster = [m.start(0) for m in re.finditer("CG", refseq_at_region)]
        cpg_coords = [f"{chrom}:{item + start}-{item + start + 1}" for item in all_cpg_in_cluster]

        tmp_cpgdf = pd.DataFrame(data = cpg_coords, columns = ["CpG"])
        tmp_cpgdf["region"] = region
        tmp_cpgdf["region_name"] = region_name
        region_frames.append(tmp_cpgdf)

    # Single concat per panel instead of growing the frame region by region;
    # a panel without any region yields an empty frame with the expected columns.
    if region_frames:
        cpgdf = pd.concat(region_frames, axis = 0)
    else:
        cpgdf = pd.DataFrame(columns = ["CpG", "region", "region_name"])

    cpgdf = cpgdf[["region", "CpG", "region_name"]]
    all_cpgdf[panel_name] = cpgdf.copy()


# PROCESSING COV FILES

In [None]:

# Work on the first sample only: its UMI coverage file and the CpG table of its panel.
input_cov_file = final_umi_rundf.UMI_cov_file.values[0]
panel_name = final_umi_rundf.Panel_version.values[0].replace(" ", "_")

cpgdf = all_cpgdf[panel_name]
covdf = pd.read_csv(input_cov_file, header = None, sep = "\t")
covdf.columns = ["chrom", "start", "end", "meth_density", "countC", "countT"]

# Drop sex / mitochondrial chromosomes, unplaced contigs and any name with "_".
covdf = covdf[covdf["chrom"].isin(["chrX", "chrY", "chrMT"]) == False]
covdf = covdf[covdf["chrom"].str.contains("chrUn") == False]
# .copy() so the column assignments below act on an independent frame, not a slice.
covdf = covdf[covdf["chrom"].str.contains("_") == False].copy()

# Two reference bases at (start, start + 1) for every row. Columns are iterated
# directly instead of using a row-wise lambda with positional x[0]/x[1] access,
# which is deprecated in recent pandas versions; this also works for an empty frame.
covdf["seq"] = [
    get_refseq(path_to_all_fa = path_to_all_fa, chrom = chrom, start = start, end = start + 1)
    for chrom, start in tqdm(zip(covdf["chrom"], covdf["start"]), total = len(covdf))
]

# Rows whose fetched bases are "CG" are labelled "-" and keep their start; all
# other rows are labelled "+" and are shifted by one base so that `start`
# matches the CpG keys of cpgdf.
# NOTE(review): this is consistent with a cov file that has 0-based starts
# (the file name ends with ".zero.cov") - confirm.
is_cg = covdf["seq"] == "CG"
covdf["strand"] = np.where(is_cg, "-", "+")
covdf_raw = covdf.copy()
covdf["start"] = np.where(is_cg, covdf["start"], covdf["start"] + 1)

covdf["chrom"] = covdf["chrom"].astype(str)
covdf["CpG"] = covdf["chrom"] + ":" + covdf["start"].astype(str) + "-" + (covdf["start"] + 1).astype(str)

# One vectorised membership test and one lookup table replace the per-row
# scan of cpgdf. The first region_name is kept when a CpG is listed more
# than once, which is what `.values[0]` returned before.
in_panel = covdf["CpG"].isin(cpgdf["CpG"])
amplicon_by_cpg = cpgdf.drop_duplicates(subset = "CpG").set_index("CpG")["region_name"]
covdf["check_context"] = np.where(in_panel, "CpG_context", "False")
covdf["amplicon"] = covdf["CpG"].map(amplicon_by_cpg).where(in_panel, "NA")


In [None]:
# Samples whose UMI coverage file was not found on disk.
umi_cov_missing = final_umi_rundf["check_UMI"].eq(False)
final_umi_rundf.loc[umi_cov_missing]
