In [1]:
import argparse
import gzip
import os
import pathlib
import re
import subprocess
import sys
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyfaidx
import pysam
import seaborn as sns
from tqdm import tqdm

# Register `progress_apply` on pandas objects so long per-row jobs show a progress bar.
tqdm.pandas()
# NOTE: this silences *every* warning for the rest of the session,
# including pandas deprecation notices.
warnings.filterwarnings("ignore")

# --- Run configuration -------------------------------------------------------
# Root of the pipeline results; later code looks up files under
# <main_outptudir>/<batch>/UMT_DISTANCE_<umt_distance>/...
# (the misspelled variable name is kept because the rest of the notebook uses it).
main_outptudir = "/workdir/outdir/pipeline_output"
umt_distance = 1
analysis_output_version = "20250614"

# Value matched against the "Name" column of the run-tracking sheet.
name = "Vi"

path_to_main_output = os.path.join(main_outptudir, "data_analysis", name, analysis_output_version)
path_to_01_output = os.path.join(path_to_main_output, "01_output")
# Pure-Python equivalent of `mkdir -p`: no shell involved, so paths containing
# spaces or shell metacharacters are safe, and a failure raises instead of
# being silently ignored.
os.makedirs(path_to_01_output, exist_ok=True)

# Run-tracking sheet of the panel-design workbook.
rundf = pd.read_excel("../panel_design/All_panel_designs_20250601.xlsx", sheet_name="Runs_tracking")
# Positional rename: this relies on the sheet having exactly these seven columns
# in this order (pandas raises a length-mismatch ValueError otherwise).
rundf.columns = ["Run", "SampleID", "UMI", "Panel_version", "Name", "amplicon_name", "serial"]

# Keep only UMI-enabled samples for the configured `name`.
# `.copy()` makes this an independent frame: a later step adds a `uniqueID`
# column to it, which on a bare boolean-mask slice is a chained assignment
# (SettingWithCopyWarning, currently hidden by the global warnings filter).
umi_rundf = rundf[(rundf["UMI"] == "Yes") & (rundf["Name"] == name)].copy()

def _run_id_from_fastq_path(fastq_path):
    """Return the run folder name embedded in a FASTQ path.

    The run is taken from component 5 of the "/"-split path; when that
    component is the literal year folder "2025", component 6 is used instead.
    Raises IndexError if the path has fewer components than that.
    NOTE(review): the year is hard-coded - paths under another year folder at
    that position would be misparsed; confirm before reusing on other data.
    """
    parts = fastq_path.split("/")
    return parts[6] if parts[5] == "2025" else parts[5]


# Sorted so the row order of `samplesheet` (and everything merged from it) does
# not depend on filesystem listing order.
collect_all_sample_sheets = sorted(pathlib.Path("../experiments/").glob("SampleSheet_batch_*.csv"))
if not collect_all_sample_sheets:
    # Fail with a clear message instead of a "Length mismatch" error further down.
    raise FileNotFoundError("No SampleSheet_batch_*.csv files found in ../experiments/")

# Read every batch sheet, tag rows with the batch name (file name without
# ".csv"), and concatenate once at the end rather than re-copying the growing
# frame on every iteration.
sample_sheet_frames = []
for file in collect_all_sample_sheets:
    df = pd.read_csv(file, sep=",")
    df["batch"] = file.name.replace(".csv", "")
    sample_sheet_frames.append(df)
samplesheet = pd.concat(sample_sheet_frames, ignore_index=True)

# Positional rename: assumes each sheet has exactly three columns
# (sample file name, R1 path, R2 path) before "batch" is appended.
samplesheet.columns = ["filename", "FASTQ1", "FASTQ2", "batch"]
samplesheet["Run"] = samplesheet["FASTQ1"].apply(_run_id_from_fastq_path)
# e.g. "4-TMC455_S7528-S7728" -> "TMC455": text before the first "_",
# then the part after the first "-".
samplesheet["SampleID"] = samplesheet["filename"].apply(lambda x: x.split("_")[0].split("-")[1])

# Join outside the f-string: reusing the enclosing quote character inside an
# f-string expression is a SyntaxError on Python < 3.12.
available_runs = ", ".join(sorted(samplesheet["Run"].unique()))
print(f"All available runs: {available_runs}")

# Thread count handed to `samtools sort -@` when a sorted BAM has to be created.
SAMTOOLS_SORT_THREADS = 15


def _count_newlines(path, opener=open):
    """Count newline bytes in `path` - the same number `wc -l` reports.

    `opener` is `open` for plain files or `gzip.open` for gzip-compressed ones.
    The file is streamed in 1 MiB chunks so large files are never fully loaded.
    """
    n_lines = 0
    with opener(path, "rb") as handle:
        for chunk in iter(lambda: handle.read(1 << 20), b""):
            n_lines += chunk.count(b"\n")
    return n_lines


def _get_line_count(x):
    """Return the record/line count of one pipeline output file.

    * paths containing ".bam"      -> alignment count from `samtools view -c`
    * paths containing ".fastq.gz" -> number of *lines* in the decompressed
      file (not reads; a standard FASTQ record spans four lines)
    * anything else                -> number of lines in the plain-text file

    If `x` is missing but names a "*.sorted.bam" whose unsorted "*.bam" sibling
    exists, the sibling is sorted and indexed into `x` first. If nothing usable
    exists, 0 is returned.

    Raises subprocess.CalledProcessError when a samtools call fails. Previously
    such failures surfaced only as an opaque `int('')` ValueError.
    """
    if not os.path.exists(x):
        x_raw = x.replace(".sorted.bam", ".bam")
        if x_raw == x or not os.path.exists(x_raw):
            return 0
        # Argument lists (no shell) keep paths with spaces/metacharacters safe.
        subprocess.run(
            ["samtools", "sort", "-@", str(SAMTOOLS_SORT_THREADS), "-o", x, x_raw], check=True
        )
        subprocess.run(["samtools", "index", x], check=True)

    if ".bam" in x:
        result = subprocess.run(
            ["samtools", "view", "-c", x], capture_output=True, text=True, check=True
        )
        return int(result.stdout.strip())
    if ".fastq.gz" in x:
        return _count_newlines(x, opener=gzip.open)
    return _count_newlines(x)


def _add_pipeline_path_column(frame, column, subdir, suffix):
    """Add `column` = <main_outptudir>/<batch>/UMT_DISTANCE_<n>/<subdir>/<filename><suffix>.

    Built from the "batch" and "filename" columns by name, avoiding positional
    `row[0]` / `row[1]` lookups on a label-indexed row (deprecated in pandas).
    """
    frame[column] = [
        os.path.join(main_outptudir, batch, f"UMT_DISTANCE_{umt_distance}", subdir, f"{filename}{suffix}")
        for batch, filename in zip(frame["batch"], frame["filename"])
    ]


# Set `rerun = True` to rebuild the table even when a cached copy exists.
rerun = False
tracking_cache = os.path.join(path_to_01_output, "UMI_runs_tracking.xlsx")

if rerun or not os.path.isfile(tracking_cache):
    # Join key "<SampleID>_<Run>" on both tables. `.assign` builds new frames,
    # so no column is written into a slice of another frame.
    umi_rundf = umi_rundf.assign(
        uniqueID=umi_rundf["SampleID"].astype(str) + "_" + umi_rundf["Run"].astype(str)
    )
    samplesheet = samplesheet.assign(
        uniqueID=samplesheet["SampleID"].astype(str) + "_" + samplesheet["Run"].astype(str)
    )

    # The merge below is an inner join: tracked samples without a sample-sheet
    # row disappear. Report how many so the loss is not silent.
    n_unmatched = int((~umi_rundf["uniqueID"].isin(samplesheet["uniqueID"])).sum())
    if n_unmatched:
        print(f"{n_unmatched} tracked UMI sample(s) have no sample-sheet entry and are dropped by the merge")

    final_umi_rundf = umi_rundf.merge(samplesheet.drop(["SampleID", "Run"], axis=1), on="uniqueID")

    # Coverage files (UMI-deduplicated vs. plain CutAdapt) and whether they exist.
    _add_pipeline_path_column(
        final_umi_rundf, "UMI_cov_file", "BISMARK_COV",
        ".connor_R1_bismark_bt2_pe.bedGraph.gz.bismark.zero.cov",
    )
    _add_pipeline_path_column(
        final_umi_rundf, "nonUMI_cov_file", "BISMARK_COV",
        ".CutAdapt_R1_bismark_bt2_pe.bedGraph.gz.bismark.zero.cov",
    )
    final_umi_rundf["check_UMI"] = final_umi_rundf["UMI_cov_file"].apply(os.path.exists)
    final_umi_rundf["check_nonUMI"] = final_umi_rundf["nonUMI_cov_file"].apply(os.path.exists)

    # Intermediate pipeline files, in the column order of the saved workbook.
    for column, subdir, suffix in [
        ("unmapped_bam", "BISMARK_ALIGNMENT_UNMAPPED_BAM", "_R1.UMIprocessed_bismark_bt2_pe.bam"),
        ("connor_annotated", "CONNOR_UMI_OUTPUT", ".connor.fully_annotated.bam"),
        ("connor_fastq", "CONNOR_UMI_OUTPUT", ".connor_R1.fastq.gz"),
        ("umi_bam", "BISMARK_ALIGNMENT", ".connor_R1_bismark_bt2_pe.sorted.bam"),
        ("non_umi_bam", "BISMARK_ALIGNMENT", ".CutAdapt_R1_bismark_bt2_pe.sorted.bam"),
    ]:
        _add_pipeline_path_column(final_umi_rundf, column, subdir, suffix)

    # Count records/lines per file; missing files count as 0.
    for col in ["unmapped_bam", "connor_annotated", "connor_fastq", "umi_bam", "non_umi_bam", "UMI_cov_file"]:
        print(f"working on column: {col}")
        final_umi_rundf[f"count_{col}"] = final_umi_rundf[col].progress_apply(_get_line_count)

    # Flag rows whose R1 path mentions "smallsize".
    final_umi_rundf["small_FASTQ_size"] = final_umi_rundf["FASTQ1"].apply(lambda x: "smallsize" in x)
    # This summary was previously computed and discarded; print it so it is visible.
    print(final_umi_rundf.groupby("small_FASTQ_size")["SampleID"].count())

    final_umi_rundf.to_excel(tracking_cache, index=False)
else:
    print("reading in saved processed UMI runs tracking file...")
    final_umi_rundf = pd.read_excel(tracking_cache)

All available runs: R7288, R7297, R7312, R7331, R7332, R7347, R7353, R7373, R7374, R7381, R7393, R7400
reading in saved processed UMI runs tracking file...


In [11]:
# Write the FASTQ paths of sample TMC455 to a small CSV for a manual check.
# NOTE(review): absolute, user-specific path - presumably the same folder as the
# relative "../experiments/" used when loading the sample sheets; confirm and
# consider deriving it from a shared setting.
exprdir = "/home/hieunguyen/src/ampliconSeq_UMI/experiments"
tmc455_fastqs = final_umi_rundf.loc[
    final_umi_rundf["SampleID"].eq("TMC455"),
    ["SampleID", "FASTQ1", "FASTQ2"],
]
tmc455_csv = os.path.join(exprdir, "SampleSheet_check_TMC455.csv")
tmc455_fastqs.to_csv(tmc455_csv, index=False, sep=",")

In [8]:
# Which batch sheet(s) supplied the run R7393 samples of CRC panel version 1.1?
is_r7393 = final_umi_rundf["Run"].eq("R7393")
is_panel_v1_1 = final_umi_rundf["Panel_version"].eq("CRC_panel version 1.1")
final_umi_rundf.loc[is_r7393 & is_panel_v1_1, "batch"]

253    SampleSheet_batch_20250613_smallsize
254    SampleSheet_batch_20250613_smallsize
255    SampleSheet_batch_20250613_smallsize
256    SampleSheet_batch_20250613_smallsize
257    SampleSheet_batch_20250613_smallsize
258    SampleSheet_batch_20250613_smallsize
259    SampleSheet_batch_20250613_smallsize
Name: batch, dtype: object

In [19]:
# All CRC panel version 3.1 rows for amplicon C4, across runs.
is_panel_v3_1 = final_umi_rundf["Panel_version"].eq("CRC_panel version 3.1")
is_amplicon_c4 = final_umi_rundf["amplicon_name"].eq("C4")
final_umi_rundf.loc[is_panel_v3_1 & is_amplicon_c4]

Unnamed: 0,Run,SampleID,UMI,Panel_version,Name,amplicon_name,serial,uniqueID,filename,FASTQ1,...,connor_fastq,umi_bam,non_umi_bam,count_unmapped_bam,count_connor_annotated,count_connor_fastq,count_umi_bam,count_non_umi_bam,count_UMI_cov_file,small_FASTQ_size
168,R7347,TMC455,Yes,CRC_panel version 3.1,Vi,C4,cfDNA,TMC455_R7347,4-TMC455_S7528-S7728,/mnt/GS-BACKUP05/FASTQ/2025/R7347/zatm/4-TMC45...,...,/workdir/outdir/pipeline_output/SampleSheet_ba...,/workdir/outdir/pipeline_output/SampleSheet_ba...,/workdir/outdir/pipeline_output/SampleSheet_ba...,68,66,0,0,68,0,False
263,R7393,TMC4S1,Yes,CRC_panel version 3.1,Vi,C4,S1,TMC4S1_R7393,11-TMC4S1_S7584-S7784,/mnt/GS-BACKUP05/FASTQ/2025/R7393/zatm/11-TMC4...,...,/workdir/outdir/pipeline_output/SampleSheet_ba...,/workdir/outdir/pipeline_output/SampleSheet_ba...,/workdir/outdir/pipeline_output/SampleSheet_ba...,18,18,0,0,18,0,False
269,R7393,TMC4S2,Yes,CRC_panel version 3.1,Vi,C4,S2,TMC4S2_R7393,17-TMC4S2_S7505-S7705,/mnt/GS-BACKUP05/FASTQ/2025/R7393/zatm/17-TMC4...,...,/workdir/outdir/pipeline_output/SampleSheet_ba...,/workdir/outdir/pipeline_output/SampleSheet_ba...,/workdir/outdir/pipeline_output/SampleSheet_ba...,20,18,0,0,20,0,False
275,R7393,TMC4S3,Yes,CRC_panel version 3.1,Vi,C4,S3,TMC4S3_R7393,23-TMC4S3_S7556-S7756,/mnt/GS-BACKUP05/FASTQ/2025/R7393/zatm/23-TMC4...,...,/workdir/outdir/pipeline_output/SampleSheet_ba...,/workdir/outdir/pipeline_output/SampleSheet_ba...,/workdir/outdir/pipeline_output/SampleSheet_ba...,26,26,0,0,26,0,False
279,R7393,TMC4S4,Yes,CRC_panel version 3.1,Vi,C4,S4,TMC4S4_R7393,29-TMC4S4_S7517-S7717,/mnt/GS-BACKUP05/FASTQ/2025/R7393/zatm/29-TMC4...,...,/workdir/outdir/pipeline_output/SampleSheet_ba...,/workdir/outdir/pipeline_output/SampleSheet_ba...,/workdir/outdir/pipeline_output/SampleSheet_ba...,20,20,0,0,20,0,False
285,R7393,TMC4S5,Yes,CRC_panel version 3.1,Vi,C4,S5,TMC4S5_R7393,35-TMC4S5_S7523-S7723,/mnt/GS-BACKUP05/FASTQ/2025/R7393/zatm/35-TMC4...,...,/workdir/outdir/pipeline_output/SampleSheet_ba...,/workdir/outdir/pipeline_output/SampleSheet_ba...,/workdir/outdir/pipeline_output/SampleSheet_ba...,12,12,0,0,12,0,False
291,R7393,TMC4S6,Yes,CRC_panel version 3.1,Vi,C4,S6,TMC4S6_R7393,41-TMC4S6_S7529-S7729,/mnt/GS-BACKUP05/FASTQ/2025/R7393/zatm/41-TMC4...,...,/workdir/outdir/pipeline_output/SampleSheet_ba...,/workdir/outdir/pipeline_output/SampleSheet_ba...,/workdir/outdir/pipeline_output/SampleSheet_ba...,8,8,0,0,8,0,False
297,R7393,TMC4S7,Yes,CRC_panel version 3.1,Vi,C4,S7,TMC4S7_R7393,47-TMC4S7_S7535-S7735,/mnt/GS-BACKUP05/FASTQ/2025/R7393/zatm/47-TMC4...,...,/workdir/outdir/pipeline_output/SampleSheet_ba...,/workdir/outdir/pipeline_output/SampleSheet_ba...,/workdir/outdir/pipeline_output/SampleSheet_ba...,8,8,0,0,8,0,False


In [28]:
# All CRC panel version 3.1 rows from run R7347 (every amplicon), for
# side-by-side comparison of the per-file counts.
is_panel_v3_1 = final_umi_rundf["Panel_version"].eq("CRC_panel version 3.1")
is_run_r7347 = final_umi_rundf["Run"].eq("R7347")
final_umi_rundf.loc[is_panel_v3_1 & is_run_r7347]

Unnamed: 0,Run,SampleID,UMI,Panel_version,Name,amplicon_name,serial,uniqueID,filename,FASTQ1,...,connor_fastq,umi_bam,non_umi_bam,count_unmapped_bam,count_connor_annotated,count_connor_fastq,count_umi_bam,count_non_umi_bam,count_UMI_cov_file,small_FASTQ_size
165,R7347,TMC155,Yes,CRC_panel version 3.1,Vi,C1,cfDNA,TMC155_R7347,1-TMC155_S7525-S7725,/mnt/GS-BACKUP05/FASTQ/2025/R7347/zatm/1-TMC15...,...,/workdir/outdir/pipeline_output/SampleSheet_ba...,/workdir/outdir/pipeline_output/SampleSheet_ba...,/workdir/outdir/pipeline_output/SampleSheet_ba...,1442540,1315723,64,28,1442540,5,False
166,R7347,TMC255,Yes,CRC_panel version 3.1,Vi,C2,cfDNA,TMC255_R7347,2-TMC255_S7526-S7726,/mnt/GS-BACKUP05/FASTQ/2025/R7347/zatm/2-TMC25...,...,/workdir/outdir/pipeline_output/SampleSheet_ba...,/workdir/outdir/pipeline_output/SampleSheet_ba...,/workdir/outdir/pipeline_output/SampleSheet_ba...,1330818,1327910,8820,180,1330818,4,False
167,R7347,TMC355,Yes,CRC_panel version 3.1,Vi,C3,cfDNA,TMC355_R7347,3-TMC355_S7527-S7727,/mnt/GS-BACKUP05/FASTQ/2025/R7347/zatm/3-TMC35...,...,/workdir/outdir/pipeline_output/SampleSheet_ba...,/workdir/outdir/pipeline_output/SampleSheet_ba...,/workdir/outdir/pipeline_output/SampleSheet_ba...,1216004,1212586,13088,46,1216004,13,False
168,R7347,TMC455,Yes,CRC_panel version 3.1,Vi,C4,cfDNA,TMC455_R7347,4-TMC455_S7528-S7728,/mnt/GS-BACKUP05/FASTQ/2025/R7347/zatm/4-TMC45...,...,/workdir/outdir/pipeline_output/SampleSheet_ba...,/workdir/outdir/pipeline_output/SampleSheet_ba...,/workdir/outdir/pipeline_output/SampleSheet_ba...,68,66,0,0,68,0,False
169,R7347,TMC555,Yes,CRC_panel version 3.1,Vi,C5,cfDNA,TMC555_R7347,5-TMC555_S7529-S7729,/mnt/GS-BACKUP05/FASTQ/2025/R7347/zatm/5-TMC55...,...,/workdir/outdir/pipeline_output/SampleSheet_ba...,/workdir/outdir/pipeline_output/SampleSheet_ba...,/workdir/outdir/pipeline_output/SampleSheet_ba...,1185886,517523,8180,104,1185886,5,False
170,R7347,TMC655,Yes,CRC_panel version 3.1,Vi,C6,cfDNA,TMC655_R7347,6-TMC655_S7530-S7730,/mnt/GS-BACKUP05/FASTQ/2025/R7347/zatm/6-TMC65...,...,/workdir/outdir/pipeline_output/SampleSheet_ba...,/workdir/outdir/pipeline_output/SampleSheet_ba...,/workdir/outdir/pipeline_output/SampleSheet_ba...,1289632,1284003,9484,62,1289632,10,False


In [33]:
# Re-display the CRC panel version 3.1 / amplicon C4 rows. The same filter
# appears in an earlier cell; this repeat looks like a re-check and could be
# dropped if the table has not changed in between.
c4_panel_v3_1_rows = (
    final_umi_rundf["Panel_version"].eq("CRC_panel version 3.1")
    & final_umi_rundf["amplicon_name"].eq("C4")
)
final_umi_rundf.loc[c4_panel_v3_1_rows]

Unnamed: 0,Run,SampleID,UMI,Panel_version,Name,amplicon_name,serial,uniqueID,filename,FASTQ1,...,connor_fastq,umi_bam,non_umi_bam,count_unmapped_bam,count_connor_annotated,count_connor_fastq,count_umi_bam,count_non_umi_bam,count_UMI_cov_file,small_FASTQ_size
168,R7347,TMC455,Yes,CRC_panel version 3.1,Vi,C4,cfDNA,TMC455_R7347,4-TMC455_S7528-S7728,/mnt/GS-BACKUP05/FASTQ/2025/R7347/zatm/4-TMC45...,...,/workdir/outdir/pipeline_output/SampleSheet_ba...,/workdir/outdir/pipeline_output/SampleSheet_ba...,/workdir/outdir/pipeline_output/SampleSheet_ba...,68,66,0,0,68,0,False
263,R7393,TMC4S1,Yes,CRC_panel version 3.1,Vi,C4,S1,TMC4S1_R7393,11-TMC4S1_S7584-S7784,/mnt/GS-BACKUP05/FASTQ/2025/R7393/zatm/11-TMC4...,...,/workdir/outdir/pipeline_output/SampleSheet_ba...,/workdir/outdir/pipeline_output/SampleSheet_ba...,/workdir/outdir/pipeline_output/SampleSheet_ba...,18,18,0,0,18,0,False
269,R7393,TMC4S2,Yes,CRC_panel version 3.1,Vi,C4,S2,TMC4S2_R7393,17-TMC4S2_S7505-S7705,/mnt/GS-BACKUP05/FASTQ/2025/R7393/zatm/17-TMC4...,...,/workdir/outdir/pipeline_output/SampleSheet_ba...,/workdir/outdir/pipeline_output/SampleSheet_ba...,/workdir/outdir/pipeline_output/SampleSheet_ba...,20,18,0,0,20,0,False
275,R7393,TMC4S3,Yes,CRC_panel version 3.1,Vi,C4,S3,TMC4S3_R7393,23-TMC4S3_S7556-S7756,/mnt/GS-BACKUP05/FASTQ/2025/R7393/zatm/23-TMC4...,...,/workdir/outdir/pipeline_output/SampleSheet_ba...,/workdir/outdir/pipeline_output/SampleSheet_ba...,/workdir/outdir/pipeline_output/SampleSheet_ba...,26,26,0,0,26,0,False
279,R7393,TMC4S4,Yes,CRC_panel version 3.1,Vi,C4,S4,TMC4S4_R7393,29-TMC4S4_S7517-S7717,/mnt/GS-BACKUP05/FASTQ/2025/R7393/zatm/29-TMC4...,...,/workdir/outdir/pipeline_output/SampleSheet_ba...,/workdir/outdir/pipeline_output/SampleSheet_ba...,/workdir/outdir/pipeline_output/SampleSheet_ba...,20,20,0,0,20,0,False
285,R7393,TMC4S5,Yes,CRC_panel version 3.1,Vi,C4,S5,TMC4S5_R7393,35-TMC4S5_S7523-S7723,/mnt/GS-BACKUP05/FASTQ/2025/R7393/zatm/35-TMC4...,...,/workdir/outdir/pipeline_output/SampleSheet_ba...,/workdir/outdir/pipeline_output/SampleSheet_ba...,/workdir/outdir/pipeline_output/SampleSheet_ba...,12,12,0,0,12,0,False
291,R7393,TMC4S6,Yes,CRC_panel version 3.1,Vi,C4,S6,TMC4S6_R7393,41-TMC4S6_S7529-S7729,/mnt/GS-BACKUP05/FASTQ/2025/R7393/zatm/41-TMC4...,...,/workdir/outdir/pipeline_output/SampleSheet_ba...,/workdir/outdir/pipeline_output/SampleSheet_ba...,/workdir/outdir/pipeline_output/SampleSheet_ba...,8,8,0,0,8,0,False
297,R7393,TMC4S7,Yes,CRC_panel version 3.1,Vi,C4,S7,TMC4S7_R7393,47-TMC4S7_S7535-S7735,/mnt/GS-BACKUP05/FASTQ/2025/R7393/zatm/47-TMC4...,...,/workdir/outdir/pipeline_output/SampleSheet_ba...,/workdir/outdir/pipeline_output/SampleSheet_ba...,/workdir/outdir/pipeline_output/SampleSheet_ba...,8,8,0,0,8,0,False
