In [None]:
import pandas as pd
import numpy as np
import pathlib 
import os
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Root of the project source tree.
path_to_main_src = pathlib.Path("/media/hieunguyen/HNSD01/src/tmd_features")

# Version tags: the first selects the input data set, the second names the
# output folder of this run.
data_version = "TMD_cov"
output_version = "20240907"

# All inputs and outputs hang off this directory.
outdir = pathlib.Path("/media/hieunguyen/HNSD_mini/outdir")
path_to_input = outdir.joinpath("raw_data", "bismark_cov", data_version)
path_to_main_output = outdir.joinpath(
    "TMD_read_based_features", "output", f"data_{data_version}", output_version
)

# Folder holding the differential-test results; created up front (together
# with any missing parents) so later reads/writes do not fail on a fresh run.
path_to_difftest_output = path_to_main_output.joinpath("difftest_output")
path_to_difftest_output.mkdir(parents=True, exist_ok=True)

path_to_save_QC_output = path_to_main_output.joinpath("QC")

# Cut-offs on a read's alpha value: below thres_hypo the read is treated as
# "hypo", above thres_hyper as "hyper".
thres_hypo = 0.3
thres_hyper = 0.6

# input_cancer_class = "Liver"
# Everything in this preamble is identical for every cancer class, so it is
# prepared once instead of being redone on each pass of the loop (the metadata
# workbook in particular is expensive to parse).
metadata = pd.read_excel("metadata_cfDNA_lowpdepth_TMD_bam_cov.xlsx")

path_to_read_data = "/media/hieunguyen/GSHD_HN01/raw_data/reads_from_450_regions"

path_to_py01_output = os.path.join(path_to_main_output, "py01_output")
# os.makedirs needs no shell, copes with spaces in the path and raises on
# failure instead of silently returning a non-zero exit status.
os.makedirs(path_to_py01_output, exist_ok=True)

# One CSV of reads per sample.
all_read_files = list(pathlib.Path(path_to_read_data).glob("*.csv"))

for input_cancer_class in ["Liver", "Breast", "CRC", "Lung", "Gastric"]:
    # Region summary written by the differential test for this cancer class;
    # its "CpG" and "region_type" columns drive the read classification.
    check_region = pd.read_excel(
        path_to_difftest_output / f"diff_test_{input_cancer_class}_region_summary.xlsx"
    )

    def classify_read(alpha, region):
        """Label one read as a candidate ("candi") or not ("none").

        The region's type is taken from the first row of ``check_region``
        whose "CpG" equals ``region``. In a "hypo" region a read is a
        candidate when ``alpha`` is below ``thres_hypo``; in a "hyper" region
        when ``alpha`` is above ``thres_hyper``. Any other region type yields
        ``None``.

        Raises:
            IndexError: if ``region`` does not occur in ``check_region``.
        """
        matching_types = check_region.loc[check_region["CpG"] == region, "region_type"]
        kind = matching_types.values[0]

        if kind == "hypo":
            return "candi" if alpha < thres_hypo else "none"
        if kind == "hyper":
            return "candi" if alpha > thres_hyper else "none"
        # Neither hypo nor hyper: no classification.
        return None

    def assign_read_type(x, thres_hypo, thres_hyper):
        """Return "hypo", "hyper" or "none" for the value ``x``.

        ``x`` strictly below ``thres_hypo`` gives "hypo" (checked first),
        strictly above ``thres_hyper`` gives "hyper", anything else --
        including values equal to a threshold -- gives "none".
        """
        if x < thres_hypo:
            return "hypo"
        return "hyper" if x > thres_hyper else "none"

    # Sample -> label lookup, built once per class. drop_duplicates keeps the
    # first row per SampleID, i.e. the same row a ".values[0]" lookup returns.
    sample_labels = (
        metadata.drop_duplicates("SampleID").set_index("SampleID")["Label"].to_dict()
    )
    # Read files whose sample is absent from the metadata are skipped rather
    # than aborting the whole run with an IndexError.
    class_files = [
        item
        for item in all_read_files
        if sample_labels.get(item.name.split(".")[0]) == input_cancer_class
    ]
    known_regions = check_region["CpG"].unique()

    for file in tqdm(class_files):
        sample_id = file.name.split(".")[0]
        path_to_candi_reads = os.path.join(
            path_to_py01_output, "{}.candi_reads.csv".format(sample_id)
        )

        # Results are cached per sample; never recompute an existing file.
        if os.path.isfile(path_to_candi_reads):
            print(" File {} exists".format(path_to_candi_reads))
            continue

        tmpdf = pd.read_csv(file, sep=",", index_col=0)

        # Bring region names to the "chr_start_end" form used by check_region.
        tmpdf["region"] = (
            tmpdf["region"]
            .str.replace(":", "_", regex=False)
            .str.replace("-", "_", regex=False)
        )
        # .copy() so the column assignments below act on an independent frame
        # and not on a slice of the one read from disk.
        tmpdf = tmpdf[tmpdf["region"].isin(known_regions)].copy()

        tmpdf["read_type"] = tmpdf["alpha"].apply(
            lambda x: assign_read_type(x, thres_hypo, thres_hyper)
        )

        # Pair the two columns explicitly instead of indexing each row Series
        # by position, which pandas 2.x deprecates for labelled Series.
        tmpdf["read_classification"] = [
            classify_read(alpha, region)
            for alpha, region in zip(tmpdf["alpha"], tmpdf["region"])
        ]

        # Reads per (region, classification), one row per region. Reindexing
        # guarantees both columns exist even when a sample has no "candi" (or
        # no "none") reads, which would otherwise raise KeyError below.
        count_candi_reads = (
            tmpdf.groupby(["region", "read_classification"])["methyl_string"]
            .count()
            .reset_index()
            .pivot(index="region", columns="read_classification", values="methyl_string")
            .reindex(columns=["candi", "none"])
            .fillna(0)
        )
        # Fraction of candidate reads among all classified reads of a region.
        count_candi_reads["fraction"] = count_candi_reads["candi"] / (
            count_candi_reads["candi"] + count_candi_reads["none"]
        )

        count_candi_reads.to_csv(path_to_candi_reads)

  6%|▋         | 3/47 [01:02<16:10, 22.06s/it]

Unnamed: 0,SampleID,TM_BAM,Label,Set,Run_TM,Run_GW,TM_COV
0,MYAAAA03,/mnt/GS_NAS05/ECDTM_repo/R4551/align/14-MYAAAA...,Breast,Recent_unseen2_GW_direct,R4551,R4551,/mnt/GS_NAS05/ECDTM_repo/R4551/methylation_ext...
1,MYAAAA06,/mnt/GS_NAS05/ECDTM_repo/R4551/align/62-MYAAAA...,Breast,Recent_unseen2_GW_direct,R4551,R4551,/mnt/GS_NAS05/ECDTM_repo/R4551/methylation_ext...
2,MYAAAA08,/mnt/GS_NAS05/ECDTM_repo/R4551/align/16-MYAAAA...,Breast,Recent_unseen2_GW_direct,R4551,R4551,/mnt/GS_NAS05/ECDTM_repo/R4551/methylation_ext...
3,MYAAAA10,/mnt/GS_NAS05/ECDTM_repo/R4551/align/64-MYAAAA...,Breast,Recent_unseen2_GW_direct,R4551,R4551,/mnt/GS_NAS05/ECDTM_repo/R4551/methylation_ext...
4,MYAAAA11,/mnt/GS_NAS05/ECDTM_repo/R4563/align/43-MYAAAA...,Breast,Recent_unseen2_GW_direct,R4563,R4563,/mnt/GS_NAS05/ECDTM_repo/R4563/methylation_ext...
...,...,...,...,...,...,...,...
7381,K0AAAL02,/mnt/BACKUP_NAS05/ECDTM_repo/R4823/align/103-K...,Control,Recent_unseen4,R4823,R4823,/mnt/BACKUP_NAS05/ECDTM_repo/R4823/methylation...
7382,K0AAAL05,/mnt/GS_NAS05/ECDTM_repo/R4823/align/106-K0AAA...,Control,Recent_unseen4,R4823,R4823,/mnt/GS_NAS05/ECDTM_repo/R4823/methylation_ext...
7383,K0AAAL06,/mnt/GS_NAS05/ECDTM_repo/R4823/align/107-K0AAA...,Control,Recent_unseen4,R4823,R4823,/mnt/GS_NAS05/ECDTM_repo/R4823/methylation_ext...
7384,K0AAAL07,/mnt/BACKUP_NAS05/ECDTM_repo/R4823/align/108-K...,Control,Recent_unseen4,R4823,R4823,/mnt/BACKUP_NAS05/ECDTM_repo/R4823/methylation...
