In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import os
import pathlib
import pysam
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

# ---------------------------------------------------------------------------
# Run configuration: everything a reader may want to tune lives here.
# ---------------------------------------------------------------------------

# Version tag; used as one level of the output directory tree.
output_version = "20250327"

# Root folder scanned for coverage files, and root folder receiving all outputs.
maindir = "/media/hieunguyen/HNHD01/raw_data/targetMethyl_analysis"
outputdir = "/media/hieunguyen/HNSD_mini/outdir"

# Run IDs per PIC value. Only `Vi_runs` is consulted when the metadata table is
# built; any run not listed there is labelled "Truong".
Vi_runs = ["R6801", "R6829Vi", "R6873", "R6914"]
Truong_runs = ["R6782", "R6823", "R6829Truong"]

# Amplicon BED files per PIC (paths are relative to the working directory).
regions = {
    "Vi": "Vi_amplicons.hg19.bed",
    "Truong": "Truong_amplicons.hg19.bed"
}

regions_hg38 = {
    "Vi": "Vi_amplicons.hg38.bed",
    "Truong": "Truong_amplicons.hg38.bed"
}

# Which subset of the metadata to analyse.
mode = "directional"
# mode = "non_directional"

# pic = "Truong"
pic = "Vi"

path_to_main_output = os.path.join(outputdir, "targetMethyl_analysis", output_version, f"{pic}_output", mode)
# Create the directory from Python instead of shelling out to `mkdir -p`:
# no shell quoting problems with unusual paths, and a failure raises here
# instead of being silently ignored until the first write.
os.makedirs(path_to_main_output, exist_ok=True)


print(f"All output are saved at {path_to_main_output}")

# Collect every coverage file produced under <maindir>/target_methylation_*/06*/.
all_cov_files = list(pathlib.Path(maindir).glob("target_methylation_*/06*/*.cov"))

metadata_dict = {
    # Sample name = file name up to the ".no_deduplicated" suffix.
    "filename": [item.name.split(".no_deduplicated")[0] for item in all_cov_files],
    # Raw run label = name of the `target_methylation_*` folder, i.e. the
    # grandparent of the .cov file. Resolving it relative to the file (rather
    # than by a fixed index into the absolute path) keeps this correct when
    # `maindir` is moved to a location with a different depth.
    "Run": [
        item.parent.parent.name.replace("_no_dedup", "").replace("target_methylation_", "")
        for item in all_cov_files
    ],
    "path": [str(item) for item in all_cov_files],
}

metadata = pd.DataFrame.from_dict(metadata_dict, orient="columns")

# The raw run label still carries the mode suffix at this point; derive `mode`
# from it first, then reduce `Run` to the bare run ID (text before the first "_").
metadata["mode"] = metadata["Run"].apply(
    lambda x: "directional" if "_without_non_directional" in x else "non_directional"
)
metadata["Run"] = metadata["Run"].apply(lambda x: x.split("_")[0])
metadata["PIC"] = metadata["Run"].apply(lambda x: "Vi" if x in Vi_runs else "Truong")

# Companion files are located purely by path/name substitution on the .cov path.
metadata["bam_path"] = metadata["path"].apply(
    lambda x: str(x)
    .replace("06_methylation_extract", "05_sorted_bam")
    .replace(".bedGraph.gz.bismark.zero.cov", ".sorted.bam")
)
metadata["CpG_context_read_file"] = metadata["path"].apply(
    lambda x: os.path.join(
        os.path.dirname(x),
        "CpG_context_" + os.path.basename(x).replace(".bedGraph.gz.bismark.zero.cov", ".txt"),
    )
)

# Keep only the samples matching the configured mode and PIC.
metadata = metadata[(metadata["mode"] == mode) & (metadata["PIC"] == pic)]

def _load_amplicon_regions(bed_path, sep):
    """Read an amplicon BED-like file and derive region identifiers.

    Parameters
    ----------
    bed_path : str
        Path to a header-less table whose first three columns are
        chromosome, start and end.
    sep : str
        Field separator passed to ``pd.read_csv``.

    Returns
    -------
    pd.DataFrame
        Columns ``chrom``, ``start``, ``end``, plus
        ``region_name`` (``region_<chrom>_<start>_<end>``) and
        ``bam_region`` (``<chrom without 'chr'>:<start>-<end>``).

    Raises
    ------
    ValueError
        If fewer than three columns are parsed (typically a wrong ``sep``).
    """
    bed = pd.read_csv(bed_path, sep=sep, header=None)
    if bed.shape[1] < 3:
        raise ValueError(
            f"{bed_path}: expected at least 3 columns (chrom, start, end) "
            f"with sep={sep!r}, found {bed.shape[1]}"
        )

    # Keep only the coordinate columns; extra BED fields (name, score, ...) are ignored.
    region_table = bed.iloc[:, :3].copy()
    region_table.columns = ["chrom", "start", "end"]

    # Build the identifiers column-wise instead of row-wise `apply` with
    # positional `x[0]` lookups, which relies on a deprecated pandas fallback.
    chrom = region_table["chrom"].astype(str)
    start = region_table["start"].astype(str)
    end = region_table["end"].astype(str)
    region_table["region_name"] = "region_" + chrom + "_" + start + "_" + end
    region_table["bam_region"] = chrom.str.replace("chr", "", regex=False) + ":" + start + "-" + end
    return region_table


##### regiondf for hg19
regiondf = _load_amplicon_regions(regions[pic], sep="\t")

##### regiondf for hg38
# NOTE(review): the hg38 file is read as comma-separated while the hg19 file is
# tab-separated. Kept as in the original; confirm this matches the actual file.
regiondf_hg38 = _load_amplicon_regions(regions_hg38[pic], sep=",")


In [None]:
# For every sample and every amplicon region: look up the CpGs flagged with
# check_context == "False" in the per-region workbook, then dump the BAM reads
# that overlap each such CpG and are listed for that position in the sample's
# CpG_context file. One workbook is written per CpG; existing ones are skipped,
# so the cell can be re-run to resume.
for run in metadata.Run.unique():
    run_metadata = metadata[metadata["Run"] == run]
    for filename in run_metadata["filename"].unique():
        path_to_save_cov = os.path.join(path_to_main_output, run, filename)

        # Resolve BOTH input files from the row matching run AND filename, so a
        # sample name that occurs in several runs cannot pick another run's BAM.
        sample_row = run_metadata[run_metadata["filename"] == filename].iloc[0]
        bampath = sample_row["bam_path"]
        cpg_context_path = sample_row["CpG_context_read_file"]

        # The CpG-context table depends only on the sample; it is loaded lazily
        # (only if some CpG still needs processing) and at most once per sample.
        checkdf = None

        for region in regiondf.region_name.unique():
            path_to_save_falseCpG_readdf = os.path.join(path_to_save_cov, "FalseCpG_reads", region)
            os.makedirs(path_to_save_falseCpG_readdf, exist_ok=True)

            save_covdf = pd.read_excel(os.path.join(path_to_save_cov, f"{region}.xlsx"))
            # Compare on the string form so the filter works whether the column
            # comes back from Excel as text ("False") or as a real boolean.
            false_cpg = save_covdf[save_covdf["check_context"].astype(str) == "False"].CpG.values
            if len(false_cpg) == 0:
                continue

            for cpg in tqdm(false_cpg):
                # cpg looks like "chr16:22825595-22825596"
                output_xlsx = os.path.join(
                    path_to_save_falseCpG_readdf,
                    f"falseCpG_{cpg.replace(':', '_').replace('-', '_')}.xlsx",
                )
                if os.path.isfile(output_xlsx):
                    continue

                if checkdf is None:
                    # Column 2 (chrom) is forced to str so that contigs such as
                    # X / Y / MT compare the same way as numeric ones.
                    checkdf = pd.read_csv(
                        cpg_context_path, header=None, skiprows=1, sep="\t", dtype={2: str}
                    )
                    checkdf.columns = ["readid", "strand", "chrom", "start", "methyl_context"]

                chrom = cpg.split(":")[0].replace("chr", "")
                start = int(cpg.split(":")[1].split("-")[0])
                end = int(cpg.split(":")[1].split("-")[1])
                bam_region = f"{chrom}:{start}-{end}"

                # Context-file rows are matched at position start + 1
                # (presumably a 0-based -> 1-based shift; confirm against the
                # coordinates used in the coverage workbook).
                false_readid = checkdf[(checkdf["start"] == start + 1) & (checkdf["chrom"] == chrom)]
                false_readid_set = set(false_readid["readid"])

                # Context manager guarantees the BAM handle is closed for every CpG.
                read_records = []
                with pysam.AlignmentFile(bampath, "rb") as bamfile:
                    for read in bamfile.fetch(region=bam_region):
                        if read.query_name in false_readid_set:
                            read_dict = read.to_dict()
                            # Optional tags are not exported.
                            read_dict.pop("tags", None)
                            read_records.append(read_dict)

                # Build the table once from all records (an empty workbook is
                # still written when no read matches, marking the CpG as done).
                false_cpg_readdf = pd.DataFrame(read_records)
                false_cpg_readdf.to_excel(output_xlsx, index=False)


In [None]:
# Show which mode ("directional" / "non_directional") was selected in the configuration cell.
mode


In [None]:
# Inspect the metadata rows of a single run. `run` is not set in this cell: it is
# the variable left behind by the `for run in metadata.Run.unique()` loop above
# (i.e. the last run iterated), so this cell only works after that loop has executed.
metadata[metadata["Run"] == run]