In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import os
import pathlib
import pysam
import pyfaidx
import warnings
from tqdm import tqdm
warnings.filterwarnings("ignore")
from panel_configs import *
from liftover_TMD450 import *
from helper_functions import *


pic = "Vi"  
for region_version in regions[pic].keys():
    # INPUT
    path_to_all_fa = "/media/hieunguyen/GSHD_HN01/storage/resources/hg19"
    maindir = "/media/hieunguyen/HNHD01/raw_data/targetMethyl_analysis"
    outputdir = "/media/hieunguyen/HNSD01/src/ampliconSeq_UMI/CONTROL114_ref_data"
    # region_version = "CRC_1.1"
    output_version = "20250601"
    control114_cov = "/media/hieunguyen/HNHD01/raw_data/targetMethyl_analysis/CONTROL114/65-CONTROL114CT836T_M550-M750.cg_cov.bed"
    beddir = "../panel_design/beds"

    ##### MAIN RUN
    covdf = pd.read_csv(control114_cov, sep = "\t", header = None)

    regiondf = pd.read_csv(os.path.join(beddir, regions[pic][region_version]), sep = "\t", header = None)
    regiondf.columns = ["chrom", "start", "end", "amplicon_name"]
    regiondf = regiondf[["chrom", "start", "end", "amplicon_name"]]
    regiondf["region_name"] = regiondf[["chrom", "start", "end"]].apply(
        lambda x: f"region_{x[0]}_{x[1]}_{x[2]}", axis = 1
    )
    regiondf["bam_region"] = regiondf[["chrom", "start", "end"]].apply(
        lambda x: f"{x[0].replace('chr', '')}:{x[1]}-{x[2]}", axis = 1
    )

    ##### get list of all real cpg for this panel
    real_cpgdf = pd.DataFrame()
    for region in regiondf.region_name.unique():
        chrom = region.split("_")[1].replace("chr", "")
        start = int(region.split("_")[2])
        end = int(region.split("_")[3])
        refseq = pyfaidx.Fasta(os.path.join(path_to_all_fa, "chr{}.fa".format(chrom)))
        refseq_at_region = str.upper(refseq.get_seq(name = "chr{}".format(chrom), start = start, end = end).seq)

        all_cpg_in_cluster = [m.start(0) for m in re.finditer("CG", refseq_at_region)]
        cpg_coords = [f"chr{chrom}:{item + start}-{item + start + 1}" for item in all_cpg_in_cluster]

        tmp_real_cpgdf = pd.DataFrame(data = cpg_coords, columns = ["CpG"])
        tmp_real_cpgdf["region"] = region
        real_cpgdf = pd.concat([real_cpgdf, tmp_real_cpgdf], axis = 0)

    real_cpgdf = real_cpgdf[["region", "CpG"]]

    tqdm.pandas()

    path_to_save_cov = os.path.join(outputdir, f"region_version_{region_version}")
    os.system(f"mkdir -p {path_to_save_cov}")

    covdfhg19 = liftOver_hg38_to_hg19(covdf)
    covdfhg19["seq"] = covdfhg19[["chrom", "start"]].progress_apply(lambda x: get_refseq(path_to_all_fa= path_to_all_fa, 
                                                                            chrom = x[0], start = x[1], end = x[1] + 1), axis = 1)
    covdfhg19["strand"] = covdfhg19["seq"].apply(lambda x: "+" if x != "CG" else "-")
    covdfhg19_raw = covdfhg19.copy()
    covdfhg19["start"] = covdfhg19[["seq", "start"]].apply(lambda x: x[1] + 1 if x[0] != "CG" else x[1], axis = 1)

    strand_name = {"+": "plus", "-": "minus"}

    for strand in ["+", "-"]:
        for region in regiondf.region_name.unique():
                chrom = region.split("_")[1]
                start = int(region.split("_")[2])
                end = int(region.split("_")[3])
                save_covdf = covdfhg19[(covdfhg19["chrom"] == chrom) & 
                    (covdfhg19["start"] >= start) & (covdfhg19["start"] <= end) & (covdfhg19["strand"] == strand)]
                save_covdf["CpG"] = save_covdf[["chrom", "start"]].apply(lambda x: f"{str(x[0])}:{x[1]}-{x[1] + 1}", axis = 1)
                save_covdf["check_context"] = save_covdf["CpG"].apply(lambda x: "CpG_context" if x in real_cpgdf["CpG"].values else "False")
                save_covdf.to_excel(f"{path_to_save_cov}/{region}_strand_{strand_name[strand]}.xlsx", index = False)
                

100%|██████████| 9439/9439 [00:01<00:00, 7378.42it/s]
100%|██████████| 9439/9439 [00:01<00:00, 8273.92it/s]
100%|██████████| 9439/9439 [00:01<00:00, 8429.04it/s]
100%|██████████| 9439/9439 [00:01<00:00, 8411.08it/s]
100%|██████████| 9439/9439 [00:01<00:00, 8434.62it/s]


In [3]:
covdfhg19.strand.unique()

array(['-', '+'], dtype=object)