In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import os
import pathlib
import pysam
import pyfaidx
import warnings
import re
import argparse
import sys
from tqdm import tqdm 
tqdm.pandas()
warnings.filterwarnings("ignore")
from panel_configs import *
from helper_functions import *

inputdir = "/media/hieunguyen/HNHD01/raw_data/targetMethyl_analysis/target_methylation_R6782_no_dedup_without_non_directional/06_methylation_extract"
outputdir = "/media/hieunguyen/HNHD01/outdir/ampliconSeq/data_analysis/test_get_methylation_data"
path_to_all_fa = "/media/hieunguyen/GSHD_HN01/storage/resources/hg19"
output_version = "20250601"
mode = "directional"
run = "R6782"
region_version = "CRC_1.1"
pic = "Vi"
path_to_control450 = "/media/hieunguyen/HNSD01/src/ampliconSeq_UMI/CONTROL114_ref_data"
beddir = "/media/hieunguyen/HNSD01/src/ampliconSeq_UMI/panel_design/beds"

print(f'Input directory containing all .cov files for this run: {inputdir}')

path_to_main_output = os.path.join(outputdir,
                                "targetMethyl_analysis", 
                                output_version, 
                                f"{pic}_output", 
                                mode, 
                                f"region_version_{region_version}", 
                                run)

os.system(f"mkdir -p {path_to_main_output}")

all_cov_files = [item for item in pathlib.Path(inputdir).glob("*.cov")]

print(f"Number of samples in this run {run}: {len(all_cov_files)}")

metadata_dict = {
    "filename"  : [item.name.split(".no_deduplicated")[0] for item in all_cov_files],
    # "Run" : [str(item).split("/")[6].replace("_no_dedup", "").replace("target_methylation_", "") for item in all_cov_files],
    "Run" : [str(item).split("/")[-3] for item in all_cov_files],
    "path": [str(item) for item in all_cov_files]
}

metadata = pd.DataFrame.from_dict(metadata_dict, orient="columns")
metadata["mode"] = metadata["Run"].apply(lambda x: "directional" if "_without_non_directional" in x else "non_directional")
metadata["Run"] = run
metadata["PIC"] = pic
metadata["bam_path"] = metadata["path"].apply(lambda x: str(x)\
                                                .replace("06_methylation_extract", "05_sorted_bam")\
                                                .replace(".bedGraph.gz.bismark.zero.cov", ".sorted.bam"))
metadata = metadata[(metadata["mode"] == mode) & (metadata["PIC"] == pic)]
print(metadata)
metadata.to_excel(os.path.join(path_to_main_output, "metadata.xlsx"), index = False)

##### regiondf for hg19
regiondf = pd.read_csv(os.path.join(beddir, regions[pic][region_version]), sep = "\t", header = None)
regiondf.columns = ["chrom", "start", "end", "amplicon_name"]
regiondf = regiondf[["chrom", "start", "end", "amplicon_name"]]
regiondf["region_name"] = regiondf[["chrom", "start", "end"]].apply(
    lambda x: f"region_{x[0]}_{x[1]}_{x[2]}", axis = 1
)
regiondf["bam_region"] = regiondf[["chrom", "start", "end"]].apply(
    lambda x: f"{x[0].replace('chr', '')}:{x[1]}-{x[2]}", axis = 1
)

##### get list of all real cpg for this panel
cpgdf = pd.DataFrame()
for region in regiondf.region_name.unique():
    chrom = region.split("_")[1].replace("chr", "")
    start = int(region.split("_")[2])
    end = int(region.split("_")[3])
    refseq = pyfaidx.Fasta(os.path.join(path_to_all_fa, "chr{}.fa".format(chrom)))
    refseq_at_region = str.upper(refseq.get_seq(name = "chr{}".format(chrom), start = start, end = end).seq)

    all_cpg_in_cluster = [m.start(0) for m in re.finditer("CG", refseq_at_region)]
    cpg_coords = [f"chr{chrom}:{item + start}-{item + start + 1}" for item in all_cpg_in_cluster]

    tmp_cpgdf = pd.DataFrame(data = cpg_coords, columns = ["CpG"])
    tmp_cpgdf["region"] = region
    cpgdf = pd.concat([cpgdf, tmp_cpgdf], axis = 0)

cpgdf = cpgdf[["region", "CpG"]]

cpgdf.to_excel(os.path.join(path_to_main_output, f"{pic}_panel_correct_cpgdf.xlsx"), index = False)

###### count on/off target reads
metadata["num_total_reads"] = metadata["bam_path"].apply(lambda x: int(pysam.samtools.view("-c", x, catch_stdout=True)))

if os.path.isfile(os.path.join(path_to_main_output, f"read_count_in_region.xlsx")) == False:
    for region_name in regiondf.region_name.unique():
        bam_region = regiondf[regiondf["region_name"] == region_name]["bam_region"].values[0]
        metadata[region_name] = metadata["bam_path"].apply(lambda x: count_read_in_region(x, bam_region))
        metadata[f"pct_{region_name}"] = metadata[region_name] / metadata["num_total_reads"] * 100
        
    metadata.to_excel(os.path.join(path_to_main_output, f"read_count_in_region.xlsx"), index = False)
else:
    print("File already exists, skip counting reads in regions. Reading existing data... ")
    metadata = pd.read_excel(os.path.join(path_to_main_output, f"read_count_in_region.xlsx"))

##### Process the main COV filesss
all_covdf = dict()
for run in metadata.Run.unique():
    for filename in metadata[metadata["Run"] == run]["filename"].unique():
            path_to_save_cov = os.path.join(path_to_main_output, filename)
            os.system(f"mkdir -p {path_to_save_cov}")

            covdf = pd.read_csv(metadata[metadata["filename"] == filename]["path"].values[0], header = None, sep = "\t")
            covdf.columns = ["chrom", "start", "end", "meth_density", "countC", "countT"]
            covdf = covdf[covdf["chrom"].isin(["X", "Y", "MT"]) == False]
            covdf["seq"] = covdf[["chrom", "start"]].progress_apply(lambda x: get_refseq(path_to_all_fa= path_to_all_fa, 
                                                                    chrom = x[0], start = x[1], end = x[1] + 1), axis = 1)
            covdf["strand"] = covdf["seq"].apply(lambda x: "+" if x != "CG" else "-")
            covdf_raw = covdf.copy()
            covdf_raw.to_excel(f"{path_to_save_cov}/{filename}_before_modifying_start_coords.xlsx", index = False)
            covdf["start"] = covdf[["seq", "start"]].apply(lambda x: x[1] + 1 if x[0] != "CG" else x[1], axis = 1)

            all_covdf[filename] = covdf
            covdf["chrom"] = covdf["chrom"].apply(lambda x: str(x))
            strand_name = {"+": "plus", "-": "minus"}
            for strand in ["+", "-"]:
                for region in regiondf.region_name.unique():
                    chrom = str(region.split("_")[1].replace("chr", ""))
                    start = int(region.split("_")[2])
                    end = int(region.split("_")[3])
                    save_covdf = covdf[(covdf["chrom"] == chrom) & 
                            (covdf["start"] >= start) & 
                            (covdf["start"] <= end) & 
                            (covdf["strand"] == strand)]
                    save_covdf["CpG"] = save_covdf[["chrom", "start"]].apply(lambda x: f"chr{str(x[0])}:{x[1]}-{x[1] + 1}", axis = 1)
                    save_covdf["check_context"] = save_covdf["CpG"].apply(lambda x: "CpG_context" if x in cpgdf["CpG"].values else "False")
                    save_covdf.to_excel(f"{path_to_save_cov}/{region}_{strand_name[strand]}.xlsx", index = False)

mean_methyldf = pd.DataFrame(data = regiondf.region_name.unique(), columns = ["region"])
def get_mean_methyl_in_region(region, filename, strand, remove_non_cpg = True):
    '''
    Calculate average methylation density in a region for a given filename and strand.
    If remove_non_cpg is True, it will only keep CpG context sites and remove others.
    '''
    strand_name = {"+": "plus", "-": "minus"}
    if filename == "CONTROL114":
        df = pd.read_excel(f"{path_to_control450}/region_version_{region_version}/{region}_strand_{strand_name[strand]}.xlsx")
        run = "R6447"
    else:
        run = metadata[metadata["filename"] == filename]["Run"].values[0]
        df = pd.read_excel(f"{os.path.join(path_to_main_output, filename)}/{region}_{strand_name[strand]}.xlsx")
    if df.shape[0] == 0:
        return "no data available"
    else:
        if remove_non_cpg:
            # keep only CpG context Cs
            df = df[df["check_context"] == "CpG_context"]
        df_strand = df[df["strand"] == strand]
        if df_strand.shape[0] == 0:
            mean_methyl = f"no read in this region at strand {strand}"
        else:
            mean_methyl = df_strand.meth_density.mean()
        return mean_methyl

for filename in list(all_covdf.keys()) + ["CONTROL114"]:
    for strand in ["+", "-"]:
        mean_methyldf[f"{filename}_{strand}"] = mean_methyldf["region"].apply(lambda x: get_mean_methyl_in_region(x, filename, strand = strand, remove_non_cpg = True))

mean_methyldf.to_excel(os.path.join(path_to_main_output, f"mean_methyl_in_region_compare_CONTROL114.xlsx"), index = False)

Input directory containing all .cov files for this run: /media/hieunguyen/HNHD01/raw_data/targetMethyl_analysis/target_methylation_R6782_no_dedup_without_non_directional/06_methylation_extract
Number of samples in this run R6782: 6
                   filename    Run  \
0  5-DML4BCDMR__S7505-S7705  R6782   
1  4-DML4BCDMR__S7504-S7704  R6782   
2  1-DML4BCDMR__S7501-S7701  R6782   
3  6-DML4BCDMR__S7506-S7706  R6782   
4  2-DML4BCDMR__S7502-S7702  R6782   
5  3-DML4BCDMR__S7503-S7703  R6782   

                                                path         mode PIC  \
0  /media/hieunguyen/HNHD01/raw_data/targetMethyl...  directional  Vi   
1  /media/hieunguyen/HNHD01/raw_data/targetMethyl...  directional  Vi   
2  /media/hieunguyen/HNHD01/raw_data/targetMethyl...  directional  Vi   
3  /media/hieunguyen/HNHD01/raw_data/targetMethyl...  directional  Vi   
4  /media/hieunguyen/HNHD01/raw_data/targetMethyl...  directional  Vi   
5  /media/hieunguyen/HNHD01/raw_data/targetMethyl...  direction

100%|██████████| 8/8 [00:00<00:00, 2935.13it/s]
100%|██████████| 8/8 [00:00<00:00, 3594.48it/s]
100%|██████████| 7/7 [00:00<00:00, 3888.24it/s]
100%|██████████| 8/8 [00:00<00:00, 3788.46it/s]
100%|██████████| 8/8 [00:00<00:00, 4280.99it/s]
100%|██████████| 7/7 [00:00<00:00, 3659.49it/s]


5-DML4BCDMR__S7505-S7705
5-DML4BCDMR__S7505-S7705
5-DML4BCDMR__S7505-S7705
5-DML4BCDMR__S7505-S7705
5-DML4BCDMR__S7505-S7705
5-DML4BCDMR__S7505-S7705
5-DML4BCDMR__S7505-S7705
5-DML4BCDMR__S7505-S7705
5-DML4BCDMR__S7505-S7705
5-DML4BCDMR__S7505-S7705
5-DML4BCDMR__S7505-S7705
5-DML4BCDMR__S7505-S7705
5-DML4BCDMR__S7505-S7705
5-DML4BCDMR__S7505-S7705
5-DML4BCDMR__S7505-S7705
5-DML4BCDMR__S7505-S7705
4-DML4BCDMR__S7504-S7704
4-DML4BCDMR__S7504-S7704
4-DML4BCDMR__S7504-S7704
4-DML4BCDMR__S7504-S7704
4-DML4BCDMR__S7504-S7704
4-DML4BCDMR__S7504-S7704
4-DML4BCDMR__S7504-S7704
4-DML4BCDMR__S7504-S7704
4-DML4BCDMR__S7504-S7704
4-DML4BCDMR__S7504-S7704
4-DML4BCDMR__S7504-S7704
4-DML4BCDMR__S7504-S7704
4-DML4BCDMR__S7504-S7704
4-DML4BCDMR__S7504-S7704
4-DML4BCDMR__S7504-S7704
4-DML4BCDMR__S7504-S7704
1-DML4BCDMR__S7501-S7701
1-DML4BCDMR__S7501-S7701
1-DML4BCDMR__S7501-S7701
1-DML4BCDMR__S7501-S7701
1-DML4BCDMR__S7501-S7701
1-DML4BCDMR__S7501-S7701
1-DML4BCDMR__S7501-S7701
1-DML4BCDMR__S7501-S7701


In [2]:
mean_methyldf

Unnamed: 0,region,5-DML4BCDMR__S7505-S7705_+,5-DML4BCDMR__S7505-S7705_-,4-DML4BCDMR__S7504-S7704_+,4-DML4BCDMR__S7504-S7704_-,1-DML4BCDMR__S7501-S7701_+,1-DML4BCDMR__S7501-S7701_-,6-DML4BCDMR__S7506-S7706_+,6-DML4BCDMR__S7506-S7706_-,2-DML4BCDMR__S7502-S7702_+,2-DML4BCDMR__S7502-S7702_-,3-DML4BCDMR__S7503-S7703_+,3-DML4BCDMR__S7503-S7703_-,CONTROL114_+,CONTROL114_-
0,region_chr16_22825587_22825729,no data available,no data available,no data available,no data available,no data available,no data available,no data available,no data available,no data available,no data available,no data available,no data available,no data available,2.083333
1,region_chr16_22825770_22825948,no data available,no data available,no data available,no data available,no data available,no data available,no data available,no data available,no data available,no data available,no data available,no data available,no data available,8.484848
2,region_chr16_22825920_22826053,no data available,no data available,no data available,no data available,no data available,no data available,no data available,no data available,no data available,no data available,no data available,no data available,no data available,4.886574
3,region_chr16_22826024_22826152,no data available,no data available,no data available,no data available,no data available,no data available,no data available,no data available,no data available,no data available,no data available,no data available,no data available,5.354811
4,region_chr16_22826130_22826286,no data available,no data available,no data available,no data available,no data available,no data available,no data available,no data available,no data available,no data available,no data available,no data available,no data available,4.304348
5,region_chr17_75369424_75369590,no data available,no data available,no data available,no data available,no data available,no data available,no data available,no data available,no data available,no data available,no data available,no data available,no data available,0.0
6,region_chr17_75369739_75369887,no data available,no data available,no data available,no data available,no data available,no data available,no data available,no data available,no data available,no data available,no data available,no data available,no data available,2.792632
7,region_chr17_75369604_75369763,no data available,no data available,no data available,no data available,no data available,no data available,no data available,no data available,no data available,no data available,no data available,no data available,no data available,0.892857
