In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import os
import pathlib
import pysam

import warnings
# Silences every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides any pandas deprecation or
# chained-assignment warnings that later cells may raise — consider
# narrowing it to specific warning categories.
warnings.filterwarnings("ignore")

# Version tag; used as a sub-directory name when the output path is built.
output_version = "20250318"

# Generate metadata

In [2]:
# Root directory that is searched for the per-run coverage files (input) and
# root directory under which this notebook writes its results (output).
maindir = "/media/hieunguyen/HNHD01/raw_data/targetMethyl_analysis"
outputdir = "/media/hieunguyen/HNSD_mini/outdir"

# Sequencing run identifiers, grouped by PIC (see the `pic` setting below).
Vi_runs = ["R6801", "R6829Vi", "R6873"]
Truong_runs = ["R6782", "R6823", "R6829Truong"]

# Amplicon BED file for each PIC. These are relative paths, so they are
# resolved against the notebook's current working directory.
regions = {
    "Vi": "Vi_amplicons.hg19.bed",
    "Truong": "Truong_amplicons.hg19.bed"
}

# Analysis switches: uncomment the alternative value to change the selection.
# mode = "directional"
mode = "non_directional"

# pic = "Truong"
pic = "Vi"

path_to_main_output = os.path.join(outputdir, "targetMethyl_analysis", output_version, f"{pic}_output", mode)
# Create the directory through the standard library instead of shelling out
# to `mkdir -p`: the path is no longer interpolated unquoted into a shell
# command, and a failure (e.g. missing permissions) raises instead of being
# silently ignored.
os.makedirs(path_to_main_output, exist_ok=True)


print(f"All output are saved at {path_to_main_output}")


All output are saved at /media/hieunguyen/HNSD_mini/outdir/targetMethyl_analysis/20250318/Vi_output/non_directional


In [3]:
# Every coverage file found under <maindir>/target_methylation_*/06*/.
all_cov_files = list(pathlib.Path(maindir).glob("target_methylation_*/06*/*.cov"))

metadata_dict = {
    # Sample name = file name up to the ".no_deduplicated" suffix.
    "filename"  : [item.name.split(".no_deduplicated")[0] for item in all_cov_files],
    # The run folder is two levels above the file (<run folder>/06*/<file>.cov).
    # Reading it from the path structure rather than from a fixed "/"-split
    # index keeps this correct when `maindir` sits at a different depth.
    "Run" : [item.parents[1].name.replace("_no_dedup", "").replace("target_methylation_", "") for item in all_cov_files],
    "path": [str(item) for item in all_cov_files]
}

metadata = pd.DataFrame.from_dict(metadata_dict, orient="columns")
# The folder name (still un-split here) tells directional from non-directional runs.
metadata["mode"] = metadata["Run"].apply(lambda x: "directional" if "_without_non_directional" in x else "non_directional")
# Keep only the leading run identifier (text before the first underscore).
metadata["Run"] = metadata["Run"].apply(lambda x: x.split("_")[0])
# NOTE(review): every run that is not listed in `Vi_runs` is labelled "Truong",
# including runs that are absent from `Truong_runs` — confirm this is intended.
metadata["PIC"] = metadata["Run"].apply(lambda x: "Vi" if x in Vi_runs else "Truong")
# Derive the sorted-BAM path from the coverage path by swapping folder and suffix.
metadata["bam_path"] = metadata["path"].apply(lambda x: str(x).replace("06_methylation_extract", "05_sorted_bam").replace(".bedGraph.gz.bismark.zero.cov", ".sorted.bam"))
# .copy() makes the filtered table an independent frame, so the columns that
# later cells add are written to it directly rather than to a slice of the
# unfiltered table.
metadata = metadata[(metadata["mode"] == mode) & (metadata["PIC"] == pic)].copy()

# Read only the first three BED columns, so files that carry extra columns
# (name, score, ...) load as well; chrom is forced to str for the string
# operations below.
regiondf = pd.read_csv(
    regions[pic],
    sep = "\t",
    header = None,
    usecols = [0, 1, 2],
    names = ["chrom", "start", "end"],
    dtype = {"chrom": str},
)
# Build the labels column-wise instead of row-wise `apply` with x[0]/x[1]/x[2]:
# positional integer access on a label-indexed row is deprecated in recent pandas.
start_txt = regiondf["start"].astype(str)
end_txt = regiondf["end"].astype(str)
regiondf["region_name"] = "region_" + regiondf["chrom"] + "_" + start_txt + "_" + end_txt
# Region string used to query the BAM files, with the "chr" text removed from
# the chromosome name.
# NOTE(review): BED intervals are conventionally 0-based half-open while
# samtools-style region strings are 1-based inclusive; `start` is used
# unchanged here — confirm the coordinate convention of the amplicon files.
regiondf["bam_region"] = regiondf["chrom"].str.replace("chr", "", regex = False) + ":" + start_txt + "-" + end_txt
metadata

Unnamed: 0,filename,Run,path,mode,PIC,bam_path
0,TM010,R6801,/media/hieunguyen/HNHD01/raw_data/targetMethyl...,non_directional,Vi,/media/hieunguyen/HNHD01/raw_data/targetMethyl...
1,TM100,R6801,/media/hieunguyen/HNHD01/raw_data/targetMethyl...,non_directional,Vi,/media/hieunguyen/HNHD01/raw_data/targetMethyl...
2,TM002,R6801,/media/hieunguyen/HNHD01/raw_data/targetMethyl...,non_directional,Vi,/media/hieunguyen/HNHD01/raw_data/targetMethyl...
3,TM050,R6801,/media/hieunguyen/HNHD01/raw_data/targetMethyl...,non_directional,Vi,/media/hieunguyen/HNHD01/raw_data/targetMethyl...
4,TM005,R6801,/media/hieunguyen/HNHD01/raw_data/targetMethyl...,non_directional,Vi,/media/hieunguyen/HNHD01/raw_data/targetMethyl...
5,TM001,R6801,/media/hieunguyen/HNHD01/raw_data/targetMethyl...,non_directional,Vi,/media/hieunguyen/HNHD01/raw_data/targetMethyl...
29,3-TMH3_S7527-S7727,R6873,/media/hieunguyen/HNHD01/raw_data/targetMethyl...,non_directional,Vi,/media/hieunguyen/HNHD01/raw_data/targetMethyl...
30,4-TMH4_S7528-S7728,R6873,/media/hieunguyen/HNHD01/raw_data/targetMethyl...,non_directional,Vi,/media/hieunguyen/HNHD01/raw_data/targetMethyl...
31,11-TMM3_S7535-S7735,R6873,/media/hieunguyen/HNHD01/raw_data/targetMethyl...,non_directional,Vi,/media/hieunguyen/HNHD01/raw_data/targetMethyl...
32,1-TMH1_S7525-S7725,R6873,/media/hieunguyen/HNHD01/raw_data/targetMethyl...,non_directional,Vi,/media/hieunguyen/HNHD01/raw_data/targetMethyl...


# Count on/off target reads

In [4]:
# Total number of records in each BAM file, taken from the output of
# `samtools view -c` (run through pysam) and converted to int.
metadata["num_total_reads"] = [
    int(pysam.samtools.view("-c", bam_file, catch_stdout=True))
    for bam_file in metadata["bam_path"]
]

def count_read_in_region(bam_path, region):
    """Count the records that `fetch` yields for one region of a BAM file.

    Args:
        bam_path: Path to the BAM file; it is opened in "rb" mode.
        region: Region string handed to `AlignmentFile.fetch(region=...)`,
            e.g. "16:22825587-22825729".

    Returns:
        int: Number of records yielded by the fetch.
    """
    # The context manager closes the file handle on exit; this function is
    # called once per sample and region, so unclosed handles would pile up.
    with pysam.AlignmentFile(bam_path, "rb") as bamfile:
        # Count while iterating instead of storing every read in a list.
        return sum(1 for _ in bamfile.fetch(region = region))

# For each distinct region name (first occurrence wins) add two columns to
# `metadata`: the read count in that region and its percentage of all reads.
for region_name, bam_region in (
    regiondf.drop_duplicates(subset = "region_name")[["region_name", "bam_region"]]
    .itertuples(index = False, name = None)
):
    metadata[region_name] = metadata["bam_path"].apply(
        lambda bam_file: count_read_in_region(bam_file, bam_region)
    )
    metadata[f"pct_{region_name}"] = metadata[region_name].div(metadata["num_total_reads"]).mul(100)

# Write the per-sample table (without the index) to the main output folder.
metadata.to_excel(
    os.path.join(path_to_main_output, f"{pic}_read_count_in_region.xlsx"),
    index = False,
)


[W::hts_idx_load3] The index file is older than the data file: /media/hieunguyen/HNHD01/raw_data/targetMethyl_analysis/target_methylation_R6873_Vi_no_dedup/05_sorted_bam/11-TMM3_S7535-S7735.no_deduplicated.sorted.bam.bai
[W::hts_idx_load3] The index file is older than the data file: /media/hieunguyen/HNHD01/raw_data/targetMethyl_analysis/target_methylation_R6873_Vi_no_dedup/05_sorted_bam/1-TMH1_S7525-S7725.no_deduplicated.sorted.bam.bai
[W::hts_idx_load3] The index file is older than the data file: /media/hieunguyen/HNHD01/raw_data/targetMethyl_analysis/target_methylation_R6873_Vi_no_dedup/05_sorted_bam/11-TMM3_S7535-S7735.no_deduplicated.sorted.bam.bai
[W::hts_idx_load3] The index file is older than the data file: /media/hieunguyen/HNHD01/raw_data/targetMethyl_analysis/target_methylation_R6873_Vi_no_dedup/05_sorted_bam/1-TMH1_S7525-S7725.no_deduplicated.sorted.bam.bai
[W::hts_idx_load3] The index file is older than the data file: /media/hieunguyen/HNHD01/raw_data/targetMethyl_analysis

# Process cov file

In [5]:
# Preview the first rows of `metadata`, which at this point also carries the
# per-region read-count and percentage columns.
metadata.head()

Unnamed: 0,filename,Run,path,mode,PIC,bam_path,num_total_reads,region_chr16_22825587_22825729,pct_region_chr16_22825587_22825729,region_chr16_22825770_22825948,...,region_chr16_22826024_22826152,pct_region_chr16_22826024_22826152,region_chr16_22826130_22826286,pct_region_chr16_22826130_22826286,region_chr7_77002659_77002825,pct_region_chr7_77002659_77002825,region_chr7_77002974_77003122,pct_region_chr7_77002974_77003122,region_chr7_77002839_77002998,pct_region_chr7_77002839_77002998
0,TM010,R6801,/media/hieunguyen/HNHD01/raw_data/targetMethyl...,non_directional,Vi,/media/hieunguyen/HNHD01/raw_data/targetMethyl...,26,0,0.0,0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
1,TM100,R6801,/media/hieunguyen/HNHD01/raw_data/targetMethyl...,non_directional,Vi,/media/hieunguyen/HNHD01/raw_data/targetMethyl...,36,16,44.444444,0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
2,TM002,R6801,/media/hieunguyen/HNHD01/raw_data/targetMethyl...,non_directional,Vi,/media/hieunguyen/HNHD01/raw_data/targetMethyl...,32,0,0.0,0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
3,TM050,R6801,/media/hieunguyen/HNHD01/raw_data/targetMethyl...,non_directional,Vi,/media/hieunguyen/HNHD01/raw_data/targetMethyl...,38,14,36.842105,0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
4,TM005,R6801,/media/hieunguyen/HNHD01/raw_data/targetMethyl...,non_directional,Vi,/media/hieunguyen/HNHD01/raw_data/targetMethyl...,10,0,0.0,0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0


In [6]:
# Preview the first rows of the amplicon region table (coordinates, region
# label and the region string used for BAM queries).
regiondf.head()

Unnamed: 0,chrom,start,end,region_name,bam_region
0,chr16,22825587,22825729,region_chr16_22825587_22825729,16:22825587-22825729
1,chr16,22825770,22825948,region_chr16_22825770_22825948,16:22825770-22825948
2,chr16,22825920,22826053,region_chr16_22825920_22826053,16:22825920-22826053
3,chr16,22826024,22826152,region_chr16_22826024_22826152,16:22826024-22826152
4,chr16,22826130,22826286,region_chr16_22826130_22826286,16:22826130-22826286


In [7]:
# For every sample, export the coverage rows that fall inside each amplicon
# region: one Excel file per (run, sample, region) under
# <path_to_main_output>/<run>/<sample>/<region_name>.xlsx.
for run in metadata["Run"].unique():
    run_samples = metadata[metadata["Run"] == run]
    for filename in run_samples["filename"].unique():
        path_to_save_cov = os.path.join(path_to_main_output, run, filename)
        # Standard-library directory creation: no shell involved, and a
        # failure raises instead of being ignored.
        os.makedirs(path_to_save_cov, exist_ok = True)

        # Look the file up within the current run only, so a sample name that
        # occurs in several runs cannot resolve to another run's file.
        cov_path = run_samples.loc[run_samples["filename"] == filename, "path"].values[0]
        # Read the chromosome column as text: with dtype inference it becomes
        # int or str depending on the file content, and an int-vs-str
        # comparison would silently match nothing.
        covdf = pd.read_csv(cov_path, header = None, sep = "\t", dtype = {0: str})
        covdf.columns = ["chrom", "start", "end", "meth_density", "countC", "countT"]

        # Take the coordinates straight from `regiondf` instead of re-parsing
        # them out of the region name; also works for non-numeric chromosomes
        # such as chrX, which int() could not handle.
        for chrom, start, end, region in (
            regiondf.drop_duplicates(subset = "region_name")[["chrom", "start", "end", "region_name"]]
            .itertuples(index = False, name = None)
        ):
            # The chromosome is compared without the "chr" text, as before.
            chrom = str(chrom).replace("chr", "")
            # NOTE(review): both bounds are inclusive and the BED start is used
            # as-is; confirm this matches the coordinate convention of the
            # coverage files.
            in_region = (
                (covdf["chrom"] == chrom)
                & (covdf["start"] >= start)
                & (covdf["start"] <= end)
            )
            covdf[in_region].to_excel(f"{path_to_save_cov}/{region}.xlsx", index = False)
