In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import os
import pathlib
import pysam

import warnings
warnings.filterwarnings("ignore")

# Generate metadata

In [2]:
maindir = "/media/hieunguyen/HNHD01/raw_data/targetMethyl_analysis"
outputdir = "/media/hieunguyen/HNSD_mini/outdir"

Vi_runs = ["R6801", "R6829Vi"]
Truong_runs = ["R6782", "R6823", "R6829Truong"]

regions = {
    "Vi": "Vi_amplicons.hg19.bed",
    "Truong": "Truong_amplicons.hg19.bed"
}

# mode = "directional"
mode = "non_directional"

pic = "Truong"
# pic = "Vi"

path_to_main_output = os.path.join(outputdir, "targetMethyl_analysis", f"{pic}_output", mode)
os.system(f"mkdir -p {path_to_main_output}")

all_cov_files = [item for item in pathlib.Path(maindir).glob("target_methylation_*/06*/*.cov")]

metadata_dict = {
    "filename"  : [item.name.split(".no_deduplicated")[0] for item in all_cov_files],
    "Run" : [str(item).split("/")[6].replace("_no_dedup", "").replace("target_methylation_", "") for item in all_cov_files],
    "path": [str(item) for item in all_cov_files]
}

metadata = pd.DataFrame.from_dict(metadata_dict, orient="columns")
metadata["mode"] = metadata["Run"].apply(lambda x: "directional" if "_without_non_directional" in x else "non_directional")
metadata["Run"] = metadata["Run"].apply(lambda x: x.split("_")[0])
metadata["PIC"] = metadata["Run"].apply(lambda x: "Vi" if x in Vi_runs else "Truong")
metadata["bam_path"] = metadata["path"].apply(lambda x: str(x).replace("06_methylation_extract", "05_sorted_bam").replace(".bedGraph.gz.bismark.zero.cov", ".sorted.bam"))
metadata = metadata[(metadata["mode"] == mode) & (metadata["PIC"] == pic)]

regiondf = pd.read_csv(regions[pic], sep = "\t", header = None)
regiondf.columns = ["chrom", "start", "end"]
regiondf = regiondf[["chrom", "start", "end"]]
regiondf["region_name"] = regiondf[["chrom", "start", "end"]].apply(
    lambda x: f"region_{x[0]}_{x[1]}_{x[2]}", axis = 1
)
regiondf["bam_region"] = regiondf[["chrom", "start", "end"]].apply(
    lambda x: f"{x[0].replace('chr', '')}:{x[1]}-{x[2]}", axis = 1
)
metadata



Unnamed: 0,filename,Run,path,mode,PIC,bam_path
6,2-DML4BCDMR__S7536-S7736,R6829Truong,/media/hieunguyen/HNHD01/raw_data/targetMethyl...,non_directional,Truong,/media/hieunguyen/HNHD01/raw_data/targetMethyl...
7,5-DML4BCDMR__S7539-S7739,R6829Truong,/media/hieunguyen/HNHD01/raw_data/targetMethyl...,non_directional,Truong,/media/hieunguyen/HNHD01/raw_data/targetMethyl...
8,6-DML4BCDMR__S7540-S7740,R6829Truong,/media/hieunguyen/HNHD01/raw_data/targetMethyl...,non_directional,Truong,/media/hieunguyen/HNHD01/raw_data/targetMethyl...
9,4-DML4BCDMR__S7538-S7738,R6829Truong,/media/hieunguyen/HNHD01/raw_data/targetMethyl...,non_directional,Truong,/media/hieunguyen/HNHD01/raw_data/targetMethyl...
10,1-DML4BCDMR__S7535-S7735,R6829Truong,/media/hieunguyen/HNHD01/raw_data/targetMethyl...,non_directional,Truong,/media/hieunguyen/HNHD01/raw_data/targetMethyl...
11,3-DML4BCDMR__S7537-S7737,R6829Truong,/media/hieunguyen/HNHD01/raw_data/targetMethyl...,non_directional,Truong,/media/hieunguyen/HNHD01/raw_data/targetMethyl...
25,5-DML4BCDMR__S7505-S7705,R6782,/media/hieunguyen/HNHD01/raw_data/targetMethyl...,non_directional,Truong,/media/hieunguyen/HNHD01/raw_data/targetMethyl...
26,4-DML4BCDMR__S7504-S7704,R6782,/media/hieunguyen/HNHD01/raw_data/targetMethyl...,non_directional,Truong,/media/hieunguyen/HNHD01/raw_data/targetMethyl...
27,1-DML4BCDMR__S7501-S7701,R6782,/media/hieunguyen/HNHD01/raw_data/targetMethyl...,non_directional,Truong,/media/hieunguyen/HNHD01/raw_data/targetMethyl...
28,6-DML4BCDMR__S7506-S7706,R6782,/media/hieunguyen/HNHD01/raw_data/targetMethyl...,non_directional,Truong,/media/hieunguyen/HNHD01/raw_data/targetMethyl...


# Count on/off target reads

In [3]:
metadata["num_total_reads"] = metadata["bam_path"].apply(lambda x: int(pysam.samtools.view("-c", x, catch_stdout=True)))

def count_read_in_region(bam_path, region):
    all_reads = []
    bamfile = pysam.AlignmentFile(bam_path, "rb")
    fetched_obj = bamfile.fetch(region = region)
    for read in fetched_obj:
        all_reads.append(read)
    return(len(all_reads))

for region_name in regiondf.region_name.unique():
    bam_region = regiondf[regiondf["region_name"] == region_name]["bam_region"].values[0]
    metadata[region_name] = metadata["bam_path"].apply(lambda x: count_read_in_region(x, bam_region))
    metadata[f"pct_{region_name}"] = metadata[region_name] / metadata["num_total_reads"] * 100
    
metadata.to_excel(os.path.join(path_to_main_output, f"{pic}_read_count_in_region.xlsx"), index = False)


[W::hts_idx_load3] The index file is older than the data file: /media/hieunguyen/HNHD01/raw_data/targetMethyl_analysis/target_methylation_R6829Truong_no_dedup/05_sorted_bam/2-DML4BCDMR__S7536-S7736.no_deduplicated.sorted.bam.bai
[W::hts_idx_load3] The index file is older than the data file: /media/hieunguyen/HNHD01/raw_data/targetMethyl_analysis/target_methylation_R6829Truong_no_dedup/05_sorted_bam/5-DML4BCDMR__S7539-S7739.no_deduplicated.sorted.bam.bai
[W::hts_idx_load3] The index file is older than the data file: /media/hieunguyen/HNHD01/raw_data/targetMethyl_analysis/target_methylation_R6829Truong_no_dedup/05_sorted_bam/6-DML4BCDMR__S7540-S7740.no_deduplicated.sorted.bam.bai
[W::hts_idx_load3] The index file is older than the data file: /media/hieunguyen/HNHD01/raw_data/targetMethyl_analysis/target_methylation_R6829Truong_no_dedup/05_sorted_bam/4-DML4BCDMR__S7538-S7738.no_deduplicated.sorted.bam.bai
[W::hts_idx_load3] The index file is older than the data file: /media/hieunguyen/HNH

# Process cov file

In [4]:
metadata.head()

Unnamed: 0,filename,Run,path,mode,PIC,bam_path,num_total_reads,region_chr4_187125634_187125723,pct_region_chr4_187125634_187125723
6,2-DML4BCDMR__S7536-S7736,R6829Truong,/media/hieunguyen/HNHD01/raw_data/targetMethyl...,non_directional,Truong,/media/hieunguyen/HNHD01/raw_data/targetMethyl...,477812,477810,99.999581
7,5-DML4BCDMR__S7539-S7739,R6829Truong,/media/hieunguyen/HNHD01/raw_data/targetMethyl...,non_directional,Truong,/media/hieunguyen/HNHD01/raw_data/targetMethyl...,567656,567652,99.999295
8,6-DML4BCDMR__S7540-S7740,R6829Truong,/media/hieunguyen/HNHD01/raw_data/targetMethyl...,non_directional,Truong,/media/hieunguyen/HNHD01/raw_data/targetMethyl...,777456,777456,100.0
9,4-DML4BCDMR__S7538-S7738,R6829Truong,/media/hieunguyen/HNHD01/raw_data/targetMethyl...,non_directional,Truong,/media/hieunguyen/HNHD01/raw_data/targetMethyl...,412002,412002,100.0
10,1-DML4BCDMR__S7535-S7735,R6829Truong,/media/hieunguyen/HNHD01/raw_data/targetMethyl...,non_directional,Truong,/media/hieunguyen/HNHD01/raw_data/targetMethyl...,35342,35342,100.0


In [5]:
regiondf.head()

Unnamed: 0,chrom,start,end,region_name,bam_region
0,chr4,187125634,187125723,region_chr4_187125634_187125723,4:187125634-187125723


In [6]:
for run in metadata.Run.unique():
      for filename in metadata[metadata["Run"] == run]["filename"].unique():
            path_to_save_cov = os.path.join(path_to_main_output, run, filename)
            os.system(f"mkdir -p {path_to_save_cov}")

            covdf = pd.read_csv(metadata[metadata["filename"] == filename]["path"].values[0], header = None, sep = "\t")
            covdf.columns = ["chrom", "start", "end", "meth_density", "countC", "countT"]

            for region in regiondf.region_name.unique():
                  chrom = int(region.split("_")[1].replace("chr", ""))
                  start = int(region.split("_")[2])
                  end = int(region.split("_")[3])

                  covdf[(covdf["chrom"] == chrom) & 
                        (covdf["start"] >= start) & (covdf["start"] <= end)].to_excel(f"{path_to_save_cov}/{region}.xlsx", index = False)
