In [2]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import os
import pathlib
import pysam

import warnings
warnings.filterwarnings("ignore")

# ----- Run configuration ---------------------------------------------------
# Tag used as one level of the output directory tree.
output_version = "test"

# Root folder that is searched for pipeline results, and root folder for
# everything this notebook writes.
maindir = "/media/hieunguyen/HNHD01/raw_data/targetMethyl_analysis"
outputdir = "/media/hieunguyen/HNSD_mini/outdir"

# Run identifiers belonging to each PIC label ("Vi" / "Truong").
Vi_runs = ["R6801", "R6829Vi", "R6873", "R6914"]
Truong_runs = ["R6782", "R6823", "R6829Truong"]

# Amplicon region file per PIC label, one mapping per genome build.
# The file names are relative, so they are resolved against the working directory.
regions = {owner: f"{owner}_amplicons.hg19.bed" for owner in ("Vi", "Truong")}
regions_hg38 = {owner: f"{owner}_amplicons.hg38.bed" for owner in ("Vi", "Truong")}

# Mode to analyse; the alternative value is "directional".
mode = "non_directional"

# PIC label to analyse; the alternative value is "Truong".
pic = "Vi"

# Results are written to <outputdir>/targetMethyl_analysis/<output_version>/<pic>_output/<mode>.
path_to_main_output = os.path.join(outputdir, "targetMethyl_analysis", output_version, f"{pic}_output", mode)

# Create the directory tree in-process rather than shelling out to `mkdir -p`:
# this is safe for paths containing spaces or shell metacharacters and raises
# an OSError on failure instead of silently continuing.
os.makedirs(path_to_main_output, exist_ok=True)

print(f"All output are saved at {path_to_main_output}")

# Collect every coverage file matching <maindir>/target_methylation_*/06*/*.cov.
all_cov_files = list(pathlib.Path(maindir).glob("target_methylation_*/06*/*.cov"))

metadata_dict = {
    # Sample name: the file name up to the ".no_deduplicated" suffix.
    "filename": [item.name.split(".no_deduplicated")[0] for item in all_cov_files],
    # Run label: name of the "target_methylation_*" directory, i.e. two levels
    # above the file. Taking it from the path object (instead of a fixed index
    # into a "/"-split string) keeps this correct wherever maindir is located.
    "Run": [
        item.parent.parent.name.replace("_no_dedup", "").replace("target_methylation_", "")
        for item in all_cov_files
    ],
    "path": [str(item) for item in all_cov_files],
}

metadata = pd.DataFrame.from_dict(metadata_dict, orient="columns")

# The mode has to be derived BEFORE "Run" is truncated to its first "_" token,
# because the "_without_non_directional" marker lives in the part that is cut off.
metadata["mode"] = metadata["Run"].apply(
    lambda x: "directional" if "_without_non_directional" in x else "non_directional"
)
metadata["Run"] = metadata["Run"].apply(lambda x: x.split("_")[0])

# Every run that is not listed in Vi_runs is labelled "Truong".
metadata["PIC"] = metadata["Run"].apply(lambda x: "Vi" if x in Vi_runs else "Truong")

# Derive the BAM location from the coverage-file path by swapping the
# directory name and the file suffix.
metadata["bam_path"] = metadata["path"].apply(
    lambda x: str(x)
    .replace("06_methylation_extract", "05_sorted_bam")
    .replace(".bedGraph.gz.bismark.zero.cov", ".sorted.bam")
)

# Keep only the samples of the configured mode and PIC.
metadata = metadata[(metadata["mode"] == mode) & (metadata["PIC"] == pic)]

def _load_amplicon_regions(region_path, sep):
    """Read a header-less three-column region table and add region identifiers.

    Parameters
    ----------
    region_path : str
        Path to a file whose columns are chrom, start, end (no header row).
    sep : str
        Field separator used by the file.

    Returns
    -------
    pandas.DataFrame
        Columns ``chrom``, ``start``, ``end``, plus
        ``region_name`` = ``region_<chrom>_<start>_<end>`` and
        ``bam_region`` = ``<chrom with "chr" removed>:<start>-<end>``.
    """
    region_table = pd.read_csv(region_path, sep=sep, header=None)
    region_table.columns = ["chrom", "start", "end"]

    # Build the labels column-wise with explicit column names. The previous
    # row-wise lambdas indexed each row with x[0]/x[1]/x[2], i.e. by position
    # on a label-indexed Series, which pandas has deprecated.
    chrom = region_table["chrom"].astype(str)
    start = region_table["start"].astype(str)
    end = region_table["end"].astype(str)

    region_table["region_name"] = "region_" + chrom + "_" + start + "_" + end
    region_table["bam_region"] = chrom.str.replace("chr", "", regex=False) + ":" + start + "-" + end
    return region_table


##### regiondf for hg19 (tab-separated file)
regiondf = _load_amplicon_regions(regions[pic], sep="\t")

##### regiondf for hg38
# NOTE(review): this file is read as comma-separated while the hg19 file is
# tab-separated; confirm that the difference is intentional.
regiondf_hg38 = _load_amplicon_regions(regions_hg38[pic], sep=",")


# Tab-separated table without a header row; its first column (0) is compared
# against read names further down in the notebook.
path_to_check_list = "/media/hieunguyen/HNHD01/raw_data/targetMethyl_analysis/target_methylation_R6914_Vi_no_dedup_without_non_directional/tmp/check_list.txt"
checkdf = pd.read_csv(path_to_check_list, sep="\t", header=None)

All output are saved at /media/hieunguyen/HNSD_mini/outdir/targetMethyl_analysis/test/Vi_output/non_directional


In [13]:
import pyfaidx
import re
# Folder holding one FASTA file per chromosome, named chr<N>.fa.
path_to_all_fa = "/media/hieunguyen/GSHD_HN01/storage/resources/hg19"

# One row per distinct region, in order of first appearance. Coordinates are read
# from the regiondf columns instead of being re-parsed from the
# "region_<chrom>_<start>_<end>" label, which breaks for names containing "_".
unique_regions = regiondf.drop_duplicates(subset="region_name")[["chrom", "start", "end", "region_name"]]

# One FASTA handle per chromosome, reused across regions so the same file is not
# re-opened for every region. Handles are left open on purpose so `refseq` remains
# usable after this cell; call .close() on the values when they are no longer needed.
fasta_by_chrom = {}

# Per-region tables are collected here and concatenated once after the loop
# (concatenating inside the loop re-copies all previous rows on every iteration).
cpg_tables = []

for chrom, start, end, region in unique_regions.itertuples(index=False, name=None):
    chrom = str(chrom).replace("chr", "")
    start = int(start)
    end = int(end)

    if chrom not in fasta_by_chrom:
        fasta_by_chrom[chrom] = pyfaidx.Fasta(os.path.join(path_to_all_fa, "chr{}.fa".format(chrom)))
    refseq = fasta_by_chrom[chrom]

    # NOTE(review): start/end are passed to get_seq unchanged; confirm that the
    # coordinate convention of the region file matches what get_seq expects.
    refseq_at_region = refseq.get_seq(name = "chr{}".format(chrom), start = start, end = end).seq.upper()

    # Offsets (0-based, within the fetched sequence) of every "CG" dinucleotide.
    all_cpg_in_cluster = [m.start(0) for m in re.finditer("CG", refseq_at_region)]
    cpg_coords = [f"chr{chrom}:{item + start - 1}-{item + start}" for item in all_cpg_in_cluster]

    tmp_cpgdf = pd.DataFrame(data = cpg_coords, columns = ["CpG"])
    tmp_cpgdf["region"] = region
    cpg_tables.append(tmp_cpgdf)

if cpg_tables:
    cpgdf = pd.concat(cpg_tables, axis = 0)[["region", "CpG"]]
else:
    # No regions at all: return an empty table with the expected columns
    # instead of failing on the column selection.
    cpgdf = pd.DataFrame(columns = ["region", "CpG"])

In [46]:
# Sample to inspect and the 2-bp window on chromosome 16 to pull reads from.
sample_name = "TMM1P4"
target_region = "16:22825595-22825596"

sample_rows = metadata[metadata["filename"] == sample_name]
if sample_rows.empty:
    # Fail with an explicit message instead of an IndexError on .values[0].
    raise ValueError(f"No sample named {sample_name!r} in metadata")
bampath = sample_rows.bam_path.values[0]

# The handle is left open (as before) so `bamfile` stays usable after this cell.
# If htslib warns that the .bai index is older than the BAM, rebuild the index
# (e.g. `samtools index`) before trusting region fetches.
bamfile = pysam.AlignmentFile(bampath, "rb")

# Names to flag, as a set: O(1) membership per read instead of scanning the
# whole column for every read.
names_to_check = set(checkdf[0].values)

all_reads = []
count = 0  # not used in this cell; kept in case other cells reference it
special_reads = []
for read in bamfile.fetch(region = target_region):
    all_reads.append(read)
    # query_name is the read name; avoids serialising the whole read via to_dict().
    if read.query_name in names_to_check:
        special_reads.append(read)


[W::hts_idx_load3] The index file is older than the data file: /media/hieunguyen/HNHD01/raw_data/targetMethyl_analysis/target_methylation_R6914_Vi_no_dedup/05_sorted_bam/TMM1P4.no_deduplicated.sorted.bam.bai


In [50]:
# Print the "ref_pos" and "cigar" fields of every flagged read, converting each
# read to its dict form only once.
for read in special_reads:
    read_fields = read.to_dict()
    print(f"read start: {read_fields['ref_pos']}, cigar = {read_fields['cigar']}")

read start: 22825586, cigar = 11M1D74M
read start: 22825587, cigar = 10M1D74M
read start: 22825587, cigar = 10M2D75M
read start: 22825587, cigar = 10M1D74M
read start: 22825587, cigar = 10M1D74M
read start: 22825587, cigar = 10M1D74M
read start: 22825587, cigar = 10M1D74M
read start: 22825587, cigar = 10M1D74M
read start: 22825587, cigar = 10M1D74M
read start: 22825587, cigar = 10M1D74M
read start: 22825587, cigar = 10M1D74M
read start: 22825587, cigar = 10M1D74M
read start: 22825587, cigar = 10M1D74M
read start: 22825587, cigar = 10M1D74M
read start: 22825587, cigar = 10M2D75M
read start: 22825587, cigar = 10M2D75M
read start: 22825587, cigar = 10M1D74M
read start: 22825587, cigar = 10M1D74M
read start: 22825587, cigar = 10M1D74M
read start: 22825587, cigar = 10M1D74M
read start: 22825587, cigar = 10M1D74M
read start: 22825587, cigar = 10M1D74M
read start: 22825587, cigar = 10M1D74M
read start: 22825587, cigar = 10M1D74M
read start: 22825587, cigar = 10M1D74M
read start: 22825587, cig