In [1]:
import gc
import os
import pandas as pd
import pathlib
from tqdm import tqdm
import re
import numpy as np
import pyfaidx
gc.collect()

# Clustering parameters read by the sub-region cell further down:
#   radius                - largest allowed distance between a cluster's first CpG and any member
#   min_num_Cpg_in_region - clusters with fewer CpGs than this are discarded
radius = 200
min_num_Cpg_in_region = 3

# Input locations.
path_to_project_src = "/media/hieunguyen/HNSD01/src/tmd_features"
path_to_all_fa = "/media/hieunguyen/HNSD01/resources/hg38"

# Headerless, tab-separated panel file; columns 0, 1 and 2 are used below as
# chromosome, start and end.
bedfile = pd.read_csv(
    os.path.join(path_to_project_src, "panel.hg19_liftover_to_hg38.bed"),
    sep="\t",
    header=None,
)

bedfile["len"] = bedfile[2] - bedfile[1]
# Region identifier of the form "<chrom>_<start>_<end>".
bedfile["region_name"] = [
    f"{chrom}_{start}_{end}"
    for chrom, start, end in zip(bedfile[0], bedfile[1], bedfile[2])
]
# Longest regions first.
bedfile = bedfile.sort_values(by="len", ascending=False)

all_fa_files = list(pathlib.Path(path_to_all_fa).glob("*.fa"))

tmd_regions = bedfile["region_name"].unique()


def get_refseq(path_to_all_fa, chrom, start, end):
    """Return the upper-cased reference sequence of ``chrom`` between ``start`` and ``end``.

    Parameters
    ----------
    path_to_all_fa : str
        Directory containing one FASTA file per chromosome, named ``<chrom>.fa``.
    chrom : str
        Chromosome name; used for the FASTA file name and as the record name
        looked up inside that file.
    start, end : int
        Coordinates forwarded unchanged to ``pyfaidx.Fasta.get_seq``.
        NOTE(review): pyfaidx documents ``get_seq`` as 1-based, end-inclusive;
        confirm callers passing BED coordinates account for that.

    Returns
    -------
    str
        The fetched bases converted to upper case.
    """
    fasta_path = os.path.join(path_to_all_fa, "{}.fa".format(chrom))
    # Use the FASTA as a context manager so its file handle is released on
    # return; previously every call left an open handle behind.
    with pyfaidx.Fasta(fasta_path) as fasta:
        record = fasta.get_seq(name = "{}".format(chrom), start = start, end = end)
        return str.upper(record.seq)

def find_CpG(ref_seq):
    """Return the 0-based string offsets of every CpG dinucleotide in ``ref_seq``.

    All four upper/lower-case spellings of "CG" are matched. Offsets point at
    the C of each hit. The result is grouped by spelling, in the order
    "CG", "cg", "cG", "Cg", so it is only globally ascending when the input
    uses a single spelling; callers that need sorted offsets must sort.
    """
    offsets = []
    for dinucleotide in ("CG", "cg", "cG", "Cg"):
        offsets.extend(hit.start(0) for hit in re.finditer(dinucleotide, ref_seq))
    return offsets

# Build one long table with a row per CpG found in each panel region.
cpg_frames = []
for region in tqdm(tmd_regions):
    # Region names are "<chrom>_<start>_<end>". Split from the right so that
    # chromosome/contig names which themselves contain "_" still parse.
    region_chrom, start_field, end_field = region.rsplit("_", 2)
    region_start = int(start_field)
    region_end = int(end_field)
    refseq_at_cluster = get_refseq(path_to_all_fa = path_to_all_fa,
                                   chrom = region_chrom,
                                   start = region_start,
                                   end = region_end + 1)
    all_cpg_in_cluster = sorted(find_CpG(refseq_at_cluster))
    # The fetched string starts at `region_start`, so string offset i maps to
    # coordinate region_start + i.
    cpg_coords = [item + region_start for item in all_cpg_in_cluster]
    tmpdf = pd.DataFrame(data = cpg_coords, columns = ["pos"])
    tmpdf["chrom"] = region_chrom
    tmpdf["region"] = region
    cpg_frames.append(tmpdf)

# Concatenate once at the end instead of growing `df` inside the loop (which
# re-copies all earlier rows on every iteration). Each region keeps its own
# 0..n-1 index, exactly as the incremental concat produced.
df = pd.concat(cpg_frames, axis = 0) if cpg_frames else pd.DataFrame()

100%|██████████| 450/450 [00:00<00:00, 813.37it/s]


In [2]:
# Write the CpG table to the current working directory. The DataFrame index is
# written too, as the first (unnamed) CSV column.
df.to_csv("all_cpg_in_450_tmd_regions.csv")

In [3]:
# Display the per-CpG table (columns: pos, chrom, region).
df

Unnamed: 0,pos,chrom,region
0,144050604,chr8,chr8_144050603_144051516
1,144050633,chr8,chr8_144050603_144051516
2,144050635,chr8,chr8_144050603_144051516
3,144050656,chr8,chr8_144050603_144051516
4,144050659,chr8,chr8_144050603_144051516
...,...,...,...
0,93056041,chr14,chr14_93056040_93056042
0,1040870,chr2,chr2_1040869_1040871
0,171458455,chr3,chr3_171458454_171458456
0,38671668,chr4,chr4_38671667_38671669


In [None]:


# Group the CpGs of every region into anchor-based clusters and annotate each
# CpG with the name of the cluster ("sub-region") it falls in.
#
# Clustering rule: walk the sorted CpG positions; a CpG joins the current
# cluster while it lies within `radius` of that cluster's first CpG (the
# anchor), otherwise it starts a new cluster and becomes the new anchor.
# Clusters with fewer than `min_num_Cpg_in_region` CpGs are dropped, and a
# region with no surviving cluster is left out of the result entirely.
merged_frames = []
# One progress bar over regions (instead of one bar per region).
for region in tqdm(df.region.unique()):
    tmpdf = df[df["region"] == region]
    # Region names are "<chrom>_<start>_<end>". Split from the right so that
    # chromosome/contig names which themselves contain "_" still parse.
    region_chrom = region.rsplit("_", 2)[0]

    # `df` already stores the CpG coordinates of this region, so reuse them
    # rather than re-reading and re-scanning the reference sequence. Every
    # region taken from `df` has at least one row, so `all_pos` is non-empty.
    all_pos = sorted(tmpdf["pos"].tolist())

    anchor = all_pos[0]
    cpg_clusters = [[anchor]]
    for item in all_pos[1:]:
        if (item - anchor) <= radius:
            cpg_clusters[-1].append(item)
        else:
            anchor = item
            cpg_clusters.append([anchor])

    kept_clusters = [cluster for cluster in cpg_clusters
                     if len(cluster) >= min_num_Cpg_in_region]
    if not kept_clusters:
        continue

    sub_pos = []
    sub_names = []
    for cluster in kept_clusters:
        # The sub-region name extends one base beyond the outermost CpG
        # positions on each side.
        start = min(cluster) - 1
        end = max(cluster) + 1
        region_name = "{}:{}-{}".format(region_chrom, start, end)
        sub_pos.extend(cluster)
        sub_names.extend([region_name] * len(cluster))
    sub_regiondf = pd.DataFrame({"pos": sub_pos, "sub_region_name": sub_names})

    # Outer merge keeps every CpG of the region; CpGs outside all kept
    # clusters get NaN in `sub_region_name`.
    merged_frames.append(tmpdf.merge(sub_regiondf, on = "pos", how = "outer"))

# Concatenate once at the end instead of growing the frame inside the loop.
final_subregiondf = pd.concat(merged_frames, axis = 0) if merged_frames else pd.DataFrame()
    

In [None]:
# Number of distinct regions present in the merged table.
final_subregiondf.region.unique().size

In [None]:
# Display the merged table; `sub_region_name` is NaN for CpGs that are not part
# of any retained cluster.
final_subregiondf