In [1]:
import gc
import os
import pandas as pd
import pathlib
from tqdm import tqdm
import re
import numpy as np
import pyfaidx
gc.collect()

# --- Clustering parameters -------------------------------------------------
# Maximum distance allowed between the first CpG of a cluster (the anchor)
# and any other CpG assigned to that cluster.
radius = 200
# Clusters with fewer CpGs than this are discarded.
min_num_Cpg_in_region = 3

# --- Input locations -------------------------------------------------------
path_to_project_src = "/media/hieunguyen/HNSD01/src/tmd_features"
path_to_all_fa = "/media/hieunguyen/HNSD01/resources/hg19"
# NOTE(review): the BED file name says it was lifted over to hg38 while the
# FASTA directory is named hg19 -- confirm both use the same genome build.

# Headerless, tab-separated BED: column 0 = chrom, 1 = start, 2 = end.
bedfile = pd.read_csv(
    os.path.join(path_to_project_src, "panel.hg19_liftover_to_hg38.bed"),
    sep="\t",
    header=None,
)

bedfile["len"] = bedfile[2] - bedfile[1]
# Vectorised "<chrom>_<start>_<end>" identifier; avoids a Python-level
# row-by-row apply(axis=1) over the whole frame.
bedfile["region_name"] = (
    bedfile[0].astype(str) + "_" + bedfile[1].astype(str) + "_" + bedfile[2].astype(str)
)
# Longest regions first.
bedfile = bedfile.sort_values(by="len", ascending=False)

# Per-chromosome FASTA files found in the reference directory.
all_fa_files = list(pathlib.Path(path_to_all_fa).glob("*.fa"))

# Unique region identifiers, in length-sorted order.
tmd_regions = bedfile["region_name"].unique()


def get_refseq(path_to_all_fa, chrom, start, end):
    """Return the upper-cased reference sequence of an interval on ``chrom``.

    The sequence is read from ``<path_to_all_fa>/<chrom>.fa`` via pyfaidx.

    Parameters
    ----------
    path_to_all_fa : str
        Directory holding one ``<chrom>.fa`` file per chromosome.
    chrom : str
        Record name inside the FASTA; also used to build the file name.
    start, end : int
        Interval bounds, passed unchanged to ``pyfaidx.Fasta.get_seq``.
        NOTE(review): pyfaidx documents these as 1-based and end-inclusive --
        confirm against the installed version and the callers' coordinates.

    Returns
    -------
    str
        The fetched sequence converted to upper case.
    """
    fasta_path = os.path.join(path_to_all_fa, "{}.fa".format(chrom))
    # Open the FASTA in a context manager so its file handle is released after
    # every call; previously each call left an open handle behind.
    with pyfaidx.Fasta(fasta_path) as refseq:
        return refseq.get_seq(name=str(chrom), start=start, end=end).seq.upper()

def find_CpG(ref_seq):
    """Return the 0-based offsets of every CpG dinucleotide in ``ref_seq``.

    Matching is case-insensitive, so all four spellings (``CG``, ``cg``,
    ``cG`` and ``Cg``) are reported; the previous implementation missed ``Cg``.

    Parameters
    ----------
    ref_seq : str
        Nucleotide sequence to scan.

    Returns
    -------
    list of int
        Offset of the ``C`` of each CpG, in ascending order. Empty when the
        sequence contains no CpG.
    """
    # A single case-insensitive scan replaces three case-specific scans and
    # yields the positions already sorted.
    return [match.start() for match in re.finditer("CG", ref_seq, flags=re.IGNORECASE)]

In [2]:
# Build `df`: one row per CpG found in each region, with columns
# pos (region start + offset of the CpG in the fetched sequence), chrom, region.
# Per-region frames are gathered in a list and concatenated once, instead of
# re-copying the growing frame on every iteration.
cpg_frames = []
for region in tqdm(tmd_regions):
    # Region names are "<chrom>_<start>_<end>"; split from the right so that
    # chromosome/contig names containing "_" are kept intact.
    region_chrom, region_start, region_end = region.rsplit("_", 2)
    region_start = int(region_start)
    region_end = int(region_end)
    refseq_at_cluster = get_refseq(path_to_all_fa = path_to_all_fa,
                                   chrom = region_chrom,
                                   start = region_start,
                                   end = region_end + 1)
    all_cpg_in_cluster = sorted(find_CpG(refseq_at_cluster))
    if not all_cpg_in_cluster:
        # A region without CpGs contributes no rows; skipping it also keeps
        # empty frames out of the concatenation.
        continue
    cpg_coords = [item + region_start for item in all_cpg_in_cluster]
    tmpdf = pd.DataFrame(data = cpg_coords, columns = ["pos"])
    tmpdf["chrom"] = region_chrom
    tmpdf["region"] = region
    cpg_frames.append(tmpdf)

if cpg_frames:
    df = pd.concat(cpg_frames, axis = 0)
else:
    # Keep the expected columns so downstream code can still access them.
    df = pd.DataFrame(columns = ["pos", "chrom", "region"])

def _cluster_cpg_positions(sorted_positions, max_span, min_size):
    """Greedily group ascending positions into anchor-based clusters.

    The first position opens a cluster and becomes its anchor. Each following
    position joins the current cluster while ``position - anchor <= max_span``;
    otherwise it opens a new cluster and becomes the new anchor. Only clusters
    holding at least ``min_size`` positions are returned, in order.
    """
    clusters = []
    current = []
    for position in sorted_positions:
        if current and (position - current[0]) <= max_span:
            current.append(position)
        else:
            current = [position]
            clusters.append(current)
    return [cluster for cluster in clusters if len(cluster) >= min_size]


# Build `final_subregiondf`: for every region that has at least one qualifying
# CpG cluster, all CpG rows of the region plus a `sub_region_name` column
# (left empty for CpGs that do not belong to a kept cluster).
# Regions without any qualifying cluster are left out entirely.
region_frames = []
# sort=False keeps regions in order of first appearance, like df.region.unique().
for region, tmpdf in tqdm(df.groupby("region", sort = False)):
    # Split from the right so contig names containing "_" stay intact.
    region_chrom = region.rsplit("_", 2)[0]

    # `pos` already holds the CpG coordinates of this region, so the reference
    # sequence does not have to be fetched and scanned a second time.
    all_pos = sorted(int(pos) for pos in tmpdf["pos"])
    all_cpg_cluster = _cluster_cpg_positions(all_pos, radius, min_num_Cpg_in_region)
    if not all_cpg_cluster:
        continue

    sub_region_frames = []
    for cluster in all_cpg_cluster:
        # Each cluster is sorted, so its first/last items are the min/max.
        # The sub-region spans one base before the first CpG position to one
        # base after the last one.
        start = cluster[0] - 1
        end = cluster[-1] + 1
        region_name = "{}:{}-{}".format(region_chrom, start, end)
        tmp_subregiondf = pd.DataFrame(data = cluster, columns = ["pos"])
        tmp_subregiondf["sub_region_name"] = region_name
        sub_region_frames.append(tmp_subregiondf)
    sub_regiondf = pd.concat(sub_region_frames, axis = 0)

    region_frames.append(tmpdf.merge(sub_regiondf, on = "pos", how = "outer"))

if region_frames:
    final_subregiondf = pd.concat(region_frames, axis = 0)
else:
    # Keep the expected columns so later cells can still access them.
    final_subregiondf = pd.DataFrame(columns = ["pos", "chrom", "region", "sub_region_name"])
    

100%|██████████| 450/450 [00:00<00:00, 747.03it/s]
100%|██████████| 10/10 [00:00<00:00, 199728.76it/s]
100%|██████████| 10/10 [00:00<00:00, 184771.10it/s]
100%|██████████| 85/85 [00:00<00:00, 748982.86it/s]
100%|██████████| 2/2 [00:00<00:00, 42366.71it/s]
100%|██████████| 4/4 [00:00<00:00, 62368.83it/s]
100%|██████████| 3/3 [00:00<00:00, 59918.63it/s]
100%|██████████| 12/12 [00:00<00:00, 221725.32it/s]
100%|██████████| 2/2 [00:00<00:00, 40524.68it/s]
100%|██████████| 6/6 [00:00<00:00, 118149.41it/s]
100%|██████████| 2/2 [00:00<00:00, 40136.88it/s]
100%|██████████| 2/2 [00:00<00:00, 31655.12it/s]
100%|██████████| 2/2 [00:00<00:00, 32388.45it/s]
100%|██████████| 1/1 [00:00<00:00, 20661.60it/s]
100%|██████████| 2/2 [00:00<00:00, 35544.95it/s]
100%|██████████| 13/13 [00:00<00:00, 181753.17it/s]
100%|██████████| 52/52 [00:00<00:00, 635871.16it/s]
0it [00:00, ?it/s]
100%|██████████| 5/5 [00:00<00:00, 78545.02it/s]
100%|██████████| 7/7 [00:00<00:00, 131659.77it/s]
100%|██████████| 45/45 [00:0

In [3]:
# Number of distinct regions present in final_subregiondf.
final_subregiondf["region"].nunique(dropna=False)

178

In [5]:
# Show the per-CpG table (columns: pos, chrom, region, sub_region_name).
final_subregiondf

Unnamed: 0,pos,chrom,region,sub_region_name
0,144050629,chr8,chr8_144050603_144051516,chr8:144050628-144050780
1,144050697,chr8,chr8_144050603_144051516,chr8:144050628-144050780
2,144050719,chr8,chr8_144050603_144051516,chr8:144050628-144050780
3,144050779,chr8,chr8_144050603_144051516,chr8:144050628-144050780
4,144051192,chr8,chr8_144050603_144051516,chr8:144051191-144051320
...,...,...,...,...
10,109792248,chr1,chr1_109792189_109792254,chr1:109792195-109792254
11,109792253,chr1,chr1_109792189_109792254,chr1:109792195-109792254
0,109792369,chr1,chr1_109792367_109792397,chr1:109792368-109792384
1,109792376,chr1,chr1_109792367_109792397,chr1:109792368-109792384
