In [1]:
from pathlib import Path

In [2]:
from workspace import nometools as nome
from workspace import utils
from workspace import bedtools

In [3]:
main_path = Path('..')
data_path = main_path / 'Data' / 'Preprocessing_LNDR_HNDR'

#### Retriving regions for Promoter, 1st Intron (start and end), 2nd Intron (start)

In [4]:
infile = data_path / 'GRCh38.p14.annotations'
outfile = data_path / 'GRCh38.p14.promoter.bed'
utils.get_promoters_refGene(infile, outfile) # for promoters

no of promoters defined : 106261
promoters saved to ../Data/Preprocessing_LNDR_HNDR/GRCh38.p14.promoter.bed


In [5]:
infile = data_path / 'GRCh38.p14.annotations'
outfile = data_path / 'GRCh38.p14.intron.1.start.bed'
utils.get_introns_refGene(infile, outfile, anchor=1, pos='start') # for introns

no of introns defined : 104611
introns saved to ../Data/Preprocessing_LNDR_HNDR/GRCh38.p14.intron.1.start.bed


In [6]:
infile = data_path / 'GRCh38.p14.annotations'
outfile = data_path / 'GRCh38.p14.intron.1.end.bed'
utils.get_introns_refGene(infile, outfile, anchor=1, pos='end') # for introns

no of introns defined : 104611
introns saved to ../Data/Preprocessing_LNDR_HNDR/GRCh38.p14.intron.1.end.bed


In [7]:
infile = data_path / 'GRCh38.p14.annotations'
outfile = data_path / 'GRCh38.p14.intron.2.start.bed'
utils.get_introns_refGene(infile, outfile, anchor=2, pos='start') # for introns

no of introns defined : 98596
introns saved to ../Data/Preprocessing_LNDR_HNDR/GRCh38.p14.intron.2.start.bed


In [10]:
print(f"num reads for ref file: {nome.get_num_reads(data_path / 'GRCh38.p14.annotations')}")
print(f"num reads promoter: {nome.get_num_reads(data_path / 'GRCh38.p14.promoter.bed')}")
print(f"num reads 1st intron start: {nome.get_num_reads(data_path / 'GRCh38.p14.intron.1.start.bed')}")
print(f"num reads 1st intron end: {nome.get_num_reads(data_path / 'GRCh38.p14.intron.1.end.bed')}")
print(f"num reads 2nd intron start: {nome.get_num_reads(data_path / 'GRCh38.p14.intron.2.start.bed')}")

num reads for ref file: 278221
num reads promoter: 106261
num reads 1st intron start: 104611
num reads 1st intron end: 104611
num reads 2nd intron start: 98596


#### Filter GpC and CpG  sites

Filter out read with less than 5 coverage

In [8]:
infile = data_path / 'cpg.raw.sort.GCH.bed'
outfile = data_path / 'GCH.filtered.baseline.bed'
nome.filter_bed_files(infile, outfile, min_cov=0)

start traversing bed file  ../Data/Preprocessing_LNDR_HNDR/cpg.raw.sort.GCH.bed n =  177722748
progress : 10.0% and time elapsed 0.77 min
progress : 20.0% and time elapsed 1.52 min
progress : 30.0% and time elapsed 2.27 min
progress : 40.0% and time elapsed 3.04 min
progress : 50.0% and time elapsed 3.82 min
progress : 60.0% and time elapsed 4.6 min
progress : 70.0% and time elapsed 5.38 min
progress : 80.0% and time elapsed 6.17 min
progress : 90.0% and time elapsed 6.97 min
progress : 100.0% and time elapsed 7.76 min
filered bed file with min coverage 0 and saved to ../Data/Preprocessing_LNDR_HNDR/GCH.filtered.baseline.bed
Non standard chrs seen :  {'GL383526.1', 'GL383572.1', 'KI270713.1', 'KI270859.1', 'KQ031389.1', 'KI270417.1', 'KI270715.1', 'ML143380.1', 'KN538371.1', 'KI270745.1', 'KI270893.1', 'KN538367.1', 'MU273395.1', 'KI270769.1', 'GL383555.2', 'KI270508.1', 'KV575245.1', 'KI270800.1', 'KI270711.1', 'KQ759761.1', 'KI270872.1', 'KI270583.1', 'KI270710.1', 'KI270877.1', 'KV5

In [4]:
infile = data_path / 'cpg.raw.sort.GCH.bed'
outfile = data_path / 'GCH.filtered.bed'

nome.filter_bed_files(infile, outfile, min_cov=5)

start traversing bed file  ../Data/Preprocessing_LNDR_HNDR/cpg.raw.sort.GCH.bed n =  177722748
progress : 10.0% and time elapsed 0.47 min
progress : 20.0% and time elapsed 1.0 min
progress : 30.0% and time elapsed 1.46 min
progress : 40.0% and time elapsed 1.95 min
progress : 50.0% and time elapsed 2.43 min
progress : 60.0% and time elapsed 2.91 min
progress : 70.0% and time elapsed 3.38 min
progress : 80.0% and time elapsed 3.88 min
progress : 90.0% and time elapsed 4.38 min
progress : 100.0% and time elapsed 4.88 min
filered bed file with min coverage 5 and saved to ../Data/Preprocessing_LNDR_HNDR/GCH.filtered.bed
Non standard chrs seen :  {'GL383526.1', 'GL383572.1', 'KI270713.1', 'KI270859.1', 'KQ031389.1', 'KI270417.1', 'KI270715.1', 'ML143380.1', 'KN538371.1', 'KI270745.1', 'KI270893.1', 'KN538367.1', 'MU273395.1', 'KI270769.1', 'GL383555.2', 'KI270508.1', 'KV575245.1', 'KI270800.1', 'KI270711.1', 'KQ759761.1', 'KI270872.1', 'KI270583.1', 'KI270710.1', 'KI270877.1', 'KV575244.1',

In [5]:
infile = data_path / 'cpg.raw.sort.GCH.bed'
outfile = data_path / 'GCH.filtered.3.bed'

nome.filter_bed_files(infile, outfile, min_cov=3)

start traversing bed file  ../Data/Preprocessing_LNDR_HNDR/cpg.raw.sort.GCH.bed n =  177722748
progress : 10.0% and time elapsed 0.63 min
progress : 20.0% and time elapsed 1.31 min
progress : 30.0% and time elapsed 1.94 min
progress : 40.0% and time elapsed 2.59 min
progress : 50.0% and time elapsed 3.23 min
progress : 60.0% and time elapsed 3.88 min
progress : 70.0% and time elapsed 4.52 min
progress : 80.0% and time elapsed 5.17 min
progress : 90.0% and time elapsed 5.82 min
progress : 100.0% and time elapsed 6.45 min
filered bed file with min coverage 3 and saved to ../Data/Preprocessing_LNDR_HNDR/GCH.filtered.3.bed
Non standard chrs seen :  {'MU273365.1', 'ML143378.1', 'GL339449.2', 'KI270315.1', 'KI270417.1', 'KI270724.1', 'KQ983258.1', 'KI270908.1', 'KV880763.1', 'ML143356.1', 'KI270745.1', 'KI270515.1', 'KN196475.1', 'KI270799.1', 'GL949750.2', 'KI270786.1', 'GL383576.1', 'MU273355.1', 'ML143359.1', 'KI270938.1', 'ML143366.1', 'KI270746.1', 'KQ090017.1', 'GL000255.2', 'KI270414.

In [9]:
infile = data_path / 'cpg.raw.sort.HCG.bed'
outfile = data_path / 'HCG.filtered.baseline.bed'

nome.filter_bed_files(infile, outfile, min_cov=0)

start traversing bed file  ../Data/Preprocessing_LNDR_HNDR/cpg.raw.sort.HCG.bed n =  27888745
progress : 10.0% and time elapsed 0.12 min
progress : 20.0% and time elapsed 0.24 min
progress : 30.0% and time elapsed 0.36 min
progress : 40.0% and time elapsed 0.48 min
progress : 50.0% and time elapsed 0.6 min
progress : 60.0% and time elapsed 0.72 min
progress : 70.0% and time elapsed 0.85 min
progress : 80.0% and time elapsed 0.97 min
progress : 90.0% and time elapsed 1.1 min
progress : 100.0% and time elapsed 1.22 min
filered bed file with min coverage 0 and saved to ../Data/Preprocessing_LNDR_HNDR/HCG.filtered.baseline.bed
Non standard chrs seen :  {'GL383526.1', 'GL383572.1', 'KI270713.1', 'KI270859.1', 'KQ031389.1', 'KI270417.1', 'KI270715.1', 'ML143380.1', 'KN538371.1', 'KI270745.1', 'KI270893.1', 'KN538367.1', 'MU273395.1', 'KI270769.1', 'GL383555.2', 'KI270508.1', 'KV575245.1', 'KI270800.1', 'KI270711.1', 'KQ759761.1', 'KI270872.1', 'KI270583.1', 'KI270710.1', 'KI270877.1', 'KV575

In [5]:
infile = data_path / 'cpg.raw.sort.HCG.bed'
outfile = data_path / 'HCG.filtered.bed'

nome.filter_bed_files(infile, outfile, min_cov=5)

start traversing bed file  ../Data/Preprocessing_LNDR_HNDR/cpg.raw.sort.HCG.bed n =  27888745
progress : 10.0% and time elapsed 0.07 min
progress : 20.0% and time elapsed 0.15 min
progress : 30.0% and time elapsed 0.22 min
progress : 40.0% and time elapsed 0.29 min
progress : 50.0% and time elapsed 0.36 min
progress : 60.0% and time elapsed 0.44 min
progress : 70.0% and time elapsed 0.51 min
progress : 80.0% and time elapsed 0.59 min
progress : 90.0% and time elapsed 0.66 min
progress : 100.0% and time elapsed 0.74 min
filered bed file with min coverage 5 and saved to ../Data/Preprocessing_LNDR_HNDR/HCG.filtered.bed
Non standard chrs seen :  {'GL383526.1', 'GL383572.1', 'KI270713.1', 'KI270859.1', 'KQ031389.1', 'KI270417.1', 'KI270715.1', 'ML143380.1', 'KN538371.1', 'KI270745.1', 'KI270893.1', 'KN538367.1', 'MU273395.1', 'KI270769.1', 'GL383555.2', 'KI270508.1', 'KV575245.1', 'KI270800.1', 'KI270711.1', 'KQ759761.1', 'KI270872.1', 'KI270583.1', 'KI270710.1', 'KI270877.1', 'KV575244.1',

In [13]:
infile = data_path / 'cpg.raw.sort.HCG.bed'
outfile = data_path / 'HCG.filtered.3.bed'

nome.filter_bed_files(infile, outfile, min_cov=3)

start traversing bed file  ../Data/Preprocessing_LNDR_HNDR/cpg.raw.sort.HCG.bed n =  27888745
progress : 10.0% and time elapsed 0.09 min
progress : 20.0% and time elapsed 0.19 min
progress : 30.0% and time elapsed 0.28 min
progress : 40.0% and time elapsed 0.37 min
progress : 50.0% and time elapsed 0.47 min
progress : 60.0% and time elapsed 0.56 min
progress : 70.0% and time elapsed 0.65 min
progress : 80.0% and time elapsed 0.75 min
progress : 90.0% and time elapsed 0.84 min
progress : 100.0% and time elapsed 0.93 min
filered bed file with min coverage 3 and saved to ../Data/Preprocessing_LNDR_HNDR/HCG.filtered.3.bed
Non standard chrs seen :  {'GL383526.1', 'GL383572.1', 'KI270713.1', 'KI270859.1', 'KQ031389.1', 'KI270417.1', 'KI270715.1', 'ML143380.1', 'KN538371.1', 'KI270745.1', 'KI270893.1', 'KN538367.1', 'MU273395.1', 'KI270769.1', 'GL383555.2', 'KI270508.1', 'KV575245.1', 'KI270800.1', 'KI270711.1', 'KQ759761.1', 'KI270872.1', 'KI270583.1', 'KI270710.1', 'KI270877.1', 'KV575244.1

In [4]:
print(f"num reads GCH: {nome.get_num_reads(data_path / 'cpg.raw.sort.GCH.bed')}")
print(f"num reads GCH baseline: {nome.get_num_reads(data_path / 'GCH.filtered.baseline.bed')}")
print(f"num reads GCH filtered (3): {nome.get_num_reads(data_path / 'GCH.filtered.bed')}")
print(f"num reads GCH filtered (5): {nome.get_num_reads(data_path / 'GCH.filtered.5.bed')}")
print(f"num reads HCG: {nome.get_num_reads(data_path / 'cpg.raw.sort.HCG.bed')}")
print(f"num reads HCG baseline: {nome.get_num_reads(data_path / 'HCG.filtered.baseline.bed')}")
print(f"num reads HCG filtered (3): {nome.get_num_reads(data_path / 'HCG.filtered.bed')}")
print(f"num reads HCG filtered (5): {nome.get_num_reads(data_path / 'HCG.filtered.5.bed')}")

num reads GCH: 177722748
num reads GCH baseline: 176954899
num reads GCH filtered (3): 95269428
num reads GCH filtered (5): 37061169
num reads HCG: 27888745
num reads HCG baseline: 27712165
num reads HCG filtered (3): 13684294
num reads HCG filtered (5): 4970020


In [5]:
print(f'percent removed GCH : {1 - 95269428/176954899}')
print(f'percent removed HCG : {1 - 13684294/27712165}')

percent removed GCH : 0.4616174599381959
percent removed HCG : 0.5061990284772049


#### Sorting Ref and NOMe files

In [4]:
infile = data_path / 'GRCh38.p14.promoter.bed'
bedtools.sort_bed(infile)

In [5]:
infile = data_path / 'GRCh38.p14.intron.1.start.bed'
bedtools.sort_bed(infile)

In [6]:
infile = data_path / 'GRCh38.p14.intron.1.end.bed'
bedtools.sort_bed(infile)

In [7]:
infile = data_path / 'GRCh38.p14.intron.2.start.bed'
bedtools.sort_bed(infile)

In [6]:
infile = data_path / 'GCH.filtered.bed'
bedtools.sort_bed(infile)

In [4]:
infile = data_path / 'HCG.filtered.bed'
bedtools.sort_bed(infile)

#### LNDR and HNDR Detection

In [6]:
infile = data_path / 'GCH.filtered.sorted.bed'
nome.findNDRs(infile=infile)

Run : chr1
no of reads for chr1 : 7419512
chr1 res check - 7419512 ['chr1', 16209, 16210, '+', 0.0, 3] ['chr1', 248945307, 248945308, '+', 33.33, 3]
Finding NDR windows for chr1 with window len 200
Run : chr2
no of reads for chr2 : 11479992
chr2 res check - 11479992 ['chr2', 10664, 10665, '-', 0.0, 3] ['chr2', 242181926, 242181927, '-', 0.0, 3]
Finding NDR windows for chr2 with window len 200
Run : chr3
no of reads for chr3 : 6312270
chr3 res check - 6312270 ['chr3', 12033, 12034, '+', 0.0, 3] ['chr3', 198140418, 198140419, '-', 0.0, 3]
Finding NDR windows for chr3 with window len 200
Run : chr4
no of reads for chr4 : 5524172
chr4 res check - 5524172 ['chr4', 11850, 11851, '+', 33.33, 3] ['chr4', 190122435, 190122436, '-', 0.0, 3]
Finding NDR windows for chr4 with window len 200
Run : chr5
progress for chr1 : 5.0% and time elapsed 3.68 min
no of reads for chr5 : 5656142
chr5 res check - 5656142 ['chr5', 13382, 13383, '-', 40.0, 5] ['chr5', 181423774, 181423775, '+', 33.33, 3]
Finding N

In [7]:
infile = data_path / 'NDRp.bed'
outfile = data_path / 'NDR.bed'
with open(outfile, 'w') as fout:
    with open(infile) as fin:
        for line in fin:
            fout.write('\t'.join(line.strip().split()[:-1]) + '\n')

In [8]:
infile = data_path / 'NDR.bed'
genome_sizes = data_path / 'hg38.p14.chrom.sizes'
outfile = data_path / 'NOR.bed'
bedtools.complement_bed(infile=infile, 
                        genome_sizes=genome_sizes, 
                        outfile=outfile)

In [4]:
print(f"NDR (3): {nome.get_num_reads(data_path / 'NDR.bed')}")
print(f"NDR (5): {nome.get_num_reads(data_path / 'NDR.5.bed')}")
print(f"NOR (3): {nome.get_num_reads(data_path / 'NOR.bed')}")
print(f"NOR (5): {nome.get_num_reads(data_path / 'NOR.5.bed')}")

NDR (3): 86844
NDR (5): 39226
NOR (3): 86868
NOR (5): 39250
