In [1]:
from pathlib import Path
import pandas as pd

In [2]:
from workspace import nometools as nome
from workspace import utils
from workspace import bedtools

In [3]:
# Directory layout, relative to the notebook's working directory:
#   preprocessing_path -> ../Data/Preprocessing_LNDR_HNDR  (inputs read below)
#   intersect_path     -> ../Data/intersect_regions        (outputs written below)
main_path = Path('..')
preprocessing_path = main_path.joinpath('Data', 'Preprocessing_LNDR_HNDR')
intersect_path = main_path.joinpath('Data', 'intersect_regions')

#### Intersect/map GpC and CpG to defined regions

In [13]:
# Region label used to build the file names in this section
# (GRCh38.p14.{region}.sorted.bed in, GCH/HCG.{region}.intersect.bed out).
# Exactly one assignment should be active; the others are the alternatives.
# NOTE(review): the later sections of this notebook select 'intron.2.start',
# so the HCG.{region}.intersect.bed they read is only produced here when this
# cell is run with that same value -- confirm before a Restart & Run All.
region = 'promoter'
# region = 'intron.1.start'
# region = 'intron.1.end'
# region = 'intron.2.start'

In [14]:
# GCH sites vs. the selected region annotation.
# Inputs come from the preprocessing directory; the result is written to
# GCH.{region}.intersect.bed in the intersect directory.
gch_sorted = preprocessing_path.joinpath('GCH.filtered.sorted.bed')
region_sorted = preprocessing_path.joinpath(f'GRCh38.p14.{region}.sorted.bed')
outfile = intersect_path.joinpath(f'GCH.{region}.intersect.bed')

# Argument order as used throughout this notebook: region file, site file, output.
bedtools.intersect_bed(region_sorted, gch_sorted, outfile)

In [15]:
# HCG sites vs. the selected region annotation.
# Inputs come from the preprocessing directory; the result is written to
# HCG.{region}.intersect.bed, which the randomization section reads back.
hcg_sorted = preprocessing_path.joinpath('HCG.filtered.sorted.bed')
region_sorted = preprocessing_path.joinpath(f'GRCh38.p14.{region}.sorted.bed')
outfile = intersect_path.joinpath(f'HCG.{region}.intersect.bed')

# Argument order as used throughout this notebook: region file, site file, output.
bedtools.intersect_bed(region_sorted, hcg_sorted, outfile)

#### Randomize HCG Methylation

In [22]:
# Region label for the randomization step; exactly one assignment is active.
# NOTE(review): the next cell reads HCG.{region}.intersect.bed, which is
# written by the GpC/CpG intersect section above. That section currently
# selects 'promoter', so the file for this region must already exist on disk
# from an earlier run -- confirm.
# region = 'promoter'
# region = 'intron.1.start'
# region = 'intron.1.end'
region = 'intron.2.start'

In [23]:
# Build a shuffled copy of the HCG intersect table: every column is kept as
# read, only the 'meth_rate' values are permuted across rows. The result is
# written next to the input as HCG.{region}.intersect.random.bed
# (presumably as a randomized control for the NDR/NOR mapping further down).

# Fixed seed so the shuffled file is identical on every run of the notebook.
RANDOM_SEED = 42

infile = intersect_path / f'HCG.{region}.intersect.bed'
# Column labels assigned to the header-less, tab-separated intersect output.
cols = ['chrom', 'region_start', 'region_end', 'refid', 
        'TSS', 'TES', 'strand', 'chrom_', 'meth_start_genome', 'meth_end_genome', 
        'strand_', 'meth_rate', 'coverage', 'nt']
df_HCG_intersect_random = pd.read_csv(infile, sep='\t', names=cols)

# sample(frac=1) returns all values in random order but keeps their original
# index labels; reset_index(drop=True) discards those labels so that the
# assignment below does not re-align the values back into their old rows.
df_HCG_intersect_random['meth_rate'] = (
    df_HCG_intersect_random['meth_rate']
    .sample(frac=1, random_state=RANDOM_SEED)
    .reset_index(drop=True)
)

outfile = intersect_path / f'HCG.{region}.intersect.random.bed'
# Written without header or index so the layout matches the input file.
df_HCG_intersect_random.to_csv(outfile, sep='\t', header=False, index=False)

#### Intersect/map NDR and NOR to defined regions

In [33]:
# Region label for the NDR/NOR intersect step; exactly one assignment is active.
# It selects GRCh38.p14.{region}.sorted.bed as input and names the
# NDR.{region}.intersect.bed / NOR.{region}.intersect.bed outputs.
# region = 'promoter'
# region = 'intron.1.start'
# region = 'intron.1.end'
region = 'intron.2.start'

In [34]:
# NDR intervals vs. the selected region annotation.
# The result is written to NDR.{region}.intersect.bed in the intersect directory.
ndrfile = preprocessing_path.joinpath('NDR.bed')
region_sorted = preprocessing_path.joinpath(f'GRCh38.p14.{region}.sorted.bed')
outfile = intersect_path.joinpath(f'NDR.{region}.intersect.bed')

# Argument order as used throughout this notebook: region file, interval file, output.
bedtools.intersect_bed(region_sorted, ndrfile, outfile)

In [35]:
# NOR intervals vs. the selected region annotation.
# The result is written to NOR.{region}.intersect.bed in the intersect directory.
norfile = preprocessing_path.joinpath('NOR.bed')
region_sorted = preprocessing_path.joinpath(f'GRCh38.p14.{region}.sorted.bed')
outfile = intersect_path.joinpath(f'NOR.{region}.intersect.bed')

# Argument order as used throughout this notebook: region file, interval file, output.
bedtools.intersect_bed(region_sorted, norfile, outfile)

#### Mapping CpGs to NDRs and NORs of specified regions

In [56]:
# Region label for the final mapping step; exactly one assignment is active.
# The cells below read NDR/NOR.{region}.intersect.bed and
# HCG.{region}.intersect(.random).bed, so those files must already have been
# produced for this same region by the earlier sections.
# region = 'promoter'
# region = 'intron.1.start'
# region = 'intron.1.end'
region = 'intron.2.start'

In [57]:
# Region-restricted NDR intervals vs. region-restricted HCG sites (observed
# methylation rates). Output: {region}.NDR.HCG.intersect.bed
HCG_file = intersect_path.joinpath(f'HCG.{region}.intersect.bed')
NDR_file = intersect_path.joinpath(f'NDR.{region}.intersect.bed')
outfile = intersect_path.joinpath(f'{region}.NDR.HCG.intersect.bed')
bedtools.intersect_bed(NDR_file, HCG_file, outfile)

In [58]:
# Region-restricted NDR intervals vs. the HCG table whose meth_rate column was
# shuffled in the randomization section. Output: {region}.NDR.HCG.random.intersect.bed
HCG_file = intersect_path.joinpath(f'HCG.{region}.intersect.random.bed')
NDR_file = intersect_path.joinpath(f'NDR.{region}.intersect.bed')
outfile = intersect_path.joinpath(f'{region}.NDR.HCG.random.intersect.bed')
bedtools.intersect_bed(NDR_file, HCG_file, outfile)

In [None]:
# Region-restricted NOR intervals vs. region-restricted HCG sites (observed
# methylation rates). Output: {region}.NOR.HCG.intersect.bed
# The NOR path gets its own variable so it does not overwrite NDR_file,
# which would otherwise silently point at a NOR file afterwards.
NOR_file = intersect_path / f'NOR.{region}.intersect.bed'
HCG_file = intersect_path / f'HCG.{region}.intersect.bed'
outfile = intersect_path / f'{region}.NOR.HCG.intersect.bed'
bedtools.intersect_bed(NOR_file, HCG_file, outfile)

In [None]:
# Region-restricted NOR intervals vs. the HCG table whose meth_rate column was
# shuffled in the randomization section. Output: {region}.NOR.HCG.random.intersect.bed
# The NOR path gets its own variable so it does not overwrite NDR_file,
# which would otherwise silently point at a NOR file afterwards.
NOR_file = intersect_path / f'NOR.{region}.intersect.bed'
HCG_file = intersect_path / f'HCG.{region}.intersect.random.bed'
outfile = intersect_path / f'{region}.NOR.HCG.random.intersect.bed'
bedtools.intersect_bed(NOR_file, HCG_file, outfile)