<a href="https://colab.research.google.com/github/irinaachikhmina/Triplexes/blob/main/1_04_Data_preprocessing_cCREs%2Cchromatine_states.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data mining and preprocessing: preprocessing of regulatory data

## Imports

In [None]:
# Mount Google Drive: every data path used below lives under /content/drive/MyDrive/Triplexes.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install -q condacolab
import condacolab
condacolab.install()
!conda install -c bioconda bedtools

⏬ Downloading https://github.com/conda-forge/miniforge/releases/download/23.1.0-1/Mambaforge-23.1.0-1-Linux-x86_64.sh...
📦 Installing...
📌 Adjusting configuration...
🩹 Patching environment...
⏲ Done in 0:00:12
🔁 Restarting kernel...


In [None]:
!pip install pybedtools

In [None]:
from pybedtools import BedTool

In [None]:
import pandas as pd
#import pyranges as pr

## Chromatin states and cCREs

Data from ENCODE (hg38 assembly), current version as of 27.02.2023. <br>
Cell types:
* B cells
* NAMALWA
* GM06990
* GM08714
* GM10248
* GM10266
* GM12801 
* GM12864
* GM12865
* GM12872 
* GM12873 
* GM12874 
* GM12875
* GM12878
* GM12891
* GM12892
* GM13976 
* GM13977 
* GM19238
* GM19239
* GM19240
* GM20000 
* MM.1S
* RPMI8226
* activated B cell
* IgD-negative memory B cell

Cell-agnostic cCREs:

In [None]:
!zcat /content/drive/MyDrive/Triplexes/data/ccres_agnostic/ENCFF924IMH.bed.gz | cut -f 1-3,10 > /content/drive/MyDrive/Triplexes/data/ccres_ag.bed

In [None]:
!sort -k1,1V -k2,2n /content/drive/MyDrive/Triplexes/data/ccres_ag.bed > /content/drive/MyDrive/Triplexes/data/ccres_agn.bed

Cell-specific cCREs:

In [None]:
!zcat /content/drive/MyDrive/Triplexes/data/ccres_specific/*.bed.gz | cut -f 1-3,10 > /content/drive/MyDrive/Triplexes/data/ccres_s.bed

In [None]:
!sort -k1,1V -k2,2n /content/drive/MyDrive/Triplexes/data/ccres_s.bed > /content/drive/MyDrive/Triplexes/data/ccres_spec.bed

Chromatin states:

In [None]:
!zcat /content/drive/MyDrive/Triplexes/data/chrom_states/*.bed.gz | cut -f 1-4 > /content/drive/MyDrive/Triplexes/data/chr_st.bed

In [None]:
!sort -k1,1V -k2,2n /content/drive/MyDrive/Triplexes/data/chr_st.bed > /content/drive/MyDrive/Triplexes/data/chr_states.bed

## Filtering for black-listed regions

In [None]:
!gdown https://github.com/Boyle-Lab/Blacklist/raw/master/lists/hg38-blacklist.v2.bed.gz

Downloading...
From: https://github.com/Boyle-Lab/Blacklist/raw/master/lists/hg38-blacklist.v2.bed.gz
To: /content/hg38-blacklist.v2.bed.gz
  0% 0.00/5.87k [00:00<?, ?B/s]100% 5.87k/5.87k [00:00<00:00, 18.4MB/s]


### Cell-agnostic cCREs

In [None]:
# Drop every cell-agnostic cCRE segment that overlaps a black-listed region and write the result
# back to ccres_agn.bed.
# The subtraction goes through pybedtools instead of capturing `!bedtools ...` output: the result
# stays in a temporary BED file rather than a Python list of text lines, and the BedTool is backed
# by a real file instead of being built from raw strings.
# Overwriting the input path is safe: subtract() has finished writing its temporary file before
# saveas() copies it over the input.
ccres_agn = BedTool('/content/drive/MyDrive/Triplexes/data/ccres_agn.bed').subtract('/content/hg38-blacklist.v2.bed.gz')
ccres_agn.saveas('/content/drive/MyDrive/Triplexes/data/ccres_agn.bed')

<BedTool(/content/drive/MyDrive/Triplexes/data/ccres_agn.bed)>

In [None]:
# Number of cell-agnostic cCRE records left after black-list filtering.
print(len(ccres_agn))

923578


In [None]:
# Count records per label (4th column) of the filtered cell-agnostic cCREs.
# `ccres_agn` is a pybedtools BedTool, which has no `.Name` attribute, so the label column is read
# into pandas from the BedTool's backing file first. Column names match those used for chr_states.bed.
pd.read_csv(ccres_agn.fn, sep='\t', names=('Chromosome', 'Start', 'End', 'Name'), usecols=['Name'])['Name'].value_counts()

dELS                        448044
dELS,CTCF-bound             217991
pELS,CTCF-bound              77056
pELS                         64184
CTCF-only,CTCF-bound         56400
PLS,CTCF-bound               27057
DNase-H3K4me3                16600
DNase-H3K4me3,CTCF-bound      8706
PLS                           7540
Name: Name, dtype: int64

### Cell-specific cCREs

In [None]:
# Remove every cell-specific cCRE segment that overlaps a black-listed region.
# pybedtools runs `bedtools subtract` and keeps the result in a temporary BED file, so the tens of
# millions of output lines are not materialised as a Python list of strings.
ccres = BedTool('/content/drive/MyDrive/Triplexes/data/ccres_spec.bed').subtract('/content/hg38-blacklist.v2.bed.gz')
len(ccres)  # number of records after filtering

43453317

In [None]:
# Wrap the black-list-filtered result in a pybedtools BedTool (rebinds `ccres`).
ccres = BedTool(ccres)

In [None]:
# Count records per label (4th column) of the filtered cell-specific cCREs.
# `ccres` is a pybedtools BedTool, which has no `.Name` attribute, so only the label column is read
# into pandas from the BedTool's backing file. Column names match those used for chr_states.bed.
pd.read_csv(ccres.fn, sep='\t', names=('Chromosome', 'Start', 'End', 'Name'), usecols=['Name'])['Name'].value_counts()

Low-DNase                    23756041
Unclassified                 15170426
DNase-only                    2101460
High-H3K4me3                   804073
High-H3K4me3,High-H3K27ac      380206
High-CTCF                      357039
High-H3K27ac                   230199
DNase-only,CTCF-bound          191768
PLS                            135801
pELS                           100849
dELS                            97633
DNase-H3K4me3                   35247
CTCF-only,CTCF-bound            33429
PLS,CTCF-bound                  19962
High-H3K4me3,High-CTCF          15449
pELS,CTCF-bound                 10206
dELS,CTCF-bound                  8036
DNase-H3K4me3,CTCF-bound         5493
Name: Name, dtype: int64

In [None]:
# Print the total number of cell-specific cCRE records, then the number whose label is not 'Unclassified'.
# `ccres` is a BedTool and cannot be indexed by column name, so only the label column is read into pandas.
ccres_labels = pd.read_csv(ccres.fn, sep='\t', names=('Chromosome', 'Start', 'End', 'Name'), usecols=['Name'])['Name']
print(len(ccres_labels))
print(len(ccres_labels) - len(ccres_labels[ccres_labels == 'Unclassified']))

43453317
28282891


In [None]:
# Persist the filtered cell-specific cCREs; this overwrites the sorted, unfiltered ccres_spec.bed written earlier.
ccres.saveas('/content/drive/MyDrive/Triplexes/data/ccres_spec.bed')

<BedTool(/content/drive/MyDrive/Triplexes/ccres_spec.bed)>

### Chromatin states

In [None]:
# Remove every chromatin-state segment that overlaps a black-listed region.
# pybedtools runs `bedtools subtract` and keeps the result in a temporary BED file instead of
# capturing the command output as a Python list of text lines.
chr_states = BedTool('/content/drive/MyDrive/Triplexes/data/chr_states.bed').subtract('/content/hg38-blacklist.v2.bed.gz')
len(chr_states)  # number of records after filtering

456066

In [None]:
# Wrap the filtered result in a BedTool and persist it; this overwrites the sorted, unfiltered
# chr_states.bed written earlier.
chr_states = BedTool(chr_states)
chr_states.saveas('/content/drive/MyDrive/Triplexes/data/chr_states.bed')

<BedTool(/content/drive/MyDrive/Triplexes/data/chr_states.bed)>

In [None]:
# Load the filtered chromatin-state table (tab-separated, no header row) into a DataFrame;
# from here on `chr_states` is a DataFrame rather than a BedTool.
chr_states = pd.read_csv(
    '/content/drive/MyDrive/Triplexes/data/chr_states.bed',
    sep='\t',
    header=None,
    names=('Chromosome', 'Start', 'End', 'Name'),
)
chr_states.head()

Unnamed: 0,Chromosome,Start,End,Name
0,chr1,792500,826620,Quies
1,chr1,826620,827620,TssFlnk
2,chr1,827620,865420,Quies
3,chr1,865420,866620,EnhWk
4,chr1,866620,869420,TxWk


In [None]:
# Show the distinct state labels present after filtering, in order of first appearance.
state_labels = pd.unique(chr_states['Name'])
print(state_labels)

['Quies' 'TssFlnk' 'EnhWk' 'TxWk' 'ReprPCWk' 'ReprPC' 'EnhBiv' 'TssBiv'
 'EnhA2' 'Tx' 'TssFlnkD' 'TssFlnkU' 'TssA' 'EnhA1' 'ZNF/Rpts' 'Het'
 'EnhG2' 'EnhG1']
