<a href="https://colab.research.google.com/github/irinaachikhmina/Triplexes/blob/main/1_07_Data_processing_hg.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
# Mount Google Drive at /content/drive; every data path below lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install ray

In [None]:
!pip install pyranges

In [None]:
!pip install pybedtools

In [None]:
!pip install biopython

In [None]:
import os
import pickle
import random
import sys
from collections import defaultdict

import Bio
import numpy as np
import pandas as pd
from Bio import SeqIO
from joblib import Parallel, delayed, dump, load
from pybedtools import BedTool
from tqdm import trange
from tqdm.notebook import tqdm

In [None]:
# Sparse vector
!git clone --recurse-submodules https://github.com/Nazar1997/Sparse_vector
from Sparse_vector.sparse_vector import SparseVector

# Chromosomes

In [None]:
# Names of the 24 chromosomes handled in this notebook: chr1 .. chr22, chrX, chrY.
chrom_names = ['chr' + str(label) for label in [*range(1, 23), 'X', 'Y']]

In [None]:
# Read one FASTA record per chromosome and keep track of the total genome
# length; `lens_of_chroms` maps each chromosome name to its sequence length.
chroms = {}
length_genome = 0
for name in tqdm(chrom_names):
    # 'chr1' -> '.../Homo_sapiens.GRCh38.dna.chromosome.1.fa'
    fasta_path = ''.join(['/content/drive/MyDrive/Triplexes/data/hg38/',
                          'Homo_sapiens.GRCh38.dna.',
                          name[:3],
                          'omosome.',
                          name[3:],
                          '.fa'])
    chroms[name] = SeqIO.read(fasta_path, 'fasta')
    length_genome += len(chroms[name])
lens_of_chroms = {chrom: len(record) for chrom, record in chroms.items()}

  0%|          | 0/24 [00:00<?, ?it/s]

In [None]:
# Drop the parsed FASTA records; from the loading step only `lens_of_chroms` stays available.
# NOTE(review): `get_subseq` and the negative-class sampling loop further down still read
# `chroms`, so the records must be loaded again before those cells run — confirm the intended cell order.
del chroms

## Triplex regions

### Sparse vector

In [None]:
# Build a per-chromosome 0/1 track marking the positions covered by triplex
# intervals from GF.bed and dump it as a dict {chromosome: SparseVector}.
#
# `chrom_names` is used here instead of `chroms`: the FASTA records are removed
# with `del chroms` before this cell, so referring to `chroms` raises NameError
# when the notebook is run top to bottom.
loc_dd = {chrm: SparseVector(lens_of_chroms[chrm]) for chrm in chrom_names}
df = pd.read_csv(f'/content/drive/MyDrive/Triplexes/GF.bed', sep='\t', header=None)
df.columns = ['chrom', 'Start', 'End', 'Name', 'Score']
for chrom, sub_df in tqdm(df.groupby('chrom')):
    # Skip contigs that are not among the chromosomes handled here.
    if chrom not in chrom_names:
        continue
    vec = np.zeros(lens_of_chroms[chrom])
    for inter in sub_df.values:
        # inter[1] / inter[2] are the Start / End columns.
        # NOTE(review): End is treated as inclusive (`+1`); BED ends are
        # conventionally exclusive — confirm this is intended.
        vec[inter[1]:inter[2]+1] = np.maximum(vec[inter[1]:inter[2]+1], 1)
    loc_dd[chrom] = SparseVector(vec)

dump(loc_dd, f'/content/drive/MyDrive/Triplexes/data/omics_sparse/triplex.pkl', 3)

  0%|          | 0/24 [00:00<?, ?it/s]

['/content/drive/MyDrive/Triplexes/data/omics_sparse/triplex.pkl']

### Intervals positive and negative

In [None]:
# Window geometry of the sequence data set: every sample is `length` bases long,
# and consecutive windows cut from one long record share `overlap` bases.
length, overlap = 34, 2

In [None]:
# Triplex intervals (the same GF.bed used for the sparse track above) wrapped in a
# BedTool so that they can be iterated record by record.
GF = BedTool('/content/drive/MyDrive/Triplexes/GF.bed')

In [None]:
def get_splices(record, subseq_len=length):
  """Cut an interval into windows of at most `subseq_len` bases.

  A record that is not longer than `subseq_len` is returned unchanged as a
  one-element list. A longer record is tiled from `record.start`: each full
  window spans `subseq_len` bases and the next window starts `overlap` bases
  (module-level setting) before the previous one ends; the last window is
  clipped at `record.end`.

  Returns a list of interval objects: either the input record itself or
  intervals parsed by BedTool from "chrom start end" strings.
  """
  if record.end - record.start <= subseq_len:
    return [record]

  pieces = []
  left = record.start
  while left < record.end:
    right = left + subseq_len
    if right >= record.end:
      # Final window: clip it to the record end; the full-size jump moves
      # `left` past `record.end`, which terminates the loop.
      fields = [record.chrom, left, record.end]
      left += subseq_len
    else:
      fields = [record.chrom, left, right]
      left += subseq_len - overlap
    line = ' '.join(str(el) for el in fields)
    pieces.append(BedTool(line, from_string=True)[0])
  return pieces

In [None]:
def get_subseq(record, subseq_len=length):
  """Return a one-element list holding the sequence of the window that starts at `record.start`.

  The window always ends at `record.start + subseq_len`, so a record shorter
  than `subseq_len` is extended to the right with the neighbouring genomic
  bases and a longer one is cut. The sequence is read from the module-level
  `chroms[record.chrom]`.
  NOTE(review): close to a chromosome end the slice can come out shorter than
  `subseq_len` — confirm callers tolerate that.
  """
  # record.end + (subseq_len - (record.end - record.start)) == record.start + subseq_len
  window_end = record.start + subseq_len
  chromosome = chroms[record.chrom]
  return [chromosome[record.start:window_end].seq]

In [None]:
def seq_to_features(seq):
    """One-hot encode a nucleotide sequence.

    Returns an array of shape (len(seq), 4) whose columns stand for A, T, G, C
    in that order. Matching ignores case; any other symbol (for example N)
    produces an all-zero row.
    """
    column_of = {'a': 0, 't': 1, 'g': 2, 'c': 3}
    feat = np.zeros((len(seq), 4))
    for row, nuc in enumerate(seq):
        col = column_of.get(nuc.lower())
        if col is not None:
            feat[row, col] = 1
    return feat

Sequences longer than 34 nt are cut into overlapping 34-nt subsequences, shorter ones are extended to 34 nt with the downstream genomic sequence, and every window is one-hot encoded.

In [None]:
# Split every GF record into windows of at most `length` bases (see get_splices).
# A list (not a generator) is passed to BedTool because `intervals` is iterated
# twice below.
intervals = BedTool([piece for record in GF for piece in get_splices(record)])

# Fixed-width coordinates (start .. start + length) of every positive window.
ints_in = BedTool([(record.chrom, record.start, record.start + length)
                   for record in intervals])

HDNA_pos = []
# Positions already occupied on each chromosome; the negative-class sampling
# uses these sets to avoid overlapping the positives.
# NOTE(review): a record on a contig outside `chrom_names` raises KeyError below.
chroms_used = {name: set() for name in chrom_names}

for record in intervals:
  HDNA_pos.extend(seq_to_features(x) for x in get_subseq(record))
  chroms_used[record.chrom].update(range(record.start, record.end))

print(len(HDNA_pos))
print(HDNA_pos[0].shape)

9019
(34, 4)


In [None]:
# Persist the positive set: window coordinates as a BED file and the one-hot
# encoded windows as a pickle.
ints_in.saveas('/content/drive/MyDrive/Triplexes/data/ints_in.bed')
with open('/content/drive/MyDrive/Triplexes/data/HDNA_pos.pkl', 'wb') as fp:
   pickle.dump(HDNA_pos, fp)

Negative class

In [None]:
from random import choice, randrange

In [None]:
# Sample negative windows for several negative:positive ratios and save the
# coordinates of each set to ints_out{portion}.bed.
portions = [1, 3, 5, 10, 20, 50, 70]
# Seed the module-level generator behind `choice` / `randrange` so that the
# sampled intervals (and the saved BED files) are reproducible between runs.
random.seed(42)
num_GF_pos = len(ints_in)  # loop-invariant number of positive windows
for portion in portions:
  num_GF_neg = num_GF_pos * portion

  GF_neg = []

  for i in tqdm(range(num_GF_neg)):
    # Rejection sampling: draw a random chromosome and start until the window
    # (a) has an 'N' fraction below 0.01 — with length = 34 a single 'N' is
    # enough to reject — and (b) touches no position that is already used.
    while True:
      chrom = choice(chrom_names)
      start = randrange(len(chroms[chrom]) - length)
      end = start + length
      window = chroms[chrom].seq[start:end]
      if window.count('N') / length < 0.01 and \
         chroms_used[chrom].isdisjoint(range(start, end)):
        break
    # NOTE(review): `chroms_used` is shared by all portions, so each set is also
    # disjoint from the negatives drawn for the previous portions — confirm intended.
    chroms_used[chrom].update(range(start, end))
    GF_neg.append((chrom, start, end))

  ints_out = BedTool(GF_neg)

  HDNA_neg = []
  for record in ints_out:
    HDNA_neg.extend(seq_to_features(x) for x in get_subseq(record))

  print(len(HDNA_neg))
  print(HDNA_neg[0].shape)

  ints_out.saveas(f'/content/drive/MyDrive/Triplexes/data/ints_out{portion}.bed')
# Saving the encoded negatives is currently disabled:
#  with open(f'/content/drive/MyDrive/Triplexes/data/HDNA_neg{portion}.pkl', 'wb') as fp:
#    pickle.dump(HDNA_neg, fp)

  0%|          | 0/9019 [00:00<?, ?it/s]

9019
(34, 4)


  0%|          | 0/27057 [00:00<?, ?it/s]

27057
(34, 4)


  0%|          | 0/45095 [00:00<?, ?it/s]

45095
(34, 4)


  0%|          | 0/90190 [00:00<?, ?it/s]

90190
(34, 4)


  0%|          | 0/180380 [00:00<?, ?it/s]

180380
(34, 4)


  0%|          | 0/450950 [00:00<?, ?it/s]

450950
(34, 4)


  0%|          | 0/631330 [00:00<?, ?it/s]

631330
(34, 4)


## Regulatory data

### Histone marks, transcription factors, RNA polymerase, DNase

In [None]:
# Names of the raw regulatory-data files, taken from the directory listing in reverse.
# NOTE(review): os.listdir returns entries in arbitrary order, so the reversal
# does not guarantee any particular processing order.
files = os.listdir('/content/drive/MyDrive/Triplexes/data/omics_raw/')[::-1]

In [None]:
# Collect every (group, factor) pair found in the raw .bed tables and save the
# list; the group is the file name without its '.bed' extension.
factors = []
for file in files:
  if not file.endswith('.bed'):
    continue
  group = file[:-4]
  df = pd.read_csv(f'/content/drive/MyDrive/Triplexes/data/omics_raw/{file}', sep='\t')
  unique_factors = df.Factor.unique()
  print(group, len(unique_factors))
  for factor in unique_factors:
    factors.append((group, factor))
factors_df = pd.DataFrame(factors, columns=['Group', 'Factor'])
factors_df.to_csv('/content/drive/MyDrive/Triplexes/data/factors_df.bed')

In [None]:
# Peek at the first two lines of the transcription-factor table to check its layout.
! head -2 /content/drive/MyDrive/Triplexes/data/omics_raw/tf.bed

In [None]:
def sparser(file, lens_of_chroms, chrom_names, done_files):
  """Turn each factor of one raw regulatory table into per-chromosome SparseVectors.

  Reads `omics_raw/{file}` (tab-separated, with `Factor` and `Chromosome`
  columns) and, for every factor not listed in `done_files`, builds one dense
  track per chromosome in `chrom_names`: for each row, the positions from
  column 1 to column 2 (inclusive) are raised to at least the value in column 5.
  Chromosomes without rows get an all-zero track. The resulting
  {chromosome: SparseVector} dict is dumped to `omics_sparse/{factor}.pkl`.
  Chromosome names outside `chrom_names` are printed and skipped.

  NOTE(review): columns are addressed by position; column 5 is presumably the
  signal value — confirm against the table layout.
  NOTE(review): another function named `sparser` is defined later in this
  notebook and replaces this one once that cell has run.
  """
  table = pd.read_csv(f'/content/drive/MyDrive/Triplexes/data/omics_raw/{file}', sep='\t')
  for factor, factor_df in tqdm(table.groupby(table.Factor), desc=file):
    if factor in done_files:
      continue
    loc_dd = {}
    for chrom, sub_df in tqdm(factor_df.groupby(factor_df.Chromosome), desc=factor):
      if chrom not in chrom_names:
        print(chrom)
        continue
      track = np.zeros(lens_of_chroms[chrom])
      for row in sub_df.values:
        begin, stop, signal = row[1], row[2] + 1, row[5]
        # Overlapping intervals keep the largest value seen so far.
        track[begin:stop] = np.maximum(track[begin:stop], signal)
      loc_dd[chrom] = SparseVector(track)
    # Every expected chromosome must be present in the dump, even without signal.
    absent = [chrom for chrom in chrom_names if chrom not in loc_dd]
    for chrom in absent:
      loc_dd[chrom] = SparseVector(np.zeros(lens_of_chroms[chrom]))
    dump(loc_dd, f'/content/drive/MyDrive/Triplexes/data/omics_sparse/{factor}.pkl', 3)

In [None]:
# Factors that already have a dump in omics_sparse/ (file names with their last
# four characters, i.e. the extension, removed).
done_files = {entry[:-4] for entry in os.listdir('/content/drive/MyDrive/Triplexes/data/omics_sparse/')}
len(done_files)

In [None]:
# Convert every raw .bed table; other directory entries are ignored.
for file in files:
  if not file.endswith('.bed'):
    continue
  sparser(file, lens_of_chroms, chrom_names, done_files)

## Chromatin states and TAD boundaries

In [None]:
# Files with the remaining annotations (chromatin states, TAD boundaries); displayed for inspection.
files_others = os.listdir('/content/drive/MyDrive/Triplexes/data/omics_raw_others/')[::-1]
files_others

['tad.bed', 'chr_states.bed']

In [None]:
# Peek at the chromatin-state table: four tab-separated columns and no header line.
# NOTE(review): this file is removed by a later cell, so the command fails when re-run afterwards.
!head -2 /content/drive/MyDrive/Triplexes/data/omics_raw_others/chr_states.bed

chr1	792500	826620	Quies
chr1	826620	827620	TssFlnk


In [None]:
# Rename the state label ZNF/Rpts to ZNF_Rpts: labels are later used as file names,
# where '/' would be read as a directory separator.
# sed replaces the label as a whole string; `tr` maps single characters and
# would rewrite every '/' in the file.
! sed 's|ZNF/Rpts|ZNF_Rpts|g' < /content/drive/MyDrive/Triplexes/data/omics_raw_others/chr_states.bed > /content/drive/MyDrive/Triplexes/data/omics_raw_others/chr_states2.bed

In [None]:
import os
# Remove the original table now that chr_states2.bed holds the renamed copy.
# The existence check keeps the cell re-runnable: without it a second run
# raises FileNotFoundError because the file is already gone.
chr_states_path = '/content/drive/MyDrive/Triplexes/data/omics_raw_others/chr_states.bed'
if os.path.exists(chr_states_path):
  os.remove(chr_states_path)

In [None]:
def sparser(file, lens_of_chroms, chrom_names):
  """Turn each chromatin state of `omics_raw_others/{file}` into per-chromosome 0/1 SparseVectors.

  The table is tab-separated with four columns (chromosome, start, end, state
  label) and has no header line. For every state label, positions from start to
  end (inclusive) are set to 1 on the chromosomes listed in `chrom_names`;
  chromosomes without intervals get an all-zero track. Each
  {chromosome: SparseVector} dict is dumped to `states_sparse/{label}.pkl`.

  NOTE(review): this definition reuses the name `sparser` of the earlier
  regulatory-data converter and replaces it from this cell on.
  """
  # header=None: the first line of the file is data; letting pandas treat it as
  # a header silently drops the first interval.
  df = pd.read_csv(f'/content/drive/MyDrive/Triplexes/data/omics_raw_others/{file}', sep='\t', header=None)
  df.columns = ['chrom', 'Start', 'End', 'Factor']
  for factor in df.Factor.unique():
      factor_df = df[df.Factor == factor]
      loc_dd = {}
      for chrom, sub_df in tqdm(factor_df.groupby(factor_df.columns[0]), desc=factor):
          if chrom not in chrom_names:
              continue
          vec = np.zeros(lens_of_chroms[chrom])
          for inter in sub_df.values:
              # inter[1] / inter[2] are Start / End; End is treated as inclusive.
              vec[inter[1]:inter[2]+1] = np.maximum(vec[inter[1]:inter[2]+1], 1)
          loc_dd[chrom] = SparseVector(vec)
      # Keep every expected chromosome in the dump, as the other track builders
      # in this notebook do, so that look-ups by chromosome never miss a key.
      for chrom in chrom_names:
          if chrom not in loc_dd:
              loc_dd[chrom] = SparseVector(np.zeros(lens_of_chroms[chrom]))
      dump(loc_dd, f'/content/drive/MyDrive/Triplexes/data/states_sparse/{factor}.pkl', 3)

In [None]:
# Convert the chromatin-state table (the copy with the renamed ZNF_Rpts label).
sparser('chr_states2.bed', lens_of_chroms, chrom_names)

TssFlnk:   0%|          | 0/23 [00:00<?, ?it/s]

Quies:   0%|          | 0/23 [00:00<?, ?it/s]

EnhWk:   0%|          | 0/23 [00:00<?, ?it/s]

TxWk:   0%|          | 0/23 [00:00<?, ?it/s]

ReprPCWk:   0%|          | 0/23 [00:00<?, ?it/s]

ReprPC:   0%|          | 0/23 [00:00<?, ?it/s]

EnhBiv:   0%|          | 0/23 [00:00<?, ?it/s]

TssBiv:   0%|          | 0/23 [00:00<?, ?it/s]

EnhA2:   0%|          | 0/23 [00:00<?, ?it/s]

Tx:   0%|          | 0/23 [00:00<?, ?it/s]

TssFlnkD:   0%|          | 0/23 [00:00<?, ?it/s]

TssFlnkU:   0%|          | 0/23 [00:00<?, ?it/s]

TssA:   0%|          | 0/23 [00:00<?, ?it/s]

EnhA1:   0%|          | 0/23 [00:00<?, ?it/s]

ZNF_Rpts:   0%|          | 0/23 [00:00<?, ?it/s]

Het:   0%|          | 0/23 [00:00<?, ?it/s]

EnhG2:   0%|          | 0/23 [00:00<?, ?it/s]

EnhG1:   0%|          | 0/23 [00:00<?, ?it/s]

In [None]:
# Peek at the TAD-boundary table: five tab-separated columns and no header line.
!head -2 /content/drive/MyDrive/Triplexes/data/omics_raw_others/tad.bed

chr1	3403436	3443436	chr1:3320001-3360000	1
chr1	6019940	6059940	chr1:6080001-6120000	1


In [None]:
# Per-chromosome 0/1 track of the TAD-boundary intervals in tad.bed, dumped as a
# dict {chromosome: SparseVector}. Every chromosome in `chrom_names` gets an
# entry up front; those with intervals are overwritten below.
loc_dd = {chrm: SparseVector(lens_of_chroms[chrm]) for chrm in chrom_names}
df = pd.read_csv(f'/content/drive/MyDrive/Triplexes/data/omics_raw_others/tad.bed', sep='\t', header=None)
df.columns = ['chrom', 'Start', 'End', 'Name', 'Score']
for chrom, sub_df in tqdm(df.groupby('chrom')):
    if chrom in chrom_names:
        vec = np.zeros(lens_of_chroms[chrom])
        for inter in sub_df.values:
            # Start .. End (End treated as inclusive) is marked with 1.
            first, last = inter[1], inter[2] + 1
            vec[first:last] = np.maximum(vec[first:last], 1)
        loc_dd[chrom] = SparseVector(vec)

dump(loc_dd, f'/content/drive/MyDrive/Triplexes/data/others/tad.pkl', 3)

  0%|          | 0/23 [00:00<?, ?it/s]

['/content/drive/MyDrive/Triplexes/data/others/tad.pkl']

# Checks

## Signal per chromosome

### TF

In [None]:
# For every transcription factor count on how many of the chromosomes in
# `chrom_names` it has entries, and report those present on fewer than 18.
file_tf = '/content/drive/MyDrive/Triplexes/data/omics_raw/tf.bed'
df_tf = pd.read_csv(file_tf, sep='\t')
for factor, factor_df in df_tf.groupby(df_tf.Factor):
  counts = sum(1 for chrom in factor_df.Chromosome.unique() if chrom in chrom_names)
  if counts < 18:
    print(factor, counts)

AATF 7
ELF2 17
IKZF3 2
IRF8 3
SUPT5H 2
YY2 4
ZNF274 17


In [None]:
# List every chromosome / contig name on which AATF has entries in tf.bed.
factor = 'AATF'
factor_df = df_tf[df_tf.Factor == factor]
print(factor_df.Chromosome.unique())

['chr1' 'chr1_KI270706v1_random' 'chr7' 'chr10' 'chr14_GL000225v1_random'
 'chr14_KI270723v1_random' 'chr16' 'chr17' 'chr17_KI270729v1_random'
 'chr20' 'chr21' 'chr22_KI270731v1_random' 'chr22_KI270733v1_random'
 'chr22_KI270734v1_random' 'chr22_KI270735v1_random'
 'chr22_KI270736v1_random' 'chr22_KI270737v1_random' 'chrUn_GL000214v1'
 'chrUn_GL000216v2' 'chrUn_GL000220v1' 'chrUn_GL000224v1'
 'chrUn_KI270303v1' 'chrUn_KI270304v1' 'chrUn_KI270310v1'
 'chrUn_KI270317v1' 'chrUn_KI270330v1' 'chrUn_KI270333v1'
 'chrUn_KI270336v1' 'chrUn_KI270337v1' 'chrUn_KI270411v1'
 'chrUn_KI270422v1' 'chrUn_KI270435v1' 'chrUn_KI270438v1'
 'chrUn_KI270442v1' 'chrUn_KI270466v1' 'chrUn_KI270467v1'
 'chrUn_KI270519v1' 'chrUn_KI270544v1' 'chrUn_KI270744v1'
 'chrUn_KI270756v1']


In [None]:
# Number and names of the distinct cell types in the TF table.
print(len(df_tf.Cell_type.unique()), df_tf.Cell_type.unique())

24 ['NALM-6' 'GM12878' 'RAJI' 'GM12891' 'Lymphoblastoidcellline' 'GM19239'
 'GM19238' 'GM12892' 'GM19240' 'RAMOS' 'GM18951' 'GM19099' 'Toledo'
 'GM10847' 'GM19193' 'GM18505' 'GM18526' 'Bcells' 'CD20+' 'P493-6'
 'Lymphoblastoidcells' 'BCBL1' 'Lymphoma,B-Cell' 'RPMI8226']


In [None]:
#No chr3
AATF = pd.read_csv('/content/drive/MyDrive/Triplexes/data/Oth.Bld.05.AATF.AllCell.bed', sep='\t', header=None)
AATF.columns = ['Chromosome', 'Start', 'End', 'Info', '1', '2', '3', '4', '5']
AATF_chr3 = AATF[AATF.Chromosome == 'chr3'].copy()
chrom3 = []
for i, line in AATF_chr3.iterrows():
  experiment = line.Info.split('ID=')[1].split(';')[0] 
  cell_type = line.Info.split('@%20')[1].split('%')[0]
  chrom3.append((line.Chromosome, line.Start, line.End, cell_type, experiment))
chrom3 = pd.DataFrame(chrom3)
chrom3.columns = ['Chromosome', 'Start', 'End', 'Cell_line', 'Experiment']
chrom3.Cell_line.unique()

array(['KMS-27);Title=GSM2690227:', 'NALM-6);Title=GSM2459096:'],
      dtype=object)

In [None]:
# chr3 peaks of the NALM-6 sample (B cells); all rows come from experiment SRX2493169.
chrom3[chrom3.Cell_line == 'NALM-6);Title=GSM2459096:']

Unnamed: 0,Chromosome,Start,End,Cell_line,Experiment
1,chr3,75669035,75669631,NALM-6);Title=GSM2459096:,SRX2493169
3,chr3,93470252,93470907,NALM-6);Title=GSM2459096:,SRX2493169
5,chr3,196898656,196899001,NALM-6);Title=GSM2459096:,SRX2493169


Blacklist:

In [None]:
# Load the hg38 blacklist and show the first chr3 region that starts after
# position 75,600,035 (just upstream of the first NALM-6 peak above).
# header=None: BED files carry no header line, so without it pandas would use
# the first interval as column names and drop it from the table.
# NOTE(review): the path is relative to the working directory, unlike the Drive
# paths used elsewhere — confirm where the file is expected to live.
blacklist = pd.read_csv('hg38-blacklist.v2.bed.gz', sep='\t', header=None)
blacklist.columns = ['Chromosome', 'Start', 'End', 'Description']
d = blacklist[blacklist.Chromosome == 'chr3']
d[d.Start > 75600035].head(1)

Unnamed: 0,Chromosome,Start,End,Description
259,chr3,75630100,75707800,High Signal Region


In [None]:
# First chr3 blacklist region starting after position 91,000,252 (to compare with the second peak).
d[d.Start > 91000252].head(1)

Unnamed: 0,Chromosome,Start,End,Description
266,chr3,91516200,93749200,High Signal Region


In [None]:
# First chr3 blacklist region starting after position 196,897,000 (to compare with the third peak).
d[d.Start > 196897000].head(1)

Unnamed: 0,Chromosome,Start,End,Description
313,chr3,196897800,196899800,High Signal Region


Conclusion: the processing is correct — these intervals lie within blacklisted 'High Signal Region' entries and were therefore filtered out.

In [None]:
#No chr2
AATF_chr2 = AATF[AATF.Chromosome == 'chr2']
chrom2 = []
for i, line in AATF_chr2.iterrows():
  experiment = line.Info.split('ID=')[1].split(';')[0] 
  cell_type = line.Info.split('@%20')[1].split('%')[0]
  chrom2.append((line.Chromosome, line.Start, line.End, cell_type, experiment))
chrom2 = pd.DataFrame(chrom2)
chrom2.columns = ['Chromosome', 'Start', 'End', 'Cell_line', 'Experiment']
chrom2.Cell_line.unique()

array(['NALM-6);Title=GSM2459096:', 'KMS-27);Title=GSM2690227:'],
      dtype=object)

In [None]:
# chr2 peaks of the NALM-6 sample (B cells); same experiment as on chr3 (SRX2493169).
chrom2[chrom2.Cell_line == 'NALM-6);Title=GSM2459096:']

Unnamed: 0,Chromosome,Start,End,Cell_line,Experiment
0,chr2,32916226,32916750,NALM-6);Title=GSM2459096:,SRX2493169
4,chr2,89829178,89829498,NALM-6);Title=GSM2459096:,SRX2493169
5,chr2,89830928,89831185,NALM-6);Title=GSM2459096:,SRX2493169
6,chr2,89840172,89840543,NALM-6);Title=GSM2459096:,SRX2493169
8,chr2,109199265,109199912,NALM-6);Title=GSM2459096:,SRX2493169


Blacklist:

In [None]:
# First chr2 blacklist region starting after position 32,915,000 (to compare with the first chr2 peak).
d = blacklist[blacklist.Chromosome == 'chr2']
d[d.Start > 32915000].head(1)

Unnamed: 0,Chromosome,Start,End,Description
195,chr2,32915300,32918400,High Signal Region


### Histone marks

In [None]:
# For every histone mark print on how many of the chromosomes in `chrom_names`
# it has entries.
file = '/content/drive/MyDrive/Triplexes/data/omics_raw/his.bed'
df = pd.read_csv(file, sep='\t')
for factor, factor_df in df.groupby(df.Factor):
  counts = sum(1 for chrom in factor_df.Chromosome.unique() if chrom in chrom_names)
  print(factor, counts)

H2A.XS139ph 19
H2A.Z 24
H2BK20ac 22
H3 23
H3.3 22
H3K27ac 24
H3K27me1 7
H3K27me3 24
H3K36me3 24
H3K4me1 24
H3K4me2 24
H3K4me3 24
H3K79me2 22
H3K9ac 24
H3K9me3 24
H3ac 24
H4K20me1 24
H4K5K8ac 23
H4R3me2 2


In [None]:
# Number and names of the distinct cell types in the histone table.
print(len(df.Cell_type.unique()), df.Cell_type.unique())

27 [' Lymphoblastoid cell line' ' RAJI' ' GM18951' ' GM19238' ' GM10847'
 ' GM19240' ' GM18526' ' GM12891' ' GM19239' ' GM12892' ' GM12878'
 ' GM19193' ' GM19099' ' RAMOS' ' GM18505' ' P493-6' ' B cells' ' Toledo'
 ' CD20+' ' B-cell (CD19+)' ' Lymphoma, B-Cell' ' BCBL1' ' CD20+ B cells'
 ' NALM-6' ' RPMI 8226' ' Lymphoblastoid cells' ' GM09027A']


In [None]:
#No chr3
H3K27me1 = pd.read_csv('/content/drive/MyDrive/Triplexes/data/His.Bld.05.H3K27me1.AllCell.bed', sep='\t', header=None)
H3K27me1.columns = ['Chromosome', 'Start', 'End', 'Info', '1', '2', '3', '4', '5']
H3K27me1_chr3 = H3K27me1[H3K27me1.Chromosome == 'chr3'].copy()
chrom3 = []
for i, line in H3K27me1_chr3.iterrows():
  experiment = line.Info.split('ID=')[1].split(';')[0] 
  cell_type = line.Info.split('@%20')[1].split('%')[0]
  chrom3.append((line.Chromosome, line.Start, line.End, cell_type, experiment))
chrom3 = pd.DataFrame(chrom3)
chrom3.columns = ['Chromosome', 'Start', 'End', 'Cell_line', 'Experiment']
chrom3.Cell_line.unique()

array(['CD34', 'CD4', 'Erythroblasts);Title=GSM1427063:', 'CD36',
       'Hematopoietic', 'B'], dtype=object)

In [None]:
# chr3 peaks whose parsed cell line is 'B' (B cells); all rows come from experiment SRX170351.
chrom3[chrom3.Cell_line == 'B']

Unnamed: 0,Chromosome,Start,End,Cell_line,Experiment
561,chr3,93470296,93470871,B,SRX170351
1296,chr3,196898754,196898899,B,SRX170351


Check with blacklist:

In [None]:
# First chr3 blacklist region starting after position 91,000,000 (to compare with the first B-cell peak).
d = blacklist[blacklist.Chromosome == 'chr3']
d[d.Start > 91000000].head(1)

Unnamed: 0,Chromosome,Start,End,Description
266,chr3,91516200,93749200,High Signal Region


In [None]:
# First chr3 blacklist region starting after position 196,897,000 (to compare with the second B-cell peak).
d[d.Start > 196897000].head(1)

Unnamed: 0,Chromosome,Start,End,Description
313,chr3,196897800,196899800,High Signal Region


Conclusion: the processing is correct — the intervals were filtered out.

In [None]:
#No chr4
H3K27me1_chr4 = H3K27me1[H3K27me1.Chromosome == 'chr4'].copy()
chrom4 = []
for i, line in H3K27me1_chr4.iterrows():
  experiment = line.Info.split('ID=')[1].split(';')[0] 
  cell_type = line.Info.split('@%20')[1].split('%')[0]
  chrom4.append((line.Chromosome, line.Start, line.End, cell_type, experiment))
chrom4 = pd.DataFrame(chrom4)
chrom4.columns = ['Chromosome', 'Start', 'End', 'Cell_line', 'Experiment']
chrom4.Cell_line.unique()

array(['CD4', 'CD34', 'Erythroblasts);Title=GSM1427063:', 'CD36',
       'Hematopoietic', 'B'], dtype=object)

In [None]:
# chr4 peaks whose parsed cell line is 'B' (B cells); same experiment as on chr3 (SRX170351).
chrom4[chrom4.Cell_line == 'B']

Unnamed: 0,Chromosome,Start,End,Cell_line,Experiment
618,chr4,49709136,49709574,B,SRX170351
624,chr4,49709698,49710233,B,SRX170351
628,chr4,49710448,49711913,B,SRX170351
656,chr4,51107184,51107648,B,SRX170351
664,chr4,51418737,51418945,B,SRX170351
1764,chr4,190179596,190179788,B,SRX170351


In [None]:
#No chr6
H3K27me1_chr6 = H3K27me1[H3K27me1.Chromosome == 'chr6'].copy()
chrom6 = []
for i, line in H3K27me1_chr6.iterrows():
  experiment = line.Info.split('ID=')[1].split(';')[0] 
  cell_type = line.Info.split('@%20')[1].split('%')[0]
  chrom6.append((line.Chromosome, line.Start, line.End, cell_type, experiment))
chrom6 = pd.DataFrame(chrom6)
chrom6.columns = ['Chromosome', 'Start', 'End', 'Cell_line', 'Experiment']
chrom6.Cell_line.unique()

array(['Erythroblasts);Title=GSM1427063:', 'CD34', 'CD36',
       'Hematopoietic', 'CD4', 'B'], dtype=object)

In [None]:
# chr6 peaks whose parsed cell line is 'B' (B cells); same experiment as on chr3 (SRX170351).
chrom6[chrom6.Cell_line == 'B']

Unnamed: 0,Chromosome,Start,End,Cell_line,Experiment
1091,chr6,157310833,157310990,B,SRX170351
1265,chr6,170398627,170399895,B,SRX170351
1270,chr6,170400011,170400248,B,SRX170351


### Conclusion
Regulatory marks that are present on only 1–7 chromosomes will be excluded from the analysis.