## Extract labels from DREAM

- NANOG: https://www.synapse.org/#!Synapse:syn8442110
- CEBPB: https://www.synapse.org/#!Synapse:syn8442119
- JUND: https://www.synapse.org/#!Synapse:syn8442086

- U = unbound (negatives)
- A = ambiguous (not used for prediction evaluation)
- B = bound (positives)

In [35]:
# Integer code assigned to each DREAM label letter.
label_map = {
    "U": 0,   # unbound (negatives)
    "A": -1,  # ambiguous (not used for prediction evaluation)
    "B": 1,   # bound (positives)
}

In [1]:
from m_kipoi.config import get_data_dir

Using TensorFlow backend.


In [2]:
# Data directory returned by m_kipoi.config.get_data_dir(); the DREAM folder path is built from it.
ddir = get_data_dir()

In [5]:
import pandas as pd

In [10]:
from pathlib import Path

In [11]:
# Folder holding the DREAM label tables (<TF>.train.labels.tsv.gz) and related files.
tfdir = Path(f"{ddir}/raw/tfbinding/eval/tf-DREAM")

In [12]:
ls {tfdir}

[0m[01;31mCEBPB.train.labels.tsv.gz[0m
chr8_wide_bin101_flank0_stride101.CEBPB.HeLa-S3.intervals_file.tsv
[01;31mchr8_wide_bin101_flank0_stride101.CEBPB.HeLa-S3.intervals_file.tsv.gz[0m
chr8_wide_bin101_flank0_stride101.JUND.HepG2.intervals_file.tsv
[01;31mchr8_wide_bin101_flank0_stride101.JUND.HepG2.intervals_file.tsv.gz[0m
chr8_wide_bin101_flank0_stride101.MAFK.K562.intervals_file.tsv
[01;31mchr8_wide_bin101_flank0_stride101.MAFK.K562.intervals_file.tsv.gz[0m
chr8_wide_bin101_flank0_stride101.NANOG.H1-hESC.intervals_file.tsv
[01;31mchr8_wide_bin101_flank0_stride101.NANOG.H1-hESC.intervals_file.tsv.gz[0m
[01;31mDNASE.H1-hESC.relaxed.narrowPeak.gz[0m
[01;31mDNASE.HeLa-S3.relaxed.narrowPeak.gz[0m
[01;31mDNASE.HepG2.relaxed.narrowPeak.gz[0m
[01;31mDNASE.K562.relaxed.narrowPeak.gz[0m
[01;31mJUND.train.labels.tsv.gz[0m
[01;31mNANOG.train.labels.tsv.gz[0m
README.md


In [19]:
from m_kipoi.exp.tfbinding.config import TF_C_pairs

[31m
---
Installed package 'kipoi=0.6.3' doesn't comply with 'kipoi>=0.6.16'
---
install or update the missing packages
[0m
Note: If you don't want to auto_update the model source, 
add `auto_update: False` to ~/.kipoi/config.yaml

[31m
---
Installed package 'kipoi=0.6.3' doesn't comply with 'kipoi>=0.6.9'
---
install or update the missing packages
[0m
Note: If you don't want to auto_update the model source, 
add `auto_update: False` to ~/.kipoi/config.yaml



### Implement

In [20]:
TF_C_pairs

[('CEBPB', 'HeLa-S3'),
 ('JUND', 'HepG2'),
 ('MAFK', 'K562'),
 ('NANOG', 'H1-hESC')]

In [21]:
# Look-up table built from the (TF, cell type) pairs: TF name -> cell type.
cell_types = dict(TF_C_pairs)

In [22]:
cell_types

{'CEBPB': 'HeLa-S3', 'JUND': 'HepG2', 'MAFK': 'K562', 'NANOG': 'H1-hESC'}

In [23]:
# TF used to prototype the extraction steps in the cells below.
tf = 'JUND'

In [60]:
# Preview only: load the first 2000 rows of the label table for the selected TF.
df = pd.read_csv(tfdir / f'{tf}.train.labels.tsv.gz', sep='\t', nrows=2000)
# The chr8 filter applied in the full run is left disabled for this preview.
#df = df[df.chr == 'chr8']
df.head()

Unnamed: 0,chr,start,stop,HCT116,HeLa-S3,HepG2,K562,MCF-7
0,chr1,600,800,U,U,U,U,U
1,chr1,650,850,U,U,U,U,U
2,chr1,700,900,U,U,U,U,U
3,chr1,750,950,U,U,U,U,U
4,chr1,800,1000,U,U,U,U,U


In [61]:
cell_types[tf]

'HepG2'

In [62]:
# Fail early if the label table has no column named after this TF's cell type.
assert cell_types[tf] in df

In [63]:
# Keep the interval coordinates plus the label column of the matching cell type.
df = df[['chr', 'start', 'stop', cell_types[tf]]]

In [64]:
# Peek at the first label of the selected cell type; the column is looked up
# from `tf` so the cell keeps working when a different TF is chosen.
df[cell_types[tf]][0]

'U'

In [65]:
df.head()

Unnamed: 0,chr,start,stop,HepG2
0,chr1,600,800,U
1,chr1,650,850,U
2,chr1,700,900,U
3,chr1,750,950,U
4,chr1,800,1000,U


In [66]:
# Encode the label letters as integers using label_map.
label_col = cell_types[tf]

# Series.map turns every value missing from the mapping into NaN without
# complaint (this also happens when the cell is run a second time on already
# encoded labels), so check the column first and fail loudly instead.
unknown_labels = set(df[label_col].unique()) - set(label_map)
assert not unknown_labels, f"unexpected labels in {label_col}: {unknown_labels}"

# assign() builds a new frame rather than writing into one derived from a selection.
df = df.assign(**{label_col: df[label_col].map(label_map)})

In [67]:
df.head()

# NOTE(review): df here is the 2000-row preview (chr1 rows in the outputs of
# this notebook), yet the file name says chr8. The loop under "Run the script"
# writes the same path for JUND from the chr8 subset.
# Output is tab-separated, without the index and without a header row.
df.to_csv(tfdir / f'DREAM.chr8.{tf}.{cell_types[tf]}.bed', sep='\t', index=False, header=None)

In [68]:
# Path of the label file for the current TF / cell type, kept for the shell preview that follows.
a = tfdir / f'DREAM.chr8.{tf}.{cell_types[tf]}.bed'

In [69]:
!head {a}

chr1	600	800	0
chr1	650	850	0
chr1	700	900	0
chr1	750	950	0
chr1	800	1000	0
chr1	850	1050	0
chr1	900	1100	0
chr1	950	1150	0
chr1	1000	1200	0
chr1	1050	1250	0


### Run the script

In [71]:
# For each TF: read its DREAM label table, keep the chr8 rows and the label
# column of the TF's cell type, encode the labels with label_map and write the
# result as a tab-separated file without index or header.
for tf in ['CEBPB', 'JUND', 'NANOG']:
    print(tf)
    cell_type = cell_types[tf]
    df = pd.read_csv(tfdir / f'{tf}.train.labels.tsv.gz', sep='\t')

    # subset the table
    assert cell_type in df, f"no '{cell_type}' column in the {tf} label table"
    # a single .loc selection yields a fresh frame, so the steps below never
    # write into a slice of the full table
    df = df.loc[df.chr == 'chr8', ['chr', 'start', 'stop', cell_type]]

    # map labels
    # Series.map silently converts values missing from label_map to NaN,
    # so reject unexpected labels before encoding
    unknown_labels = set(df[cell_type].unique()) - set(label_map)
    assert not unknown_labels, f"unexpected labels for {tf} / {cell_type}: {unknown_labels}"
    df = df.assign(**{cell_type: df[cell_type].map(label_map)})

    # save to file (header=False is the documented way to omit the header row)
    df.to_csv(tfdir / f'DREAM.chr8.{tf}.{cell_type}.bed', sep='\t', index=False, header=False)

CEBPB
JUND
NANOG


In [76]:
# File names of the per-TF label files, following the naming pattern used when saving.
[f'DREAM.chr8.{tf}.{cell_types[tf]}.bed' for tf in ['CEBPB', 'JUND', 'NANOG']]

['DREAM.chr8.CEBPB.HeLa-S3.bed',
 'DREAM.chr8.JUND.HepG2.bed',
 'DREAM.chr8.NANOG.H1-hESC.bed']