In [1]:
import argparse
import os
import subprocess
from timeit import default_timer as timer

import numpy as np
import pandas as pd
import scipy.sparse
import yaml

# Project star-imports stay last and in their original relative order so any
# names they export continue to take precedence exactly as before.
from epitome.models import *
from epitome.functions import *
from epitome.viz import *
from epitome.constants import *
from epitome.motif_functions import *

examples.directory is deprecated; in the future, examples will be found relative to the 'datapath' directory.
  "found relative to the 'datapath' directory.".format(key))


### Set Up

In [2]:
# Locations used throughout the notebook, all relative to the working directory.
# Inputs:
epitome_data_path = "data/epitome_data"
motif_dir = "data/motif_data/"
# Outputs:
results_path = "results"

# Feature-name file that sits alongside the train/valid/test matrices.
feature_path = os.path.join(epitome_data_path, "feature_name")

In [3]:
# Output directory layout under `results_path`:
#   <results_path>/epitome_results/OVERLAP_results
#   <results_path>/epitome_models/OVERLAP_models
epitome_results_dir = os.path.join(results_path, "epitome_results")
tf_epitome_results_dir = os.path.join(epitome_results_dir, "OVERLAP_results")
model_dir = os.path.join(results_path, "epitome_models")
tf_model_dir = os.path.join(model_dir, "OVERLAP_models")

# Create the directories if they do not exist. `os.makedirs` also creates any
# missing parents, so only the two leaf directories need to be requested.
# `exist_ok=True` replaces the racy "check, then create" pattern and makes the
# cell safe to re-run.
for output_dir in (tf_epitome_results_dir, tf_model_dir):
    os.makedirs(output_dir, exist_ok=True)

### Load in Data for Epitome

In [4]:
def _read_dense_split(npz_name):
    """Load the sparse matrix saved as `npz_name` under `epitome_data_path` and return it densified."""
    split_file = os.path.join(epitome_data_path, npz_name)
    return scipy.sparse.load_npz(split_file).toarray()


train_data = _read_dense_split('train.npz')
valid_data = _read_dense_split('valid.npz')
test_data = _read_dense_split('test.npz')

# One dense array per split, keyed by the corresponding `Dataset` member.
data = {
    Dataset.TRAIN: train_data,
    Dataset.VALID: valid_data,
    Dataset.TEST: test_data,
}
# all_data = np.concatenate((data[Dataset.TRAIN], data[Dataset.VALID], data[Dataset.TEST]), axis=1)

In [5]:
# `np.load` on an .npz archive returns an object that keeps the file open until
# it is closed; the context manager releases the handle once the "tf" array
# has been read into memory.
with np.load(os.path.join(motif_dir, "OVERLAP_HOCOMOCO_unique_motifmat.npz")) as motif_archive:
    motifmat = motif_archive["tf"]

# The CSV has no header row; its first two columns are named "Index" and "TF".
motifmap = pd.read_csv(os.path.join(motif_dir, "OVERLAP_HOCOMOCO_unique_motifmap.csv"),
                       header=None).rename(columns={0:"Index", 1:"TF"})

In [6]:
# Every distinct TF named in the motif map, plus the "DNase" entry.
epitome_tfs = [*motifmap["TF"].unique(), "DNase"]

# Cell type used to name the saved models and to tag result rows.
query_cell = 'K562' #'T47D'

# Empty table that later cells add one row of evaluation metrics to.
result_columns = ['transcription_factor', 'query_cell', 'auROC', 'auPRC']
eval_results_df = pd.DataFrame(columns=result_columns)

In [7]:
# Candidate anchor TFs; only those that also appear in `epitome_tfs` are kept.
anchor_tfs = [
    "CTCF", "E2F1", "EGR1", "FOXA1", "FOXA2", "GABPA",
    "HNF4A", "JUND", "MAX", "NANOG", "REST", "TAF1",
]
anchor_overlap_tfs = set(epitome_tfs) & set(anchor_tfs)

# Display: number of candidates, number retained, and the retained set itself.
(len(anchor_tfs), len(anchor_overlap_tfs), anchor_overlap_tfs)

(12,
 9,
 {'CTCF', 'E2F1', 'EGR1', 'FOXA1', 'GABPA', 'JUND', 'MAX', 'REST', 'TAF1'})

### Train VLP Model With Motif Data

In [8]:
# Selection criteria handed to `get_assays_from_feature_file`: restrict assays
# to the overlapping anchor TFs, place no restriction on cells, and use a
# threshold of 2 for both minimum-count arguments (the second was previously 10).
motif_assay_filter = dict(
    eligible_assays=list(anchor_overlap_tfs),
    eligible_cells=None,
    min_cells_per_assay=2,
    min_assays_per_cell=2,
)
matrix, cellmap, assaymap = get_assays_from_feature_file(feature_path, **motif_assay_filter)

In [10]:
# Keyword inputs for the motif-aware model: the three data splits, the assay
# selection computed above, and the motif matrix/map.
motif_vlp_inputs = dict(
    data=data,
    matrix=matrix,
    cellmap=cellmap,
    assaymap=assaymap,
    motifmat=motifmat,
    motifmap=motifmap,
)
# NOTE(review): `anchor_overlap_tfs` is a set here, whereas the assay selection
# above received `list(anchor_overlap_tfs)` -- confirm VLP does not depend on
# the ordering of its first argument.
model = VLP(anchor_overlap_tfs, **motif_vlp_inputs)

model.train(5000)  # train for 5000 iterations

# The saved model is named after the query cell and placed directly in `model_dir`.
model_path = os.path.join(model_dir, query_cell)
model.save(model_path)

using ['T47D', 'SK-N-SH', 'MCF-7', 'K562', 'Ishikawa', 'IMR-90', 'HepG2', 'HeLa-S3', 'HEK293T', 'HCT116', 'H1', 'GM12892', 'GM12891', 'GM12878', 'A549'] as labels for mode Dataset.TRAIN
using ['T47D', 'SK-N-SH', 'MCF-7', 'K562', 'Ishikawa', 'IMR-90', 'HepG2', 'HeLa-S3', 'HEK293T', 'HCT116', 'H1', 'GM12892', 'GM12891', 'GM12878', 'A549'] as labels for mode Dataset.VALID
INFO:tensorflow:Starting Training
INFO:tensorflow:0 tf.Tensor(227.00458, shape=(), dtype=float32)tf.Tensor(20.411797, shape=(), dtype=float32)tf.Tensor(206.59277, shape=(), dtype=float32)
INFO:tensorflow:1000 tf.Tensor(137.86789, shape=(), dtype=float32)tf.Tensor(7.412508, shape=(), dtype=float32)tf.Tensor(130.45538, shape=(), dtype=float32)
INFO:tensorflow:2000 tf.Tensor(81.39781, shape=(), dtype=float32)tf.Tensor(13.743153, shape=(), dtype=float32)tf.Tensor(67.65466, shape=(), dtype=float32)
INFO:tensorflow:3000 tf.Tensor(41.888596, shape=(), dtype=float32)tf.Tensor(9.332393, shape=(), dtype=float32)tf.Tensor(32.556206

In [11]:
# Evaluate the trained model with metric calculation enabled; the returned
# mapping is read for its 'auROC' and 'auPRC' entries.
model_results = model.test(10000, calculate_metrics=True)

summary_template = 'Model auROC: %s. Model auPRC: %s.'
print(summary_template % (model_results['auROC'], model_results['auPRC']))

157it [01:14,  2.11it/s]
  precision = tps / (tps + fps)


INFO:tensorflow:macro auROC:     0.8383735993828211
INFO:tensorflow:auPRC:     0.17998487067527605
INFO:tensorflow:GINI:     0.8448085598443864
Model auROC: 0.8383735993828211. Model auPRC: 0.17998487067527605.


  precision = tps / (tps + fps)
  precision = tps / (tps + fps)
  precision = tps / (tps + fps)
  precision = tps / (tps + fps)


In [12]:
# Record the motif-aware model's metrics as a new row of the results table.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0, so the
# row is built as a one-row frame and concatenated instead.
# Re-running this cell adds the row again; the cell is not idempotent.
motif_result_row = pd.DataFrame([{
    'transcription_factor' : 'Epitome_Anchor_OVERLAP',
    'query_cell' : query_cell,
    'auROC' : model_results['auROC'],
    'auPRC' : model_results['auPRC']}])
eval_results_df = pd.concat([eval_results_df, motif_result_row], ignore_index=True)

# NOTE(review): the CSV is written to `tf_model_dir` (the models directory),
# while `tf_epitome_results_dir` is created earlier and never used -- confirm
# which location is intended before changing it.
eval_results_df.to_csv(os.path.join(tf_model_dir,'epitome_Anchor_OVERLAP.csv'), sep="\t")

### Train VLP Model Without Motif Data

In [None]:
# Recompute the assay selection for the no-motif run using the same criteria
# as the motif run: overlapping anchor TFs only, no restriction on cells, and
# a threshold of 2 for both minimum-count arguments (the second was previously 10).
no_motif_assay_filter = dict(
    eligible_assays=list(anchor_overlap_tfs),
    eligible_cells=None,
    min_cells_per_assay=2,
    min_assays_per_cell=2,
)
matrix, cellmap, assaymap = get_assays_from_feature_file(feature_path, **no_motif_assay_filter)

In [None]:
# Same construction as the motif-aware model, except that no motif matrix or
# motif map is supplied.
no_motif_vlp_inputs = dict(
    data=data,
    matrix=matrix,
    cellmap=cellmap,
    assaymap=assaymap,
)
model = VLP(anchor_overlap_tfs, **no_motif_vlp_inputs)

model.train(5000)  # train for 5000 iterations

# Saved next to the motif-aware model, distinguished by the "no_motif_" prefix.
model_path = os.path.join(model_dir, "no_motif_" + query_cell)
model.save(model_path)

In [None]:
# Evaluate the no-motif model with metric calculation enabled; this rebinds
# `model_results`, replacing the value from the motif-aware evaluation.
model_results = model.test(10000, calculate_metrics=True)

summary_template = 'Model auROC: %s. Model auPRC: %s.'
print(summary_template % (model_results['auROC'], model_results['auPRC']))

In [None]:
# Record the no-motif model's metrics as a new row of the results table.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0, so the
# row is built as a one-row frame and concatenated instead.
# Re-running this cell adds the row again; the cell is not idempotent.
# NOTE(review): the 'transcription_factor' label is identical to the one used
# for the motif-aware model. If both rows end up in the same table they cannot
# be told apart -- consider a distinct label once downstream readers are checked.
no_motif_result_row = pd.DataFrame([{
    'transcription_factor' : 'Epitome_Anchor_OVERLAP',
    'query_cell' : query_cell,
    'auROC' : model_results['auROC'],
    'auPRC' : model_results['auPRC']}])
eval_results_df = pd.concat([eval_results_df, no_motif_result_row], ignore_index=True)

# NOTE(review): written to `tf_model_dir` (the models directory) rather than
# the otherwise unused `tf_epitome_results_dir` -- confirm the intended location.
eval_results_df.to_csv(os.path.join(tf_model_dir,'no_motif_epitome_Anchor_OVERLAP.csv'), sep="\t")