# Testing TF-ENFORMER

In [1]:
import json, os, sys, h5py, subprocess
import parsl
from parsl.providers import LocalProvider
from parsl.executors import HighThroughputExecutor
from parsl.channels import LocalChannel
from parsl.config import Config

In [2]:
# Drop any parsl configuration that is already loaded so this cell can be re-run.
parsl.clear()

script_path = '.'
usage_codes = f'{script_path}/utilities/enformer-usage-codes.py'
#parsl_config = f'{script_path}/parsl-configuration.py'
personal_enformer = f'{script_path}/personal-enformer.py'

# Run the helper scripts inside the notebook's global namespace so that the
# names they define are visible to later cells. Each script is read inside a
# `with` block so its file handle is closed even if the script raises.
with open(usage_codes) as usage_codes_file:
    exec(usage_codes_file.read(), globals(), globals())
with open(personal_enformer) as personal_enformer_file:
    exec(personal_enformer_file.read(), globals(), globals())

# Second reset, issued after the helper scripts have run and before loading
# the configuration below (presumably in case a script loaded one — TODO confirm).
parsl.clear()

# NOTE(review): absolute, user-specific paths; adjust before running elsewhere.
rundir = '/projects/covid-ct/imlab/users/temi/projects/running-parsl/runinfo'
workingdir = '/projects/covid-ct/imlab/users/temi/projects/running-parsl'

# Single local high-throughput executor: one block, one core per worker.
local_htex = Config(
    executors=[
        HighThroughputExecutor(
            label="htex_Local",
            worker_debug=True,
            cores_per_worker=1,
            working_dir=workingdir,
            provider=LocalProvider(
                channel=LocalChannel(),
                init_blocks=1,
                max_blocks=1,
            ),
        )
    ],
    strategy=None,
    run_dir=rundir
)

parsl.load(local_htex)

FileNotFoundError: [Errno 2] No such file or directory: './enformer-usage-codes.py'

In [3]:
# Load the run parameters; the file only needs to stay open while it is parsed.
with open(f'{script_path}/../metadata/enformer_parameters.json') as f:
    parameters = json.load(f)

# Unpack the individual settings into notebook-level names.
intervals_dir = parameters['interval_list_dir']
model_path = parameters['model_path']
fasta_file = parameters['hg38_fasta_file']
output_dir = parameters['output_dir']
individuals = parameters['individuals']
vcf_file = parameters['vcf_file']
path_to_bcftools = parameters['path_to_bcftools']
path_to_tabix = parameters['path_to_tabix']
temporary_vcf_dir = parameters['temporary_vcf_dir']
TF = parameters['TF']
logfile_path = parameters['logfile_path']
sequence_folder = parameters['sequence_folder']

In [4]:
# Replaces the `logfile_path` value that was read from the parameters file.
logfile_path = "/projects/covid-ct/imlab/users/temi/projects/TFXcan/enformer-minimal/runlog"
# Sample identifier; used below to build the per-sample output, sequence and log paths.
sam = 'LuCaP_145'
# Query identifiers handed to check_query (they look like chrom_start_end — TODO confirm).
queries = ['chr2_186155022_186155031', 'chr6_137209238_137209247', 'chr3_185586974_185586983']

In [5]:
# Make sure the per-sample output and sequence directories exist.
# The existence test only decides whether the creation is announced;
# exist_ok=True keeps makedirs from raising if the directory is created by
# something else between the test and the call.
if not os.path.exists(f'{output_dir}/{sam}'):
    print(f'\n[CREATING OUTPUT DIRECTORY] at {output_dir}/{sam}')
    os.makedirs(f'{output_dir}/{sam}', exist_ok=True)

if not os.path.exists(f'{sequence_folder}/{sam}'):
    print(f'\n[CREATING SEQUENCE FOLDER] at {sequence_folder}/{sam}')
    os.makedirs(f'{sequence_folder}/{sam}', exist_ok=True)

In [6]:
# Location of this sample's predictions log.
logfile_csv = f'{logfile_path}/{sam}_predictions_log.csv'

# Existing log: read it and set open_mode to 'a'. No log yet: logfile is None
# and open_mode is 'w'.
logfile, open_mode = (
    (pd.read_csv(logfile_csv), 'a') if os.path.isfile(logfile_csv) else (None, 'w')
)

logfile

In [7]:
# Call check_query once per query and keep each returned object; a later cell
# calls .result() on them.
query_status = []
for query in queries:
    print(query)
    query_status.append(check_query(sample=sam, query=query, output_dir=output_dir, logfile=logfile))
# evaluate the results >> this should return a list of those that don't have predictions

chr2_186155022_186155031
chr6_137209238_137209247
chr3_185586974_185586983


In [10]:
# Call .result() on every entry collected from check_query, keeping the order
# (presumably these are parsl futures, so this blocks until each finishes — TODO confirm).
q_status = [q.result() for q in query_status]
print(f'Query results are: {q_status} ======\n')




Length of sequences list is 3


In [11]:
print(sequences[0]) #['sequence'][sam][0:5])

{'sequence': {'LuCaP_145': 'TATATATGGATATATATATATGTATATGTATATATGGACATATATGGATATATATACATAGGGATACCACTCAGCCATATACTATATATATATATGGATATATATAGGGATATATATATATATGGATATATATATGATATACATATCTCTCATATTTTGTTATCCACTCATTGATTAATTGATGGGCATTTGGGCTGGTTCCATAATTTTGCAATTGTGAATTGTGCTGCTATCAACATGTATATGCAAATGTCCTTTTCGTATAATGACTTCTTTTCTTCCAAGTAGATACCTAGTAGTGGGATTGCTGGATCAAATGGTAGATCTACTTTTAATTCTCTAAGGAATCTCCACACTGTTTTCCACAGTGGTTGTACTAGTTTACATTCCTGCCAACAGTATAAAACTGTTCCCTTTTTACCACATTCATGCCAATATCTATTATTTTTTATTTTTTTTGTCATGGCCATTCTTGCAGGAGTAGGGTGGCATCTCACTGTGGTTTTGATTTGCATGTCCCTGATAATTAGTGATGTTGAGCATTTTTCCATATACTTGTTGCCCATTTGTATATCTTCTTTTGAGAATTGTCTATTCATGTCTTTAGTCTGCTTTTTGGTAGGATTGTTTAATTTTTTCCTGATGATTTGTTTGAGTTCTTGGTAGATTCTGGATGTTGTCCTTTGTTGGATGTGCAGATTGTGAAGATTTTCTCCCACTCTGTGTGTTGTCTGTTAACTCTGCTTATTATTTATTTTTCTGTGTAGAATTTTTTTAGTTTAATTAAGTCTCATCTATTTATCTTTGTTTTTGTTACATTTGCTTTTCGGTTCTTGGCCATGAAGTCTCTCCTTAAGCTAATATCTAGAAGGGCTTTTCTGATGTTATCTTCTAAAATTTTCATGGTTTCAGCTCTTAGATTTAAGTATTTGATCCATCTTGAGTTGATTTTTGTATAAGGTGGGAGATGAA

In [11]:
enf_model = tf.saved_model.load(model_path).model

2022-11-10 16:05:21.306909: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2022-11-10 16:05:21.306980: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (thetalogin5): /proc/driver/nvidia/version does not exist
2022-11-10 16:05:21.309736: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [16]:
@bash_app
def call_single_enformer_run(call_script, sequence_region, sam, stderr='./bashapp_error.err'):
    """Build the shell command `bash <call_script> <sequence_region> <sam>`.

    The function is a parsl ``bash_app``: the string it returns is the command
    line to be executed.

    Parameters
    ----------
    call_script : str
        Path of the shell script to run.
    sequence_region : str
        First argument given to the script.
    sam : str
        Second argument given to the script.
    stderr : str
        Not used in the body; parsl bash apps accept a ``stderr`` keyword
        naming the file that receives the command's standard error.

    Returns
    -------
    str
        The command line, with every argument shell-quoted.
    """
    # Imported locally so the app body carries its own dependency.
    import shlex

    # Quote each argument so spaces or shell metacharacters in a path or
    # identifier cannot split or alter the command; plain values are unchanged.
    arguments = [shlex.quote(call_script), shlex.quote(sequence_region), shlex.quote(sam)]
    return(' '.join(['bash'] + arguments))

In [17]:
call_script = f"{script_path}/single-enformer-bashapp.sh"

In [18]:
out = [call_single_enformer_run(call_script=call_script, sequence_region=sreg, sam=sam) for sreg in q_status]

In [19]:
out

[<AppFuture at 0x7f649ec63fa0 state=pending>,
 <AppFuture at 0x7f649ec62f50 state=pending>,
 <AppFuture at 0x7f649ec63bb0 state=pending>]

In [20]:
out_finished = [o.result() for o in out]

In [21]:
out_finished

[0, 0, 0]

In [14]:
import tensorflow as tf
import tensorflow_hub as hub # for interacting with saved models and tensorflow hub
import joblib
import gzip # for manipulating compressed files
from kipoiseq import Interval # same as above, really
import pyfaidx # to index our reference genome file
import pandas as pd # for manipulating dataframes
import numpy as np # for numerical computations
import os, sys, re

2022-11-11 14:47:54.274396: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /soft/perftools/darshan/darshan-3.3.0/lib:/opt/cray/pe/papi/6.0.0.1/lib64:/opt/cray/job/2.2.4-7.0.2.1_2.91__g36b56f4.ari/lib64:/opt/intel/compilers_and_libraries_2020.0.166/linux/compiler/lib/intel64:/opt/intel/compilers_and_libraries_2020.0.166/linux/compiler/lib/intel64_lin:/opt/intel/compilers_and_libraries_2020.0.166/linux/mpi/intel64/lib:/opt/intel/compilers_and_libraries_2020.0.166/linux/mpi/mic/lib:/opt/intel/compilers_and_libraries_2020.0.166/linux/ipp/lib/intel64:/opt/intel/compilers_and_libraries_2020.0.166/linux/compiler/lib/intel64:/opt/intel/compilers_and_libraries_2020.0.166/linux/mkl/lib/intel64:/opt/intel/compilers_and_libraries_2020.0.166/linux/tbb/lib/intel64/gcc4.4:/opt/intel/debugger_2020/libipt/intel64/lib:/opt/intel/comp

In [17]:
# Hard-coded locations for this test; `sequence_folder` and `model_path` are
# also assigned from the parameters file in an earlier cell, and these
# assignments replace those values.
sequence_folder = "/projects/covid-ct/imlab/users/temi/projects/TFXcan/enformer-minimal-2/sequence_folder"
each_individual = 'LuCaP_145'

model_path = "/projects/covid-ct/imlab/data/enformer/raw"
# Load the SavedModel at `model_path` and keep its `.model` attribute.
enformer_model = tf.saved_model.load(model_path).model

2022-11-11 14:48:43.237382: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2022-11-11 14:48:43.237462: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (thetalogin5): /proc/driver/nvidia/version does not exist
2022-11-11 14:48:43.240278: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [37]:
# Read the individual's sequences file line by line, stripping trailing
# whitespace (including the newline) from every line.
with open(f'{sequence_folder}/{each_individual}_regions_sequences.txt', 'r') as f:
    sequence_list = list(map(str.rstrip, f))

In [40]:
len(sequence_list[1])

393216

In [19]:
class Enformer:
    """Thin wrapper around the `.model` object of a TensorFlow SavedModel."""

    def __init__(self, tfhub_url):
        """Load the SavedModel at `tfhub_url` and keep its `.model` attribute.

        Despite the parameter name, the value is passed to
        `tf.saved_model.load`; the `hub.load` variant is left commented out.
        """
        #self._model = hub.load(tfhub_url).model
        loaded = tf.saved_model.load(tfhub_url)
        self._model = loaded.model

    def predict_on_batch(self, inputs):
        """Run the wrapped model on `inputs`.

        Returns a dict with the same keys as the model's output, where each
        value is the result of calling `.numpy()` on the model's value.
        """
        raw_outputs = self._model.predict_on_batch(inputs)
        converted = {}
        for head, tensor in raw_outputs.items():
            converted[head] = tensor.numpy()
        return converted

In [20]:
enformer_model = Enformer(model_path)

In [30]:
enformer_model.predict_on_batch(one_hot_encode(sequence_list[0])[np.newaxis])

{'mouse': array([[[0.0491372 , 0.08320338, 0.10873456, ..., 0.3303821 ,
          0.5847313 , 0.48271734],
         [0.05075146, 0.09147816, 0.09991135, ..., 0.37044352,
          0.61239594, 0.52627516],
         [0.05172487, 0.10032397, 0.09180237, ..., 0.29891792,
          0.45436087, 0.44946814],
         ...,
         [0.08714775, 0.13549958, 0.09068292, ..., 0.37251794,
          0.58543646, 0.79924405],
         [0.07982449, 0.1285619 , 0.08829714, ..., 0.3081446 ,
          0.4614172 , 0.63513064],
         [0.08735249, 0.12883613, 0.08333824, ..., 0.2663846 ,
          0.3839848 , 0.5310253 ]]], dtype=float32),
 'human': array([[[0.09844707, 0.1113882 , 0.07482027, ..., 0.00768186,
          0.0251844 , 0.0195839 ],
         [0.10723159, 0.10894116, 0.06095279, ..., 0.00522081,
          0.03070465, 0.02535884],
         [0.08797783, 0.09353481, 0.04838892, ..., 0.00406063,
          0.02706015, 0.02034754],
         ...,
         [0.10566902, 0.0864643 , 0.04872039, ..., 0.0

In [22]:
model_predict(one_hot_encode(sequence_list[0])[np.newaxis], enformer_model)

ValueError: Python inputs incompatible with input_signature:
  inputs: (
    [[[0. 1. 0. 0.]
  [1. 0. 0. 0.]
  [0. 1. 0. 0.]
  ...
  [0. 1. 0. 0.]
  [0. 1. 0. 0.]
  [0. 1. 0. 0.]]])
  input_signature: (
    TensorSpec(shape=(None, 393216, 4), dtype=tf.float32, name=None)).

In [28]:
a = one_hot_encode(sequence_list[0])[np.newaxis]
a, type(a), a.shape

(array([[[0., 1., 0., 0.],
         [1., 0., 0., 0.],
         [0., 1., 0., 0.],
         ...,
         [0., 1., 0., 0.],
         [0., 1., 0., 0.],
         [0., 1., 0., 0.]]], dtype=float32),
 numpy.ndarray,
 (1, 313216, 4))

In [26]:
enformer_model.predict_on_batch(one_hot_encode(sequence_list[0])[np.newaxis])

ValueError: Python inputs incompatible with input_signature:
  inputs: (
    [[[0. 1. 0. 0.]
  [1. 0. 0. 0.]
  [0. 1. 0. 0.]
  ...
  [0. 1. 0. 0.]
  [0. 1. 0. 0.]
  [0. 1. 0. 0.]]])
  input_signature: (
    TensorSpec(shape=(None, 393216, 4), dtype=tf.float32, name=None)).

In [3]:
def save_h5_prediction(prediction, sample, region, seq_type, output_dir):
    """Write `prediction` to `<output_dir>/<sample>/<region>_predictions.h5`.

    The file is opened in 'w' mode and receives a single dataset named
    `region`. `seq_type` is not written; it is only echoed in the return value.

    Returns
    -------
    list
        `[region, sample, 'completed', seq_type]`
    """
    import h5py

    destination = f'{output_dir}/{sample}/{region}_predictions.h5'
    with h5py.File(destination, 'w') as h5_file:
        h5_file.create_dataset(region, data=prediction)

    status_record = [region, sample, 'completed', seq_type]
    return status_record

def one_hot_encode(sequence):
    """One-hot encode `sequence` with kipoiseq's `one_hot_dna`, cast to float32."""
    import kipoiseq

    encoded = kipoiseq.transforms.functional.one_hot_dna(sequence)
    return encoded.astype(np.float32)

def model_predict(input, model):
    """Run `model` on one batch and return the first item of its 'human' output.

    Parameters
    ----------
    input
        Batch passed unchanged to `model.predict_on_batch`.
    model
        Object with a `predict_on_batch(input)` method that returns a mapping
        from output name to prediction.

    Returns
    -------
    The element at index 0 of the 'human' prediction.

    Raises
    ------
    KeyError
        If the model output has no 'human' entry.
    """
    predictions = model.predict_on_batch(input)

    # Values straight from the TensorFlow model expose .numpy(); the notebook's
    # Enformer wrapper has already converted them to NumPy arrays, which have
    # no such method, so those are passed through unchanged.
    prediction_dict = {
        head: value.numpy() if callable(getattr(value, 'numpy', None)) else value
        for head, value in predictions.items()
    }

    return(prediction_dict['human'][0])

def run_predictions(sequence, region, sample, seq_type, model_path, output_dir):
    """Predict on one sequence with the SavedModel at `model_path` and save a window of bins.

    Steps:
      1. load the SavedModel at `model_path` and take its `.model`;
      2. one-hot encode `sequence` and add a leading batch axis;
      3. get the 'human' prediction through `model_predict`;
      4. keep rows 440..456 (448 +/- 8) of that prediction;
      5. write them with `save_h5_prediction`.

    Parameters
    ----------
    sequence : str
        Sequence to encode and predict on.
    region, sample, seq_type, output_dir
        Forwarded to `save_h5_prediction`.
    model_path : str
        Location handed to `tf.saved_model.load`.

    Returns
    -------
    Whatever `save_h5_prediction` returns.
    """
    # Kept as in-function imports, like the original; only the two modules the
    # body actually uses remain.
    import tensorflow as tf
    import numpy as np # for numerical computations

    # NOTE(review): 448 is presumably the central row of the prediction and 8
    # the number of rows kept on each side — confirm against the model output.
    center_bin = 448
    flank_bins = 8

    enformer_model = tf.saved_model.load(model_path).model

    sequence_encoded = one_hot_encode(sequence)[np.newaxis]
    target_prediction = model_predict(sequence_encoded, enformer_model)
    # Indexing with an explicit range (not a slice) still raises if the
    # prediction has fewer rows than the window needs.
    window_rows = range(center_bin - flank_bins, center_bin + flank_bins + 1)
    obj_to_save = target_prediction[window_rows, : ].squeeze()
    h5result = save_h5_prediction(obj_to_save, sample, region, seq_type, output_dir)

    return(h5result)

In [33]:
import parsl
import os
from parsl.app.app import python_app, bash_app
from parsl.configs.local_threads import config
from parsl.data_provider.files import File

import subprocess

parsl.clear()
parsl.load(config)

<parsl.dataflow.dflow.DataFlowKernel at 0x7fd2082732e0>

In [34]:
# Bash app that waits five seconds, then echoes a message into an output file
@bash_app
def slowecho(message, outputs=[]):
    """Return the shell command `sleep 5; echo <message> &> <outputs[0]>`.

    Only the first entry of `outputs` is used; `&>` sends both stdout and
    stderr of `echo` to it.
    """
    target = outputs[0]
    return f'sleep 5; echo {message!s} &> {target!s}'

@python_app
def py_app_slowecho(message, outputs=''):
    """Run `sleep 5; echo <message> &> <outputs>` through the shell.

    Returns the exit status reported by `subprocess.call`.
    NOTE(review): `&>` is bash syntax; confirm the shell used by `shell=True`
    on the target system treats it as a redirection.
    """
    command = f'sleep 5; echo {message} &> {outputs}'
    exit_status = subprocess.call(command, shell=True)
    return exit_status

# Call slowecho specifying the output file; py_app_slowecho gets its output
# path as the second positional argument
hello = slowecho('Hello World!', outputs=[File(os.path.join(os.getcwd(), 'hello-world.txt'))])
hello2 = py_app_slowecho('Hello World!', './hello-world-2.txt')

# The AppFuture's outputs attribute is a list of DataFutures
print(hello.outputs)


# Also check whether each AppFuture has finished yet
print('Done: {}'.format(hello.done()))
print('Done: {}'.format(hello2.done()))

# Wait for the output DataFuture, then print the contents of the file it resolves to
with open(hello.outputs[0].result(), 'r') as f:
     print(f.read())
        
# Now that this is complete, check the DataFutures again, and the AppFuture
print(hello.outputs)
print('Done: {}'.format(hello.done()))

# Print the value returned by py_app_slowecho (the subprocess.call exit status) once it completes

print(hello2.result())

[<parsl.app.futures.DataFuture object at 0x7fd205bc6a40 representing <File at 0x7fd205bc6ef0 url=/lus/theta-fs0/projects/covid-ct/imlab/users/temi/projects/TFXcan/enformer-minimal/scripts/hello-world.txt scheme=file netloc= path=/lus/theta-fs0/projects/covid-ct/imlab/users/temi/projects/TFXcan/enformer-minimal/scripts/hello-world.txt filename=hello-world.txt> not done>]
Done: False
Done: False
Hello World!

[<parsl.app.futures.DataFuture object at 0x7fd205bc6a40 representing <File at 0x7fd205bc6ef0 url=/lus/theta-fs0/projects/covid-ct/imlab/users/temi/projects/TFXcan/enformer-minimal/scripts/hello-world.txt scheme=file netloc= path=/lus/theta-fs0/projects/covid-ct/imlab/users/temi/projects/TFXcan/enformer-minimal/scripts/hello-world.txt filename=hello-world.txt> done>]
Done: True
0
