# Modified Transformer model for de-novo generation of drug-like molecules
# 用于从头合成类药分子的Modified Transformer模型

In this notebook, we will implement a modified version of the Transformer model for de-novo generation of drug-like molecules. The modified model will be trained on a dataset of drug-like molecules and will be able to generate novel drug-like molecules.


Contributors:
- <NAME> (itwangyang)
- <EMAIL> (itwangyang@gmail.com)

## Introduction

De-novo generation of drug-like molecules is a challenging task, as it requires developing novel drug-design strategies that have not been routinely tested in clinical trials. In this notebook, we implement a modified version of the Transformer model for de-novo generation of drug-like molecules. The modified model is trained on a dataset of drug-like molecules and can then generate novel drug-like molecules.

The modified model will be based on the original Transformer model, but with some modifications to improve the performance of the model in de-novo generation of drug-like molecules. The modifications include:    
- Using a different dataset of drug-like molecules for training and validation.
- Using a different loss function to evaluate the quality of generated drug-like molecules.
- Using a different evaluation metric to measure the performance of the model.
- Using a different optimizer to train the model.

## Installing *RDKIT*



# Check whether a GPU is available (检测是否GPU可用)

In [None]:
# Check whether TensorFlow can see a GPU and report the outcome.
import tensorflow as tf
gpu_name = tf.test.gpu_device_name()  # falsy when no GPU device is reported
if not gpu_name:
    print("Please install GPU version of TF")
else:
    print('Default GPU Device:{}'.format(gpu_name))


In [None]:
import sys
import os
import requests
import subprocess
import shutil
from logging import getLogger, StreamHandler, INFO


# Module-level logger that echoes progress messages to the console.
logger = getLogger(__name__)
logger.addHandler(StreamHandler())
logger.setLevel(INFO)


def install(
        chunk_size=4096,
        file_name="Miniconda3-latest-Linux-x86_64.sh",
        url_base="https://repo.continuum.io/miniconda/",
        conda_path=os.path.expanduser(os.path.join("~", "miniconda")),
        rdkit_version=None,
        add_python_path=True,
        force=False,
        timeout=60):
    """Install RDKit through a freshly downloaded Miniconda.

    ```
    import rdkit_installer
    rdkit_installer.install()
    ```

    Args:
        chunk_size: Number of bytes written per chunk while downloading.
        file_name: Name of the Miniconda installer script; also the local
            file the download is saved to (in the current directory).
        url_base: URL prefix the installer name is appended to.
        conda_path: Directory Miniconda is installed into. Anything that
            already exists at this path is removed before installing.
        rdkit_version: Exact RDKit version to request, or None for the
            channel default.
        add_python_path: When true, append the conda ``site-packages``
            directory to ``sys.path`` so ``import rdkit`` can find it.
        force: Re-install even if an ``rdkit`` package directory is
            already present.
        timeout: Seconds to wait for the download server before giving up.
            Without a timeout a stalled connection would block forever.

    Raises:
        requests.HTTPError: If the installer download fails.
        subprocess.CalledProcessError: If the Miniconda installer or
            ``conda install`` exits with a non-zero status.
    """
    import importlib

    python_path = os.path.join(
        conda_path,
        "lib",
        "python{0}.{1}".format(*sys.version_info[:2]),
        "site-packages",
    )

    if add_python_path and python_path not in sys.path:
        logger.info("add {} to PYTHONPATH".format(python_path))
        sys.path.append(python_path)

    if os.path.isdir(os.path.join(python_path, "rdkit")):
        logger.info("rdkit is already installed")
        if not force:
            return

        logger.info("force re-install")

    url = url_base + file_name
    python_version = "{0}.{1}.{2}".format(*sys.version_info[:3])

    logger.info("python version: {}".format(python_version))

    # Start from a clean slate: drop whatever currently sits at conda_path.
    if os.path.isdir(conda_path):
        logger.warning("remove current miniconda")
        shutil.rmtree(conda_path)
    elif os.path.isfile(conda_path):
        logger.warning("remove {}".format(conda_path))
        os.remove(conda_path)

    logger.info('fetching installer from {}'.format(url))
    # The context manager releases the streamed connection even when the
    # status check or a write fails part-way through.
    with requests.get(url, stream=True, timeout=timeout) as res:
        res.raise_for_status()
        with open(file_name, 'wb') as f:
            for chunk in res.iter_content(chunk_size):
                f.write(chunk)
    logger.info('done')

    logger.info('installing miniconda to {}'.format(conda_path))
    subprocess.check_call(["bash", file_name, "-b", "-p", conda_path])
    logger.info('done')

    logger.info("installing rdkit")
    rdkit_spec = "rdkit" if rdkit_version is None else "rdkit=={}".format(rdkit_version)
    subprocess.check_call([
        os.path.join(conda_path, "bin", "conda"),
        "install",
        "--yes",
        "-c", "rdkit",
        # Pin conda's interpreter to the running one so the compiled
        # extension matches this process.
        "python=={}".format(python_version),
        rdkit_spec])
    logger.info("done")

    # The package directory was created after the interpreter started, so
    # stale finder caches must be dropped before importing from it.
    importlib.invalidate_caches()
    import rdkit
    logger.info("rdkit-{} installation finished!".format(rdkit.__version__))


if __name__ == "__main__":
    install()

In [None]:
# Mount Google Drive at /content/drive (uses the google.colab helper module).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# IPython magics (notebook-only syntax): switch to the project output folder on Drive and list it.
%cd '/content/drive/MyDrive/MyPC/Sem 7/Algorithmic Approaches to Computational Biology/AACB Project/tf2_output'
!ls
# Capture the output of `pwd`; later cells index it as current_loc[0] to build file paths.
current_loc = !pwd
print(current_loc)



## Importing Packages


In [None]:
## Importing Packages
from rdkit.Chem import AllChem
from rdkit.Chem import MACCSkeys
from rdkit import Chem,DataStructs
from rdkit.Chem.Fraggle import FraggleSim
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from multiprocessing import Pool
# import rdkit.Contrib.SA_Score.sascorer as sascorer
from rdkit.Chem import RDConfig
import os
import sys
sys.path.append(os.path.join(RDConfig.RDContribDir, 'SA_Score'))
# now you can import sascore!
import sascorer

In [None]:
#imports
import time
import math
import pandas as pd
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
# Replace whatever numba is installed with version 0.52 (IPython shell escapes, notebook-only).
!pip uninstall numba --yes
!pip install numba==0.52
import numba
from numba import cuda
# Report library versions and the number of GPUs TensorFlow can see.
print('numba_v: ',numba.__version__)
print('tf_v: ',tf.__version__)
tf.executing_eagerly()  # return value is discarded here
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
# NOTE(review): this only binds a Python variable; it does not set an
# environment variable of that name — confirm what was intended.
TF_FORCE_GPU_ALLOW_GROWTH=True

In [None]:
#imports
import time
import math
import pandas as pd
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
# NOTE(review): this cell repeats the previous cell verbatim.
# Replace whatever numba is installed with version 0.52 (IPython shell escapes, notebook-only).
!pip uninstall numba --yes
!pip install numba==0.52
import numba
from numba import cuda
# Report library versions and the number of GPUs TensorFlow can see.
print('numba_v: ',numba.__version__)
print('tf_v: ',tf.__version__)
tf.executing_eagerly()  # return value is discarded here
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
# NOTE(review): this only binds a Python variable; it does not set an
# environment variable of that name — confirm what was intended.
TF_FORCE_GPU_ALLOW_GROWTH=True

In [None]:
#imports
#import tensorflow_datasets as tfds
import time
import pandas as pd
import tensorflow as tf
import time
import numpy as np
import matplotlib.pyplot as plt

## Attention Mechanism

In [None]:
#attention mechanism(s)
def scaled_dot_product_attention(q, k, v, mask, enc_dec):
  """Compute attention weights and the attended output.

  Two modes are selected by ``enc_dec`` (only the literal ``True`` enables
  the encoder-decoder mode):

  * ``enc_dec is True``: ``q`` is summed over its last axis and kept as a
    trailing axis of size 1, multiplied with ``k`` untransposed, and the
    softmax-ed weights are multiplied with ``v`` transposed.
    # assumes k and v have a length-1 sequence axis in this mode — TODO confirm
  * otherwise: standard ``softmax(q @ k^T / sqrt(dk)) @ v``.

  Args:
    q, k, v: query, key and value tensors.
    mask: tensor added to the logits as ``mask * -1e9``, or None.
    enc_dec: mode switch described above.

  Returns:
    ``(output, attention_weights)``.
  """
  cross_mode = enc_dec is True
  if cross_mode:
    # Collapse the feature axis of the query to a single value per position.
    q = tf.expand_dims(tf.reduce_sum(q, axis=-1), axis=-1)
  # Keys are transposed only in the standard mode.
  logits = tf.matmul(q, k, transpose_b=not cross_mode)
  key_depth = tf.cast(tf.shape(k)[-1], tf.float32)
  logits = logits / tf.math.sqrt(key_depth)
  if mask is not None:
    # Masked positions receive a large negative logit so softmax drives them to ~0.
    logits += (mask * -1e9)
  attention_weights = tf.nn.softmax(logits, axis=-1)
  # Values are transposed only in the encoder-decoder mode.
  output = tf.matmul(attention_weights, v, transpose_b=cross_mode)
  return output, attention_weights

## Multi-head Attention

In [None]:
#mha 
class MultiHeadAttention(tf.keras.layers.Layer):
  """Multi-head attention with an extra encoder-decoder mode.

  Queries, keys and values are projected to ``d_model`` features, split
  into ``num_heads`` heads, passed through
  ``scaled_dot_product_attention`` and merged by a final dense layer.
  """

  def __init__(self, d_model, num_heads):  # d_model = embedding dimension
    super().__init__()
    # Every head must receive an equal share of the features.
    assert d_model % num_heads == 0
    self.num_heads = num_heads
    self.d_model = d_model
    self.depth = d_model // num_heads
    # Projection layers are created in the order wq, wk, wv, dense.
    self.wq = tf.keras.layers.Dense(d_model)
    self.wk = tf.keras.layers.Dense(d_model)
    self.wv = tf.keras.layers.Dense(d_model)
    self.dense = tf.keras.layers.Dense(d_model)

  def split_heads(self, x, batch_size):
    """Reshape the last axis into (num_heads, depth) and move heads to axis 1."""
    per_head = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
    return tf.transpose(per_head, perm=[0, 2, 1, 3])

  def call(self, v, k, q, mask, enc_dec):
    """Run attention; note the argument order is (v, k, q).

    Returns ``(output, attention_weights)``.
    """
    n_batch = tf.shape(q)[0]
    # Project, then split into heads: (batch, num_heads, seq_len, depth).
    q_heads = self.split_heads(self.wq(q), n_batch)
    k_heads = self.split_heads(self.wk(k), n_batch)
    v_heads = self.split_heads(self.wv(v), n_batch)
    attended, attention_weights = scaled_dot_product_attention(
        q_heads, k_heads, v_heads, mask, enc_dec)
    # Move the head axis next to the feature axis before merging.
    attended = tf.transpose(attended, perm=[0, 2, 1, 3])
    # In encoder-decoder mode the merged feature axis is num_heads wide,
    # otherwise d_model wide.
    merged_width = self.num_heads if enc_dec is True else self.d_model
    merged = tf.reshape(attended, (n_batch, -1, merged_width))
    return self.dense(merged), attention_weights

## Encoder

In [None]:
#encoder
class FFNEncoderLayer(tf.keras.layers.Layer):
  """Feed-forward encoder with two input branches and two output heads.

  The signal input ``x0`` and the noise input ``x1`` each pass through
  their own stack of Dense(LeakyReLU) -> Dropout -> BatchNormalization
  blocks. The branch outputs are concatenated and fed to two independent
  stacks ("m0" and "m1"), each ending in a tanh Dense layer of width
  ``d_model``.

  Args:
    d_model: output width; must equal ``dim_m0[-1]`` and ``dim_m1[-1]``.
    dim_s0: hidden widths of the signal branch.
    dim_n0: hidden widths of the noise branch.
    dim_m0, dim_m1: widths of the two merged stacks; the last entry is the
      tanh output layer, the preceding entries are hidden blocks.
    rate: dropout rate used in every block.
  """

  def __init__(self, d_model, dim_s0, dim_n0, dim_m0, dim_m1, rate):
    super(FFNEncoderLayer, self).__init__()
    self.d_model=d_model
    assert d_model==dim_m0[-1] and d_model==dim_m1[-1]
    self.dim_s0,self.dim_n0,self.dim_m1, self.dim_m0=dim_s0,dim_n0,dim_m1,dim_m0
    # Layers are created in the order s0, n0, m1, m0. Attribute names are
    # kept as-is; note the "layernorm_*" lists hold BatchNormalization layers.
    self.ffn_s0, self.layernorm_s0, self.dropout_s0 = self._hidden_stack(dim_s0, rate)
    self.ffn_n0, self.layernorm_n0, self.dropout_n0 = self._hidden_stack(dim_n0, rate)
    self.ffn_m1, self.layernorm_m1, self.dropout_m1 = self._hidden_stack(dim_m1[:-1], rate)
    self.ffn_m1.append(tf.keras.layers.Dense(dim_m1[-1], activation='tanh'))
    self.ffn_m0, self.layernorm_m0, self.dropout_m0 = self._hidden_stack(dim_m0[:-1], rate)
    self.ffn_m0.append(tf.keras.layers.Dense(dim_m0[-1], activation='tanh'))

  def _hidden_stack(self, dims, rate):
    """Create one (Dense, BatchNormalization, Dropout) triple per width in *dims*."""
    dense, norm, drop = [], [], []
    for width in dims:
      dense.append(tf.keras.layers.Dense(width, activation=tf.keras.layers.LeakyReLU(alpha=0.01)))
      norm.append(tf.keras.layers.BatchNormalization(epsilon=1e-6, momentum=0.8))
      drop.append(tf.keras.layers.Dropout(rate))
    return dense, norm, drop

  def call(self, x0, x1):  #(x0,x1)=(signal, noise)  #!try residual/skipped connections
    """Encode (signal, noise) into two tensors, each with an added axis 1."""
    for dense, norm, drop in zip(self.ffn_s0, self.layernorm_s0, self.dropout_s0):
      x0 = norm(drop(dense(x0)))
    for dense, norm, drop in zip(self.ffn_n0, self.layernorm_n0, self.dropout_n0):
      x1 = norm(drop(dense(x1)))
    x_merged=tf.keras.layers.concatenate([x0,x1])
    # Chain the hidden blocks: each block consumes the previous block's
    # output. Feeding x_merged into every block would discard all but the
    # last one, and leave y1/y0 undefined when there are no hidden blocks.
    # zip() stops at the shorter norm/dropout lists, so the final tanh layer
    # is excluded from the loop and applied afterwards.
    y1 = x_merged
    for dense, norm, drop in zip(self.ffn_m1, self.layernorm_m1, self.dropout_m1):
      y1 = norm(drop(dense(y1)))
    y1=self.ffn_m1[-1](y1)
    y0 = x_merged
    for dense, norm, drop in zip(self.ffn_m0, self.layernorm_m0, self.dropout_m0):
      y0 = norm(drop(dense(y0)))
    y0=self.ffn_m0[-1](y0)
    return tf.expand_dims(y0,1),tf.expand_dims(y1,1)  # each: (batch_size, 1, d_model)

## Decoder

In [None]:
#decoder
class MHADecoderLayer(tf.keras.layers.Layer):
  """One decoder block: masked self-attention, encoder attention, feed-forward.

  Each of the three sub-layers is followed by dropout, a residual
  connection and layer normalisation.
  """

  def __init__(self, d_model, num_heads, dff, rate):
    super().__init__()
    self.d_model = d_model
    self.mha1 = MultiHeadAttention(d_model, num_heads)
    self.mha2 = MultiHeadAttention(d_model, num_heads)
    self.ffn = self.point_wise_feed_forward_network(dff)
    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)
    self.dropout3 = tf.keras.layers.Dropout(rate)

  def point_wise_feed_forward_network(self,dff):
    """Return a two-layer network: Dense(dff, relu) followed by Dense(d_model)."""
    hidden = tf.keras.layers.Dense(dff, activation='relu')
    projection = tf.keras.layers.Dense(self.d_model)
    return tf.keras.Sequential([hidden, projection])

  def call(self, x, enc_output0, enc_output1, training, look_ahead_mask, padding_mask):
    """Apply the block to ``x``.

    Returns ``(output, self_attention_weights, encoder_attention_weights)``.
    """
    # Sub-layer 1: self-attention over the target sequence.
    self_attn, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask, enc_dec=False)
    self_attn = self.dropout1(self_attn, training=training)
    out1 = self.layernorm1(self_attn + x)

    # Sub-layer 2: attention against the two encoder outputs. With the
    # (v, k, q) argument order, enc_output0 is the value, enc_output1 the
    # key and out1 the query.
    cross_attn, attn_weights_block2 = self.mha2(
        enc_output0, enc_output1, out1, padding_mask, enc_dec=True)
    cross_attn = self.dropout2(cross_attn, training=training)
    out2 = self.layernorm2(cross_attn + out1)

    # Sub-layer 3: position-wise feed-forward network.
    ffn_out = self.dropout3(self.ffn(out2), training=training)
    out3 = self.layernorm3(ffn_out + out2)
    return out3, attn_weights_block1, attn_weights_block2

class Decoder(tf.keras.layers.Layer):
  """Token embedding + sinusoidal positions + a stack of MHADecoderLayer blocks."""

  def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size, maximum_position_encoding, rate):  #d_model can be made different!
    super().__init__()
    self.d_model = d_model
    self.num_layers = num_layers
    self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
    self.pos_encoding = self.positional_encoding(maximum_position_encoding)
    self.dec_layers = [MHADecoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(rate)

  def get_angles(self,pos, i):
    """Return ``pos`` scaled by the per-dimension rate 1 / 10000^(2*(i//2)/d_model)."""
    exponent = (2 * (i//2)) / np.float32(self.d_model)
    angle_rates = 1 / np.power(10000, exponent)
    return pos * angle_rates

  def positional_encoding(self,position):
    """Build a float32 table of shape (1, position, d_model)."""
    positions = np.arange(position)[:, np.newaxis]
    dims = np.arange(self.d_model)[np.newaxis, :]
    table = self.get_angles(positions, dims)
    table[:, 0::2] = np.sin(table[:, 0::2])  # sine on even feature indices
    table[:, 1::2] = np.cos(table[:, 1::2])  # cosine on odd feature indices
    return tf.cast(table[np.newaxis, ...], dtype=tf.float32)

  def call(self, x, enc_output0, enc_output1, training, look_ahead_mask, padding_mask):
    """Decode token ids ``x``.

    Returns ``(output, attention_weights)`` where ``attention_weights`` maps
    ``'decoder_layer{n}_block1'`` / ``'decoder_layer{n}_block2'`` (n from 1)
    to the weights returned by each block.
    """
    seq_len = tf.shape(x)[1]
    attention_weights = {}
    # Scale embeddings up so they are larger than the positional values.
    embedded = self.embedding(x) * tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x = self.dropout(embedded + self.pos_encoding[:, :seq_len, :], training=training)
    for layer_no, layer in enumerate(self.dec_layers, start=1):
      x, block1, block2 = layer(x, enc_output0, enc_output1, training, look_ahead_mask, padding_mask)
      attention_weights['decoder_layer{}_block1'.format(layer_no)] = block1
      attention_weights['decoder_layer{}_block2'.format(layer_no)] = block2
    return x, attention_weights

## Transformer

In [None]:
#transformer
class Transformer(tf.keras.Model):
  """FFNEncoderLayer encoder + Decoder + softmax over the target vocabulary.

  ``inp_vocab_size`` and ``pe_inp`` are accepted but not used by this class.
  """

  def __init__(self, num_layers, d_model, num_heads, dff, inp_vocab_size, tgt_vocab_size, pe_inp, pe_tgt, dim_s0=[512,256], dim_n0=[512,256], dim_m0=[128,64], dim_m1=[128,64], rate=0.1):
    super().__init__()
    self.encoder = FFNEncoderLayer(d_model, dim_s0, dim_n0, dim_m0, dim_m1, rate)
    self.decoder = Decoder(num_layers, d_model, num_heads, dff, tgt_vocab_size, pe_tgt, rate)
    self.final_layer = tf.keras.layers.Dense(tgt_vocab_size)
    self.soft_layer = tf.keras.layers.Softmax()

  def call(self, inp_s, inp_n, tar, training, look_ahead_mask, dec_padding_mask):
    """Return ``(probabilities, attention_weights)`` for target tokens ``tar``.

    ``inp_s`` and ``inp_n`` are the two encoder inputs (signal and noise).
    The first return value has already been passed through softmax.
    """
    enc_output0, enc_output1 = self.encoder(inp_s, inp_n)
    decoded, attention_weights = self.decoder(
        tar, enc_output0, enc_output1, training, look_ahead_mask, dec_padding_mask)
    logits = self.final_layer(decoded)
    return self.soft_layer(logits), attention_weights

## Load Data

In [None]:
# Load the gene-expression table and the compound table from Drive.
# current_loc=!pwd
print(current_loc)
# These two paths are built but not used by the reads below.
path_to_cmpds = current_loc[0] + '/compounds.csv'
path_to_ges = current_loc[0] + '/L1000data.csv'
ges_frame = pd.read_csv("/content/drive/MyDrive/MyPC/Sem 7/Algorithmic Approaches to Computational Biology/AACB Project/Datasets/L1000data.csv")
cmpds_frame = pd.read_csv("/content/drive/MyDrive/MyPC/Sem 7/Algorithmic Approaches to Computational Biology/AACB Project/Datasets/compounds.csv")
# Drop the 'Unnamed: 0' column of each table, then convert to arrays.
data_ges = np.array(ges_frame.drop(columns=['Unnamed: 0']))
data_cmpds = np.array(cmpds_frame.drop(columns=['Unnamed: 0']))

## Preprocess

In [None]:
# Vocabulary tables for the single-character SMILES encoding.

# Ordered alphabet: a character's position (starting at 1) is its token id.
chembl_charset = (
    list("#%()+-/0123456789=@ABCDEFHIJMNOPRSTVWZ[\\]abcdfghinopst")
    + list("LXYeKUG")
)

# Element symbol -> single-character stand-in.
smiles2dummies = {
    'Ag': 'A', 'Al': 'D', 'As': 'E', 'Ba': 'G', 'Br': 'J',
    'Ca': 'U', 'Cl': 'W', 'He': 'Y', 'Kr': 'd', 'Li': 'L',
    'Mg': 'M', 'Na': 'a', 'Ra': 'R', 'Rb': 'e', 'Se': 'f',
    'Si': 'g', 'Sr': 'h', 'Te': 'T', 'Xe': 'X', 'Zn': 'Z',
    'se': 'i', 'te': 't',
    'B': 'B', 'C': 'C', 'F': 'F', 'H': 'H', 'I': 'I', 'K': 'K',
    'N': 'N', 'O': 'O', 'P': 'P', 'S': 'S', 'V': 'V',
    'b': 'b', 'c': 'c', 'n': 'n', 'o': 'o', 'p': 'p', 's': 's',
}

# Stand-in character -> element symbol.
dummies2smiles = {dummy: symbol for symbol, dummy in smiles2dummies.items()}

chembl_max_seq_length = 120  #!changeable
num_tokens = len(chembl_charset)
# Character -> token id; ids start at 1.
token_index = {ch: idx for idx, ch in enumerate(chembl_charset, start=1)}
# Token id -> character.
invert_tokens = {idx: ch for ch, idx in token_index.items()}
# Encode every compound string as a fixed-width row of token ids:
# [start id] + character ids + [end id], with trailing zeros left as padding.
start_id, end_id = num_tokens + 1, num_tokens + 2
cmpds_data = np.zeros((len(data_cmpds), chembl_max_seq_length + 2), dtype='float32')
for row, record in zip(cmpds_data, data_cmpds):
    smiles_text = record[0]
    encoded = [start_id]
    encoded.extend(token_index[char] for char in smiles_text)
    encoded.append(end_id)
    # `row` is a view into cmpds_data, so this writes in place.
    row[0:(len(smiles_text) + 2)] = encoded
# One standard-normal noise vector of length 1000 per expression profile.
noise = np.array([np.random.normal(0, 1, 1000) for _ in range(len(data_ges))], dtype='float32')
assert len(cmpds_data) == len(data_ges) == len(noise)
BUFFER_SIZE = len(data_ges)

## Masking

In [None]:
#masks
# @cuda.jit
def create_padding_mask(seq):
  """Return a float mask that is 1.0 where ``seq`` equals 0, with two
  singleton axes inserted after the first axis."""
  is_padding = tf.math.equal(seq, 0)
  return tf.cast(is_padding, tf.float32)[:, tf.newaxis, tf.newaxis, :]
# @cuda.jit
def create_look_ahead_mask(size):
  """Return a (size, size) mask that is 1 strictly above the diagonal and 0 elsewhere."""
  lower_triangle = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
  return 1 - lower_triangle
# @cuda.jit
def create_masks(tar):
  """Combine the look-ahead mask and the padding mask of ``tar``.

  The result is the element-wise maximum of the two, so a position is
  masked when either mask marks it. It is used in the decoder's first
  (self-attention) block.
  """
  future_mask = create_look_ahead_mask(tf.shape(tar)[1])
  target_padding_mask = create_padding_mask(tar)
  return tf.maximum(target_padding_mask, future_mask)

In [None]:
# Hyperparameters, training bookkeeping and model construction.
num_layers = 2
d_model = 256
dff = 64
num_heads = 16
inp_vocab_size = data_ges.shape[-1]
# Token ids run 0..num_tokens+2: 0 is padding, 1..num_tokens are characters,
# num_tokens+1 marks the start and num_tokens+2 the end of a sequence.
tgt_vocab_size = num_tokens + 3  #2/3?
pe_inp = len(data_ges)
pe_tgt = chembl_max_seq_length + 1
dropout_rate = 0.1
EPOCHS = 300
BATCH_SIZE = 1000
early_stop_threshold = 15
# Early-stopping counter and best-so-far trackers used by the training loop.
early_stop = 0.0
min_vloss = 1e9
max_tacc = 0.0
max_vacc = 0.0
# Create the transformer.
tf2 = Transformer(
    num_layers, d_model, num_heads, dff,
    inp_vocab_size, tgt_vocab_size, pe_inp, pe_tgt,
    dim_s0=[256, 128], dim_n0=[256, 128],
    dim_m0=[128, d_model], dim_m1=[128, d_model],
    rate=dropout_rate)


In [None]:
## Optimizer
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  """Warm-up learning-rate schedule.

  The rate is ``rsqrt(d_model) * min(rsqrt(step), step * warmup_steps**-1.5)``:
  it grows linearly for ``warmup_steps`` steps and then decays with the
  inverse square root of the step.

  Args:
    d_model: model width used for the overall scale.
    warmup_steps: number of steps of linear warm-up.
  """

  def __init__(self, d_model, warmup_steps=4000):
    super(CustomSchedule, self).__init__()
    self.d_model = tf.cast(d_model, tf.float32)
    self.warmup_steps = warmup_steps

  def __call__(self, step):
    # The step counter may arrive as an integer tensor; rsqrt and the
    # multiplication by a float both need a floating-point value.
    step = tf.cast(step, tf.float32)
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)
    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

# Adam driven by the warm-up schedule.
learning_rate = CustomSchedule(d_model)
optimizer = tf.keras.optimizers.Adam(
    learning_rate=learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
# The model output is already softmax-ed, hence from_logits=False;
# reduction='none' keeps per-token losses so padding can be masked out.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    reduction='none', from_logits=False)

def loss_function(real, pred):
  """Mean per-token loss over the positions where ``real`` is not 0 (padding)."""
  per_token = loss_object(real, pred)
  keep = tf.cast(tf.math.not_equal(real, 0), dtype=per_token.dtype)
  return tf.reduce_sum(per_token * keep)/tf.reduce_sum(keep)

def accuracy_function(real, pred):
  """Fraction of non-padding positions where argmax(pred) equals ``real``."""
  not_padding = tf.math.not_equal(real, 0)
  is_correct = tf.equal(real, tf.argmax(pred, axis=2))
  hits = tf.cast(tf.math.logical_and(not_padding, is_correct), dtype=tf.float32)
  total = tf.cast(not_padding, dtype=tf.float32)
  return tf.reduce_sum(hits)/tf.reduce_sum(total)

# Running means for the training and validation loss/accuracy.
train_loss, train_accuracy, val_loss, val_accuracy = (
    tf.keras.metrics.Mean(name=metric_name)
    for metric_name in ('train_loss', 'train_accuracy', 'val_loss', 'val_accuracy'))

# Output locations below the working directory.
# NOTE(review): the folder names say "8 layers" while num_layers is 2 — confirm.
checkpoint_path = current_loc[0] + "/checkpoints/8 layers/16 heads"
save_model_path = current_loc[0] + '/checkpoints/8 layers/16 heads/saved_model/'
log_path = current_loc[0] + "/checkpoints/8 layers/log16_8l.txt"
# Opening in 'w' mode creates or empties the log file.
# NOTE(review): this handle is not closed in this cell.
log_file = open(log_path,'w')

ckpt = tf.train.Checkpoint(transformer=tf2, optimizer=optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# Resume from the most recent checkpoint when one exists.
latest_ckpt = ckpt_manager.latest_checkpoint
if latest_ckpt:
  ckpt.restore(latest_ckpt)
  print ('Latest checkpoint restored!!')


In [None]:
# Build the training and validation input pipelines.
np.random.seed(42)
dataset = tf.data.Dataset.from_tensor_slices((data_ges, noise, cmpds_data))
# First 95% of the samples train the model; the remainder validates it.
train_size = int(0.95*BUFFER_SIZE)
val_size = int(0.05*BUFFER_SIZE)
train_dataset = (
    dataset.take(train_size)
    .cache()
    .shuffle(train_size)
    .padded_batch(BATCH_SIZE)
)
assert len(cmpds_data)==len(data_ges)==len(noise)
BUFFER_SIZE = len(data_ges)
train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)
# Batch size val_size+1 so the held-out samples fit into a single batch.
val_dataset = dataset.skip(train_size).shuffle(val_size).padded_batch(val_size+1)

In [None]:
import pickle
# Materialise both pipelines and persist them. Each file is opened in a
# `with` block so it is flushed and closed even if pickling fails.
train_batches = list(train_dataset)
with open("train_data.pkl", "wb") as train_file:
  pickle.dump(train_batches, train_file)
val_batches = list(val_dataset)
with open("val_data.pkl", "wb") as val_file:
  pickle.dump(val_batches, val_file)

## Training

In [None]:
#train func

#train_step_signature = [tf.TensorSpec(shape=(None,inp_s.shape[-1]), dtype=tf.float64),tf.TensorSpec(shape=(None, inp_n.shape[-1]), dtype=tf.float32),tf.TensorSpec(shape=(None, cmpds_data.shape[-1]), dtype=tf.float32)]

@tf.function()  #input_signature
def train_step(inp_s, inp_n, tar):
  """Run one optimisation step and update the training metrics.

  ``tar`` is split into the decoder input (all but the last token) and the
  prediction target (all but the first token).
  """
  decoder_input, decoder_target = tar[:, :-1], tar[:, 1:]
  mask = create_masks(decoder_input)
  with tf.device("/GPU:0"):
    with tf.GradientTape() as tape:
      # Positional arguments: training=True, look-ahead mask, no decoder padding mask.
      predictions, _ = tf2(inp_s, inp_n, decoder_input, True, mask, None)
      loss = loss_function(decoder_target, predictions)
    variables = tf2.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
  train_loss(loss)
  train_accuracy(accuracy_function(tf.cast(decoder_target, tf.int64), predictions))

@tf.function()
def val_step(inp_s, inp_n, tar):
  """Evaluate one validation batch and update the validation metrics.

  The model is called with ``training=False`` so dropout is disabled and
  batch-normalisation uses its moving statistics; no gradient tape is
  recorded because nothing is optimised here.
  """
  tar_inp = tar[:, :-1]
  tar_real = tar[:, 1:]
  combined_mask = create_masks(tar_inp)
  with tf.device("/GPU:0"):
    predictions,_ = tf2(inp_s,inp_n,tar_inp,False,combined_mask,None)  #(look_ahead_mask, dec_padding_mask)
    loss = loss_function(tar_real, predictions)
  val_loss(loss)
  val_accuracy(accuracy_function(tf.cast(tar_real,tf.int64), predictions))

def _emit(log_handle, message):
  """Print *message* and write it to the log followed by a newline."""
  print(message)
  log_handle.write(message + '\n')


for epoch in range(EPOCHS):
  start = time.time()
  # Append to the log: opening with 'w' here would erase all previous
  # epochs. The `with` block also closes the file on the early-stop break.
  with open(log_path, 'a') as log_file:
    for batch, (inp_s, inp_n, tar) in enumerate(train_dataset):
      # Metrics are reset before every batch, so the reported values (and
      # train_accuracy.result() further down) describe one batch only.
      train_loss.reset_states()
      train_accuracy.reset_states()
      train_step(inp_s, inp_n, tar)  #(ges, noise, cmpds-target)
      if batch % 10 == 0:
        _emit(log_file, 'Epoch {} Batch {} Lrate {:.8f} tLoss {:.4f} tAcc {:.4f}'.format(
            epoch + 1, batch, optimizer._decayed_lr('float32').numpy(),
            train_loss.result(), train_accuracy.result()))

    # Accumulate per-batch validation metrics; averaged when reported.
    v_batch,v_loss,v_accuracy=0,0.0,0.0
    for val_batch, (val_inp_s, val_inp_n, val_tar) in enumerate(val_dataset):
      val_loss.reset_states()
      val_accuracy.reset_states()
      val_step(val_inp_s,val_inp_n,val_tar)
      v_loss+=val_loss.result()
      v_accuracy+=val_accuracy.result()
      v_batch=val_batch
    _emit(log_file, 'Epoch {} Batch {} Lrate {:.8f} vLoss {:.4f} vAcc {:.4f}'.format(
        epoch + 1, batch, optimizer._decayed_lr('float32').numpy(),
        v_loss/(v_batch+1), v_accuracy/(v_batch+1)))

    # Early stopping: count consecutive epochs without a new best validation loss.
    if v_loss<=min_vloss:
      min_vloss=v_loss
      early_stop=0
    else:
      early_stop+=1
    if early_stop>=early_stop_threshold:
      early_stop=0
      _emit(log_file, 'Early stopped to avoid over-fitting!')
      break

    # Save weights only when both training and validation accuracy improve.
    if max_tacc<train_accuracy.result() and max_vacc<v_accuracy:
      max_tacc,max_vacc=train_accuracy.result(),v_accuracy
      tf2.save_weights(save_model_path)
      _emit(log_file, 'New better model saved!')
    _emit(log_file, 'Time taken for 1 epoch: {} secs\n'.format(time.time() - start))

## Evaluate

In [None]:
#eval/test
def evaluate(inp_genes,model,num_cmpds,max_out_len=chembl_max_seq_length+1):  #(float vector,token vector)
  """Greedily decode ``num_cmpds`` token sequences from ``inp_genes``.

  Every sequence starts with the start id (``num_tokens+1``). At each of
  the ``max_out_len`` steps the most probable next token is appended;
  decoding does not stop early at the end token.

  Returns:
    Integer tensor with ``1 + max_out_len`` token ids per sequence.
  """
  with tf.device("/GPU:0"):
    # A fresh standard-normal noise vector of length 1000 per sequence.
    latent_noise = np.array(
        [np.random.normal(0, 1, 1000) for _ in range(num_cmpds)], dtype='float32')
    start_row = tf.expand_dims([num_tokens+1], 0)
    output = tf.repeat(start_row, axis=0, repeats=num_cmpds)
    for _ in range(max_out_len):  #(max_seq_length+<eos>)
      step_mask = create_masks(output)
      # The attention weights returned by the model are not used here.
      predictions, attention_weights = model(inp_genes, latent_noise, output, False, step_mask, None)
      last_step = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)
      next_ids = tf.cast(tf.argmax(last_step, axis=-1), tf.int32)
      # The extended sequence becomes the decoder input of the next step.
      output = tf.concat([output, next_ids], axis=-1)
  return output #attention_weights

def plot_attention_weights(attention, sentence, result, layer):
  """Plot one attention map per head for the entry ``attention[layer]``.

  NOTE(review): this function uses ``tokenizer_pt`` and ``tokenizer_en``,
  which are not defined in the visible part of this file — presumably
  left over from a text-translation example; confirm before calling.
  NOTE(review): the subplot grid is fixed at 2 x 4, i.e. at most 8 heads.
  """
  fig = plt.figure(figsize=(16, 8))
  sentence = tokenizer_pt.encode(sentence)
  # Drop the leading axis of size 1 so the first axis indexes heads.
  attention = tf.squeeze(attention[layer], axis=0)
  for head in range(attention.shape[0]):
    ax = fig.add_subplot(2, 4, head+1)
    ax.matshow(attention[head][:-1, :], cmap='viridis')      # plot the attention weights
    fontdict = {'fontsize': 10}
    # Two extra x ticks for the '<start>' and '<end>' labels.
    ax.set_xticks(range(len(sentence)+2))
    ax.set_yticks(range(len(result)))
    ax.set_ylim(len(result)-1.5, -0.5)
    ax.set_xticklabels(['<start>']+[tokenizer_pt.decode([i]) for i in sentence]+['<end>'],fontdict=fontdict, rotation=90)
    ax.set_yticklabels([tokenizer_en.decode([i]) for i in result if i < tokenizer_en.vocab_size], fontdict=fontdict)
    ax.set_xlabel('Head {}'.format(head+1))
  plt.tight_layout()
  plt.show()

def translate(sentence,num_cmpds,model,plot=''):
  """Generate ``num_cmpds`` token sequences for ``sentence`` with ``model``.

  Args:
    sentence: encoder input passed on to ``evaluate``.
    num_cmpds: number of sequences to generate.
    model: model used for decoding.
    plot: must be left empty. Attention plotting is unavailable because
      ``evaluate`` returns only the token tensor, not the attention weights.

  Returns:
    The token tensor returned by ``evaluate``.

  Raises:
    NotImplementedError: if ``plot`` is truthy. Raised before decoding, so
      no generation time is spent on a call that cannot succeed.
  """
  if plot:
    raise NotImplementedError(
        'attention plotting is not available: evaluate() does not return attention weights')
  return evaluate(sentence,model,num_cmpds)
  
# sample_test,num_cmpds=data_ges[0],1
# print(sample_test.shape)
# sample_test=tf.repeat(tf.expand_dims(sample_test,0),axis=0,repeats=num_cmpds)
# print(sample_test.shape)
# start=time.time()
# ps=translate(sample_test,num_cmpds,tf3)
# print('time:',time.time()-start)


## Token to Smiles

In [None]:
#post_process
def tokens2smiles(predicted):
  """Convert one sequence of predicted token ids into a SMILES string.

  Decoding stops at the end id (``tgt_vocab_size-1``) and skips the start
  id (``tgt_vocab_size-2``). Every other id is mapped to its character
  through ``invert_tokens``; characters that are stand-ins for an element
  symbol are expanded through ``dummies2smiles``.

  Args:
    predicted: iterable of scalar tensors (each must provide ``.numpy()``).

  Returns:
    The decoded string.

  Raises:
    KeyError: if an id has no entry in ``invert_tokens``.
  """
  end_id = tgt_vocab_size-1
  start_id = tgt_vocab_size-2
  pieces = []
  for p in predicted:
    token_id = p.numpy()
    if token_id == end_id:
      break
    if token_id == start_id:
      continue
    symbol = invert_tokens[token_id]
    # Fall back to the character itself when it is not a stand-in.
    pieces.append(dummies2smiles.get(symbol, symbol))
  return ''.join(pieces)

# smiles,token_vector=tokens2smiles(ps[0])
# print(smiles)
# print(token_vector)


In [None]:
@cuda.jit
def my_kernel(io_array,dicti,output_tokens):  #(compounds_tensor,tokens2char_dictionary,output_tokens)
  # Each thread handles one cell: the block index selects the row and the
  # thread index, shifted by one, selects the column.
  col = cuda.threadIdx.x + 1
  row = cuda.blockIdx.x
  # Look up the entry for the token id stored at (row, col).
  output_tokens[row][col] = dicti[io_array[row][col]]
start = time.time()
# Register printable characters for the special ids: end, start and padding.
invert_tokens.update({63: '&', 62: '!', 0: '*'})
# Array version of the id -> character table, indexed by token id.
dicti = np.array([''] * (num_tokens+3))
for token_id, symbol in invert_tokens.items():
  dicti[token_id] = symbol
num_cmpds = 100
# Empty output buffer: one row per compound, one cell per sequence position.
output_tokens = np.full((num_cmpds, chembl_max_seq_length+1), '')
# print(time.time()-start)
# print(time.time()-start)

**Saving weights of Model**

In [None]:
# Write the current weights of tf2 below the working directory held in current_loc[0].
tf2.save_weights(current_loc[0]+"/checkpoints/16 heads train/save models/")

**Loading Model**

In [None]:
# Build a second transformer with the same configuration as tf2 and load saved weights into it.
tf3 = Transformer(
    num_layers, d_model, num_heads, dff,
    inp_vocab_size, tgt_vocab_size, pe_inp, pe_tgt,
    dim_s0=[256, 128], dim_n0=[256, 128],
    dim_m0=[128, d_model], dim_m1=[128, d_model],
    rate=dropout_rate)
# tf3.load_weights(save_model_path)
weights_location = current_loc[0] + "/checkpoints/2 layers/16 heads val 0.2/saved_model/"
tf3.load_weights(weights_location)

## Test Analysis

In [None]:
## Loading Test data
import pandas as pd
test_data = pd.read_csv('/content/drive/MyDrive/MyPC/Sem 7/Algorithmic Approaches to Computational Biology/AACB Project/Datasets/GSE70138_xpr_sig_landmark.csv')
# Keep the identifier columns separately; the remaining columns become the model input array.
metadata_columns = ['cid','pert_id','pert_iname','cell_id']
metadata = test_data[metadata_columns]
test_data = np.array(test_data.drop(columns=metadata_columns))

In [None]:
from tqdm import tqdm
# Generate num_cmpds token sequences for every test profile, keyed by row index.
num_cmpds = 100
ps_dict = {}
for i, profile in enumerate(tqdm(test_data)):
  # Repeat the profile so one model call produces all candidates at once.
  sample_test = tf.repeat(tf.expand_dims(profile, 0), axis=0, repeats=num_cmpds)
  ps_dict[i] = translate(sample_test, num_cmpds, tf3)

In [None]:
# Smiles Conversion
# Decode every generated sequence of every experiment. Iterating the stored
# predictions directly avoids assuming exactly 100 sequences per experiment.
smiles_dict = {}
for exp in tqdm(range(len(ps_dict))):
  smiles_dict[exp] = [tokens2smiles(token_row) for token_row in ps_dict[exp]]

In [None]:
#Save Test Compounds
import pickle
# Use a context manager so the file is flushed and closed even if pickling fails.
with open('KOtest_predictions.pkl','wb') as f:
  pickle.dump(ps_dict,f)

In [None]:
# Save Smiles
# Use a context manager so the file is flushed and closed even if pickling fails.
with open('KOtest_allSMILES.pkl','wb') as f:
  pickle.dump(smiles_dict,f)

In [None]:
# For each experiment keep the SMILES that RDKit can parse, without duplicates,
# in order of first occurrence.
val_smiles_dict = {}
for i in tqdm(range(len(smiles_dict))):
  val_smiles_dict[i] = []
  seen = set()  # every string already examined for this experiment
  for smiles in smiles_dict[i]:
    if smiles in seen:
      # A repeat is either already stored or already known to be invalid.
      continue
    seen.add(smiles)
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
      val_smiles_dict[i].append(smiles)

In [None]:
# Persist the valid, unique SMILES per experiment; the context manager
# guarantees the file is closed after writing.
with open('/content/drive/MyDrive/MyPC/Sem 7/Algorithmic Approaches to Computational Biology/AACB Project/tf2_output/checkpoints/2 layers/16 heads val 0.2/KOtest_validSMILES.pkl','wb') as f:
  pickle.dump(val_smiles_dict,f)

In [None]:
import pickle
# Reload the valid SMILES per experiment; the context manager closes the
# file handle as soon as unpickling is done.
with open('/content/drive/MyDrive/MyPC/Sem 7/Algorithmic Approaches to Computational Biology/AACB Project/tf2_output/checkpoints/2 layers/16 heads val 0.2/KOtest_validSMILES.pkl','rb') as f:
  val_smiles_dict = pickle.load(f)

In [None]:
# Distribution of the number of valid, unique molecules per experiment.
L = [len(val_smiles_dict[idx]) for idx in range(len(val_smiles_dict))]
# plt.hist(val_uni_counts,20)
import seaborn as sns
plt.figure(figsize=(7,5))
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; kept
# here so the cell still runs on the seaborn version the notebook was written for.
sns.distplot(L)
plt.xlabel('Number of Molecules')
plt.ylabel('Density')
# plt.savefig('val_uni_test.png',dpi = 300)
# Pass a plain string: wrapping it in a list made the title render as the
# list's repr, including brackets and quotes.
plt.title("Mean: "+str(sum(L)/len(L))+"; Sigma: "+str(np.std(np.array(L))))

In [None]:
## Synthesizability filter
# For each experiment keep the molecules whose sascorer score is below 4.5.
# Each SMILES is parsed once and the same Mol object is both scored and stored.
synth_mol_dict = {}
for i in tqdm(range(len(val_smiles_dict))):
  synth_mol_dict[i] = []
  for smi in val_smiles_dict[i]:
    mol = Chem.MolFromSmiles(smi)
    if sascorer.calculateScore(mol)<4.5:
      synth_mol_dict[i].append(mol)

In [None]:
# Saving synthesizable molecules
# Use a context manager so the file is flushed and closed even if pickling fails.
with open('KOtest_synthMol.pkl','wb') as f:
  pickle.dump(synth_mol_dict,f)

In [None]:
# Reload the synthesizable molecules per experiment; the context manager
# closes the file handle as soon as unpickling is done.
with open('/content/drive/MyDrive/MyPC/Sem 7/Algorithmic Approaches to Computational Biology/AACB Project/tf2_output/checkpoints/2 layers/16 heads val 0.2/KOtest_synthMol.pkl','rb') as f:
  synth_mol_dict = pickle.load(f)

In [None]:
# Distribution of the number of synthesizable molecules per experiment.
L = [len(synth_mol_dict[idx]) for idx in range(len(synth_mol_dict))]
import seaborn as sns
plt.figure(figsize=(7,5))
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; kept
# here so the cell still runs on the seaborn version the notebook was written for.
sns.distplot(L,color = 'red')
plt.xlabel('Number of Molecules')
plt.ylabel('Density')
# plt.savefig('synth_test.png',dpi = 300)
# Pass a plain string: wrapping it in a list made the title render as the
# list's repr, including brackets and quotes.
plt.title("Mean: "+str(sum(L)/len(L))+"; Sigma: "+str(np.std(np.array(L))))

In [None]:
# Re-read the test table and keep only rows whose 'pert_iname' occurs among the
# gene symbols of Excape_data.
# NOTE(review): Excape_data is created in the following cell, so that cell has
# to be executed before this one.
test_data = pd.read_csv('/content/drive/MyDrive/MyPC/Sem 7/Algorithmic Approaches to Computational Biology/AACB Project/Datasets/GSE70138_xpr_sig_landmark.csv')
has_known_actives = test_data['pert_iname'].isin(Excape_data['Gene_Symbol'])
test_data = test_data.loc[has_known_actives]
test_index = test_data.index

In [None]:
import pickle
# Reload the synthesizable molecules; the context manager closes the file handle.
with open(current_loc[0]+'/checkpoints/2 layers/16 heads val 0.2/KOtest_synthMol.pkl','rb') as f:
  synth_mol_dict = pickle.load(f)
Excape_data = pd.read_csv('/content/drive/MyDrive/MyPC/Sem 7/Algorithmic Approaches to Computational Biology/AACB Project/Datasets/ExcapeDB_28targets_actives.csv')
# Keep only genes with more than 1000 rows; count once and reuse the result.
gene_counts = Excape_data['Gene_Symbol'].value_counts()
Excape_data = Excape_data[Excape_data['Gene_Symbol'].isin(gene_counts[gene_counts>1000].index)]
# Parse every SMILES exactly once; the parsed molecules serve both the validity
# mask and the canonical SMILES below.
parsed_mols = [Chem.MolFromSmiles(smi) for smi in Excape_data.SMILES]
keys = [m is not None for m in parsed_mols]
# .copy() makes the filtered frame independent, so the column assignment below
# does not write into a slice of the original frame.
Excape_data = Excape_data[keys].copy()
Excape_smiles = np.asarray([Chem.MolToSmiles(m) for m in parsed_mols if m is not None])
Excape_data['SMILES'] = Excape_smiles
Excape_data.head()

In [None]:
# Active SMILES whose gene symbol appears among the perturbed genes of the test
# metadata, together with their parsed molecules.
tested_gene_mask = Excape_data['Gene_Symbol'].isin(metadata.pert_iname.values)
smiles_actives = Excape_smiles[tested_gene_mask]
mols_actives = [Chem.MolFromSmiles(mol) for mol in smiles_actives]

In [None]:
# Morgan bit-vector fingerprint (radius 3, 1024 bits, chirality-aware) for every
# active molecule in mols_actives.
ecfps_actives = [AllChem.GetMorganFingerprintAsBitVect(mol, 3, nBits=1024, useChirality=True) for mol in mols_actives]

In [None]:
#Converting to Morgan FP
# One list of bit-vector fingerprints (radius 3, 1024 bits, chirality-aware)
# per test index, in the same order as the stored molecules.
synth_MFP_dict = {}
for ind in test_index:
  mols_for_ind = synth_mol_dict[ind]
  synth_MFP_dict[ind] = [
    AllChem.GetMorganFingerprintAsBitVect(mol, 3, nBits=1024, useChirality=True)
    for mol in mols_for_ind
  ]

In [None]:
from tqdm import tqdm
# For every test index, compute the Tanimoto similarity of each generated
# fingerprint against all active fingerprints of the perturbed gene.
# The active fingerprints depend only on the gene, so they are built once per
# gene and reused for every test row sharing that gene.
ecfps_SimMat1 = {}
ecfps_by_gene = {}
for ind in tqdm(test_index):
  gene = metadata.loc[ind, 'pert_iname']
  if gene not in ecfps_by_gene:
    ecfps_by_gene[gene] = [AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(mol), 3, nBits=1024, useChirality=True) for mol in Excape_smiles[Excape_data['Gene_Symbol'].isin([gene])]]
  ecfps_actives = ecfps_by_gene[gene]
  # One row per generated molecule, one column per active of this gene.
  ecfps_SimMat1[ind] = [DataStructs.BulkTanimotoSimilarity(fp, ecfps_actives) for fp in synth_MFP_dict[ind]]

In [None]:
# Persist the similarity matrices; the context manager guarantees the file is
# flushed and closed.
with open('Sim_Excape_generated.pkl','wb') as f:
  pickle.dump(ecfps_SimMat1,f)

In [None]:
# Reload the similarity matrices; the context manager closes the file handle as
# soon as unpickling is done.
with open(current_loc[0]+'/checkpoints/2 layers/16 heads val 0.2/Sim_Excape_generated.pkl','rb') as f:
  ecfps_SimMat1 = pickle.load(f)

In [None]:
# Display the gene symbols present in Excape_data, in value_counts() order.
Excape_data['Gene_Symbol'].value_counts().index

In [None]:
# For every test index, find the (generated molecule, known active) pair with
# the highest Tanimoto similarity and store it as
# (generated SMILES, active SMILES, similarity).
# An index is skipped when it has no generated molecule, no actives, or no
# similarity above 0. Previously such an index either raised (undefined
# names / max() of an empty row) or silently inherited the pair of the
# preceding index.
generated_active = {}
for ind in tqdm(test_index):
  gene = metadata.loc[ind, 'pert_iname']
  smiles_actives = list(Excape_smiles[Excape_data['Gene_Symbol'].isin([gene])])
  M = 0          # best similarity seen so far for this index
  best = None    # (row of generated molecule, column of active)
  for i, sims in enumerate(ecfps_SimMat1[ind]):
    if not sims:
      continue  # no actives to compare against
    row_max = max(sims)
    if row_max > M:
      M = row_max
      best = (i, sims.index(row_max))
  if best is None:
    continue
  gen_pos, act_pos = best
  # Only the winning molecule needs to be converted back to SMILES.
  gen_smiles = Chem.MolToSmiles(synth_mol_dict[ind][gen_pos])
  act_smiles = smiles_actives[act_pos]
  sim = M
  generated_active[ind] = (gen_smiles,act_smiles,sim)


In [None]:
# Persist the best pairs; the context manager guarantees the file is flushed
# and closed.
with open('generated_active.pkl','wb') as f:
  pickle.dump(generated_active,f)

In [None]:
import pickle
# Reload the best pairs; the context manager closes the file handle as soon as
# unpickling is done.
with open(current_loc[0]+'/checkpoints/2 layers/16 heads val 0.2/'+'/generated_active.pkl','rb') as f:
  generated_active = pickle.load(f)

In [None]:

# Reduce the per-index best pairs to a single best pair per gene.
gene_KO_generated = {}
# Running maximum similarity per gene, starting at 0 for every gene in Excape_data.
M = dict.fromkeys(Excape_data['Gene_Symbol'].value_counts().index, 0)
for ind in tqdm(generated_active):
  entry = generated_active[ind]
  gene = metadata.loc[ind]['pert_iname']
  sim = entry[2]
  if not sim>M[gene]:
    continue
  M[gene] = sim
  gene_KO_generated[gene] = (entry[0],entry[1],sim)

In [None]:
# Display the per-gene dictionary built in the cell above.
gene_KO_generated

In [None]:
# Distribution of the best similarity found for each test index.
L = [generated_active[i][2] for i in generated_active]
import seaborn as sns
plt.figure(figsize=(7,5))
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; kept
# here so the cell still runs on the seaborn version the notebook was written for.
sns.distplot(L,color = 'green',bins = 20)
plt.xlabel('Similarity')
plt.ylabel('Density')
# plt.savefig('sim_test.png',dpi = 300)
# Pass a plain string: wrapping it in a list made the title render as the
# list's repr, including brackets and quotes.
plt.title("Mean: "+str(sum(L)/len(L))+"; Sigma: "+str(np.std(np.array(L))))

In [None]:
# Display the Excape_data rows whose SMILES column equals this exact string.
Excape_data[Excape_data['SMILES'] == 'C#C[C@]1(O)CC[C@H]2[C@@H]3CCc4cc(O)ccc4[C@H]3CC[C@@]21C']

## Unseen disease Drug data

### Metformin

In [None]:
landmark_genes = ['PSME1', 'ATF1', 'RHEB', 'FOXO3', 'RHOA', 'IL1B', 'ASAH1', 'RALA', 'ARHGEF12', 'SOX2', 'SERPINE1', 'HLA-DMA', 'EGF', 'APP', 'NOS3', 'CSNK1A1', 'NFATC4', 'TBP', 'BRCA1', 'PSMD4', 'ETV1', 'TERT', 'EED', 'PTK2B', 'HSPB1', 'PIK3C3', 'CBLB', 'DFFB', 'TGFB3', 'PRKX', 'CCND1', 'NFKBIB', 'GLI2', 'PIK3CA', 'RPS6', 'DFFA', 'SUZ12', 'TICAM1', 'EDN1', 'SQSTM1', 'HIF1A', 'PTGS2', 'PLA2G4A', 'MAT2A', 'EIF4G1', 'BTK', 'MYL9', 'PSMD2', 'CLTC', 'ALDOA', 'STAT5B', 'ETS1', 'HSPD1', 'CEBPA', 'PSMF1', 'ATF6', 'RALB', 'PPARD', 'STAT3', 'DNMT3A', 'RASA1', 'PSME2', 'FGFR2', 'CALM3', 'ERO1A', 'APOE', 'PIK3R4', 'CSNK1E', 'SKP1', 'PROS1', 'PSMD9', 'NFKB2', 'PAK6', 'GHR', 'CDK4', 'PRKAG2', 'PTPRC', 'SPTAN1', 'COL1A1', 'CDC25A', 'TP53', 'EGR1', 'MUC1', 'AURKB', 'NCK2', 'AURKA', 'CDK5R1', 'FYN', 'NRAS', 'BNIP3L', 'MMP2', 'CDK7', 'RAC2', 'HMOX1', 'COL4A1', 'ICAM1', 'C5', 'NFKBIA', 'NFKBIE', 'CCNH', 'HMGA2', 'IGFBP3', 'FRS2', 'IFNAR1', 'MIF', 'DAG1', 'IGF2BP2', 'ERBB3', 'FZD1', 'PIK3R3', 'TFDP1', 'SYK', 'EIF4EBP1', 'CCND3', 'PTPN6', 'DUSP4', 'RB1', 'PARP1', 'CDK6', 'PSMB10', 'FOSL1', 'E2F2', 'CRK', 'HSPA8', 'CCNB1', 'NFATC3', 'PCNA', 'PSMB8', 'PRKCQ', 'SUV39H1', 'BIRC2', 'PAK1', 'CDC42', 'CCNE2', 'SNAP25', 'PGAM1', 'NCK1', 'CD40', 'DAXX', 'CDC20', 'CCL2', 'SRC', 'BMP4', 'MNAT1', 'TLR4', 'IKZF1', 'EPHA3', 'CASP10', 'CASP7', 'TGFBR2', 'NOTCH1', 'SHC1', 'SIRT3', 'CYCS', 'GNAI1', 'FBXO11', 'MEF2C', 'DUSP3', 'PPP1R13B', 'PRKCH', 'CCNA1', 'BAX', 'RPS5', 'SNCA', 'CHP1', 'SPP1', 'AXIN1', 'CASP2', 'FZD7', 'DUSP6', 'ADRB2', 'MMP1', 'CDC25B', 'EPHB2', 'CHEK1', 'PDGFA', 'CXCL2', 'HLA-DRA', 'GNA15', 'RALGDS', 'SATB1', 'PFKL', 'GRB7', 'MAPKAPK2', 'MKNK1', 'HSPA1A', 'PLCB3', 'PLK1', 'MYLK', 'FGFR4', 'CDKN2A', 'CDKN1B', 'GAPDH', 'CISD1', 'SPDEF', 'IGF1R', 'GSTM2', 'SPTLC2', 'TSKU', 'TMEM2', 'SLC2A6', 'EZH2', 'ICAM3', 'PHKG2', 'KDM5B', 'COG4', 'MCM3', 'SNX13', 'PAK4', 'DPH2', 'SNX6', 'WIPF2', 'NENF', 'RPN1', 'C2CD2L', 'GABPB1', 'POLR2K', 'PAF1', 'POLR1C', 'RUVBL1', 'AKT1', 'SENP6', 'EBP', 
'CASK', 'RRP8', 'BHLHE40', 'BDH1', 'SH3BP5', 'PPP2R5A', 'EML3', 'BIRC5', 'EPRS', 'PAX8', 'PDS5A', 'XBP1', 'NMT1', 'ARFIP2', 'STAMBP', 'FAH', 'PLP2', 'SOX4', 'TESK1', 'HN1L', 'RRAGA', 'ELOVL6', 'ACBD3', 'PMAIP1', 'TWF2', 'HDAC6', 'PXN', 'PHKB', 'CASC3', 'MLEC', 'USP22', 'TMED10', 'CHERP', 'TRAPPC6A', 'ATP2C1', 'USP14', 'APPBP2', 'ECH1', 'HDAC2', 'MRPL19', 'CSNK2A2', 'KIAA0100', 'PAICS', 'CDK2', 'EFCAB14', 'WRB', 'ETFB', 'CAMSAP2', 'ARNT2', 'SPAG7', 'MAPK1IP1L', 'MSH6', 'UBE2C', 'HS2ST1', 'TXLNA', 'DNAJB1', 'TOPBP1', 'ABCF1', 'LGMN', 'MBTPS1', 'ZNF274', 'CLIC4', 'HSD17B10', 'STX1A', 'CCNA2', 'SDHB', 'FOXO4', 'TOMM34', 'KIAA0907', 'HK1', 'PAPD7', 'PLOD3', 'TRAK2', 'VAPB', 'BUB1B', 'MRPL12', 'PNP', 'USP1', 'RPA1', 'KLHDC2', 'SCARB1', 'TPM1', 'TM9SF2', 'NUP88', 'FUT1', 'PRUNE', 'SYPL1', 'ZNF451', 'TRIM13', 'FBXO7', 'ENOSF1', 'LIG1', 'GNB5', 'INSIG1', 'HIST2H2BE', 'S100A4', 'KTN1', 'ARHGAP1', 'LPAR2', 'COASY', 'ATMIN', 'CASP3', 'NISCH', 'CPSF4', 'COPB2', 'GADD45A', 'SMNDC1', 'NUDCD3', 'HMGCR', 'SLC35B1', 'TIMP2', 'LSM6', 'SACM1L', 'TCERG1', 'KIAA0196', 'DNM1', 'DDIT4', 'ALAS1', 'PAFAH1B1', 'PSRC1', 'RBM15B', 'TOMM70A', 'CBR1', 'PRKCD', 'JUN', 'TMEM50A', 'SCCPDH', 'TSC22D3', 'TFAP2A', 'ZNF395', 'VPS72', 'CLSTN1', 'MAPK9', 'POP4', 'NFE2L2', 'BLCAP', 'FAM20B', 'CRKL', 'ZNF318', 'FDFT1', 'TRAP1', 'GTF2A2', 'MTFR1', 'UBE2A', 'NIT1', 'SYNGR3', 'STK25', 'ABCF3', 'FCHO1', 'OXSR1', 'SMARCC1', 'GLOD4', 'IL4R', 'KIT', 'PSMG1', 'TMEM97', 'LRRC41', 'KIF1BP', 'TCEAL4', 'NUP133', 'NUP62', 'MALT1', 'YKT6', 'KIF2C', 'NNT', 'IQGAP1', 'HSPA4', 'TOP2A', 'B4GAT1', 'SCP2', 'RAD9A', 'DRAP1', 'PCMT1', 'MYC', 'POLB', 'CETN3', 'SMC4', 'SMARCD2', 'MPZL1', 'EBNA1BP2', 'TATDN2', 'LRPAP1', 'TBPL1', 'RAE1', 'CDK1', 'SLC35A1', 'NIPSNAP1', 'BCL7B', 'TIMELESS', 'DHX29', 'TCEA2', 'FHL2', 'FKBP4', 'KAT6A', 'MAP3K4', 'ZFP36', 'INPP1', 'EPB41L2', 'GNAI2', 'TSTA3', 'DNMT1', 'RAB27A', 'DDR1', 'LPGAT1', 'DUSP11', 'TP53BP2', 'CDKN1A', 'TJP1', 'ABCB6', 'MTHFD2', 'PRAF2', 'DNTTIP2', 'HTATSF1', 'NCAPD2', 'TMEM109', 
'NUP93', 'PLA2G15', 'PIP4K2B', 'MYCBP2', 'PGRMC1', 'AARS', 'GPC1', 'ELAC2', 'PLSCR1', 'CTSL', 'CTSD', 'ARHGEF2', 'ILK', 'NFIL3', 'ARL4C', 'CPNE3', 'SCRN1', 'MAN2B1', 'DCTD', 'MBNL1', 'MYCBP', 'PTPN1', 'MPC2', 'ECD', 'CIRBP', 'PRSS23', 'P4HA2', 'ATP6V0B', 'CAPN1', 'STXBP1', 'ITGB5', 'SLC11A2', 'PDIA5', 'RAP1GAP', 'NOLC1', 'CAT', 'DUSP14', 'TRIB1', 'TP53BP1', 'BECN1', 'KDELR2', 'ELAVL1', 'IGF2R', 'TIPARP', 'CRTAP', 'GMNN', 'CYTH1', 'PRCP', 'SMARCA4', 'PYGL', 'RPA2', 'GTF2E2', 'ANXA7', 'TXNRD1', 'CRYZ', 'GAA', 'SKIV2L', 'BLMH', 'RRP1B', 'TSPAN3', 'KEAP1', 'SMC3', 'LBR', 'PPIE', 'TERF2IP', 'BNIP3', 'BCL2', 'RBM6', 'CSK', 'ALDOC', 'UBE2L6', 'PCBD1', 'ICMT', 'GNPDA1', 'NUCB2', 'RSU1', 'PTPN12', 'PCK2', 'GFPT1', 'BLVRA', 'S100A13', 'MYBL2', 'STX4', 'TMCO1', 'CCDC85B', 'DDB2', 'BAMBI', 'HADH', 'GLRX', 'MVP', 'STMN1', 'GALE', 'PTPRF', 'PGM1', 'LOXL1', 'MYO10', 'LIPA', 'PHGDH', 'ST7', 'IKBKB', 'PCM1', 'IER3', 'FAT1', 'RNMT', 'EGFR', 'NET1', 'DECR1', 'NPC1', 'MEST', 'PAFAH1B3', 'CALU', 'LGALS8', 'LRP10', 'ATP1B1', 'OXCT1', 'SLC25A4', 'ASCC3', 'TRAM2', 'PYCR1', 'NCOA3', 'POLD4', 'PRKACA', 'LYRM1', 'VDAC1', 'SORBS3', 'SPAG4', 'SLC5A6', 'RGS2', 'SCYL3', 'TBX2', 'DNAJA3', 'DCUN1D4', 'DYRK3', 'POLG2', 'TRAPPC3', 'TCFL5', 'RBM34', 'ITGB1BP1', 'ACLY', 'RAD51C', 'SMC1A', 'YME1L1', 'ACAA1', 'WFS1', 'UBE3B', 'NPDC1', 'ALDH7A1', 'POLR2I', 'AKAP8L', 'PLEKHJ1', 'HAT1', 'G3BP1', 'TPD52L2', 'PIH1D1', 'SHB', 'UTP14A', 'TSEN2', 'LYPLA1', 'MAP2K5', 'PUF60', 'DHRS7', 'ARID5B', 'IGHMBP2', 'ADO', 'TCTN1', 'AKR7A2', 'PPP2R5E', 'LAMA3', 'FEZ2', 'IKBKAP', 'SGCB', 'ENOPH1', 'CNOT4', 'JMJD6', 'SLC35F2', 'ANO10', 'STUB1', 'BAD', 'PSMD10', 'RAB11FIP2', 'FPGS', 'RNH1', 'ATG3', 'CFLAR', 'HEATR1', 'WDR7', 'AKAP8', 'SLC1A4', 'GRWD1', 'STAT1', 'FOXJ3', 'MTERF3', 'CHIC2', 'USP7', 'NOSIP', 'DLD', 'IDE', 'PRPF4', 'MELK', 'SMAD3', 'OXA1L', 'SNX7', 'CDCA4', 'TBC1D9B', 'NARFL', 'DDX42', 'IKBKE', 'WDTC1', 'PLSCR3', 'H2AFV', 'NUSAP1', 'FAM69A', 'XPO7', 'FAIM', 'PXMP2', 'TIMM17B', 'STXBP2', 'HPRT1', 'RRS1', 'ADAT1', 
'SCAND1', 'MFSD10', 'COPS7A', 'FIS1', 'NUDT9', 'CSRP1', 'METRN', 'TCTA', 'TLK2', 'KIAA1033', 'DNAJB6', 'HMGCS1', 'SPRED2', 'MAP7', 'CHAC1', 'GNAS', 'TEX10', 'ZNF586', 'APBB2', 'RNF167', 'TXNDC9', 'VGLL4', 'YTHDF1', 'LSM5', 'ZMYM2', 'CHMP6', 'PIGB', 'CHEK2', 'ACD', 'SUPV3L1', 'GPATCH8', 'ARPP19', 'RTN2', 'NPEPL1', 'UBQLN2', 'DMTF1', 'CLPX', 'TXNL4B', 'PRR15L', 'ST6GALNAC2', 'ADH5', 'GOLT1B', 'PPOX', 'CANT1', 'HOMER2', 'CLTB', 'PHKA1', 'MAP4K4', 'PAN2', 'PPIC', 'EIF5', 'TM9SF3', 'CCDC86', 'SLC25A14', 'TBXA2R', 'HERPUD1', 'RPL39L', 'BPHL', 'MAPKAPK3', 'MBOAT7', 'PNKP', 'SLC35A3', 'LSR', 'KDM5A', 'FAM63A', 'RAB4A', 'IFRD2', 'BACE2', 'LYN', 'ISOC1', 'MOK', 'DNAJB2', 'CERK', 'LAP3', 'MAPKAPK5', 'HDGFRP3', 'RAB21', 'PSIP1', 'CREB1', 'MBNL2', 'PACSIN3', 'MTA1', 'MACF1', 'HIST1H2BK', 'PCCB', 'STAP2', 'PMM2', 'HSD17B11', 'FBXL12', 'WDR61', 'FAS', 'CENPE', 'CREG1', 'PRR7', 'TMEM110', 'ZNF589', 'CNDP2', 'TNIP1', 'P4HTM', 'REEP5', 'BZW2', 'CGRRF1', 'AMDHD2', 'PEX11A', 'GFOD1', 'PECR', 'SOCS2', 'ZDHHC6', 'ITGAE', 'TMEM5', 'ATP5S', 'NGRN', 'HOXA5', 'GRN', 'NUP85', 'GTPBP8', 'PTK2', 'EPN2', 'MCUR1', 'C2CD5', 'MRPS16', 'NOL3', 'NR1H2', 'PLS1', 'DUSP22', 'RPS6KA1', 'DCK', 'TRIM2', 'VPS28', 'LRRC16A', 'NR2F6', 'EXOSC4', 'PDLIM1', 'CRELD2', 'RRP12', 'ACAT2', 'THAP11', 'CEBPD', 'MAST2', 'DHDDS', 'GADD45B', 'KIAA0355', 'DYNLT3', 'CHMP4A', 'IPO13', 'ITFG1', 'HYOU1', 'FOS', 'MCOLN1', 'ID2', 'KCNK1', 'RAI14', 'DNAJC15', 'HOXA10', 'CTNND1', 'ZNF131', 'HOOK2', 'ERBB2', 'INTS3', 'RFNG', 'MICALL1', 'KLHL21', 'TLE1', 'COG2', 'MRPS2', 'KCTD5', 'CNPY3', 'ADAM10', 'CD58', 'KIF5C', 'HMG20B', 'PIN1', 'LAGE3', 'TNFRSF21', 'KIF14', 'CYB561', 'HTRA1', 'WASF3', 'TSPAN6', 'FBXO21', 'PPP2R3C', 'GRB10', 'ZW10', 'CORO1A', 'XPNPEP1', 'MAMLD1', 'DSG2', 'SLC27A3', 'ADGRE5', 'IL13RA1', 'AGL', 'GATA3', 'RPA3', 'TES', 'SPR', 'RELB', 'RPIA', 'ADGRG1', 'NVL', 'ME2', 'TBC1D31', 'RFC2', 'TIMM9', 'ORC1', 'PARP2', 'FASTKD5', 'HACD3', 'MSRA', 'SYNE2', 'UBE3C', 'CCDC92', 'TIMM22', 'ATF5', 'NSDHL', 'ADI1', 'ADCK3', 
'GDPD5', 'TARBP1', 'RBKS', 'KDM3A', 'BID', 'MTF2', 'PIK3C2B', 'SLC25A46', 'CDK19', 'KIAA0753', 'KAT6B', 'CHN1', 'MAPK13', 'UFM1', 'IARS2', 'GATA2', 'NRIP1', 'CCNB2', 'RNPS1', 'ARID4B', 'GSTZ1', 'SESN1', 'SPEN', 'PDHX', 'DENND2D', 'CTTN', 'UBR7', 'CD44', 'VAV3', 'ABHD6', 'CAB39', 'TIAM1', 'DNM1L', 'DERA', 'PTPRK', 'HERC6', 'ACOT9', 'ST3GAL5', 'CTNNAL1', 'ABHD4', 'INPP4B', 'ZMIZ1', 'PPARG', 'RPP38', 'CAST', 'BAG3', 'GNA11', 'TOR1A', 'SLC25A13', 'TSPAN4', 'NR3C1', 'TRIB3', 'JADE2', 'CXCR4', 'FKBP14', 'CIAPIN1', 'COG7', 'TUBB6', 'ABL1', 'CCNF', 'FAM57A', 'EVL', 'GPER1', 'EAPP', 'EXT1', 'POLE2', 'C2CD2', 'ANKRD10', 'DDX10', 'CEP57', 'RAB31', 'ABCC5', 'RFC5', 'HES1', 'KLHL9', 'UBE2J1', 'SLC37A4', 'NPRL2', 'PWP1', 'PLEKHM1', 'VAT1', 'KIF20A', 'CDH3', 'SSBP2', 'SNX11', 'SFN', 'RFX5', 'PKIG', 'FSD1', 'ATP6V1D', 'CDC45', 'EDEM1', 'USP6NL', 'NT5DC2', 'CCP110', 'STK10', 'UGDH', 'SQRDL', 'HEBP1', 'ATP11B', 'CD320', 'MLLT11', 'CEBPZ', 'CBR3']

In [None]:
import json
# Read the Metformin JSON file and keep the 'data' field of its first 'data' entry.
metformin_json_path = '/content/drive/MyDrive/MyPC/Sem 7/Algorithmic Approaches to Computational Biology/AACB Project/Datasets/DiseaseDrug/Metformin (T2D) - MCF7.json'
with open(metformin_json_path) as f:
  metformin_payload = json.load(f)
gene_exp_sig = metformin_payload['data'][0]['data']

In [None]:
metfor_sig = np.array([-0.06, -0.76, -0.38, 0.64, -0.37, 0.08, -0.67, -0.85, 0.86, 0.48, -0.53, 0.45, -0.05, -1.25, 0.63, 0.77, 0.03, -0.2, 0.21, 0.56, 0.67, 0.15, -0.57, 0.08, -1.57, 0.69, 1.26, 0.64, 0.32, -0.23, 0.21, 4.08, -0.5, -0.3, -0.3, 0.67, 0.94, 0.87, 1.17, -0.58, 0.24, -0.06, 0.36, 0.23, 0.72, 0.26, 0.6, -0.36, 0.06, 0.7, 0.7, 0.43, 0.19, 1.42, 0.06, 0.58, -0.16, 0.65, 0.23, -0.1, 0.03, 0.23, -0.32, 0.07, -0.23, 0.34, 0.29, 0.72, -0.38, 0.05, 1.12, 0.43, 0.84, -0.69, -0.37, -0.22, 5.25, 0.61, 0.08, 1.44, 0.89, -0.55, 0.15, 1.12, -0.63, 0.64, 1.81, -0.95, -0.22, 0.72, 0.91, -0.5, -0.42, -0.06, 0.3, -0.67, 0.55, -0.04, 0.03, -0.6, -0.68, -0.2, 0.25, 0.04, -0.6, 0.78, 1.06, -0.4, 0.45, 0.23, 0.57, 0.24, -0.2, 0.26, 2.09, 1.73, -0.71, 0.14, 0.84, 1.29, 0.39, -0.34, -0.2, -0.02, -1.19, 0.81, -0.28, 8.38, -0.04, 1.08, -0.03, 0.09, -0.48, -1.44, -0.4, -0.25, 0.33, -0.19, -0.6, -0.27, -0.75, 1.62, 0.08, 0.0, -1.66, -0.12, 0.3, 0.53, -0.43, -1.11, 0.81, -0.24, 1.03, -0.93, -0.03, -0.06, 0.13, 0.52, 0.07, -1.38, 0.16, 0.45, -4.25, 0.23, -0.3, -0.71, -0.38, 5.78, 0.65, -1.51, -0.33, -0.77, 0.81, 9.18, -0.29, 0.24, -0.5, -0.02, 1.61, 1.45, -0.06, 0.2, -1.38, -0.0, -0.24, -0.11, 0.28, 0.34, 0.88, 1.04, -0.41, -0.67, -0.03, 0.24, 1.55, -0.1, 1.14, 0.55, 0.09, 0.9, -0.53, 0.4, 0.23, -0.14, 1.89, 1.12, -0.89, -0.63, 0.85, 0.55, 0.97, 0.93, 0.76, 0.2, -0.09, 0.12, -0.86, 0.21, -0.66, -0.0, 0.2, -0.26, 1.09, 0.11, 0.14, 0.57, 0.33, -0.52, 0.47, 1.1, 0.87, 0.06, 0.69, 0.41, 0.22, 0.4, 0.93, -0.31, -0.46, -0.18, 0.75, 0.62, 0.23, 0.56, 0.52, -0.77, 0.3, 0.53, 0.19, -0.24, -0.36, 0.14, -0.08, -0.62, 0.22, 0.77, 0.55, -0.45, 0.4, -0.85, -0.16, 0.01, -0.62, 0.28, 1.05, -0.49, -0.31, -0.19, -0.7, 0.55, -0.71, -0.71, 0.25, -0.98, -0.1, 0.46, 0.05, 0.85, -0.86, -1.14, 0.23, -0.49, -1.24, -0.68, -1.69, 1.61, 0.03, 0.25, -0.14, 0.02, -0.11, -0.25, 0.02, 0.01, 0.19, 1.35, 0.1, -0.21, 0.07, -0.15, -0.89, -0.46, -0.28, 0.52, -0.16, 0.54, 0.81, 0.66, 1.05, -0.19, -0.42, -0.1, 1.08, 
0.54, 1.17, 0.79, -0.74, 0.47, 0.39, -0.69, 0.97, 0.75, 0.68, -0.86, 0.8, 0.49, 0.13, -0.27, -0.47, -1.12, 1.16, 0.43, 0.05, 0.39, -0.26, 0.17, 0.45, -0.13, 1.2, 0.12, 0.97, 1.06, 0.68, -0.84, 0.01, -0.75, 0.65, 0.02, -1.71, -0.15, -0.01, -0.88, -0.09, 0.14, 0.97, 0.7, -0.82, 0.74, 1.12, -0.09, -0.02, -0.82, -0.03, 0.54, -0.35, -0.97, -0.35, 0.55, 1.01, 0.75, -0.5, 0.52, -0.3, 1.47, -0.71, 0.35, -0.31, -1.02, 0.42, 0.81, -0.35, -0.7, -0.34, -0.19, 0.21, 0.4, -0.52, -1.03, -1.04, -1.25, -0.97, 0.36, 0.14, 0.45, 0.38, -0.33, 0.43, -0.23, -0.32, -0.53, 1.01, -0.69, -1.8, -0.2, 0.46, -0.35, -0.03, 0.35, -0.57, -0.3, 0.95, 0.45, 0.31, 0.57, -0.58, 0.01, 1.48, 0.1, -0.76, -0.39, -0.67, 0.31, 0.98, -0.21, -0.07, 1.71, 0.69, 0.12, -0.34, -0.07, -0.28, 0.63, -1.24, 1.15, 0.32, 0.22, 0.08, 0.18, 0.34, 0.77, -0.51, 0.28, -0.06, -0.51, -0.5, -0.06, -0.59, 0.04, 0.31, 0.61, 0.82, -0.46, 0.43, -0.86, 0.18, 0.58, -0.1, -0.02, 0.08, 0.94, -1.03, -0.02, 0.52, 0.24, 1.26, -0.26, 0.71, -0.39, 0.37, 0.44, 0.54, 0.54, 1.07, -1.01, 0.95, -0.26, 1.46, -0.02, -0.84, 1.17, 0.96, -0.25, -0.11, -2.0, -1.35, 0.58, 0.34, 1.76, 0.33, -0.42, -0.61, -0.63, -1.03, -0.12, 0.68, 0.12, -0.57, -0.08, 1.16, -0.54, 0.29, 0.81, -0.3, -0.53, -0.48, 0.72, 0.67, 0.37, 0.06, 0.56, 0.74, 0.52, 0.36, 0.41, 0.21, 0.18, 0.72, -0.44, -0.73, 1.48, 0.79, 0.24, 0.54, 0.05, -0.75, -0.23, -0.4, 0.32, -0.05, 0.73, -0.02, 0.47, 0.36, -0.73, -1.04, -1.66, -0.34, -0.96, -0.14, 0.54, -1.03, -0.42, -0.87, 0.01, -1.16, -0.7, 0.73, 1.36, 0.43, -0.88, 1.3, 0.7, -1.04, -1.17, -0.11, 0.65, 0.97, 0.03, 1.5, 0.06, 1.16, -0.72, 7.92, -1.38, 0.38, -1.03, -0.72, 0.92, 0.31, 0.38, 0.17, -6.15, -1.33, 0.86, -0.91, 0.48, -0.1, 0.22, -1.03, -0.37, 0.09, 3.43, -1.03, 0.49, -0.65, 0.62, 0.4, 1.85, -0.03, -0.25, -0.31, -0.29, 0.31, 0.38, -0.72, -0.23, -0.95, -0.83, 0.19, -0.64, 0.61, -0.09, 2.1, 0.75, 0.36, -0.34, 0.47, -1.12, 0.26, -0.88, 0.32, 0.38, 0.32, -0.6, 0.0, -0.08, -0.83, 0.59, 0.63, -0.1, 0.98, -0.1, 1.1, -0.23, 1.05, 0.89, 1.16, 
-1.0, -0.02, -0.32, 0.23, -0.29, 0.65, -1.41, 1.2, -0.79, -0.31, -0.47, 0.12, -0.21, 0.14, -0.61, 0.1, -0.14, 0.14, 0.32, 1.45, -0.28, -0.99, 0.03, -0.49, -0.71, 0.21, -0.55, 0.74, -0.26, 0.32, -0.93, 0.39, 0.45, 0.12, 0.67, 1.06, -0.37, -0.3, -0.22, 1.2, -0.42, 0.26, 0.06, -1.22, 1.46, 0.59, 0.62, 0.08, 1.76, 1.64, 0.05, -0.54, -0.23, 1.33, 1.03, 0.2, 0.58, -0.4, -0.34, -0.2, 0.67, 0.04, -1.63, 0.93, 0.95, 0.04, -1.08, -0.31, -0.33, 0.36, -0.98, -0.19, -1.68, -0.3, -0.46, -0.36, -0.05, -0.19, -0.23, 0.74, 0.3, -0.01, 0.83, 0.12, 0.13, -0.07, -0.03, -0.53, 0.01, 0.28, 1.08, 0.31, -0.02, 0.49, -0.12, 0.84, -0.53, -0.12, -0.19, -0.98, -0.17, 0.73, 0.18, 0.45, -0.86, -0.47, -0.42, -0.1, -0.53, -1.33, -0.96, 0.24, -2.97, 0.16, -0.84, -0.21, 1.54, 0.24, 0.51, 0.35, -0.24, -0.52, 0.05, -0.13, 0.5, 0.32, 0.85, -0.2, 0.5, 0.04, -0.67, -0.52, -0.72, 0.35, 0.39, 0.28, -0.16, -0.65, -0.83, 0.47, 0.59, -0.46, -0.08, -0.99, 0.32, 0.36, -0.53, 0.58, 0.47, -1.07, -0.4, 0.56, 0.43, 0.35, -0.72, 0.09, 0.74, -0.65, -0.76, -0.28, 0.24, 1.52, 0.85, 0.12, -0.19, 0.81, -0.94, 2.75, 0.39, 0.53, 1.15, -0.12, -0.32, -0.8, -0.57, 0.05, 0.32, 1.35, 0.47, 0.1, -1.11, -0.97, -0.4, -0.83, -0.32, 0.43, -0.11, 0.73, -0.02, 0.66, -0.31, 1.44, 0.88, 0.27, 0.25, 0.78, -0.1, -0.07, -0.24, 0.2, -0.22, 0.19, 0.24, 0.56, 0.4, 1.4, -0.29, 0.19, -0.12, -1.31, 0.67, 0.29, 0.3, 0.63, 1.65, 1.01, 0.32, -1.24, -0.79, 1.24, -0.01, -1.48, 0.48, 0.72, -0.09, -0.57, 0.34, -0.87, -0.56, -0.15, -0.7, -0.05, 0.01, -0.36, -0.04, 0.54, -0.9, 0.12, 0.35, -0.62, -1.19, 1.04, 0.14, -0.43, -0.5, -0.81, 0.06, 0.51, -0.76, -0.94, -0.19, 0.84, 0.3, -0.11, -0.59, 0.85, 1.25, 0.77, -0.32, -0.22, -0.42, 0.07, 0.68, -1.18, -0.94, 0.44, -0.6, -1.27, -0.82, 0.31, 0.44, -0.34, -0.43, -0.53, -0.18, -0.1, 0.35, -0.2, 1.42, -0.9, -0.11, 0.2, 4.65, 0.23, 0.96, -0.52, 0.87, 0.78, 0.58, 0.79, 0.22, -0.89, 0.04, -0.29, 0.93, -0.81, 0.11, 1.55, -0.21, 0.45, 0.37, 0.22, -0.04, 0.23, -8.38, -2.51, 0.4, 0.53, 0.59, 0.67, -0.11, 0.51, -0.28, 
0.86, -0.48, 0.2, 0.35, 0.72, 1.56, 0.16, -0.03, 0.04, 0.34, 0.77, 0.42, 0.46, 1.95, 0.49, -0.02, 0.93, -0.64, -0.23, 0.46, 0.78, 1.09, 0.02, 0.38, -0.17, 0.52, -0.24, 0.88, -0.27, 0.68, 0.18, 0.21, -0.22, 0.49, 0.46, -0.67, 0.49, -0.32])

In [None]:
# Display the shape of the metfor_sig array.
metfor_sig.shape

In [None]:
num_cmpds = 100
# Tile the Metformin signature num_cmpds times along a new leading axis and
# hand the batch to `evaluate` together with the loaded model.
metfor_batch = tf.repeat(tf.expand_dims(metfor_sig, 0), repeats=num_cmpds, axis=0)
metfor_sig_tokens = evaluate(metfor_batch, tf3, 100)

In [None]:
# Convert the generated token sequences to SMILES and keep those that parse,
# are not already collected, and have a sascorer score below 4.5.
metfor_smiles = []
for tokens in metfor_sig_tokens:
  smiles = tokens2smiles(tokens)
  mol = Chem.MolFromSmiles(smiles)
  # `is not None` is the reliable check for a failed parse.
  if mol is not None and smiles not in metfor_smiles and sascorer.calculateScore(mol)<4.5:
    metfor_smiles.append(smiles)

In [None]:
# Find the generated molecule most similar (Tanimoto on Morgan bit vectors)
# to the known Metformin structure.
known_cmp_metfor = 'CN(C)C(=N)NC(=N)N'
known_cmp_mol = Chem.MolFromSmiles(known_cmp_metfor)
known_cmp_ecfp = AllChem.GetMorganFingerprintAsBitVect(known_cmp_mol, 3, nBits=1024, useChirality=True)
M = 0
# Start from None so an empty candidate list (or all-zero similarities) cannot
# leave best_cmp undefined or carrying a value from another drug's cell.
best_cmp = None
for comp in metfor_smiles:
  mol_pred = Chem.MolFromSmiles(comp)
  mol_ecfp = AllChem.GetMorganFingerprintAsBitVect(mol_pred, 3, nBits=1024, useChirality=True)
  # Compute the similarity once and reuse it for the comparison and the update.
  similarity = DataStructs.TanimotoSimilarity(mol_ecfp,known_cmp_ecfp)
  if similarity>M:
    M = similarity
    best_cmp = comp

In [None]:
# Display the best-matching SMILES and its similarity rounded to 3 decimals.
best_cmp,round(M,3)

In [None]:
import pickle
# Use a context manager so the file is flushed and closed even if pickling fails.
with open('metformin_smiles.pkl','wb') as f:
  pickle.dump(metfor_smiles,f)

### Methotrexate

In [None]:
mtx_sig = np.array([-0.49, 0.09, -0.32, 1.29, 0.03, 0.1, 1.08, 0.52, 0.06, -0.45, -0.65, -0.2, -1.14, 0.87, 0.18, -0.2, -0.75, -0.84, -0.53, 0.3, -0.96, -0.48, 0.4, 0.1, 0.46, -0.37, 0.36, -0.41, -0.5, 0.33, 0.06, 0.16, 0.69, -0.04, 2.97, -0.59, -0.24, -0.25, -0.81, -0.07, 0.05, 0.49, -0.82, -1.52, -2.96, -0.18, -0.69, -1.3, -1.07, -0.72, 0.02, -0.32, -1.67, -1.23, -0.97, -2.35, 0.25, 0.39, -0.1, -0.49, 0.59, -1.42, -1.15, -1.81, -0.49, -0.34, -0.68, 0.17, -0.42, 0.46, 0.07, -1.25, -0.49, -0.72, 0.75, 1.37, -0.15, 0.49, -0.38, -1.82, 1.25, 2.88, -0.03, -2.11, 0.56, -3.42, 0.88, 0.83, -1.47, 4.97, -0.92, 2.94, 5.1, -0.57, -0.1, 0.96, -0.07, 0.96, -0.25, -1.5, -0.2, 1.39, -0.52, -0.33, 0.38, -0.04, -0.38, 1.36, -0.93, 0.11, -2.94, -1.15, 0.27, -0.45, 0.72, 4.56, 0.42, 0.33, 0.6, -1.46, 1.61, -1.45, 0.18, 0.15, -3.09, -0.13, 0.36, -0.08, 0.3, -0.23, -0.62, -0.53, 2.29, 0.12, -0.15, 0.23, 4.14, -1.26, 0.25, -4.68, -0.24, 0.04, -0.73, -1.01, 0.01, 0.06, -1.63, -0.65, 0.14, -0.51, 1.05, -1.05, 0.15, -0.78, 0.06, 0.39, 1.05, -0.02, -0.41, 0.32, -0.42, 1.66, 1.11, -0.86, 0.45, -0.48, -0.38, -0.37, -0.79, 0.05, 0.41, -0.85, -0.03, 0.17, -0.05, 0.8, 0.04, -0.21, -1.34, 1.24, 0.2, 0.56, 0.01, 1.16, -0.04, -1.3, 0.66, -7.42, -0.2, 0.84, 0.66, -0.26, 0.0, -0.97, 0.35, 0.11, 1.51, -0.48, -0.4, 2.24, -0.36, 0.1, -0.8, -0.44, -0.25, -1.67, 0.13, -1.03, -0.22, -0.61, -0.85, -0.93, 0.15, -0.95, -0.07, -3.01, 0.03, -1.29, -0.68, -2.45, -3.09, 0.14, -1.35, 0.6, 0.89, 1.41, 0.67, 0.42, -1.72, -0.21, -4.12, -0.44, 1.37, 0.64, -0.54, -0.26, -0.07, -0.51, 0.29, -1.89, 0.87, -0.79, -1.92, -0.1, -0.43, 0.44, -0.35, 0.3, 1.39, 0.69, 0.03, 0.74, 0.59, 0.28, -0.92, 0.48, 1.41, 0.46, -0.9, 0.67, 1.68, -1.09, -0.45, 0.16, -0.68, 0.41, 0.89, -0.65, -0.21, -0.65, -0.05, -1.1, 3.64, 0.08, -0.27, -1.97, -0.63, -0.23, -0.07, -0.02, -0.44, -1.71, 1.64, 1.25, 0.52, -1.29, 0.59, -4.72, -1.3, 1.52, -0.61, 1.95, 0.29, -0.6, 0.54, -1.38, -0.43, -1.23, -1.38, -1.07, -0.33, 0.95, 0.82, 0.32, 0.14, -0.09, 
-2.47, -0.6, 0.25, 1.42, 1.96, -0.49, 0.61, 0.36, 1.77, 0.74, 0.31, -0.21, 1.01, -0.37, -0.21, 0.68, -0.3, 1.09, 0.0, 1.87, 0.94, -0.42, 1.98, -0.92, 0.02, -0.64, -0.63, -0.79, -0.91, -0.45, -1.55, -0.72, 0.37, 2.12, -0.17, -0.72, -5.01, 0.8, 0.02, -1.13, 0.24, 0.09, -0.14, 3.38, 1.24, -0.21, -0.56, 0.16, 0.71, -1.43, -2.46, -2.48, 1.58, -0.21, -0.29, 0.35, -1.24, -5.17, -1.56, -1.34, 0.13, -1.13, 2.34, 0.7, 0.32, -0.5, 0.09, -0.73, 0.08, 0.68, 0.62, -0.34, 1.57, -0.06, -0.78, 0.07, -0.32, 0.08, -0.46, -1.17, -0.83, 1.15, 1.7, -1.11, -0.01, -0.87, 0.57, -0.65, -0.29, -0.87, 1.75, -0.89, 0.36, -2.38, 0.74, 0.3, -0.06, 0.55, -2.99, -0.51, -1.95, -1.65, 0.21, 0.54, 1.06, 1.0, -0.71, 1.06, 3.73, -0.43, -0.13, -0.03, 0.45, 0.32, 2.15, -0.92, -0.87, -0.58, 0.1, 0.51, 0.35, -0.68, -0.71, 1.02, 0.74, -1.14, 0.8, 2.22, -0.77, -0.08, -2.14, -0.07, -2.2, 0.07, 0.67, -0.15, -0.84, 0.03, 0.21, -0.45, 0.16, 0.58, -0.92, 0.22, 0.06, 0.73, -0.51, 0.14, 0.93, 1.46, 0.65, 0.11, -0.41, -0.06, -1.1, -1.19, -0.09, 1.56, 0.66, -1.48, 0.03, 0.65, 1.19, -0.21, -0.18, 2.26, -0.82, -0.16, -0.45, 0.8, -1.63, 2.1, -0.11, -0.56, 0.04, 0.49, 1.48, -0.81, 0.47, -1.07, 0.2, 0.11, -0.11, -2.15, -0.52, -0.22, 0.29, 1.27, -1.15, -0.67, 0.01, 0.29, 0.51, -0.16, -2.75, -0.42, 1.07, 0.87, 0.17, 1.36, -0.41, -0.28, -0.03, 1.06, -0.14, 0.11, -0.77, 1.27, 1.56, 0.12, -1.15, 1.15, 0.97, -1.48, 0.51, -0.43, -0.2, 2.22, 1.62, -0.02, -0.45, 0.83, -1.29, -1.26, 0.6, 3.3, 1.75, 2.55, 0.09, -1.39, -0.33, 1.35, -0.2, -2.09, 1.48, 1.7, 2.29, 1.03, -0.03, 0.78, -0.07, -1.88, -1.14, 2.91, 3.11, 2.52, -0.26, 0.1, -0.46, 0.5, -0.54, -0.04, 1.94, -0.41, -0.57, 0.84, 5.27, 0.28, -0.13, 4.26, 0.18, 0.16, -1.0, 0.18, -0.53, -0.23, 0.5, 0.41, 0.67, -0.96, -0.18, -0.03, -1.0, 3.01, -0.72, -2.21, 0.74, 1.53, 1.73, -1.61, 2.82, 0.48, -1.12, -0.05, -0.22, -0.11, -1.25, 0.21, 0.16, 0.38, 1.25, -0.97, 0.64, 0.82, -0.65, 0.82, 1.76, 0.91, -1.6, -0.05, -0.47, 0.31, 0.36, -0.98, 3.08, -3.23, 1.62, 0.55, -0.36, 1.83, 0.13, -0.15, 
-0.15, 0.78, -1.06, -0.51, 0.25, -0.52, -0.11, 0.4, -4.12, 0.94, 1.01, -0.17, -0.99, -0.17, -0.74, 0.14, -1.47, -0.07, -1.53, 2.91, 1.45, 0.14, -0.88, 1.47, -0.51, 0.47, -0.28, 0.94, 0.78, -0.69, -0.88, -0.51, 0.48, 0.6, -0.17, -0.84, -1.03, 0.28, -1.17, 0.52, -2.51, 0.41, 0.04, 0.97, -0.24, 0.8, -1.71, 1.08, -0.01, 0.33, -0.07, 0.95, 1.01, -0.18, 1.6, 0.12, -1.25, -0.72, -0.21, 0.6, -0.93, 1.15, -0.12, -1.29, -0.03, -0.98, -0.92, -0.41, 0.55, -0.1, 0.86, -0.29, -0.33, 1.11, -0.14, 0.1, 0.75, 0.69, 0.31, 1.88, -0.77, -0.62, -0.0, 0.23, -1.11, 1.06, -0.14, 0.57, 1.48, 0.81, -0.9, 0.14, -1.39, 0.71, -0.89, -0.17, -0.51, -0.18, -0.13, 0.38, 2.44, -0.31, 0.59, 0.98, 0.88, -0.17, -0.01, 1.02, -0.03, -0.97, -1.19, 2.03, 0.35, -0.33, -0.24, 1.81, -1.77, 0.37, -0.73, 1.25, 0.41, 1.69, 0.66, -0.09, 1.2, -0.3, -2.69, 1.2, -0.15, -1.1, 1.22, -1.91, 0.14, -0.15, -0.18, 0.73, 0.68, 2.01, 0.82, 0.43, 2.19, 5.14, 0.27, -0.8, -0.04, -2.11, -0.71, 1.22, -1.73, -0.8, 0.75, -0.33, 1.24, 2.21, -0.53, 0.26, -0.38, 0.27, -1.34, 0.87, -0.07, -1.31, 0.08, 0.89, 0.91, -0.27, 1.62, -0.3, 1.05, -0.18, 0.24, -1.21, 2.09, -1.57, -3.15, -2.71, -1.09, -0.94, -1.05, -0.58, -0.58, 1.12, 0.04, 0.58, -1.23, 0.53, -0.92, -0.41, 0.25, -1.92, -0.67, -0.2, -0.52, -0.82, -1.19, 1.34, -1.99, -0.74, 0.31, 1.07, 0.74, 2.8, 1.18, -0.34, 0.8, -0.71, 1.13, 1.76, -0.61, 0.89, 0.76, -0.47, -1.41, -2.34, -1.39, -3.94, -0.51, -0.57, 2.29, -0.6, 0.02, 0.91, -1.41, -1.13, -1.29, 0.71, 0.33, -0.03, -0.36, -0.82, -0.61, -5.19, -1.02, -0.1, 0.12, 2.32, 0.03, 2.39, -0.4, 0.46, 0.96, -0.5, 0.16, -0.73, -0.85, -0.52, -1.56, -0.61, 0.87, 0.08, -0.49, 0.28, 0.03, -0.48, -0.32, 0.66, -0.58, 1.88, 1.88, 1.69, -1.49, -0.14, 1.7, -0.44, 2.37, 0.61, -1.75, -0.59, -2.37, 0.52, -0.72, 0.35, 0.73, -1.18, -0.19, -0.9, 1.35, 3.18, 0.34, 0.35, 0.88, -0.03, 0.48, -0.27, 0.79, -0.45, 0.16, -0.14, -0.02, 0.58, -5.38, -0.83, 0.52, 1.03, 2.08, 0.02, 2.75, 0.45, 0.79, -1.13, -0.22, -1.17, 1.62, -0.54, -1.01, -0.13, -0.14, -3.09, 0.55, 
-0.84, -0.22, -0.9, 0.51, -0.25, 0.92, -1.4, 1.23, 0.26, 3.92, 0.04, -1.24, -1.45, -0.33, 1.7, -0.99, -0.03, -2.06, -0.14, 0.41, 2.45, -0.54, 0.03, 0.04, -1.14, -0.4, -1.22, -0.26, -6.6, 1.56, -1.06, -1.04, 0.49, 1.12, 0.32, 0.91, 0.85, -0.93, 0.08, 0.79, -2.25, -0.04, 0.71, -2.1, 0.66, 0.29, -0.31, -1.5, 3.25, -0.95, 0.06])

In [None]:
num_cmpds = 100
# Tile the Methotrexate signature num_cmpds times along a new leading axis and
# hand the batch to `evaluate` together with the loaded model.
mtx_batch = tf.repeat(tf.expand_dims(mtx_sig, 0), repeats=num_cmpds, axis=0)
mtx_sig_tokens = evaluate(mtx_batch, tf3, 100)

In [None]:
# Convert the generated token sequences to SMILES and keep those that parse,
# are not already collected, and have a sascorer score below 4.5.
mtx_smiles = []
for tokens in mtx_sig_tokens:
  smiles = tokens2smiles(tokens)
  mol = Chem.MolFromSmiles(smiles)
  # `is not None` is the reliable check for a failed parse.
  if mol is not None and smiles not in mtx_smiles and sascorer.calculateScore(mol)<4.5:
    mtx_smiles.append(smiles)

In [None]:
# Find the generated molecule most similar (Tanimoto on Morgan bit vectors)
# to the known Methotrexate structure.
known_cmp_mtx = 'CN(Cc1cnc2nc(N)nc(N)c2n1)c3ccc(cc3)C(=O)N[C@@H](CCC(=O)O)C(=O)O'
known_cmp_mol = Chem.MolFromSmiles(known_cmp_mtx)
known_cmp_ecfp = AllChem.GetMorganFingerprintAsBitVect(known_cmp_mol, 3, nBits=1024, useChirality=True)
M = 0
# Start from None so an empty candidate list (or all-zero similarities) cannot
# leave best_cmp undefined or carrying a value from another drug's cell.
best_cmp = None
for comp in mtx_smiles:
  mol_pred = Chem.MolFromSmiles(comp)
  mol_ecfp = AllChem.GetMorganFingerprintAsBitVect(mol_pred, 3, nBits=1024, useChirality=True)
  # Compute the similarity once and reuse it for the comparison and the update.
  similarity = DataStructs.TanimotoSimilarity(mol_ecfp,known_cmp_ecfp)
  if similarity>M:
    M = similarity
    best_cmp = comp

In [None]:
# Display the best-matching SMILES and its similarity rounded to 3 decimals.
best_cmp,round(M,3)

In [None]:
import pickle
# Use a context manager so the file is flushed and closed even if pickling fails.
with open('mtx_smiles.pkl','wb') as f:
  pickle.dump(mtx_smiles,f)

In [None]:
import pickle

# Reload `mtx_smiles` from disk and close the file afterwards.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open('mtx_smiles.pkl', 'rb') as fh:
    mtx_smiles = pickle.load(fh)

### Azacitidine 

In [None]:
az_sig = np.array([-2.57, -0.42, 1.11, 2.04, -0.52, 0.27, -0.23, 0.47, -0.02, 0.23, 0.2, 0.03, 0.13, 0.13, 0.55, 1.11, 0.21, 1.8, 0.48, 0.98, 0.04, 0.14, -0.46, 0.14, -0.74, 0.97, -0.19, 0.64, -0.12, 0.27, -1.79, 1.75, 0.29, -0.16, -0.5, 0.63, -0.2, 0.28, 0.08, 2.33, 0.3, 0.88, -0.61, -0.52, 1.23, 0.88, 0.27, -0.79, -0.7, -0.68, 0.77, 0.13, 0.37, 2.98, 0.18, 1.44, -0.22, -0.12, 0.73, 0.44, 2.03, -0.38, 1.9, -0.07, 0.08, -0.24, 0.72, 1.21, -1.52, -0.88, 0.34, 0.61, 0.65, -1.59, -0.95, 0.65, 0.19, 1.27, -0.73, -0.34, 1.96, 0.59, -2.04, -0.04, -0.52, -0.48, 1.66, 1.75, -0.23, -1.67, 0.02, 1.36, -0.81, 2.01, -0.07, 0.78, 0.12, 2.77, 0.25, 0.92, 0.01, 0.77, 0.6, 0.27, -0.38, -0.87, 0.58, -0.38, 0.02, 0.7, -0.12, -0.07, 2.56, -0.27, -0.4, 1.93, 0.07, -2.07, 1.89, -0.03, 1.24, 1.86, 1.63, -2.22, -0.56, 0.4, -1.93, 0.04, -0.2, -0.4, 0.41, 0.54, 0.18, 0.15, -0.07, -0.69, 1.3, 0.13, 1.74, 0.71, 0.75, 0.66, -0.21, 0.73, 0.58, -2.29, -0.54, 2.39, 0.14, -0.76, 1.33, 2.38, 0.68, -0.16, 0.32, 0.56, -0.65, 0.7, 0.35, -0.35, 0.17, -1.47, -1.05, 0.05, -0.27, 3.73, 0.52, 0.69, -0.9, -0.15, -0.83, 0.14, -0.53, 0.22, 1.32, 1.47, 1.55, 0.02, 0.93, -1.22, 0.62, -0.49, -0.13, 0.35, -0.79, -0.47, -1.49, 0.47, -0.27, -1.01, -0.49, 0.48, -1.0, 0.32, -1.68, -0.46, -0.94, 0.65, 2.01, 0.72, -1.26, 1.28, -0.78, -0.97, 2.56, 1.1, 0.15, 0.37, -0.84, 2.25, 2.9, 1.57, -0.57, 0.54, -0.37, 2.11, -0.8, 0.33, 1.01, 0.98, -1.28, 1.41, -1.21, 0.77, 1.68, 0.96, -2.49, -0.68, -0.29, -2.18, -1.52, 2.38, 1.05, 0.45, -0.58, 1.18, 1.37, 0.57, -0.64, -1.28, 1.34, -0.2, -0.59, 0.8, -3.08, 0.5, 1.85, -1.12, -0.21, 0.02, -1.44, 1.37, -0.78, 0.68, -0.2, 3.42, 0.6, -0.18, -0.96, -0.95, 0.68, 1.74, 0.16, 0.52, 0.64, -0.68, -1.26, -1.1, -1.19, -2.32, 0.12, 0.14, -1.02, -0.26, 0.47, -1.1, -0.74, 1.0, -0.78, -0.29, 0.18, 0.15, -1.74, 1.29, 0.01, -0.19, 0.76, -0.76, 0.22, -0.76, 3.73, 1.61, -0.35, 0.23, 1.07, -0.9, 1.8, -0.15, 0.16, 0.91, 0.4, -0.47, -0.47, -0.47, 0.22, -0.61, 2.56, 0.3, -0.93, -0.41, -0.07, 0.34, 0.81, 
-0.33, -1.02, -1.82, -1.78, 0.31, -0.39, 0.27, 0.07, 0.31, 0.05, 0.23, -0.07, -0.34, -0.04, -0.24, 3.08, 0.33, -0.19, -1.09, 1.26, -0.25, -0.11, -1.04, 3.2, -0.28, -0.73, 3.57, 0.39, -0.36, -2.2, 0.05, 0.83, -1.37, -0.67, 1.08, -2.0, 0.12, 2.67, -0.75, -0.26, 0.91, -0.43, -0.23, -1.23, 1.2, -0.56, -0.1, -0.09, 0.33, -4.46, 0.07, -0.94, -0.12, -0.03, -0.46, 1.82, 0.08, 0.72, 0.61, 0.14, 1.25, 0.08, 0.39, -0.09, 0.61, -0.14, 0.98, -0.54, -1.16, 0.34, 1.08, 0.36, 0.5, 0.34, -1.74, -0.12, 0.13, -2.05, -1.85, -2.22, -0.08, 0.73, -1.99, 0.41, 0.47, -0.59, 0.24, 0.21, -0.69, 0.48, -0.08, -0.84, 0.2, 3.18, -2.67, -0.4, -2.74, 0.56, -0.43, 0.91, -0.68, -1.78, -0.09, 1.4, -0.01, 1.08, 2.55, -1.61, -1.27, 0.62, 0.43, -0.05, -1.72, -0.06, 0.94, 2.08, 0.45, 0.94, -3.23, 2.01, 1.63, 1.35, 2.32, -0.82, 1.17, 0.6, -1.07, -0.26, 1.76, -1.22, 2.25, -0.3, 1.36, 0.57, 1.0, -2.5, 5.39, -0.4, 3.97, 0.16, -0.88, 0.18, -1.14, 0.38, 0.18, -2.89, -0.11, -3.22, 0.75, 0.56, -1.85, 0.02, -0.05, -2.31, -0.38, -1.36, -0.19, 0.6, -1.04, 2.45, -2.5, -0.38, 0.42, 0.62, -0.08, -1.38, 0.13, 0.07, -0.84, -1.87, -0.59, -0.36, -1.22, 0.35, -1.32, -0.01, 0.9, -1.19, 1.81, -2.47, -0.82, -0.24, -1.01, 2.11, 0.66, 3.02, 0.37, -2.11, 1.29, 1.21, -1.28, -2.29, 0.62, 0.28, -2.12, -0.6, -2.08, -0.71, 0.68, -2.5, -0.83, 0.69, 1.29, 1.08, -2.24, -1.86, -0.94, -0.93, 0.94, 1.14, -2.85, -0.89, -1.34, -2.97, -0.92, -4.64, -0.29, 0.03, 1.11, -0.49, 2.63, -2.43, 2.83, 0.93, 0.34, 0.71, -5.73, -0.3, 0.33, 0.66, 0.57, -1.26, -0.29, 0.16, -1.75, -0.47, -0.98, -2.16, -2.6, -0.77, -0.89, 1.5, 0.51, 1.64, 0.05, -1.37, -1.18, 0.13, 0.02, 0.63, -0.07, 0.85, 0.61, 0.43, -0.21, -1.61, 0.54, 0.75, 0.05, -1.94, -0.36, 2.0, -0.53, -1.35, -1.75, 0.06, 0.07, -0.01, 0.13, 0.18, -1.91, -1.53, 0.04, 0.73, -0.39, -0.11, 0.15, 0.74, 1.44, 0.09, 4.99, 0.41, -0.74, 0.98, 1.13, -0.1, 0.33, 0.54, 0.97, -0.19, -0.5, 1.26, 0.41, 1.92, -0.22, -1.07, 0.73, 2.62, 1.66, 2.79, -1.64, -0.86, -1.17, -0.38, 1.64, -0.03, -2.06, 0.3, 0.02, 1.58, 0.24, 
0.7, 1.04, 1.61, -1.4, 1.19, -0.02, 0.05, 0.93, 1.28, -0.95, 1.1, 0.2, 0.06, 1.03, -0.33, -0.3, 0.86, 0.5, 0.21, 2.04, -1.49, -2.47, -0.87, -0.08, -1.16, 0.21, 0.97, 1.69, 1.87, 0.41, -0.01, -0.9, 2.05, -0.34, 0.43, -0.33, -0.38, -0.68, 1.92, -0.49, -1.02, 0.62, 1.12, 0.37, -0.71, -2.57, -1.03, -0.43, 0.93, -1.87, 1.66, 0.75, -0.13, 0.71, -0.27, 0.4, 0.31, -0.06, -1.77, 0.1, -0.29, -0.24, 0.19, 0.17, 0.97, -1.84, -0.52, -2.1, 0.45, 1.01, 0.78, 1.34, -1.3, -1.72, -0.54, -0.1, -0.65, -2.56, 0.8, -0.11, -0.17, 0.39, -0.71, -1.32, 2.27, -1.99, 0.71, -0.69, -0.4, 0.23, 0.22, -1.68, 0.14, -0.42, -1.23, -1.09, 0.05, 1.59, -0.88, 0.28, -0.73, -0.26, -0.02, -2.77, 0.42, 1.64, -0.24, 0.9, 1.79, -0.94, -0.05, -1.34, -0.43, -0.97, -0.26, 0.43, -0.29, -1.68, -3.06, -0.64, 0.13, -0.44, 0.99, -0.54, -0.93, 0.3, 0.04, 0.04, 2.1, -0.14, -0.28, -0.17, 0.29, -0.58, -0.83, -0.36, -0.66, -1.43, 0.92, -0.95, -0.03, -2.87, -1.57, -1.82, 0.06, -0.65, 0.14, -1.65, -1.67, -0.55, 0.38, 0.55, 0.49, -1.26, 0.95, -0.3, -0.15, -1.98, 0.54, -0.06, -0.31, -0.41, 0.18, 0.98, -1.02, 0.73, 0.84, -0.14, 4.28, -1.08, -0.2, 0.49, 1.54, 0.25, -2.05, 0.8, -0.53, -1.54, 0.45, 0.21, 1.43, 1.08, 1.88, -0.47, 0.05, 0.96, -0.75, -0.05, 0.12, 0.42, 0.9, 0.49, -0.51, 0.35, 1.75, 1.67, 0.43, 1.09, -0.71, 3.01, -0.71, 0.4, -0.55, 1.24, -2.0, -2.28, -0.62, 0.21, -2.2, 0.39, 0.64, 0.65, -2.4, 0.57, -0.29, 4.7, -0.3, -3.42, 1.42, 2.6, 1.46, -0.17, 0.06, 0.22, -0.38, -3.76, -1.85, 0.79, -1.49, 4.52, -0.1, 0.04, 0.41, 0.79, -1.53, -0.08, -0.19, 0.08, 1.46, 0.01, 0.55, -1.81, 0.28, 0.06, 1.4, 1.72, 2.74, -0.49, -0.68, 1.04, 1.7, 0.29, 0.31, 1.66, 0.05, 1.81, -2.51, 0.04, -1.17, 3.03, -0.16, -0.46, -0.36, 0.16, -1.39, 0.01, -0.69, -1.4, -0.12, -0.42, -0.91, -1.23, 1.38, 0.78, 0.17, -0.97, 0.7, -0.19, 0.25, -0.11, 1.47, 1.64, 1.09, 0.29, -0.12, 0.2, 1.02, -0.13, -0.04, 1.37, 0.09, 0.69, 0.71, 0.66, -0.38, -1.35, 0.25, 0.9, 0.69, -1.69, 0.7, 0.79, 1.51, -0.18, -0.56, 1.19, -0.02, -0.3, 0.47, -0.42, 0.8, -1.12, -2.09, 0.28, 
0.66, -0.89, 0.45, -0.11, 1.7, 5.43, -0.98, 1.29, -0.17, -2.01, 0.26, -1.07, -0.78, -0.57, 1.9, 0.73, 0.12, -2.55, -0.29, 0.43, 1.69, -0.87, -2.16, 0.06, -0.72, -1.01, -0.2, 0.4, 1.42, -0.86, -0.28, 0.89, -1.9, 0.32, -0.33, -1.67, 0.8, -0.41, 2.33, -0.72])

In [None]:
# Run `evaluate` on `num_cmpds` identical copies of the signature `az_sig`.
num_cmpds = 100
# Add a leading batch axis, then tile along it so every row is the same signature.
az_sig_batch = tf.repeat(tf.expand_dims(az_sig, 0), repeats=num_cmpds, axis=0)
# The trailing 100 is forwarded unchanged; its meaning is defined by `evaluate`.
az_sig_tokens = evaluate(az_sig_batch, tf3, 100)

In [None]:
# Collect the distinct generated SMILES that RDKit can parse and whose
# synthetic-accessibility score is below 4.5.
az_smiles = []
seen_az_smiles = set()  # every string already examined (O(1) duplicate check)
# Bound the loop by the array actually being read; the previous version used
# the length of `mtx_sig_tokens`, which only worked while both had equal size.
for i in range(len(az_sig_tokens)):
    smiles = tokens2smiles(az_sig_tokens[i])
    if smiles in seen_az_smiles:
        # Same string means the same verdict as before; skip parsing and scoring.
        continue
    seen_az_smiles.add(smiles)
    mol = Chem.MolFromSmiles(smiles)
    # MolFromSmiles returns None for strings it cannot parse.
    if mol is not None and sascorer.calculateScore(mol) < 4.5:
        az_smiles.append(smiles)

In [None]:
# Find the generated compound most similar to the reference SMILES below.
known_cmp_az = 'NC1=NC(=O)N(C=N1)[C@@H]2O[C@H](CO)[C@@H](O)[C@@H]2O'
known_cmp_mol = Chem.MolFromSmiles(known_cmp_az)
# Morgan bit-vector fingerprint: radius 3, 1024 bits, chirality-aware.
known_cmp_ecfp = AllChem.GetMorganFingerprintAsBitVect(known_cmp_mol, 3, nBits=1024, useChirality=True)
M = 0
# Reset explicitly so an empty or zero-similarity run cannot silently report
# the `best_cmp` left behind by an earlier cell (e.g. another drug's search).
best_cmp = None
for comp in az_smiles:
    mol_pred = Chem.MolFromSmiles(comp)
    mol_ecfp = AllChem.GetMorganFingerprintAsBitVect(mol_pred, 3, nBits=1024, useChirality=True)
    # Compute the similarity once and reuse it for both the test and the update.
    sim = DataStructs.TanimotoSimilarity(mol_ecfp, known_cmp_ecfp)
    if sim > M:
        M = sim
        best_cmp = comp

In [None]:
# Display the current `best_cmp` together with `M` rounded to three decimals.
best_cmp,round(M,3)

In [None]:
import pickle

# Write `az_smiles` to disk; the context manager guarantees the file is
# flushed and closed even if pickling fails.
with open('az_smiles.pkl', 'wb') as fh:
    pickle.dump(az_smiles, fh)

## Validation

In [None]:
## Loading Validation Data
import pickle

# NOTE: pickle.load can execute arbitrary code; only load files you created.
val_data_file = current_loc[0]+"/checkpoints/2 layers/16 heads val 0.2/val_data.pkl"
with open(val_data_file, "rb") as fh:  # closed as soon as loading finishes
    full_val = pickle.load(fh)

# Each entry of `full_val` is indexable; item 0 goes to `val_data` and item 2
# to `true_cmpds` (item 1 is not used here).
val_data = [full_val[i][0] for i in range(len(full_val))]
true_cmpds = [full_val[i][2] for i in range(len(full_val))]

In [None]:
# true_cmpds
from tqdm import tqdm

# Convert the entries of `true_cmpds[0]` to SMILES, one per element of
# `val_data[0]`, with a progress bar.
smiles_orig_list = [
    tokens2smiles(true_cmpds[0][idx])
    for idx in tqdm(range(len(val_data[0])))
]
    

In [None]:
# For every element of `val_data[0]`, call `translate` on `num_cmpds`
# identical copies of it and store the result under the element's index.
num_cmpds = 100
ps_dict_val = {}
batch_zero = val_data[0]
for idx in tqdm(range(len(batch_zero))):
    # Leading batch axis, then tile so every row is the same sample.
    tiled_sample = tf.repeat(tf.expand_dims(batch_zero[idx], 0), repeats=num_cmpds, axis=0)
    ps_dict_val[idx] = translate(tiled_sample, num_cmpds, tf3)

In [None]:
#Save Validation Compounds
import pickle

# The file name (including its embedded space) must match what the loading
# cells expect, so it is kept verbatim. The context manager closes the file.
with open('val _predictions_batch7.pkl', 'wb') as fh:
    pickle.dump(ps_dict_val, fh)

In [None]:
from tqdm import tqdm

# For every element of `val_data[6]`, call `translate` on `num_cmpds`
# identical copies of it and store the result under the element's index.
num_cmpds = 100
ps_dict_val = {}
batch_six = val_data[6]
for idx in tqdm(range(len(batch_six))):
    # Leading batch axis, then tile so every row is the same sample.
    tiled_sample = tf.repeat(tf.expand_dims(batch_six[idx], 0), repeats=num_cmpds, axis=0)
    ps_dict_val[idx] = translate(tiled_sample, num_cmpds, tf3)

In [None]:
# Directory holding the pickled validation predictions, relative to `current_loc[0]`.
val_pred_path = f"{current_loc[0]}/checkpoints/2 layers/16 heads val 0.2/Validation Predictions"

In [None]:
import pickle

# Load the predictions stored for batch 7 and close the file right away.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
batch_file = val_pred_path+'/val _predictions_batch'+str(7)+'.pkl'
with open(batch_file, 'rb') as fh:
    batch_pred = pickle.load(fh)

# `my_kernel` fills `output_tokens` from the first prediction and the `dicti`
# lookup (NOTE(review): the [a, b] subscript looks like a numba-CUDA launch
# configuration — confirm against the kernel definition).
my_kernel[num_cmpds,chembl_max_seq_length](batch_pred[0].numpy(),np.array(dicti),output_tokens)

# Rows containing invert_tokens[0] are dropped. Every other row is cut at the
# first occurrence of invert_tokens[63] and joined into one string; appending
# one extra copy of that symbol guarantees np.where always finds a match.
stop_symbol = invert_tokens[63]
output_cmpds = [
    ''.join(o[:np.where(np.append(o, stop_symbol) == stop_symbol)[0][0]])
    for o in output_tokens
    if invert_tokens[0] not in o
]


In [None]:
import pickle

from tqdm import tqdm

# Decode every stored prediction batch into SMILES strings, keyed by
# (batch index, sample index).
val_pred = {}
stop_symbol = invert_tokens[63]

for i in tqdm(range(len(val_data))):
    # Batch files on disk are numbered from 1, hence i+1.
    batch_file = val_pred_path+'/val _predictions_batch'+str(i+1)+'.pkl'
    # Close each file as soon as it is read instead of leaking one handle per batch.
    # NOTE: pickle.load can execute arbitrary code; only load files you created.
    with open(batch_file, 'rb') as fh:
        batch_pred = pickle.load(fh)
    for j in range(len(batch_pred)):
        # `my_kernel` overwrites `output_tokens` from prediction j and `dicti`.
        my_kernel[num_cmpds,chembl_max_seq_length](batch_pred[j].numpy(),np.array(dicti),output_tokens)
        # Drop rows containing invert_tokens[0]; cut the rest at the first
        # invert_tokens[63] (one is appended so a match always exists).
        val_pred[(i, j)] = [
            ''.join(o[:np.where(np.append(o, stop_symbol) == stop_symbol)[0][0]])
            for o in output_tokens
            if invert_tokens[0] not in o
        ]

In [None]:
# Per validation key, count (a) distinct parseable SMILES and (b) how many of
# those also have a synthetic-accessibility score below 4.5.
val_uni_counts = []
synth_counts = []

synth_valid_val_pred = {}
for key in tqdm(val_pred):
    # Track every valid SMILES seen for this key. The previous version checked
    # duplicates against the SA-filtered list only, so a repeated valid
    # molecule with score >= 4.5 was counted as "unique" each time it appeared.
    seen_valid = set()
    synth_hits = []
    for smiles in val_pred[key]:
        if smiles in seen_valid:
            continue
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:  # unparseable SMILES
            continue
        seen_valid.add(smiles)
        if sascorer.calculateScore(mol) < 4.5:
            synth_hits.append(smiles)
    synth_valid_val_pred[key] = synth_hits
    val_uni_counts.append(len(seen_valid))
    synth_counts.append(len(synth_hits))



In [None]:
import seaborn as sns

# Distribution of `val_uni_counts`.
# NOTE(review): `sns.distplot` is deprecated in recent seaborn releases;
# consider `sns.histplot(..., kde=True, stat="density")` when upgrading.
plt.figure(figsize=(7,5))
sns.distplot(val_uni_counts)
plt.xlabel('Number of Molecules')
plt.ylabel('Density')
plt.savefig('val_uni_validation.png',dpi = 300)

# The title is set after saving, so the PNG on disk stays untitled while the
# inline figure shows the summary statistics. Pass a plain string: a list
# would be rendered as its bracketed repr.
val_uni_mean = sum(val_uni_counts)/len(val_uni_counts)
val_uni_sigma = np.std(np.array(val_uni_counts))
plt.title("Mean: "+str(val_uni_mean)+"; Sigma: "+str(val_uni_sigma))

In [None]:
import seaborn as sns

# Distribution of `synth_counts`.
# NOTE(review): `sns.distplot` is deprecated in recent seaborn releases;
# consider `sns.histplot(..., kde=True, stat="density")` when upgrading.
plt.figure(figsize=(7,5))
sns.distplot(synth_counts,bins = 23,color='red')
plt.xlabel('Number of Molecules')
plt.ylabel('Density')
plt.savefig('synth_validation.png',dpi = 300)

# The title is set after saving, so the PNG on disk stays untitled while the
# inline figure shows the summary statistics. Pass a plain string: a list
# would be rendered as its bracketed repr.
synth_mean = sum(synth_counts)/len(synth_counts)
synth_sigma = np.std(np.array(synth_counts))
plt.title("Mean: "+str(synth_mean)+"; Sigma: "+str(synth_sigma))

In [None]:
# Map every (batch index, sample index) pair of `true_cmpds` to its SMILES string.
smiles_orig_dict = {
    (batch_idx, cmpd_idx): tokens2smiles(true_cmpds[batch_idx][cmpd_idx])
    for batch_idx in tqdm(range(len(true_cmpds)))
    for cmpd_idx in range(len(true_cmpds[batch_idx]))
}

In [None]:
# For each validation key, find the generated molecule most similar to the
# original compound and record the best Tanimoto similarity (0 if none).
sim_scores_list = []
generated_active = {}
for key in tqdm(synth_valid_val_pred):
    best_sim = 0
    candidates = synth_valid_val_pred[key]
    if candidates:
        # The reference fingerprint does not depend on the candidate, so build
        # it once per key instead of once per generated molecule.
        mol_orig = Chem.MolFromSmiles(smiles_orig_dict[key])
        ECFP_orig = AllChem.GetMorganFingerprintAsBitVect(mol_orig, 3, nBits=1024, useChirality=True)
        for smiles in candidates:
            mol_pred = Chem.MolFromSmiles(smiles)
            ECFP_pred = AllChem.GetMorganFingerprintAsBitVect(mol_pred, 3, nBits=1024, useChirality=True)
            sim = DataStructs.TanimotoSimilarity(ECFP_pred, ECFP_orig)
            if sim > best_sim:
                best_sim = sim
                # (generated SMILES, original SMILES, similarity); keys with no
                # candidate scoring above 0 get no entry.
                generated_active[key] = (smiles, smiles_orig_dict[key], sim)
    sim_scores_list.append(best_sim)

In [None]:
import seaborn as sns

# Distribution of `sim_scores_list`.
# NOTE(review): `sns.distplot` is deprecated in recent seaborn releases;
# consider `sns.histplot(..., kde=True, stat="density")` when upgrading.
plt.figure(figsize=(7,5))
sns.distplot(sim_scores_list,color='green')
plt.xlabel('Similarity')
plt.ylabel('Density')
plt.savefig('sim_validation.png',dpi = 300)

# The title is set after saving, so the PNG on disk stays untitled while the
# inline figure shows the summary statistics. Pass a plain string: a list
# would be rendered as its bracketed repr.
sim_mean = sum(sim_scores_list)/len(sim_scores_list)
sim_sigma = np.std(np.array(sim_scores_list))
plt.title("Mean: "+str(sim_mean)+"; Sigma: "+str(sim_sigma))

In [None]:
import pickle

# Store both result tables as a two-element list; the context manager makes
# sure the file is flushed and closed.
with open('final_val_predictions.pkl', 'wb') as fh:
    pickle.dump([generated_active, synth_valid_val_pred], fh)

In [None]:
import pickle

# Reload the saved validation results and close the file afterwards.
# After this cell `generated_active` holds whatever object was pickled; the
# following cells index it with [0].
# NOTE: pickle.load can execute arbitrary code; only load files you created.
final_pred_file = current_loc[0]+'/checkpoints/2 layers/16 heads val 0.2/Validation Predictions/'+'final_val_predictions.pkl'
with open(final_pred_file, 'rb') as fh:
    generated_active = pickle.load(fh)

In [None]:
# Display the number of entries in the first element of `generated_active`.
len(generated_active[0])

In [None]:
# Print entries of `generated_active[0]` whose third field exceeds 0.6,
# stopping once 16 of them have been shown.
matches_shown = 0
best_by_key = generated_active[0]
for key in best_by_key:
    record = best_by_key[key]
    if record[2] > 0.6:
        matches_shown += 1
        print(record)
    if matches_shown > 15:
        break

Extras

In [None]:
#device_test
#system_params
# Query the GPUs visible to TensorFlow. This relies on `tf` having been
# imported by an earlier cell, because the import below comes later; as a
# non-final bare expression its value is not displayed.
tf.config.list_physical_devices('GPU')
# Shell escapes (IPython `!` syntax): these only work inside a notebook/IPython session.
!nvidia-smi -pm 1  #persistence mode
!cat /proc/meminfo  #memory info
# NOTE(review): `%tensorflow_version` looks like a Colab-specific magic and
# would normally need to run before TensorFlow is first imported — confirm
# it still has any effect in the target environment.
%tensorflow_version 2.x
import tensorflow as tf
# Abort the cell unless the reported GPU device name is exactly '/device:GPU:0'.
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

In [None]:
#trials
# Build a fresh Transformer and run one training-mode forward pass on the
# first 100 rows of each input, without any masks.
tf2 = Transformer(
    num_layers, d_model, num_heads, dff,
    inp_vocab_size, tgt_vocab_size, pe_inp, pe_tgt,
    rate=dropout_rate,
)
head_rows = slice(None, 100)
out, _ = tf2(
    inp_s=data_ges[head_rows, :],
    inp_n=noise[head_rows, :],
    tar=cmpds_data[head_rows, :],
    training=True,
    look_ahead_mask=None,
    dec_padding_mask=None,
)
print(out)

In [None]:
#kfolds cv
# Split `data_ges` into `n_batches` folds and run one `train_step` per fold,
# using only the second index array of each split.
kf = KFold(n_splits=n_batches)
for batch, fold_indices in enumerate(kf.split(data_ges)):
    batch_index = fold_indices[1]
    train_step(data_ges[batch_index], cmpds_tokens[batch_index])

In [None]:
#ffn_encoder--back_prop
# Smoke test: run one optimizer step on an FFNEncoderLayer and print the
# weights of its first `ffn_s0` sub-layer before and after the update.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.1)
x = FFNEncoderLayer(d_model=256, dim_s0=[512,256], dim_n0=[512,256], dim_m0=[256,256], dim_m1=[256,256], rate=0.1)
# Named `sig_input` rather than `input` so the builtin input() is not
# shadowed for the rest of the notebook session.
sig_input = tf.random.uniform((3,4))
nois = tf.random.uniform((3,4))
with tf.GradientTape() as tape:
    # Only the forward pass and the scalar being differentiated are recorded.
    y1, y2 = x(sig_input, nois)
    z = tf.reduce_sum((y1-y2))
# Weights before the update (printed after the first call to the layer).
print(x.ffn_s0[0].weights)
# Differentiate and apply the step outside the tape context so the gradient
# ops themselves are not recorded.
gradients = tape.gradient(z, x.trainable_variables)
optimizer.apply_gradients(zip(gradients, x.trainable_variables))
print(x.ffn_s0[0].weights)