In [None]:
# import torch
# import torch.nn as nn
# import torch.nn.functional as F

# class ProteinCNNEncoder(nn.Module):
#     def __init__(self, vocab_size, embedding_dim, hidden_dims=[512, 256], dropout=0.1):
#         super().__init__()
#         self.dropout = dropout
#         self.embedding = nn.Embedding(vocab_size, embedding_dim)

#         self.branch_1_layer_1 = nn.Conv1d(embedding_dim, hidden_dims[0], kernel_size=5)
#         self.branch_1_layer_2 = nn.Conv1d(hidden_dims[0], hidden_dims[1], kernel_size=5)
        
#         self.branch_2_layer_1 = nn.Conv1d(embedding_dim, hidden_dims[0], kernel_size=10)
#         self.branch_2_layer_2 = nn.Conv1d(hidden_dims[0], hidden_dims[1], kernel_size=10)
        
#         self.branch_3_layer_1 = nn.Conv1d(embedding_dim, hidden_dims[0], kernel_size=15)
#         self.branch_3_layer_2 = nn.Conv1d(hidden_dims[0], hidden_dims[1], kernel_size=15)
        
#         self.branch_4_layer_1 = nn.Conv1d(embedding_dim, hidden_dims[0], kernel_size=20)
#         self.branch_4_layer_2 = nn.Conv1d(hidden_dims[0], hidden_dims[1], kernel_size=20)
        
#         self.branch_5_layer_1 = nn.Conv1d(embedding_dim, hidden_dims[0], kernel_size=30)
#         self.branch_5_layer_2 = nn.Conv1d(hidden_dims[0], hidden_dims[1], kernel_size=30)

    
    
#     def forward(self, x):
#         x = self.embedding(x)
#         x = x.moveaxis(1, 2)
        
#         branch_1 = F.dropout(F.gelu(self.branch_1_layer_1(x)), self.dropout)
#         branch_1 = F.dropout(F.gelu(self.branch_1_layer_2(branch_1)), self.dropout)
#         branch_1, _ = torch.max(branch_1, -1)
        
#         branch_2 = F.dropout(F.gelu(self.branch_2_layer_1(x)), self.dropout)
#         branch_2 = F.dropout(F.gelu(self.branch_2_layer_2(branch_2)), self.dropout)
#         branch_2, _ = torch.max(branch_2, -1)
        
#         branch_3 = F.dropout(F.gelu(self.branch_3_layer_1(x)), self.dropout)
#         branch_3 = F.dropout(F.gelu(self.branch_3_layer_2(branch_3)), self.dropout)
#         branch_3, _ = torch.max(branch_3, -1)
        
#         branch_4 = F.dropout(F.gelu(self.branch_4_layer_1(x)), self.dropout)
#         branch_4 = F.dropout(F.gelu(self.branch_4_layer_2(branch_4)), self.dropout)
#         branch_4, _ = torch.max(branch_4, -1)
        
#         branch_5 = F.dropout(F.gelu(self.branch_5_layer_1(x)), self.dropout)
#         branch_5 = F.dropout(F.gelu(self.branch_5_layer_2(branch_5)), self.dropout)
#         branch_5, _ = torch.max(branch_5, -1)
        
#         x = torch.mean(x, -1)
        
#         return x

In [100]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class ProteinCNNEncoder(nn.Module):
    """Multi-scale 1-D CNN encoder over sequences of token ids.

    Token ids are embedded and fed to five parallel branches, each made of two
    ``Conv1d`` layers sharing one kernel size (5, 10, 15, 20 or 30). Every
    convolution is followed by GELU and dropout. Padding is chosen so that each
    branch returns the input sequence length; the branch outputs are averaged.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: embedding width, i.e. input channels of each first conv.
        hidden_dims: output channels of the first and second conv of every
            branch (only indices 0 and 1 are read).
        dropout: dropout probability applied after each GELU.
    """

    # (kernel_size, padding of layer 1, padding of layer 2) for each branch.
    # Odd kernels use symmetric "same" padding in both layers. Even kernels
    # cannot: layer 1 pads k/2 (length L -> L + 1) and layer 2 pads k/2 - 1
    # (length L + 1 -> L), so every branch still ends at length L.
    _BRANCH_SPECS = (
        (5, 2, 2),
        (10, 5, 4),
        (15, 7, 7),
        (20, 10, 9),
        (30, 15, 14),
    )

    def __init__(self, vocab_size, embedding_dim, hidden_dims=(512, 256), dropout=0.1):
        super().__init__()
        self.dropout = dropout
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # Layers are registered as branch_<i>_layer_<j>, in the same order as
        # before, so state_dict keys and parameter initialisation are unchanged.
        for branch_no, (kernel, pad_first, pad_second) in enumerate(self._BRANCH_SPECS, start=1):
            setattr(
                self,
                f"branch_{branch_no}_layer_1",
                nn.Conv1d(embedding_dim, hidden_dims[0], kernel_size=kernel, padding=pad_first),
            )
            setattr(
                self,
                f"branch_{branch_no}_layer_2",
                nn.Conv1d(hidden_dims[0], hidden_dims[1], kernel_size=kernel, padding=pad_second),
            )

    def _run_branch(self, x, branch_no):
        """Apply conv -> GELU -> dropout twice for the branch numbered ``branch_no``."""
        for layer_no in (1, 2):
            conv = getattr(self, f"branch_{branch_no}_layer_{layer_no}")
            # training=self.training is required: F.dropout defaults to
            # training=True and would otherwise keep dropping activations
            # after model.eval().
            x = F.dropout(F.gelu(conv(x)), p=self.dropout, training=self.training)
        return x

    def forward(self, x):
        """Encode a batch of token ids.

        Args:
            x: integer tensor of shape (batch, seq_len).

        Returns:
            Tensor of shape (batch, seq_len, hidden_dims[1]).
        """
        x = self.embedding(x)
        # Conv1d expects channels first: (batch, embedding_dim, seq_len).
        x = x.moveaxis(1, 2)

        branch_outputs = [
            self._run_branch(x, branch_no)
            for branch_no in range(1, len(self._BRANCH_SPECS) + 1)
        ]

        # Average the branches, then move channels last again.
        x = sum(branch_outputs) / len(branch_outputs)
        x = torch.moveaxis(x, 1, 2)

        return x

# Symbol -> integer id lookup for FASTA characters. "[PAD]" takes id 0 and the
# remaining symbols are numbered 1..28 in the order listed (note X comes after Z).
fasta_stoi = {
    symbol: index
    for index, symbol in enumerate(["[PAD]", *"ABCDEFGHIJKLMNOPQRSTUVWYZX*-"])
}
    
# Build the CNN encoder with one embedding row per entry of fasta_stoi.
embedding_dim = 256
vocab_size = len(fasta_stoi)
protein_cnn_encoder = ProteinCNNEncoder(vocab_size=vocab_size, embedding_dim=embedding_dim)

In [101]:
# Smoke test: run the encoder on one random sequence of 1024 token ids and show the output shape.
# NOTE(review): forward() as defined in this notebook ends with moveaxis(x, 1, 2), i.e. channels
# last, so this should display (1, 1024, 256); the recorded (1, 256, 1024) looks stale — re-run.
out = protein_cnn_encoder(torch.randint(0, vocab_size, (1, 1024)))
out.shape

torch.Size([1, 256, 1024])

In [170]:
def func_attention(query, context, eps=1e-8):
    """Attend from each query position over the context positions.

    The dot-product scores are first softmax-normalised across query positions
    (for every context position), then softmax-normalised again across context
    positions (for every query position). The resulting weights average the
    context vectors.

    Args:
        query: tensor of shape (batch, queryL, d).
        context: tensor of shape (batch, sourceL, d).
        eps: currently unused; kept so existing callers keep working.

    Returns:
        Tensor of shape (batch, queryL, d): one weighted context vector per
        query position.
    """
    # (batch, sourceL, queryL) raw similarity scores.
    attn = torch.bmm(context, query.transpose(1, 2))

    # Pass the axis explicitly: calling softmax without `dim` is deprecated and
    # warns. dim=-1 is the axis the old implicit choice picked on the
    # flattened 2-D views.
    attn = F.softmax(attn, dim=-1)                   # normalise over queryL
    attn = F.softmax(attn.transpose(1, 2), dim=-1)   # (batch, queryL, sourceL), normalise over sourceL

    # (batch, queryL, sourceL) @ (batch, sourceL, d) -> (batch, queryL, d)
    weightedContext = torch.bmm(attn, context)

    return weightedContext

In [222]:
from transformers import BertModel, BertTokenizer

# Load the molecule tokenizer and the pretrained molecule BERT from local paths
# (local_files_only=True, so the weights are not fetched from the hub).
# The recorded warning below reports that the pooler weights are newly initialised,
# so the pooled output of this model is untrained.
molecule_tokenizer = BertTokenizer.from_pretrained("data/drug/molecule_tokenizer")
molecule_bert = BertModel.from_pretrained("weights/molecule_bert_pretrained-masking_rate_30", local_files_only=True)

Some weights of BertModel were not initialized from the model checkpoint at weights/molecule_bert_pretrained-masking_rate_30 and are newly initialized: ['pooler.dense.weight', 'pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [223]:
# Tokenize the space-separated SMILES string into PyTorch tensors and display the encoding.
# NOTE(review): smiles_seq is only assigned in a later cell of the visible notebook, so this
# cell fails on Restart & Run All unless that cell is moved above it.
encoded_smiles = molecule_tokenizer(smiles_seq, return_tensors="pt")
encoded_smiles

{'input_ids': tensor([[ 2, 30, 53, 19,  7, 63, 35,  8, 63, 53, 20, 53, 53, 53,  5, 10, 53, 21,
         53, 63, 53, 53,  5, 41, 30, 30,  5, 40,  6, 30, 53, 22, 53, 53, 53, 53,
         53, 22,  6, 53, 21,  6, 53, 53, 19, 20,  3]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [214]:
# Same encode-and-display step as the previous cell (duplicate).
# NOTE(review): the recorded output here lacks the leading 2 / trailing 3 ids seen in the
# other run, so the tokenizer presumably changed between executions — keep only one cell.
encoded_smiles = molecule_tokenizer(smiles_seq, return_tensors="pt")
encoded_smiles

{'input_ids': tensor([[30, 53, 19,  7, 63, 35,  8, 63, 53, 20, 53, 53, 53,  5, 10, 53, 21, 53,
         63, 53, 53,  5, 41, 30, 30,  5, 40,  6, 30, 53, 22, 53, 53, 53, 53, 53,
         22,  6, 53, 21,  6, 53, 53, 19, 20]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [206]:
from tdc.multi_pred import DTI

# Load the Davis drug-target interaction dataset and its default split.
davis = DTI(name="Davis")
davis_split = davis.get_split()

# Insert a space between every character of the first training drug so the tokenizer
# sees one symbol per whitespace-separated token (assumes the "Drug" column holds SMILES strings).
smiles_seq = " ".join(davis_split['train'].loc[0, "Drug"])
encoded_smiles = molecule_tokenizer(smiles_seq, return_tensors="pt")

Found local copy...
Loading...
Done!


In [207]:
# Display the tokenizer output for the first training drug.
encoded_smiles

{'input_ids': tensor([[30, 53, 19,  7, 63, 35,  8, 63, 53, 20, 53, 53, 53,  5, 10, 53, 21, 53,
         63, 53, 53,  5, 41, 30, 30,  5, 40,  6, 30, 53, 22, 53, 53, 53, 53, 53,
         22,  6, 53, 21,  6, 53, 53, 19, 20]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [185]:
# Load the Rostlab/prot_bert tokenizer with case preserved and a 2048-token length limit.
# NOTE(review): protein_tokenizer is reassigned to a RobertaTokenizer in later cells; which
# tokenizer is live depends on execution order.
protein_tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False, model_max_length=2048)
# Space-separate the characters of the first training target sequence.
fasta_seq = " ".join(davis_split['train'].loc[0, "Target"])

In [184]:
# Shape of the token-id tensor produced for fasta_seq. The recorded run shows 963 ids for a
# 961-character target, presumably two special tokens added by the tokenizer.
protein_tokenizer(fasta_seq, return_tensors="pt")['input_ids'].shape

torch.Size([1, 963])

In [186]:
# Length of the first training target entry.
len(davis_split['train'].loc[0, "Target"])

961

In [175]:
# Length of the first training drug entry.
len(davis_split['train'].loc[0, "Drug"])

45

In [188]:
# Token ids for the space-separated SMILES string as a plain Python list.
molecule_tokenizer.encode(smiles_seq)

[30,
 53,
 19,
 7,
 63,
 35,
 8,
 63,
 53,
 20,
 53,
 53,
 53,
 5,
 10,
 53,
 21,
 53,
 63,
 53,
 53,
 5,
 41,
 30,
 30,
 5,
 40,
 6,
 30,
 53,
 22,
 53,
 53,
 53,
 53,
 53,
 22,
 6,
 53,
 21,
 6,
 53,
 53,
 19,
 20]

In [12]:
from transformers import RobertaTokenizer, RobertaModel, RobertaForMaskedLM, BertForMaskedLM

# Build a RoBERTa-style tokenizer directly from the BPE vocab/merges files.
# NOTE(review): this rebinds protein_tokenizer, which another cell sets to a BertTokenizer.
protein_tokenizer = RobertaTokenizer(
    vocab_file="data/target/bpe_tokenizer/vocab.json", 
    merges_file="data/target/bpe_tokenizer/merges.txt"
)

# Displays the whole token -> id mapping; the output is very long, so consider showing
# only the vocabulary size or a small slice.
protein_tokenizer.get_vocab()

{'<s>': 0,
 '<pad>': 1,
 '</s>': 2,
 '<unk>': 3,
 '<mask>': 4,
 '!': 5,
 '"': 6,
 '#': 7,
 '$': 8,
 '%': 9,
 '&': 10,
 "'": 11,
 '(': 12,
 ')': 13,
 '*': 14,
 '+': 15,
 ',': 16,
 '-': 17,
 '.': 18,
 '/': 19,
 '0': 20,
 '1': 21,
 '2': 22,
 '3': 23,
 '4': 24,
 '5': 25,
 '6': 26,
 '7': 27,
 '8': 28,
 '9': 29,
 ':': 30,
 ';': 31,
 '<': 32,
 '=': 33,
 '>': 34,
 '?': 35,
 '@': 36,
 'A': 37,
 'B': 38,
 'C': 39,
 'D': 40,
 'E': 41,
 'F': 42,
 'G': 43,
 'H': 44,
 'I': 45,
 'J': 46,
 'K': 47,
 'L': 48,
 'M': 49,
 'N': 50,
 'O': 51,
 'P': 52,
 'Q': 53,
 'R': 54,
 'S': 55,
 'T': 56,
 'U': 57,
 'V': 58,
 'W': 59,
 'X': 60,
 'Y': 61,
 'Z': 62,
 '[': 63,
 '\\': 64,
 ']': 65,
 '^': 66,
 '_': 67,
 '`': 68,
 'a': 69,
 'b': 70,
 'c': 71,
 'd': 72,
 'e': 73,
 'f': 74,
 'g': 75,
 'h': 76,
 'i': 77,
 'j': 78,
 'k': 79,
 'l': 80,
 'm': 81,
 'n': 82,
 'o': 83,
 'p': 84,
 'q': 85,
 'r': 86,
 's': 87,
 't': 88,
 'u': 89,
 'v': 90,
 'w': 91,
 'x': 92,
 'y': 93,
 'z': 94,
 '{': 95,
 '|': 96,
 '}': 97,
 '~': 98,
 

In [17]:
from tdc.multi_pred import DTI

# Reload the Davis dataset and its default split (repeats an earlier cell).
davis = DTI(name="Davis")
davis_split = davis.get_split()

# Space-separate the characters of the first training target and tokenize it.
# NOTE(review): despite its name, protein_smiles holds the encoding of a target sequence
# (the "Target" column), not a SMILES string.
fasta_seq = " ".join(davis_split['train'].loc[0, "Target"])
protein_smiles = protein_tokenizer(fasta_seq, return_tensors="pt")

Found local copy...
Loading...
Done!


In [18]:
# Display the encoded target sequence (input_ids and attention_mask).
protein_smiles

{'input_ids': tensor([[  0,  49, 225,  ..., 225,  48,   2]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]])}

In [13]:
# Tokenize the first element of new_X.
# NOTE(review): new_X is not defined anywhere in the visible notebook — hidden kernel state.
protein_tokenizer(new_X[0], return_tensors="pt")

{'input_ids': tensor([[    0,  6817,  2268,   274,  1124,  3444,   467,    58,  1044,   362,
           427,  1357,   385,  2455,  6871,   500,   357,   369,   364,   352,
           349,   369,  7345,   299,   510,   383,  2787,   458,  5887,   423,
           565,  9333,  1307,   486,  3990,  1387,   386,   350,   372,   307,
           496,   334,   451,   381,  2917,   643,   851,   313,  1110,  6242,
          1403,   384,   619,  1557,   283,   372,    59,  5076,   363,   634,
          6876,   477,  6195,   418,  4034,  1678,   399,  1309,   410,   594,
          1059,   269,  2939,  3371,  5455,   300,  1201,   380,   331,  2926,
           272,   990,   462,   358,   410,  1605,  2228,   391,   348,   322,
           283,  3422,   302,   281,   548,   309,   386,   523,  8763,   782,
          3355,  1192,   338,    50,   678,  2578,    59,  2085,   391,   320,
          9262,   379,   317,   678,  6753,   852,   268,    49,  1261,  1002,
            50,   775,   305,  1641,  

In [16]:
from transformers import RobertaConfig, RobertaForMaskedLM
from transformers import RobertaTokenizer, RobertaModel, RobertaForMaskedLM, BertForMaskedLM

# Load the saved protein tokenizer and size the model vocabulary from it.
# NOTE(review): vocab_size is also assigned from fasta_stoi in an earlier cell; this rebinds it.
protein_tokenizer = RobertaTokenizer.from_pretrained("data/target/protein_tokenizer")
vocab_size = len(protein_tokenizer.get_vocab())

print(f"load tokenizer\nvocab size: {vocab_size}\nspecial tokens: {protein_tokenizer.all_special_tokens}")


# Small RoBERTa: 8 layers, 8 heads, hidden width 128.
config = RobertaConfig(
    # vocabulary / embeddings
    vocab_size=vocab_size,
    type_vocab_size=1,
    # NOTE(review): hard-coded; confirm it equals protein_tokenizer.pad_token_id.
    pad_token_id=1,
    # NOTE(review): RoBERTa offsets position ids by pad_token_id + 1, so presumably fewer
    # than 128 tokens are usable per sequence — confirm against the RobertaConfig docs.
    max_position_embeddings=128,
    position_embedding_type="absolute",
    # encoder size
    hidden_size=128,
    intermediate_size=512,
    num_hidden_layers=8,
    num_attention_heads=8,
    hidden_act="gelu",
    # regularisation
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
)

# Randomly initialised masked-LM model built from the config above.
model = RobertaForMaskedLM(config)

load tokenizer
vocab size: 10261
special tokens: ['<s>', '</s>', '<unk>', '<pad>', '<mask>']


In [17]:
# Display the module tree of the freshly built masked-LM model.
model

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(10261, 128, padding_idx=1)
      (position_embeddings): Embedding(128, 128, padding_idx=1)
      (token_type_embeddings): Embedding(1, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNor

In [11]:
from tdc.multi_pred import DTI

# Load the three drug-target interaction datasets from TDC. The recorded output shows
# Davis and Kiba read from a local copy, while BindingDB triggered a 54.4M download.
davis = DTI(name="Davis")
kiba = DTI(name="Kiba")
binding_db = DTI(name="BindingDB")

Found local copy...
Loading...
Done!
Found local copy...
Loading...
Done!
Downloading...
100%|█████████████████████████████████████| 54.4M/54.4M [00:06<00:00, 8.70MiB/s]
Loading...
Done!


In [12]:
import numpy as np

# Distinct target sequences of each dataset, in the order Davis, Kiba, BindingDB.
davis_unique_target, kiba_unique_target, binding_db_unique_target = (
    np.unique(dataset.get_data().Target.values)
    for dataset in (davis, kiba, binding_db)
)

# NOTE(review): uniqueness is enforced per dataset only; a sequence present in more than
# one dataset appears more than once in target_sequences.
target_sequences = np.concatenate(
    [davis_unique_target, kiba_unique_target, binding_db_unique_target]
)
target_sequences.shape

(2021,)

In [13]:
import pickle

# Persist the concatenated target sequences as a pickle file (overwrites any existing file).
# Only load this file back from trusted locations: unpickling can execute arbitrary code.
with open("data/target/target_sequences.pkl", "wb") as f:
    pickle.dump(target_sequences, f)