In [1]:
import pandas as pd
from tqdm import tqdm
from rdkit import Chem
from rdkit.Chem.QED import qed
from tdc.multi_pred import DTI

from rdkit import RDLogger 
RDLogger.DisableLog('rdApp.*')

import warnings
warnings.filterwarnings(action='ignore')


# binding_db = DTI(name="BindingDB_Kd")
# binding_db.convert_to_log(form='binding')
# binding_db_split = binding_db.get_split()

# davis = DTI(name="Davis")
# davis.convert_to_log(form='binding')
# davis_split = davis.get_split()

kiba = DTI(name="KIBA")
kiba.convert_to_log(form='binding')
kiba_split = kiba.get_split()

Found local copy...
Loading...
Done!
To log space...


In [2]:
def canonicalize_smiles(df):
    for i, smiles in enumerate(df['Drug']):
        mol = Chem.MolFromSmiles(smiles)
        canonical_smiles = Chem.MolToSmiles(mol)
        
        df.loc[i, 'Drug'] = canonical_smiles
        
    return df


# binding_db_split['train'] = canonicalize_smiles(binding_db_split['train'])
# binding_db_split['valid'] = canonicalize_smiles(binding_db_split['valid'])
# binding_db_split['test'] = canonicalize_smiles(binding_db_split['test'])

# davis_split['train'] = canonicalize_smiles(davis_split['train'])
# davis_split['valid'] = canonicalize_smiles(davis_split['valid'])
# davis_split['test'] = canonicalize_smiles(davis_split['test'])

In [3]:
from transformers import BertModel, BertTokenizer
from transformers import PreTrainedTokenizerFast, PreTrainedTokenizer

molecule_tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="data/drug/tokenizer_model/vocab.json",
    pad_token="[PAD]",
    mask_token="[MASK]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    unk_token="[UNK]"
)
molecule_bert = BertModel.from_pretrained("weights/molecule_bert_pretrained-masking_rate_30", local_files_only=True).eval().to('cuda')

# protein_tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False)
# protein_bert = BertModel.from_pretrained("Rostlab/prot_bert").eval().to('cuda')


Some weights of BertModel were not initialized from the model checkpoint at weights/molecule_bert_pretrained-masking_rate_30 and are newly initialized: ['pooler.dense.weight', 'pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
import torch
import numpy as np
from transformers import DataCollatorWithPadding


def get_molecule_feature_vectors(df, molecule_tokenizer, molecule_bert):
    drug_pooler_output = []
    y = []
    
    for i, line in tqdm(df.iterrows(), total=len(df)):
        molecule = " ".join(line['Drug'])

        encoded_drug = molecule_tokenizer(molecule, max_length=128, truncation=True, return_tensors='pt').to("cuda")
        drug_encoder_out = molecule_bert(**encoded_drug)
        
        drug_pooler_output.append(drug_encoder_out.pooler_output.clone().detach().to("cpu"))
        y.append(line['Y'])
        
        del molecule
        del encoded_drug
        del drug_encoder_out
        torch.cuda.empty_cache()
        
    drug_pooler_output = np.array(drug_pooler_output)
    y = np.array(y)

    return drug_pooler_output, y

# train_molecule, train_y = get_molecule_feature_vectors(davis_split['train'], molecule_tokenizer, molecule_bert)
# valid_molecule, valid_y = get_molecule_feature_vectors(davis_split['valid'], molecule_tokenizer, molecule_bert)
# test_molecule, test_y = get_molecule_feature_vectors(davis_split['test'], molecule_tokenizer, molecule_bert)

train_molecule, train_y = get_molecule_feature_vectors(kiba_split['train'], molecule_tokenizer, molecule_bert)
valid_molecule, valid_y = get_molecule_feature_vectors(kiba_split['valid'], molecule_tokenizer, molecule_bert)
test_molecule, test_y = get_molecule_feature_vectors(kiba_split['test'], molecule_tokenizer, molecule_bert)

100%|█████████████████████████████████████| 82360/82360 [18:37<00:00, 73.72it/s]
100%|█████████████████████████████████████| 11766/11766 [02:47<00:00, 70.41it/s]
100%|█████████████████████████████████████| 23531/23531 [05:33<00:00, 70.49it/s]


In [5]:
np.save("data/interaction/kiba/train_molecule.npy", train_molecule)
np.save("data/interaction/kiba/valid_molecule.npy", valid_molecule)
np.save("data/interaction/kiba/test_molecule.npy", test_molecule)

np.save("data/interaction/kiba/train_y.npy", train_y)
np.save("data/interaction/kiba/valid_y.npy", valid_y)
np.save("data/interaction/kiba/test_y.npy", test_y)

In [7]:
temp = np.load("data/interaction/kiba/train_y.npy")
temp

array([7.95078198, 7.95078198, 7.95078198, ..., 7.97477854, 7.97477854,
       7.97477854])

In [None]:
import torch
import numpy as np
from transformers import DataCollatorWithPadding

def get_protein_feature_vectors(df, protein_tokenizer, protein_bert):
    target_pooler_output = []
    y = []
    
    for i, line in tqdm(df.iterrows(), total=len(df)):
        protein = " ".join(line['Target'])
        encoded_target = protein_tokenizer(protein, max_length=2048, truncation=True, return_tensors='pt').to("cuda")
        target_encoder_out = protein_bert(**encoded_target)
        
        target_pooler_output.append(target_encoder_out.pooler_output.clone().detach().to("cpu"))
        
        del protein
        del encoded_target
        del target_encoder_out
        torch.cuda.empty_cache()
        
    target_pooler_output = np.array(target_pooler_output)

    return target_pooler_output

# train_protein = get_protein_feature_vectors(davis_split['train'], protein_tokenizer, protein_bert)
# valid_protein = get_protein_feature_vectors(davis_split['valid'], protein_tokenizer, protein_bert)
# test_protein = get_protein_feature_vectors(davis_split['test'], protein_tokenizer, protein_bert)

train_protein = get_protein_feature_vectors(kiba_split['train'], protein_tokenizer, protein_bert)
valid_protein = get_protein_feature_vectors(kiba_split['valid'], protein_tokenizer, protein_bert)
test_protein = get_protein_feature_vectors(kiba_split['test'], protein_tokenizer, protein_bert)

 30%|█████████▊                       | 24465/82360 [1:32:52<3:14:14,  4.97it/s]

In [None]:
np.save("data/interaction/kiba/train_protein.npy", train_protein)
np.save("data/interaction/kiba/valid_protein.npy", valid_protein)
np.save("data/interaction/kiba/test_protein.npy", test_protein)

## 