In [1]:
import pandas as pd
from tqdm import tqdm
from rdkit import Chem
from rdkit.Chem.QED import qed
from tdc.multi_pred import DTI

from rdkit import RDLogger 
RDLogger.DisableLog('rdApp.*')

import warnings
warnings.filterwarnings(action='ignore')

# binding_db = DTI(name="BindingDB_Kd")
# binding_db.convert_to_log(form='binding')
# binding_db_split = binding_db.get_split()

davis = DTI(name="Davis")
davis.convert_to_log(form='binding')
davis_split = davis.get_split()

Found local copy...
Loading...
Done!
To log space...


In [2]:
def canonicalize_smiles(df):
    for i, smiles in enumerate(df['Drug']):
        mol = Chem.MolFromSmiles(smiles)
        canonical_smiles = Chem.MolToSmiles(mol)
        
        df.loc[i, 'Drug'] = canonical_smiles
        
    return df


# binding_db_split['train'] = canonicalize_smiles(binding_db_split['train'])
# binding_db_split['valid'] = canonicalize_smiles(binding_db_split['valid'])
# binding_db_split['test'] = canonicalize_smiles(binding_db_split['test'])

davis_split['train'] = canonicalize_smiles(davis_split['train'])
davis_split['valid'] = canonicalize_smiles(davis_split['valid'])
davis_split['test'] = canonicalize_smiles(davis_split['test'])

In [3]:
from transformers import BertModel, BertTokenizer
from transformers import PreTrainedTokenizerFast, PreTrainedTokenizer

molecule_tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="data/drug/tokenizer_model/vocab.json",
    pad_token="[PAD]",
    mask_token="[MASK]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    unk_token="[UNK]"
)

molecule_bert = BertModel.from_pretrained("weights/MoleculeBERT_pretrained")

protein_tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False)
protein_bert = BertModel.from_pretrained("Rostlab/prot_bert")

Some weights of the model checkpoint at weights/MoleculeBERT_pretrained were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at weights/MoleculeBERT_pretrained and are newly initialized: ['bert.pooler.dense.bias', 'bert.p

In [4]:
def get_feature_vectors(df, molecule_tokenizer, molecule_bert, protein_tokenizer, protein_bert):
    df["Drug_vector"] = ""
    df["Target_vector"] = ""
    
    for i, line in tqdm(df.iterrows(), total=len(df)):
        drug = line['Drug']
        target = "".join(line['Target'])
        
        encoded_drug = molecule_tokenizer(drug, max_length=128, truncation=True, return_tensors='pt')
        encoded_target = protein_tokenizer(target, return_tensors='pt')
        
        drug_feature_vector = molecule_bert(**encoded_drug)
        target_feature_vector = protein_bert(**encoded_target)
        
        df.at[i, "Drug_vector"] = drug_feature_vector[1].detach().numpy()[0]
        df.at[i, "Target_vector"] = target_feature_vector[1].detach().numpy()[0]
        
    return df

# binding_db_split['train'] = get_feature_vectors(binding_db_split['train'], molecule_tokenizer, molecule_bert, protein_tokenizer, protein_bert)
# binding_db_split['valid'] = get_feature_vectors(binding_db_split['valid'], molecule_tokenizer, molecule_bert, protein_tokenizer, protein_bert)
# binding_db_split['test'] = get_feature_vectors(binding_db_split['test'], molecule_tokenizer, molecule_bert, protein_tokenizer, protein_bert)

davis_split['train'] = get_feature_vectors(davis_split['train'], molecule_tokenizer, molecule_bert, protein_tokenizer, protein_bert)
davis_split['valid'] = get_feature_vectors(davis_split['valid'], molecule_tokenizer, molecule_bert, protein_tokenizer, protein_bert)
davis_split['test'] = get_feature_vectors(davis_split['test'], molecule_tokenizer, molecule_bert, protein_tokenizer, protein_bert)


100%|█████████████████████████████████████████████████████████| 18041/18041 [30:20<00:00,  9.91it/s]
100%|███████████████████████████████████████████████████████████| 2577/2577 [05:04<00:00,  8.47it/s]
100%|███████████████████████████████████████████████████████████| 5154/5154 [10:13<00:00,  8.40it/s]


In [None]:
import pickle 

with open("data/interaction/davis_embedded.pkl", "wb") as f:
    pickle.dump(f, davis_split)