In [1]:
import torch
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
from tdc.multi_pred import DTI
from transformers import AutoTokenizer, AutoModel

# Molecule side: tokenizer plus encoder weights from the Hugging Face hub.
# nn.Module.to() and .eval() both return the module, so the encoder is moved to
# the GPU and switched to evaluation mode in one chained expression.
mol_tokenizer = AutoTokenizer.from_pretrained("jonghyunlee/DrugLikeMoleculeBERT")
mol_encoder = AutoModel.from_pretrained("jonghyunlee/DrugLikeMoleculeBERT").to("cuda").eval()

# Protein side: same recipe with the ProtBert checkpoint.
prot_tokenizer = AutoTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False)
prot_encoder = AutoModel.from_pretrained("Rostlab/prot_bert").to("cuda").eval()

# Print an empty line as the cell's last statement.
print()

Some weights of the model checkpoint at Rostlab/prot_bert were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).





In [2]:
def get_unique(df):
    """Return the unique drugs and the unique targets of an interaction table.

    Parameters
    ----------
    df : pandas.DataFrame or loader object
        Either the interaction table itself, or an object whose ``get_data()``
        method returns it. The table must contain the columns ``Drug_ID``,
        ``Drug``, ``Target_ID`` and ``Target``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(mols, prots)``: the de-duplicated ``Drug_ID``/``Drug`` pairs and the
        de-duplicated ``Target_ID``/``Target`` pairs, each with a fresh
        0..n-1 index.

    Raises
    ------
    KeyError
        If one of the required columns is missing from the table.
    """
    # Decide explicitly how to obtain the table instead of wrapping everything
    # in a bare ``except``: that hid real errors (e.g. a missing column) behind
    # an unrelated failure of the fallback branch.
    if isinstance(df, pd.DataFrame) or not hasattr(df, "get_data"):
        frame = df
    else:
        # Fetch the table once and reuse it for both projections.
        frame = df.get_data()

    mols = frame.loc[:, ["Drug_ID", "Drug"]].drop_duplicates().reset_index(drop=True)
    prots = frame.loc[:, ["Target_ID", "Target"]].drop_duplicates().reset_index(drop=True)

    return mols, prots


# Load the three interaction sources first (two TDC loaders, one local CSV) ...
davis = DTI(name="davis")
kiba = DTI(name="kiba")
biosnap = pd.read_csv("data/BIOSNAP.csv")

# ... then reduce each of them to its unique molecules and unique proteins.
davis_mols, davis_prots = get_unique(davis)
kiba_mols, kiba_prots = get_unique(kiba)
biosnap_mols, biosnap_prots = get_unique(biosnap)

Found local copy...
Loading...
Done!
Found local copy...
Loading...
Done!


In [3]:
# Create the BindingDB_Kd loader and call convert_to_log(form="binding") on it
# (the cell output reports "To log space..." for this step).
binding = DTI(name="BindingDB_Kd")
binding.convert_to_log(form="binding")
# From here on `binding` no longer refers to the loader but to the table
# returned by get_data(); the code below iterates its rows and writes it to CSV.
binding = binding.get_data()

def generate_prot_dict(path="data/BindingDB_prot_ID.txt"):
    """Build a mapping from protein sequence to protein name.

    The file is read as consecutive two-line records: a metadata line followed
    by a sequence line. The name is made of every space-separated field of the
    metadata line from the fourth one onwards, joined with underscores.

    Parameters
    ----------
    path : str, optional
        Location of the record file. Defaults to the path this notebook has
        always used, so existing zero-argument calls behave as before.

    Returns
    -------
    dict
        ``{sequence: name}``. If a sequence occurs more than once, the last
        record wins.

    Raises
    ------
    ValueError
        If the file ends with a metadata line that has no sequence line.
    """
    with open(path, "r") as f:
        lines = f.readlines()

    # A single trailing blank line is harmless; drop it rather than failing
    # with an IndexError while looking for its partner line.
    if len(lines) % 2 == 1 and not lines[-1].strip():
        lines.pop()
    if len(lines) % 2 == 1:
        raise ValueError(
            f"{path}: expected metadata/sequence line pairs, got {len(lines)} lines"
        )

    prot_dict = {}
    # Even-numbered lines are metadata, odd-numbered lines the matching sequence.
    for meta_line, fasta_line in zip(lines[0::2], lines[1::2]):
        prot_name = "_".join(meta_line.rstrip().split(" ")[3:])
        prot_dict[fasta_line.rstrip()] = prot_name

    return prot_dict
    
prot_dict = generate_prot_dict()

# Relabel every interaction with the name looked up from its target sequence.
# Series.map does the lookup for the whole column at once, replacing the
# row-by-row `iterrows` + `.loc` assignment.
target_ids = binding["Target"].map(prot_dict)

# Keep the strictness of the original dictionary lookup: an unknown sequence is
# an error, not a silent NaN label.
if target_ids.isna().any():
    raise KeyError(
        f"{int(target_ids.isna().sum())} row(s) have a Target sequence that is "
        "missing from the protein ID dictionary"
    )

# assign() returns a new frame, so the table from get_data() is not mutated in place.
binding = binding.assign(Target_ID=target_ids)

binding.to_csv("data/BindingDB_new_ID.csv", index=False)
binding_mols, binding_prots = get_unique(binding)

Found local copy...
Loading...
Done!
To log space...


In [4]:
def get_embeddings(df, fname, mode="mol"):
    """Encode every row of ``df`` and pickle the resulting features.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose first column is the identifier and whose second column is
        the sequence string (e.g. the frames returned by ``get_unique``).
    fname : str
        Stem of the output files; ``data/<fname>_cls.pkl`` and
        ``data/<fname>_full.pkl`` are written.
    mode : {"mol", "prot"}, optional
        ``"mol"`` uses ``mol_tokenizer``/``mol_encoder`` with a length of 128,
        ``"prot"`` uses ``prot_tokenizer``/``prot_encoder`` with a length of 1024.

    Returns
    -------
    None
        The features are only written to disk.

    Raises
    ------
    ValueError
        If ``mode`` is neither ``"mol"`` nor ``"prot"``.
    """
    # Validate up front: an unknown mode used to reach the feature extraction
    # with `output` undefined (NameError) or left over from a previous row.
    if mode == "mol":
        tokenizer, encoder, max_length = mol_tokenizer, mol_encoder, 128
    elif mode == "prot":
        tokenizer, encoder, max_length = prot_tokenizer, prot_encoder, 1024
    else:
        raise ValueError(f"mode must be 'mol' or 'prot', got {mode!r}")

    cls_feature_dict = {}
    full_feature_dict = {}

    print(fname)
    # Take the first two columns by position explicitly; integer keys on a
    # label-indexed row Series (`data[0]`) are deprecated in recent pandas.
    rows = df.iloc[:, :2].itertuples(index=False, name=None)

    # Pure feature extraction: no autograd graph is needed, so do not build one.
    with torch.no_grad():
        for name, seq in tqdm(rows, total=len(df)):
            # Characters are space-separated and the text is padded with literal
            # [PAD] tokens up to max_length; longer inputs are truncated.
            # NOTE(review): padding is done in the text rather than by the
            # tokenizer, so the attention mask presumably covers the [PAD]
            # positions too. Kept as-is to leave the embeddings unchanged.
            text = " ".join(seq) + " [PAD]" * (max_length - len(seq))
            X = tokenizer.encode_plus(text, return_tensors="pt",
                                      max_length=max_length, truncation=True)
            output = encoder(**X.to("cuda"))

            # output[0] / output[1]: presumably the per-token hidden states and
            # the pooled [CLS] vector (BertModel convention) - TODO confirm for
            # the molecule checkpoint.
            cls_feature_dict[name] = output[1].to("cpu")
            full_feature_dict[name] = output[0].to("cpu")

    with open("data/" + fname + "_cls.pkl", "wb") as f:
        pickle.dump(cls_feature_dict, f)

    with open("data/" + fname + "_full.pkl", "wb") as f:
        pickle.dump(full_feature_dict, f)
        
        
# Molecule embeddings, one pair of pickle files per dataset.
for embed_name, embed_frame in (
    ("davis_mols", davis_mols),
    ("kiba_mols", kiba_mols),
    ("binding_mols", binding_mols),
    ("biosnap_mols", biosnap_mols),
):
    get_embeddings(embed_frame, embed_name, "mol")

# Protein embeddings, same dataset order.
for embed_name, embed_frame in (
    ("davis_prots", davis_prots),
    ("kiba_prots", kiba_prots),
    ("binding_prots", binding_prots),
    ("biosnap_prots", biosnap_prots),
):
    get_embeddings(embed_frame, embed_name, "prot")

davis_mols


100%|███████████████████████████████████████████| 68/68 [00:01<00:00, 44.29it/s]


kiba_mols


100%|███████████████████████████████████████| 2068/2068 [00:21<00:00, 98.28it/s]


binding_mols


100%|████████████████████████████████████| 10661/10661 [01:38<00:00, 108.09it/s]


biosnap_mols


100%|██████████████████████████████████████| 4510/4510 [00:37<00:00, 118.77it/s]


davis_prots


100%|█████████████████████████████████████████| 379/379 [00:30<00:00, 12.35it/s]


kiba_prots


100%|█████████████████████████████████████████| 229/229 [00:18<00:00, 12.28it/s]


binding_prots


100%|███████████████████████████████████████| 1413/1413 [01:57<00:00, 11.98it/s]


biosnap_prots


100%|███████████████████████████████████████| 2182/2182 [03:03<00:00, 11.89it/s]


In [5]:
from glob import glob

def generate_merged_dict(flist, fname):
    """Merge several pickled dictionaries into one pickle file.

    The files in ``flist`` are loaded in the given order and folded into a
    single dictionary, so for a key present in more than one file the value
    from the later file is kept. The result is written to
    ``data/<fname>.pkl``; nothing is returned.

    NOTE: ``pickle.load`` executes code embedded in the file, so only pass
    files produced by this notebook.
    """
    combined = {}

    for path in flist:
        with open(path, "rb") as handle:
            combined.update(pickle.load(handle))

    out_path = "data/" + fname + ".pkl"
    with open(out_path, "wb") as handle:
        pickle.dump(combined, handle)

# Collect the per-dataset embedding pickles. glob() returns paths in arbitrary
# order and generate_merged_dict lets later files overwrite earlier ones when
# an ID occurs in several datasets, so sort the lists to make the merged
# result reproducible across machines and runs.
# (The "cls" variants are currently disabled.)
# mols_cls_list = sorted(glob("data/*_mols_cls.pkl"))
mols_full_list = sorted(glob("data/*_mols_full.pkl"))
# prots_cls_list = sorted(glob("data/*_prots_cls.pkl"))
prots_full_list = sorted(glob("data/*_prots_full.pkl"))

# generate_merged_dict(mols_cls_list, "mols_cls")
generate_merged_dict(mols_full_list, "mols_full")
# generate_merged_dict(prots_cls_list, "prots_cls")
generate_merged_dict(prots_full_list, "prots_full")

In [9]:
def preprocessing(dataset_name, random_state=None):
    """Load one interaction dataset, label it and return it shuffled.

    Parameters
    ----------
    dataset_name : str
        ``"davis"`` (TDC loader, converted with ``convert_to_log``),
        ``"BindingDB"`` (``data/BindingDB_new_ID.csv``), ``"kiba"`` (TDC
        loader) or anything else, which loads ``data/BIOSNAP.csv``.
    random_state : int, numpy.random.RandomState or None, optional
        Forwarded to ``DataFrame.sample`` for the final shuffle. The default
        ``None`` keeps the previous behaviour (NumPy's global random state);
        pass an integer to get the same row order on every run.

    Returns
    -------
    pandas.DataFrame
        The shuffled table with a fresh 0..n-1 index and the added columns
        ``Source`` and ``Source_ID``. For every dataset except BIOSNAP a binary
        ``Y_label`` column is derived from ``Y`` (1 when ``Y`` is at least 7
        for DAVIS/BindingDB, at least 12.1 for KIBA); the BIOSNAP table is
        used as loaded.
    """
    if dataset_name == "davis":
        loader = DTI(name=dataset_name)
        loader.convert_to_log(form="binding")
        df = loader.get_data()
        threshold, source, source_id = 7, "DAVIS", 0
    elif dataset_name == "BindingDB":
        df = pd.read_csv("data/BindingDB_new_ID.csv")
        threshold, source, source_id = 7, "BindingDB", 1
    elif dataset_name == "kiba":
        df = DTI(name=dataset_name).get_data()
        threshold, source, source_id = 12.1, "KIBA", 2
    else:
        # Any other name falls back to BIOSNAP, as before; no Y_label is derived here.
        df = pd.read_csv("data/BIOSNAP.csv")
        threshold, source, source_id = None, "BIOSNAP", 3

    if threshold is not None:
        # Vectorised form of `1 if y >= threshold else 0` (NaN compares False -> 0).
        df["Y_label"] = (df["Y"] >= threshold).astype("int64")
    df["Source"] = source
    df["Source_ID"] = source_id

    # Shuffle all rows; random_state makes the order reproducible on request.
    return df.sample(frac=1, random_state=random_state).reset_index(drop=True)

# preprocessing() shuffles each table with DataFrame.sample, which draws from
# NumPy's global random state when no random_state is given. The fold splits
# built later depend on that row order, so fix the seed here to make the
# generated folds reproducible from a fresh kernel.
np.random.seed(42)

davis_df = preprocessing("davis")
binding_df = preprocessing("BindingDB")
kiba_df = preprocessing("kiba")
biosnap_df = preprocessing("BIOSNAP")

Found local copy...
Loading...
Done!
To log space...
Found local copy...
Loading...
Done!


In [10]:
from sklearn.model_selection import StratifiedKFold

# A single splitter with scikit-learn's default settings serves all four datasets.
fold = StratifiedKFold()

# For each dataset: a list with one [train_index, test_index] pair per fold,
# stratified on the binary Y_label column.
davis_index = [[train_index, test_index]
               for train_index, test_index in fold.split(davis_df, davis_df.Y_label)]

binding_index = [[train_index, test_index]
                 for train_index, test_index in fold.split(binding_df, binding_df.Y_label)]

kiba_index = [[train_index, test_index]
              for train_index, test_index in fold.split(kiba_df, kiba_df.Y_label)]

biosnap_index = [[train_index, test_index]
                 for train_index, test_index in fold.split(biosnap_df, biosnap_df.Y_label)]
   

In [11]:
# Each source table paired with its per-fold [train_index, test_index] list,
# in the order in which the rows are stacked inside every fold file.
dataset_folds = [
    (davis_df, davis_index),
    (binding_df, binding_index),
    (kiba_df, kiba_index),
    (biosnap_df, biosnap_index),
]

# Follow the number of folds actually generated instead of hard-coding 5.
for n_fold in range(len(davis_index)):
    # DataFrame.append was removed in pandas 2.0; build each fold with a single
    # pd.concat. The split indices are positional, hence .iloc, and
    # ignore_index=True gives the combined frame a fresh 0..n-1 index.
    fold_train_df = pd.concat(
        [frame.iloc[indices[n_fold][0]] for frame, indices in dataset_folds],
        ignore_index=True,
    )
    fold_test_df = pd.concat(
        [frame.iloc[indices[n_fold][1]] for frame, indices in dataset_folds],
        ignore_index=True,
    )

    with open(f"data/fold_number_{n_fold}_train.pkl", "wb") as f:
        pickle.dump(fold_train_df, f)

    with open(f"data/fold_number_{n_fold}_test.pkl", "wb") as f:
        pickle.dump(fold_test_df, f)
