In [1]:
import pandas as pd
import seaborn as sns
from rdkit import Chem
from tdc.multi_pred import DTI

def _load_dti(tdc_name):
    """Fetch one TDC drug-target-interaction dataset and split it.

    The dataset is loaded by name, its labels are converted with
    ``convert_to_log(form="binding")``, and ``get_split()`` is called with
    its default arguments.

    Returns:
        A ``(dataset, split)`` pair, where ``split`` is whatever
        ``get_split()`` returns (indexed below by "train"/"valid"/"test").
    """
    dataset = DTI(name=tdc_name)
    dataset.convert_to_log(form="binding")
    return dataset, dataset.get_split()


# NOTE(review): the same "binding" log transform is applied to KIBA as to the
# other two datasets — confirm that is intended for KIBA's label scale.
davis, davis_split = _load_dti("DAVIS")
kiba, kiba_split = _load_dti("KIBA")
binding_db, binding_db_split = _load_dti("BindingDB_Kd")

Downloading...
100%|██████████| 26.3M/26.3M [00:03<00:00, 8.70MiB/s]
Loading...
Done!
To log space...
Downloading...
100%|██████████| 97.2M/97.2M [00:09<00:00, 10.3MiB/s]
Loading...
Done!
To log space...
Downloading...
100%|██████████| 54.4M/54.4M [00:05<00:00, 9.14MiB/s]
Loading...
Done!
To log space...


In [2]:
# Stack the matching partition of every dataset (DAVIS, then KIBA, then
# BindingDB) row-wise and renumber the index from zero.
train_df, valid_df, test_df = (
    pd.concat(
        [source[part] for source in (davis_split, kiba_split, binding_db_split)],
        axis=0,
    ).reset_index(drop=True)
    for part in ("train", "valid", "test")
)

print(f"train: {train_df.shape} valid: {valid_df.shape} test: {test_df.shape}")

train: (138552, 5) valid: (19793, 5) test: (39586, 5)


In [3]:
# Take the first training row and show the SMILES string stored in its
# "Drug" column; `sample` is reused by the next cell.
temp = train_df.iloc[0]
sample = temp["Drug"]
sample

'Cc1[nH]nc2ccc(-c3cncc(OCC(N)Cc4ccccc4)c3)cc12'

In [10]:
from rdkit import Chem

# Parse the sample SMILES, then write it back out with stereo information
# dropped (isomericSmiles=False) and in Kekulé form (kekuleSmiles=True).
sample_mol = Chem.MolFromSmiles(sample)
Chem.MolToSmiles(sample_mol, isomericSmiles=False, kekuleSmiles=True)

'CC1=C2C=C(C3=CN=CC(OCC(N)CC4=CC=CC=C4)=C3)C=CC2=NN1'

In [11]:
from functools import lru_cache

from rdkit import Chem


@lru_cache(maxsize=None)
def to_kekule_smiles(smiles):
    """Re-encode a SMILES string in Kekulé form without stereo information.

    Results are memoized: the frames repeat each drug across many targets,
    so every distinct SMILES only needs to be parsed once.

    Args:
        smiles: SMILES string to convert.

    Returns:
        The string produced by ``Chem.MolToSmiles`` with
        ``isomericSmiles=False`` and ``kekuleSmiles=True``.

    Raises:
        ValueError: if RDKit cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail here
        # with the offending string instead of inside MolToSmiles.
        raise ValueError(f"RDKit could not parse SMILES: {smiles!r}")
    return Chem.MolToSmiles(mol, isomericSmiles=False, kekuleSmiles=True)


# Rewrite the Drug column of every split with the same conversion.
for split_df in (train_df, valid_df, test_df):
    split_df["Drug"] = split_df["Drug"].map(to_kekule_smiles)

In [13]:
import pickle
from pathlib import Path

# Write each split to ./data/DTI/DTI_<split>.pickle.
dti_dir = Path("./data/DTI")
# Create the output directory if needed so the cell also runs on a fresh
# checkout instead of failing with FileNotFoundError.
dti_dir.mkdir(parents=True, exist_ok=True)

for split_name, split_frame in (
    ("train", train_df),
    ("valid", valid_df),
    ("test", test_df),
):
    with open(dti_dir / f"DTI_{split_name}.pickle", "wb") as f:
        pickle.dump(split_frame, f)

In [12]:
# Display the combined training frame to inspect its columns and rows.
train_df

Unnamed: 0,Drug_ID,Drug,Target_ID,Target,Y
0,11314340,CC1=C2C=C(C3=CN=CC(OCC(N)CC4=CC=CC=C4)=C3)C=CC...,ABL1p,PFWKILNPLLERGTYYYFMGQQPGKVLGDQRRPSLPALHFIKGAGK...,4.999996
1,11314340,CC1=C2C=C(C3=CN=CC(OCC(N)CC4=CC=CC=C4)=C3)C=CC...,ABL2,MVLGTVLLPPNSYGRDQDTSLCCLCTEASESALPDLTDHFASCVED...,4.999996
2,11314340,CC1=C2C=C(C3=CN=CC(OCC(N)CC4=CC=CC=C4)=C3)C=CC...,ACVR1B,MAESAGASSFFPLVVLLLAGSGGSGPRGVQALLCACTSCLQANYTC...,4.999996
3,11314340,CC1=C2C=C(C3=CN=CC(OCC(N)CC4=CC=CC=C4)=C3)C=CC...,ACVRL1,MTLGSPRKGLLMLLMALVTQGDPVKPSRGPLVTCTCESPHCKGPTC...,4.999996
4,11314340,CC1=C2C=C(C3=CN=CC(OCC(N)CC4=CC=CC=C4)=C3)C=CC...,ADCK3,MAAILGDTIMVAKGLVKLTQAAVETHLQHLGIGGELIMAARALQST...,4.999996
...,...,...,...,...,...
138547,53358942.0,COC1=CC(C(=O)O)=CC=C1NC(=O)C1NC(CC(C)(C)C)C(C#...,,MCNTNMSVPTDGAVTTSQIPASEQETLVRPKPLLLKLLKSVGAQKD...,9.602060
138548,53476877.0,CC(C)(C)CC1NC(C(=O)NC2CCC(O)CC2)C(C2=CC=CC(Cl)...,,MCNTNMSVPTDGAVTTSQIPASEQETLVRPKPLLLKLLKSVGAQKD...,8.552842
138549,58573469.0,CC(C)C(CS(=O)(=O)C(C)C)N1C(=O)C(C)(CC(=O)O)CC(...,,MCNTNMSVPTDGAVTTSQIPASEQETLVRPKPLLLKLLKSVGAQKD...,9.838632
138550,113557.0,CCCCCCCOC1OC(CO)C(O)C(O)C1O,P08191,MKRVITLFAVLLMGWSVNAWSFACKTANGTAIPIGGGSANVYVNLA...,7.767004
