In [1]:
import os
import glob
import pandas as pd

from tqdm.auto import tqdm
from rdkit import Chem
from rdkit.Chem import Descriptors

flist = glob.glob("BindingDB*.csv") + glob.glob("BIOSNAP*.csv") + glob.glob("DAVIS*.csv")
unique_smiles = []

for f in flist:
    df = pd.read_csv(f)
    smiles = pd.unique(df.SMILES).tolist()
    
    unique_smiles.extend(smiles)
    
len(unique_smiles)

67143

In [2]:
import parmap

def get_property(smiles):
    try:
        mol = Chem.MolFromSmiles(smiles)
        properties = Descriptors.CalcMolDescriptors(mol)
        result = list(properties.values())
    except:
        result = [0] * 209

    return [smiles] + result

results = parmap.map(get_property, unique_smiles, pm_pbar=True, pm_processes=16)
results_df = pd.DataFrame(results)
results_df.rename(columns = {0 : 'SMILES'}, inplace=True)

results_df

  0%|          | 0/67143 [00:00<?, ?it/s]

[17:55:14] Explicit valence for atom # 13 Be, 3, is greater than permitted
[17:55:14] Unusual charge on atom 0 number of radical electrons set to zero
[17:55:14] Unusual charge on atom 0 number of radical electrons set to zero
[17:55:15] Unusual charge on atom 0 number of radical electrons set to zero
[17:55:15] Unusual charge on atom 0 number of radical electrons set to zero
[17:55:15] Unusual charge on atom 0 number of radical electrons set to zero
[17:55:15] Unusual charge on atom 0 number of radical electrons set to zero
[17:55:15] Unusual charge on atom 0 number of radical electrons set to zero
[17:55:15] Unusual charge on atom 0 number of radical electrons set to zero
[17:55:15] Unusual charge on atom 0 number of radical electrons set to zero
[17:55:15] Unusual charge on atom 0 number of radical electrons set to zero
[17:55:18] Explicit valence for atom # 22 N, 4, is greater than permitted
[17:55:31] Explicit valence for atom # 11 N, 4, is greater than permitted
[17:55:32] Explic

Unnamed: 0,SMILES,1,2,3,4,5,6,7,8,9,...,200,201,202,203,204,205,206,207,208,209
0,CCN(CCO)CCCOc1ccc2c(Nc3cc(CC(=O)Nc4cccc(F)c4)n...,13.317366,13.317366,0.023740,-0.419099,0.202540,507.570,477.330,507.239416,194.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,CSc1cccc(Nc2ncc3cc(-c4c(Cl)cccc4Cl)c(=O)n(C)c3...,13.002835,13.002835,0.234694,-0.234694,0.406163,443.359,427.231,442.042187,146.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,CCCCCCCOc1cccc(c1)C([O-])=O,10.634689,10.634689,0.160632,-1.168152,0.649983,235.303,216.151,235.133968,94.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
3,Cc1ccccc1CSc1nc2c(nc(N)[nH]c2=O)[nH]1,11.673917,11.673917,0.084818,-0.327323,0.637991,287.348,274.244,287.084081,102.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,COc1cc2ncnc(Oc3cccc(NC(=O)Nc4cc(nn4-c4ccc(cc4)...,13.106574,13.106574,0.253237,-0.465999,0.230910,563.618,534.386,563.228102,212.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67138,CN(C)CC1CCN2C=C(C3=CC=CC=C32)C4=C(C5=CN(CCO1)C...,13.201412,13.201412,0.074058,-0.348032,0.457397,468.557,440.333,468.216141,178.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
67139,CC1=CC2=C(C=C1)N=C(C3=NC=C(N23)C)NCCN.Cl,5.540650,5.540650,0.000000,0.000000,0.776787,291.786,273.642,291.125073,106.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
67140,CN1C2=C(C=C(C=C2)OC3=CC(=NC=C3)C4=NC=C(N4)C(F)...,12.853011,12.853011,0.058393,-4.557853,0.246731,518.421,502.293,518.128978,190.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
67141,C1CC1CONC(=O)C2=C(C(=C(C=C2)F)F)NC3=C(C=C(C=C3...,14.276657,14.276657,0.079339,-1.162537,0.451904,478.664,464.552,477.975660,132.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
results_df.to_csv("molecular_property/molecular_property.csv", index=False)

In [30]:
results_df = pd.read_csv("molecular_property/molecular_property.csv")

In [33]:
flist = glob.glob("BindingDB*.csv") + glob.glob("BIOSNAP*.csv") + glob.glob("DAVIS*.csv")

for f in flist:
    df_left = pd.read_csv(f).loc[:, ["SMILES", "Target Sequence", "Label", "Function"]]
    df_left = pd.merge(df_left, results_df, on="SMILES", how="left")

    df_left.to_csv(f, index=False)