In [1]:
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem.Fingerprints import FingerprintMols
import pandas as pd

In [2]:
# read the samples
df_1 = pd.read_csv('DFT_eval_500samples.csv')

# proof and make a list of SMILES
df_smiles = df_1['SMILES']
c_smiles = []
for ds in df_smiles:
    try:
        cs = Chem.CanonSmiles(ds)
        c_smiles.append(cs)
    except:
        print('Invalid SMILES:', ds)
print()

# make a list of mols
ms = [Chem.MolFromSmiles(x) for x in c_smiles]

# make a list of fingerprints (fp)
fps = [FingerprintMols.FingerprintMol(x) for x in ms]

# the list for the dataframe
qu, ta, sim = [], [], []

# compare all fp pairwise without duplicates
for n in range(len(fps)-1): # -1 so the last fp will not be used
    s = DataStructs.BulkTanimotoSimilarity(fps[n], fps[n+1:]) # +1 compare with the next to the last fp
    print (s)
    print(c_smiles[n], c_smiles[n+1:]) # witch mol is compared with what group
        # collect the SMILES and values
    for m in range(len(s)):
        qu.append(c_smiles[n])
        ta.append(c_smiles[n+1:][m])
        sim.append(s[m])
print()

# build the dataframe and sort it
d = {'query':qu, 'target':ta, 'Similarity':sim}
df_final = pd.DataFrame(data=d)
df_final = df_final.sort_values('Similarity', ascending=False)
print(df_final)


[0.17543859649122806, 0.29213483146067415, 0.26627218934911245, 0.23655913978494625, 0.2689655172413793, 0.22727272727272727, 0.20809248554913296, 0.33663366336633666, 0.2904761904761905, 0.27450980392156865, 0.33544303797468356, 0.44565217391304346, 0.4426229508196721, 0.23225806451612904, 0.3235294117647059, 0.23741007194244604, 0.5263157894736842, 0.2535211267605634, 0.21768707482993196, 0.32673267326732675, 0.28859060402684567, 0.37373737373737376, 0.30092592592592593, 0.23717948717948717, 0.30113636363636365, 0.35319148936170214, 0.28888888888888886, 0.3617021276595745, 0.29523809523809524, 0.2620320855614973, 0.3275862068965517, 0.2953020134228188, 0.21333333333333335, 0.17419354838709677, 0.3956043956043956, 0.30864197530864196, 0.35353535353535354, 0.2517985611510791, 0.34615384615384615, 0.2616279069767442, 0.4268292682926829, 0.2482758620689655, 0.36681222707423583, 0.2119205298013245, 0.24561403508771928, 0.23026315789473684, 0.4482758620689655, 0.29333333333333333, 0.41860

[0.3958333333333333, 0.2710843373493976, 0.31155778894472363, 0.38095238095238093, 0.32989690721649484, 0.335, 0.33727810650887574, 0.2468354430379747, 0.31210191082802546, 0.30994152046783624, 0.29015544041450775, 0.38333333333333336, 0.2795031055900621, 0.3148148148148148, 0.2692307692307692, 0.46464646464646464, 0.3333333333333333, 0.40425531914893614, 0.25, 0.2848101265822785, 0.2956989247311828, 0.25, 0.30612244897959184, 0.3015075376884422, 0.3568075117370892, 0.3918918918918919, 0.19760479041916168, 0.3125, 0.3227848101265823, 0.21428571428571427, 0.2631578947368421, 0.2916666666666667, 0.36257309941520466, 0.37438423645320196, 0.31958762886597936, 0.31413612565445026, 0.30303030303030304, 0.29797979797979796, 0.39473684210526316, 0.2896551724137931, 0.3016759776536313, 0.37735849056603776, 0.4479166666666667, 0.36683417085427134, 0.2891566265060241, 0.30851063829787234, 0.31958762886597936, 0.3973509933774834, 0.3826086956521739, 0.22727272727272727, 0.336734693877551, 0.253246

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)




CC(C)OC(C)NC=N ['C#CC(CCC)CCO', 'C#CCCCCCCN', 'CC1C(C)C1CCCO', 'CCC(=O)CC(N)CC', 'C=C(C)N(C)C(C)NO', 'CCCC(O)C1CC1C', 'C#CC(OC)C(C)CC', 'CCCC(C)(C=N)CO', 'CCCC(C)CC(C)=O', 'CCCC(C)C(O)OO', 'C#CCC(C)CCCC', 'C#CC(CC)CCCC', 'C#CC(CCC)CCC', 'CCCC(C)C(O)OC', 'CCCCC(C)COO', 'CCCCN(O)CCC', 'C=CC(C)NC(C)CC', 'CC(=CNN)C(C)(C)N', 'CCCC1CC(CC)CN1', 'CCC(C)(CCO)NC', 'CCCC1CCC1CCO', 'CCCC(C)C1OCC1C', 'CCCCOC(C)(C)C', 'CCC1(COC)CCC1C', 'COCC(CCO)CCO', 'CCCC(CCO)COC', 'CCC(N)CC#CC(C)N', 'CCCCCCCC(C)C']
[0.5681818181818182, 0.4647887323943662, 0.2926829268292683, 0.27722772277227725, 0.4810126582278481, 0.44155844155844154, 0.3783783783783784, 0.3142857142857143, 0.42857142857142855, 0.5476190476190477, 0.6415094339622641, 0.631578947368421, 0.37662337662337664, 0.49206349206349204, 0.25, 0.2702702702702703, 0.3409090909090909, 0.35294117647058826, 0.4, 0.5294117647058824, 0.4942528735632184, 0.5106382978723404, 0.4148936170212766, 0.38571428571428573, 0.4507042253521127, 0.3333333333333333, 0.341463

In [3]:
# put the DFT_cv values for each query and target
query_cv_ = []
target_cv_ = []
for q, t in zip(df_final['query'], df_final['target']):
    query_cv_.append (df_1['DFT_cv'][df_1['SMILES']==q])
    target_cv_.append (df_1['DFT_cv'][df_1['SMILES']==t])

# exclude extra in each vectors
query_cv = [float(i) for i in query_cv_]
target_cv = [float(i) for i in target_cv_]

df_final ['query_cv'] = query_cv
df_final ['target_cv'] = target_cv

# save as csv
df_final.to_csv('third.csv', index=False, sep=',')

In [5]:
query_cv_

[481    42.142
 Name: DFT_cv, dtype: float64,
 226    31.686
 Name: DFT_cv, dtype: float64,
 96    27.764
 Name: DFT_cv, dtype: float64,
 480    42.111
 Name: DFT_cv, dtype: float64,
 270    33.195
 Name: DFT_cv, dtype: float64,
 215    31.468
 Name: DFT_cv, dtype: float64,
 306    34.241
 Name: DFT_cv, dtype: float64,
 494    45.227
 Name: DFT_cv, dtype: float64,
 17    22.921
 Name: DFT_cv, dtype: float64,
 190    30.569
 Name: DFT_cv, dtype: float64,
 13    22.694
 Name: DFT_cv, dtype: float64,
 197    30.816
 Name: DFT_cv, dtype: float64,
 364    35.652
 Name: DFT_cv, dtype: float64,
 17    22.921
 Name: DFT_cv, dtype: float64,
 456    39.233
 Name: DFT_cv, dtype: float64,
 333    34.837
 Name: DFT_cv, dtype: float64,
 105    28.154
 Name: DFT_cv, dtype: float64,
 472    41.001
 Name: DFT_cv, dtype: float64,
 171    30.029
 Name: DFT_cv, dtype: float64,
 255    32.705
 Name: DFT_cv, dtype: float64,
 431    38.279
 Name: DFT_cv, dtype: float64,
 8    21.609
 Name: DFT_cv, dtype: flo