In [4]:
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
import numpy as np
from tqdm import tqdm
import seaborn as sns
import umap

from sklearn.manifold import TSNE
import metric_learn
import numpy as np
from sklearn.datasets import make_classification, make_regression
from rdkit.Chem import PandasTools

In [2]:
# Global seaborn configuration: figure size, font scale and grid style.
# The three calls are kept separate and in their original order.
sns.set(rc={'figure.figsize': (10, 10)})
sns.set(font_scale=1.5)
sns.set_style('whitegrid')

# visualisation imports
# NOTE(review): this import sits outside the notebook's main import cell, and
# `plt` is not referenced in the cells visible here - confirm it is needed.
import matplotlib.pyplot as plt
# Seed NumPy's global random number generator.
np.random.seed(42)

In [7]:
# Read the compound table, then let RDKit's PandasTools add a 'Mol' column
# built from the 'SMILES' column (the frame is modified in place).
df = pd.read_csv("supporting_4.csv")
PandasTools.AddMoleculeColumnToFrame(df, smilesCol='SMILES', molCol='Mol')

In [8]:
def fp_list_from_smiles_list(smiles_list,n_bits=2048):
    """Convert an iterable of SMILES strings into a list of fingerprint arrays.

    Parameters
    ----------
    smiles_list : iterable of str
        SMILES strings to fingerprint; iterated once, wrapped in a tqdm
        progress bar.
    n_bits : int, optional
        Fingerprint length forwarded to ``fp_as_array`` (default 2048).

    Returns
    -------
    list
        One array per input SMILES, in input order, as produced by
        ``fp_as_array``.

    Raises
    ------
    ValueError
        If RDKit cannot parse one of the SMILES strings.
    """
    fp_list = []
    for smiles in tqdm(smiles_list):
        mol = Chem.MolFromSmiles(smiles)
        # MolFromSmiles signals a parse failure by returning None instead of
        # raising; stop here so the error names the offending SMILES rather
        # than failing later inside the fingerprint call.
        if mol is None:
            raise ValueError("Could not parse SMILES: {!r}".format(smiles))
        fp_list.append(fp_as_array(mol,n_bits))
    return fp_list

def fp_as_array(mol,n_bits=2048):
    """Return the radius-2 Morgan fingerprint of ``mol`` as a NumPy integer array.

    Parameters
    ----------
    mol : RDKit molecule
        Molecule to fingerprint.
    n_bits : int, optional
        Bit-vector length requested from RDKit via ``nBits`` (default 2048).

    Returns
    -------
    numpy.ndarray
        Integer array populated by ``DataStructs.ConvertToNumpyArray`` from
        the fingerprint bit vector.
    """
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=n_bits)
    # `np.int` was only an alias for the builtin `int`; it was deprecated in
    # NumPy 1.20 and removed in 1.24, where it raises AttributeError.
    # Using `int` directly yields the same dtype on every NumPy version.
    arr = np.zeros((1,), dtype=int)
    # ConvertToNumpyArray writes the fingerprint into `arr` in place.
    DataStructs.ConvertToNumpyArray(fp, arr)
    return arr

In [9]:
# Fingerprint every SMILES string in the table, then store the resulting
# arrays next to their source rows in a new 'fingerprint' column.
fp_list = fp_list_from_smiles_list(df["SMILES"])
df["fingerprint"] = fp_list

100%|██████████| 1012/1012 [00:00<00:00, 2799.61it/s]


In [10]:
# Project the fingerprints to two dimensions with UMAP.
reducer = umap.UMAP(
    n_neighbors=20,
    min_dist=0.0,
    n_components=2,
    random_state=42,
)
crds_embedded = reducer.fit_transform(fp_list)
print("umap x y", crds_embedded)

# Despite its name, `tsne_df` holds the UMAP coordinates; the name is kept
# because later cells refer to it. SMILES and the PCBA883 column are attached
# by index alignment with `df`.
tsne_df = pd.DataFrame(crds_embedded, columns=["X", "Y"]).assign(
    SMILES=df['SMILES'],
    PCBA883=df['PCBA883'],
)

umap x y [[ 0.86777586  8.958862  ]
 [ 5.6070876  11.453878  ]
 [ 1.3165851   9.967407  ]
 ...
 [ 3.5876715   7.7237773 ]
 [ 5.3396025  10.837394  ]
 [ 9.347091   10.3531475 ]]


In [11]:
# Show the first rows of the UMAP coordinate table ...
print('umap_df=', tsne_df.head())
# ... and write the full table to disk without the index column.
tsne_df.to_csv(path_or_buf='umap_df.csv', index=False)

umap_df=           X          Y                                             SMILES  \
0  0.867776   8.958862    O=C(NC(Nc1ccc(F)cc1)(C(F)(F)F)C(F)(F)F)c1cccnc1   
1  5.607088  11.453878  CCC1CN2CCC1CC2[C@@H](NC(=S)Nc1cc(C(F)(F)F)cc(C...   
2  1.316585   9.967407        CC(NC(=O)NNC(=O)c1ccncc1)(C(F)(F)F)C(F)(F)F   
3  0.969752   9.594785            CCC(NC(=O)NCc1ccccc1)(C(F)(F)F)C(F)(F)F   
4  4.140071  10.116746  OC(c1ccccc1)(c1nc(/C=C/c2ccc(C(F)(F)F)cc2)cs1)...   

   PCBA883  
0      NaN  
1      NaN  
2      NaN  
3      NaN  
4      NaN  


In [12]:
# Set up the LMNN metric learner from metric-learn.
# NOTE(review): newer metric-learn releases appear to rename `k` to
# `n_neighbors` - confirm against the installed version.
lmnn = metric_learn.LMNN(k=5, learn_rate=1e-6)

In [None]:
# Build the LMNN training table: keep only the columns needed, drop every row
# with a missing value in any of them, and renumber the rows from zero so the
# table lines up positionally with arrays derived from it later.
lmnn_df = (
    df[['fingerprint', 'SMILES', 'PCBA883']]
    .dropna()
    .reset_index(drop=True)
)

# Features are the fingerprint arrays; labels are the PCBA883 values.
X = lmnn_df['fingerprint']
y = lmnn_df['PCBA883']

# Fit the metric learner on plain Python lists of features and labels.
lmnn.fit(X.values.tolist(), y.values.tolist())

In [None]:
# Map the training fingerprints into the space learned by LMNN.
X_lmnn = lmnn.transform(X.values.tolist())

# Embed the transformed features in 2-D. This calls fit_transform on the same
# `reducer` object used for the raw-fingerprint embedding, so it is fitted
# again on the LMNN-transformed data.
crds_embedded_2 = reducer.fit_transform(X_lmnn)
print("lmnn embedded", crds_embedded_2)

# Coordinates plus labels and SMILES, attached by index alignment with
# `lmnn_df`.
metric_df = pd.DataFrame(crds_embedded_2, columns=["X", "Y"]).assign(
    PCBA883=lmnn_df['PCBA883'],
    SMILES=lmnn_df['SMILES'],
)
print('metric_df=', metric_df.head())
metric_df.to_csv('metric_df.csv', index=False)

In [None]:
# Display the LMNN/UMAP coordinate table as the cell output.
metric_df