In [5]:
# We will start by importing the necessary Python libraries
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
import numpy as np
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
import umap
from rdkit.Chem import PandasTools

In [2]:
# Configure seaborn once: square 10x10 figures, larger fonts, white grid background.
sns.set(
    style='whitegrid',
    font_scale=1.5,
    rc={'figure.figsize': (10, 10)},
)

In [4]:
# Load the input table from the CSV file into a DataFrame.
INPUT_CSV = "supporting_4.csv"
df = pd.read_csv(INPUT_CSV)

In [6]:
# Define a couple of functions to convert a list of SMILES strings to a list of fingerprints.
def fp_list_from_smiles_list(smiles_list,n_bits=2048):
    """Convert an iterable of SMILES strings into a list of fingerprint arrays.

    Each SMILES is parsed with RDKit and passed to ``fp_as_array``. The
    output list has one entry per input SMILES, in the same order, so it
    stays aligned with the rows the SMILES came from.

    Args:
        smiles_list: Iterable of SMILES strings (e.g. a pandas Series).
        n_bits: Length of each fingerprint bit vector.

    Returns:
        A list of 1-D NumPy arrays, one per input SMILES.

    Raises:
        ValueError: If a SMILES string cannot be parsed by RDKit.
    """
    fp_list = []
    for position, smiles in enumerate(tqdm(smiles_list)):
        mol = Chem.MolFromSmiles(smiles)
        # MolFromSmiles signals a parse failure by returning None rather than
        # raising; fail loudly here instead of deep inside the fingerprint
        # call. Silently skipping the entry would misalign the fingerprints
        # with the source rows.
        if mol is None:
            raise ValueError(
                f"Could not parse SMILES at position {position}: {smiles!r}"
            )
        fp_list.append(fp_as_array(mol, n_bits))
    return fp_list

def fp_as_array(mol,n_bits=2048):
    """Return the radius-2 Morgan fingerprint of ``mol`` as a NumPy array.

    Args:
        mol: An RDKit molecule.
        n_bits: Length of the fingerprint bit vector.

    Returns:
        A 1-D NumPy integer array holding the fingerprint bits.
    """
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=n_bits)
    # `np.int` was only an alias for the builtin `int`; it was deprecated in
    # NumPy 1.20 and removed in 1.24, so use `int` directly (same dtype).
    # The placeholder array is filled by ConvertToNumpyArray.
    arr = np.zeros((1,), dtype=int)
    DataStructs.ConvertToNumpyArray(fp, arr)
    return arr

In [7]:
# Turn every SMILES in the dataframe into a fingerprint, then persist the
# fingerprint matrix (one row per molecule) as CSV.
fp_list = fp_list_from_smiles_list(df["SMILES"])
fp_df = pd.DataFrame(data=fp_list)
fp_df.to_csv(path_or_buf='fp_df.csv', index=False)

100%|██████████| 1012/1012 [00:00<00:00, 2818.03it/s]


In [11]:
# Reduce the fingerprints to two dimensions with UMAP.
# A fixed random_state keeps the embedding reproducible between runs.
umap_params = dict(
    n_neighbors=20,
    min_dist=0.0,
    n_components=2,
    random_state=42,
)
reducer = umap.UMAP(**umap_params)
crds_embedded = reducer.fit_transform(fp_list)

In [12]:
# Assemble the output table: the 2-D embedding coordinates plus the SMILES
# and assay columns copied over from the input dataframe.
# NOTE: the coordinates come from UMAP; the `tsne_` prefix is kept only
# because later cells refer to this name.
tsne_df = pd.DataFrame(crds_embedded,columns=["X","Y"])

# Columns carried over from `df`, in the order they should appear in the output.
columns_to_copy = [
    'SMILES',
    'PCBA1030', 'PCBA1461', 'PCBA1468', 'PCBA1688',
    'PCBA2546', 'PCBA2551',
    'PCBA504332', 'PCBA504339', 'PCBA504444', 'PCBA504467',
    'PCBA540276', 'PCBA588855',
    'PCBA624288', 'PCBA624296', 'PCBA624417',
    'PCBA651635',
    'PCBA686970', 'PCBA686978', 'PCBA686979',
    'PCBA720504', 'PCBA720579', 'PCBA720580',
    'PCBA883', 'PCBA884', 'PCBA891', 'PCBA938',
]
# Assign one column at a time so the values are aligned on the index exactly
# as a hand-written `tsne_df[col] = df[col]` statement would be.
for column in columns_to_copy:
    tsne_df[column] = df[column]

In [13]:
# Save the embedding table to disk without the row index.
tsne_df.to_csv(path_or_buf='tsne_df.csv', index=False)