In [30]:
import re
from pathlib import Path

import scipy
import functools
import numpy as np
import pandas as pd

import networkx as nx

from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem, PandasTools, rdFingerprintGenerator, MACCSkeys
from rdkit.DataStructs import TanimotoSimilarity
from rdkit.Chem.Fingerprints import FingerprintMols

import matplotlib.pyplot as plt
%matplotlib inline

import umap
# import umap.plot
# import hdbscan

from tqdm import tqdm

In [31]:
from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.utils import check_X_y
from sklearn.utils import _safe_indexing
from sklearn.utils import check_random_state
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import pairwise_distances_chunked
from sklearn.utils.validation import _deprecate_positional_args

In [32]:
# Anchor all file access to the directory the kernel is currently running in;
# input files are looked up in its "data" sub-directory.
HERE = Path.cwd()
DATA = HERE.joinpath("data")

In [33]:
def check_number_of_labels(n_labels, n_samples):
    """Validate that the number of distinct labels is usable for cluster scoring.

    Parameters
    ----------
    n_labels : int
        Number of distinct labels found.
    n_samples : int
        Number of samples the labels were assigned to.

    Raises
    ------
    ValueError
        If ``n_labels`` is not strictly between 1 and ``n_samples``,
        i.e. outside the range 2 .. n_samples - 1 (inclusive).
    """
    if not 1 < n_labels < n_samples:
        # Keep the trailing space after "2": the two literals are concatenated,
        # and without it the message reads "2to n_samples - 1".
        raise ValueError("Number of labels is %d. Valid values are 2 "
                         "to n_samples - 1 (inclusive)" % n_labels)


@functools.lru_cache(maxsize=None)
def _fingerprint_generator(method):
    """Return the RDKit fingerprint generator for ``method``, built once and reused."""
    if method == "rdkit":
        return rdFingerprintGenerator.GetRDKitFPGenerator(maxPath=5)
    if method == "morgan":
        return rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)
    raise ValueError("no fingerprint generator for method %r" % (method,))


def FingerprintFromSmiles(smiles, method='morgan'):
    """Convert a SMILES string to a molecular fingerprint.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.
    method : {'morgan', 'rdkit', 'maccs'}, default 'morgan'
        Fingerprint type:
        - ``'morgan'``: Morgan generator with radius 2 and 2048 bits.
        - ``'rdkit'``: RDKit path-based generator with ``maxPath=5``.
        - ``'maccs'``: MACCS keys.

    Returns
    -------
    The fingerprint object produced by RDKit for the parsed molecule.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names, or if ``smiles``
        cannot be parsed into a molecule.
    """
    # Validate the method first so a typo fails loudly instead of silently
    # storing an error string where a fingerprint is expected.
    if method not in ("rdkit", "morgan", "maccs"):
        raise ValueError(
            "invalid method %r: use 'rdkit', 'morgan' or 'maccs'" % (method,)
        )

    # MolFromSmiles signals a parse failure by returning None rather than raising.
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError("could not parse SMILES: %r" % (smiles,))

    if method == "maccs":
        return MACCSkeys.GenMACCSKeys(mol)

    # Generators are cached per method so they are not rebuilt for every molecule.
    return _fingerprint_generator(method).GetFingerprint(mol)

In [34]:
# Load the filtered ZINC table from the notebook's data directory.
zinc_data_f = pd.read_csv(DATA.joinpath("filtered_zinc_data.csv"))

# Report the table dimensions, then preview the first rows as the cell output.
print(zinc_data_f.shape)
zinc_data_f.head()

(486318, 6)


Unnamed: 0,smiles,zinc_id,inchikey,mwt,logp,reactive
0,Cc1cc(C(=O)O)nn1C(C)(C)C,128082,FCFGJNOOQYQMKY-UHFFFAOYSA-N,182.223,1.645,0
1,Cc1cc(C=O)oc1C,153224,JPTPEPVCVXGNJM-UHFFFAOYSA-N,124.139,1.709,0
2,Oc1nc2ccc(Cl)cc2nc1O,336490,RNOLFZACEWWIHP-UHFFFAOYSA-N,196.593,1.694,0
3,CC(C)[C@@H]1C(=O)C[C@@H](C)CN1C,1296142,ALCOISCHVBFFNS-PSASIEDQSA-N,169.268,1.552,0
4,COc1cc(N(C)C)ccc1C=O,1583074,HGDRXADJVGVGBC-UHFFFAOYSA-N,179.219,1.574,0


Add ROMol objects of these molecules to the DataFrame.

Calculate circular **Morgan fingerprints** (radius 2, 2048 bits) for the molecules and store them in a new `morgan` column of the DataFrame.

In [37]:
# Fingerprint every SMILES string (tqdm shows progress) and keep the results
# alongside the source rows in a new "morgan" column.
zinc_data_f["morgan"] = [
    FingerprintFromSmiles(smi, method='morgan')
    for smi in tqdm(
        zinc_data_f["smiles"],
        desc="Generating Morgan fingerprints for zinc_data",
    )
]
zinc_data_f.head(5)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  zinc_data_f["morgan"] = [FingerprintFromSmiles(smiles, method='morgan') for smiles in tqdm(zinc_data_f["smiles"], desc="Generating Morgan fingerprints for zinc_data")]


Generating Morgan fingerprints for zinc_data:   0%|          | 0/486318 [00:00<?, ?it/s]

Unnamed: 0,smiles,zinc_id,inchikey,mwt,logp,reactive,morgan
0,Cc1cc(C(=O)O)nn1C(C)(C)C,128082,FCFGJNOOQYQMKY-UHFFFAOYSA-N,182.223,1.645,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Cc1cc(C=O)oc1C,153224,JPTPEPVCVXGNJM-UHFFFAOYSA-N,124.139,1.709,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Oc1nc2ccc(Cl)cc2nc1O,336490,RNOLFZACEWWIHP-UHFFFAOYSA-N,196.593,1.694,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,CC(C)[C@@H]1C(=O)C[C@@H](C)CN1C,1296142,ALCOISCHVBFFNS-PSASIEDQSA-N,169.268,1.552,0,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,COc1cc(N(C)C)ccc1C=O,1583074,HGDRXADJVGVGBC-UHFFFAOYSA-N,179.219,1.574,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
