In [None]:
# general and data handling
import numpy as np
import pandas as pd
import os
from collections import Counter

In [None]:
# Show the current working directory (relative paths are resolved against it).
# A bare `os` only displays the module object, not the directory.
os.getcwd()

In [None]:
# Required RDKit modules
import rdkit as rd
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit import RDConfig
from rdkit.Chem import PandasTools
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import rdFingerprintGenerator
from rdkit import DataStructs
from rdkit.Chem import AllChem 
from rdkit.Chem.rdMolDescriptors import GetAtomPairFingerprint
from rdkit.Chem.AtomPairs import Torsions
from rdkit.Chem import Draw
from rdkit.Chem import MACCSkeys
from rdkit.Chem import rdmolops
from rdkit.Chem import rdMolDescriptors

In [None]:
# modeling
import sklearn as sk
from sklearn import metrics

In [None]:
# Graphing
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Path of the SDF file to load. Set the SDF_FILE environment variable to point at
# another copy; without it the original Windows location is used.
# The path is no longer passed through os.path.join(RDConfig.RDDataDir, ...): joining
# onto an absolute path discards the leading component on Windows, and on other
# platforms it produced a non-existent path under the RDKit data directory.
sdfFile = os.environ.get('SDF_FILE', 'C:\\Users\\ADMIN\\Desktop\\structures.molV3.sdf')

In [None]:
# Read the SDF file into a DataFrame; the molecule column is named 'Molecule'
# and the SMILES column is named 'SMILES'.
dframe = PandasTools.LoadSDF(
    sdfFile,
    molColName='Molecule',
    smilesName='SMILES',
    includeFingerprints=True,
)

In [None]:
# Display the table loaded from the SDF file (rendered as the cell output)
dframe

In [None]:
# List the column labels of the loaded table
dframe.columns

In [None]:
# Working copy of the table without the identifier / annotation columns
# and without the 'Molecule' object column.
data = dframe.drop(
    columns=[
        'ID', 'PREFERRED_NAME', 'CAS_RN', 'SYNONYMS', 'URL',
        'Molecule', 'type', 'CAS', 'Name', 'Ambiguous',
    ],
)

In [None]:
# Parse every SMILES string of the loaded table into an RDKit molecule
data["mol"] = list(map(Chem.MolFromSmiles, dframe["SMILES"]))

In [None]:
# Count the number of distinct values in the SMILES column
data['SMILES'].nunique()

In [None]:
# Compare with the number of SMILES entries; a larger value here means some SMILES repeat
data['SMILES'].count()

In [None]:
# Remove rows whose SMILES string already occurred earlier, then show the result
data = data.drop_duplicates(subset='SMILES')
display(data)

In [None]:
# Atom-pair fingerprints as hashed bit vectors, one per molecule.
# The four settings below stay module-level because later cells read them.
MinLength, MaxLength = 1, 7
FPSize, BitsPerHash = 2048, 2
data["atom_fp"] = [
    rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(
        mol,
        nBits=FPSize,
        minLength=MinLength,
        maxLength=MaxLength,
        nBitsPerEntry=BitsPerHash,
        includeChirality=True,
    )
    for mol in data['mol']
]

In [None]:
# Show the atom-pair fingerprint column
data["atom_fp"] 

In [None]:
# Plain list of the atom-pair fingerprints (used for the similarity search below).
# nBits / nBitsPerEntry are passed explicitly so this list is generated with the same
# settings as the data["atom_fp"] column; previously the library defaults were used
# here, so the similarities did not describe the fingerprints stored in the table.
atom_fp = [rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(x, minLength = MinLength, maxLength = MaxLength, includeChirality = True, nBits = FPSize, nBitsPerEntry = BitsPerHash) for x in data['mol']]

In [None]:
# Show the list of atom-pair fingerprint objects
atom_fp

In [None]:
# Length of the second atom-pair fingerprint in the list
len(atom_fp[1])

In [None]:
# Total / off / on bit counts and the binary form of the second atom-pair fingerprint
print(
    atom_fp[1].GetNumBits(),
    atom_fp[1].GetNumOffBits(),
    atom_fp[1].GetNumOnBits(),
    atom_fp[1].ToBinary(),
    sep="\n",
)

In [None]:
# Turn each atom-pair bit vector into a numpy array.
# ConvertToNumpyArray writes into the array it is handed, so the placeholders
# are allocated first and then filled one by one.
atom_fp_np = [np.zeros((1,)) for _ in atom_fp]
for fp, arr in zip(atom_fp, atom_fp_np):
    DataStructs.ConvertToNumpyArray(fp, arr)

In [None]:
# Tanimoto similarity of every atom-pair fingerprint to the first one in the list
atom_query_fp = atom_fp[0]
data['atom_tanimoto'] = DataStructs.BulkTanimotoSimilarity(atom_query_fp, atom_fp)
data['atom_tanimoto']

In [None]:
# Grid of structures, each captioned with its atom-pair Tanimoto score
Draw.MolsToGridImage(
    data["mol"],
    legends=list(map('Tanimoto: {:.2f}'.format, data['atom_tanimoto'])),
    molsPerRow=5,
    subImgSize=(200,200),
)

In [None]:
# MACCS keys fingerprint for every molecule, stored as a table column
data["mac_fp"] = list(map(rdMolDescriptors.GetMACCSKeysFingerprint, data['mol']))

In [None]:
# Show the MACCS keys fingerprint column
data["mac_fp"]

In [None]:
# Plain list of MACCS keys fingerprints, in the row order of `data`
mac_fp = [
    rdMolDescriptors.GetMACCSKeysFingerprint(mol)
    for mol in data['mol']
]

In [None]:
# Show the list of MACCS keys fingerprint objects
mac_fp

In [None]:
# Length of the second MACCS keys fingerprint in the list
len(mac_fp[1])

In [None]:
# Total / off / on bit counts and the binary form of the second MACCS fingerprint
print(
    mac_fp[1].GetNumBits(),
    mac_fp[1].GetNumOffBits(),
    mac_fp[1].GetNumOnBits(),
    mac_fp[1].ToBinary(),
    sep="\n",
)

In [None]:
# Turn each MACCS bit vector into a numpy array.
# ConvertToNumpyArray writes into the array it is handed, so the placeholders
# are allocated first and then filled one by one.
mac_fp_np = [np.zeros((1,)) for _ in mac_fp]
for fp, arr in zip(mac_fp, mac_fp_np):
    DataStructs.ConvertToNumpyArray(fp, arr)

In [None]:
# Tanimoto similarity of every MACCS fingerprint to the first one in the list
mac_query_fp = mac_fp[0]
data['mac_tanimoto'] = DataStructs.BulkTanimotoSimilarity(mac_query_fp, mac_fp)
data['mac_tanimoto']

In [None]:
# Grid of structures, each captioned with its MACCS Tanimoto score
Draw.MolsToGridImage(
    data["mol"],
    legends=list(map('Tanimoto: {:.2f}'.format, data['mac_tanimoto'])),
    molsPerRow=5,
    subImgSize=(200,200),
)

In [None]:
# Morgan fingerprints (radius 2, 2048 bits) stored in a "morg_fp" column
data["morg_fp"] = [
    AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
    for mol in data['mol']
]

In [None]:
# Show the Morgan fingerprint column
data["morg_fp"]

In [None]:
# Plain list of Morgan fingerprints (radius 2, 2048 bits), in the row order of `data`
morg_fp = list(
    map(lambda mol: AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048), data['mol'])
)

In [None]:
# Show the list of Morgan fingerprint objects
morg_fp

In [None]:
# Length of the second Morgan fingerprint in the list
len(morg_fp[1])

In [None]:
# Total / off / on bit counts and the binary form of the second Morgan fingerprint
print(
    morg_fp[1].GetNumBits(),
    morg_fp[1].GetNumOffBits(),
    morg_fp[1].GetNumOnBits(),
    morg_fp[1].ToBinary(),
    sep="\n",
)

In [None]:
# Turn each Morgan bit vector into a numpy array.
# ConvertToNumpyArray writes into the array it is handed, so the placeholders
# are allocated first and then filled one by one.
morg_fp_np = [np.zeros((1,)) for _ in morg_fp]
for fp, arr in zip(morg_fp, morg_fp_np):
    DataStructs.ConvertToNumpyArray(fp, arr)

In [None]:
# Show the numpy versions of the Morgan fingerprints
morg_fp_np

In [None]:
# Tanimoto similarity of every Morgan fingerprint to the first one in the list
morg_query_fp = morg_fp[0]
data['morg_tanimoto'] = DataStructs.BulkTanimotoSimilarity(morg_query_fp, morg_fp)
data['morg_tanimoto']

In [None]:
# Grid of structures, each captioned with its Morgan Tanimoto score
Draw.MolsToGridImage(
    data["mol"],
    legends=list(map('Tanimoto: {:.2f}'.format, data['morg_tanimoto'])),
    molsPerRow=5,
    subImgSize=(200,200),
)

In [None]:
# Path-based (Daylight-like) fingerprints: paths of 1-7 bonds, 2048 bits, 2 bits per hash
data['palen_fp'] = [
    FingerprintMols.FingerprintMol(
        mol,
        minPath=1,
        maxPath=7,
        fpSize=2048,
        bitsPerHash=2,
        useHs=True,
        tgtDensity=0.0,
        minSize=64,
    )
    for mol in data['mol']
]

In [None]:
# Show the path-based fingerprint column
data['palen_fp']

In [None]:
# Plain list of path-based fingerprints, generated with the same settings as the column
palen_fp = list(
    map(
        lambda mol: FingerprintMols.FingerprintMol(
            mol,
            minPath=1,
            maxPath=7,
            fpSize=2048,
            bitsPerHash=2,
            useHs=True,
            tgtDensity=0.0,
            minSize=64,
        ),
        data['mol'],
    )
)

In [None]:
# Show the list of path-based fingerprint objects
palen_fp

In [None]:
# Length of the second path-based fingerprint in the list
len(palen_fp[1])

In [None]:
# Total / off / on bit counts and the binary form of the second path-based fingerprint
print(
    palen_fp[1].GetNumBits(),
    palen_fp[1].GetNumOffBits(),
    palen_fp[1].GetNumOnBits(),
    palen_fp[1].ToBinary(),
    sep="\n",
)

In [None]:
# Turn each path-based bit vector into a numpy array.
# ConvertToNumpyArray writes into the array it is handed, so the placeholders
# are allocated first and then filled one by one.
palen_fp_np = [np.zeros((1,)) for _ in palen_fp]
for fp, arr in zip(palen_fp, palen_fp_np):
    DataStructs.ConvertToNumpyArray(fp, arr)

In [None]:
# Show the numpy versions of the path-based fingerprints
palen_fp_np

In [None]:
# Tanimoto similarity of every path-based fingerprint to the first one in the list
palen_query_fp = palen_fp[0]
data['palen_tanimoto'] = DataStructs.BulkTanimotoSimilarity(palen_query_fp, palen_fp)
data['palen_tanimoto']

In [None]:
# Grid of structures, each captioned with its path-based Tanimoto score
Draw.MolsToGridImage(
    data["mol"],
    legends=list(map('Tanimoto: {:.2f}'.format, data['palen_tanimoto'])),
    molsPerRow=5,
    subImgSize=(200,200),
)

In [None]:
# Topological-torsion fingerprints as hashed bit vectors, one per molecule.
# Note: this rebinds the module-level FPSize / BitsPerHash (now 4 bits per entry).
FPSize, BitsPerHash = 2048, 4
data["toptor_fp"] = [
    rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect(
        mol,
        nBits=FPSize,
        nBitsPerEntry=BitsPerHash,
        includeChirality=True,
    )
    for mol in data['mol']
]

In [None]:
# Show the topological-torsion fingerprint column
data["toptor_fp"]

In [None]:
# Plain list of topological-torsion fingerprints, same settings as the column
toptor_fp = list(
    map(
        lambda mol: rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect(
            mol,
            nBits=FPSize,
            nBitsPerEntry=BitsPerHash,
            includeChirality=True,
        ),
        data['mol'],
    )
)

In [None]:
# Show the list of topological-torsion fingerprint objects
toptor_fp

In [None]:
# Length of the second topological-torsion fingerprint in the list
len(toptor_fp[1])

In [None]:
# Total / off / on bit counts and the binary form of the second torsion fingerprint
print(
    toptor_fp[1].GetNumBits(),
    toptor_fp[1].GetNumOffBits(),
    toptor_fp[1].GetNumOnBits(),
    toptor_fp[1].ToBinary(),
    sep="\n",
)

In [None]:
# Turn each topological-torsion bit vector into a numpy array.
# ConvertToNumpyArray writes into the array it is handed, so the placeholders
# are allocated first and then filled one by one.
toptor_fp_np = [np.zeros((1,)) for _ in toptor_fp]
for fp, arr in zip(toptor_fp, toptor_fp_np):
    DataStructs.ConvertToNumpyArray(fp, arr)

In [None]:
# Show the numpy versions of the topological-torsion fingerprints
toptor_fp_np

In [None]:
# Tanimoto similarity of every topological-torsion fingerprint to the first one in the list
toptor_query_fp = toptor_fp[0]
data['toptor_tanimoto'] = DataStructs.BulkTanimotoSimilarity(toptor_query_fp, toptor_fp)
data['toptor_tanimoto']

In [None]:
# Grid of structures, each captioned with its topological-torsion Tanimoto score
Draw.MolsToGridImage(
    data["mol"],
    legends=list(map('Tanimoto: {:.2f}'.format, data['toptor_tanimoto'])),
    molsPerRow=5,
    subImgSize=(200,200),
)

In [None]:
# Show the table with all fingerprint and similarity columns added so far
data

In [None]:
# List the current column labels of `data`
data.columns

In [None]:
# Remove the raw inputs (SMILES strings and Mol objects); derived columns stay
data = data.drop(['SMILES', 'mol'], axis='columns')

In [None]:
# Pairwise plot grid over the columns of `data`.
# NOTE(review): the fingerprint object columns are still in `data` at this point;
# presumably only the numeric similarity columns get plotted - confirm.
sns.pairplot(data)

In [None]:
# Show the table before the fingerprint columns are removed
data

In [None]:
# Remove the fingerprint object columns so only the remaining (similarity) columns are clustered
data = data.drop(
    ['atom_fp', 'mac_fp', 'morg_fp', 'palen_fp', 'toptor_fp'],
    axis='columns',
)

In [None]:
# Show the table that is passed to the clustering code below
data

In [None]:
# List the columns used as clustering features
data.columns

In [None]:
#fuzzy c-means clustering
### Importing the required Libraries
import random
import operator
import math

In [None]:
# --- Fuzzy c-means settings (read as globals by the functions below) ---

# n: number of data points (rows); num_attr: column count minus one
# (num_attr is not referenced by the cells visible in this notebook section)
n, num_attr = data.shape[0], data.shape[1] - 1

k = 5            # number of clusters to make
MAX_ITER = 100   # maximum number of iterations
m = 2.00         # fuzzy parameter (exponent applied to the memberships)

In [None]:
### initializing the membership matrix with random values
def initializeMembershipMatrix():
    """Create a random membership matrix with ``n`` rows and ``k`` columns.

    Each row is built from ``k`` draws of ``random.random()`` divided by
    their sum, so the memberships of a point add up to 1.

    Returns
    -------
    list of list of float
        One row per data point, one membership value per cluster.
    """
    def _random_row():
        # k raw draws, rescaled so the row sums to one
        draws = [random.random() for _ in range(k)]
        total = sum(draws)
        return [value / total for value in draws]

    return [_random_row() for _ in range(n)]


In [None]:
# Random starting memberships, kept at module level for the preview cell below
membership_mat = initializeMembershipMatrix()

In [None]:
### calculating the cluster center, is done in every iteration

def calculateClusterCenter(membership_mat):
    """Compute the membership-weighted centre of each of the ``k`` clusters.

    Every row of the module-level ``data`` frame is weighted by its membership
    in the cluster raised to the fuzzy parameter ``m``; the centre is the
    weighted mean of the rows.

    Parameters
    ----------
    membership_mat : list of list of float
        One row per data point, one membership value per cluster.

    Returns
    -------
    list of list of float
        ``k`` centres, each with one coordinate per column of ``data``.
    """
    # Extract the rows from the DataFrame once. They do not depend on the
    # cluster, so doing ``data.iloc[i]`` inside the cluster loop repeated
    # k * n slow row lookups on every call.
    points = data.values.tolist()
    cluster_mem_val = list(zip(*membership_mat))
    cluster_centers = list()
    for j in range(k):
        # memberships of all points in cluster j, raised to the fuzzy parameter
        weights = [e ** m for e in cluster_mem_val[j]]
        denominator = sum(weights)
        weighted_rows = [[weights[i] * val for val in points[i]] for i in range(n)]
        # column-wise sums of the weighted rows, accumulated in row order
        numerator = map(sum, zip(*weighted_rows))
        cluster_centers.append([z / denominator for z in numerator])
    return cluster_centers

In [None]:
# Preview the centres implied by the random initial memberships
# (the result is only displayed, it is not stored in a variable)
calculateClusterCenter(membership_mat)

In [None]:
### updating the membership values using the cluster centers

def updateMembershipValue(membership_mat, cluster_centers):
    """Recompute every membership from the distances to the cluster centres.

    For point ``i`` and cluster ``j`` the new membership is
    ``1 / sum_c (d_ij / d_ic) ** (2 / (m - 1))`` where ``d`` is the Euclidean
    distance between the point and a centre. ``m`` must differ from 1,
    otherwise the exponent cannot be computed.

    Parameters
    ----------
    membership_mat : list of list of float
        Current memberships; updated in place.
    cluster_centers : list of list of float
        One centre per cluster, as returned by ``calculateClusterCenter``.

    Returns
    -------
    list of list of float
        The same ``membership_mat`` object, with updated values.
    """
    p = float(2/(m-1))
    # Extract the rows once instead of calling ``data.iloc[i]`` per point.
    points = data.values.tolist()
    for i in range(n):
        x = points[i]
        distances = [np.linalg.norm(list(map(operator.sub, x, cluster_centers[j]))) for j in range(k)]
        # A point lying exactly on a centre has distance 0. The ratio formula
        # would then divide by zero and yield NaN memberships, so give such a
        # point full membership in the coinciding centre(s) and none elsewhere.
        on_center = [j for j in range(k) if distances[j] == 0]
        if on_center:
            share = 1.0 / len(on_center)
            for j in range(k):
                membership_mat[i][j] = share if distances[j] == 0 else 0.0
            continue
        for j in range(k):
            den = sum([math.pow(float(distances[j]/distances[c]), p) for c in range(k)])
            membership_mat[i][j] = float(1/den)
    return membership_mat

In [None]:
def getClusters(membership_mat):
    """Turn fuzzy memberships into one hard cluster label per data point.

    For each of the first ``n`` rows the label is the index of the largest
    membership; when several memberships tie, the largest index wins.

    Parameters
    ----------
    membership_mat : list of list of float
        One row per data point, one membership value per cluster.

    Returns
    -------
    list of int
        Cluster index for each of the ``n`` data points.
    """
    def _strongest(row):
        # rank by (membership, index) so ties resolve to the higher index
        return max(range(len(row)), key=lambda idx: (row[idx], idx))

    return [_strongest(membership_mat[i]) for i in range(n)]

In [None]:
### the final fcm function, 

def fuzzyCMeansClustering():
    """Run fuzzy c-means on the module-level ``data`` for ``MAX_ITER`` passes.

    Starts from a random membership matrix and alternates between computing
    the cluster centres and updating the memberships. The final membership
    matrix is printed.

    Returns
    -------
    tuple
        ``(cluster_labels, cluster_centers)``: the hard label of every data
        point and the centres used for the last membership update.
    """
    # Membership Matrix
    membership_mat = initializeMembershipMatrix()
    cluster_centers = None
    # Exactly MAX_ITER passes; the former ``while curr <= MAX_ITER`` ran one extra.
    for _ in range(MAX_ITER):
        cluster_centers = calculateClusterCenter(membership_mat)
        membership_mat = updateMembershipValue(membership_mat, cluster_centers)
    if cluster_centers is None:
        # MAX_ITER < 1: no pass ran, so report the centres of the initial memberships
        cluster_centers = calculateClusterCenter(membership_mat)
    # Only the final labels are returned, so derive them once after the loop.
    cluster_labels = getClusters(membership_mat)

    print(membership_mat)
    return cluster_labels, cluster_centers

In [None]:
### run the clustering; keep the hard labels in `labels` and the centres in `centers`

# the membership matrix itself is printed from inside fuzzyCMeansClustering()
print("printing the final membership matrix")
labels, centers = fuzzyCMeansClustering()

print("printing cluster centers", centers, sep="\n")

In [None]:
from sklearn.metrics import silhouette_score 
from sklearn.metrics import calinski_harabasz_score
from sklearn.metrics import davies_bouldin_score
# Score the hard cluster labels against the feature table that was clustered.
# The earlier version passed the row count `n` and the undefined names `x` and
# `labels_`; every metric takes (feature matrix, labels).
score_fuzzy_s = silhouette_score(data, labels, metric='euclidean')
score_fuzzy_c = calinski_harabasz_score(data, labels)
score_fuzzy_d = davies_bouldin_score(data, labels)
print('Silhouette Score: %.4f' % score_fuzzy_s)
print('Calinski Harabasz Score: %.4f' % score_fuzzy_c)
print('Davies Bouldin Score: %.4f' % score_fuzzy_d)