In [1]:
import numpy as np
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem import rdFingerprintGenerator
from classix_m import CLASSIX_M
from classix_t import CLASSIX_T
# Setting up the Hyperparameters

regenerateFromScratch = False   # generate Morgan Fingerprints can take a while, so we save
pKi_threshold = 0
tanimoto_radius = 0.3
manhattan_radius = 1.2
mergeScale = 1.5
minPts = 50

# Loading the Data
if regenerateFromScratch:
    # Data should be generated from scratch if the pKi threshold is changed.
    
    fp_gen = rdFingerprintGenerator.GetMorganGenerator(
        radius=2,  # atom radius to consider for hashing
        fpSize=1024  # bit-length of vector (smaller will lead to more collissions during hashing) 
    )
    df = pd.read_csv("BL_Sets_data_rdkitblog.csv", sep=",")
    df["mol"] = df["smiles"].apply(Chem.MolFromSmiles)
    df = df[df["pKi"]>pKi_threshold]
    data = np.vstack(df["mol"].apply(fp_gen.GetFingerprintAsNumPy))
    np.save('BL_Sets.npy', data, allow_pickle=True)
else:
    df = pd.read_csv("BL_Sets_data_rdkitblog.csv", sep=",")
    df = df[df["pKi"]>pKi_threshold]
    data = np.load('BL_Sets.npy')

cluster_membership = df["target_chembl_id"].astype("category").cat.codes
true_labels = np.array(cluster_membership)
data = data.astype(np.int32)

In [2]:
clx_m = CLASSIX_M(radius=manhattan_radius, minPts=minPts)
clx_m.fit(data[::10].astype(np.float32))


OWN AGGREGATION


100%|██████████| 9167/9167 [00:00<00:00, 12319.70it/s]

  aggregation time: 0.8748559951782227
  search time: 0.0046770572662353516
  ips time: 0.6853864192962646
  merging time: 0.05605506896972656
 minPts Merging
small clusters [1 2 3 4 5 6 7]
final cluster sizes [9167.]
[0 0 0 ... 0 0 0]





CLASSIX_T(sorting=popcount, radius=1.2, minPts=50, group_merging=manhattan_distance, mergeScale=1.4, mergeTinyGroups=True)

In [14]:
data_bit = np.vstack(df["mol"].apply(fp_gen.GetFingerprint))

In [18]:
d_0 = fp_gen.GetFingerprint(df['mol'][0])
data_bit[0]

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
import bi
np.array(d_0)

b'np.array(d_0)'

In [10]:

# Initialise the models
tanimoto_radius = 0.33
manhattan_radius = 1
mergeScale = 1.5
minPts = 50

clx_t = CLASSIX_T(radius=tanimoto_radius, minPts=minPts, mergeScale=mergeScale)
# clx_m = CLASSIX_M(radius=manhattan_radius, minPts=minPts, mergeScale=mergeScale)
# clx_t.fit(data[::30,:])
# Run Tanimoto Clustering
clx_t.fit(data[:,:], minPts=minPts, mergeScale=mergeScale)


OWN AGGREGATION


100%|██████████| 90498/90498 [00:16<00:00, 5428.44it/s] 


time for ips: 13.248857021331787
time for search: 0.6045198440551758
time for conversion: 1.0070078372955322
time for loop: 1.6557228565216064
nr_dist: 590472610
  merging time: 3.4357738494873047
 minPts Merging
  minPts merging time: 4.332130193710327
Total time: 25.138460159301758


CLASSIX_T(sorting=popcount, radius=0.33, minPts=50, group_merging=tanimoto_distance, mergeScale=1.5, mergeTinyGroups=True)

184

In [10]:
# Example list
import numpy as np
example_list = np.array([1, 0, 3, 0, 5, 0, 7])
vec = np.array([3, 4, 5])
n=len(example_list)

i = 2
last_j = 5

# Create a boolean mask
list_mask = np.array(example_list) == 0
print(list_mask)
vec_mask = np.array(vec) >=4
print(vec_mask)
vec_mask = np.pad(vec_mask, (i, n-last_j), 'constant', constant_values=False)
# np.pad()
print(vec_mask)

[False  True False  True False  True False]
[False  True  True]
[False False False  True  True False False]


In [4]:
np.array(example_list)[boolean_mask]

array([0, 0, 0])