This code clusters molecules: Morgan fingerprints are embedded in two dimensions with t-SNE, and the embedded points are then grouped with K-Means.

In [None]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE

In [None]:
# Step 1: Load your dataset
# Point csv_path at the CSV file containing your dataset.
# Later cells read the 'SMILES' and 'IC50' columns from this table.
csv_path = 'filename.csv'
df = pd.read_csv(csv_path)

In [None]:
# Convert SMILES strings to RDKit molecules.
# Chem.MolFromSmiles returns None (instead of raising) for a SMILES string it
# cannot parse, and a None entry would crash fingerprint generation later on.
parsed_mols = [Chem.MolFromSmiles(smiles) for smiles in df['SMILES']]
is_valid = np.array([mol is not None for mol in parsed_mols], dtype=bool)
if not is_valid.all():
    print(f"Skipping {int((~is_valid).sum())} row(s) with unparsable SMILES.")
    # Drop the same rows from df so that per-row columns read afterwards
    # (e.g. IC50) stay aligned one-to-one with `molecules`.
    df = df.loc[is_valid].reset_index(drop=True)
molecules = [mol for mol in parsed_mols if mol is not None]

In [None]:
# Build one Morgan fingerprint bit vector per molecule
# (radius 2, folded to 1024 bits).
fps = []
for mol in molecules:
    fps.append(AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024))

In [None]:
# List the fingerprint bit positions that are set in at least one molecule:
# sum every bit column over all molecules, then keep the non-zero columns.
bit_counts = np.array(fps).sum(axis=0)
np.where(bit_counts != 0)

In [None]:
# Convert the fingerprints to a numpy array with one row per molecule
def _fp_to_row(bit_vect):
    """Return the numpy representation of a single fingerprint bit vector."""
    row = np.zeros((1,))  # placeholder that ConvertToNumpyArray fills in place
    DataStructs.ConvertToNumpyArray(bit_vect, row)
    return row


fp_array = np.array([_fp_to_row(fp) for fp in fps])

In [None]:
# Pull the IC50 column out of the dataset as an array (one value per row)
ic50_column = df["IC50"]
ic50_values = ic50_column.values

In [None]:
# Perform t-SNE embedding of the fingerprint matrix into 2 dimensions.
# random_state is fixed so the embedding is reproducible between runs.
# t-SNE is unsupervised: scikit-learn's TSNE.fit_transform accepts `y` only
# for API consistency and ignores it, so the IC50 values are not passed here
# (passing them wrongly suggested that they influence the embedding).
tsne = TSNE(n_components=2, random_state=42)
tsne_results = tsne.fit_transform(fp_array)

In [None]:
# Group the 2-D t-SNE coordinates into 8 clusters with K-Means.
# random_state is fixed so the cluster assignment is reproducible
# (the keyword was previously misspelt, which made KMeans raise a TypeError).
kmeans = KMeans(n_clusters=8, random_state=42)
kmeans.fit(tsne_results)
# NOTE: the variable name `cluster_lables` (sic) is kept as-is because the
# plotting cell reads it under this spelling.
cluster_lables = kmeans.labels_

In [None]:
# Plot the t-SNE embedding, colouring each point by its K-Means cluster label,
# then save the figure to a PNG file and display it.
plt.figure(figsize=(8, 6))
scatter = plt.scatter(
    tsne_results[:, 0],  # was misspelt `tnse_results`, raising a NameError
    tsne_results[:, 1],
    c=cluster_lables,
    cmap='viridis',
    alpha=0.8,
)
plt.title('t-SNE Cluster with K-Means')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.colorbar(scatter, label='Cluster')
plt.savefig('t-SNE_cluster_with_KMeans_8_clusters.png', dpi=600)
plt.show()

In [None]:
# Colour values for the IC50 scatter plot: the same array as ic50_values
# (a second name for it, not a copy).
colors = ic50_values

In [None]:
# Plot the t-SNE embedding, colouring each point by its IC50 value
plt.figure(figsize=(8, 6))
x, y = tsne_results[:, 0], tsne_results[:, 1]
# Boolean mask of the points with x < 10, y < 7.5 and IC50 > 90.
# NOTE(review): these thresholds look hand-picked for one particular
# embedding - confirm them against your own plot.
locs = (x < 10) & (y < 7.5) & (ic50_values > 90)
plt.scatter(x, y, c=colors, cmap='viridis', alpha=1.0)
plt.title('t-SNE Clustering of Molecular Structures')
plt.xlabel('t_SNE component 1')  # was `plt.xlable`, which does not exist
plt.ylabel('t_SNE component 2')
plt.colorbar(label='IC50')
plt.savefig('t-SNE_cluster_IC50_8_clusters.png', dpi=600)
plt.show()

In [None]:
# Keep the molecules whose points are selected by the boolean mask `locs`
# (the mask was previously referenced as `loc`, which is undefined).
good_mols = [mol for mol, selected in zip(molecules, locs) if selected]

In [None]:
# Show the first selected molecule as the cell's output.
# Raises IndexError if good_mols is empty.
good_mols[0]

In [None]:
# Show the second selected molecule as the cell's output.
# Raises IndexError if good_mols has fewer than two entries.
good_mols[1]

In [None]:
# Show the third selected molecule as the cell's output.
# Raises IndexError if good_mols has fewer than three entries.
good_mols[2]

In [None]:
# Print the third selected molecule, but only when it exists.
# Index 2 needs at least three elements, so the guard is `>= 3`; the earlier
# `> 3` wrongly rejected a list of exactly three molecules.
if len(good_mols) >= 3:
    print(good_mols[2])
else:
    print("The list does not have at least three elements.")

In [None]:
# Save the t-SNE coordinates to disk.
# The `with` block closes the file even if pickling fails, so the data is
# flushed and the handle is not leaked.
with open('filename.pkl', 'wb') as pkl_file:
    pickle.dump(tsne_results, pkl_file)

In [None]:
# Load the object stored in 'filename.pkl' (the file the save cell writes the
# t-SNE coordinates to). The `with` block closes the file handle afterwards.
# NOTE: unpickling can execute arbitrary code - only load files you created
# yourself or otherwise trust.
with open('filename.pkl', 'rb') as pkl_file:
    model_loaded = pickle.load(pkl_file)