In [1]:
# package imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit.Chem import Draw

# scikit learn!
import sklearn
from sklearn.decomposition import PCA, KernelPCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import scale
from sklearn import preprocessing
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.neighbors import NearestCentroid
from sklearn.metrics import pairwise_distances_argmin_min
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster, fclusterdata
from scipy.spatial.distance import cdist

# umap for dimensionality reduction
import umap

# nice plotting
from matplotlib import pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns; sns.set()

import torch
from torch import nn, optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
# cleaned aryl X dataset
# We kept parameters for the low energy conformer (removing those for min/max and Bolztmann average, which are highly correlated)
arylx = pd.read_csv('arylx.csv')
arylx.head()

Unnamed: 0,id,smiles,HOMO_low_E,LUMO_low_E,η_low_E,μ_low_E,ω_low_E,polar_aniso(Debye)_low_E,polar_iso(Debye)_low_E,dipole(Debye)_low_E,...,%Vbur_Br_2.5Å_low_E,%Vbur_C_3.0Å_low_E,%Vbur_Br_3.0Å_low_E,%Vbur_C_3.5Å_low_E,%Vbur_Br_3.5Å_low_E,%Vbur_C_4.0Å_low_E,%Vbur_Br_4.0Å_low_E,Sterimol_B1_C_Br(Å)_morfeus_low_E,Sterimol_B5_C_Br(Å)_morfeus_low_E,Sterimol_L_C_Br(Å)_morfeus_low_E
0,arbr141,Cc1[nH]nc(C(F)(F)F)c1Br,-0.3077,0.01581,0.32351,-0.145945,0.03292,43.1852,89.2019,5.5518,...,34.132048,78.811271,33.014563,68.080856,31.508572,56.128189,29.409255,2.277507,4.841637,4.126049
1,arbr142,Cc1[nH]nc(-c2ccccc2)c1Br,-0.27821,0.00248,0.28069,-0.137865,0.03386,102.715,147.439,2.8706,...,36.231931,77.875447,34.498845,67.397153,32.27439,55.776721,29.634644,2.077762,7.508006,4.13671
2,arbr143,Cc1cc(C(C)(C)C)cc(C)c1Br,-0.28384,0.0148,0.29864,-0.13452,0.0303,86.5084,162.025,1.8175,...,36.599818,79.91098,35.651631,70.185543,33.763511,58.853811,30.847713,2.968765,4.479852,4.170312
3,arbr145,Cc1cccc(Cl)c1Br,-0.30209,-0.00082,0.30127,-0.151455,0.03807,69.304,110.67,2.6224,...,36.397968,79.426773,35.005401,69.397596,32.937127,57.807554,30.127708,1.868018,4.49847,4.1539
4,arbr146,Cc1cccc(F)c1Br,-0.3022,0.00107,0.30327,-0.150565,0.03738,62.0065,98.2669,2.6395,...,34.662716,78.545888,33.005252,67.583512,30.757315,55.012104,27.829201,1.85,4.454212,4.152511


In [3]:
# the molecular descriptors
X = arylx.select_dtypes(include=['number'])

# set dtype to float64
X = X.astype('float32')

# trim significant figures to 3
X = X.round(3)

# Feature scaling through standardization (or Z-score normalization) is an important preprocessing step 
# for many machine learning algorithms. Standardization involves rescaling the features such that they 
# have the properties of a standard normal distribution with a mean of zero and a standard deviation of one. 

X_scaled=pd.DataFrame(scale(X),index=X.index, columns=X.columns)

# drop zero-variance features
zero_std_cols = X_scaled.columns[X_scaled.std() == 0]
X_scaled=X_scaled[X_scaled.columns.difference(zero_std_cols)]
print (f"Dropping {len(zero_std_cols)} features {zero_std_cols}")

# drop highly correlated features
corr = X_scaled.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
X_scaled = X_scaled.drop(to_drop, axis=1)
print (f"Dropping {len(to_drop)} features {to_drop}")

Dropping 0 features Index([], dtype='object')
Dropping 10 features ['ω_low_E', 'SASA_surface_area(Å²)_low_E', 'SASA_volume(Å³)_low_E', 'pyramidalization_Gavrish_C(°)_low_E', '%Vbur_Br_2.5Å_low_E', '%Vbur_Br_3.0Å_low_E', '%Vbur_C_3.5Å_low_E', '%Vbur_Br_3.5Å_low_E', '%Vbur_C_4.0Å_low_E', '%Vbur_Br_4.0Å_low_E']




In [4]:
X_scaled

Unnamed: 0,HOMO_low_E,LUMO_low_E,η_low_E,μ_low_E,polar_aniso(Debye)_low_E,polar_iso(Debye)_low_E,dipole(Debye)_low_E,volume(Bohr_radius³/mol)_low_E,SASA_sphericity_low_E,NBO_charge_Br_low_E,...,pyramidalization_Agranat-Radhakrishnan_C_low_E,NBO_LP_energy_Br_low_E,NBO_LP_occupancy_Br_low_E,%Vbur_C_2.0Å_low_E,%Vbur_Br_2.0Å_low_E,%Vbur_C_2.5Å_low_E,%Vbur_C_3.0Å_low_E,Sterimol_B1_C_Br(Å)_morfeus_low_E,Sterimol_B5_C_Br(Å)_morfeus_low_E,Sterimol_L_C_Br(Å)_morfeus_low_E
0,-0.678474,1.672894,2.252083,0.751543,-1.580533,-1.507735,1.917315,-0.849583,1.123424,1.346344,...,-0.453525,-0.513093,1.226132,-0.139070,0.656542,0.887134,1.480228,1.197531,-0.633539,-0.255508
1,0.937642,1.076414,0.384873,1.202188,0.167396,-0.007780,0.020992,0.195476,0.093161,0.094426,...,4.298754,0.386669,1.945188,-0.208346,2.074224,0.358493,1.058018,0.368943,1.087855,-0.205884
2,0.614418,1.630288,1.166496,1.371179,-0.308476,0.367897,-0.723816,1.000580,0.056367,-1.822574,...,-0.453525,1.398903,0.363259,0.209503,1.640863,1.057912,1.976420,4.060303,-0.867276,-0.057016
3,-0.355250,0.948597,1.253343,0.469891,-0.813623,-0.954804,-0.155131,-0.935549,1.528171,0.133548,...,-0.453525,0.386669,-0.643415,0.174849,1.879682,0.973486,1.758097,-0.501075,-0.855654,-0.129196
4,-0.355250,1.033808,1.340190,0.469891,-1.027908,-1.274257,-0.142399,-0.932373,1.712147,0.055303,...,-0.453525,0.386669,-0.068178,0.356709,1.036189,0.817099,1.360692,-0.575648,-0.884064,-0.133707
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5041,1.530217,-0.244362,-1.482336,0.638882,1.681749,1.608798,-0.573864,1.571986,-0.826715,-0.883635,...,-0.453525,0.836552,0.363259,0.300433,-0.768396,-0.041584,-0.375970,1.354962,-0.098267,-0.097618
5042,1.153124,-0.372179,-1.308642,0.357229,3.030501,2.417950,-0.041252,1.698710,-1.599413,-0.883635,...,0.074506,0.949021,0.363259,0.328580,-0.793803,-0.053091,-0.389502,0.373085,1.062674,-0.102129
5043,0.668289,-0.329573,-0.874407,0.131907,1.680047,0.298510,0.382432,0.257995,0.019572,-0.805390,...,0.074506,0.949021,0.075640,0.343726,-0.681289,-0.063646,-0.394915,-0.509361,-1.547182,-0.102129
5044,1.422476,0.394723,-0.744137,1.033196,0.468887,1.555045,-1.218232,0.945473,-0.495559,-0.844513,...,-0.057502,0.611610,0.363259,0.462803,-0.742265,1.247875,0.961938,0.870238,0.889630,-0.102129


In [5]:
# Define the VAE architecture
class VAE(nn.Module):
    def __init__(self, input_dim, hidden_dim, latent_dim):
        super(VAE, self).__init__()
        # Encoder
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc21 = nn.Linear(hidden_dim, latent_dim)  # Mean of the latent space
        self.fc22 = nn.Linear(hidden_dim, latent_dim)  # Log-variance of the latent space
        # Decoder
        self.fc3 = nn.Linear(latent_dim, hidden_dim)
        self.fc4 = nn.Linear(hidden_dim, input_dim)

    def encode(self, x):
        h1 = F.relu(self.fc1(x))
        return self.fc21(h1), self.fc22(h1)

    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def decode(self, z):
        h3 = F.relu(self.fc3(z))
        return torch.sigmoid(self.fc4(h3))

    def forward(self, x):
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        return self.decode(z), mu, logvar

In [6]:
# Define the loss function
def loss_function(recon_x, x, mu, logvar):
    BCE = F.binary_cross_entropy(recon_x, x, reduction='sum')
    # KLD is the Kullback-Leibler divergence between the learned latent distribution and a unit Gaussian
    KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return BCE + KLD

# Training the VAE
def train_vae(model, data_loader, optimizer, epochs=10):
    model.train()
    for epoch in range(epochs):
        train_loss = 0
        for batch in data_loader:
            batch = batch[0].to(device)
            optimizer.zero_grad()
            recon_batch, mu, logvar = model(batch)
            loss = loss_function(recon_batch, batch, mu, logvar)
            loss.backward()
            train_loss += loss.item()
            optimizer.step()
        print(f"Epoch {epoch + 1}, Loss: {train_loss / len(data_loader.dataset):.4f}")

In [7]:
# Data preparation
X_tensor = torch.tensor(X_scaled.values, dtype=torch.float32)  # Assuming X_scaled is your input data
dataset = TensorDataset(X_tensor)
data_loader = DataLoader(dataset, batch_size=64, shuffle=False)

for batch in data_loader:
    print(batch)
    break

# Define the dimensions
input_dim = X_scaled.shape[1]
hidden_dim = 128  # Size of the hidden layer
latent_dim = 2  # Choose the latent dimension (2 or 3)

# Initialize the model, optimizer
vae = VAE(input_dim, hidden_dim, latent_dim).to(device)
optimizer = optim.Adam(vae.parameters(), lr=1e-3)

# Train the model
train_vae(vae, data_loader, optimizer, epochs=50)

# After training, extract the low-dimensional representations
vae.eval()
with torch.no_grad():
    X_tensor = X_tensor.to(device)
    mu, _ = vae.encode(X_tensor)
    reduced_data = mu.cpu().numpy()  # Use mu (the mean) as the reduced representation

[tensor([[-0.6785,  1.6729,  2.2521,  ...,  1.1975, -0.6335, -0.2555],
        [ 0.9376,  1.0764,  0.3849,  ...,  0.3689,  1.0879, -0.2059],
        [ 0.6144,  1.6303,  1.1665,  ...,  4.0603, -0.8673, -0.0570],
        ...,
        [-0.1398,  1.1190,  1.2533,  ..., -0.5756,  1.1563, -0.0886],
        [ 0.0218,  1.1190,  1.1231,  ..., -0.4058, -0.8640, -0.1021],
        [ 0.9915, -0.1165, -0.9178,  ..., -0.5756,  0.4538, -0.3232]])]


../aten/src/ATen/native/cuda/Loss.cu:95: operator(): block: [2,0,0], thread: [32,0,0] Assertion `target_val >= zero && target_val <= one` failed.
../aten/src/ATen/native/cuda/Loss.cu:95: operator(): block: [2,0,0], thread: [33,0,0] Assertion `target_val >= zero && target_val <= one` failed.
../aten/src/ATen/native/cuda/Loss.cu:95: operator(): block: [2,0,0], thread: [34,0,0] Assertion `target_val >= zero && target_val <= one` failed.
../aten/src/ATen/native/cuda/Loss.cu:95: operator(): block: [2,0,0], thread: [35,0,0] Assertion `target_val >= zero && target_val <= one` failed.
../aten/src/ATen/native/cuda/Loss.cu:95: operator(): block: [2,0,0], thread: [36,0,0] Assertion `target_val >= zero && target_val <= one` failed.
../aten/src/ATen/native/cuda/Loss.cu:95: operator(): block: [2,0,0], thread: [37,0,0] Assertion `target_val >= zero && target_val <= one` failed.
../aten/src/ATen/native/cuda/Loss.cu:95: operator(): block: [2,0,0], thread: [41,0,0] Assertion `target_val >= zero && targe

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# define the dimensionalities of the reduced representation to study
# We will compare the performance of clustering based on reduced dimensionality representations of size 3 and 2 dimensions
# Because tSNE cannot convert high dimension data, we only use size 3 and 3 dimensions
dims = [3, 2]

# dictionary to store data at different levels of dimensionality reduction
dfs={}

np.random.seed(42)

# UMAP
n_neighbors = int(np.sqrt(X_scaled.shape[1]))
for dim in dims:
    key = f"umap{dim}"
    dfs[key] = pd.DataFrame(umap.UMAP(n_epochs=100, n_components=2, n_neighbors=2, 
                                      random_state=np.random.RandomState(42), 
                                      init="pca", verbose=False, n_jobs=1
                                      ).fit_transform(X_scaled), index=X_scaled.index)