In [139]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, random_split
import numpy as np

In [140]:
def one_hot_encode_smiles(smiles, charset, max_length=120):
    char_to_int = dict((c, i) for i, c in enumerate(charset))
    integer_encoded = [char_to_int[char] for char in smiles]
    if len(integer_encoded) > max_length:
        integer_encoded = integer_encoded[:max_length]
    else:
        integer_encoded = integer_encoded + [0] * (max_length - len(integer_encoded))
    onehot_encoded = np.zeros((max_length, len(charset)), dtype=np.float32)
    for i, val in enumerate(integer_encoded):
        onehot_encoded[i, val] = 1.0

    return onehot_encoded

In [141]:
def decode_smiles_from_one_hot(one_hot_encoded, charset):
    int_to_char = {i: c for i, c in enumerate(charset)}
    integer_decoded = np.argmax(one_hot_encoded, axis=1)
    chars = [int_to_char[idx] for idx in integer_decoded]
    smiles = "".join(chars).rstrip()

    return smiles

In [142]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data


class MolecularVAE(nn.Module):
    def __init__(self):
        super(MolecularVAE, self).__init__()

        self.conv_1 = nn.Conv1d(120, 9, kernel_size=9)
        self.conv_2 = nn.Conv1d(9, 9, kernel_size=9)
        self.conv_3 = nn.Conv1d(9, 10, kernel_size=11)
        self.linear_0 = nn.Linear(90, 435)
        self.linear_1 = nn.Linear(435, 292)
        self.linear_2 = nn.Linear(435, 292)

        self.linear_3 = nn.Linear(292, 292)
        self.gru = nn.GRU(292, 501, 3, batch_first=True)
        self.linear_4 = nn.Linear(501, 35)

        self.relu = nn.ReLU()
        self.softmax = nn.Softmax()

    def encode(self, x):
        x = self.relu(self.conv_1(x))
        x = self.relu(self.conv_2(x))
        x = self.relu(self.conv_3(x))
        x = x.view(x.size(0), -1)
        x = F.selu(self.linear_0(x))
        return self.linear_1(x), self.linear_2(x)

    def sampling(self, z_mean, z_logvar):
        epsilon = 1e-2 * torch.randn_like(z_logvar)
        return torch.exp(0.5 * z_logvar) * epsilon + z_mean

    def decode(self, z):
        z = F.selu(self.linear_3(z))
        z = z.view(z.size(0), 1, z.size(-1)).repeat(1, 120, 1)
        output, hn = self.gru(z)
        out_reshape = output.contiguous().view(-1, output.size(-1))
        y0 = F.softmax(self.linear_4(out_reshape), dim=1)
        y = y0.contiguous().view(output.size(0), -1, y0.size(-1))
        return y

    def forward(self, x):
        z_mean, z_logvar = self.encode(x)
        z = self.sampling(z_mean, z_logvar)
        return self.decode(z), z_mean, z_logvar

In [143]:
vae_model = MolecularVAE()
checkpoint = torch.load("/content/drive/MyDrive/checkpoints/checkpoint_epoch_60.pt")
vae_model.load_state_dict(checkpoint["model_state_dict"])
vae_model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vae_model = vae_model.to(device)
device

device(type='cuda')

In [144]:
class SMILESLogPDataset(Dataset):
    def __init__(self, smiles_list, charset, logp_list, vae_model):
        self.smiles_list = smiles_list
        self.charset = charset
        self.logp_list = logp_list
        self.vae_model = vae_model

    def __len__(self):
        return len(self.smiles_list)

    def __getitem__(self, idx):
        smiles = self.smiles_list[idx]
        encoded_smiles = one_hot_encode_smiles(smiles, self.charset)
        logp = self.logp_list[idx]
        return torch.FloatTensor(encoded_smiles), smiles, logp

In [145]:
df = pd.read_csv("250k_rndm_zinc_drugs_clean_3.csv")
df["smiles"] = df["smiles"].str.rstrip("\n")

charset = set("".join(df["smiles"].values.tolist()))
charset = sorted(list(charset))
charset.insert(0, " ")

In [146]:
dataset = SMILESLogPDataset(
    df["smiles"].values.tolist(), charset, df["logP"].values.tolist(), vae_model
)

In [147]:
test_ratio = 0.2

test_size = int(test_ratio * len(dataset))
train_size = len(dataset) - test_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

In [148]:
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=128, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=128)
torch.manual_seed(42)

<torch._C.Generator at 0x793253316750>

In [149]:
train_size

199564

In [150]:
class MLPRegressor(nn.Module):
    def __init__(self, input_dim):
        super(MLPRegressor, self).__init__()
        self.fc1 = nn.Linear(input_dim, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, 32)
        self.fc5 = nn.Linear(32, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        self.batch_norm1 = nn.BatchNorm1d(256)
        self.batch_norm2 = nn.BatchNorm1d(128)
        self.batch_norm3 = nn.BatchNorm1d(64)

    def forward(self, x):
        x = self.dropout(self.relu(self.batch_norm1(self.fc1(x))))
        x = self.dropout(self.relu(self.batch_norm2(self.fc2(x))))
        x = self.dropout(self.relu(self.batch_norm3(self.fc3(x))))
        x = self.dropout(self.relu(self.fc4(x)))
        x = self.fc5(x)
        return x

In [151]:
import torch.optim as optim

input_dim = 292
model = MLPRegressor(input_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 500
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for ohe, _, logp in train_loader:
        ohe = ohe.to(device).float()
        logp = logp.to(device).float().view(-1, 1)

        with torch.no_grad():
            z_mean, z_logvar = vae_model.encode(ohe)
            z = vae_model.sampling(z_mean, z_logvar)

        optimizer.zero_grad()
        outputs = model(z)
        loss = criterion(outputs, logp)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

    model.eval()
    model.eval()
    test_loss = 0
    with torch.no_grad():
        for ohe, _, logp in test_loader:
            ohe = ohe.to(device).float()
            logp = logp.to(device).float().view(-1, 1)

            z_mean, z_logvar = vae_model.encode(ohe)
            z = vae_model.sampling(z_mean, z_logvar)

            outputs = model(z)
            loss = criterion(outputs, logp)
            test_loss += loss.item()
    avg_test_loss = test_loss / len(test_loader)
    print(f"Average test loss: {avg_test_loss:.4f}")
    torch.save(
        {
            "epoch": epoch,
            "model_state_dict": model.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
            "train_loss": total_loss,
        },
        f"/content/drive/MyDrive/checkpoints/mlp_logp_regressor_epoch_{epoch}.pt",
    )
    print("Training completed and model saved.")

KeyboardInterrupt: 

In [152]:
input_dim = 292
model = MLPRegressor(input_dim)
checkpoint = torch.load(
    "/content/drive/MyDrive/checkpoints/mlp_logp_regressor_epoch_499.pt"
)
model.load_state_dict(checkpoint["model_state_dict"])
model.eval()

MLPRegressor(
  (fc1): Linear(in_features=292, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=128, bias=True)
  (fc3): Linear(in_features=128, out_features=64, bias=True)
  (fc4): Linear(in_features=64, out_features=32, bias=True)
  (fc5): Linear(in_features=32, out_features=1, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.2, inplace=False)
  (batch_norm1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batch_norm2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batch_norm3): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)

In [153]:
from logging import log
import torch.optim as optim

lr = 0.001
num_iterations = 1000

model.eval()
z = torch.randn(1, input_dim, requires_grad=True)
z_history = torch.empty((num_iterations // 100 + 1, 1, input_dim))
logp_history = []
optimizer = optim.Adam([z], lr=lr)
for i in range(num_iterations):
    optimizer.zero_grad()

    predicted_logP = model(z)
    loss = predicted_logP

    loss.backward()

    optimizer.step()

    if (i + 1) % 100 == 0:
        print(f"Iteration {i+1}, Predicted LogP: {predicted_logP.item():.4f}")
        logp_history.append(predicted_logP.item())
        z_history[i // 100] = z.detach().clone()

z_opt = z.detach()

with torch.no_grad():
    final_logP = model(z_opt)
print(f"Final optimized LogP: {final_logP.item():.4f}")
print("Optimized input vector:")
print(z_opt)

Iteration 100, Predicted LogP: 1.8741
Iteration 200, Predicted LogP: -1.1236
Iteration 300, Predicted LogP: -2.4987
Iteration 400, Predicted LogP: -3.7894
Iteration 500, Predicted LogP: -5.3355
Iteration 600, Predicted LogP: -8.9677
Iteration 700, Predicted LogP: -22.2083
Iteration 800, Predicted LogP: -41.4113
Iteration 900, Predicted LogP: -58.4973
Iteration 1000, Predicted LogP: -71.5192
Final optimized LogP: -71.6371
Optimized input vector:
tensor([[ 4.4355e-01, -4.3069e-01,  7.0372e-01,  1.1605e+00, -1.3759e+00,
          2.9366e+00, -1.5973e+00,  5.9685e-01, -4.9110e-01, -1.7654e-01,
          1.2772e+00, -1.9186e+00,  1.4255e-01,  1.2807e+00,  9.7490e-01,
          2.4773e+00,  1.4204e+00, -2.3928e-03, -5.6430e-02, -1.2289e-02,
         -3.0847e-01,  1.3775e+00, -4.1097e-01,  7.7727e-02,  2.9289e-01,
         -1.6312e+00,  4.5520e-01, -8.6815e-02, -1.6836e-01,  2.7462e+00,
         -1.3220e+00,  4.4604e-01, -2.0065e+00, -4.5698e-01,  6.6975e-01,
          5.0952e-01,  6.4281e-01

In [154]:
z_opt = z_opt.cpu()
vae_model = vae_model.cpu()
ohe_opt = vae_model.decode(z_opt)

In [155]:
ohe_opt

tensor([[[6.2733e-04, 5.4162e-03, 1.0074e-04,  ..., 6.6311e-05,
          1.6166e-03, 7.7639e-05],
         [1.8233e-05, 3.4739e-04, 1.1427e-03,  ..., 3.2508e-09,
          4.3327e-06, 3.4078e-08],
         [7.0299e-07, 1.0436e-05, 1.2286e-04,  ..., 9.1528e-11,
          1.7173e-06, 1.0077e-10],
         ...,
         [7.6942e-07, 5.0528e-06, 2.2785e-04,  ..., 6.6574e-09,
          7.5214e-07, 3.3726e-09],
         [2.0884e-06, 1.1549e-06, 5.7578e-03,  ..., 4.7579e-09,
          5.8374e-07, 4.6483e-09],
         [1.2700e-06, 2.1563e-06, 8.8738e-03,  ..., 1.6911e-09,
          2.3550e-07, 1.4530e-09]]], grad_fn=<ViewBackward0>)

In [156]:
generated_smiles = [
    decode_smiles_from_one_hot(ohe.detach().cpu().numpy(), charset) for ohe in ohe_opt
]

In [157]:
generated_smiles

['C/PP[P[PP[[[PPPPPPPPP[[PPPP[[[PPPP[[PPPP[[PPP][[PPP][[PPP][[PPPP[[PPPP[[PPPP[[PPPP[[PPPP][PPPP][[PPP][[PPPP[[PPPP[[PPPP[']

In [158]:
vae_model = vae_model.cpu()
ohe_history = vae_model.decode(z_history.cpu())

In [159]:
generated_smiles = [
    decode_smiles_from_one_hot(ohe.detach().cpu().numpy(), charset)
    for ohe in ohe_history
]

In [160]:
generated_smiles

['CCCCCC@](CCCCCCCCCCCCCCCC1[CC@]([[[[)[[)[[@@[[C@@H[[[#@@()####-#',
 'CCCCCC@](CCCCCCCCCCCCC@@1[C@@]([N)[([C@]([))[[[C@[[@@@@)##C)C',
 'CCCCP@]]NCCCCCCCCCCCCCC@1[[C@]([[N+][[N@[([C@[([C@@@[C@@]()C',
 'CCCP@@](NCCCCCCCCCCCCCCC1[[@H]([NH+][[@@]([O)[[[C@[[[C@@H((CCC',
 'CCPP@H](CCCCCCCCCCCCCCCCCC[[[[NH]([C@H][[NH+][[[H[[[[@[][C@HH]]',
 'C//CC@]/NCCCCCCCCCCCCC@]([C@H]([C@H][CC@]([C@H]([C@H](CC@H]CCC@H]CCC@H][CC@]([C@H]([C@H](CC@H][CC@]][C@H]([C@H](CC@H][CC',
 'C/[C@@@](CC@@H[[N@@H[[N@HH[[N@@H[[N@@H[[N@@H[[N@@H[[N@@H[[N@@H[[N@HH[[N@HH[[N@H][[N@H][[N@H][[N@H][[NHH][[NHH][[NHH][[NH',
 'CN[[@@@][[[@@H[[[@HH[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[',
 'C/P[[P[[[[[[PP[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[',
 'C/PP[P[PP[[[PPPPPPPPP[[PPPP[[[PPPP[[PPPP[[PPP][[PPP][[PPP][[PPPP[[PPPP[[PPPP[[PPPP[[PPPP][PPPP][[PPP][[PPPP[[PPPP[[PPPP[',
 '56666666O66555555555555555555555555555

In [161]:
ohe, _, logP = dataset[0]
ohe = ohe.unsqueeze(0)
ohe = ohe.to(device)
vae_model.to(device)
with torch.no_grad():
    z_mean, z_logvar = vae_model.encode(ohe)
    z = vae_model.sampling(z_mean, z_logvar)
z.requires_grad = True

In [162]:
ohe_opt = vae_model.decode(z)
init_smiles = [
    decode_smiles_from_one_hot(ohe.detach().cpu().numpy(), charset) for ohe in ohe_opt
]
init_smiles, logP

(['CC(C)(C)c1ccc2ccc(CC(=O)Nc3ccccc3F)ccc1'], 5.0506)

In [163]:
from logging import log
import torch.optim as optim

lr = 0.001
num_iterations = 1000
model.eval()
z_history = torch.empty((num_iterations // 100 + 1, 1, input_dim))
logp_history = []
optimizer = optim.Adam([z], lr=lr)
for i in range(num_iterations):
    optimizer.zero_grad()
    model.to(device)
    predicted_logP = model(z)
    loss = predicted_logP.sum()

    loss.backward()

    optimizer.step()

    if (i + 1) % 100 == 0:
        print(
            f"Iteration {i+1}, Loss: {loss}, Predicted LogP: {predicted_logP.item():.4f}"
        )
        logp_history.append(predicted_logP.item())
        z_history[i // 100] = z.detach().clone()

z_opt = z.detach()

with torch.no_grad():
    final_logP = model(z_opt)
print(f"Final optimized LogP: {final_logP.item():.4f}")
print("Optimized input vector:")
print(z_opt)

Iteration 100, Loss: -0.010578632354736328, Predicted LogP: -0.0106
Iteration 200, Loss: -0.8642542362213135, Predicted LogP: -0.8643
Iteration 300, Loss: -1.1348967552185059, Predicted LogP: -1.1349
Iteration 400, Loss: -7.113698959350586, Predicted LogP: -7.1137
Iteration 500, Loss: -17.729915618896484, Predicted LogP: -17.7299
Iteration 600, Loss: -28.3962459564209, Predicted LogP: -28.3962
Iteration 700, Loss: -38.10822677612305, Predicted LogP: -38.1082
Iteration 800, Loss: -47.125694274902344, Predicted LogP: -47.1257
Iteration 900, Loss: -55.94256591796875, Predicted LogP: -55.9426
Iteration 1000, Loss: -66.79432678222656, Predicted LogP: -66.7943
Final optimized LogP: -66.9037
Optimized input vector:
tensor([[ 6.5844e-02,  7.8179e-01, -1.7205e-01, -3.4570e-01,  6.6610e-01,
         -5.8852e-01, -7.7552e-01,  9.3110e-01, -9.7632e-02,  5.6445e-01,
          6.5937e-01,  6.3253e-01,  1.1995e-02,  4.6673e-01,  4.2815e-01,
          1.0256e+00, -4.5978e-01, -6.4501e-01,  2.1953e-01,

In [164]:
vae_model = vae_model.cpu()
ohe_history = vae_model.decode(z_history.cpu())

In [165]:
generated_smiles = [
    decode_smiles_from_one_hot(ohe.detach().cpu().numpy(), charset)
    for ohe in ohe_history
]

In [166]:
init_smiles[0], generated_smiles

('CC(C)(C)c1ccc2ccc(CC(=O)Nc3ccccc3F)ccc1',
 ['CC(C)(C)c1ncc2ccc(CC(=O)NNc3cccc3F)c21',
  'CC(C)(C)n1nnc2ncc(CC(=O)NNc3cccc3F)c2',
  'NC(C)(CCn1nnc2ncc1CC(=O)NNc33ccc3F)s1',
  'NCCC1[n@H]c2nnnn11CCC=O)[NH]]CC33nN33',
  'O=C1[[)[nH]][H]nc1CC1=O[NHH]]2NN=NN3',
  'O=CCC1[O=][[[=H]==CCC===[NNHHNN=N[NHHNN',
  'O=CCC1[O[=[[[[HH]11CCCC==[[NHHHH[[[NNHHNNN',
  'O=CC[[[][][[[[[[[[[H111CCCCCC[NNHH][NNHHHHHHNN',
  'O=CC[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[',
  'O=OO[H[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[',
  ''])