In [1]:
# Dependencies

import json
import os
import random
import re
from collections import Counter

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from rdkit import Chem
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader

In [3]:
# Tokenizer: handles Cl/Br, bracket atoms, and %nn tokens
# Alternatives are tried left to right, so the multi-character tokens come first:
# two-digit ring closures (%nn), bracket atoms ([...]), the halogens Br / Cl,
# and finally any single remaining character (newlines are not matched by ".").
TOKENIZER_RE = re.compile(r"(\%\d{2}|\[[^\]]+\]|Br|Cl|.)")


def tokenize_smiles(smi: str):
    """Split a SMILES string into a list of token strings using ``TOKENIZER_RE``."""
    return [match.group(1) for match in TOKENIZER_RE.finditer(smi)]

def build_vocab(smiles_list, min_freq=1):
    """Build a token vocabulary from an iterable of SMILES strings.

    Index 0 is reserved for ``<PAD>`` and index 1 for ``<UNK>``. The remaining
    tokens follow in order of decreasing frequency; tokens with equal counts
    keep the order in which they were first seen.

    Args:
        smiles_list: iterable of SMILES strings; each one is split with
            ``tokenize_smiles``.
        min_freq: tokens that occur fewer than ``min_freq`` times are left out.

    Returns:
        ``(stoi, itos)``: a token -> index dict and the index -> token list.
    """
    token_counts = Counter()
    for smi in smiles_list:
        token_counts.update(tokenize_smiles(smi))

    itos = ["<PAD>", "<UNK>"]
    # Counter keys are unique, so the only possible duplicates are the reserved
    # tokens themselves; checking a small set avoids rescanning the growing list.
    reserved = set(itos)
    for tok, freq in token_counts.most_common():
        if freq < min_freq:
            break  # most_common() is sorted by descending count: nothing left to add
        if tok not in reserved:
            itos.append(tok)

    stoi = {tok: idx for idx, tok in enumerate(itos)}
    return stoi, itos

def encode_smiles(smi, stoi, max_len):
    """Encode a SMILES string as an array of ``max_len`` token ids.

    The string is split with ``tokenize_smiles`` and cut to at most ``max_len``
    tokens. Each token is looked up in ``stoi`` (tokens that are missing map to
    ``stoi["<UNK>"]``), and the result is right-padded with ``stoi["<PAD>"]``.

    Returns:
        1-D ``np.int64`` array of token ids.
    """
    token_ids = []
    for tok in tokenize_smiles(smi)[:max_len]:
        token_ids.append(stoi.get(tok, stoi["<UNK>"]))

    n_missing = max_len - len(token_ids)
    if n_missing > 0:
        token_ids.extend([stoi["<PAD>"]] * n_missing)
    return np.array(token_ids, dtype=np.int64)

In [4]:
# CNN Model


class SmilesCNN(nn.Module):
    """Multi-kernel 1D CNN that maps a batch of token-id sequences to one scalar each.

    Token ids are embedded, passed through parallel ``Conv1d`` branches (one per
    kernel size), ReLU-activated and max-pooled over the sequence axis. The
    pooled features are concatenated and fed to a two-layer MLP head.

    Args:
        vocab_size: number of rows in the embedding table.
        emb_dim: embedding dimension (input channels of every conv branch).
        max_len: accepted for backward compatibility only; the network pools
            over whatever sequence length it is given and does not use it.
        dropout: dropout probability applied to the pooled features and inside
            the head.
        n_filters: output channels of every conv branch.
        kernel_sizes: one conv branch is created per entry, in the given order,
            each with ``padding = kernel_size // 2``.
        hidden_dim: width of the hidden layer in the regression head.

    Raises:
        ValueError: if ``kernel_sizes`` is empty.

    With the default arguments the layers (and therefore the ``state_dict``
    keys and shapes) are the same as the previous hard-coded 3/5/7 x 128 design.
    """

    def __init__(self, vocab_size, emb_dim=128, max_len=256, dropout=0.2,
                 n_filters=128, kernel_sizes=(3, 5, 7), hidden_dim=256):
        super().__init__()
        kernel_sizes = tuple(kernel_sizes)
        if not kernel_sizes:
            raise ValueError("kernel_sizes must contain at least one kernel size")

        # Index 0 is treated as padding by the embedding layer.
        self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        # Modules are created in the same order as before so that seeded
        # initialisation consumes the RNG identically for the default config.
        self.convs = nn.ModuleList([
            nn.Conv1d(emb_dim, n_filters, kernel_size=k, padding=k // 2)
            for k in kernel_sizes
        ])
        self.act = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.head = nn.Sequential(
            nn.Linear(n_filters * len(kernel_sizes), hidden_dim), nn.ReLU(), nn.Dropout(dropout),
            nn.Linear(hidden_dim, 1)
        )

    def forward(self, x):          # x: (B, L) integer token ids
        e = self.emb(x)            # (B, L, E)
        e = e.transpose(1, 2)      # (B, E, L) - Conv1d expects channels before length
        # Global max over the sequence axis gives one (B, n_filters) tensor per branch.
        feats = [torch.amax(self.act(conv(e)), dim=-1) for conv in self.convs]
        h = torch.cat(feats, dim=1)
        h = self.dropout(h)
        y = self.head(h).squeeze(1)  # (B,)
        return y

In [7]:
# Dataset and evaluation 


class SmilesDataset(Dataset):
    """Dataset of pre-encoded SMILES strings with optional regression targets.

    Every SMILES string is encoded once, at construction time, via
    ``encode_smiles(s, stoi, max_len)``. When ``y`` is ``None`` the dataset
    yields only the token-id tensor; otherwise it yields ``(ids, target)``.
    """

    def __init__(self, smiles, y, stoi, max_len):
        encoded = []
        for smi in smiles:
            encoded.append(encode_smiles(smi, stoi, max_len))
        self.X = encoded

        if y is None:
            self.y = None
        else:
            self.y = np.array(y, dtype=np.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, i):
        token_ids = torch.from_numpy(self.X[i])
        if self.y is not None:
            return token_ids, torch.tensor(self.y[i])
        return token_ids

@torch.no_grad()
def evaluate(model, loader, device):
    """Run ``model`` over ``loader`` and return ``(rmse, mae, r2)``.

    The model is switched to eval mode and is left in that mode. ``loader``
    must yield ``(inputs, targets)`` pairs; inputs are moved to ``device``,
    predictions are brought back to the CPU, and all metrics are computed on
    the concatenated NumPy arrays.
    """
    model.eval()
    targets, preds = [], []
    for xb, yb in loader:
        batch_pred = model(xb.to(device))
        preds.append(batch_pred.cpu().numpy())
        targets.append(yb.numpy())

    y = np.concatenate(targets)
    p = np.concatenate(preds)
    err = y - p
    rmse = float(np.sqrt((err ** 2).mean()))   # root mean squared error
    mae = float(np.abs(err).mean())            # mean absolute error
    r2 = r2_score(y, p)                        # coefficient of determination (sklearn)
    return rmse, mae, r2

In [8]:
# Training

# ---- Config ----
# Dataset location. Set the AQSOLDB_CSV environment variable to point at the CSV
# on another machine; the original local path is kept as the fallback so
# existing runs are unaffected.
CSV_PATH = os.environ.get("AQSOLDB_CSV") or "/Users/amrithaa/Downloads/AqSolDB-master/data/dataset-A.csv"
SMILES_COL = "SMILES"       # column holding the SMILES strings
TARGET_COL = "Solubility"   # column holding the regression target
MAX_LEN = 256               # token length passed to the datasets and the model
EPOCHS = 30
BATCH_SIZE = 128
LR = 2e-3
DROPOUT = 0.2
SEED = 42

# Seed every RNG used below (Python, NumPy, PyTorch) for repeatable runs.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

# ---- Load data ----
df = pd.read_csv(CSV_PATH)
assert SMILES_COL in df.columns and TARGET_COL in df.columns, f"Missing columns. Got: {list(df.columns)}"

# Keep only rows whose SMILES RDKit can parse, storing the string RDKit writes
# back out (MolFromSmiles -> MolToSmiles). A target that cannot be converted to
# float is recorded as NaN so the finite-mask below drops the row.
smiles, y = [], []
for s, t in zip(df[SMILES_COL].astype(str), df[TARGET_COL]):
    m = Chem.MolFromSmiles(s)
    if not m:
        continue
    smiles.append(Chem.MolToSmiles(m))
    try:
        y.append(float(t))
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are treated as "missing target"; anything
        # else (e.g. KeyboardInterrupt) must propagate instead of being hidden.
        y.append(np.nan)

smiles = np.array(smiles)
y = np.array(y, dtype=np.float32)
mask = np.isfinite(y)
smiles, y = smiles[mask], y[mask]
print(f"Samples after cleaning: {len(smiles)}")

# ---- Split ---- (simple random 80/10/10 for a quick start)
X_train, X_tmp, y_train, y_tmp = train_test_split(smiles, y, test_size=0.2, random_state=SEED)
X_val, X_test, y_val, y_test   = train_test_split(X_tmp, y_tmp,  test_size=0.5, random_state=SEED)

# ---- Vocab ----
# Built from the training split only, so tokens seen only in val/test map to <UNK>.
stoi, itos = build_vocab(X_train.tolist())
with open("vocab.json", "w") as f:
    json.dump(itos, f)
print("Vocab size:", len(itos))

# ---- DataLoaders ----
train_ds = SmilesDataset(X_train, y_train, stoi, MAX_LEN)
val_ds   = SmilesDataset(X_val,   y_val,   stoi, MAX_LEN)
test_ds  = SmilesDataset(X_test,  y_test,  stoi, MAX_LEN)

train_ld = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_ld   = DataLoader(val_ds,   batch_size=BATCH_SIZE, shuffle=False)
test_ld  = DataLoader(test_ds,  batch_size=BATCH_SIZE, shuffle=False)

# ---- Model/opt ----
model = SmilesCNN(vocab_size=len(itos), emb_dim=128, max_len=MAX_LEN, dropout=DROPOUT).to(device)
opt   = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=1e-4)
crit  = nn.MSELoss()

# Early stopping: stop after `patience` consecutive epochs without the validation
# RMSE improving by more than 1e-4; the best weights are restored afterwards.
best_rmse, best_state, patience, waited = float("inf"), None, 8, 0

for ep in range(1, EPOCHS+1):
    model.train()  # evaluate() leaves the model in eval mode, so re-enable training each epoch
    losses = []
    for xb, yb in train_ld:
        xb, yb = xb.to(device), yb.to(device)
        opt.zero_grad()
        pred = model(xb)
        loss = crit(pred, yb)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 2.0)
        opt.step()
        losses.append(loss.item())

    rmse, mae, r2 = evaluate(model, val_ld, device)
    print(f"Epoch {ep:03d} | train_loss {np.mean(losses):.4f} | val_RMSE {rmse:.4f} MAE {mae:.4f} R2 {r2:.3f}")

    if rmse < best_rmse - 1e-4:
        best_rmse, waited = rmse, 0
        # state_dict() returns tensors that share storage with the live parameters,
        # and .cpu() returns the very same tensor when it is already on the CPU.
        # Without clone() the "snapshot" would keep changing with every optimizer
        # step and the restore/save below would use the last-epoch weights.
        best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
    else:
        waited += 1
        if waited >= patience:
            print("Early stopping."); break

if best_state is not None:
    model.load_state_dict({k: v.to(device) for k,v in best_state.items()})
    torch.save(best_state, "best_cnn.pt")
    print("Saved best weights to best_cnn.pt")

# ---- Final metrics ---- (computed with the restored best weights)
tr_rmse, tr_mae, tr_r2 = evaluate(model, train_ld, device)
va_rmse, va_mae, va_r2 = evaluate(model, val_ld, device)
te_rmse, te_mae, te_r2 = evaluate(model, test_ld, device)

print("\nFinal metrics")
print(f"Train: RMSE {tr_rmse:.4f} | MAE {tr_mae:.4f} | R2 {tr_r2:.3f}")
print(f"Val:   RMSE {va_rmse:.4f} | MAE {va_mae:.4f} | R2 {va_r2:.3f}")
print(f"Test:  RMSE {te_rmse:.4f} | MAE {te_mae:.4f} | R2 {te_r2:.3f}")


Device: cpu




Samples after cleaning: 6110
Vocab size: 187
Epoch 001 | train_loss 4.9289 | val_RMSE 1.8231 MAE 1.3443 R2 0.483
Epoch 002 | train_loss 2.9659 | val_RMSE 1.5605 MAE 1.1748 R2 0.621
Epoch 003 | train_loss 2.5074 | val_RMSE 1.5401 MAE 1.1256 R2 0.631
Epoch 004 | train_loss 2.2254 | val_RMSE 1.4087 MAE 1.0613 R2 0.691
Epoch 005 | train_loss 2.0576 | val_RMSE 1.4753 MAE 1.1051 R2 0.662
Epoch 006 | train_loss 1.9042 | val_RMSE 1.4064 MAE 1.0357 R2 0.692
Epoch 007 | train_loss 1.7490 | val_RMSE 1.3337 MAE 0.9579 R2 0.723
Epoch 008 | train_loss 1.6761 | val_RMSE 1.3415 MAE 0.9540 R2 0.720
Epoch 009 | train_loss 1.6966 | val_RMSE 1.3492 MAE 0.9645 R2 0.717
Epoch 010 | train_loss 1.5835 | val_RMSE 1.3815 MAE 0.9865 R2 0.703
Epoch 011 | train_loss 1.5318 | val_RMSE 1.4271 MAE 1.0360 R2 0.683
Epoch 012 | train_loss 1.6830 | val_RMSE 1.3157 MAE 0.9516 R2 0.731
Epoch 013 | train_loss 1.3974 | val_RMSE 1.2922 MAE 0.9310 R2 0.740
Epoch 014 | train_loss 1.3697 | val_RMSE 1.3260 MAE 0.9578 R2 0.727
Epo