In [1]:
#!pip install /kaggle/input/rdkit-2025-3-3-cp311/rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl

In [2]:
import pandas as pd
from tqdm.notebook import tqdm
tqdm.pandas()
import numpy as np
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error

import os
import pickle
import gc

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.optim import AdamW

from transformers import RobertaModel, RobertaTokenizer, get_linear_schedule_with_warmup

import warnings
warnings.filterwarnings('ignore')

In [3]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [4]:
# train = pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/train.csv')
# test = pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/test.csv')

In [5]:
train_df.shape

(7973, 7)

In [6]:
train_df.head()

Unnamed: 0,id,SMILES,Tg,FFV,Tc,Density,Rg
0,87817,*CC(*)c1ccccc1C(=O)OCCCCCC,,0.374645,0.205667,,
1,106919,*Nc1ccc([C@H](CCC)c2ccc(C3(c4ccc([C@@H](CCC)c5...,,0.37041,,,
2,388772,*Oc1ccc(S(=O)(=O)c2ccc(Oc3ccc(C4(c5ccc(Oc6ccc(...,,0.37886,,,
3,519416,*Nc1ccc(-c2c(-c3ccc(C)cc3)c(-c3ccc(C)cc3)c(N*)...,,0.387324,,,
4,539187,*Oc1ccc(OC(=O)c2cc(OCCCCCCCCCOCC3CCCN3c3ccc([N...,,0.35547,,,


In [7]:
train_df.isnull().sum()

id            0
SMILES        0
Tg         7462
FFV         943
Tc         7236
Density    7360
Rg         7359
dtype: int64

In [8]:
train_df.describe(include="all")

Unnamed: 0,id,SMILES,Tg,FFV,Tc,Density,Rg
count,7973.0,7973,511.0,7030.0,737.0,613.0,614.0
unique,,7973,,,,,
top,,*CC(*)c1ccccc1C(=O)OCCCCCC,,,,,
freq,,1,,,,,
mean,1080050000.0,,96.452314,0.367212,0.256334,0.985484,16.419787
std,621824100.0,,111.228279,0.029609,0.089538,0.146189,4.60864
min,87817.0,,-148.029738,0.226992,0.0465,0.748691,9.728355
25%,537664100.0,,13.674509,0.349549,0.186,0.890243,12.540328
50%,1079079000.0,,74.040183,0.364264,0.236,0.948193,15.052194
75%,1621708000.0,,161.147595,0.38079,0.3305,1.062096,20.411067


In [9]:
missing = train_df.isnull().sum()[['Tg', 'FFV', 'Tc', 'Density', 'Rg']]
missing

Tg         7462
FFV         943
Tc         7236
Density    7360
Rg         7359
dtype: int64

### Prepare Dataset

In [10]:
class ChemBERTDS(Dataset):
    def __init__(self, df, tokenizer, targ_col=None, max_len=None):
        self.smiles = df['SMILES'].tolist()
        self.tokenizer = tokenizer
        if max_len is None:
            self.max_len = tokenizer.model_max_length
        else:
            self.max_len = max_len
        self.has_target = targ_col is not None
        if self.has_target:
            self.target = df[targ_col].values.astype('float32')
        
    def __len__(self):
        return len(self.smiles)
    
    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.smiles[idx],
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        
        items = {k:v.squeeze(0) for k, v in encoding.items()}
        if self.has_target:
            items['target'] = torch.tensor(self.target[idx])
        
        return items

### Writing model class

In [11]:
class chemBERTModel(nn.Module):
    def __init__(self, base_model, out_dim=1):
        super().__init__()
        self.base_model = base_model
        self.dropout = nn.Dropout(0.2)
        self.regressor = nn.Linear(base_model.config.hidden_size, out_dim)
        
    def forward(self, input_ids, attention_mask):
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        cls_tokens = outputs.last_hidden_state[:,0]
        return self.regressor(self.dropout(cls_tokens)).squeeze(1)

### Training and evaluation

In [12]:
def train(model, dl, loss_fn, opt, sched):
    model.train()
    total_loss = 0
    n = 0
    for batch in dl:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['target'].to(device)
        preds = model(input_ids, attention_mask)
        loss = loss_fn(preds, targets)
        loss.backward()
        opt.step()
        sched.step()
        opt.zero_grad()
        total_loss += loss.item() * len(targets)
        n += len(targets)
    return total_loss / n

def eval(model, dl, loss_fn):
    model.eval()
    total_loss, n = 0, 0
    all_preds, all_targs = [], []
    with torch.no_grad():
        for batch in dl:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['target'].to(device)
            preds = model(input_ids, attention_mask)
            loss = loss_fn(preds, targets)
            total_loss += loss.item() * len(targets)
            n += len(targets)
            all_preds.extend(preds.detach().cpu().numpy())
            all_targs.extend(targets.detach().cpu().numpy())
    return total_loss / n, all_preds, all_targs

### Training function

In [13]:
def train_chemberta(df, target, base_model, tokenizer, n_epochs=30, save_dir = 'saved_models_chemberta', patience=5):
    os.makedirs(save_dir, exist_ok=True)
    
    df_clean = df[['SMILES', target]].dropna()
        
    # scale target
    y_scaler = StandardScaler()
    df_clean[target] = y_scaler.fit_transform(df_clean[[target]])
    
    # save scaler
    with open(os.path.join(save_dir, f"{target}_scaler.pkl"), 'wb') as f:
        pickle.dump(y_scaler, f)
        
    train_data, val_data = train_test_split(df_clean, test_size=0.2, random_state=42)
    train_ds = ChemBERTDS(train_data, tokenizer, target)
    val_ds = ChemBERTDS(val_data, tokenizer, target)
    train_dl = DataLoader(train_ds, batch_size=16, shuffle=True)
    val_dl = DataLoader(val_ds, batch_size=16, shuffle=False)
    
    model = chemBERTModel(base_model).to(device)
    opt = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
    scheduler = get_linear_schedule_with_warmup(
        opt,
        num_warmup_steps=int(0.1 * len(train_dl) * n_epochs),
        num_training_steps=len(train_dl) * n_epochs
    )
    loss_fn = nn.L1Loss()
    
    best_val_loss = float("inf")
    epoch_no_improve = 0
    
    for epoch in range(n_epochs):
        train_loss = train(model, train_dl, loss_fn, opt, scheduler)
        val_loss, preds, targs = eval(model, val_dl, loss_fn)
        print(f"Epoch {epoch+1}/{n_epochs} | Train MAE: {train_loss:.4f}, Val MAE: {val_loss:.4f}")
        
        if val_loss <= best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), os.path.join(save_dir, f"{target}_model.pt"))
            epoch_no_improve = 0
        else:
            epoch_no_improve += 1
            if epoch_no_improve >= patience:
                print("Early stopping triggered!")
                break
        
    model.load_state_dict(torch.load(os.path.join(save_dir, f"{target}_model.pt")))
    
    # Free up memory
    # del train_dl, val_dl, opt, scheduler
    # torch.cuda.empty_cache()
    # gc.collect()
    
    return model, tokenizer, y_scaler

In [14]:
def predict_chemberta(df, target, model, tokenizer, scaler):
    test_ds = ChemBERTDS(df, tokenizer)
    test_loader = DataLoader(test_ds, batch_size=16, shuffle=False)
    
    model.eval()
    preds = []
    
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            outputs = model(input_ids, attention_mask)
            preds.extend(outputs.detach().cpu().numpy())
    preds = scaler.inverse_transform(np.array(preds).reshape(-1, 1)).flatten()
    return preds

In [16]:
MODEL_NAME = "seyonec/ChemBERTa-zinc-base-v1"
device = 'cuda' if torch.cuda.is_available() else 'cpu'
targets = ['Tg', 'FFV', 'Tc', 'Density', 'Rg']

tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)

for target in targets:
    print(f'Working on predicting: {target}')
    
    base_model = RobertaModel.from_pretrained(MODEL_NAME)
    
    model, tokenizer, y_scaler = train_chemberta(
        df=train_df, 
        target=target, 
        base_model=base_model, 
        tokenizer=tokenizer)
    
    test_df[target_col] = predict_chemberta(
        df=test_df, 
        target=target, 
        model=model, 
        tokenizer=tokenizer, 
        scaler=y_scaler
    )
    
    # Clean up to be safe
    del model, base_model, y_scaler
    gc.collect()
    torch.cuda.empty_cache()

In [17]:
model, tokenizer, y_scaler = train_chemberta(train_df, 'Tg', base_model, tokenizer)

Epoch 1/30 | Train MAE: 0.8943, Val MAE: 0.6919
Epoch 2/30 | Train MAE: 0.7500, Val MAE: 0.6409
Epoch 3/30 | Train MAE: 0.6698, Val MAE: 0.5810
Epoch 4/30 | Train MAE: 0.5999, Val MAE: 0.5775
Epoch 5/30 | Train MAE: 0.5657, Val MAE: 0.6090
Epoch 6/30 | Train MAE: 0.5259, Val MAE: 0.5221
Epoch 7/30 | Train MAE: 0.4592, Val MAE: 0.6000
Epoch 8/30 | Train MAE: 0.4438, Val MAE: 0.5490
Epoch 9/30 | Train MAE: 0.4569, Val MAE: 0.5878
Epoch 10/30 | Train MAE: 0.4427, Val MAE: 0.5472
Epoch 11/30 | Train MAE: 0.3995, Val MAE: 0.6170
Early stopping triggered!


In [18]:
test_df['Tg'] = predict_chemberta(test_df, 'Tg', model, tokenizer, y_scaler)

In [19]:
model, tokenizer, y_scaler = train_chemberta(train_df, 'FFV', base_model, tokenizer)

Epoch 1/30 | Train MAE: 0.6394, Val MAE: 0.4840
Epoch 2/30 | Train MAE: 0.4991, Val MAE: 0.3891
Epoch 3/30 | Train MAE: 0.4230, Val MAE: 0.3690
Epoch 4/30 | Train MAE: 0.3627, Val MAE: 0.3696
Epoch 5/30 | Train MAE: 0.3242, Val MAE: 0.2978
Epoch 6/30 | Train MAE: 0.2810, Val MAE: 0.2765
Epoch 7/30 | Train MAE: 0.2647, Val MAE: 0.2782
Epoch 8/30 | Train MAE: 0.2432, Val MAE: 0.2502
Epoch 9/30 | Train MAE: 0.2202, Val MAE: 0.2835
Epoch 10/30 | Train MAE: 0.2097, Val MAE: 0.2460
Epoch 11/30 | Train MAE: 0.2005, Val MAE: 0.2238
Epoch 12/30 | Train MAE: 0.1879, Val MAE: 0.2264
Epoch 13/30 | Train MAE: 0.1759, Val MAE: 0.2413
Epoch 14/30 | Train MAE: 0.1724, Val MAE: 0.2096
Epoch 15/30 | Train MAE: 0.1655, Val MAE: 0.2298
Epoch 16/30 | Train MAE: 0.1569, Val MAE: 0.2403
Epoch 17/30 | Train MAE: 0.1520, Val MAE: 0.2049
Epoch 18/30 | Train MAE: 0.1468, Val MAE: 0.2186
Epoch 19/30 | Train MAE: 0.1410, Val MAE: 0.2057
Epoch 20/30 | Train MAE: 0.1367, Val MAE: 0.2106
Epoch 21/30 | Train MAE: 0.13

In [20]:
test_df['FFV'] = predict_chemberta(test_df, 'FFV', model, tokenizer, y_scaler)

In [21]:
model, tokenizer, y_scaler = train_chemberta(train_df, 'Tc', base_model, tokenizer)

Epoch 1/30 | Train MAE: 0.9367, Val MAE: 0.7300
Epoch 2/30 | Train MAE: 0.5929, Val MAE: 0.3859
Epoch 3/30 | Train MAE: 0.4447, Val MAE: 0.3686
Epoch 4/30 | Train MAE: 0.4226, Val MAE: 0.3340
Epoch 5/30 | Train MAE: 0.3915, Val MAE: 0.3288
Epoch 6/30 | Train MAE: 0.3783, Val MAE: 0.3665
Epoch 7/30 | Train MAE: 0.3776, Val MAE: 0.3508
Epoch 8/30 | Train MAE: 0.3780, Val MAE: 0.3138
Epoch 9/30 | Train MAE: 0.3540, Val MAE: 0.3402
Epoch 10/30 | Train MAE: 0.3499, Val MAE: 0.3560
Epoch 11/30 | Train MAE: 0.3185, Val MAE: 0.3478
Epoch 12/30 | Train MAE: 0.3230, Val MAE: 0.3603
Epoch 13/30 | Train MAE: 0.3167, Val MAE: 0.3279
Early stopping triggered!


In [22]:
test_df['Tc'] = predict_chemberta(test_df, 'Tc', model, tokenizer, y_scaler)

In [23]:
model, tokenizer, y_scaler = train_chemberta(train_df, 'Density', base_model, tokenizer)

Epoch 1/30 | Train MAE: 0.8395, Val MAE: 0.6220
Epoch 2/30 | Train MAE: 0.6100, Val MAE: 0.4561
Epoch 3/30 | Train MAE: 0.4625, Val MAE: 0.3691
Epoch 4/30 | Train MAE: 0.4030, Val MAE: 0.3368
Epoch 5/30 | Train MAE: 0.3796, Val MAE: 0.3537
Epoch 6/30 | Train MAE: 0.3472, Val MAE: 0.2956
Epoch 7/30 | Train MAE: 0.3462, Val MAE: 0.3429
Epoch 8/30 | Train MAE: 0.3271, Val MAE: 0.3066
Epoch 9/30 | Train MAE: 0.3131, Val MAE: 0.3111
Epoch 10/30 | Train MAE: 0.3046, Val MAE: 0.3025
Epoch 11/30 | Train MAE: 0.2939, Val MAE: 0.3168
Early stopping triggered!


In [24]:
test_df['Density'] = predict_chemberta(test_df, 'Density', model, tokenizer, y_scaler)

In [25]:
model, tokenizer, y_scaler = train_chemberta(train_df, 'Rg', base_model, tokenizer)

Epoch 1/30 | Train MAE: 0.8712, Val MAE: 0.7677
Epoch 2/30 | Train MAE: 0.6407, Val MAE: 0.5080
Epoch 3/30 | Train MAE: 0.5103, Val MAE: 0.3847
Epoch 4/30 | Train MAE: 0.4704, Val MAE: 0.3789
Epoch 5/30 | Train MAE: 0.4330, Val MAE: 0.3794
Epoch 6/30 | Train MAE: 0.4247, Val MAE: 0.4003
Epoch 7/30 | Train MAE: 0.3658, Val MAE: 0.3727
Epoch 8/30 | Train MAE: 0.3612, Val MAE: 0.3631
Epoch 9/30 | Train MAE: 0.3624, Val MAE: 0.3807
Epoch 10/30 | Train MAE: 0.3350, Val MAE: 0.3660
Epoch 11/30 | Train MAE: 0.3337, Val MAE: 0.3836
Epoch 12/30 | Train MAE: 0.3304, Val MAE: 0.3732
Epoch 13/30 | Train MAE: 0.3032, Val MAE: 0.3764
Early stopping triggered!


In [26]:
test_df['Rg'] = predict_chemberta(test_df, 'Rg', model, tokenizer, y_scaler)

In [27]:
test_df

Unnamed: 0,id,SMILES,Tg,FFV,Tc,Density,Rg
0,1109053969,*Oc1ccc(C=NN=Cc2ccc(Oc3ccc(C(c4ccc(*)cc4)(C(F)...,111.223137,0.367491,0.225861,1.163119,20.589508
1,1422188626,*Oc1ccc(C(C)(C)c2ccc(Oc3ccc(C(=O)c4cccc(C(=O)c...,231.585541,0.377434,0.223918,1.110826,19.536264
2,2032016830,*c1cccc(OCCCCCCCCOc2cccc(N3C(=O)c4ccc(-c5cccc6...,273.13446,0.346256,0.255565,1.09861,19.572638


In [28]:
# lengths = train_df['SMILES'].apply(lambda x: len(tokenizer.tokenize(x)))
# print(lengths.describe())

count    7973.000000
mean       40.538819
std        25.570449
min         3.000000
25%        21.000000
50%        35.000000
75%        54.000000
max       238.000000
Name: SMILES, dtype: float64
