In [27]:
import pandas as pd
from tqdm.notebook import tqdm
tqdm.pandas()
import numpy as np
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error

import os
import pickle
import gc

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.optim import AdamW

from transformers import RobertaModel, RobertaTokenizer, get_linear_schedule_with_warmup

import warnings
warnings.filterwarnings('ignore')

In [28]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [29]:
train_df.shape

(7973, 7)

In [30]:
train_df.head()

Unnamed: 0,id,SMILES,Tg,FFV,Tc,Density,Rg
0,87817,*CC(*)c1ccccc1C(=O)OCCCCCC,,0.374645,0.205667,,
1,106919,*Nc1ccc([C@H](CCC)c2ccc(C3(c4ccc([C@@H](CCC)c5...,,0.37041,,,
2,388772,*Oc1ccc(S(=O)(=O)c2ccc(Oc3ccc(C4(c5ccc(Oc6ccc(...,,0.37886,,,
3,519416,*Nc1ccc(-c2c(-c3ccc(C)cc3)c(-c3ccc(C)cc3)c(N*)...,,0.387324,,,
4,539187,*Oc1ccc(OC(=O)c2cc(OCCCCCCCCCOCC3CCCN3c3ccc([N...,,0.35547,,,


In [31]:
train_df.isnull().sum()

id            0
SMILES        0
Tg         7462
FFV         943
Tc         7236
Density    7360
Rg         7359
dtype: int64

In [32]:
train_df.describe(include="all")

Unnamed: 0,id,SMILES,Tg,FFV,Tc,Density,Rg
count,7973.0,7973,511.0,7030.0,737.0,613.0,614.0
unique,,7973,,,,,
top,,*CC(*)c1ccccc1C(=O)OCCCCCC,,,,,
freq,,1,,,,,
mean,1080050000.0,,96.452314,0.367212,0.256334,0.985484,16.419787
std,621824100.0,,111.228279,0.029609,0.089538,0.146189,4.60864
min,87817.0,,-148.029738,0.226992,0.0465,0.748691,9.728355
25%,537664100.0,,13.674509,0.349549,0.186,0.890243,12.540328
50%,1079079000.0,,74.040183,0.364264,0.236,0.948193,15.052194
75%,1621708000.0,,161.147595,0.38079,0.3305,1.062096,20.411067


In [33]:
missing = train_df.isnull().sum()[['Tg', 'FFV', 'Tc', 'Density', 'Rg']]
missing

Tg         7462
FFV         943
Tc         7236
Density    7360
Rg         7359
dtype: int64

### Prepare Dataset

In [34]:
class SMILESDataset(Dataset):
    def __init__(self, df, target_col=None):
        self.smiles = df["SMILES"].tolist()
        self.has_target = target_col is not None
        if self.has_target:
            self.targets = df[target_col].values.astype("float32")

    def __len__(self):
        return len(self.smiles)

    def __getitem__(self, idx):
        item = {"smiles": self.smiles[idx]}
        if self.has_target:
            item["target"] = self.targets[idx]
        return item


In [35]:
def chemberta_collate_fn(batch, tokenizer):
    smiles = [item["smiles"] for item in batch]
    encoding = tokenizer(smiles, padding=True, truncation=True, return_tensors="pt")

    result = {
        "input_ids": encoding["input_ids"],
        "attention_mask": encoding["attention_mask"]
    }
    if "target" in batch[0]:
        targets = torch.tensor([item["target"] for item in batch], dtype=torch.float32)
        result["targets"] = targets
    return result

### Writing model class

In [36]:
class chemBERTaModel(nn.Module):
    def __init__(self, base_model, out_dim=1):
        super().__init__()
        self.base_model = base_model
        self.dropout = nn.Dropout(0.2)
        self.regressor = nn.Linear(base_model.config.hidden_size, out_dim)
        
    def forward(self, input_ids, attention_mask):
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        cls_tokens = outputs.last_hidden_state[:,0]
        return self.regressor(self.dropout(cls_tokens)).squeeze(1)

### Training and evaluation

In [44]:
def train(model, dl, loss_fn, opt, sched):
    model.train()
    total_loss = 0
    n = 0
    for batch in dl:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['targets'].to(device)
        preds = model(input_ids, attention_mask)
        loss = loss_fn(preds, targets)
        loss.backward()
        opt.step()
        sched.step()
        opt.zero_grad()
        total_loss += loss.item() * len(targets)
        n += len(targets)
    return total_loss / n

def eval(model, dl, loss_fn):
    model.eval()
    total_loss, n = 0, 0
    all_preds, all_targs = [], []
    with torch.no_grad():
        for batch in dl:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['targets'].to(device)
            preds = model(input_ids, attention_mask)
            loss = loss_fn(preds, targets)
            total_loss += loss.item() * len(targets)
            n += len(targets)
            all_preds.extend(preds.detach().cpu().numpy())
            all_targs.extend(targets.detach().cpu().numpy())
    return total_loss / n, all_preds, all_targs

### Training function

In [45]:
def train_chemberta(df, target, base_model, tokenizer, n_epochs=30, save_dir = 'saved_models_chemberta', patience=5):
    os.makedirs(save_dir, exist_ok=True)
    
    df_clean = df[['SMILES', target]].dropna()
        
    # scale target
    y_scaler = StandardScaler()
    df_clean[target] = y_scaler.fit_transform(df_clean[[target]])
    
    # save scaler
    with open(os.path.join(save_dir, f"{target}_scaler.pkl"), 'wb') as f:
        pickle.dump(y_scaler, f)
        
    train_data, val_data = train_test_split(df_clean, test_size=0.2, random_state=42)
    train_ds = SMILESDataset(train_data, target)
    val_ds = SMILESDataset(val_data, target)
    train_dl = DataLoader(train_ds, batch_size=16, shuffle=True, collate_fn=lambda x: chemberta_collate_fn(x, tokenizer))
    val_dl = DataLoader(val_ds, batch_size=16, shuffle=False, collate_fn=lambda x: chemberta_collate_fn(x, tokenizer))
    
    model = chemBERTaModel(base_model).to(device)
    opt = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
    scheduler = get_linear_schedule_with_warmup(
        opt,
        num_warmup_steps=int(0.1 * len(train_dl) * n_epochs),
        num_training_steps=len(train_dl) * n_epochs
    )
    loss_fn = nn.MSELoss()
    
    best_val_loss = float("inf")
    epoch_no_improve = 0
    
    for epoch in range(n_epochs):
        train_loss = train(model, train_dl, loss_fn, opt, scheduler)
        val_loss, preds, targs = eval(model, val_dl, loss_fn)
        print(f"Epoch {epoch+1}/{n_epochs} | Train MAE: {train_loss:.4f}, Val MAE: {val_loss:.4f}")
        
        if val_loss <= best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), os.path.join(save_dir, f"{target}_model.pt"))
            epoch_no_improve = 0
        else:
            epoch_no_improve += 1
            if epoch_no_improve >= patience:
                print("Early stopping triggered!")
                break
        
    model.load_state_dict(torch.load(os.path.join(save_dir, f"{target}_model.pt")))
    
    # Free up memory
    # del train_dl, val_dl, opt, scheduler
    # torch.cuda.empty_cache()
    # gc.collect()
    
    return model, tokenizer, y_scaler

In [48]:
def predict_chemberta(df, target, model, tokenizer, scaler):
    test_ds = SMILESDataset(df)
    test_loader = DataLoader(test_ds, batch_size=16, shuffle=False, collate_fn=lambda x: chemberta_collate_fn(x, tokenizer))
    
    model.eval()
    preds = []
    
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            outputs = model(input_ids, attention_mask)
            preds.extend(outputs.detach().cpu().numpy())
    preds = scaler.inverse_transform(np.array(preds).reshape(-1, 1)).flatten()
    return preds

In [49]:
MODEL_NAME = "seyonec/ChemBERTa-zinc-base-v1"
device = 'cuda' if torch.cuda.is_available() else 'cpu'
targets = ['Tg', 'FFV', 'Tc', 'Density', 'Rg']

tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)

for target in targets:
    print(f'Working on predicting: {target}')
    
    base_model = RobertaModel.from_pretrained(MODEL_NAME)
    
    model, tokenizer, y_scaler = train_chemberta(
        df=train_df, 
        target=target, 
        base_model=base_model, 
        tokenizer=tokenizer)
    
    test_df[target] = predict_chemberta(
        df=test_df, 
        target=target, 
        model=model, 
        tokenizer=tokenizer, 
        scaler=y_scaler
    )
    
    # Clean up to be safe
    del model, base_model, y_scaler
    gc.collect()
    torch.cuda.empty_cache()

Working on predicting: Tg


Some weights of the model checkpoint at seyonec/ChemBERTa-zinc-base-v1 were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch 1/30 | Train MAE: 1.1250, Val MAE: 0.6074
Epoch 2/30 | Train MAE: 0.7552, Val MAE: 0.5187
Epoch 3/30 | Train MAE: 0.5611, Val MAE: 0.4262
Epoch 4/30 | Train MAE: 0.5084, Val MAE: 0.7141
Epoch 5/30 | Train MAE: 0.3966, Val MAE: 0.4734
Epoch 6/30 | Train MAE: 0.3594, Val MAE: 0.5363
Epoch 7/30 | Train MAE: 0.2882, Val MAE: 0.4919
Epoch 8/30 | Train MAE: 0.2733, Val MAE: 0.6605
Early stopping triggered!
Working on predicting: FFV


Some weights of the model checkpoint at seyonec/ChemBERTa-zinc-base-v1 were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch 1/30 | Train MAE: 0.8305, Val MAE: 0.6322
Epoch 2/30 | Train MAE: 0.4987, Val MAE: 0.5057
Epoch 3/30 | Train MAE: 0.3944, Val MAE: 0.4109
Epoch 4/30 | Train MAE: 0.2914, Val MAE: 0.3281
Epoch 5/30 | Train MAE: 0.2156, Val MAE: 0.3172
Epoch 6/30 | Train MAE: 0.1728, Val MAE: 0.3212
Epoch 7/30 | Train MAE: 0.1390, Val MAE: 0.3040
Epoch 8/30 | Train MAE: 0.1160, Val MAE: 0.2771
Epoch 9/30 | Train MAE: 0.0988, Val MAE: 0.2770
Epoch 10/30 | Train MAE: 0.0859, Val MAE: 0.2625
Epoch 11/30 | Train MAE: 0.0766, Val MAE: 0.2976
Epoch 12/30 | Train MAE: 0.0678, Val MAE: 0.2749
Epoch 13/30 | Train MAE: 0.0614, Val MAE: 0.2734
Epoch 14/30 | Train MAE: 0.0570, Val MAE: 0.2667
Epoch 15/30 | Train MAE: 0.0529, Val MAE: 0.2430
Epoch 16/30 | Train MAE: 0.0509, Val MAE: 0.2588
Epoch 17/30 | Train MAE: 0.0462, Val MAE: 0.2455
Epoch 18/30 | Train MAE: 0.0433, Val MAE: 0.2627
Epoch 19/30 | Train MAE: 0.0421, Val MAE: 0.2423
Epoch 20/30 | Train MAE: 0.0387, Val MAE: 0.2434
Epoch 21/30 | Train MAE: 0.03

Some weights of the model checkpoint at seyonec/ChemBERTa-zinc-base-v1 were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch 1/30 | Train MAE: 1.4107, Val MAE: 0.5250
Epoch 2/30 | Train MAE: 0.5542, Val MAE: 0.3744
Epoch 3/30 | Train MAE: 0.4199, Val MAE: 0.3258
Epoch 4/30 | Train MAE: 0.3220, Val MAE: 0.2806
Epoch 5/30 | Train MAE: 0.3231, Val MAE: 0.3090
Epoch 6/30 | Train MAE: 0.3032, Val MAE: 0.3074
Epoch 7/30 | Train MAE: 0.2595, Val MAE: 0.3032
Epoch 8/30 | Train MAE: 0.2469, Val MAE: 0.2873
Epoch 9/30 | Train MAE: 0.2256, Val MAE: 0.2844
Early stopping triggered!
Working on predicting: Density


Some weights of the model checkpoint at seyonec/ChemBERTa-zinc-base-v1 were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch 1/30 | Train MAE: 1.0536, Val MAE: 0.6110
Epoch 2/30 | Train MAE: 0.7440, Val MAE: 0.5615
Epoch 3/30 | Train MAE: 0.4873, Val MAE: 0.5061
Epoch 4/30 | Train MAE: 0.4030, Val MAE: 0.4311
Epoch 5/30 | Train MAE: 0.2941, Val MAE: 0.4391
Epoch 6/30 | Train MAE: 0.2740, Val MAE: 0.4278
Epoch 7/30 | Train MAE: 0.2082, Val MAE: 0.4592
Epoch 8/30 | Train MAE: 0.2388, Val MAE: 0.4236
Epoch 9/30 | Train MAE: 0.2068, Val MAE: 0.3969
Epoch 10/30 | Train MAE: 0.2280, Val MAE: 0.4500
Epoch 11/30 | Train MAE: 0.1920, Val MAE: 0.4656
Epoch 12/30 | Train MAE: 0.1607, Val MAE: 0.4359
Epoch 13/30 | Train MAE: 0.1849, Val MAE: 0.4206
Epoch 14/30 | Train MAE: 0.1526, Val MAE: 0.4202
Early stopping triggered!
Working on predicting: Rg


Some weights of the model checkpoint at seyonec/ChemBERTa-zinc-base-v1 were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch 1/30 | Train MAE: 0.9601, Val MAE: 0.4952
Epoch 2/30 | Train MAE: 0.5452, Val MAE: 0.3819
Epoch 3/30 | Train MAE: 0.3823, Val MAE: 0.3634
Epoch 4/30 | Train MAE: 0.3216, Val MAE: 0.4213
Epoch 5/30 | Train MAE: 0.3087, Val MAE: 0.3456
Epoch 6/30 | Train MAE: 0.2633, Val MAE: 0.3221
Epoch 7/30 | Train MAE: 0.2258, Val MAE: 0.3475
Epoch 8/30 | Train MAE: 0.2016, Val MAE: 0.3525
Epoch 9/30 | Train MAE: 0.2218, Val MAE: 0.3237
Epoch 10/30 | Train MAE: 0.2101, Val MAE: 0.3274
Epoch 11/30 | Train MAE: 0.1937, Val MAE: 0.3188
Epoch 12/30 | Train MAE: 0.1742, Val MAE: 0.3265
Epoch 13/30 | Train MAE: 0.1771, Val MAE: 0.3133
Epoch 14/30 | Train MAE: 0.1695, Val MAE: 0.3458
Epoch 15/30 | Train MAE: 0.1683, Val MAE: 0.3472
Epoch 16/30 | Train MAE: 0.1575, Val MAE: 0.3346
Epoch 17/30 | Train MAE: 0.1528, Val MAE: 0.3632
Epoch 18/30 | Train MAE: 0.1328, Val MAE: 0.3224
Early stopping triggered!


In [51]:
test_df

Unnamed: 0,id,SMILES,Tg,FFV,Tc,Density,Rg
0,1109053969,*Oc1ccc(C=NN=Cc2ccc(Oc3ccc(C(c4ccc(*)cc4)(C(F)...,117.830582,0.36969,0.244372,1.186184,21.222961
1,1422188626,*Oc1ccc(C(C)(C)c2ccc(Oc3ccc(C(=O)c4cccc(C(=O)c...,155.775955,0.376675,0.297824,1.141359,21.309484
2,2032016830,*c1cccc(OCCCCCCCCOc2cccc(N3C(=O)c4ccc(-c5cccc6...,159.235245,0.343452,0.280604,1.128601,25.005985


In [62]:
train_df['Rg'].dropna().head()

10    28.682441
11    13.534248
37    13.872913
46    12.737463
64    13.435339
Name: Rg, dtype: float64

In [None]:
# lengths = train_df['SMILES'].apply(lambda x: len(tokenizer.tokenize(x)))
# print(lengths.describe())