In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from transformers import AdamW, get_linear_schedule_with_warmup, RobertaModel, RobertaConfig, RobertaTokenizer, RobertaForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from PolymerSmilesTokenization import PolymerSmilesTokenizer
from dataset import Downstream_Dataset, DataAugmentation, LoadPretrainData
import torch
import torch.nn as nn
from torchmetrics import R2Score
from torch.utils.tensorboard import SummaryWriter
from copy import deepcopy
from torch.utils.data import Dataset, DataLoader

In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_data = pd.read_csv('data/practice.csv')
len(train_data)

19

In [13]:
scaler = StandardScaler()
train_data.iloc[:, 1] = scaler.fit_transform(train_data.iloc[:, 1].values.reshape(-1, 1))
train_data

Unnamed: 0,smiles,value
0,*CC(*)C,0.67382
1,*CC(*)CC,0.787752
2,*CC(*)CCC,0.431143
3,*CC1CCC(*)C1,0.826822
4,*CC(*)CCCC1CCCCC1,0.601899
5,*C(*)C,0.507022
6,*C=C*,-3.138791
7,*C=CCCCC*,-0.402452
8,*C1CCC1*,0.01918
9,*CC1(*)CCCCC1,0.469704


In [14]:
PretrainedModel = RobertaModel.from_pretrained('ckpt/pretrain.pt')
tokenizer = PolymerSmilesTokenizer.from_pretrained("roberta-base", max_len=411)
train_dataset = Downstream_Dataset(train_data, tokenizer, 411)
train_dataset[1]

Some weights of the model checkpoint at ckpt/pretrain.pt were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at ckpt/pretrain.pt and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Downloading: 100%|██████

{'input_ids': tensor([   0, 3226,  347,  347, 1640, 3226,   43,  347,  347,    2,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1, 

In [15]:
class DownstreamRegression(nn.Module):
    def __init__(self, drop_rate=0.1):
        super(DownstreamRegression, self).__init__()
        self.PretrainedModel = deepcopy(PretrainedModel)
        self.PretrainedModel.resize_token_embeddings(len(tokenizer))
        
        self.Regressor = nn.Sequential(
            nn.Dropout(drop_rate),
            nn.Linear(self.PretrainedModel.config.hidden_size, self.PretrainedModel.config.hidden_size),
            nn.SiLU(),
            nn.Linear(self.PretrainedModel.config.hidden_size, 1)
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.PretrainedModel(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.last_hidden_state[:, 0, :] #fingerprint
        print(type(logits))
        output = self.Regressor(logits)
        return output

In [16]:
def train(model, optimizer, scheduler, loss_fn, train_dataloader, device):
    model.eval()

    for step, batch in enumerate(train_dataloader):
        print(f'Smile: {step+1}')
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        prop = batch["prop"].to(device).float()
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask).float()
        print(outputs)
        

    return None

In [17]:
model = DownstreamRegression(drop_rate=0.1).to(device)
model = model.double()
loss_fn = nn.MSELoss()

In [18]:
train_dataloader = DataLoader(train_dataset, 1, shuffle=False, num_workers=8)

In [19]:
"""Parameters for scheduler"""
steps_per_epoch = train_data.shape[0] // 1
training_steps = steps_per_epoch * 1
warmup_steps = int(training_steps * 0.05)

optimizer = AdamW(
                    [
                        {"params": model.PretrainedModel.parameters(), "lr":  0.00005,
                         "weight_decay": 0.0},
                        {"params": model.Regressor.parameters(), "lr": 0.0001,
                         "weight_decay": 0.01},
                    ],
    				no_deprecation_warning=True
                )
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps,
                                                        num_training_steps=training_steps)

In [20]:
for epoch in range(1):
    print("Training epoch: %s/%s" % (epoch+1, 1))
    train(model, optimizer, scheduler, loss_fn, train_dataloader, device)

Training epoch: 1/1
Smile: 1
<class 'torch.Tensor'>
tensor([[0.2513]], device='cuda:0', grad_fn=<ToCopyBackward0>)
Smile: 2
<class 'torch.Tensor'>
tensor([[0.3112]], device='cuda:0', grad_fn=<ToCopyBackward0>)
Smile: 3
<class 'torch.Tensor'>
tensor([[0.3186]], device='cuda:0', grad_fn=<ToCopyBackward0>)
Smile: 4
<class 'torch.Tensor'>
tensor([[0.3677]], device='cuda:0', grad_fn=<ToCopyBackward0>)
Smile: 5
<class 'torch.Tensor'>
tensor([[0.4324]], device='cuda:0', grad_fn=<ToCopyBackward0>)
Smile: 6
<class 'torch.Tensor'>
tensor([[0.1009]], device='cuda:0', grad_fn=<ToCopyBackward0>)
Smile: 7
<class 'torch.Tensor'>
tensor([[0.2996]], device='cuda:0', grad_fn=<ToCopyBackward0>)
Smile: 8
<class 'torch.Tensor'>
tensor([[0.4147]], device='cuda:0', grad_fn=<ToCopyBackward0>)
Smile: 9
<class 'torch.Tensor'>
tensor([[0.3492]], device='cuda:0', grad_fn=<ToCopyBackward0>)
Smile: 10
<class 'torch.Tensor'>
tensor([[0.3859]], device='cuda:0', grad_fn=<ToCopyBackward0>)
Smile: 11
<class 'torch.Tenso