In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from transformers import AdamW, get_linear_schedule_with_warmup, RobertaModel, RobertaConfig, RobertaTokenizer, RobertaForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from PolymerSmilesTokenization import PolymerSmilesTokenizer
from dataset import Downstream_Dataset, DataAugmentation, LoadPretrainData
import torch
import torch.nn as nn
from torchmetrics import R2Score
from torch.utils.tensorboard import SummaryWriter
from copy import deepcopy
from torch.utils.data import Dataset, DataLoader


  from .autonotebook import tqdm as notebook_tqdm
2023-06-15 16:17:05.066717: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-15 16:17:05.148968: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-06-15 16:17:05.166943: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-06-15 16:17:05.444982: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not l

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_data = pd.read_csv('data/Egc.csv')
len(train_data)

3380

In [3]:
scaler = StandardScaler()
train_data.iloc[:, 1] = scaler.fit_transform(train_data.iloc[:, 1].values.reshape(-1, 1))
train_data

Unnamed: 0,smiles,value
0,[*]C[*],1.517933
1,[*]CC([*])C,1.276332
2,[*]CC([*])CC,1.274669
3,[*]CC([*])CCC,1.413256
4,[*]CC([*])CC(C)C,1.416967
...,...,...
3375,[*]Nc1c([2H])c([2H])c([*])c([2H])c1[2H],-0.741061
3376,[*]CCCCCC[N+](C)(C)CCC[N+]([*])(C)C,-2.756854
3377,[*]CCCCCCCC[N+](C)(C)CCCCCC[N+]([*])(C)C,-2.829091
3378,[*]CCCCCCCCCCCCCCCC[N+](C)(C)CCCCCC[N+]([*])(C)C,-2.781807


In [4]:
tokenizer = PolymerSmilesTokenizer.from_pretrained("roberta-base", max_len=411)
text = train_data['smiles'][6]
print(text)
tokens = tokenizer.tokenize(text)
print(tokens)
train_dataset = Downstream_Dataset(train_data, tokenizer, 411)
train_dataset[6]['input_ids']
PretrainedModel = torch.load('ckpt/Egc.pt/Egc_best_model.pt')


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RobertaTokenizer'. 
The class this function is called from is 'PolymerSmilesTokenizer'.


[*]CC([*])CCCC1CCCCC1
['[', '*', ']', 'C', 'C', '(', '[', '*', ']', ')', 'C', 'C', 'C', 'C', '1', 'C', 'C', 'C', 'C', 'C', '1']


In [5]:
class DownstreamRegression(nn.Module):
    def __init__(self, drop_rate=0.1):
        super(DownstreamRegression, self).__init__()
        self.PretrainedModel = deepcopy(PretrainedModel)
        
        self.Regressor = nn.Sequential(
            nn.Dropout(drop_rate),
            nn.Linear(self.PretrainedModel.config.hidden_size, self.PretrainedModel.config.hidden_size),
            nn.SiLU(),
            nn.Linear(self.PretrainedModel.config.hidden_size, 1)
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.PretrainedModel(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.last_hidden_state[:, 0, :] #fingerprint
        print(logits)
        output = self.Regressor(logits)
        return output

In [6]:
def train(model, optimizer, scheduler, loss_fn, train_dataloader, device):
    model.train()

    for step, batch in enumerate(train_dataloader):
        print(f'Smile: {step+1}')
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        prop = batch["prop"].to(device).float()
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask).float()
        print(outputs)
        loss = loss_fn(outputs.squeeze(), prop.squeeze())
        loss.backward()
        optimizer.step()
        scheduler.step()
        print()

    return None

In [7]:
model = DownstreamRegression(drop_rate=0.1).to(device)
model = model.double()
loss_fn = nn.MSELoss()

AttributeError: 'dict' object has no attribute 'resize_token_embeddings'

In [None]:
train_dataloader = DataLoader(train_dataset, 1, shuffle=True, num_workers=8)

In [None]:
"""Parameters for scheduler"""
steps_per_epoch = train_data.shape[0] // 1
training_steps = steps_per_epoch * 1
warmup_steps = int(training_steps * 0.05)

optimizer = AdamW(
                    [
                        {"params": model.PretrainedModel.parameters(), "lr":  0.00005,
                         "weight_decay": 0.0},
                        {"params": model.Regressor.parameters(), "lr": 0.0001,
                         "weight_decay": 0.01},
                    ],
    				no_deprecation_warning=True
                )
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps,
                                                        num_training_steps=training_steps)

In [None]:
for epoch in range(1):
    print("Training epoch: %s/%s" % (epoch+1, 1))
    train(model, optimizer, scheduler, loss_fn, train_dataloader, device)