In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from transformers import AdamW, get_linear_schedule_with_warmup, RobertaModel, RobertaConfig, RobertaTokenizer, RobertaForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from PolymerSmilesTokenization import PolymerSmilesTokenizer
from dataset import Downstream_Dataset, DataAugmentation, LoadPretrainData
import torch
import torch.nn as nn
from torchmetrics import R2Score
from torch.utils.tensorboard import SummaryWriter
from copy import deepcopy
from torch.utils.data import Dataset, DataLoader

  from .autonotebook import tqdm as notebook_tqdm
2023-06-16 09:49:08.159200: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-16 09:49:08.246714: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-06-16 09:49:08.264055: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-06-16 09:49:08.535483: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not l

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_data = pd.read_csv('data/Egc.csv')
train_data = train_data[0:10]
len(train_data)

3380

In [3]:
scaler = StandardScaler()
train_data.iloc[:, 1] = scaler.fit_transform(train_data.iloc[:, 1].values.reshape(-1, 1))
train_data

Unnamed: 0,smiles,value
0,[*]C[*],1.517933
1,[*]CC([*])C,1.276332
2,[*]CC([*])CC,1.274669
3,[*]CC([*])CCC,1.413256
4,[*]CC([*])CC(C)C,1.416967
...,...,...
3375,[*]Nc1c([2H])c([2H])c([*])c([2H])c1[2H],-0.741061
3376,[*]CCCCCC[N+](C)(C)CCC[N+]([*])(C)C,-2.756854
3377,[*]CCCCCCCC[N+](C)(C)CCCCCC[N+]([*])(C)C,-2.829091
3378,[*]CCCCCCCCCCCCCCCC[N+](C)(C)CCCCCC[N+]([*])(C)C,-2.781807


In [4]:
PretrainedModel = RobertaModel.from_pretrained('ckpt/pretrain.pt')
tokenizer = PolymerSmilesTokenizer.from_pretrained("roberta-base", max_len=411)
train_dataset = Downstream_Dataset(train_data, tokenizer, 411)
train_dataset[1]

Some weights of the model checkpoint at ckpt/pretrain.pt were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at ckpt/pretrain.pt and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The tokenizer class you 

{'input_ids': tensor([    0, 10975,  3226,   742,   347,   347,  1640, 10975,  3226,   742,
            43,   347,     2,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,   

In [5]:
class DownstreamRegression(nn.Module):
    def __init__(self, drop_rate=0.1):
        super(DownstreamRegression, self).__init__()
        self.PretrainedModel = deepcopy(PretrainedModel)
        self.PretrainedModel.resize_token_embeddings(len(tokenizer))
        
        self.Regressor = nn.Sequential(
            nn.Dropout(drop_rate),
            nn.Linear(self.PretrainedModel.config.hidden_size, self.PretrainedModel.config.hidden_size),
            nn.SiLU(),
            nn.Linear(self.PretrainedModel.config.hidden_size, 1)
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.PretrainedModel(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.last_hidden_state[:, 0, :] #fingerprint
        print(type(logits))
        output = self.Regressor(logits)
        return output

In [6]:
def train(model, optimizer, scheduler, loss_fn, train_dataloader, device):
    model.eval()

    for step, batch in enumerate(train_dataloader):
        print(f'Smile: {step+1}')
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        prop = batch["prop"].to(device).float()
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask).float()
        print(outputs)
        

    return None

In [7]:
model = DownstreamRegression(drop_rate=0.1).to(device)
model = model.double()
loss_fn = nn.MSELoss()

In [8]:
train_dataloader = DataLoader(train_dataset, 1, shuffle=False, num_workers=8)

In [9]:
"""Parameters for scheduler"""
steps_per_epoch = train_data.shape[0] // 1
training_steps = steps_per_epoch * 1
warmup_steps = int(training_steps * 0.05)

optimizer = AdamW(
                    [
                        {"params": model.PretrainedModel.parameters(), "lr":  0.00005,
                         "weight_decay": 0.0},
                        {"params": model.Regressor.parameters(), "lr": 0.0001,
                         "weight_decay": 0.01},
                    ],
    				no_deprecation_warning=True
                )
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps,
                                                        num_training_steps=training_steps)

In [10]:
for epoch in range(1):
    print("Training epoch: %s/%s" % (epoch+1, 1))
    train(model, optimizer, scheduler, loss_fn, train_dataloader, device)

Training epoch: 1/1
Smile: 1
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 2
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 3
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 4
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 5
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 6
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 7
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 8
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 9
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 10
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 11
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 12
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 13
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 14
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 15
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 16
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 17
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 18
<class 'to

<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 145
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 146
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 147
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 148
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 149
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 150
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 151
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 152
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 153
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 154
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 155
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 156
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 157
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 158
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 159
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 160
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 161
<class 'torch.

<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 287
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 288
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 289
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 290
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 291
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 292
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 293
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 294
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 295
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 296
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 297
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 298
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 299
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 300
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 301
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 302
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 303
<class 'torch.

<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 429
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 430
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 431
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 432
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 433
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 434
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 435
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 436
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 437
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 438
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 439
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 440
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 441
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 442
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 443
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 444
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 445
<class 'torch.

<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 571
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 572
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 573
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 574
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 575
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 576
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 577
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 578
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 579
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 580
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 581
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 582
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 583
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 584
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 585
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 586
<class 'torch.Tensor'>
<class 'torch.Tensor'>

Smile: 587
<class 'torch.

KeyboardInterrupt: 