In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from transformers import AdamW, get_linear_schedule_with_warmup, RobertaModel, RobertaConfig, RobertaTokenizer, RobertaForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from PolymerSmilesTokenization import PolymerSmilesTokenizer
from dataset import Downstream_Dataset, DataAugmentation, LoadPretrainData
import torch
import torch.nn as nn
from torchmetrics import R2Score
from torch.utils.tensorboard import SummaryWriter
from copy import deepcopy
from torch.utils.data import Dataset, DataLoader

  from .autonotebook import tqdm as notebook_tqdm
2023-06-16 17:03:21.248938: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-16 17:03:21.336419: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-06-16 17:03:21.354092: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-06-16 17:03:21.626306: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not l

In [2]:
fingerprint = torch.empty(2, 768)
pred_output = torch.empty(2,1)

In [3]:
class DownstreamRegression(nn.Module):
    def __init__(self, drop_rate=0.1):
        super(DownstreamRegression, self).__init__()
        self.PretrainedModel = deepcopy(PretrainedModel)
        self.PretrainedModel.resize_token_embeddings(len(tokenizer))
        
        self.Regressor = nn.Sequential(
            nn.Dropout(drop_rate),
            nn.Linear(self.PretrainedModel.config.hidden_size, self.PretrainedModel.config.hidden_size),
            nn.SiLU(),
            nn.Linear(self.PretrainedModel.config.hidden_size, 1)
        )

    def forward(self, input_ids, attention_mask,step):
        outputs = self.PretrainedModel(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.last_hidden_state[:, 0, :] #fingerprint
        print(logits)
        fingerprint[step] = logits
        output = self.Regressor(logits)
        return output

def test(model, loss_fn, train_dataloader,device):

    model.eval()
    with torch.no_grad():
        for step, batch in enumerate(train_dataloader):
            print(f'Smiles: {step+1}')
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            prop = batch["prop"].to(device).float()
            outputs = model(input_ids, attention_mask,step).float()
            print(outputs)
            pred_output[step] = outputs

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_data = pd.read_csv('data/Egc.csv')
train_data = train_data[0:2]
original_output = train_data['value'].tolist()
original_output

[6.8972, 6.5196]

In [5]:
scaler = StandardScaler()
train_data.iloc[:, 1] = scaler.fit_transform(train_data.iloc[:, 1].values.reshape(-1, 1))

PretrainedModel = RobertaModel.from_pretrained('ckpt/pretrain.pt')
tokenizer = PolymerSmilesTokenizer.from_pretrained("roberta-base", max_len=411)
train_dataset = Downstream_Dataset(train_data, tokenizer, 411)

model = DownstreamRegression(drop_rate=0.1).to(device)
model = model.double()
loss_fn = nn.MSELoss()

train_dataloader = DataLoader(train_dataset, 1, shuffle=False, num_workers=8)

steps_per_epoch = train_data.shape[0] // 1
training_steps = steps_per_epoch * 1
warmup_steps = int(training_steps * 0.05)

optimizer = AdamW(
                    [
                        {"params": model.PretrainedModel.parameters(), "lr":  0.00005,
                         "weight_decay": 0.0},
                        {"params": model.Regressor.parameters(), "lr": 0.0001,
                         "weight_decay": 0.01},
                    ],
    				no_deprecation_warning=True
                )
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps,
                                                        num_training_steps=training_steps)

Some weights of the model checkpoint at ckpt/pretrain.pt were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at ckpt/pretrain.pt and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The tokenizer class you 

In [12]:
for epoch in range(1):
    test(model, loss_fn, train_dataloader, device)

fingerprint = fingerprint.detach().cpu().numpy().tolist()
pred_output = pred_output.detach().cpu().numpy().tolist()
print(fingerprint)
print(pred_output)
    

Smiles: 1
tensor([[-4.6697e-01, -8.1470e-01, -9.1897e-01, -8.7242e-02, -6.6621e-01,
         -5.4890e-01, -7.7002e-01,  6.0730e-01, -1.3500e+00,  5.6777e-01,
         -4.8231e-01,  4.2597e-01,  5.4432e-01,  4.8350e-01, -8.9016e-02,
          1.4053e+00,  3.9309e-01,  9.9460e-01,  5.2886e-01, -5.9197e-01,
          1.3898e+00, -4.5902e-02, -2.0183e+00,  6.6389e-01, -4.1917e-01,
         -1.9579e-04,  5.6167e-01,  2.1255e-01,  5.9619e-01, -3.8329e-01,
         -3.6094e-01, -3.5935e-01,  1.5565e+00,  9.9061e-01,  2.1953e-01,
          7.8248e-01,  1.5975e+00, -3.7581e-02, -1.0295e+00, -1.0690e+00,
          3.5503e-02,  4.5003e-01, -1.5753e+00, -3.8135e-01,  1.3160e+00,
          9.6388e-01, -1.7682e+00,  2.7415e-01,  1.8462e+00,  1.8401e+00,
         -9.9130e-01, -1.2174e+00, -1.0951e-01,  1.0074e+00, -1.4264e+00,
          2.9040e-01,  1.5178e+00, -1.1104e+00, -7.6970e-01, -2.7635e-01,
         -9.7517e-01, -2.2777e-01,  4.0092e-01, -5.5175e-01,  1.0737e-01,
         -2.4473e-01,  2.095

tensor([[-8.5491e-01,  7.5635e-02, -1.1678e+00,  1.9576e-02, -1.4808e-01,
          1.1663e-01, -5.0793e-01,  1.9926e-01,  1.6595e-01, -2.2509e-01,
         -3.2040e-03, -1.5773e-01,  3.1585e-01, -3.6816e-01,  5.0217e-01,
          5.3881e-01, -7.0554e-01,  1.5189e+00,  1.0905e+00, -2.2474e-01,
          1.8015e+00, -6.2293e-01, -1.4573e+00, -2.9402e-02,  6.6823e-02,
          6.8088e-02,  1.2779e+00,  8.3722e-02,  4.1433e-01, -6.5794e-01,
         -1.7629e+00,  3.2425e-01,  8.0662e-01,  6.8602e-01, -6.7008e-01,
          6.8771e-01,  1.5388e+00, -9.7112e-01, -1.1376e+00, -8.2072e-01,
         -3.1229e-02, -5.2293e-01, -7.4482e-01, -1.1387e-01,  1.5396e+00,
         -2.6489e-02, -1.7372e+00,  1.0827e+00,  8.0250e-01,  1.4594e+00,
         -1.2276e+00, -9.9465e-01,  6.1098e-01,  1.0827e+00,  3.2236e-01,
          5.5270e-01,  1.1286e+00, -9.4201e-01, -8.3344e-01, -6.0996e-01,
         -9.8769e-01,  1.3391e-01, -2.0504e-01, -6.1132e-01,  2.0911e-02,
         -4.8847e-01,  1.1927e+00,  4.

AttributeError: 'list' object has no attribute 'detach'

In [9]:
data = {'fingerprint': fingerprint, 'pred_out': pred_output, 'orig_out': original_output }
df = pd.DataFrame(data)
df.to_csv('data.csv', index=False)
df

TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.