In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from transformers import AdamW, get_linear_schedule_with_warmup, RobertaModel, RobertaConfig, RobertaTokenizer, RobertaForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from PolymerSmilesTokenization import PolymerSmilesTokenizer
from dataset import Downstream_Dataset, DataAugmentation, LoadPretrainData
import torch
import torch.nn as nn
from torchmetrics import R2Score
from torch.utils.tensorboard import SummaryWriter
from copy import deepcopy
from torch.utils.data import Dataset, DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Pre-allocated output buffers that are filled one row per sample during inference:
#   * `fingerprint` receives the first-token embedding (written in DownstreamRegression.forward),
#   * `pred_output` receives the regression output (written in test()).
# The sizes are fixed here because this cell runs before the data and model are loaded.
# NOTE(review): NUM_SAMPLES must equal the number of rows in the dataset and
# HIDDEN_SIZE the encoder's hidden size -- confirm both against the data/checkpoint.
NUM_SAMPLES = 370
HIDDEN_SIZE = 768

# NaN-filled rather than torch.empty: a row that is never written then shows up as NaN
# in the exported table instead of as arbitrary uninitialised memory.
fingerprint = torch.full((NUM_SAMPLES, HIDDEN_SIZE), float("nan"), dtype=torch.get_default_dtype())
pred_output = torch.full((NUM_SAMPLES, 1), float("nan"), dtype=torch.get_default_dtype())

In [3]:
class DownstreamRegression(nn.Module):
    """Encoder plus a two-layer regression head applied to the first-token embedding.

    The class relies on names defined elsewhere in the notebook, looked up when
    they are used: ``PretrainedModel`` and ``tokenizer`` at construction time, and
    the ``fingerprint`` buffer whenever ``forward`` is called with a ``step``.

    Args:
        drop_rate: Dropout probability applied to the embedding before the head.
    """

    def __init__(self, drop_rate=0.1):
        super().__init__()
        # Work on a private copy so resizing the embeddings (and later loading
        # weights) does not modify the notebook-level encoder object.
        self.PretrainedModel = deepcopy(PretrainedModel)
        # Make the embedding table match the tokenizer length (tokens may have been added).
        self.PretrainedModel.resize_token_embeddings(len(tokenizer))

        hidden_size = self.PretrainedModel.config.hidden_size
        # Attribute name and layer order are unchanged so saved checkpoints still match.
        self.Regressor = nn.Sequential(
            nn.Dropout(drop_rate),
            nn.Linear(hidden_size, hidden_size),
            nn.SiLU(),
            nn.Linear(hidden_size, 1)
        )

    def forward(self, input_ids, attention_mask, step=None):
        """Return the regression output and optionally record the embedding.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
            step: Row of the notebook-level ``fingerprint`` buffer that receives
                the first-token embedding. ``None`` (default) skips recording.

        Returns:
            Output of the regression head for the first-token embedding.
        """
        outputs = self.PretrainedModel(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.last_hidden_state[:, 0, :]  # fingerprint (first-token embedding)
        if step is not None:
            # Store a detached copy: writing a graph-attached tensor into the buffer
            # would make the buffer itself part of the autograd graph.
            fingerprint[step] = logits.detach()
        output = self.Regressor(logits)
        return output
    
def test(model, loss_fn, train_dataloader,device):

    model.eval()
    with torch.no_grad():
        for step, batch in enumerate(train_dataloader):
            print(f'Smiles: {step+1}')
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            prop = batch["prop"].to(device).float()
            outputs = model(input_ids, attention_mask,step).float()
            pred_output[step] = outputs

In [4]:
# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the property table and keep the 'value' column as a plain Python list.
# It is captured here, before the following cell overwrites column 1 of the
# frame in place with standardised values.
train_data = pd.read_csv('data/Ei.csv')
original_output = train_data['value'].tolist()

In [5]:
# Fine-tuned checkpoint; its 'model' entry is loaded into the network below.
# map_location=device lets a checkpoint that was saved on a GPU be opened on a
# CPU-only machine (device falls back to "cpu" when CUDA is unavailable).
# NOTE: torch.load unpickles the file -- only load checkpoints from a trusted source.
saved_state = torch.load('ckpt/Ei/Ei_best_model.pt', map_location=device)
# Supplementary vocabulary, flattened to a list of tokens for tokenizer.add_tokens.
vocab_sup = pd.read_csv('data/vocab/vocab_sup_PE_I.csv', header=None).values.flatten().tolist()

# Standardise column 1 of train_data in place (zero mean, unit variance).
# NOTE(review): the scaler is re-fitted on this file; presumably it should match
# the scaling used when the checkpoint was trained -- confirm. Model outputs are
# then in standardised units; scaler.inverse_transform maps them back.
scaler = StandardScaler()
train_data.iloc[:, 1] = scaler.fit_transform(train_data.iloc[:, 1].values.reshape(-1, 1))


# Encoder and tokenizer; the tokenizer is extended with the supplementary tokens
# before the dataset and the model (which resizes its embeddings) are built.
PretrainedModel = RobertaModel.from_pretrained('ckpt/pretrain.pt')
tokenizer = PolymerSmilesTokenizer.from_pretrained("roberta-base", max_len=411)
tokenizer.add_tokens(vocab_sup)
train_dataset = Downstream_Dataset(train_data, tokenizer, 411)

# Build the regression model in double precision, then load the fine-tuned weights.
model = DownstreamRegression(drop_rate=0.1).to(device)
model = model.double()
model.load_state_dict(saved_state['model'])
loss_fn = nn.MSELoss()  # only passed through to test(), which does not use it

# Batch size 1 without shuffling: test() indexes its output rows by batch number,
# so batch i has to correspond to sample i.
train_dataloader = DataLoader(train_dataset, 1, shuffle=False, num_workers=8)

Some weights of the model checkpoint at ckpt/pretrain.pt were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at ckpt/pretrain.pt and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The tokenizer class you 

In [6]:
# Allocate fresh output buffers on every run. This cell rebinds `fingerprint` and
# `pred_output` to Python lists at the end, so without re-allocation a second run
# would write tensors into those lists and then fail on `.detach()`.
# One row per sample: the loader yields one sample per batch, in dataset order.
num_samples = len(train_data)
hidden_size = model.PretrainedModel.config.hidden_size
fingerprint = torch.empty(num_samples, hidden_size)
pred_output = torch.empty(num_samples, 1)

# Fills `fingerprint` (inside the model's forward) and `pred_output` (inside test()).
test(model, loss_fn, train_dataloader, device)

# Convert to nested Python lists so they can be used as DataFrame columns.
fingerprint = fingerprint.detach().cpu().numpy().tolist()
pred_output = pred_output.detach().cpu().numpy().tolist()

hello
Smiles: 1
Smiles: 2
Smiles: 3
Smiles: 4
Smiles: 5
Smiles: 6
Smiles: 7
Smiles: 8
Smiles: 9
Smiles: 10
Smiles: 11
Smiles: 12
Smiles: 13
Smiles: 14
Smiles: 15
Smiles: 16
Smiles: 17
Smiles: 18
Smiles: 19
Smiles: 20
Smiles: 21
Smiles: 22
Smiles: 23
Smiles: 24
Smiles: 25
Smiles: 26
Smiles: 27
Smiles: 28
Smiles: 29
Smiles: 30
Smiles: 31
Smiles: 32
Smiles: 33
Smiles: 34
Smiles: 35
Smiles: 36
Smiles: 37
Smiles: 38
Smiles: 39
Smiles: 40
Smiles: 41
Smiles: 42
Smiles: 43
Smiles: 44
Smiles: 45
Smiles: 46
Smiles: 47
Smiles: 48
Smiles: 49
Smiles: 50
Smiles: 51
Smiles: 52
Smiles: 53
Smiles: 54
Smiles: 55
Smiles: 56
Smiles: 57
Smiles: 58
Smiles: 59
Smiles: 60
Smiles: 61
Smiles: 62
Smiles: 63
Smiles: 64
Smiles: 65
Smiles: 66
Smiles: 67
Smiles: 68
Smiles: 69
Smiles: 70
Smiles: 71
Smiles: 72
Smiles: 73
Smiles: 74
Smiles: 75
Smiles: 76
Smiles: 77
Smiles: 78
Smiles: 79
Smiles: 80
Smiles: 81
Smiles: 82
Smiles: 83
Smiles: 84
Smiles: 85
Smiles: 86
Smiles: 87
Smiles: 88
Smiles: 89
Smiles: 90
Smiles: 91
Sm

In [7]:
# Gather embedding, model output and original target per sample into one table,
# write it to CSV (without the index column) and display it.
# NOTE(review): original_output was captured before the target column was
# standardised, so pred_out and orig_out appear to be on different scales --
# confirm before comparing them directly.
data = dict(
    fingerprint=fingerprint,
    pred_out=pred_output,
    orig_out=original_output,
)
df = pd.DataFrame(data)
df.to_csv('mywork/result_data/Ei/Ei_best_fingerprint.csv', index=False)
df

Unnamed: 0,fingerprint,pred_out,orig_out
0,"[-0.6056618690490723, -0.2879466116428375, -0....",[-0.10683786123991013],6.1850
1,"[0.17731203138828278, -1.2010531425476074, 0.2...",[1.6284528970718384],7.6332
2,"[-0.22237494587898254, -0.9703258872032166, 0....",[2.2101566791534424],8.1531
3,"[-0.294385701417923, -1.02885103225708, 0.3449...",[2.6464226245880127],8.5986
4,"[0.018702805042266846, -1.2984548807144165, 1....",[2.8387887477874756],9.0178
...,...,...,...
365,"[-1.2633453607559204, -1.704049825668335, 0.53...",[0.01380742434412241],6.1951
366,"[1.20831298828125, -1.3530679941177368, 0.5141...",[-0.6642560958862305],5.6948
367,"[-1.0325931310653687, -1.3280613422393799, 1.3...",[-0.42180341482162476],5.8838
368,"[-0.26856863498687744, -0.6897868514060974, 0....",[-0.5758365392684937],5.8176


In [8]:
# Read the exported table back from disk and display it as a round-trip check.
# NOTE(review): list-valued columns are expected to come back as text after a
# CSV round trip -- parse them before any numeric use.
df = pd.read_csv(filepath_or_buffer='mywork/result_data/Ei/Ei_best_fingerprint.csv')
df

Unnamed: 0,fingerprint,pred_out,orig_out
0,"[-0.6056618690490723, -0.2879466116428375, -0....",[-0.10683786123991013],6.1850
1,"[0.17731203138828278, -1.2010531425476074, 0.2...",[1.6284528970718384],7.6332
2,"[-0.22237494587898254, -0.9703258872032166, 0....",[2.2101566791534424],8.1531
3,"[-0.294385701417923, -1.02885103225708, 0.3449...",[2.6464226245880127],8.5986
4,"[0.018702805042266846, -1.2984548807144165, 1....",[2.8387887477874756],9.0178
...,...,...,...
365,"[-1.2633453607559204, -1.704049825668335, 0.53...",[0.01380742434412241],6.1951
366,"[1.20831298828125, -1.3530679941177368, 0.5141...",[-0.6642560958862305],5.6948
367,"[-1.0325931310653687, -1.3280613422393799, 1.3...",[-0.42180341482162476],5.8838
368,"[-0.26856863498687744, -0.6897868514060974, 0....",[-0.5758365392684937],5.8176
