In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from transformers import AdamW, get_linear_schedule_with_warmup, RobertaModel, RobertaConfig, RobertaTokenizer, RobertaForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from PolymerSmilesTokenization import PolymerSmilesTokenizer
from dataset import Downstream_Dataset, DataAugmentation, LoadPretrainData
import torch
import torch.nn as nn
from torchmetrics import R2Score
from torch.utils.tensorboard import SummaryWriter
from copy import deepcopy
from torch.utils.data import Dataset, DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
fingerprint = torch.empty(370, 768)
pred_output = torch.empty(370,1)

In [3]:
class DownstreamRegression(nn.Module):
    def __init__(self, drop_rate=0.1):
        super(DownstreamRegression, self).__init__()
        self.PretrainedModel = deepcopy(PretrainedModel)
        self.PretrainedModel.resize_token_embeddings(len(tokenizer))
        
        self.Regressor = nn.Sequential(
            nn.Dropout(drop_rate),
            nn.Linear(self.PretrainedModel.config.hidden_size, self.PretrainedModel.config.hidden_size),
            nn.SiLU(),
            nn.Linear(self.PretrainedModel.config.hidden_size, 1)
        )

    def forward(self, input_ids, attention_mask,step):
        outputs = self.PretrainedModel(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.last_hidden_state[:, 0, :] #fingerprint
        fingerprint[step] = logits
        output = self.Regressor(logits)
        return output

def test(model, loss_fn, train_dataloader,device):

    model.eval()
    with torch.no_grad():
        for step, batch in enumerate(train_dataloader):
            print(f'Smiles: {step+1}')
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            prop = batch["prop"].to(device).float()
            outputs = model(input_ids, attention_mask,step).float()
            pred_output[step] = outputs

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_data = pd.read_csv('data/Ei.csv')
original_output = train_data['value'].tolist()

In [5]:
scaler = StandardScaler()
train_data.iloc[:, 1] = scaler.fit_transform(train_data.iloc[:, 1].values.reshape(-1, 1))


PretrainedModel = RobertaModel.from_pretrained('ckpt/pretrain.pt')
tokenizer = PolymerSmilesTokenizer.from_pretrained("roberta-base", max_len=411)
train_dataset = Downstream_Dataset(train_data, tokenizer, 411)

model = DownstreamRegression(drop_rate=0.1).to(device)
model = model.double()
loss_fn = nn.MSELoss()

train_dataloader = DataLoader(train_dataset, 1, shuffle=False, num_workers=8)

steps_per_epoch = train_data.shape[0] // 1
training_steps = steps_per_epoch * 1
warmup_steps = int(training_steps * 0.05)

optimizer = AdamW(
                    [
                        {"params": model.PretrainedModel.parameters(), "lr":  0.00005,
                         "weight_decay": 0.0},
                        {"params": model.Regressor.parameters(), "lr": 0.0001,
                         "weight_decay": 0.01},
                    ],
    				no_deprecation_warning=True
                )
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps,
                                                        num_training_steps=training_steps)

Some weights of the model checkpoint at ckpt/pretrain.pt were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at ckpt/pretrain.pt and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The tokenizer class you 

In [6]:
for epoch in range(1):
    test(model, loss_fn, train_dataloader, device)

fingerprint = fingerprint.detach().cpu().numpy().tolist()
pred_output = pred_output.detach().cpu().numpy().tolist()
original_output

Smiles: 1
Smiles: 2
Smiles: 3
Smiles: 4
Smiles: 5
Smiles: 6
Smiles: 7
Smiles: 8
Smiles: 9
Smiles: 10
Smiles: 11
Smiles: 12
Smiles: 13
Smiles: 14
Smiles: 15
Smiles: 16
Smiles: 17
Smiles: 18
Smiles: 19
Smiles: 20
Smiles: 21
Smiles: 22
Smiles: 23
Smiles: 24
Smiles: 25
Smiles: 26
Smiles: 27
Smiles: 28
Smiles: 29
Smiles: 30
Smiles: 31
Smiles: 32
Smiles: 33
Smiles: 34
Smiles: 35
Smiles: 36
Smiles: 37
Smiles: 38
Smiles: 39
Smiles: 40
Smiles: 41
Smiles: 42
Smiles: 43
Smiles: 44
Smiles: 45
Smiles: 46
Smiles: 47
Smiles: 48
Smiles: 49
Smiles: 50
Smiles: 51
Smiles: 52
Smiles: 53
Smiles: 54
Smiles: 55
Smiles: 56
Smiles: 57
Smiles: 58
Smiles: 59
Smiles: 60
Smiles: 61
Smiles: 62
Smiles: 63
Smiles: 64
Smiles: 65
Smiles: 66
Smiles: 67
Smiles: 68
Smiles: 69
Smiles: 70
Smiles: 71
Smiles: 72
Smiles: 73
Smiles: 74
Smiles: 75
Smiles: 76
Smiles: 77
Smiles: 78
Smiles: 79
Smiles: 80
Smiles: 81
Smiles: 82
Smiles: 83
Smiles: 84
Smiles: 85
Smiles: 86
Smiles: 87
Smiles: 88
Smiles: 89
Smiles: 90
Smiles: 91
Smiles: 

[6.185,
 7.6332,
 8.1531,
 8.5986,
 9.0178,
 7.5826,
 7.0344,
 7.9431,
 6.904,
 6.1428,
 6.9158,
 7.1012,
 5.6689,
 5.7733,
 6.1739,
 7.8277,
 7.7636,
 7.1539,
 7.6583,
 8.2018,
 7.458,
 7.7853,
 6.3961,
 7.179,
 6.8452,
 6.5849,
 7.2525,
 6.1179,
 6.8713,
 5.1674,
 7.8184,
 7.4318,
 6.5915,
 5.8246,
 6.383,
 5.0935,
 5.6867,
 6.8689,
 6.5689,
 7.5966,
 8.3643,
 4.5257,
 6.2671,
 4.9916,
 6.0452,
 3.5577,
 4.5257,
 5.6338,
 6.2134,
 7.4162,
 6.2005,
 5.4248,
 4.7769,
 5.7571,
 6.0778,
 6.3892,
 5.0541,
 6.1007,
 4.7302,
 5.6406,
 5.1637,
 5.6674,
 5.1678,
 5.7103,
 4.9968,
 5.9399,
 5.1012,
 5.1853,
 5.6338,
 5.5837,
 5.9851,
 6.2119,
 5.8629,
 5.6051,
 5.7685,
 6.5598,
 5.9034,
 5.9615,
 6.6287,
 5.1385,
 5.7652,
 5.2494,
 5.2326,
 5.6896,
 5.4992,
 5.797,
 5.8858,
 6.3555,
 6.4325,
 6.8862,
 5.9541,
 6.4155,
 5.2507,
 5.7769,
 5.1816,
 5.8606,
 6.155,
 5.1679,
 5.3092,
 5.8475,
 5.3793,
 5.528,
 5.6651,
 5.5737,
 6.2368,
 5.9697,
 6.578,
 6.6326,
 6.2998,
 5.8109,
 5.6687,
 6.2737,
 

In [7]:
data = {'fingerprint': fingerprint, 'pred_out': pred_output, 'orig_out': original_output }
df = pd.DataFrame(data)
df.to_csv('mywork/result_data/Ei/Ei_pretrain_fingerprint.csv', index=False)
df

Unnamed: 0,fingerprint,pred_out,orig_out
0,"[-1.2476686239242554, -0.3180617392063141, -0....",[-0.01442678365856409],6.1850
1,"[-1.0623645782470703, -0.5424985289573669, -0....",[-0.020341750234365463],7.6332
2,"[-1.1539753675460815, -0.8528841137886047, -0....",[-0.06959985196590424],8.1531
3,"[-0.881580650806427, -1.350131869316101, -0.28...",[0.00043667005957104266],8.5986
4,"[-0.6920124292373657, -1.819503664970398, -0.4...",[-0.004440829623490572],9.0178
...,...,...,...
365,"[-0.2644903361797333, -1.8372126817703247, -0....",[0.027845410630106926],6.1951
366,"[1.2273632287979126, -1.4169275760650635, -0.2...",[0.1757153570652008],5.6948
367,"[-0.33458125591278076, -1.3647533655166626, 0....",[0.01976815052330494],5.8838
368,"[0.4122636914253235, -0.716169536113739, -0.09...",[0.02150258608162403],5.8176
