In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from transformers import AdamW, get_linear_schedule_with_warmup, RobertaModel, RobertaConfig, RobertaTokenizer, RobertaForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from PolymerSmilesTokenization import PolymerSmilesTokenizer
from dataset import Downstream_Dataset, DataAugmentation, LoadPretrainData
import torch
import torch.nn as nn
from torchmetrics import R2Score
from torch.utils.tensorboard import SummaryWriter
from copy import deepcopy
from torch.utils.data import Dataset, DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Module-level result buffers, filled row by row during inference:
#   fingerprint[i] receives the first-position hidden state for batch i
#   pred_output[i] receives the regressor output for batch i
# They start out as NaN instead of uninitialised memory (torch.empty), so any
# row that is never written is obvious in the exported table rather than
# showing up as arbitrary leftover values.
# NOTE(review): 368 rows / 768 columns are hard-coded; presumably the row
# count of the CSV and the encoder hidden size — confirm against the data.
fingerprint = torch.full((368, 768), float("nan"))
pred_output = torch.full((368, 1), float("nan"))

In [3]:
class DownstreamRegression(nn.Module):
    """Regression head stacked on a private copy of the pretrained encoder.

    Construction reads two module-level names that must exist by the time the
    class is instantiated: ``PretrainedModel`` (deep-copied, so the original
    object is left untouched) and ``tokenizer`` (its length is passed to
    ``resize_token_embeddings`` on the copy).

    Args:
        drop_rate: Dropout probability applied in front of the regressor.
    """

    def __init__(self, drop_rate=0.1):
        super().__init__()
        self.PretrainedModel = deepcopy(PretrainedModel)
        self.PretrainedModel.resize_token_embeddings(len(tokenizer))

        hidden_size = self.PretrainedModel.config.hidden_size
        # Dropout -> Linear(hidden, hidden) -> SiLU -> Linear(hidden, 1)
        self.Regressor = nn.Sequential(
            nn.Dropout(drop_rate),
            nn.Linear(hidden_size, hidden_size),
            nn.SiLU(),
            nn.Linear(hidden_size, 1),
        )

    def forward(self, input_ids, attention_mask, step=None):
        """Encode the input and regress a single value from the fingerprint.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.
            step: Optional row index into the module-level ``fingerprint``
                buffer. When given, the fingerprint of this call is stored
                at that row; when omitted (the default) nothing is recorded,
                so the model can be called without the global buffer.

        Returns:
            The regressor output computed from the first-position hidden
            state (last dimension of size 1).
        """
        outputs = self.PretrainedModel(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.last_hidden_state[:, 0, :]  # fingerprint
        if step is not None:
            # Detach before the in-place write: otherwise, with grad enabled,
            # the buffer would be tied into the autograd graph of this call.
            # The row write matches a single-sample batch (as used by this
            # notebook's loader); larger batches would not fit one row.
            fingerprint[step] = logits.detach()
        return self.Regressor(logits)

def test(model, loss_fn, train_dataloader, device):
    """Run one gradient-free inference pass and record every prediction.

    Args:
        model: Module called as ``model(input_ids, attention_mask, step)``;
            it is switched to eval mode before the loop.
        loss_fn: Unused; kept in the signature so existing callers still work.
        train_dataloader: Iterable of batches indexable by ``"input_ids"``
            and ``"attention_mask"``.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        None. Results are written into the module-level ``pred_output``
        buffer, one row per batch, and a 1-based counter is printed per batch.
    """
    model.eval()
    with torch.no_grad():
        for step, batch in enumerate(train_dataloader):
            print(f'Smiles: {step+1}')
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            # The target column ("prop") is not needed for inference, so it
            # is no longer read from the batch or copied to the device.
            outputs = model(input_ids, attention_mask, step).float()
            # Row index is the batch index, which lines up with samples
            # only when each batch holds a single sample.
            pred_output[step] = outputs

In [4]:
# Use CUDA when it is available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Read the property table and snapshot the 'value' column as a plain Python
# list now, before a later cell rescales a column of this frame in place.
train_data = pd.read_csv('data/Eea.csv')
original_output = train_data.loc[:, 'value'].to_list()

In [5]:
# --- Target scaling ---------------------------------------------------------
# Standardise the column at position 1 in place; the scaler is fitted on the
# whole column. NOTE(review): presumably position 1 is the 'value' column —
# confirm against the CSV layout.
scaler = StandardScaler()
raw_targets = train_data.iloc[:, 1].values.reshape(-1, 1)
train_data.iloc[:, 1] = scaler.fit_transform(raw_targets)

# --- Encoder, tokenizer, dataset --------------------------------------------
max_token_len = 411
PretrainedModel = RobertaModel.from_pretrained('ckpt/pretrain.pt')
tokenizer = PolymerSmilesTokenizer.from_pretrained("roberta-base", max_len=max_token_len)
train_dataset = Downstream_Dataset(train_data, tokenizer, max_token_len)

# --- Model ------------------------------------------------------------------
# NOTE(review): nothing visible here loads trained weights into the regressor
# head, so it appears to run with its initial parameters — confirm whether a
# fine-tuned checkpoint should be restored before predicting.
model = DownstreamRegression(drop_rate=0.1).to(device).double()
loss_fn = nn.MSELoss()

# --- Data loader: one sample per batch, original row order ------------------
batch_size = 1
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False, num_workers=8)

# --- Step bookkeeping for the schedule --------------------------------------
num_epochs = 1
steps_per_epoch = train_data.shape[0] // batch_size
training_steps = steps_per_epoch * num_epochs
warmup_steps = int(training_steps * 0.05)

# --- Optimizer and LR schedule ----------------------------------------------
# Separate parameter groups for encoder and regressor. Neither the optimizer
# nor the scheduler is stepped by the cells visible in this notebook.
encoder_group = {
    "params": model.PretrainedModel.parameters(),
    "lr": 0.00005,
    "weight_decay": 0.0,
}
regressor_group = {
    "params": model.Regressor.parameters(),
    "lr": 0.0001,
    "weight_decay": 0.01,
}
optimizer = AdamW([encoder_group, regressor_group], no_deprecation_warning=True)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=training_steps,
)

Some weights of the model checkpoint at ckpt/pretrain.pt were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at ckpt/pretrain.pt and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The tokenizer class you 

In [6]:
# One inference pass over the loader; test() fills the module-level
# fingerprint / pred_output buffers as it goes.
for epoch in range(1):
    test(model, loss_fn, train_dataloader, device)

# Rebind both buffer names to nested Python lists (detached, on CPU).
# After this the names no longer refer to tensors, so the cell is meant to be
# run once per kernel session.
fingerprint, pred_output = (
    buffer.detach().cpu().numpy().tolist()
    for buffer in (fingerprint, pred_output)
)
original_output

Smiles: 1
Smiles: 2
Smiles: 3
Smiles: 4
Smiles: 5
Smiles: 6
Smiles: 7
Smiles: 8
Smiles: 9
Smiles: 10
Smiles: 11
Smiles: 12
Smiles: 13
Smiles: 14
Smiles: 15
Smiles: 16
Smiles: 17
Smiles: 18
Smiles: 19
Smiles: 20
Smiles: 21
Smiles: 22
Smiles: 23
Smiles: 24
Smiles: 25
Smiles: 26
Smiles: 27
Smiles: 28
Smiles: 29
Smiles: 30
Smiles: 31
Smiles: 32
Smiles: 33
Smiles: 34
Smiles: 35
Smiles: 36
Smiles: 37
Smiles: 38
Smiles: 39
Smiles: 40
Smiles: 41
Smiles: 42
Smiles: 43
Smiles: 44
Smiles: 45
Smiles: 46
Smiles: 47
Smiles: 48
Smiles: 49
Smiles: 50
Smiles: 51
Smiles: 52
Smiles: 53
Smiles: 54
Smiles: 55
Smiles: 56
Smiles: 57
Smiles: 58
Smiles: 59
Smiles: 60
Smiles: 61
Smiles: 62
Smiles: 63
Smiles: 64
Smiles: 65
Smiles: 66
Smiles: 67
Smiles: 68
Smiles: 69
Smiles: 70
Smiles: 71
Smiles: 72
Smiles: 73
Smiles: 74
Smiles: 75
Smiles: 76
Smiles: 77
Smiles: 78
Smiles: 79
Smiles: 80
Smiles: 81
Smiles: 82
Smiles: 83
Smiles: 84
Smiles: 85
Smiles: 86
Smiles: 87
Smiles: 88
Smiles: 89
Smiles: 90
Smiles: 91
Smiles: 

[0.4343,
 0.874,
 1.1415,
 1.524,
 0.4489,
 0.9897,
 0.3936,
 0.7546,
 0.5672,
 1.1795,
 0.5413,
 0.573,
 3.0398,
 0.5118,
 2.0372,
 1.0754,
 0.8924,
 2.2723,
 2.4936,
 1.2063,
 0.5062,
 0.7562,
 1.5033,
 2.2574,
 1.1904,
 0.6934,
 0.7672,
 2.356,
 1.8315,
 0.9199,
 3.2451,
 1.4499,
 0.5296,
 2.4594,
 2.6013,
 1.482,
 0.8344,
 1.6106,
 0.4954,
 0.4911,
 2.9681,
 2.4015,
 0.9827,
 2.2143,
 1.5606,
 2.9681,
 3.155,
 0.8499,
 0.7602,
 1.1922,
 3.9365,
 2.4093,
 3.2944,
 3.4574,
 4.0075,
 1.3443,
 2.5249,
 2.752,
 1.7103,
 2.3385,
 3.3503,
 1.8588,
 3.6267,
 2.1082,
 2.8418,
 1.897,
 2.2963,
 3.0264,
 1.6576,
 2.7321,
 4.1807,
 3.2177,
 2.9311,
 3.1315,
 3.7508,
 1.2264,
 3.1094,
 1.8722,
 2.0787,
 3.2587,
 1.7705,
 1.9244,
 2.9754,
 1.4385,
 1.174,
 2.588,
 2.8498,
 3.3428,
 2.358,
 3.3843,
 2.9654,
 1.1194,
 2.0497,
 1.3873,
 1.2199,
 2.3956,
 1.9547,
 1.8331,
 3.0651,
 1.812,
 1.7487,
 2.7075,
 0.9966,
 1.0649,
 2.6743,
 2.5375,
 2.314,
 3.3639,
 3.0211,
 2.874,
 2.8319,
 0.9705,
 0.954

In [7]:
# Assemble the three per-row columns into one table, write it to data.csv
# without the index column, and display the frame as the cell result.
data = dict(
    fingerprint=fingerprint,
    pred_out=pred_output,
    orig_out=original_output,
)
df = pd.DataFrame(data)
df.to_csv('data.csv', index=False)
df

Unnamed: 0,fingerprint,pred_out,orig_out
0,"[-1.2476686239242554, -0.3180617392063141, -0....",[-0.14158181846141815],0.4343
1,"[-1.0623645782470703, -0.5424985289573669, -0....",[-0.11254920065402985],0.8740
2,"[-1.1539753675460815, -0.8528841137886047, -0....",[-0.15427304804325104],1.1415
3,"[-0.881580650806427, -1.350131869316101, -0.28...",[-0.1554633527994156],1.5240
4,"[-0.6920124292373657, -1.819503664970398, -0.4...",[-0.02466488629579544],0.4489
...,...,...,...
363,"[-0.2644903361797333, -1.8372126817703247, -0....",[-0.05417758226394653],1.5387
364,"[1.2273632287979126, -1.4169275760650635, -0.2...",[-0.07640951126813889],1.3385
365,"[-0.33458125591278076, -1.3647533655166626, 0....",[0.05125347524881363],3.8901
366,"[0.4122636914253235, -0.716169536113739, -0.09...",[-0.13112933933734894],2.7568
