In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from transformers import AdamW, get_linear_schedule_with_warmup, RobertaModel, RobertaConfig, RobertaTokenizer, RobertaForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from PolymerSmilesTokenization import PolymerSmilesTokenizer
from dataset import Downstream_Dataset, DataAugmentation, LoadPretrainData
import torch
import torch.nn as nn
from torchmetrics import R2Score
from torch.utils.tensorboard import SummaryWriter
from copy import deepcopy
from torch.utils.data import Dataset, DataLoader


  device: Optional[torch.device] = torch.device("cuda"),


Downstream_Dataset class is called


In [2]:
# Use the GPU when CUDA reports one as available; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Read the practice dataset used by the fine-tuning walkthrough below.
train_data = pd.read_csv('data/practice.csv')
# Cell output: number of rows that were loaded.
len(train_data)


10

In [3]:
# Standardise the column at position 1 with a freshly fitted StandardScaler.
# The scaler expects a 2-D input, hence the reshape to a single-column array.
scaler = StandardScaler()
target_column = train_data.iloc[:, 1].values.reshape(-1, 1)
train_data.iloc[:, 1] = scaler.fit_transform(target_column)
# Cell output: the frame after scaling (note: `train_data` is modified in place).
train_data

Unnamed: 0,smiles,value
0,*C*,0.67187
1,*CC(*)C,0.440891
2,*CC(*)CC,0.439301
3,*CC(*)CCC,0.571796
4,*CC(*)CC(C)C,0.575343
5,*CC1CCC(*)C1,0.71108
6,*CC(*)CCCC1CCCCC1,0.349564
7,*C=CCCC*,-0.710518
8,*C=CCC*,-0.358177
9,*C=C*,-2.691151


In [4]:
# Load the polymer-SMILES tokenizer from the "roberta-base" checkpoint.
tokenizer = PolymerSmilesTokenizer.from_pretrained("roberta-base", max_len=411)

# Show one SMILES string (row label 6) followed by its token sequence.
text = train_data['smiles'][6]
tokens = tokenizer.tokenize(text)
print(text, tokens, sep="\n")

# Wrap the frame in the downstream dataset and display the ids of the same sample.
train_dataset = Downstream_Dataset(train_data, tokenizer, 411)
train_dataset[6]['input_ids']


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RobertaTokenizer'. 
The class this function is called from is 'PolymerSmilesTokenizer'.


*CC(*)CCCC1CCCCC1
['*', 'C', 'C', '(', '*', ')', 'C', 'C', 'C', 'C', '1', 'C', 'C', 'C', 'C', 'C', '1']


tensor([   0, 3226,  347,  347, 1640, 3226,   43,  347,  347,  347,  347,  134,
         347,  347,  347,  347,  347,  134,    2,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,   

In [5]:

# RoBERTa configuration with 6 hidden layers and 12 attention heads.
config = RobertaConfig(
    vocab_size=50265,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
)
# Built directly from the config: no checkpoint weights are loaded in this cell.
PretrainedModel = RobertaModel(config=config)

# Look up the word-embedding vectors for the token ids of sample 6, then show the
# full matrix, its size, and the vector of the first token.
embeddings = PretrainedModel.embeddings.word_embeddings(train_dataset[6]['input_ids'])
print(embeddings, embeddings.size(), embeddings[0], sep="\n")

tensor([[-0.0014,  0.0059,  0.0089,  ..., -0.0203, -0.0186,  0.0288],
        [ 0.0365, -0.0035,  0.0030,  ...,  0.0026, -0.0073, -0.0154],
        [ 0.0061, -0.0477,  0.0195,  ...,  0.0075, -0.0146, -0.0528],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
       grad_fn=<EmbeddingBackward0>)
torch.Size([411, 768])
tensor([-0.0014,  0.0059,  0.0089, -0.0007, -0.0136, -0.0154, -0.0051, -0.0126,
         0.0142, -0.0357, -0.0092, -0.0254, -0.0044, -0.0343, -0.0358, -0.0219,
         0.0267, -0.0130, -0.0353, -0.0004,  0.0254, -0.0411, -0.0052,  0.0124,
         0.0079, -0.0140,  0.0470,  0.0152,  0.0154,  0.0119,  0.0164,  0.0035,
        -0.0114, -0.0068,  0.0060,  0.0094,  0.0240,  0.0187, -0.0291,  0.0057,
         0.0004,  0.0156,  0.0009,  0.0167, -0.0058,  0.0005,  0.0271,  0.0202,
        -0.0118, -0.0069, 

In [6]:
class DownstreamRegression(nn.Module):
    """Regression model: a copy of the notebook-level encoder plus an MLP head.

    The encoder is a deep copy of the global ``PretrainedModel`` whose token
    embeddings are resized to ``len(tokenizer)`` (``tokenizer`` is also read from
    the notebook namespace at construction time). The head maps the hidden state
    of the first token to a single value.

    Args:
        drop_rate: Dropout probability applied before the first linear layer.
    """

    # Executed once, when this class statement runs (not on instantiation).
    print('Class DownstreamRegression is called')

    def __init__(self, drop_rate=0.1):
        super().__init__()
        encoder = deepcopy(PretrainedModel)
        encoder.resize_token_embeddings(len(tokenizer))
        self.PretrainedModel = encoder

        width = encoder.config.hidden_size
        self.Regressor = nn.Sequential(
            nn.Dropout(drop_rate),
            nn.Linear(width, width),
            nn.SiLU(),
            nn.Linear(width, 1),
        )

    def forward(self, input_ids, attention_mask):
        """Encode the batch and regress from the first-token hidden state.

        Prints the size of the encoder output, the selected hidden state
        (the "fingerprint") and its size on every call.

        Returns:
            The output of ``self.Regressor`` applied to the fingerprint.
        """
        encoded = self.PretrainedModel(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = encoded.last_hidden_state
        print(hidden_states.size())
        # Position 0 along the sequence axis is used as the sequence-level fingerprint.
        fingerprint = hidden_states[:, 0, :]
        print('Finger print is:')
        print(fingerprint)
        print(fingerprint.size())
        return self.Regressor(fingerprint)

Class DownstreamRegression is called


In [7]:
def train(model, optimizer, scheduler, loss_fn, train_dataloader, device):
    """Run one pass over ``train_dataloader``, updating ``model`` after every batch.

    Each batch is expected to be a mapping with ``"input_ids"``,
    ``"attention_mask"`` and ``"prop"`` tensors. For every batch the function
    moves the tensors to ``device``, computes ``loss_fn`` between the squeezed
    model output and the squeezed target, back-propagates, and steps both the
    optimizer and the scheduler. Progress (including the raw ids and outputs)
    is printed for every step.

    Returns:
        None.
    """
    print('Train func is called')
    model.train()

    for step, batch in enumerate(train_dataloader):
        print(f'Batch and Step: {step}')
        token_ids = batch["input_ids"].to(device)
        print('Batch', token_ids, sep='\n')
        mask = batch["attention_mask"].to(device)
        target = batch["prop"].to(device).float()

        optimizer.zero_grad()
        prediction = model(token_ids, mask).float()
        print(f'Output for step {step}')
        print(prediction)

        # Squeeze both sides so prediction and target have matching shapes.
        step_loss = loss_fn(prediction.squeeze(), target.squeeze())
        step_loss.backward()
        optimizer.step()
        scheduler.step()

        print('End of one step')
        print('--------------------------------------------------------------------------------')

In [8]:
# Build the regression model, move it to the selected device and cast its
# floating-point parameters to float64 (`.double()` returns the module itself).
model = DownstreamRegression(drop_rate=0.1).to(device).double()
# Mean-squared-error criterion used by the training loop.
loss_fn = nn.MSELoss()

In [9]:
# One sample per batch, reshuffled each epoch, loaded by 8 worker processes.
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=1,
    shuffle=True,
    num_workers=8,
)

In [10]:
"""Parameters for scheduler"""
# NOTE(review): the `// 1` and `1 *` factors presumably stand for the batch size
# and the number of epochs — confirm they match the DataLoader / epoch loop.
steps_per_epoch = len(train_data) // 1
training_steps = 1 * steps_per_epoch
# 5% of all optimisation steps, truncated towards zero.
warmup_steps = int(0.05 * training_steps)

In [11]:
# Two parameter groups: the encoder and the regression head each get their own
# learning rate and weight decay.
# NOTE(review): `no_deprecation_warning=True` silences the deprecation notice of
# the transformers AdamW implementation — consider moving to torch.optim.AdamW
# after confirming the results are unaffected.
optimizer = AdamW(
    [
        {
            "params": model.PretrainedModel.parameters(),
            "lr": 5e-5,
            "weight_decay": 0.0,
        },
        {
            "params": model.Regressor.parameters(),
            "lr": 1e-4,
            "weight_decay": 0.01,
        },
    ],
    no_deprecation_warning=True,
)
# Learning-rate schedule driven by the step counts computed in the previous cell.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=training_steps,
)

In [12]:
# Driver loop: a single epoch over the training dataloader.
for epoch in range(1):
    epoch_banner = "Training epoch: %s/%s" % (epoch+1, 1)
    print(epoch_banner)
    train(
        model=model,
        optimizer=optimizer,
        scheduler=scheduler,
        loss_fn=loss_fn,
        train_dataloader=train_dataloader,
        device=device,
    )
    print('End of one Epoch')
    print('|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||')

Training epoch: 1/1
Train func is called
Batch and Step: 0
Batch
tensor([[   0, 3226,  347,  347, 1640, 3226,   43,  347,  347,  347,    2,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,

Output for step 0
tensor([[-0.0595]], grad_fn=<ToCopyBackward0>)
End of one step
--------------------------------------------------------------------------------
Batch and Step: 1
Batch
tensor([[   0, 3226,  347, 5214,  347, 3226,    2,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
    

End of one step
--------------------------------------------------------------------------------
Batch and Step: 2
Batch
tensor([[   0, 3226,  347,  347,  134,  347,  347,  347, 1640, 3226,   43,  347,
          134,    2,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1, 

End of one step
--------------------------------------------------------------------------------
Batch and Step: 3
Batch
tensor([[   0, 3226,  347,  347, 1640, 3226,   43,  347,  347,  347,  347,  134,
          347,  347,  347,  347,  347,  134,    2,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1, 

End of one step
--------------------------------------------------------------------------------
Batch and Step: 4
Batch
tensor([[   0, 3226,  347,  347, 1640, 3226,   43,  347,  347, 1640,  347,   43,
          347,    2,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1, 

End of one step
--------------------------------------------------------------------------------
Batch and Step: 5
Batch
tensor([[   0, 3226,  347,  347, 1640, 3226,   43,  347,  347,    2,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1, 

End of one step
--------------------------------------------------------------------------------
Batch and Step: 6
Batch
tensor([[   0, 3226,  347, 5214,  347,  347,  347,  347, 3226,    2,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1, 

End of one step
--------------------------------------------------------------------------------
Batch and Step: 7
Batch
tensor([[   0, 3226,  347,  347, 1640, 3226,   43,  347,    2,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1, 

End of one step
--------------------------------------------------------------------------------
Batch and Step: 8
Batch
tensor([[   0, 3226,  347, 5214,  347,  347,  347, 3226,    2,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1, 

End of one step
--------------------------------------------------------------------------------
Batch and Step: 9
Batch
tensor([[   0, 3226,  347, 3226,    2,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1, 

End of one step
--------------------------------------------------------------------------------
End of one Epoch
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||


# Pretraining 

In [13]:
def split(file_path, test_size=0.2, random_state=1):
    """Read a header-less CSV and split its rows into train and validation parts.

    Args:
        file_path: Path of the CSV file; it is read with ``header=None`` so the
            first line is treated as data.
        test_size: Forwarded to ``train_test_split``; the default of 0.2 is the
            value that used to be hard-coded.
        random_state: Seed forwarded to ``train_test_split``; the default of 1 is
            the value that used to be hard-coded.

    Returns:
        ``(train_data, valid_data)`` as produced by ``train_test_split`` on the
        file's underlying array.
    """
    dataset = pd.read_csv(file_path, header=None).values
    train_data, valid_data = train_test_split(
        dataset, test_size=test_size, random_state=random_state
    )
    return train_data, valid_data

# RoBERTa configuration for the masked-language-model pretraining example
# (6 hidden layers, 12 attention heads).
config = RobertaConfig(
    vocab_size=50265,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
)

# NOTE: this rebinds the notebook-level names `tokenizer` and `model` that the
# fine-tuning cells above also use.
tokenizer = PolymerSmilesTokenizer.from_pretrained("roberta-base", max_len=175)
model = RobertaForMaskedLM(config=config)
model = model.to(device)
# Cell output: the hidden size of the configuration.
config.hidden_size

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RobertaTokenizer'. 
The class this function is called from is 'PolymerSmilesTokenizer'.


768

In [14]:
# Split the pretraining file; note that `train_data` is rebound here to the array
# returned by `split` (it was a DataFrame in the fine-tuning cells above).
train_data, valid_data = split('data/P1_practice.csv')

# Wrap both partitions as pretraining datasets with a block size of 175.
data_train = LoadPretrainData(tokenizer=tokenizer, dataset=train_data, blocksize=175)
data_valid = LoadPretrainData(tokenizer=tokenizer, dataset=valid_data, blocksize=175)

# Collator configured for masked-language modelling with probability 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

# Show row 3: the raw string, its tokens, and the ids stored in the dataset.
text = train_data[3][0]
tokens = tokenizer.tokenize(text)
print(text, tokens, sep="\n")
print(data_train[3]['input_ids'])
print(data_train[3]['input_ids'].size())

*c1ccc(C(=O)c2ccc(*)cc2)cc1
['*', 'c', '1', 'c', 'c', 'c', '(', 'C', '(', '=', 'O', ')', 'c', '2', 'c', 'c', 'c', '(', '*', ')', 'c', 'c', '2', ')', 'c', 'c', '1']
tensor([   0, 3226,  438,  134,  438,  438,  438, 1640,  347, 1640, 5214,  673,
          43,  438,  176,  438,  438,  438, 1640, 3226,   43,  438,  438,  176,
          43,  438,  438,  134,    2,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1

In [15]:
# Look up the word-embedding vectors for the token ids of pretraining sample 3.
# `model` was moved to `device` when it was created, so the ids must be on the
# same device before they index the embedding table; otherwise the lookup fails
# with a device-mismatch error on a CUDA machine.
# (Assumes the dataset yields CPU tensors — on a CPU-only run `.to(device)` is a no-op.)
embeddings = model.roberta.embeddings.word_embeddings(
    data_train[3]['input_ids'].to(device)
)
# Show the full matrix, its size, and the vector of the first token.
print(embeddings)
print(embeddings.size())
print(embeddings[0])

tensor([[-0.0218, -0.0088, -0.0123,  ..., -0.0149,  0.0252, -0.0157],
        [ 0.0205, -0.0039, -0.0430,  ...,  0.0280, -0.0170, -0.0392],
        [ 0.0166,  0.0135,  0.0256,  ...,  0.0286,  0.0371, -0.0025],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
       grad_fn=<EmbeddingBackward0>)
torch.Size([175, 768])
tensor([-2.1791e-02, -8.8455e-03, -1.2305e-02,  1.4296e-02, -4.5281e-03,
         9.2819e-03, -1.0018e-02,  2.4248e-03,  2.3112e-02,  1.5983e-02,
         1.0988e-02,  1.2026e-04,  1.7883e-02,  7.0547e-03, -1.5568e-02,
         2.8132e-02, -4.7467e-04, -3.9683e-02,  1.8832e-02,  1.2122e-02,
         2.3052e-02, -2.6296e-02, -8.4875e-03, -5.8106e-03, -2.8457e-03,
         1.3718e-02, -2.0315e-02, -2.0668e-02, -3.9749e-03, -1.0557e-02,
        -1.7580e-02, -7.2370e-03, -1.4162e-02,  8.7720e-03, -2.5846e