In [1]:
from transformers import BertModel, BertTokenizer, RobertaModel, RobertaTokenizer, BertForNextSentencePrediction
from transformers import ElectraTokenizerFast, ElectraModel, AutoTokenizer
import torch
import json

from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm.auto import tqdm
import random
import torch.nn.functional as F



In [2]:
tok = AutoTokenizer.from_pretrained('/home/ubuntu/joonkee/pretraining/tokenizer_base')

In [3]:
with open('pretrain_data/all_in_one.json', 'r') as f:
    data = json.load(f)
data = data['questions']

In [4]:
class BERTLanguageModelingDataset(torch.utils.data.Dataset):
    def __init__(self, data, tokenizer, sep_id: str='[SEP]', cls_id: str='[CLS]',
                mask_id: str='[MASK]', pad_id: str="[PAD]", seq_len: int=256, mask_frac: float=0.15, p: float=0.5):
        """Initiate language modeling dataset.
        Arguments:
            data (list): a tensor of tokens. tokens are ids after
                numericalizing the string tokens.
                torch.tensor([token_id_1, token_id_2, token_id_3, token_id1]).long()
            vocab (sentencepiece.SentencePieceProcessor): Vocabulary object used for dataset.
            p (float): probability for NSP. defaut 0.5
        """
        super(BERTLanguageModelingDataset, self).__init__()
        self.tokenizer = tokenizer
        self.data = data
        self.seq_len = seq_len
        self.sep_id = tokenizer.sep_token_id
        self.cls_id = tokenizer.cls_token_id
        self.mask_id = tokenizer.mask_token_id
        self.pad_id = tokenizer.pad_token_id
        self.p = p
        self.mask_frac = mask_frac
        self.mlm_probability = mask_frac

    def __getitem__(self, i):
        seq1 = self.data[i]
        # seq2 = self.data[i] # 너무 짧으니까 두개를 이어붙이자.

        # print(seq1)
        encoded_dict = tok.encode_plus(
                        seq1,
                        # [seq1, seq2], 
                        #                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = self.seq_len,           # Pad & truncate all sentences.
                        padding='max_length',
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                        truncation=True
                   )
        labels = encoded_dict['input_ids'].clone()
        inputs = encoded_dict['input_ids'].clone()

        special_tokens_mask = [tok.get_special_tokens_mask(val,already_has_special_tokens=True) for val in labels.tolist()]
        special_tokens_mask = torch.tensor(special_tokens_mask,dtype=torch.bool)

        probability_matrix = torch.full(labels.shape, self.mlm_probability)
        probability_matrix.masked_fill_(special_tokens_mask, value=0.0)
        masked_indices = torch.bernoulli(probability_matrix).bool()
        labels[~masked_indices] = -100
        indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
        inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)

        # 10% of the time, we replace masked input tokens with random word
        indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
        random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long)
        inputs[indices_random] = random_words[indices_random]

        mlm_train = inputs
        mlm_target = labels
        attn_masks = encoded_dict['attention_mask']

        return mlm_train.squeeze(0), mlm_target.squeeze(0), attn_masks.squeeze(0)
        # return self.data[i]

    def __len__(self):
        return len(self.data)

    def __iter__(self):
        for x in self.data:
            yield x
dataset = BERTLanguageModelingDataset(data=data,tokenizer=tok)

In [5]:
dataset[0][0].shape, dataset[0][1].shape

(torch.Size([256]), torch.Size([256]))

In [6]:
import torch
import torch.nn as nn

class MLM_NSP(nn.Module):
    def __init__(self, voc_size:int=30000):
        super(MLM_NSP, self).__init__()
        d_model = 1024
        # intermediate_hidden = 3072
        self.linear_mlm1 = nn.Linear(d_model, d_model)
        self.act = nn.GELU()
        self.layer_norm = nn.LayerNorm(d_model,eps=1e-12)
        self.linear_mlm2 = nn.Linear(d_model, voc_size)

    def forward(self, input_seq):
        '''
        param:
            input: a batch of sequences of words
            seg: Segmentation embedding for input tokens
        dim:
            input:
                input: [B, S]
                seg: [B, S]
            output:
                result: [B, S, V]
        '''

        output_mlm = self.linear_mlm1(input_seq) # [B, S, voc_size]
        output_mlm = self.act(output_mlm) # [B, S, voc_size]
        output_mlm = self.layer_norm(output_mlm) # [B, S, voc_size]
        output_mlm = self.linear_mlm2(output_mlm) # [B, S, voc_size]


        # return output_nsp
        return output_mlm

In [7]:
class MyModel(nn.Module):
    def __init__(self, voc_size, pretrained_path):
        super(MyModel, self).__init__()
        d_model = 1024
        # intermediate_hidden = 3072
        self.lm_model = RobertaModel.from_pretrained(pretrained_path)
        self.mlm_nsp_model = MLM_NSP(voc_size)
    def forward(self, mlm_train, attention_mask):
        '''
        param:
            input: a batch of sequences of words
            seg: Segmentation embedding for input tokens
        dim:
            input:
                input: [B, S]
                seg: [B, S]
            output:
                result: [B, S, V]
        '''
        output = self.lm_model(mlm_train, attention_mask=attention_mask).last_hidden_state
        output = self.mlm_nsp_model(output)
        return output

In [8]:
DEVICE = 'cuda:0'
import torch.nn as nn
import torch
import torch.optim as optim

def accuracy(log_pred, y_true):
    y_pred = torch.argmax(log_pred, dim=1).to(y_true.device)
    return (y_pred == y_true).to(torch.float)

def train(model, dataloader, optimizer,valid_loader, total_leng, early_stop_cnt, scheduler, min_loss):
    mlm_epoch_loss = 0
    # min_loss = 100
    cnt = 0 # count length for avg loss
    # early_stop_cnt = 0
    stop = False
    for batch, (mlm_train, mlm_target, attn_masks) in enumerate(tqdm(dataloader)):
        # print(cnt)
        # MLM task
        model.train()
        optimizer.zero_grad()

        output = model(mlm_train.to(DEVICE), attention_mask=attn_masks.to(DEVICE))
        mlm_output = output.reshape(-1, output.shape[-1])
        mlm_loss = criterion(mlm_output.reshape(-1, output.shape[-1]), mlm_target.to(DEVICE).reshape(-1)) # CE
        loss = mlm_loss
        # loss = nsp_loss
        # torch.nn.utils.clip_grad_norm_(_loss.parameters(), 1)
        loss.backward()
        optimizer.step()
        # NSP tasks

        mlm_epoch_loss += mlm_loss.item()
        # mlm_loss = 0
        cnt += 1
        if cnt % 20 == 0:
            with open('log_mlm.txt', 'a') as f:
                f.write(f'train : {cnt} step,  mlm : {mlm_loss.item():.2f}\n')
            print(f'train : {cnt} step,  mlm : {mlm_loss:.2f}\n')
        if cnt % 300 == 0:
            mlm = valid(model, valid_loader, total_leng)
            if mlm<min_loss:
                early_stop_cnt = 0
                min_loss = mlm
                print('min loss:',min_loss)
                model.module.lm_model.save_pretrained('pretrained_lm_mlm')
            else:
                early_stop_cnt += 1
            with open('log_mlm.txt', 'a') as f:
                f.write(f'validation : {cnt} step, early_stop_cnt{early_stop_cnt}, mlm : {mlm:.2f}\n')
            print(f'validation : {cnt} step, {early_stop_cnt}, mlm : {mlm:.2f}\n')
            if early_stop_cnt > 10:
                stop = True
                break
        scheduler.step()
    return mlm_epoch_loss / cnt, stop, early_stop_cnt, min_loss

def valid(model, dataloader, total_leng):
    model.eval()
    mlm_epoch_loss = 0
    with torch.no_grad():
        cnt = 0 # count length for avg loss
        for batch, (mlm_train, mlm_target, attn_masks) in enumerate(tqdm(dataloader)):

            output = model(mlm_train.to(DEVICE), attention_mask=attn_masks.to(DEVICE))
            mlm_output = output.reshape(-1, output.shape[-1])
            mlm_loss = criterion(mlm_output, mlm_target.to(DEVICE).reshape(-1)) # CE
            mlm_epoch_loss += mlm_loss.item()
            cnt += 1
    return mlm_epoch_loss / cnt

In [10]:
model = MyModel(tok.vocab_size,'klue/roberta-large')
model.load_state_dict(torch.load('/home/ubuntu/joonkee/pretraining/pretrained_running/pretrained_running_mlm.pt'))
model = nn.DataParallel(model)
criterion = nn.CrossEntropyLoss()
model.cuda()

Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it f

DataParallel(
  (module): MyModel(
    (lm_model): RobertaModel(
      (embeddings): RobertaEmbeddings(
        (word_embeddings): Embedding(32000, 1024, padding_idx=1)
        (position_embeddings): Embedding(514, 1024, padding_idx=1)
        (token_type_embeddings): Embedding(1, 1024)
        (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): RobertaEncoder(
        (layer): ModuleList(
          (0): RobertaLayer(
            (attention): RobertaAttention(
              (self): RobertaSelfAttention(
                (query): Linear(in_features=1024, out_features=1024, bias=True)
                (key): Linear(in_features=1024, out_features=1024, bias=True)
                (value): Linear(in_features=1024, out_features=1024, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): RobertaSelfOutput(
                (dense): Linear(in_features=102

In [11]:
from torch.utils.data.dataset import random_split
from torch import optim
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
train_dataset, val_dataset = random_split(dataset, [int(len(dataset)*0.95),len(dataset)- int(len(dataset)*0.95)])
dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
valid_loader = DataLoader(val_dataset,batch_size=32, shuffle=False)


In [12]:
optimizer = optim.AdamW(model.parameters(), lr=1e-4)# 3e-5 -> 1e-7
optimizer_scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=1000, T_mult=1, eta_min=1e-6)

In [13]:
import time
N_EPOCHS = 10
early_stop_cnt = 0
min_loss = 1000
for epoch in range(1, N_EPOCHS+1):
    start_time = time.time()
    mlm_loss, stop, early_stop_cnt, min_loss= train(model, dataloader, optimizer, valid_loader,len(dataset)- int(len(dataset)*0.95), early_stop_cnt,optimizer_scheduler, min_loss)
    end_time = time.time()
    if stop == True:
        break
    print('!!!!!',epoch, mlm_loss, end_time-start_time)

  0%|          | 0/6612 [00:00<?, ?it/s]

train : 20 step,  mlm : 0.79

train : 40 step,  mlm : 0.69

train : 60 step,  mlm : 0.97

train : 80 step,  mlm : 1.18

train : 100 step,  mlm : 0.62

train : 120 step,  mlm : 0.96

train : 140 step,  mlm : 0.96

train : 160 step,  mlm : 0.87

train : 180 step,  mlm : 0.76

train : 200 step,  mlm : 1.06

train : 220 step,  mlm : 0.79

train : 240 step,  mlm : 1.06

train : 260 step,  mlm : 0.88

train : 280 step,  mlm : 0.73

train : 300 step,  mlm : 0.87



  0%|          | 0/348 [00:00<?, ?it/s]

min loss: 0.7887976151602022
validation : 300 step, 0, mlm : 0.79

train : 320 step,  mlm : 0.85

train : 340 step,  mlm : 0.78

train : 360 step,  mlm : 0.72

train : 380 step,  mlm : 1.01

train : 400 step,  mlm : 0.90

train : 420 step,  mlm : 0.63

train : 440 step,  mlm : 0.84

train : 460 step,  mlm : 0.79

train : 480 step,  mlm : 0.95

train : 500 step,  mlm : 0.61

train : 520 step,  mlm : 0.89

train : 540 step,  mlm : 0.76

train : 560 step,  mlm : 0.70

train : 580 step,  mlm : 0.59

train : 600 step,  mlm : 0.82



  0%|          | 0/348 [00:00<?, ?it/s]

min loss: 0.7048363343052481
validation : 600 step, 0, mlm : 0.70

train : 620 step,  mlm : 0.85

train : 640 step,  mlm : 0.84

train : 660 step,  mlm : 0.88

train : 680 step,  mlm : 0.60

train : 700 step,  mlm : 0.75

train : 720 step,  mlm : 0.59

train : 740 step,  mlm : 0.72

train : 760 step,  mlm : 0.72

train : 780 step,  mlm : 0.58

train : 800 step,  mlm : 0.83

train : 820 step,  mlm : 1.10

train : 840 step,  mlm : 0.93

train : 860 step,  mlm : 0.84

train : 880 step,  mlm : 0.66

train : 900 step,  mlm : 0.69



  0%|          | 0/348 [00:00<?, ?it/s]

min loss: 0.639425188731188
validation : 900 step, 0, mlm : 0.64

train : 920 step,  mlm : 0.95

train : 940 step,  mlm : 0.47

train : 960 step,  mlm : 0.72

train : 980 step,  mlm : 0.89

train : 1000 step,  mlm : 0.83

train : 1020 step,  mlm : 0.71

train : 1040 step,  mlm : 0.75

train : 1060 step,  mlm : 0.85

train : 1080 step,  mlm : 0.90

train : 1100 step,  mlm : 1.23

train : 1120 step,  mlm : 0.83

train : 1140 step,  mlm : 0.88

train : 1160 step,  mlm : 0.99

train : 1180 step,  mlm : 0.96

train : 1200 step,  mlm : 0.91



  0%|          | 0/348 [00:00<?, ?it/s]

validation : 1200 step, 1, mlm : 0.83

train : 1220 step,  mlm : 0.91

train : 1240 step,  mlm : 1.03

train : 1260 step,  mlm : 0.61

train : 1280 step,  mlm : 0.91

train : 1300 step,  mlm : 0.92

train : 1320 step,  mlm : 0.87

train : 1340 step,  mlm : 0.94

train : 1360 step,  mlm : 1.19

train : 1380 step,  mlm : 0.69

train : 1400 step,  mlm : 0.89

train : 1420 step,  mlm : 0.66

train : 1440 step,  mlm : 0.75

train : 1460 step,  mlm : 0.76

train : 1480 step,  mlm : 0.95

train : 1500 step,  mlm : 0.74



  0%|          | 0/348 [00:00<?, ?it/s]

validation : 1500 step, 2, mlm : 0.74

train : 1520 step,  mlm : 0.91

train : 1540 step,  mlm : 0.71

train : 1560 step,  mlm : 0.86

train : 1580 step,  mlm : 1.08

train : 1600 step,  mlm : 0.78

train : 1620 step,  mlm : 0.80

train : 1640 step,  mlm : 0.96

train : 1660 step,  mlm : 0.75

train : 1680 step,  mlm : 0.71

train : 1700 step,  mlm : 0.68

train : 1720 step,  mlm : 1.04

train : 1740 step,  mlm : 0.89

train : 1760 step,  mlm : 0.71

train : 1780 step,  mlm : 0.79

train : 1800 step,  mlm : 0.80



  0%|          | 0/348 [00:00<?, ?it/s]

validation : 1800 step, 3, mlm : 0.67

train : 1820 step,  mlm : 0.72

train : 1840 step,  mlm : 0.81

train : 1860 step,  mlm : 0.70

train : 1880 step,  mlm : 0.77

train : 1900 step,  mlm : 0.73

train : 1920 step,  mlm : 0.83

train : 1940 step,  mlm : 0.92

train : 1960 step,  mlm : 0.76

train : 1980 step,  mlm : 0.62

train : 2000 step,  mlm : 0.75

train : 2020 step,  mlm : 0.69

train : 2040 step,  mlm : 0.98

train : 2060 step,  mlm : 0.88

train : 2080 step,  mlm : 1.13

train : 2100 step,  mlm : 1.10



  0%|          | 0/348 [00:00<?, ?it/s]

validation : 2100 step, 4, mlm : 0.92

train : 2120 step,  mlm : 1.29



KeyboardInterrupt: 

In [None]:
model.module.lm_model.save_pretrained('pretrained_lm5')
# tok.save_pretrained('pretrained_lm4')

('pretrained_lm4/tokenizer_config.json',
 'pretrained_lm4/special_tokens_map.json',
 'pretrained_lm4/vocab.txt',
 'pretrained_lm4/added_tokens.json',
 'pretrained_lm4/tokenizer.json')

In [16]:
torch.save(model.module.state_dict(), '/home/ubuntu/joonkee/pretraining/pretrained_running/pretrained_running_mlm.pt')

In [None]:
tok.save_pretrained('pretrained_lm')

('pretrained_lm/tokenizer_config.json',
 'pretrained_lm/special_tokens_map.json',
 'pretrained_lm/vocab.txt',
 'pretrained_lm/added_tokens.json',
 'pretrained_lm/tokenizer.json')