**Сделаем Pre-Train модели BERT на задаче Masked Language Modelling**

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import torch
import numpy as np
from transformers import BertTokenizer, BertForMaskedLM, AdamW


In [2]:
train = pd.read_csv('/kaggle/input/sentiment-analysis-company-reviews/train.csv')
test = pd.read_csv('/kaggle/input/sentiment-analysis-company-reviews/test.csv')
submission=pd.read_csv('/kaggle/input/sentiment-analysis-company-reviews/sample_submission.csv')


In [3]:
train.head() # Данные вида review - rating


Unnamed: 0,Id,Review,Rating
0,0,Very good value and a great tv very happy and ...,5
1,1,After 6 month still can't access my account,3
2,2,I couldn't make an official review on a produc...,1
3,3,"Fantastic! Extremely easy to use website, fant...",5
4,4,So far annoyed as hell with this bt monthly pa...,1


In [3]:
y_train = train['Rating']
train = train['Review']
test = test['Review']


In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 
bertmlm = BertForMaskedLM.from_pretrained('bert-base-uncased')  # Возьмем стандартный BERT


In [5]:
inputs = tokenizer(train.tolist(), 
                   return_tensors='pt', 
                   max_length=512, 
                   truncation=True, 
                   padding='max_length')
inputs['labels'] = inputs.input_ids.detach().clone()

rand = torch.rand(inputs.input_ids.shape)

mask_arr = (rand < 0.15) * (inputs.input_ids != 101) * \  # Задаем "распределение вероятностей" 
           (inputs.input_ids != 102) * (inputs.input_ids != 0)  # и маску над ним

In [11]:
tokenizer.decode(103)  # Токен с id == 103 - специальный токен MASK


'[ M A S K ]'

In [6]:
selection = []

for i in tqdm(range(inputs.input_ids.shape[0])):  # Накладываем маску на предложения
    selection.append(
        torch.flatten(mask_arr[i].nonzero()).tolist()
    ) 
    

100%|██████████| 60000/60000 [00:00<00:00, 94498.52it/s] 


In [8]:
for i in range(inputs.input_ids.shape[0]):  # Токены, которым соответствует число < 0.15, заменим
                                            # токеном MASK
    inputs.input_ids[i, selection[i]] = 103
    

In [9]:
class MeditationsDataset(torch.utils.data.Dataset):  # Завернем данные в специальный класс Dataset
    def __init__(self, data):
        self.data = data
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.data.items()}
    def __len__(self):
        return len(self.data.input_ids)
    

In [10]:
dataset = MeditationsDataset(inputs)
loader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)  # Создадим Dataloader


In [None]:
optim = AdamW(bertmlm.parameters(), lr=5e-5)


In [13]:
def trainloop(model, optimizer, dataloader, device, epochs):
        model.to(device)
        for epoch in range(epochs):

            loop = tqdm(dataloader, leave=True)
            for batch in loop:

                optimizer.zero_grad()

                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids=input_ids, 
                                attention_mask=attention_mask,
                                labels=labels)

                loss = outputs.loss

                loss.backward()

                optimizer.step()

                loop.set_description(f'Epoch {epoch}')
                loop.set_postfix(loss=loss.item())
                
        return model

  """
Epoch 0: 100%|██████████| 3750/3750 [59:59<00:00,  1.04it/s, loss=0.0769]
Epoch 1: 100%|██████████| 3750/3750 [59:59<00:00,  1.04it/s, loss=0.0493]
Epoch 2: 100%|██████████| 3750/3750 [1:00:02<00:00,  1.04it/s, loss=0.0156]


In [None]:
trained_model = trainloop(
    model=bertmlm,
    optimizer=optim,
    dataloader=loader,
    device='cuda:0',
    epochs=3
    )

In [14]:
torch.save(trained_model, 'bertpretrained3eps.pth') 
