<a href="https://colab.research.google.com/github/ilyuzaaaaa/RuBertArticles/blob/main/RuBert_cross_validation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/gdrive; the dataset, BERT helper modules and
# weights used by the cells below are read from (and written to) that mount.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Copy the BERT helper modules from Drive into the working directory so the
# import cell can load them as `modeling`, `optimization` and `tokenization`.
# The paths are relative, so this assumes the working directory is the parent
# of the `gdrive` mount point.
!cp "gdrive/My Drive/Transformer/modeling.py" .
!cp "gdrive/My Drive/Transformer/optimization.py" .
!cp "gdrive/My Drive/Transformer/tokenization.py" .

In [None]:
import tokenization 
import torch
import sys 
import time
import numpy as np
import pandas as pd
from IPython.display import clear_output
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from modeling import BertConfig, BertForSequenceClassification
from optimization import BERTAdam
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold 
from sklearn.metrics import accuracy_score, matthews_corrcoef

import matplotlib.pyplot as plt

In [None]:
# Read the CSV from Drive and keep only the label column and the text column.
df = pd.read_csv('./gdrive/My Drive/dataset_5.csv', encoding='utf8')[['topics', 'title']]
print(df.shape)
df.head()

In [None]:
# Encode topic names as integer class ids. `maps` holds the unique topic names,
# so maps[class_id] recovers the original label.
# Factorize once: the same column was previously factorized twice to obtain the
# codes and the uniques separately.
# NOTE: re-running this cell factorizes the already-encoded ids and replaces
# `maps` with integers; reload `df` first if the cell has to be executed again.
topic_codes, maps = pd.factorize(df.topics)
print(maps)
df['topics'] = topic_codes
print(df.head())
df['title'] = df['title'].astype('str')
# Report the longest value (in characters) of every object-dtype column.
for c in df:
    if df[c].dtype == 'object':
        print('Max длина предложения %s: %s\n' % (c, df[c].map(len).max()))

# Train

In [None]:
# Pull the text and label columns out of the frame as array objects. The K-fold
# loop later indexes both with integer index arrays (sentences[train_index]),
# which a plain Python list would not support.
sentences = df['title'].values
labels = df['topics'].values # Author's note: list vs .values holds the same data; .values is just more convenient shape-wise
assert len(sentences) == len(labels)

In [None]:
class InputFeatures:
    """Container for one encoded example: token ids, attention mask and label."""

    def __init__(self, input_ids, input_mask, label):
        # Values are stored exactly as passed in; nothing is copied or validated.
        self.input_ids, self.input_mask, self.label = input_ids, input_mask, label

In [None]:
# Hold out 30% of the data. shuffle=True mixes the rows before splitting and
# random_state pins the generator, so the split is random yet identical on
# every run. train_gt / test_gt are the class ids matching the sentences.
train_sentences, test_sentences, train_gt, test_gt = train_test_split(
    sentences,
    labels,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)
# Both parts must contain the same number of distinct classes.
assert len(set(test_gt)) == len(set(train_gt))
# A set keeps only unique class ids, so its size is the number of classes
# (5 for this dataset, per the author's note).
num_classes = len(set(train_gt))

In [None]:
# 3-fold cross-validation splitter. The shuffle is seeded (same seed as the
# hold-out split) so that the folds are reproducible from run to run; without
# random_state every execution produced different folds.
kf = KFold(n_splits=3, shuffle=True, random_state=42)

In [None]:
def preprocessing(sentences, labels, tokenizer, max_len):
    """Encode sentences into fixed-length BERT inputs.

    Each sentence is tokenized, truncated so that it fits into ``max_len``
    together with the ``[CLS]`` and ``[SEP]`` markers, converted to vocabulary
    ids and right-padded with zeros up to ``max_len``.

    Args:
        sentences: iterable of raw text strings.
        labels: indexable collection; ``labels[i]`` is the class of sentence ``i``.
        tokenizer: object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_len: length of every produced id/mask sequence.

    Returns:
        List of ``InputFeatures`` whose ``label`` is the one-element list
        ``[labels[i]]``.
    """
    # Room left for word pieces once [CLS] and [SEP] are accounted for.
    # (Author's note: BERT accepts sequences of at most 512 tokens, hence the
    # length check below.)
    body_budget = max_len - 2

    encoded = []
    for position, sentence in enumerate(sentences):
        word_pieces = tokenizer.tokenize(sentence)
        if len(word_pieces) > body_budget:
            word_pieces = word_pieces[0:body_budget]

        tokens = ["[CLS]"] + list(word_pieces) + ["[SEP]"]

        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        # 1 marks a real token; padding positions added below get 0.
        input_mask = [1] * len(input_ids)

        # Zero-pad both sequences up to max_len (a non-positive count adds nothing).
        missing = max_len - len(input_ids)
        input_ids.extend([0] * missing)
        input_mask.extend([0] * missing)

        assert len(input_ids) == max_len
        assert len(input_mask) == max_len

        encoded.append(
            InputFeatures(
                input_ids=input_ids,
                input_mask=input_mask,
                label=[labels[position]],
            )
        )

    return encoded

In [None]:
class Dataload(torch.utils.data.Dataset):
    """Dataset wrapper around a list of pre-encoded feature objects."""

    def __init__(self, features):
        # Each element is expected to expose input_ids, input_mask and label.
        self.features = features

    def __len__(self):
        return len(self.features)

    def __getitem__(self, index):
        """Return ``(input_ids, input_mask, label)`` of one item as LongTensors."""
        item = self.features[index]
        return tuple(
            torch.LongTensor(values)
            for values in (item.input_ids, item.input_mask, item.label)
        )

In [None]:
# Build the tokenizer from the vocabulary file stored on Drive.
# NOTE(review): do_lower_case=False presumably keeps the original casing
# (cased vocabulary) — confirm against tokenization.py.
tokenizer = tokenization.FullTokenizer(vocab_file='./gdrive/My Drive/Transformer/vocab.txt', do_lower_case=False)

In [None]:
# K-fold cross-validation. For every fold: encode the data, fine-tune a fresh
# BERT classifier for `num_epoch` epochs and, every `log_every` optimizer steps,
# write a checkpoint and evaluate on the fold's held-out part. Per-fold metrics
# are appended to log<fold>.txt.
for fold, (train_index, test_index) in enumerate(kf.split(sentences, labels)):
    print("TRAIN:", len(train_index), "TEST:", len(test_index), fold)
    train_sentences, test_sentences = sentences[train_index], sentences[test_index]
    train_gt, test_gt = labels[train_index], labels[test_index]

    features = preprocessing(train_sentences, train_gt, tokenizer, 512)
    dataset_train = Dataload(features)
    train_dataloader = torch.utils.data.DataLoader(dataset_train, batch_size=8, shuffle=True,
                                                   num_workers=6, pin_memory=True)
    features = preprocessing(test_sentences, test_gt, tokenizer, 512)
    dataset_test = Dataload(features)
    test_dataloader = torch.utils.data.DataLoader(dataset_test, batch_size=1, shuffle=False,
                                                  num_workers=6, pin_memory=True)

    print(len(train_dataloader))
    print(len(test_dataloader))

    # A new model is created for every fold, starting from the pretrained
    # encoder weights, so folds do not share fine-tuned state.
    device = 'cuda'
    bert_config = BertConfig.from_json_file('./gdrive/My Drive/Transformer/bert_config.json')
    model = BertForSequenceClassification(bert_config, num_classes)
    model.bert.load_state_dict(torch.load('./gdrive/My Drive/Transformer/pytorch_model.bin',
                                          map_location='cpu'))
    model.to(device)
    model = torch.nn.DataParallel(model)

    num_epoch = 5
    log_every = 1000  # checkpoint / evaluation period, in optimizer steps
    # Parameters whose name contains one of these substrings get no weight decay.
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}
        ]
    num_train_steps = len(train_dataloader) * num_epoch
    optimizer = BERTAdam(optimizer_parameters,
                         lr=5e-5,
                         warmup=0.1,
                         t_total=num_train_steps)

    # Create (or truncate) this fold's log file.
    log_path = 'log' + str(fold) + '.txt'
    with open(log_path, 'w'):
        pass

    train_loss_set = []

    batch_iterator = iter(train_dataloader)

    total_step = 0
    model.train()
    train_loss = 0

    # Step-based loop: num_train_steps is the same step budget that was given
    # to the optimizer schedule (t_total).
    while total_step < num_train_steps:

        total_step += 1
        try:
            batch = next(batch_iterator)
        except StopIteration:
            # Epoch exhausted: restart the training loader. Only StopIteration
            # is handled so that real loader/worker errors are not masked.
            batch_iterator = iter(train_dataloader)
            batch = next(batch_iterator)

        batch = tuple(t.to(device) for t in batch)

        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()

        loss, logits = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels.squeeze(1), token_type_ids=None)

        # .mean() reduces the loss to a scalar (DataParallel may return one
        # value per replica); compute it once and reuse it.
        batch_loss = loss.mean()
        loss_value = batch_loss.item()
        train_loss_set.append(loss_value)

        batch_loss.backward()

        optimizer.step()

        train_loss += loss_value

        # Redraw the running loss curve after every step.
        clear_output(True)
        plt.plot(train_loss_set)
        plt.title("Training loss")
        plt.xlabel("Batch")
        plt.ylabel("Loss")
        plt.show()

        if total_step % log_every == 0:
            # train_loss was accumulated over the last `log_every` steps, so
            # that is the divisor for the window mean.
            mean_loss = train_loss / log_every
            print("Mean loss: {0:.5f}".format(mean_loss))
            with open(log_path, 'a') as f:
                f.write("Mean loss: {0:.5f}\n".format(mean_loss))
            torch.save(model.state_dict(), './gdrive/My Drive/weights'+str(fold)+'_'+str(total_step)+'.pth')
            train_loss = 0

            # Evaluate on the held-out part of this fold.
            model.eval()
            valid_preds, valid_labels = [], []

            for batch in test_dataloader:

                batch = tuple(t.to(device) for t in batch)

                b_input_ids, b_input_mask, b_labels = batch

                with torch.no_grad():
                    logits = model(b_input_ids, attention_mask=b_input_mask, token_type_ids=None)

                logits = logits.detach().cpu().numpy()
                label_ids = b_labels.squeeze(1).to('cpu').numpy()
                batch_preds = np.argmax(logits, axis=1)
                batch_labels = np.hstack(label_ids)

                valid_preds.extend(batch_preds)
                valid_labels.extend(batch_labels)

            # One metric per line so consecutive log entries stay readable.
            with open(log_path, 'a') as f:
                f.write("Accuracy: {0:.2f}%\n".format(
                  accuracy_score(valid_labels, valid_preds) * 100
                ))
                f.write("Matthews: {0:.2f}%\n".format(
                  matthews_corrcoef(valid_labels, valid_preds) * 100
                ))
            model.train()

# Predict

In [None]:
# Rebuild the classifier and load fine-tuned weights for inference.
device = 'cuda'
bert_config = BertConfig.from_json_file('./gdrive/My Drive/Transformer/bert_config.json')
# NOTE(review): the class count is hard-coded to 5 here, whereas the training
# section derives it from the data (num_classes) — keep the two in sync.
model = BertForSequenceClassification(bert_config, 5)
model.to(device)
# Wrapped in DataParallel before loading because the training loop saves the
# state dict of the DataParallel-wrapped model.
model = torch.nn.DataParallel(model)
# Switch to inference mode so that train-only behaviour (e.g. dropout, if the
# model uses it) is disabled; a freshly built module starts in training mode.
model.eval()
# NOTE(review): the training loop writes checkpoints named
# weights<fold>_<step>.pth, which does not match this filename — confirm which
# checkpoint is meant to be loaded.
model.load_state_dict(torch.load('./gdrive/My Drive/weights5000.pth'))

In [None]:
# Recreate the tokenizer so the Predict section can run without the training
# cells. NOTE(review): do_lower_case=False presumably keeps the original casing
# — confirm against tokenization.py.
tokenizer = tokenization.FullTokenizer(vocab_file='./gdrive/My Drive/Transformer/vocab.txt', do_lower_case=False)

In [None]:
def predict(sentence, tokenizer, max_len):
    """Classify one sentence with the notebook-level ``model``.

    The sentence is encoded the same way as the training data: tokenized,
    truncated to fit ``max_len`` with ``[CLS]``/``[SEP]``, converted to ids and
    zero-padded. The forward pass runs in eval mode with gradient tracking
    disabled, and the model's previous train/eval mode is restored afterwards.

    Args:
        sentence: raw text to classify.
        tokenizer: object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_len: length of the padded input sequence.

    Returns:
        int: index of the highest-scoring class.
    """
    tokens_a = tokenizer.tokenize(sentence)
    if len(tokens_a) > max_len - 2:
        tokens_a = tokens_a[0:(max_len - 2)]

    tokens = ["[CLS]"] + list(tokens_a) + ["[SEP]"]
    input_ids = list(tokenizer.convert_tokens_to_ids(tokens))
    input_mask = [1] * len(input_ids)

    # Zero-pad ids and mask up to max_len (a non-positive count adds nothing).
    padding = [0] * (max_len - len(input_ids))
    input_ids = torch.LongTensor(input_ids + padding).unsqueeze(0)
    input_mask = torch.LongTensor(input_mask + padding).unsqueeze(0)

    # Run on whatever device the model lives on instead of assuming CUDA.
    target_device = next(model.parameters()).device

    was_training = model.training
    model.eval()  # disable train-only behaviour for a deterministic prediction
    try:
        with torch.no_grad():  # no autograd graph is needed for inference
            logits = model(input_ids.to(target_device),
                           token_type_ids=None,
                           attention_mask=input_mask.to(target_device))
    finally:
        model.train(was_training)

    # argmax over raw logits equals argmax over their softmax, so the softmax
    # step is not needed to pick the class.
    return logits.squeeze(0).argmax().item()

In [None]:
# Classify a sample headline and map the predicted class id back to its topic
# name through `maps`.
maps[predict("Василий Ломаченко боксировал с Теофимо Лопесом с травмой плеча", tokenizer, 512)]

In [None]:
# Show the topic names; the entry at position i is the name of class id i.
maps

In [None]:
# Hard-coded metric values, one entry per evaluation checkpoint.
# NOTE(review): presumably copied by hand from the training log files — confirm
# which run/fold they belong to; they are not recomputed by this notebook.
Accuracy = [88.39, 90.11, 89.93, 91.26, 91.32, 90.96, 90.93, 90.75]
Matthews = [82.75, 85.16, 85.02, 86.91, 86.98, 86.42, 86.42, 86.13]
Mean_loss = [0.24487, 0.16310, 0.10926, 0.09439, 0.05900, 0.04465, 0.02807,  0.01792]

In [None]:
# Give each curve a label: plt.legend() only lists labelled artists, so without
# labels the legend was empty and the two lines could not be told apart.
plt.plot(Accuracy, label='Accuracy')
plt.plot(Matthews, label='Matthews')
plt.legend()

In [None]:
# Plot the recorded mean training loss, one point per evaluation checkpoint.
plt.plot(Mean_loss)