In [1]:
# Install the Hugging Face `transformers` package into the notebook runtime.
# NOTE(review): the install is unpinned. The training cell passes
# `pad_to_max_length` / `return_special_tokens_masks` to the tokenizer and unpacks
# `loss, logits` straight from the model call, which presumably matches an older
# transformers release -- confirm the version this was run with and pin it here.
!pip install transformers
# seqeval is not needed: the metrics are imported from scikit-learn instead.
# !pip install seqeval



In [0]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertForNextSentencePrediction, BertTokenizer, AdamW
import multiprocessing
import numpy as np
import os
import string
# from seqeval.metrics import f1_score, precision_score, recall_score
from sklearn.metrics import f1_score, precision_score, recall_score

In [0]:
class CustomDataset(Dataset):
    """Line-based text corpus that yields sentence / fragment triples.

    Each line of the file is stripped of ASCII punctuation and kept only if it
    still contains enough whitespace-separated words to cut a fragment from.
    Item ``i`` is the tuple ``(line_i, fragment of line_i, fragment of a
    neighbouring line)``; the neighbour is line ``i + 1``, or line ``i - 1``
    for the last item.

    Args:
        path: Text file with one sentence per line.
        max_lines: Only the first ``max_lines`` raw lines are read
            (``None`` reads everything).
        substr_len: Number of consecutive words in each fragment.
    """

    def __init__(self, path, max_lines=15000, substr_len=2):
        self.substr_len = substr_len
        with open(path, 'r', encoding='utf-8') as f:
            raw_lines = f.readlines()[:max_lines]
        # Build the translation table once instead of once per line.
        strip_punctuation = str.maketrans('', '', string.punctuation)
        cleaned = (line.translate(strip_punctuation) for line in raw_lines)
        # A line must hold at least one full fragment (and never fewer than two words).
        min_words = max(2, substr_len)
        self.data = [line for line in cleaned if len(line.split()) >= min_words]
        self.lengths = [len(line.split()) for line in self.data]

    def __len__(self):
        return len(self.data)

    def _random_fragment(self, text, n_words):
        """Return ``substr_len`` consecutive words of ``text`` from a uniformly random start."""
        # Valid starts are 0 .. n_words - substr_len inclusive; randint's upper bound is
        # exclusive, hence the + 1. The filter in __init__ guarantees the bound is >= 1.
        start = np.random.randint(0, n_words - self.substr_len + 1)
        return " ".join(text.split()[start:start + self.substr_len])

    def __getitem__(self, index):
        x = self.data[index]  # raises IndexError for an out-of-range index
        # Negative example comes from the next line, or the previous one at the end of
        # the corpus. With a single-line corpus the "neighbour" is the line itself.
        neighbour = index + 1 if index + 1 < len(self.data) else index - 1
        neg_x = self.data[neighbour]

        pos_fragment = self._random_fragment(x, self.lengths[index])
        neg_fragment = self._random_fragment(neg_x, self.lengths[neighbour])
        return x, pos_fragment, neg_fragment


In [0]:
# The corpus lives on Google Drive, so Drive must be mounted before any file below
# /content/drive is opened. Mounting here (rather than in a cell at the end of the
# notebook) keeps a fresh "Restart & Run All" working. Outside Colab the import is
# unavailable and the mount is skipped; the path is then assumed to be reachable.
try:
    from google.colab import drive as _drive
except ImportError:
    _drive = None
if _drive is not None:
    _drive.mount('/content/drive')

# Directory holding the train_/valid_/test_ text files.
path = '/content/drive/My Drive/DirectedStudy/BERT_NSP/wikitext-2'

In [0]:
# One CustomDataset per corpus split, all read from the directory in `path`.
train_dataset, valid_dataset, test_dataset = [
    CustomDataset(os.path.join(path, split_file))
    for split_file in ("train_.txt", "valid_.txt", "test_.txt")
]

In [0]:
# Use the GPU whenever CUDA is usable, otherwise fall back to the CPU.
cuda = torch.cuda.is_available()
device = torch.device("cuda") if cuda else torch.device("cpu")

In [0]:
# Number of corpus lines per batch; each line later expands into two sentence pairs.
batch_size = 4

# Eight loader workers on a GPU runtime, otherwise one per CPU core.
if cuda:
    num_workers = 8
else:
    num_workers = multiprocessing.cpu_count()

# Training and validation batches are shuffled; the test split keeps file order.
train_dataloader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers
)
valid_dataloader = DataLoader(
    valid_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers
)
test_dataloader = DataLoader(
    test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers
)


In [0]:
# Fine-tuning hyper-parameters.
lr = 3e-5
num_epochs = 25

# Pretrained BERT with its next-sentence-prediction head, and the matching tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
model.to(device)

# AdamW over every model parameter.
optimizer = AdamW(model.parameters(), lr=lr)

In [9]:
# Quick look at the loader size: number of batches, batch size, and the derived
# index that the training loop compares against when deciding to checkpoint.
_summary = (
    "len(trainloader): ", len(train_dataloader),
    "batch_size: ", batch_size,
    "len(train_dataloader)//batch_size - 1: ", len(train_dataloader) // batch_size - 1,
)
print(*_summary)

len(trainloader):  3561 batch_size:  4 len(train_dataloader)//batch_size - 1:  889


In [0]:
def _encode_nsp_batch(batch, device):
    """Turn one DataLoader batch into model inputs and next-sentence labels.

    ``batch`` holds three parallel sequences of strings, as produced by
    ``CustomDataset``: the full sentences, a fragment of each sentence, and a
    fragment of a neighbouring sentence. Every sentence is paired first with its
    own fragment (label 0) and then with the neighbour's fragment (label 1), so the
    encoded batch is twice as long as the incoming one.

    Uses the notebook-level ``tokenizer``.

    Returns:
        ``(input_ids, token_type_ids, attention_mask, labels)``, all on ``device``.
    """
    sentences, own_fragments, other_fragments = batch[0], batch[1], batch[2]
    n = len(sentences)
    # Created directly on the target device; works on CPU as well as CUDA.
    labels = torch.zeros(2 * n, dtype=torch.long, device=device)
    labels[n:] = 1

    pairs = list(zip(sentences, own_fragments))
    pairs.extend(zip(sentences, other_fragments))
    encoded = tokenizer.batch_encode_plus(pairs, add_special_tokens=True, return_tensors='pt',
                                          return_special_tokens_masks=True, max_length=512,
                                          pad_to_max_length=True)
    # Positions flagged in the special-tokens mask are subtracted from the attention mask.
    # NOTE(review): kept exactly as in the original notebook -- confirm this masking is intended.
    attention_mask = (encoded['attention_mask'] - encoded['special_tokens_mask']).to(device)
    input_ids = encoded['input_ids'].to(device)
    token_type_ids = encoded['token_type_ids'].to(device)
    return input_ids, token_type_ids, attention_mask, labels


def _nsp_forward(model, batch, device):
    """Run one forward pass and return ``(loss, predicted_labels, true_labels)``."""
    input_ids, token_type_ids, attention_mask, labels = _encode_nsp_batch(batch, device)
    loss, logits = model(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask,
                         next_sentence_label=labels)
    predicted = torch.max(logits, 1)[1]
    return loss, predicted, labels


def _print_metrics(split, num_correct, num_total, running_loss, num_batches, label_ids, preds_list):
    """Print accuracy, mean per-batch loss, precision, recall and F1 for one split."""
    print('{} Accuracy: {}'.format(split, num_correct / num_total),
          'Average {} Loss: {}'.format(split, running_loss / num_batches))
    print("Precision: " + str(precision_score(label_ids, preds_list)))
    print("Recall: " + str(recall_score(label_ids, preds_list)))
    print("F1: " + str(f1_score(label_ids, preds_list)))


def _evaluate(model, loader, split, device, echo_predictions=False):
    """Score ``model`` on every batch of ``loader`` without gradients and print the metrics.

    ``split`` is the name used in the printed report. With ``echo_predictions`` set,
    each batch's predictions and labels are printed as well.
    """
    model.eval()
    num_correct = 0
    running_loss = 0.
    num_total = 0
    label_ids = []
    preds_list = []
    with torch.no_grad():
        for batch in loader:
            loss, predicted, labels = _nsp_forward(model, batch, device)
            num_total += labels.size(0)
            num_correct += (predicted == labels).sum().item()
            running_loss += loss.item()
            if echo_predictions:
                print("predicted, labels:", predicted.cpu().detach().numpy(), labels.cpu().detach().numpy())
            label_ids.extend(labels.tolist())
            preds_list.extend(predicted.tolist())
    # Average over this loader's own batch count.
    _print_metrics(split, num_correct, num_total, running_loss, len(loader), label_ids, preds_list)


def train(model, train_loader, valid_loader, test_loader, optimizer, num_epochs):
    """Fine-tune ``model`` for next-sentence prediction, evaluating after every epoch.

    Per epoch: one optimisation pass over ``train_loader`` (checkpointing to
    ``model.npy`` / ``optimizer.npy`` every 100 batches and on the last batch),
    a training report, an epoch checkpoint (``model<N>.npy`` / ``optimizer<N>.npy``),
    then an evaluation pass over ``valid_loader`` and ``test_loader``.

    Args:
        model: Model called with ``next_sentence_label`` that returns ``(loss, logits)``.
        train_loader, valid_loader, test_loader: Loaders yielding
            ``(sentences, own_fragments, neighbour_fragments)`` batches of strings.
        optimizer: Optimizer over ``model``'s parameters.
        num_epochs: Number of passes over ``train_loader``.
    """
    # Follow the model's placement instead of assuming CUDA.
    device = next(model.parameters()).device
    last_batch = len(train_loader) - 1

    for epoch in range(num_epochs):
        print("Epoch {}:".format(epoch))
        model.train()
        num_correct = 0
        running_loss = 0.
        num_total = 0
        label_ids = []
        preds_list = []
        for batch_num, batch in enumerate(train_loader):
            optimizer.zero_grad()
            loss, predicted, labels = _nsp_forward(model, batch, device)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            num_total += labels.size(0)
            num_correct += (predicted == labels).sum().item()
            label_ids.extend(labels.tolist())
            preds_list.extend(predicted.tolist())

            # Periodic progress line plus a rolling checkpoint.
            if batch_num % 100 == 0 or batch_num == last_batch:
                print("acc : ", (num_correct) / num_total, "batch_num:", batch_num)
                torch.save(model.state_dict(), 'model.npy')
                torch.save(optimizer.state_dict(), 'optimizer.npy')

        _print_metrics('Train', num_correct, num_total, running_loss, len(train_loader),
                       label_ids, preds_list)

        # Keep one checkpoint per epoch, numbered from 1.
        ep_num = epoch + 1
        torch.save(model.state_dict(), 'model' + str(ep_num) + '.npy')
        torch.save(optimizer.state_dict(), 'optimizer' + str(ep_num) + '.npy')

        _evaluate(model, valid_loader, 'Validation', device, echo_predictions=True)
        _evaluate(model, test_loader, 'Test', device)

In [11]:
# Show which torch device was selected earlier in the notebook.
print(device)

cuda


In [0]:
# Run the full fine-tuning schedule; per-epoch metrics are printed as it goes.
train(
    model=model,
    train_loader=train_dataloader,
    valid_loader=valid_dataloader,
    test_loader=test_dataloader,
    optimizer=optimizer,
    num_epochs=num_epochs,
)

# Persist the final weights and optimizer state once training has finished.
torch.save(optimizer.state_dict(), 'optimizer_final.npy') if False else torch.save(model.state_dict(), 'model_final.npy')
torch.save(optimizer.state_dict(), 'optimizer_final.npy')

Epoch 0:
acc :  0.625 batch_num: 0


In [0]:
# Mount Google Drive at /content/drive. The dataset directory used earlier in this
# notebook lives below that mount point, so on a fresh runtime this has to be
# executed before the cells that read the corpus files.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
# Run the `nvidia-smi` shell command in the notebook runtime to inspect the attached GPU.
!nvidia-smi