# Dependencies

In [None]:
# Show the GPU attached to this runtime (device name, driver/CUDA version, memory in use).
!nvidia-smi

Sat Jun  5 03:45:35 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   70C    P8    12W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
!pip3 install transformers



In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from typing import Counter
import numpy as np
from keras.preprocessing import sequence
from nltk.corpus import gutenberg
from string import punctuation
import nltk
from collections import defaultdict
import re
from transformers import AutoModel, AutoTokenizer
from tqdm import tqdm

In [None]:
# The tokenizer and the encoder are loaded from the same pretrained checkpoint.
PHOBERT_CHECKPOINT = "vinai/phobert-base"

# use_fast=False asks for the slow (non-Rust) tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(PHOBERT_CHECKPOINT, use_fast=False)
phobert = AutoModel.from_pretrained(PHOBERT_CHECKPOINT)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at vinai/phobert-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
class MLP_Head(nn.Module):
  """Two-layer perceptron that turns feature vectors into log-probabilities.

  Args:
    input_dim: size of the last dimension of the incoming features.
    hidden_dim: width of the single hidden layer.
    output_dim: number of output classes.
  """

  def __init__(self, input_dim=512, hidden_dim=256, output_dim=4):
    super().__init__()
    # Keep this exact layer order: the parameter names in the state dict
    # (`net.0.*`, `net.2.*`) are derived from the positions in the Sequential.
    layers = [
        nn.Linear(input_dim, hidden_dim),
        nn.ReLU(),
        nn.Linear(hidden_dim, output_dim),
    ]
    self.net = nn.Sequential(*layers)
    self.softmax = nn.LogSoftmax(dim=-1)

  def forward(self, x):
    """Apply the MLP and return log-softmax scores over the last dimension."""
    return self.softmax(self.net(x))
  



In [None]:
class My_Model(nn.Module):
  """Frozen encoder followed by a recurrent layer and a per-position head.

  Args:
    bert: encoder called as ``bert(input_ids, attention_mask=...)``; its result
      must expose ``['last_hidden_state']``.
    rnn: recurrent module returning ``(output, state)``; only ``output`` is used.
    head: module applied to the recurrent output at every position.
    device: device the three sub-modules are moved to.
    pad_token_id: id that marks padding in the input ids. Positions holding it
      are excluded from the encoder's attention. Defaults to 1, the value
      ``MyDataset`` pads its id sequences with.
  """

  def __init__(self, bert, rnn, head, device, pad_token_id=1):
    super(My_Model, self).__init__()

    self.bert = bert.to(device)
    self.rnn = rnn.to(device)
    self.head = head.to(device)
    # Plain attribute (not a parameter/buffer), so the state dict is unchanged.
    self.pad_token_id = pad_token_id

    # The encoder is a fixed feature extractor: its weights are never updated.
    for param in self.bert.parameters():
      param.requires_grad = False

  def train(self, mode=True):
    """Set train/eval mode on rnn and head while keeping the encoder in eval mode.

    The encoder is frozen, so leaving its dropout active during training would
    only add noise to features that are supposed to be fixed.
    """
    super(My_Model, self).train(mode)
    self.bert.eval()
    return self

  def forward(self, x):
    """Return the head's output for every position of ``x``.

    Args:
      x: integer tensor of token ids (assumed ``(batch, seq_len)`` -- this is
        what the callers in this notebook pass).
    """
    # Without an explicit mask the encoder attends to padding as if it were
    # real input, so padded positions leak into every token's representation.
    attention_mask = (x != self.pad_token_id).long()
    features = self.bert(x, attention_mask=attention_mask)

    hidden = features['last_hidden_state']
    feat, _ = self.rnn(hidden)
    return self.head(feat)


In [None]:
class MyDataset(Dataset):
    """Fixed-length token-id / label sequences read from two parallel text files.

    Each line of ``data_path`` is one sample whose tokens are separated by the
    literal marker ``<fff>``; spaces inside a token are replaced by ``_`` before
    the token is looked up with the module-level ``tokenizer``. Each line of
    ``label_path`` holds the whitespace-separated integer labels of the
    corresponding sample.

    Every sample is wrapped in ``<s>`` / ``</s>`` (labels 0 and 3 respectively)
    and then padded to ``length`` positions: ids with 1, labels with 3.
    Padding is appended at the end; truncation of over-long samples follows the
    default of ``sequence.pad_sequences``.

    Args:
        data_path: path of the text file with the tokens.
        label_path: path of the text file with the labels.
        length: number of positions every sample is padded/truncated to.

    Raises:
        ValueError: if the two files do not have the same number of lines.
    """

    def __init__(self, data_path, label_path, length):
        self.data_path = data_path
        self.label_path = label_path
        self.length = length

        # Read both files with an explicit encoding: the platform default is not
        # UTF-8 everywhere and would corrupt or reject non-ASCII text.
        with open(self.data_path, 'r', encoding='utf-8') as f:
            sents = f.read().splitlines()
        with open(self.label_path, 'r', encoding='utf-8') as f:
            labels = f.read().splitlines()

        # __len__ is based on the labels while __getitem__ also indexes the ids,
        # so a mismatch would otherwise surface later as a confusing IndexError
        # or as silently ignored samples.
        if len(sents) != len(labels):
            raise ValueError(
                f"{self.data_path} has {len(sents)} lines but "
                f"{self.label_path} has {len(labels)} lines")

        self.sents = [sent.split('<fff>') for sent in sents]
        self.sents = [[token.replace(" ", "_") for token in line] for line in self.sents]
        self.sents = [['<s>'] + line + ['</s>'] for line in self.sents]

        self.ids = [tokenizer.convert_tokens_to_ids(line) for line in self.sents]
        self.ids = sequence.pad_sequences(
            self.ids, maxlen=self.length, padding="post", value=1)

        self.labels = [list(map(int, label.split())) for label in labels]
        # Labels for the added <s> and </s> tokens.
        self.labels = [[0] + label + [3] for label in self.labels]
        self.labels = sequence.pad_sequences(
            self.labels, maxlen=self.length, padding="post", value=3)

    def __getitem__(self, index):
        """Return ``{'data': ids, 'label': labels}`` for sample ``index``."""
        return {'data': self.ids[index], 'label': self.labels[index]}

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

In [None]:
def dataset_batch_iter(dataset, batch_size, drop_last=False):
    """Yield the samples of ``dataset`` grouped into batches.

    Args:
        dataset: iterable of mappings with the keys ``'data'`` and ``'label'``.
        batch_size: number of samples per batch; must be at least 1.
        drop_last: when True, a trailing batch with fewer than ``batch_size``
            samples is discarded. By default it is yielded, so that every
            sample is seen exactly once.

    Yields:
        dict: ``{'data': ndarray, 'label': ndarray}``, both of integer dtype,
        with the samples stacked along the first axis.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1 (raised when iteration
            starts, because this is a generator).
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    def _stack(words, labels):
        return {'data': np.array(words, dtype=int), 'label': np.array(labels, dtype=int)}

    b_words, b_labels = [], []
    for data in dataset:
        b_words.append(data['data'])
        b_labels.append(data['label'])

        if len(b_words) == batch_size:
            yield _stack(b_words, b_labels)
            b_words, b_labels = [], []

    # Leftover samples when len(dataset) is not a multiple of batch_size.
    if b_words and not drop_last:
        yield _stack(b_words, b_labels)

In [None]:
# Training split: token file and label file are parallel, one sample per line.
train_text_path = '/content/drive/MyDrive/PP/demo_data_hoang/data_text(50k).txt'
train_label_path = '/content/drive/MyDrive/PP/demo_data_hoang/data_label(50k).txt'

# Every sample is padded/truncated to 32 positions.
train_dataset = MyDataset(
    data_path=train_text_path,
    label_path=train_label_path,
    length=32,
)

In [None]:
# Held-out split: token file and label file are parallel, one sample per line.
test_text_path = '/content/drive/MyDrive/PP/demo_data_hoang/test_text(5k).txt'
test_label_path = '/content/drive/MyDrive/PP/demo_data_hoang/test_label(5k).txt'

# Every sample is padded/truncated to 32 positions.
test_dataset = MyDataset(
    data_path=test_text_path,
    label_path=test_label_path,
    length=32,
)

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Two stacked bidirectional RNN layers over the encoder's hidden states.
# NOTE(review): input_size=768 has to equal the width of the encoder's
# last_hidden_state -- confirm if the checkpoint is ever changed.
rnn = nn.RNN(
    input_size=768,
    hidden_size=512,
    num_layers=2,
    batch_first=True,
    dropout=0.2,
    bidirectional=True,
)

# The head consumes both RNN directions (2 * 512 features) and scores 4 classes.
head = MLP_Head(input_dim=512*2, hidden_dim=256, output_dim=4)

mymodel = My_Model(bert=phobert, rnn=rnn, head=head, device=device)

In [None]:
# Adam with its default hyper-parameters, built over every parameter of the model.
# The encoder's parameters are included too, but My_Model.__init__ sets
# requires_grad=False on them, so they never receive gradients.
optimizer = torch.optim.Adam(mymodel.parameters())

In [None]:
# Negative log-likelihood loss. It expects log-probabilities as input, which is
# what MLP_Head produces through its final LogSoftmax.
nll_loss = nn.NLLLoss()

In [None]:
def cal_score(valid_dataset):
    """Evaluate the module-level ``mymodel`` on ``valid_dataset``.

    The model is switched to eval mode (and left in it) and run in batches of
    16 without gradient tracking. Predictions and targets are compared position
    by position; every position of every sample is counted, including the
    positions the dataset filled with its padding label.

    Args:
        valid_dataset: dataset accepted by ``dataset_batch_iter``.

    Returns:
        tuple: ``(accuracy, precision_0, recall_0, precision_1, recall_1,
        precision_2, recall_2, precision_3, recall_3)``. A precision or recall
        whose denominator is zero (class never predicted / never present) is
        ``nan``.
    """
    num_classes = 4

    mymodel.eval()
    print("\nTesting")

    # Rows are true classes, columns are predicted classes.
    cnf_matrix = np.zeros((num_classes, num_classes))

    # Inference only: no autograd graph is needed, which saves memory and time.
    with torch.no_grad():
        for data in tqdm(dataset_batch_iter(valid_dataset, 16), leave=False):
            input_tensor = torch.as_tensor(data['data'], dtype=torch.long, device=device)

            output = mymodel(input_tensor)
            prediction = output.view(-1, num_classes).argmax(dim=-1).cpu().numpy()
            target = np.asarray(data['label'], dtype=int).reshape(-1)

            # Unbuffered add so repeated (target, prediction) pairs all count.
            np.add.at(cnf_matrix, (target, prediction), 1)

    true_positive = np.diagonal(cnf_matrix)
    accuracy = true_positive.sum()/cnf_matrix.sum()
    precision = true_positive/cnf_matrix.sum(axis=0)
    recall = true_positive/cnf_matrix.sum(axis=1)

    return (accuracy,
            precision[0], recall[0],
            precision[1], recall[1],
            precision[2], recall[2],
            precision[3], recall[3])


In [None]:
def load(path='/content/drive/MyDrive/QA_Bert/bertPP_big.pt'):
  """Restore the module-level ``mymodel`` and ``optimizer`` from a checkpoint.

  The checkpoint is expected to be a dict with the keys ``'model_state_dict'``
  and ``'optimizer_state_dict'``, as written by the training loop.

  Args:
    path: location of the checkpoint file.

  Returns:
    bool: True when both states were restored, False otherwise. Failures are
    reported on stdout together with their cause instead of being raised.
  """
  try:
    # Map the tensors onto the device the model currently lives on, so a
    # checkpoint saved on GPU can also be restored on a CPU-only runtime.
    # NOTE: torch.load unpickles the file -- only load checkpoints you trust.
    target_device = next(mymodel.parameters()).device
    checkpoint = torch.load(path, map_location=target_device)

    # Look up both entries first so a malformed checkpoint is rejected before
    # the model has been modified.
    model_state = checkpoint['model_state_dict']
    optimizer_state = checkpoint['optimizer_state_dict']

    mymodel.load_state_dict(model_state)
    optimizer.load_state_dict(optimizer_state)
  except Exception as exc:
    # Best-effort load: keep the notebook running, but say what went wrong.
    print(f"Load fail: {exc!r}")
    return False

  print("Load successfully")
  return True

In [None]:
# Checkpoint that is overwritten at the end of every epoch.
path = '/content/drive/MyDrive/QA_Bert/bertPP_big.pt'

for epoch in range(10):
  # cal_score leaves the model in eval mode, so re-enable training mode here.
  mymodel.train()
  epoch_loss = 0.

  train_batches = dataset_batch_iter(train_dataset, 16)
  for batch, data in tqdm(enumerate(train_batches), leave=False):
    input_tensor = torch.as_tensor(data['data'], dtype=torch.long).to(device)
    target_tensor = torch.as_tensor(data['label'], dtype=torch.long).to(device)

    optimizer.zero_grad()
    output = mymodel(input_tensor)

    # Flatten to (positions, classes) vs. (positions,) for the token-level loss.
    loss = nll_loss(output.view(-1, 4), target_tensor.view(-1))
    loss.backward()
    optimizer.step()

    # Sum (not mean) of the per-batch losses over the epoch.
    epoch_loss += loss.item()

  # Tuple of accuracy followed by per-class precision/recall pairs.
  acc = cal_score(test_dataset)

  print(f"\nEpoch {epoch} with loss {epoch_loss} \nand test acc {acc}")

  checkpoint_state = {
      'model_state_dict': mymodel.state_dict(),
      'optimizer_state_dict':optimizer.state_dict(),
  }
  torch.save(checkpoint_state, path)

3it [00:00, 21.88it/s]

/nTesting





Epoch 0 with loss 429.71626645326614 
and test acc (0.9548590244391025, 0.9566813373770249, 0.9771731328623777, 0.7729058189458299, 0.649148381156974, 0.9920979849861715, 0.9578485599847416, 1.0, 1.0)


3it [00:00, 22.30it/s]

/nTesting





Epoch 1 with loss 382.24288139119744 
and test acc (0.9562800480769231, 0.9538935553680813, 0.982433606588342, 0.807975338106603, 0.6233696486113243, 0.9950524440926183, 0.9589929429715811, 1.0, 1.0)


3it [00:00, 21.94it/s]

/nTesting





Epoch 2 with loss 372.87676361203194 
and test acc (0.9570062099358975, 0.9564436502965774, 0.9807241887348419, 0.7995630284031537, 0.6457725947521866, 0.9944674965421854, 0.9599465954606141, 0.9999718974820144, 1.0)


3it [00:00, 22.11it/s]

/nTesting





Epoch 3 with loss 365.01013465225697 
and test acc (0.9573004306891025, 0.96075716462848, 0.976436477654792, 0.7766652154984762, 0.6843639711523707, 0.9936796365791033, 0.9595651344650009, 0.9999718974820144, 1.0)


3it [00:00, 22.39it/s]

/nTesting





Epoch 4 with loss 359.6585277058184 
and test acc (0.9582832532051282, 0.958039313353811, 0.9809036303879718, 0.8044028103044496, 0.6588154058615927, 0.9946808510638298, 0.9629982834255197, 1.0, 0.9999718966922406)


3it [00:00, 22.09it/s]

/nTesting





Epoch 5 with loss 358.20995550602674 
and test acc (0.9585023537660257, 0.9609713548674456, 0.9780608968304938, 0.7885022960084775, 0.6850544729169863, 0.9944773175542406, 0.9616631699408735, 1.0, 1.0)


3it [00:00, 22.05it/s]

/nTesting





Epoch 6 with loss 355.7786992266774 
and test acc (0.9579952924679487, 0.9608635097493036, 0.9773431302179744, 0.7835486422356973, 0.6840570814792082, 0.9937082186394023, 0.9639519359145527, 1.0, 0.9998875867689627)


3it [00:00, 21.48it/s]

/nTesting





Epoch 7 with loss 355.3194943368435 
and test acc (0.958320813301282, 0.9598848030817954, 0.9789675493936761, 0.7931376080691642, 0.6756943378855301, 0.994277821625888, 0.9610909784474537, 1.0, 1.0)


3it [00:00, 21.98it/s]

/nTesting





Epoch 8 with loss 354.9091358035803 
and test acc (0.9584460136217948, 0.9620042413870079, 0.9767953609610517, 0.7813604621088025, 0.6953352769679301, 0.9966336633663366, 0.9599465954606141, 1.0, 1.0)


3it [00:00, 21.89it/s]

/nTesting





Epoch 9 with loss 356.1103294044733 
and test acc (0.9581956129807693, 0.9590027547190741, 0.9797797589815269, 0.796999359619431, 0.6684057081479208, 0.9954464462482677, 0.9589929429715811, 0.9999718974820144, 1.0)
