## Install dependences
- pytorch-lightning: a simple trainer to help you minize code base
- transformers: library contains multiple BERT models
- sentencepiece: a word-to-vect library with fast implementation

In [2]:
!pip install pytorch-lightning
!pip install transformers
!pip install sentencepiece

Collecting pytorch-lightning
  Downloading pytorch_lightning-1.5.3-py3-none-any.whl (523 kB)
[K     |████████████████████████████████| 523 kB 11.0 MB/s 
[?25hCollecting fsspec[http]!=2021.06.0,>=2021.05.0
  Downloading fsspec-2021.11.0-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 54.7 MB/s 
[?25hCollecting pyDeprecate==0.3.1
  Downloading pyDeprecate-0.3.1-py3-none-any.whl (10 kB)
Collecting PyYAML>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 49.8 MB/s 
[?25hCollecting future>=0.17.1
  Downloading future-0.18.2.tar.gz (829 kB)
[K     |████████████████████████████████| 829 kB 50.5 MB/s 
Collecting torchmetrics>=0.4.1
  Downloading torchmetrics-0.6.0-py3-none-any.whl (329 kB)
[K     |████████████████████████████████| 329 kB 45.8 MB/s 
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x

In [3]:
# mount to your drive and access your dataset
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


# LOAD DATA

In [4]:
# replace this path to your dataset directory
DATA_ROOT_DIR="/content/drive/MyDrive/Colab/BERT"
!ls $DATA_ROOT_DIR

 envibert			      RoBERTa-Sentiment_non_sw.ipynb
 preprocess_data.csv		      shopee-sentiment
 preprocess_data_non_sw.csv	     'Tur code RoBERTa-Sentiment.ipynb'
 preprocess_data_non_sw_segment.csv   vietnamese-stopwords-dash.txt
 Preprocess_Shopee_data.ipynb	      vietnamese-stopwords.txt
 RoBERTa-Sentiment.ipynb


In [5]:
# include some dependence
import pandas as pd
import numpy as np
from torch.utils.data import random_split, DataLoader, Dataset
import pytorch_lightning as pl
import torch.nn as nn
import torch

train_ratio = 0.8
DATA_DIR = '/content/drive/MyDrive/Colab/BERT/preprocess_data_non_sw_segment.csv'

In [6]:
# Use pandas to read csv, this will return a excel like table data
train = pd.read_csv(DATA_DIR,index_col=0).dropna()
train.head()

Unnamed: 0,text,class
0,đến quán lần thôi rất là thích quán tuy nằm tr...,1
1,đến quán vào tối chủ_nhật có band hát khá ổn t...,0
2,phục_vụ lâu quá mặc_dù khách rất vắng đợi gần ...,0
3,ko gian bé tí quán chật_chội đông người nên ph...,0
4,khi mình order đặt bánh thì nhận được sự tiếp_...,1


In [7]:
train

Unnamed: 0,text,class
0,đến quán lần thôi rất là thích quán tuy nằm tr...,1
1,đến quán vào tối chủ_nhật có band hát khá ổn t...,0
2,phục_vụ lâu quá mặc_dù khách rất vắng đợi gần ...,0
3,ko gian bé tí quán chật_chội đông người nên ph...,0
4,khi mình order đặt bánh thì nhận được sự tiếp_...,1
...,...,...
26995,không_gian đẹp đồ uống bình thg cheese ngon ta...,0
26996,chỉ có hai từ thất_vọng mới diễn_tả được cảm_g...,0
26997,hôm vào quán có bói bài tarot nên cũng bon_che...,1
26998,va ngô đơ mơ đươ phu vu cô freeze tra xanh va ...,0


In [8]:
from typing import Optional
class SentimentData(Dataset):
    """
    Dataset class for sentiment analysis. 
    Every dataset using pytorch should be overwrite this class
    This require 2 function, __len__ and __getitem__
    """
    def __init__(self, data_dir):
        """
        Args:
            data_dir (string): Directory with the csv file
        """
        self.df = pd.read_csv(data_dir, index_col=0).dropna()
        self.df = self.df.reset_index(drop=True)

    def __len__(self):
        """
        length of the dataset, i.e. number of rows in the csv file
        Returns: int 
        """
        return len(self.df)

    def __getitem__(self, idx):
        """
        given a row index, returns the corresponding row of the csv file
        Returns: text (string), label (int) 
        """
        text = self.df["text"][idx]
        label = self.df["class"][idx]

        return text, label


class SentimentDataModule(pl.LightningDataModule):
    """
    Module class for sentiment analysis. this class is used to load the data to the model. 
    It is a subclass of LightningDataModule. 
    """

    def __init__(self, data_dir: str = DATA_DIR, batch_size: int = 16):
        """
        Args:
            data_dir (string): Directory with the csv file
            batch_size (int): batch size for dataloader
        """
        super().__init__()
        self.data_dir = data_dir
        self.batch_size = batch_size

    def setup(self, stage: Optional[str] = None):
        """
        Loads the data to the model. 
        the data is loaded in the setup function, so that it is loaded only once. 
        """
        data_full = SentimentData(self.data_dir)
        train_size = round(len(data_full) * train_ratio)
        val_size = len(data_full) - train_size
        print(len(data_full), train_size, val_size)
        self.data_train, self.data_val = random_split(data_full, [train_size, val_size])

    def train_dataloader(self):
        """
        Returns: dataloader for training
        """
        return DataLoader(self.data_train, batch_size=self.batch_size)

    def val_dataloader(self):
        """
        Returns: dataloader for validation
        """
        return DataLoader(self.data_val, batch_size=self.batch_size)

# Do some Test with data
if __name__ == "__main__":
	dm = SentimentDataModule(DATA_DIR)
	dm.setup()
	idx = 0
	for item in (dm.train_dataloader()):
		print(idx)
		print(item)
		idx += 1
		if idx > 5: break



26999 21599 5400
0
[('lần nào cũng thế tính ăn vui mà cứ ăn là no căng bụng chảo đầy_ắp đồ lun pate ngon nước sốt ngon khoai_tây nghiền thịt thơm ngon thôi rồi phục_vụ cái này mới là quan_trọng thực_sự là quá thân_thiện và nhiệt_tình luôn cả nhóm gần chục đứa đủ nhau là nghĩ ngay đến bánh_mì chảo xuyên việt quán vừa chuyển địa_điểm là tìm bằng được để đi ăn từ đầu khuất duy tiến vào cái auto bán tô rõ to số kdtien thì phải quán ngay trong cái ngõ cạnh đấy luôn ối ôi ngon quá', 'thích nhất đây là không_gian nhưng thích hơn cả nhất là đồ ăn và đồ uống cứ ngon bổ rẻ thì dù đâu cũng đến đc mà', 'mình không có chỗ nào chê về aeon mall cả sushi ngon mà quá rẻ lúc mình đi may_mắn là không đợi lâu quá ăn vào có vị ngọt ngọt cá tươi nhưng không có cảm_giác như đang ăn cá sống mùi_vị không tanh vì ham_hố mà cho mù_tạt vào nước tương nhưng nồng quá chấm vào ăn là xem như xông tới mũi không_thể nào ăn được mình với bạn mình quyết_định không chấm nữa nhưng ăn vẫn rất ngon không hề có cảm_giác ngán'

# Model

In [9]:
!pip install fairseq

Collecting fairseq
  Downloading fairseq-0.10.2-cp37-cp37m-manylinux1_x86_64.whl (1.7 MB)
[?25l[K     |▏                               | 10 kB 21.6 MB/s eta 0:00:01[K     |▍                               | 20 kB 27.4 MB/s eta 0:00:01[K     |▋                               | 30 kB 21.3 MB/s eta 0:00:01[K     |▊                               | 40 kB 17.4 MB/s eta 0:00:01[K     |█                               | 51 kB 14.7 MB/s eta 0:00:01[K     |█▏                              | 61 kB 12.9 MB/s eta 0:00:01[K     |█▍                              | 71 kB 12.2 MB/s eta 0:00:01[K     |█▌                              | 81 kB 13.5 MB/s eta 0:00:01[K     |█▊                              | 92 kB 13.9 MB/s eta 0:00:01[K     |██                              | 102 kB 12.9 MB/s eta 0:00:01[K     |██▏                             | 112 kB 12.9 MB/s eta 0:00:01[K     |██▎                             | 122 kB 12.9 MB/s eta 0:00:01[K     |██▌                             | 133 k

In [10]:
from fairseq.data import Dictionary
import sentencepiece as spm
from os.path import join as pjoin
from transformers import PreTrainedTokenizer
import sentencepiece as spm


class XLMRobertaTokenizer(PreTrainedTokenizer):
    """
    XLM-RoBERTa tokenizer adapted from transformers.PreTrainedTokenizer. This helps to convert the input text into 
    tokenized format. eg, 
    
    input: "Hello, how are you?" output: ["1", "2", "3", "65", "2", "1"]
    
    this class also provides the method to convert the tokenized format into the original text.
    
    eg, input: ["1", "2", "3", "65", "2", "1"] output: "Hello, how are you?"
    
    """
    def __init__(
            self,
            pretrained_file,
            bos_token="<s>",
            eos_token="</s>",
            sep_token="</s>",
            cls_token="<s>",
            unk_token="<unk>",
            pad_token="<pad>",
            mask_token="<mask>",
            **kwargs
    ):
        """
        :param pretrained_file: path to the pretrained model file
        :param bos_token: beginning of sentence token
        :param eos_token: end of sentence token
        :param sep_token: separation token
        :param cls_token: classification token
        :param unk_token: unknown token
        :param pad_token: padding token
        :param mask_token: mask token
        """
        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            **kwargs,
        )
        # load bpe model and vocab file
        sentencepiece_model = pjoin(pretrained_file, 'sentencepiece.bpe.model')
        vocab_file = pjoin(pretrained_file, 'dict.txt')
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(
            sentencepiece_model)  # please dont use anything from sp_model bcz it makes everything goes wrong
        self.bpe_dict = Dictionary().load(vocab_file)
        # Mimic fairseq token-to-id alignment for the first 4 token
        self.fairseq_tokens_to_ids = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}
        # The first "real" token "," has position 4 in the original fairseq vocab and position 3 in the spm vocab
        self.fairseq_offset = 0
        self.fairseq_tokens_to_ids["<mask>"] = len(self.bpe_dict) + self.fairseq_offset
        self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}

    def _tokenize(self, text):
        """ Tokenize a string. """
        return self.sp_model.EncodeAsPieces(text)

    def _convert_token_to_id(self, token):
        """ Converts a token (str) in an id using the vocab. """
        if token in self.fairseq_tokens_to_ids:
            return self.fairseq_tokens_to_ids[token]
        spm_id = self.bpe_dict.index(token)
        return spm_id

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        if index in self.fairseq_ids_to_tokens:
            return self.fairseq_ids_to_tokens[index]
        return self.bpe_dict[index]

    @property
    def vocab_size(self):
        """ Size of the base vocabulary (without the added tokens) """
        return len(self.bpe_dict) + self.fairseq_offset + 1  # Add the <mask> token

    def get_vocab(self):
        """ Returns the vocabulary as a list of tokens. """
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

In [11]:
from transformers import XLMRobertaConfig, XLMRobertaForSequenceClassification
import torch

pretrained_path = '/content/drive/MyDrive/Colab/BERT/envibert/'
!ls $pretrained_path
# load tokenizer
roberta = XLMRobertaForSequenceClassification.from_pretrained(pretrained_path)
tokenizer = XLMRobertaTokenizer(pretrained_path)

config.json  dict.txt  model.pt  pytorch_model.bin  sentencepiece.bpe.model


Some weights of the model checkpoint at /content/drive/MyDrive/Colab/BERT/envibert/ were not used when initializing XLMRobertaForSequenceClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/Colab/BERT/envibert/ and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias']
You should pr

In [12]:
# try to convert some text into numbers
inputs = ["Tôi ghét nó", "Tôi thích nó", "Tôi quý nó"]
inputs = tokenizer(inputs, return_tensors='pt')
print(inputs)
outputs = roberta(**inputs, labels=torch.tensor([0, 1, 1]))
print(outputs)

{'input_ids': tensor([[ 842, 8919,  543],
        [ 842,  648,  543],
        [ 842,  976,  543]]), 'token_type_ids': tensor([[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1],
        [1, 1, 1],
        [1, 1, 1]])}
SequenceClassifierOutput(loss=tensor(0.6734, grad_fn=<NllLossBackward0>), logits=tensor([[-0.0442,  0.0246],
        [-0.0163,  0.0730],
        [-0.0819,  0.0218]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


In [13]:
from sklearn.metrics import roc_auc_score, classification_report, accuracy_score


class SentimentRoberta(pl.LightningModule):
    """
    SentimentRoberta class inherits from LightningModule
    This class is used to train a model using PyTorch Lightning
    It overrides the following methods:
        - forward : forward pass of the model
        - training_step : training step of the model
        - validation_step : validation step of the model
        - validation_epoch_end : end of the validation epoch
        - configure_optimizers : configure optimizers
    """
    def __init__(self, lr_roberta, lr_classifier):
        """
        Initialize the model with the following parameters:
            - lr_roberta : learning rate of the roberta model
            - lr_classifier : learning rate of the classifier model
        """
        super().__init__()
        self.roberta = XLMRobertaForSequenceClassification.from_pretrained(pretrained_path)
        self.tokenizer = XLMRobertaTokenizer(pretrained_path)
        self.lr_roberta = lr_roberta
        self.lr_classifer = lr_classifier

    def forward(self, texts, labels=None):
        """
        Forward pass of the model
        Args:
            - texts : input texts
            - labels : labels of the input texts
        """
        inputs = self.tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=256)
        for key in inputs:
            inputs[key] = inputs[key].to(self.device)

        outputs = self.roberta(**inputs, labels=labels)
        return outputs

    def configure_optimizers(self):
        """
        Configure optimizers
        This method is used to configure the optimizers of the model by using the learning rate
        for specific parameter of the roberta model and the classifier model
        """
        roberta_params = self.roberta.roberta.named_parameters()
        classifier_params = self.roberta.classifier.named_parameters()

        grouped_params = [
            {"params": [p for n, p in roberta_params], "lr": self.lr_roberta},
            {"params": [p for n, p in classifier_params], "lr": self.lr_classifer}
        ]
        optimizer = torch.optim.AdamW(
            grouped_params
        )
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1000, gamma=0.98)
        return {
            'optimizer': optimizer,
            'lr_scheduler': {
                'scheduler': scheduler,
                'monitor': 'f1/val',
            }
        }

    def training_step(self, batch, batch_idx):
        """
        Training step of the model
        Args:
            - batch : batch of the data
            - batch_idx : index of the batch
        """
        texts, labels = batch
        outputs = self(texts, labels=labels)

        if len(outputs.values()) == 3:
            loss, logits, _ = outputs.values()
        else:
            loss, logits = outputs.values()
        return loss

    def validation_step(self, batch, batch_idx):
        """
        Validation step of the model, used to compute the metrics
        Args:
            - batch : batch of the data
            - batch_idx : index of the batch
        """
        texts, labels = batch
        outputs = self(texts, labels=labels)

        if len(outputs.values()) == 3:
            loss, logits, _ = outputs.values()
        else:
            loss, logits = outputs.values()

        output_scores = torch.softmax(logits, dim=-1)
        return loss, output_scores, labels

    def validation_epoch_end(self, validation_step_outputs):
        """
        End of the validation epoch, this method will be called at the end of the validation epoch,
        it will compute the multiple metrics of classification problem
        Args:
            - validation_step_outputs : outputs of the validation step
        """

        val_preds = torch.tensor([], device=self.device)
        val_scores = torch.tensor([], device=self.device)
        val_labels = torch.tensor([], device=self.device)
        val_loss = 0
        total_item = 0

        for idx, item in enumerate(validation_step_outputs):
            loss, output_scores, labels = item

            predictions = torch.argmax(output_scores, dim=-1)
            val_preds = torch.cat((val_preds, predictions), dim=0)
            val_scores = torch.cat((val_scores, output_scores[:, 1]), dim=0)
            val_labels = torch.cat((val_labels, labels), dim=0)

            val_loss += loss
            total_item += 1

        # print("VAL PREDS", val_preds.shape)
        # print("VAL SCORES", val_scores.shape)
        # print("VAL LABELS", val_labels.shape)
        val_preds = val_preds.cpu().numpy()
        val_scores = val_scores.cpu().numpy()
        val_labels = val_labels.cpu().numpy()

        reports = classification_report(val_labels, val_preds, output_dict=True)
        print("VAL LABELS", val_labels)
        print("VAL SCORES", val_scores)
        try:
            auc = roc_auc_score(val_labels, val_scores)
        except Exception as e:
            print(e)
            print("Cannot calculate AUC. Default to 0")
            auc = 0
        accuracy = accuracy_score(val_labels, val_preds)

        print(classification_report(val_labels, val_preds))

        self.log("loss/val", val_loss)
        self.log("auc/val", auc)
        self.log("accuracy/val", accuracy)
        self.log("precision/val", reports["weighted avg"]["precision"])
        self.log("recall/val", reports["weighted avg"]["recall"])
        self.log("f1/val", reports["weighted avg"]["f1-score"])

In [14]:
trainer = pl.Trainer(
    fast_dev_run=True,
)
model = SentimentRoberta(lr_roberta=1e-5, lr_classifier=3e-3)
dm = SentimentDataModule()

trainer.fit(model, dm)

GPU available: True, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
  "GPU available but not used. Set the gpus flag in your trainer `Trainer(gpus=1)` or script `--gpus=1`."
Running in fast_dev_run mode: will run a full train, val, test and prediction loop using 1 batch(es).
Some weights of the model checkpoint at /content/drive/MyDrive/Colab/BERT/envibert/ were not used when initializing XLMRobertaForSequenceClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceCl

26999 21599 5400


  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

VAL LABELS [1. 1. 1. 1. 0. 0. 1. 1. 0. 1. 1. 1. 0. 1. 0. 1.]
VAL SCORES [0.6546619  0.48077685 0.30695942 0.27447042 0.45221782 0.28273666
 0.43414697 0.49028432 0.4563989  0.5353307  0.43616897 0.49422556
 0.28345835 0.40376085 0.1109295  0.29817784]
              precision    recall  f1-score   support

         0.0       0.36      1.00      0.53         5
         1.0       1.00      0.18      0.31        11

    accuracy                           0.44        16
   macro avg       0.68      0.59      0.42        16
weighted avg       0.80      0.44      0.38        16



# Training

In [None]:
from pytorch_lightning import loggers as pl_loggers
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
import warnings
warnings.filterwarnings('ignore')

torch.manual_seed(123)

tb_logger = pl_loggers.TensorBoardLogger('/content/drive/MyDrive/Colab/BERT/logs/')

trainer = pl.Trainer(
    min_epochs=1,
    max_epochs=5,
    gpus=1,
    precision=16,
    val_check_interval=0.5,
    # check_val_every_n_epoch=1,
    callbacks=[
      ModelCheckpoint(
          dirpath='/content/drive/MyDrive/Colab/BERT/ckpt',
          save_top_k=3,
          monitor='f1/val',
      ), 
      EarlyStopping('f1/val', patience=5)
    ],
    fast_dev_run=False,
    logger=tb_logger
)

dm.setup(stage="fit")
trainer.fit(model, dm)

Using 16bit native Automatic Mixed Precision (AMP)
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type                                | Params
----------------------------------------------------------------
0 | roberta | XLMRobertaForSequenceClassification | 70.7 M
----------------------------------------------------------------
70.7 M    Trainable params
0         Non-trainable params
70.7 M    Total params
141.409   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

VAL LABELS [1. 1. 1. 1. 0. 0. 1. 1. 0. 1. 1. 1. 0. 1. 0. 1. 1. 0. 1. 0. 1. 0. 1. 0.
 1. 0. 0. 0. 1. 0. 0. 0.]
VAL SCORES [0.6547292  0.48103243 0.30808616 0.27439997 0.45230156 0.2833846
 0.43440133 0.4904263  0.45684895 0.5359415  0.43611163 0.49399406
 0.28358296 0.40340528 0.11119294 0.29930994 0.67329913 0.6173973
 0.39530924 0.5362218  0.4744148  0.4539577  0.49743652 0.4250085
 0.4638541  0.46393755 0.5469399  0.29731688 0.431674   0.51261824
 0.52215034 0.30074558]
              precision    recall  f1-score   support

         0.0       0.42      0.67      0.51        15
         1.0       0.38      0.18      0.24        17

    accuracy                           0.41        32
   macro avg       0.40      0.42      0.38        32
weighted avg       0.39      0.41      0.37        32



Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

VAL LABELS [1. 1. 1. ... 1. 1. 1.]
VAL SCORES [0.97023994 0.90087914 0.30985844 ... 0.97409195 0.95305705 0.78340906]
              precision    recall  f1-score   support

         0.0       0.84      0.91      0.87      2675
         1.0       0.91      0.82      0.86      2725

    accuracy                           0.87      5400
   macro avg       0.87      0.87      0.87      5400
weighted avg       0.87      0.87      0.87      5400



Validating: 0it [00:00, ?it/s]

VAL LABELS [1. 1. 1. ... 1. 1. 1.]
VAL SCORES [0.99736315 0.992062   0.9666631  ... 0.9990854  0.9984452  0.98324674]
              precision    recall  f1-score   support

         0.0       0.91      0.86      0.88      2675
         1.0       0.87      0.92      0.89      2725

    accuracy                           0.89      5400
   macro avg       0.89      0.89      0.89      5400
weighted avg       0.89      0.89      0.89      5400



Validating: 0it [00:00, ?it/s]

VAL LABELS [1. 1. 1. ... 1. 1. 1.]
VAL SCORES [0.98549646 0.98168314 0.84479356 ... 0.99190676 0.9864026  0.9301416 ]
              precision    recall  f1-score   support

         0.0       0.91      0.86      0.89      2675
         1.0       0.87      0.92      0.90      2725

    accuracy                           0.89      5400
   macro avg       0.89      0.89      0.89      5400
weighted avg       0.89      0.89      0.89      5400



Validating: 0it [00:00, ?it/s]

VAL LABELS [1. 1. 1. ... 1. 1. 1.]
VAL SCORES [0.9957353  0.99582565 0.843702   ... 0.99762183 0.99783844 0.8802834 ]
              precision    recall  f1-score   support

         0.0       0.90      0.89      0.89      2675
         1.0       0.89      0.90      0.89      2725

    accuracy                           0.89      5400
   macro avg       0.89      0.89      0.89      5400
weighted avg       0.89      0.89      0.89      5400



TEST

In [None]:
# show the result here
%reload_ext tensorboard
%tensorboard --logdir '/content/drive/MyDrive/colab/BERT/logs/'

In [1]:
# test the model with some sentence
inputs = ["phục vụ lâu quá mặc dù khách rất vắng"]
outputs = model(inputs)
logits = outputs['logits']
score = torch.softmax(logits, dim=-1)
Labels = ["Positive", "Negative"]
print(score)
print(f"The sentence: '{inputs[0]}' has {Labels[torch.argmax(score, dim=-1).item()]} tone with confident score : {score[torch.argmax(score, dim=-1).item()]}" )


NameError: ignored