## Install dependencies
- pytorch-lightning: a lightweight trainer that helps you minimize boilerplate code
- transformers: a library that contains multiple BERT-style models
- sentencepiece: a fast subword tokenizer library (required by the XLM-RoBERTa tokenizer)

In [None]:
!pip install pytorch-lightning
!pip install transformers
!pip install sentencepiece

Collecting pytorch-lightning
  Downloading pytorch_lightning-1.5.3-py3-none-any.whl (523 kB)
[K     |████████████████████████████████| 523 kB 12.9 MB/s 
[?25hCollecting future>=0.17.1
  Downloading future-0.18.2.tar.gz (829 kB)
[K     |████████████████████████████████| 829 kB 51.3 MB/s 
Collecting pyDeprecate==0.3.1
  Downloading pyDeprecate-0.3.1-py3-none-any.whl (10 kB)
Collecting PyYAML>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 33.3 MB/s 
Collecting fsspec[http]!=2021.06.0,>=2021.05.0
  Downloading fsspec-2021.11.1-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 54.8 MB/s 
Collecting torchmetrics>=0.4.1
  Downloading torchmetrics-0.6.0-py3-none-any.whl (329 kB)
[K     |████████████████████████████████| 329 kB 56.0 MB/s 
Collecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x

In [None]:
# Mount Google Drive at /content/drive so the dataset stored there can be read
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


# LOAD DATA

In [None]:
# Replace this path with your own dataset directory; the `ls` below lists its contents
DATA_ROOT_DIR="/content/drive/MyDrive/Colabs/shopee-sentiment"
!ls $DATA_ROOT_DIR

sample_submission.csv  train.csv	     train_preprocess_unsegment.csv
test.csv	       train.gsheet
test_preprocess.csv    train_preprocess.csv


In [None]:
# Import dependencies
import pandas as pd
import numpy as np
from torch.utils.data import random_split, DataLoader, Dataset
import pytorch_lightning as pl
import torch.nn as nn
import torch
import time

# Custom ratio: fraction of the rows used for training; the remainder is used for validation.
# NOTE(review): 0.2 means only 20% of the data is trained on — confirm this is intended.
train_ratio = 0.2

# NOTE: despite its name, DATA_DIR is the path of the preprocessed CSV file, not a directory.
DATA_DIR = '/content/drive/MyDrive/Colabs/train_preprocess.csv'

In [None]:
# Load only the text and label columns into a pandas DataFrame (a spreadsheet-like
# table), discard rows with missing values, and preview the first rows.
train = (
    pd.read_csv(DATA_DIR, usecols=['preprocess_text', 'class'])
    .dropna()
)
train.head()

Unnamed: 0,class,preprocess_text
0,1,đến quán 2 lần thôi rất là thích quán tuy nằm ...
1,0,đến quán vào tối chủ_nhật có band hát khá ổn t...
2,0,phục_vụ lâu quá mặc_dù khách rất vắng đợi gần ...
3,0,ko gian bé_tí quán chật_chội đông người nên ...
4,1,khi mình order đặt bánh thì nhận được sự tiếp_...


In [None]:
from typing import Optional
class SentimentData(Dataset):
    """
    Map-style PyTorch dataset over the preprocessed sentiment CSV.

    Each item is a ``(text, label)`` pair read from the ``preprocess_text`` and
    ``class`` columns. A torch ``Dataset`` subclass has to implement
    ``__len__`` and ``__getitem__``.
    """
    def __init__(self, data_dir):
        """
        Args:
            data_dir (string): Path of the csv file. The first column of the
                file is used as the index; rows containing missing values are
                dropped and the index is renumbered from 0.
        """
        self.df = pd.read_csv(data_dir, index_col=0).dropna().reset_index(drop=True)

    def __len__(self):
        """
        Length of the dataset, i.e. number of rows kept from the csv file
        Returns: int
        """
        return len(self.df)

    def __getitem__(self, idx):
        """
        Given a row position, return the corresponding row of the csv file.

        Positional access (``.iat``) is used instead of label-based chained
        indexing, so negative positions work and an out-of-range position
        raises ``IndexError``.

        Returns: text (string), label (int)
        """
        text = self.df["preprocess_text"].iat[idx]
        # Convert the numpy scalar to a plain Python int, as documented.
        label = int(self.df["class"].iat[idx])

        return text, label


class SentimentDataModule(pl.LightningDataModule):
    """
    LightningDataModule that splits the sentiment CSV into a training and a
    validation subset and exposes a DataLoader for each of them.
    """

    def __init__(self, data_dir: str = DATA_DIR, batch_size: int = 8,
                 train_ratio: Optional[float] = None, seed: int = 123):
        """
        Args:
            data_dir (string): Path of the csv file
            batch_size (int): batch size for both dataloaders
            train_ratio (float, optional): fraction of rows used for training.
                When None, the module-level ``train_ratio`` is read at setup time.
            seed (int): seed of the generator used for the train/val split, so
                that every call to ``setup`` produces the same split.
        """
        super().__init__()
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.train_ratio = train_ratio
        self.seed = seed

    def setup(self, stage: Optional[str] = None):
        """
        Read the csv file and split it into ``data_train`` and ``data_val``.

        The split uses a dedicated seeded generator: calling ``setup`` again
        (for example once manually and once through ``trainer.fit``) yields the
        same subsets, so validation rows never leak into training between runs.

        Raises:
            ValueError: if the resolved train ratio is outside [0, 1].
        """
        ratio = train_ratio if self.train_ratio is None else self.train_ratio
        if not 0.0 <= ratio <= 1.0:
            raise ValueError(f"train_ratio must be within [0, 1], got {ratio}")

        data_full = SentimentData(self.data_dir)
        train_size = round(len(data_full) * ratio)
        val_size = len(data_full) - train_size
        print(len(data_full), train_size, val_size)

        split_generator = torch.Generator().manual_seed(self.seed)
        self.data_train, self.data_val = random_split(
            data_full, [train_size, val_size], generator=split_generator
        )

    def train_dataloader(self):
        """
        Returns: dataloader for training (reshuffled every epoch)
        """
        return DataLoader(self.data_train, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        """
        Returns: dataloader for validation (fixed order)
        """
        return DataLoader(self.data_val, batch_size=self.batch_size)

# Do some Test with data: print the first six training batches
if __name__ == "__main__":
    dm = SentimentDataModule(DATA_DIR)
    dm.setup()
    # range() is listed first so zip stops before pulling a seventh batch
    # from the dataloader.
    for idx, item in zip(range(6), dm.train_dataloader()):
        print(idx)
        print(item)



27000 5400 21600
0
[('quán này mình ăn cả 2 năm nay rồi tuy_nhiên mới gia_nhập gia_đình fody nên giờ mới có cơ_hội review địa_điểm khá dễ tìm tuy_nhiên quán nằm trên dốc ngự_bình   nên nếu đi xe_máy thì không sao chứ đi xe_đạp mà lên đây ăn thì hơi nhác view bình_thường lần đầu_tiên vào đây mình khá ấn_tượng bởi cái bảng trên đó vinh_danh   nhưng danh_nhân ăn_chay trên thế_giới quán nhìn sạch_sẽ thoáng mát tuy_nhiên nếu đến đây vào ngày rằm mồng 1 thì khác chật_chội và nhìn hơi bẩn phục_vụ tạm ổn giá_cả chất_lượng quán này là sự kết_hợp của cả 3 yếu_tố ngon bổ rẻ giá thì rẻ mà thức_ăn lại rất ngon vì_vậy mình mới là khách quen của quán', 'hôm rồi mình có đi dạo trên vỉa_hè bạch_đằng thì bắt_gặp những chiếc xe_đạp hồng khá dể thương để_ý kỉ thấy nó bán cafe và kem dọc con đường này mình có gọi thử 2 cây kem kem có 23 vị mà mình chọn socola cô bé bán hàng rất bài_bản bắt kem trên cây ốc quế xịt socôla còn gói thêm cho mình ít giấy_ăn cô bé còn gởi mình card kèm số đt để gọi khi cần thật_

# Model

In [None]:
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
# Load the pretrained "xlm-roberta-base" tokenizer and sequence-classification model.
# NOTE: the model variable is called `mbert`, but the checkpoint is XLM-RoBERTa.
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
mbert = XLMRobertaForSequenceClassification.from_pretrained("xlm-roberta-base")

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/8.68M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/512 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense

In [None]:
# Try to convert some text into numbers: tokenize a few sample sentences into
# padded PyTorch tensors, then feed them to the model and print the raw output.
# NOTE: `inputs` is rebound from the list of strings to the tokenizer output.
inputs = ["Tôi ghét nó", "Tôi thích nó", "Tôi quý nó"]
inputs = tokenizer(inputs, return_tensors='pt',padding=True ,truncation=True)
print(inputs)
outputs = mbert(**inputs)
print(outputs)

{'input_ids': tensor([[    0, 14343, 74443,    18,  3711,     2],
        [    0, 14343, 12186,  3711,     2,     1],
        [    0, 14343, 23640,  3711,     2,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 0]])}
SequenceClassifierOutput(loss=None, logits=tensor([[ 0.0987, -0.0863],
        [ 0.1001, -0.0840],
        [ 0.1030, -0.0844]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


In [None]:
# Display the module tree of the loaded model
mbert

XLMRobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (La

In [None]:
from sklearn.metrics import roc_auc_score, classification_report, accuracy_score


class SentimentRoberta(pl.LightningModule):
    """
    LightningModule wrapping XLM-RoBERTa (base) for sequence classification.

    Raw strings are tokenized inside ``forward``, so batches are
    ``(texts, labels)`` pairs. It overrides the following methods:
        - forward : tokenize the texts and run the model
        - training_step : compute (and log) the training loss
        - validation_step : compute loss and class probabilities for a batch
        - validation_epoch_end : aggregate the validation metrics
        - configure_optimizers : AdamW with one learning rate per parameter group
    """
    def __init__(self, lr_mbert, lr_classifier):
        """
        Initialize the model with the following parameters:
            - lr_mbert : learning rate of the pretrained encoder (``self.mbert.roberta``)
            - lr_classifier : learning rate of the classification head
        """
        super().__init__()
        self.mbert = XLMRobertaForSequenceClassification.from_pretrained("xlm-roberta-base")
        self.tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
        self.lr_mbert = lr_mbert
        self.lr_classifier = lr_classifier

    def forward(self, texts, labels=None):
        """
        Forward pass of the model
        Args:
            - texts : a string or a sequence of strings (list/tuple)
            - labels : optional labels of the input texts; when given, the
              returned output also carries the loss
        Returns: the model output object, exposing ``.logits`` and, when
            labels were passed, ``.loss``
        """
        if not isinstance(texts, str):
            # The default collate function may hand over a tuple of strings.
            texts = list(texts)
        encoded = self.tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=256)
        # The tokenizer creates CPU tensors; move them next to the model weights.
        encoded = {key: tensor.to(self.device) for key, tensor in encoded.items()}

        return self.mbert(**encoded, labels=labels)

    def configure_optimizers(self):
        """
        Configure optimizers
        Builds an AdamW optimizer with two parameter groups, so that the
        pretrained encoder and the classification head get their own
        learning rates, plus a StepLR schedule.
        """
        grouped_params = [
            {"params": list(self.mbert.roberta.parameters()), "lr": self.lr_mbert},
            {"params": list(self.mbert.classifier.parameters()), "lr": self.lr_classifier},
        ]
        optimizer = torch.optim.AdamW(grouped_params)
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1000, gamma=0.98)
        return {
            'optimizer': optimizer,
            'lr_scheduler': {
                'scheduler': scheduler,
                # step_size=1000 only makes sense when counted in optimizer
                # steps; with Lightning's default 'epoch' interval the decay
                # would never trigger in a short run.
                'interval': 'step',
            }
        }

    def training_step(self, batch, batch_idx):
        """
        Training step of the model
        Args:
            - batch : (texts, labels) pair
            - batch_idx : index of the batch
        Returns: the loss tensor for this batch
        """
        texts, labels = batch
        loss = self(texts, labels=labels).loss
        self.log("loss/train", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """
        Validation step of the model, used to compute the metrics
        Args:
            - batch : (texts, labels) pair
            - batch_idx : index of the batch
        Returns: (loss, class probabilities, labels) for this batch
        """
        texts, labels = batch
        outputs = self(texts, labels=labels)

        output_scores = torch.softmax(outputs.logits, dim=-1)
        return outputs.loss, output_scores, labels

    def validation_epoch_end(self, validation_step_outputs):
        """
        End of the validation epoch, this method will be called at the end of the validation epoch,
        it computes and logs several classification metrics: mean loss, AUC,
        accuracy and the weighted precision / recall / F1.
        Args:
            - validation_step_outputs : list of the tuples returned by validation_step
        """
        if not validation_step_outputs:
            return

        losses, scores, labels = zip(*validation_step_outputs)

        # Cast to float32 so metrics are not computed on half-precision values.
        all_scores = torch.cat(scores, dim=0).float()
        val_preds = torch.argmax(all_scores, dim=-1).cpu().numpy()
        val_scores = all_scores[:, 1].cpu().numpy()
        val_labels = torch.cat(labels, dim=0).cpu().numpy()

        # Average over the batches; a plain sum would grow with the size of
        # the validation set.
        val_loss = torch.stack([loss.detach().float() for loss in losses]).mean()

        reports = classification_report(val_labels, val_preds, output_dict=True, zero_division=0)
        try:
            auc = roc_auc_score(val_labels, val_scores)
        except ValueError as e:
            # Raised when only one class is present, e.g. in the short sanity check.
            print(e)
            print("Cannot calculate AUC. Default to 0")
            auc = 0.0
        accuracy = accuracy_score(val_labels, val_preds)

        print(classification_report(val_labels, val_preds, zero_division=0))

        self.log("loss/val", val_loss)
        self.log("auc/val", auc)
        self.log("accuracy/val", accuracy)
        self.log("precision/val", reports["weighted avg"]["precision"])
        self.log("recall/val", reports["weighted avg"]["recall"])
        self.log("f1/val", reports["weighted avg"]["f1-score"])

In [None]:

# Smoke test: fast_dev_run pushes a single batch through the loops to catch
# wiring errors before the long training run.
trainer = pl.Trainer(
    fast_dev_run=True,
    # Use the GPU when the runtime has one, otherwise fall back to CPU.
    gpus=1 if torch.cuda.is_available() else 0,
)
model = SentimentRoberta(lr_mbert=1e-5, lr_classifier=3e-3)
dm = SentimentDataModule()

trainer.fit(model, dm)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
Running in fast_dev_run mode: will run a full train, val, test and prediction loop using 1 batch(es).
Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification mode

27000 5400 21600



  | Name  | Type                                | Params
--------------------------------------------------------------
0 | mbert | XLMRobertaForSequenceClassification | 278 M 
--------------------------------------------------------------
278 M     Trainable params
0         Non-trainable params
278 M     Total params
1,112.181 Total estimated model params size (MB)
  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

VAL LABELS [1. 1. 0. 1. 1. 0. 0. 1.]
VAL SCORES [0.562457   0.56472605 0.6174318  0.6340162  0.5688453  0.4669752
 0.4944857  0.48685178]
              precision    recall  f1-score   support

         0.0       0.67      0.67      0.67         3
         1.0       0.80      0.80      0.80         5

    accuracy                           0.75         8
   macro avg       0.73      0.73      0.73         8
weighted avg       0.75      0.75      0.75         8



# Training

In [None]:
# Release the unused GPU memory cached by PyTorch before the full training run
import torch
torch.cuda.empty_cache()

In [None]:
from pytorch_lightning import loggers as pl_loggers
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
import warnings
warnings.filterwarnings('ignore')
# Start of the timed section; the timing cells below read `start`.
start = time.time()
torch.manual_seed(123)

# Use the GPU with mixed precision when available, otherwise CPU in float32.
use_gpu = torch.cuda.is_available()

tb_logger = pl_loggers.TensorBoardLogger('/content/drive/MyDrive/Colabs/logsxmlr/')

trainer = pl.Trainer(
    min_epochs=1,
    max_epochs=2,
    gpus=1 if use_gpu else 0,
    precision=16 if use_gpu else 32,
    val_check_interval=0.5,
    callbacks=[
      # F1 is a higher-is-better metric: both callbacks must use mode='max',
      # otherwise (default 'min') the worst checkpoints are kept and training
      # is stopped when the score improves.
      ModelCheckpoint(
          dirpath='/content/drive/MyDrive/Colabs/ckpt',
          save_top_k=3,
          monitor='f1/val',
          mode='max',
      ),
      EarlyStopping(monitor='f1/val', mode='max', patience=5)
    ],
    fast_dev_run=False,
    logger=tb_logger
)

# trainer.fit() calls dm.setup() itself, so no manual setup call is needed.
trainer.fit(model, dm)

Using 16bit native Automatic Mixed Precision (AMP)
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                                | Params
--------------------------------------------------------------
0 | mbert | XLMRobertaForSequenceClassification | 278 M 
--------------------------------------------------------------
278 M     Trainable params
0         Non-trainable params
278 M     Total params
556.090   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

VAL LABELS [1. 1. 0. 1. 1. 0. 0. 1. 0. 0. 0. 1. 1. 0. 1. 0.]
VAL SCORES [0.58962774 0.58649325 0.62337685 0.6410854  0.58752906 0.50980633
 0.5351212  0.52854866 0.6466276  0.60372746 0.5990747  0.6139312
 0.5842712  0.64282495 0.61199087 0.611353  ]
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         8
         1.0       0.50      1.00      0.67         8

    accuracy                           0.50        16
   macro avg       0.25      0.50      0.33        16
weighted avg       0.25      0.50      0.33        16



Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

VAL LABELS [1. 1. 0. ... 0. 0. 1.]
VAL SCORES [0.81405276 0.82876563 0.11531159 ... 0.7871551  0.1867616  0.803598  ]
              precision    recall  f1-score   support

         0.0       0.91      0.87      0.89     10757
         1.0       0.88      0.91      0.89     10843

    accuracy                           0.89     21600
   macro avg       0.89      0.89      0.89     21600
weighted avg       0.89      0.89      0.89     21600



In [None]:
# Record the end timestamp (seconds since the epoch)
end = time.time()

In [None]:
# Elapsed time in seconds between the `start` and `end` timestamps
end - start

# TEST

In [None]:
# show the result here
%reload_ext tensorboard
%tensorboard --logdir '/content/drive/MyDrive/Colabs/logxmlr/'

In [None]:
# test the model with some sentence
inputs = [
    "Món ăn ngon, đồ uống ngon, rẻ",
    "Món ăn ngon, đồ uống ngon nhưng đắt",
    "Tuy rẻ nhưng đồ ăn không ngon, đồ uống không ngon",
    "không gian chật hẹp, nhân viên không nhiệt tình, đò ăn tạm được",
    "hàng rất tốt, đẹp, mẫu mã đa dạng, sẽ quay lại lần sau"
  ]
# Inference only: switch off dropout and gradient tracking.
model.eval()
with torch.no_grad():
    outputs = model(inputs)
# The model returns an output object; softmax needs its `logits` tensor.
logits = outputs.logits
score = torch.softmax(logits, dim=-1)
Labels = ["Negative", "Positive"]
numSent = len(inputs)

predicted = torch.argmax(score, dim=-1)
for index, item in enumerate(inputs):
    label_id = predicted[index].item()
    print(f"The sentence: '{item}' has {Labels[label_id]} tone with confident score : {score[index][label_id].item()} \n" )


In [None]:
import os

SAVE_MODEL_PATH = "/content/drive/MyDrive/Colabs/saveModel/xmlrModel.pt"
# torch.save does not create missing parent directories, so create them first.
os.makedirs(os.path.dirname(SAVE_MODEL_PATH), exist_ok=True)
torch.save(model.state_dict(), SAVE_MODEL_PATH)

In [None]:
# Rebuild the module, then restore the fine-tuned weights saved above.
modelSave = SentimentRoberta(lr_mbert=1e-5, lr_classifier=3e-3)
# map_location="cpu": the new module lives on the CPU, and this also lets a
# checkpoint saved from GPU tensors load on a runtime without a GPU.
modelSave.load_state_dict(torch.load(SAVE_MODEL_PATH, map_location="cpu"))

In [None]:
# test the reloaded model with some sentence
inputs = [
    "Món ăn ngon, đồ uống ngon, rẻ",
    "Món ăn ngon",
    "Tuy rẻ nhưng đồ ăn không ngon, đồ uống không ngon",
    "không gian chật hẹp, nhân viên nhiệt tình, đồ ăn ngon",
    "hàng rất tốt, đẹp, mẫu mã đa dạng, sẽ quay lại lần sau"
  ]
# Inference only: switch off dropout and gradient tracking.
modelSave.eval()
with torch.no_grad():
    outputs = modelSave(inputs)
# The model returns an output object; softmax needs its `logits` tensor.
logits = outputs.logits
score = torch.softmax(logits, dim=-1)
Labels = ["Negative", "Positive"]
numSent = len(inputs)

predicted = torch.argmax(score, dim=-1)
for index, item in enumerate(inputs):
    label_id = predicted[index].item()
    print(f"The sentence: '{item}' has {Labels[label_id]} tone with confident score : {score[index][label_id].item()} \n" )
