In [30]:
import os
from typing import List, Text, Tuple, Union
from pprint import pprint
from datetime import timedelta, datetime

from pyvi import ViTokenizer
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression as LR
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer, FunctionTransformer
from sklearn.multiclass import OneVsRestClassifier as OVRStrategy
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, cross_validate, GridSearchCV
from sklearn.utils.validation import check_is_fitted
from sentence_transformers import SentenceTransformer
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from emandai.utils import load_data_from_botid
from tqdm import tqdm
from bokeh.io import output_file, output_notebook, export_png, show
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure
from bokeh.transform import dodge
from bokeh.layouts import gridplot

from va.src.utils.data import load_va_data

# First Look

## Load Data

In [85]:
PATH = "../data/trungquan/trainset.xlsx"
X, y = load_va_data(PATH)

## Preprocess Text

In [86]:
def preprocess(texts):
    """ Word Segmentation """
    tokenize = lambda x: ViTokenizer.tokenize(x)

    return [tokenize(text) for text in texts]

In [87]:
X = preprocess(X)

## Encode Label

In [88]:
lb = LabelBinarizer()
y = lb.fit_transform(y)
pprint(lb.classes_)
pprint(y)

array(['C2A_TUVAN', 'C2B_GLS', 'C3_KQT'], dtype='<U9')
array([[0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       ...,
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0]])


In [89]:
def encode_label(labels):
    lb = LabelBinarizer()
    lb.fit(labels)
    return lb

## Text Embedding

### TF-IDF

In [90]:
tfidf = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b", ngram_range=(1, 2))

In [91]:
tfidf_X = tfidf.fit_transform(X)

In [92]:
len(tfidf.vocabulary_)

1281

### Vietnamese SBERT

In [229]:
sbert = SentenceTransformer('keepitreal/vietnamese-sbert')

In [93]:
sbert_X = sbert.encode(X)

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

In [94]:
sbert_X.shape

(396, 768)

### PhoBERT

In [331]:
phobert = AutoModelForSequenceClassification.from_pretrained("vinai/phobert-base")
photokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

Some weights of the model checkpoint at vinai/phobert-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['

In [95]:
def phobert_encode(text: Union[Text, List[Text]],
                   phobert,
                   photokenizer) -> np.ndarray:
    """ Get sentence embedding from [CLS] token of the given text.
    reference: https://discuss.huggingface.co/t/how-to-get-cls-embeddings-from-bertfortokenclassification-model/9276/3
    """
    embeddings = None

    encoded_text = photokenizer(text, padding=True, truncation=True, return_tensors="pt")
    text_ids = encoded_text["input_ids"]
    with torch.no_grad():
        outputs = phobert(text_ids, output_hidden_states=True)
        last_hidden_state = outputs.hidden_states[-1]
        embeddings = last_hidden_state[:, 0, :]

        if isinstance(embeddings, torch.Tensor):
            embeddings = embeddings.detach().cpu().numpy()

        return embeddings

In [2]:
class PhoBertWrapper:
    def __init__(self):
        self.model = AutoModelForSequenceClassification.from_pretrained("vinai/phobert-base")
        self.tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

    def encode(self, X: Union[Text, List[Text]]):
        embeddings = None

        encoded_text = self.tokenizer(X, padding=True, truncation=True, return_tensors="pt")
        text_ids = encoded_text["input_ids"]
        with torch.no_grad():
            outputs = self.model(text_ids, output_hidden_states=True)
            last_hidden_state = outputs.hidden_states[-1]
            embeddings = last_hidden_state[:, 0, :]
            if isinstance(embeddings, torch.Tensor):
                embeddings = embeddings.detach().cpu().numpy()

        return embeddings

In [None]:
phobert_X = phobert_encode(X, phobert, photokenizer)

In [80]:
phobert_X.shape

(396, 768)

## Classifier

### Logistic Regression

In [None]:
clf = {}
clf["tfidf"] = OneVsRestClassifier(LogisticRegression()).fit(tfidf_X, y)
clf["sbert"] = OneVsRestClassifier(LogisticRegression()).fit(sbert_X, y)
clf["phobert"] = OneVsRestClassifier(LogisticRegression()).fit(phobert_X, y)

In [77]:
""" Inference """
def infer(text: Text,
          embedding_type: Text = "tfidf",
          with_prob: bool = False
    ) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
    pred = []

    if not isinstance(text, List):
        text = [text]

    if embedding_type == "tfidf":
        vectorizer = tfidf
        _clf = clf["tfidf"]
    elif embedding_type == "sbert":
        vectorizer = sbert
        _clf = clf["sbert"]
    elif embedding_type == "phobert":
        vectorizer = phobert
        _clf = clf["phobert"]
    else:
        raise ValueError

    # preprocess
    text = preprocess(text)

    # vectorize
    if embedding_type == "tfidf":
        features = vectorizer.transform(text)
    elif embedding_type == "sbert":
        features = vectorizer.encode(text)
    elif embedding_type == "phobert":
        features = phobert.encode(text)
    else:
        raise ValueError

    # classify
    pred = _clf.predict(features)

    if with_prob:
        prob = _clf.predict_proba(features)
        prob = np.max(prob, axis=1)
        return pred, prob

    return pred

#### Features

##### TF-IDF

In [242]:
def test1(text):
    pred, prob = infer(text, embedding_type="tfidf", with_prob=True)
    pprint(pred)
    pprint(lb.inverse_transform(pred))
    pprint(prob)

In [243]:
text = "em có thể diễn giải chi tiết hơn được không"
test1(text)

array([[0, 0, 0]])
array(['C2A_TUVAN'], dtype='<U9')
array([0.34704396])


In [246]:
text = "hay là chút nữa liên lạc lại em nhé"
test1(text)

array([[0, 1, 0]])
array(['C2B_GLS'], dtype='<U9')
array([0.51182048])


In [248]:
text = "cảm ơn em nhưng chắc là anh không hứng thú"
test1(text)

array([[0, 0, 1]])
array(['C3_KQT'], dtype='<U9')
array([0.62048898])


In [251]:
text = "anh nghĩ là em nên dừng lại"
test1(text)

array([[0, 0, 0]])
array(['C2A_TUVAN'], dtype='<U9')
array([0.46300921])


In [253]:
text = "dạ không ạ"
test1(text)

array([[0, 0, 0]])
array(['C2A_TUVAN'], dtype='<U9')
array([0.46125805])


In [257]:
text = "không đâu chị ơi"
test1(text)

array([[0, 0, 0]])
array(['C2A_TUVAN'], dtype='<U9')
array([0.36902174])


In [265]:
text = "ừ không, mình đang làm rồi bạn nha"
test1(text)

array([[0, 0, 0]])
array(['C2A_TUVAN'], dtype='<U9')
array([0.38223752])


##### SBERT

In [244]:
def test2(text):
    pred, prob = infer(text, embedding_type="sbert", with_prob=True)
    pprint(pred)
    pprint(lb.inverse_transform(pred))
    pprint(prob)

In [245]:
text = "em có thể diễn giải chi tiết hơn được không"
test2(text)

array([[1, 0, 0]])
array(['C2A_TUVAN'], dtype='<U9')
array([0.69315568])


In [247]:
text = "hay là chút nữa liên lạc lại em nhé"
test2(text)

array([[0, 1, 0]])
array(['C2B_GLS'], dtype='<U9')
array([0.95827348])


In [249]:
text = "cảm ơn em nhưng chắc là anh không hứng thú"
test2(text)

array([[0, 0, 1]])
array(['C3_KQT'], dtype='<U9')
array([0.77097445])


In [252]:
text = "anh nghĩ là em nên dừng lại"
test2(text)

array([[0, 0, 0]])
array(['C2A_TUVAN'], dtype='<U9')
array([0.21407058])


In [256]:
text = "dạ không ạ"
test2(text)

array([[1, 0, 0]])
array(['C2A_TUVAN'], dtype='<U9')
array([0.89481114])


In [258]:
text = "không đâu chị ơi"
test2(text)

array([[0, 0, 1]])
array(['C3_KQT'], dtype='<U9')
array([0.66558446])


In [267]:
text = "ừ không, mình đang làm rồi bạn nha"
test2(text)

array([[0, 0, 1]])
array(['C3_KQT'], dtype='<U9')
array([0.80773914])


##### PhoBERT

In [78]:
def test3(text):
    pred, prob = infer(text, embedding_type="phobert", with_prob=True)
    pprint(pred)
    pprint(lb.inverse_transform(pred))
    pprint(prob)

In [None]:
text = "em có thể diễn giải chi tiết hơn được không"
test3(text)

In [457]:
text = "hay là chút nữa liên lạc lại em nhé"
test3(text)

array([[0, 0, 0]])
array(['C2A_TUVAN'], dtype='<U9')
array([0.37629714])


# Benchmark

In [2]:
embedding_types = ["tfidf", "sbert", "phobert"]

In [3]:
os.environ["CUDA_VISIBLE_DEVICES"] = ""

In [4]:
class PhoBertWrapper:
    def __init__(self):
        self.model = AutoModelForSequenceClassification.from_pretrained("vinai/phobert-base")
        self.tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

    def encode(self, X: Union[Text, List[Text]]):
        embeddings = None

        encoded_text = self.tokenizer(X, padding=True, truncation=True, return_tensors="pt")
        text_ids = encoded_text["input_ids"]
        with torch.no_grad():
            outputs = self.model(text_ids, output_hidden_states=True)
            last_hidden_state = outputs.hidden_states[-1]
            embeddings = last_hidden_state[:, 0, :]
            if isinstance(embeddings, torch.Tensor):
                embeddings = embeddings.detach().cpu().numpy()

        return embeddings

In [5]:
sbert = SentenceTransformer('keepitreal/vietnamese-sbert')
phobert = PhoBertWrapper()

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: keepitreal/vietnamese-sbert
INFO:sentence_transformers.SentenceTransformer:Use pytorch device: cpu
Some weights of the model checkpoint at vinai/phobert-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a Be

## Load Data

In [6]:
botid = {
    "sunlaw": "6268f7e49f455cd4ea292d88",
    "lamhaian": "6268f8049f455c653b292e29",
    "giang.nguyen": "6268f7f89f455cd9eb292df4",
    "panasonic": "628c83a46c9e4c57fdbc5df6",
    "trung.quan": "628a14af6c9e4c08cebc3e34"
}

### Training Set

In [7]:
# load data
data = {k: load_data_from_botid(v) for k, v in botid.items()}

INFO:oauth2client.transport:Attempting refresh to obtain initial access_token
INFO:oauth2client.client:Refreshing access_token
INFO:oauth2client.transport:Attempting refresh to obtain initial access_token
INFO:oauth2client.client:Refreshing access_token
INFO:oauth2client.transport:Attempting refresh to obtain initial access_token
INFO:oauth2client.client:Refreshing access_token
INFO:oauth2client.transport:Attempting refresh to obtain initial access_token
INFO:oauth2client.client:Refreshing access_token
INFO:oauth2client.transport:Attempting refresh to obtain initial access_token
INFO:oauth2client.client:Refreshing access_token


### Test set

## Preprocess Data

In [8]:
# preprocess
def preprocess(data: pd.DataFrame) -> pd.DataFrame:
    """ Text preprocessing
    """
    # remove duplicate
    data = data.drop_duplicates("Sentence")

    # lower
    data["Sentence"] = data["Sentence"].map(lambda x: x.lower())
    data.head(n=10)

    # word segmenation
    data["Sentence"] = data["Sentence"].map(ViTokenizer.tokenize)

    return data

In [9]:
preprocessed_data = {k: preprocess(v) for k, v in data.items()}

In [10]:
X = {k: v["Sentence"].tolist() for k, v in preprocessed_data.items()}

## Get Features (DEPRECATED)

In [7]:
""" Inititialize featurizers 
"""
tfidf = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b", ngram_range=(1, 2))
sbert = SentenceTransformer('keepitreal/vietnamese-sbert')
phobert = PhoBertWrapper()

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: keepitreal/vietnamese-sbert
INFO:sentence_transformers.SentenceTransformer:Use pytorch device: cuda
Some weights of the model checkpoint at vinai/phobert-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a B

In [8]:
def get_features(X: Union[Text, List[Text]],
                 embedding_type: Text) -> np.ndarray:
    if embedding_type == "tfidf":
        X = tfidf.fit_transform(X).toarray()
    elif embedding_type == "sbert":
        X = sbert.encode(X)
    elif embedding_type == "phobert":
        X = phobert.encode(X)
    else:
        raise ValueError("Current supported text encoders: ['tfidf', 'sbert', 'phobert']")

    return X

In [9]:
X = {k: v["Sentence"].tolist() for k, v in preprocessed_data.items()}

In [10]:
features = {k: {t: get_features(X[k], t) for t in embedding_types} for k, v in tqdm(X.items())}

  0%|                                                                                                           | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/26 [00:00<?, ?it/s]

 33%|█████████████████████████████████                                                                  | 1/3 [00:07<00:15,  7.71s/it]

Batches:   0%|          | 0/23 [00:00<?, ?it/s]

 67%|██████████████████████████████████████████████████████████████████                                 | 2/3 [00:11<00:05,  5.63s/it]

Batches:   0%|          | 0/29 [00:00<?, ?it/s]

100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:17<00:00,  5.70s/it]


## Encode Labels

In [11]:
f = lambda x: [row[row == 1].index.tolist() for _, row in x.iterrows()]
y = {k: f(v) for k, v in preprocessed_data.items()}
label_encoders = {k: MultiLabelBinarizer().fit(v) for k, v in y.items()}
y = {k: label_encoders[k].transform(v) for k, v in y.items()}

## Training

In [12]:
MAX_CPU = 4

def make_model(embs_type: Text, run_gridsearch=False) -> Pipeline:
    assert isinstance(embs_type, Text)

    if embs_type == "tfidf":
        pipeline = Pipeline(steps=[
            ("vect", TfidfVectorizer(token_pattern=r"(?u)\b\w+\b", ngram_range=(1, 2))),
            ("clf", OVRStrategy(LR(max_iter=1000, n_jobs=MAX_CPU), n_jobs=MAX_CPU))
        ])
    elif embs_type == "sbert":
        pipeline = Pipeline(steps=[
            ("vect", FunctionTransformer(sbert.encode)),
            ("clf", OVRStrategy(LR(max_iter=1000, n_jobs=MAX_CPU), n_jobs=MAX_CPU))
        ])
    elif embs_type == "phobert":
        pipeline = Pipeline(steps=[
            ("vect", FunctionTransformer(phobert.encode)),
            ("clf", OVRStrategy(LR(max_iter=1000, n_jobs=MAX_CPU), n_jobs=MAX_CPU))
        ])
    else:
        raise ValueError("Current supported text encoders: ['tfidf', 'sbert', 'phobert']")

    if run_gridsearch:
        if embs_type == "tfidf":
            params = {
                "vect__ngram_range": [(1, 1), (1, 2), (1, 2), (2, 3)],
                "vect__sublinear_tf": [True, False],
                "vect__max_df": [0.25, 0.33, 0.5, 1.0],
                "clf__estimator__C": [1e-3, 1e-2, 1e-1, 1],
                "clf__estimator__class_weight": ["balanced", None]
            }
        else:
            params = {
                "clf__estimator__C": [1e-3, 1e-2, 1e-1, 1],
                "clf__estimator__class_weight": ["balanced", None]
            }
        pipeline = GridSearchCV(pipeline, param_grid=params,
                                n_jobs=MAX_CPU, cv=5, scoring="f1_micro")
        
    return pipeline

In [13]:
models = {k: {t: make_model(t, run_gridsearch=True) for t in embedding_types} for k in botid.keys()}

In [13]:
pprint(models)

{'giang.nguyen': {'phobert': GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vect',
                                        FunctionTransformer(func=<bound method PhoBertWrapper.encode of <__main__.PhoBertWrapper object at 0x7f6b52186b50>>)),
                                       ('clf',
                                        OneVsRestClassifier(estimator=LogisticRegression(max_iter=1000,
                                                                                         n_jobs=-1),
                                                            n_jobs=-1))]),
             n_jobs=-1,
             param_grid={'clf__estimator__C': [0.001, 0.01, 0.1, 1],
                         'clf__estimator__class_weight': ['balanced', None]},
             scoring='f1_micro'),
                  'sbert': GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vect',
                                        FunctionTransformer(func=<bound method SentenceTransformer.encode of SentenceTr

In [15]:
f1_micro = {k: {t: None for t in embedding_types} for k in botid.keys()}

In [37]:
KFOLD = 10 # https://cran.r-project.org/web/packages/cvms/vignettes/picking_the_number_of_folds_for_cross-validation.html#:~:text=When%20performing%20cross%2Dvalidation%2C%20it,common%20to%20use%2010%20folds.

# Training
for botname in tqdm(botid.keys()):
    for emb_type in embedding_types:
        model = models[botname][emb_type]
        _X = X[botname]
        _y = y[botname]
        scores = cross_validate(model, _X, _y,
                                scoring=("f1_micro"),
                                cv=KFOLD, n_jobs=1)
        f1_micro[botname][emb_type] = scores["test_score"].mean().round(2)

  0%|                                                                                                                 | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

 20%|█████████████████████                                                                                    | 1/5 [01:05<04:21, 65.47s/it]

Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

 40%|██████████████████████████████████████████                                                               | 2/5 [02:01<03:00, 60.02s/it]

Batches:   0%|          | 0/26 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/26 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/26 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/26 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/26 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/26 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/26 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/26 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/26 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/26 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

 60%|███████████████████████████████████████████████████████████████                                          | 3/5 [03:09<02:07, 63.66s/it]

Batches:   0%|          | 0/45 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/45 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/45 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/45 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/45 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/45 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/45 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/45 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/45 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/45 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

 80%|████████████████████████████████████████████████████████████████████████████████████                     | 4/5 [05:43<01:39, 99.19s/it]

Batches:   0%|          | 0/12 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/12 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/12 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/12 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/12 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/12 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/12 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/12 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/12 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/12 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [06:16<00:00, 75.24s/it]


In [18]:
f1_micro

{'sunlaw': {'tfidf': 0.65, 'sbert': 0.81, 'phobert': 0.59},
 'lamhaian': {'tfidf': 0.23, 'sbert': 0.39, 'phobert': 0.22},
 'giang.nguyen': {'tfidf': 0.26, 'sbert': 0.48, 'phobert': 0.31},
 'panasonic': {'tfidf': 0.35, 'sbert': 0.63, 'phobert': 0.3},
 'trung.quan': {'tfidf': 0.69, 'sbert': 0.81, 'phobert': 0.68}}

### Visualize

In [248]:
# Ref: https://docs.bokeh.org/en/latest/docs/user_guide/categorical.html#visual-offset

output_notebook()
# output_file("./dodged_bars.html")

botnames = [botname for botname in botid.keys()]
vis_tfidf = [f1_micro[botname]["tfidf"] for botname in botnames]
vis_sbert = [f1_micro[botname]["sbert"] for botname in botnames]
vis_phobert = [f1_micro[botname]["phobert"] for botname in botnames]

vis_data = {
    "botnames": botnames,
    "tfidf": vis_tfidf,
    "sbert": vis_sbert,
    "phobert": vis_phobert
}

source = ColumnDataSource(data=vis_data)

TITLE = "Models Benchmark (w/ diff vectorizer)"
p = figure(x_range=botnames, y_range=(0, 1), height=500,
           title=TITLE, toolbar_location=None, tools="")

p.vbar(x=dodge("botnames", -0.25, range=p.x_range),
       top="tfidf", width=0.2, source=source,
       color="#c9d9d3", legend_label="TF-IDF (Baseline)")

p.vbar(x=dodge("botnames", 0.0, range=p.x_range),
       top="sbert", width=0.2, source=source,
       color="#718dbf", legend_label="Vietnamese SBERT (No FT)")

p.vbar(x=dodge("botnames", 0.25, range=p.x_range),
       top="phobert", width=0.2, source=source,
       color="#e84d60", legend_label="PhoBERT-base (No FT)")

p.x_range.range_padding = 0.1
p.xgrid.grid_line_color = None
p.yaxis.axis_label = "f1 micro"
p.legend.orientation = "vertical"
p.legend.location = "top_center"

show(p)

# p.background_fill_color = None
# p.border_fill_color = None
# export_png(p, filename="../imgs/emb_benchmark.png")

## Timing

### Inference Time

In [266]:
botname = "trung.quan"
for e in embedding_types:
    models[botname][e].fit(X[botname], y[botname])

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

In [278]:
sorted_infer_X = sorted(X[botname], key=lambda x: len(x))
len_intervals = [5, 10, 15, 20]
grouped_X = {str(i): [] for i in len_intervals}

c = 0
for i in len_intervals:
    while c < len(sorted_infer_X):
        text = sorted_infer_X[c]
        text_len = len(text.split())
        if text_len <= i:
            grouped_X[str(i)].append(sorted_infer_X[c])
            c += 1
            continue
        break

In [279]:
infer_time = {str(i): {e: None for e in embedding_types} for i in len_intervals}
for interval, texts in grouped_X.items():
    for e in embedding_types:
        durations = []
        for text in texts:
            start = datetime.now() # start
            models[botname][e].predict([text])
            end = datetime.now() # end
            duration = end - start # delta
            duration = duration.microseconds # microseconds
            duration = duration / 1e3 # miliseconds
            duration = np.round(duration, 2)
            durations.append(duration)
        infer_time[interval][e] = duration.mean()

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [280]:
infer_time

{'5': {'tfidf': 0.57, 'sbert': 18.6, 'phobert': 20.71},
 '10': {'tfidf': 0.58, 'sbert': 19.09, 'phobert': 22.45},
 '15': {'tfidf': 0.59, 'sbert': 18.95, 'phobert': 23.89},
 '20': {'tfidf': 0.6, 'sbert': 19.22, 'phobert': 23.44}}

#### Visualize

In [334]:
# Ref: https://docs.bokeh.org/en/latest/docs/user_guide/categorical.html#visual-offset

output_notebook()

str_len_intervals = [f"<{str(i)}" for i in len_intervals]

vis_infer_tfidf = [infer_time[str(i)]["tfidf"] for i in len_intervals]
vis_infer_sbert = [infer_time[str(i)]["sbert"] for i in len_intervals]
vis_infer_phobert = [infer_time[str(i)]["phobert"] for i in len_intervals]

vis_data = {
    "token_len": str_len_intervals,
    "tfidf": vis_infer_tfidf,
    "sbert": vis_infer_sbert,
    "phobert": vis_infer_phobert
}

source = ColumnDataSource(data=vis_data)

TITLE = "Inference Time Benhmark"
p = figure(x_range=str_len_intervals,
           y_range=(0, 30), height=700,
           title=TITLE)

p.vbar(x=dodge("token_len", -0.25, range=p.x_range),
       top="tfidf", width=0.2, source=source,
       color="#c9d9d3", legend_label="TF-IDF (Baseline)")

p.vbar(x=dodge("token_len", 0.0, range=p.x_range),
       top="sbert", width=0.2, source=source,
       color="#718dbf", legend_label="Vietnamese SBERT (No FT)")

p.vbar(x=dodge("token_len", 0.25, range=p.x_range),
       top="phobert", width=0.2, source=source,
       color="#e84d60", legend_label="PhoBERT-base (No FT)")

p.line([-5, 5], [9, 9], line_color="#64bf1f",
       line_width=1.5, legend_label="Allowed Maximum Inference Time")

p.x_range.range_padding = 0.1
p.xgrid.grid_line_color = None
p.yaxis.axis_label = "inference time (ms)"
p.xaxis.axis_label = "sentence length"
p.legend.orientation = "vertical"
p.legend.location = "top_left"

show(p)

### Training Time

In [14]:
botnames = [botname for botname in botid.keys()]
for botname in botnames:
    print(f"{botname}: {len(X[botname])} (# samples)")

sunlaw: 809 (# samples)
lamhaian: 722 (# samples)
giang.nguyen: 919 (# samples)
panasonic: 1607 (# samples)
trung.quan: 397 (# samples)


In [15]:
trainset_name = ["trung.quan", "giang.nguyen", "panasonic"]

In [16]:
training_time = {t: {e: None for e in embedding_types} for t in trainset_name}

In [17]:
training_time

{'trung.quan': {'tfidf': None, 'sbert': None, 'phobert': None},
 'giang.nguyen': {'tfidf': None, 'sbert': None, 'phobert': None},
 'panasonic': {'tfidf': None, 'sbert': None, 'phobert': None}}

In [18]:
for t in tqdm(trainset_name):
    for e in embedding_types:
        start = datetime.now()
        models[t][e].fit(X[t], y[t])
        end = datetime.now()
        t_time = (end - start).seconds
        t_time = np.round(t_time, 2)
        training_time[t][e] = t_time

  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples.

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples.

Batches:   0%|          | 0/29 [00:00<?, ?it/s]

  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples.

Batches:   0%|          | 0/51 [00:00<?, ?it/s]

  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples.

In [19]:
training_time

{'trung.quan': {'tfidf': 37, 'sbert': 76, 'phobert': 104},
 'giang.nguyen': {'tfidf': 40, 'sbert': 117, 'phobert': 223},
 'panasonic': {'tfidf': 87, 'sbert': 214, 'phobert': 445}}

  "Label %s is present in all training examples." % str(classes[c])


#### Visualize

In [40]:
# Ref: https://docs.bokeh.org/en/latest/docs/user_guide/categorical.html#visual-offset

output_notebook()

dataset_names = [f"{t}\n({len(X[t])} samples)" for t in trainset_name]
vis_training_tfidf = [training_time[t]["tfidf"] for t in trainset_name]
vis_training_sbert = [training_time[t]["sbert"] for t in trainset_name]
vis_training_phobert = [training_time[t]["phobert"] for t in trainset_name]

vis_data = {
    "dataset_names": dataset_names,
    "tfidf": vis_training_tfidf,
    "sbert": vis_training_sbert,
    "phobert": vis_training_phobert
}

source = ColumnDataSource(data=vis_data)

TITLE = "Training Time Benchmark"
p = figure(x_range=dataset_names, y_range=(0, 500),
           title=TITLE)

p.vbar(x=dodge("dataset_names", -0.25, range=p.x_range),
       top="tfidf", width=0.2, source=source,
       color="#c9d9d3", legend_label="TF-IDF (Baseline)")

p.vbar(x=dodge("dataset_names", 0.0, range=p.x_range),
       top="sbert", width=0.2, source=source,
       color="#718dbf", legend_label="Vietnamese SBERT (No FT)")

p.vbar(x=dodge("dataset_names", 0.25, range=p.x_range),
       top="phobert", width=0.2, source=source,
       color="#e84d60", legend_label="PhoBERT-base (No FT)")

p.x_range.range_padding = 0.1
p.xgrid.grid_line_color = None
p.xaxis.axis_label = "Datasets"
p.yaxis.axis_label = "Training time (seconds)"
p.legend.orientation = "vertical"
p.legend.location = "top_left"

show(p)

## Evaluate