### Train

In [2]:
import os
import pandas as pd
import numpy as np
from pyvi import ViTokenizer
import torch
from torch.nn import Sigmoid
from datasets import Dataset
from sklearn.metrics import f1_score, accuracy_score
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
from constants import PROPER_LABELS

In [3]:
# Dataset locations. Each split folder is expected to provide "texts.txt" and
# "labels.txt", which are read line by line further down.
# To switch to the alternative corpus, point the two folders at
# "./data/pho_atis/train/" and "./data/pho_atis/validation/" instead.
train_folder = "./data/telesale/processed_100_calls/"
valid_folder = "./data/telesale/validation/"

train_X_path, train_y_path = (
    os.path.join(train_folder, file_name) for file_name in ("texts.txt", "labels.txt")
)
valid_X_path, valid_y_path = (
    os.path.join(valid_folder, file_name) for file_name in ("texts.txt", "labels.txt")
)

# Empty placeholders; they are overwritten once the files have been read.
train_X = []
train_y = []
valid_X = []
valid_y = []

def read_file(file, encoding="utf-8"):
    """Read a text file and return its lines with surrounding whitespace removed.

    Args:
        file: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding
            (the data contains Vietnamese text with diacritics).

    Returns:
        A list with one stripped string per line, in file order. Blank lines
        are kept as empty strings, so line positions are preserved.
    """
    with open(file, "r", encoding=encoding) as f:
        # Iterate the handle directly instead of materialising readlines().
        return [line.strip() for line in f]

# Load both splits: *_X holds the texts, *_y the label strings read from the
# matching labels file.
train_X, train_y = read_file(train_X_path), read_file(train_y_path)
valid_X, valid_y = read_file(valid_X_path), read_file(valid_y_path)

In [4]:
# One row per input line: "texts" is the raw text, "targets" its label string.
train_df = pd.DataFrame({"texts": train_X, "targets": train_y})
valid_df = pd.DataFrame({"texts": valid_X, "targets": valid_y})

In [5]:
# labels_set = PROPER_LABELS
# Collect every distinct label that occurs in the training targets. Each target
# is a comma-separated string of label names.
labels_set = set()
for targets in train_df["targets"]:
    labels_set.update(targets.split(","))

# Enumerate the labels in sorted order. Iterating the raw set would produce an
# order that can differ between interpreter sessions (string hashing is
# randomised), silently changing which output index stands for which label.
ordered_labels = sorted(labels_set)
id2label = dict(enumerate(ordered_labels))
label2id = {label: idx for idx, label in id2label.items()}

In [6]:
# Wrap the pandas frames as `datasets.Dataset` objects.
train_set, valid_set = (Dataset.from_pandas(frame) for frame in (train_df, valid_df))

In [7]:
# Load the tokenizer published under the "vinai/phobert-base" identifier.
# It is used by preprocess_data (training) and inference() alike.
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
def preprocess_data(text):
    """Turn one dataset row into tokenizer features plus a multi-hot label vector.

    Args:
        text: A row mapping with a "texts" string and a "targets" string of
            comma-separated label names.

    Returns:
        The tokenizer output for the lower-cased, ViTokenizer-segmented text
        (padded to the maximum length, truncated if longer), with an extra
        "labels" entry: a float array holding one slot per known label, set
        to 1 for every label named in "targets".

    Raises:
        KeyError: If "targets" names a label that is missing from label2id.
    """
    segmented = ViTokenizer.tokenize(text['texts'].lower())
    features = tokenizer(segmented, padding="max_length", truncation=True)

    multi_hot = np.zeros(len(labels_set), dtype=float)
    for label_name in text["targets"].split(","):
        multi_hot[label2id[label_name]] = 1
    features["labels"] = multi_hot
    return features

# Encode both splits, dropping the raw string columns once they are tokenized.
raw_columns = ["texts", "targets"]
encoded_trainset = train_set.map(preprocess_data, remove_columns=raw_columns)
encoded_validset = valid_set.map(preprocess_data, remove_columns=raw_columns)

# Alternative kept for reference (disabled): carve an evaluation split out of
# the training data with encoded_trainset.train_test_split(test_size=0.2).

# Have both datasets return torch tensors when indexed.
for encoded_split in (encoded_trainset, encoded_validset):
    encoded_split.set_format("torch")

0ex [00:00, ?ex/s]

0ex [00:00, ?ex/s]

In [9]:
# Sanity check: decode the first encoded training example back to text to see
# what the tokenizer produced (special tokens and padding included).
example = encoded_trainset[0]
tokenizer.decode(example["input_ids"])

'<s> anh đơn hả anh </s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pa

In [10]:
# Show the multi-hot label vector attached to the same example.
example["labels"]

tensor([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0.])

In [11]:
# Start from the local checkpoint under ./results/tapt/telesale/ and configure
# the classifier for multi-label prediction over the collected label set.
# (Use "vinai/phobert-base" as the path to start from the public checkpoint.)
model = AutoModelForSequenceClassification.from_pretrained(
    "./results/tapt/telesale/checkpoint-4800/",
    num_labels=len(labels_set),
    id2label=id2label,
    label2id=label2id,
    problem_type="multi_label_classification",
)

Some weights of the model checkpoint at ./results/tapt/telesale/checkpoint-4800/ were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ./results/tapt/telesale/checkpoint-4800/ and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.de

In [12]:
def multilabel_metrics(preds, labels, threshold=0.5):
    """Compute micro/macro F1 and accuracy for multi-label predictions.

    Args:
        preds: Raw logits (array-like); a sigmoid is applied before thresholding.
        labels: Ground-truth multi-hot indicators, same shape as ``preds``.
        threshold: Probability at or above which a label counts as predicted.

    Returns:
        Dict with the keys "f1_micro", "f1_macro" and "accuracy".
    """
    probs = torch.sigmoid(torch.as_tensor(preds, dtype=torch.float32))
    # Threshold in one vectorised step; keep a float array as before.
    y_pred = (probs >= threshold).numpy().astype(float)
    y_true = labels
    # zero_division=0 yields the same score as the default ("warn" also scores
    # 0) but stops the undefined-metric warning from being printed on every
    # evaluation while the model still predicts no labels at all.
    return {
        "f1_micro": f1_score(y_true, y_pred, average="micro", zero_division=0),
        "f1_macro": f1_score(y_true, y_pred, average="macro", zero_division=0),
        "accuracy": accuracy_score(y_true, y_pred),
    }

def compute_metrics(p):
    """Adapt an evaluation result to multilabel_metrics.

    Args:
        p: Object exposing ``predictions`` and ``label_ids``.

    Returns:
        The metrics dict produced by multilabel_metrics with its default
        threshold.
    """
    logits = p.predictions
    if isinstance(logits, tuple):
        # When predictions is a tuple, only its first element is scored.
        logits = logits[0]
    return multilabel_metrics(logits, p.label_ids)

In [13]:
# Check the tensor type of the encoded labels.
encoded_trainset[0]["labels"].type()

'torch.FloatTensor'

In [14]:
# Inspect the token ids of the first training example.
encoded_trainset["input_ids"][0]

tensor([    0,    83,   807, 10767,    83,     2,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1, 

In [15]:
# Shape of a single example after adding a leading batch dimension.
encoded_trainset["input_ids"][0].unsqueeze(0).size()

torch.Size([1, 256])

In [16]:
# outputs = model(input_ids=encoded_trainset["input_ids"][0].unsqueeze(0).to("cuda"),
#                 labels=encoded_trainset["labels"][0].unsqueeze(0).to("cuda"))
# print(outputs)

In [None]:
args = TrainingArguments(
    # Where checkpoints, logs and experiment tracking go.
    output_dir='./results/tapt/telesale/finetune',
    logging_dir="./logs",
    report_to="wandb",
    # Optimisation schedule.
    num_train_epochs=90,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_ratio=0.06,
    lr_scheduler_type="linear",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Step-based logging, evaluation and checkpointing.
    logging_steps=100,
    evaluation_strategy="steps",
    save_strategy="steps",
    save_steps=100,
    save_total_limit=10,
    # Track the "loss" metric and reload the best checkpoint when training ends.
    load_best_model_at_end=True,
    metric_for_best_model="loss",
)

#### Focal Loss

In [3]:
""" We apply focal loss to fine-tune PhoBERT model
"""
from torchvision.ops import sigmoid_focal_loss


class FLTrainer(Trainer):
    """ Custom trainer for training with Focal Loss

    Replaces the loss used for optimisation with torchvision's sigmoid focal
    loss computed on the model's logits.
    """
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """ Compute the mean sigmoid focal loss for one batch.

        Args:
            model: The model being trained.
            inputs: Batch mapping passed to the model; must contain "labels".
            return_outputs: When True, also return the model outputs.
            num_items_in_batch: Accepted for compatibility with Trainer
                versions that pass it to compute_loss; not used here.

        Returns:
            The scalar loss, or ``(loss, outputs)`` when ``return_outputs``
            is True.
        """
        # The labels live in the batch, not on the model.
        labels = inputs["labels"]
        outputs = model(**inputs)
        logits = outputs["logits"]
        # sigmoid_focal_loss defaults to reduction="none", which returns one
        # value per element; training needs a scalar, so average explicitly.
        # Targets are cast to the logits' dtype because the loss expects floats.
        loss = sigmoid_focal_loss(logits, labels.to(logits.dtype), reduction="mean")
        return (loss, outputs) if return_outputs else loss

In [17]:
# Early stopping with a patience of 5 evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=5)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_trainset,
    eval_dataset=encoded_validset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

In [18]:
# Freeze every embedding parameter.
for embedding_param in model.roberta.embeddings.parameters():
    embedding_param.requires_grad = False

# In the encoder, only parameters whose name contains "layer.11" stay trainable.
for param_name, encoder_param in model.roberta.encoder.named_parameters():
    encoder_param.requires_grad = "layer.11" in param_name

trainer.train()

***** Running training *****
  Num examples = 1117
  Num Epochs = 90
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 12600
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mhosjiu[0m (use `wandb login --relogin` to force relogin)


Step,Training Loss,Validation Loss,F1 Micro,F1 Macro,Accuracy
100,0.6897,0.661141,0.032389,0.024864,0.0
200,0.6148,0.541816,0.0,0.0,0.0
300,0.4659,0.359147,0.0,0.0,0.0
400,0.2931,0.214502,0.0,0.0,0.0
500,0.1873,0.151486,0.0,0.0,0.0
600,0.1444,0.129536,0.0,0.0,0.0
700,0.1303,0.121001,0.0,0.0,0.0
800,0.1237,0.116869,0.0,0.0,0.0
900,0.1206,0.114894,0.0,0.0,0.0
1000,0.1196,0.113292,0.0,0.0,0.0


***** Running Evaluation *****
  Num examples = 92
  Batch size = 8
Saving model checkpoint to ./results/tapt/telesale/finetune/checkpoint-100
Configuration saved in ./results/tapt/telesale/finetune/checkpoint-100/config.json
Model weights saved in ./results/tapt/telesale/finetune/checkpoint-100/pytorch_model.bin
Deleting older checkpoint [results/tapt/telesale/finetune/checkpoint-4700] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 92
  Batch size = 8
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
Saving model checkpoint to ./results/tapt/telesale/finetune/checkpoint-200
Configuration saved in ./results/tapt/telesale/finetune/checkpoint-200/config.json
Model weights saved in ./results/tapt/telesale/finetune/checkpoint-200/pytorch_model.bin
Deleting older checkpoint [results/tapt/telesale/finetune/checkpoint-4800] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 92
  Batch size = 8
  _warn_prf(average, "

TrainOutput(global_step=12600, training_loss=0.05578526845054021, metrics={'train_runtime': 527.7614, 'train_samples_per_second': 190.484, 'train_steps_per_second': 23.874, 'total_flos': 1.32294332526336e+16, 'train_loss': 0.05578526845054021, 'epoch': 90.0})

In [None]:
# Run evaluation on the dataset the trainer was given as eval_dataset.
trainer.evaluate()

### Inference

In [20]:
# Reload a fine-tuned checkpoint for inference (rebinds the global `model`).
# NOTE(review): the recorded output of this cell is an OSError saying the
# config for this path could not be loaded. Confirm that checkpoint-8100 still
# exists before running; the training arguments use save_total_limit=10, so
# older checkpoints may have been deleted.
model = AutoModelForSequenceClassification.from_pretrained("./results/tapt/telesale/finetune/checkpoint-8100")

404 Client Error: Not Found for url: https://huggingface.co/results/tapt/telesale/finetune/checkpoint-8100/resolve/main/config.json


OSError: Can't load config for './results/tapt/telesale/finetune/checkpoint-8100'. Make sure that:

- './results/tapt/telesale/finetune/checkpoint-8100' is a correct model identifier listed on 'https://huggingface.co/models'
  (make sure './results/tapt/telesale/finetune/checkpoint-8100' is not a path to a local directory with something else, in that case)

- or './results/tapt/telesale/finetune/checkpoint-8100' is the correct path to a directory containing a config.json file



In [None]:
def inference(text, threshold=0.1):
    """Predict the labels for a single raw text.

    The text goes through the same normalisation as the training data
    (lower-casing, then ViTokenizer word segmentation) before tokenisation,
    so the model sees inputs in the form it was fine-tuned on.

    Args:
        text: Raw input string.
        threshold: Minimum sigmoid probability for a label to be returned.
            Defaults to 0.1.

    Returns:
        List of label names (looked up in model.config.id2label) whose
        probability is at or above ``threshold``, in label-index order.

    Note:
        Switches the global model to evaluation mode.
    """
    segmented = ViTokenizer.tokenize(text.lower())
    encoded = tokenizer(segmented, return_tensors="pt", truncation=True)
    encoded = {key: value.to(model.device) for key, value in encoded.items()}

    # Evaluation mode plus no_grad: no dropout noise and no autograd bookkeeping.
    model.eval()
    with torch.no_grad():
        logits = model(**encoded).logits

    # Index the single batch element explicitly; a bare squeeze() would also
    # collapse the label axis if the model had only one label.
    selected = (torch.sigmoid(logits[0]) >= threshold).cpu().tolist()
    return [model.config.id2label[idx] for idx, keep in enumerate(selected) if keep]

In [None]:
text = "à em chào chị nha em là nhân viên bên phía công ty a vi bi ép xê á thuộc à ngân hàng thịnh vượng nè chị à thì em thấy cái hồ sơ của chị lúc trước á là mình có tham gia mua trả góp thì thông tin chị đóng tiền rất là uy tín thì kì này ngân hàng bên em gọi ra hỗ trợ chị một khoản vay tiền mặt lên đến năm mươi chín triệu đồng mà lãi suất bên em đang giảm cho chị chỉ còn có không chấm sáu thôi được chưa thì cái này khi mà chị đăng ký chị tham gia á thì mỗi tháng chị góp cho em dao động như là cái mức lương thu nhập của chị trước đây chị có khai báo khi mà chị mua đồ trả góp á là ba triệu ha thì cái này là góp cố định cho bên em luôn gốc lãi luôn ngoài ra không phát sinh thêm chi phí gì khác nữa thủ tục thì chỉ cần đúng cái chứng minh nhân dân thôi thì em sẽ ký hợp đồng ký hợp đồng rồi mình à sẽ nhận tiền luôn ha"
# Run the classifier on the sample text defined above and print the predicted labels.
print(inference(text))