In [None]:
# Install the vncorenlp python wrapper
!pip install vncorenlp

In [None]:
# Download VnCoreNLP-1.1.1.jar & its word segmentation component (i.e. RDRSegmenter) 
!mkdir -p vncorenlp/models/wordsegmenter
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/VnCoreNLP-1.1.1.jar
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/vi-vocab
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/wordsegmenter.rdr
!mv VnCoreNLP-1.1.1.jar vncorenlp/ 
!mv vi-vocab vncorenlp/models/wordsegmenter/
!mv wordsegmenter.rdr vncorenlp/models/wordsegmenter/

In [None]:
import pandas as pd
import numpy as np
import json
from vncorenlp import VnCoreNLP
from sklearn.preprocessing import MultiLabelBinarizer
from ast import literal_eval

In [None]:
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [None]:
!pip install transformers

In [None]:
!pip install datasets

In [None]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
from datasets import Dataset, load_metric, load_dataset
import matplotlib.pyplot as plt
import torch

In [None]:
# Location of the topics JSON file (a path under Kaggle's /kaggle/input directory).
topics_path = "/kaggle/input/datalawvn/topics_datalaw.json"

In [None]:
# Load the topics JSON into a DataFrame. Later cells read its 'question' column as the
# input text and treat every other column as a label.
segmented_datalaw_topics = pd.read_json(topics_path)

In [None]:
# 80% of the rows are drawn for training; the fixed seed keeps the draw reproducible.
train_topics_dataset = segmented_datalaw_topics.sample(frac=0.8, random_state=42)

# The rows that were not drawn form the held-out 20%, which is halved into
# validation and test with the same sample-then-drop approach.
remaining_dataset = segmented_datalaw_topics.drop(train_topics_dataset.index)
validation_topics_dataset = remaining_dataset.sample(frac=0.5, random_state=42)
test_topics_dataset = remaining_dataset.drop(validation_topics_dataset.index)

# Report the size of each split.
for split_name, split_frame in (
    ("training", train_topics_dataset),
    ("validation", validation_topics_dataset),
    ("test", test_topics_dataset),
):
    print(f'The {split_name} topics dataset has {len(split_frame)} records.')

In [None]:
# Every column except the question text is treated as a label.
labels = [column for column in train_topics_dataset.columns if column != 'question']
# Two-way lookup between label names and their positions in `labels`.
id2label = dict(enumerate(labels))
label2id = {name: position for position, name in id2label.items()}
# Peek at the first few label names.
labels[:5]

In [None]:
# Wrap each pandas split in a Hugging Face Dataset (same order as before: train, test, validation).
hg_train_topics_dataset, hg_test_topics_dataset, hg_validation_topics_dataset = (
    Dataset.from_pandas(split_frame)
    for split_frame in (train_topics_dataset, test_topics_dataset, validation_topics_dataset)
)

In [None]:
# Sanity check: number of rows in the training Dataset after conversion from pandas.
print(f'The length of hg_train_topics_dataset is {len(hg_train_topics_dataset)}.\n')

In [None]:
# Tokenizer for the same 'vinai/phobert-base-v2' checkpoint that the classifier is loaded from.
tokenizer = AutoTokenizer.from_pretrained('vinai/phobert-base-v2')

In [None]:
# Show the tokenizer's special tokens and their vocabulary IDs.
# Each row: (name used at the start of the sentence, name used at the end, token, token id).
special_tokens = (
    ("unknown token", "unknown token", tokenizer.unk_token, tokenizer.unk_token_id),
    ("seperator token", "seperator token", tokenizer.sep_token, tokenizer.sep_token_id),
    ("pad token", "pad token", tokenizer.pad_token, tokenizer.pad_token_id),
    ("sentence level classification token", "classification token", tokenizer.cls_token, tokenizer.cls_token_id),
    ("mask token", "mask token", tokenizer.mask_token, tokenizer.mask_token_id),
)
for long_name, short_name, token, token_id in special_tokens:
    print(f'The {long_name} is {token} and the ID for the {short_name} is {token_id}')

In [None]:
def tokenize_topics_dataset(examples):
    """Tokenize a batch of questions and attach a dense label matrix.

    Args:
        examples: A batch mapping column names to lists of values. It must contain
            a "question" column and one column per entry of the global `labels`.

    Returns:
        The tokenizer's encoding for the batch, with an extra "labels" entry: a
        list of per-example rows, each holding one float per label, in `labels` order.
    """
    questions = examples["question"]

    # Pad/truncate every question to exactly 256 tokens.
    encoded = tokenizer(questions, padding="max_length", truncation=True, max_length=256)

    # One row per example, one column per label; filled column by column.
    label_matrix = np.zeros((len(questions), len(labels)))
    for column, label_name in enumerate(labels):
        label_matrix[:, column] = examples[label_name]

    encoded["labels"] = label_matrix.tolist()
    return encoded

In [None]:
# Tokenize every split. The raw columns are dropped so that only the tokenizer outputs
# and the "labels" matrix remain. Each split removes its *own* columns, so the cell does
# not depend on all three splits happening to share the training split's schema.
encoded_dataset_topics_train = hg_train_topics_dataset.map(
    tokenize_topics_dataset,
    batched=True,
    remove_columns=hg_train_topics_dataset.column_names,
)
encoded_dataset_topics_test = hg_test_topics_dataset.map(
    tokenize_topics_dataset,
    batched=True,
    remove_columns=hg_test_topics_dataset.column_names,
)
encoded_dataset_topics_validation = hg_validation_topics_dataset.map(
    tokenize_topics_dataset,
    batched=True,
    remove_columns=hg_validation_topics_dataset.column_names,
)

In [None]:
# Have every encoded split return torch tensors when indexed.
for encoded_split in (
    encoded_dataset_topics_train,
    encoded_dataset_topics_test,
    encoded_dataset_topics_validation,
):
    encoded_split.set_format("torch")

In [None]:
from transformers import DataCollatorWithPadding
# Batch collator built from the tokenizer. Note that the examples are already padded to a
# fixed length of 256 inside tokenize_topics_dataset.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# PhoBERT v2 with a classification head sized to the label set.
# `problem_type` is declared explicitly so the model's built-in loss is the multi-label
# one (sigmoid + BCE) regardless of the dtype the label tensors happen to have.
model = AutoModelForSequenceClassification.from_pretrained(
    "vinai/phobert-base-v2",
    problem_type="multi_label_classification",
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# Key of the evaluation metric used to pick the best checkpoint
# (passed to TrainingArguments as metric_for_best_model).
metric_name = "f1"

In [None]:
import torch.nn as nn

class CustomTrainer(Trainer):
    """Trainer whose loss is a class-weighted binary cross-entropy over all labels.

    The weights are read from the module-level `class_weights_tensor`, which must be
    defined before training starts.
    NOTE(review): `class_weights_tensor` is not defined in the cells visible here;
    presumably it holds one weight per label - confirm before using this class.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run a forward pass and return the weighted multi-label BCE loss.

        Args:
            model: The model being trained.
            inputs: Batch dictionary; must contain "labels" alongside the model inputs.
            return_outputs: When True, return `(loss, outputs)` instead of just the loss.
            num_items_in_batch: Accepted for compatibility with Trainer versions that
                pass it to `compute_loss`; not used here.

        Returns:
            The scalar loss, or a `(loss, outputs)` tuple when `return_outputs` is True.
        """
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get('logits')

        num_labels = self.model.config.num_labels
        logits = logits.view(-1, num_labels)
        # BCEWithLogitsLoss needs targets with the same shape and a floating dtype as the logits.
        targets = labels.view(-1, num_labels).to(logits.dtype)
        # Keep the weights on the same device/dtype as the logits they are multiplied with.
        weights = class_weights_tensor.to(device=logits.device, dtype=logits.dtype)

        loss_fct = nn.BCEWithLogitsLoss(weight=weights)
        loss = loss_fct(logits, targets)
        return (loss, outputs) if return_outputs else loss

In [None]:
# Debug check: number of label names, shown as the shape of a 1-D array built from `labels`.
labelsss = np.array(labels)
print(f"Labels shape: {labelsss.shape}")

In [None]:
import os

# Worker processes for the data loaders. `os.cpu_count()` may return None, in which
# case data loading stays in the main process (0 workers).
num_cpus = os.cpu_count() or 0

# Hyper-parameters for fine-tuning. Evaluation, logging and checkpointing all happen
# once per epoch so that `load_best_model_at_end` can compare checkpoints by `metric_name`.
training_args = TrainingArguments(
    output_dir="./topics_classification",
    logging_dir="./topics_classification/logs",
    evaluation_strategy='epoch',
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    gradient_accumulation_steps=4,
    eval_accumulation_steps = 2,
    eval_delay=0.5,
    learning_rate=3e-5,
    weight_decay=1e-4,
    max_grad_norm=1.0,
    num_train_epochs=10,
    lr_scheduler_type="reduce_lr_on_plateau", # Reduce the LR when performance on the validation set stops improving.
    # The plateau scheduler is stepped with `metric_for_best_model` (F1, higher is better),
    # whereas torch's ReduceLROnPlateau defaults to mode="min".
    lr_scheduler_kwargs={"mode": "max"},
    warmup_ratio=0.1,
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-08,
    log_level='debug',
    logging_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=5,
    save_safetensors=True,
    seed=42,
    fp16=False,
    dataloader_num_workers=num_cpus,
    run_name="PhoBERTv2_topics",
    load_best_model_at_end=True,
    metric_for_best_model=metric_name
)

In [None]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, precision_score, recall_score
from transformers import EvalPrediction
import torch

def multi_label_metrics(predictions, labels, threshold=0.5):
    """Compute micro-averaged multi-label metrics from raw model scores.

    Args:
        predictions: Logits of shape (batch_size, num_labels).
        labels: Ground-truth 0/1 indicator matrix with the same shape.
        threshold: Probability at or above which a label counts as predicted.

    Returns:
        Dict with 'f1', 'precision', 'recall', 'roc_auc' and 'accuracy'.
        'roc_auc' is NaN when it is undefined for the given labels.
    """
    # Sigmoid turns each logit into an independent per-label probability.
    probs = torch.sigmoid(torch.as_tensor(predictions, dtype=torch.float32)).numpy()
    # Hard 0/1 decisions for the threshold-based metrics.
    y_pred = (probs >= threshold).astype(float)
    y_true = labels

    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    precision_micro_average = precision_score(y_true=y_true, y_pred=y_pred, average='micro')
    recall_micro_average = recall_score(y_true=y_true, y_pred=y_pred, average='micro')
    # ROC AUC is a ranking metric: score it on the probabilities, not on the
    # thresholded decisions, otherwise the curve collapses to a single point.
    try:
        roc_auc = roc_auc_score(y_true, probs, average='micro')
    except ValueError:
        # Raised when y_true contains a single class; don't abort the evaluation loop for it.
        roc_auc = float('nan')
    # Subset accuracy: an example counts only if all of its labels match.
    accuracy = accuracy_score(y_true, y_pred)

    return {
        'f1': f1_micro_average,
        'precision': precision_micro_average,
        'recall': recall_micro_average,
        'roc_auc': roc_auc,
        'accuracy': accuracy
    }

def compute_metrics(p: EvalPrediction):
    """Adapt an `EvalPrediction` to `multi_label_metrics`.

    When the predictions arrive as a tuple, only its first element is used as the logits.
    """
    logits = p.predictions
    if isinstance(logits, tuple):
        logits = logits[0]
    return multi_label_metrics(predictions=logits, labels=p.label_ids)

In [None]:
# Sanity check on the training targets: shape of one example's label vector
# (one entry per label). The former `target` variable is not defined anywhere.
encoded_dataset_topics_train[0]["labels"].shape

In [None]:
# Stop training once the tracked metric fails to improve for one evaluation round.
early_stopping = EarlyStoppingCallback(early_stopping_patience=1)

# Fine-tune on the training split, evaluating on the validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=encoded_dataset_topics_train,
    eval_dataset=encoded_dataset_topics_validation,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)
trainer.train()

In [None]:
# Evaluate on the trainer's eval_dataset, i.e. the validation split (not the test split).
trainer.evaluate()

In [None]:
# Pick one held-out question for a qualitative check (row first, then column, so the
# whole "question" column is not materialised just to read one value).
text = hg_test_topics_dataset[1000]["question"]

# Truncate to the same 256-token limit used for training so a long question
# cannot exceed the sequence length the model supports.
encoding = tokenizer(text, truncation=True, max_length=256, return_tensors="pt")
encoding = {k: v.to(trainer.model.device) for k, v in encoding.items()}

# Inference only: disable dropout and skip building the autograd graph.
trainer.model.eval()
with torch.no_grad():
    outputs = trainer.model(**encoding)

In [None]:
# Raw (pre-sigmoid) scores the model returned for the sample; display their shape.
logits = outputs.logits
logits.shape

In [None]:
# Turn the logits into per-label probabilities, then into 0/1 decisions at a 0.5 cut-off.
sigmoid = torch.nn.Sigmoid()
probs = sigmoid(logits.squeeze().cpu())
predictions = np.where(probs >= 0.5, 1.0, 0.0)

# Map the positions flagged with 1 back to their label names.
predicted_labels = [
    id2label[position]
    for position, flag in enumerate(predictions)
    if flag == 1.0
]
print(predicted_labels)