In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/processed-student-feedbacks/preprocessed_data.csv


In [None]:
# Use PhoBert word segmentation
!pip install py_vncorenlp

In [None]:
import py_vncorenlp

# Scratch location for the VnCoreNLP jar and models (not kept as notebook output).
vncorenlp_path = "/kaggle/temp/vncorenlp"
os.makedirs(vncorenlp_path, exist_ok=True)

# NOTE(review): py_vncorenlp appears to chdir into `save_dir` while loading and
# may not switch back. Remember the notebook's working directory and restore it
# afterwards so that later relative paths (e.g. output_dir="./results") still
# resolve against it. Restoring is harmless if the library already does so.
notebook_cwd = os.getcwd()
try:
    # Automatically download VnCoreNLP components from the original repository
    # and save them in the local folder created above.
    py_vncorenlp.download_model(save_dir=vncorenlp_path)

    # Load only the word (and sentence) segmentation component.
    rdrsegmenter = py_vncorenlp.VnCoreNLP(annotators=["wseg"], save_dir=vncorenlp_path)
finally:
    os.chdir(notebook_cwd)

In [4]:
# Sanity check of the segmenter on a sample containing two sentences.
text = "Ông Nguyễn Khắc Chúc  đang làm việc tại Đại học Quốc gia Hà Nội. Bà Lan, vợ ông Chúc, cũng làm việc tại đây."

# word_segment returns a list of strings; as the printed output shows, there is
# one entry per detected sentence and multi-syllable words are joined with "_".
output = rdrsegmenter.word_segment(text)

print(output)

['Ông Nguyễn_Khắc_Chúc đang làm_việc tại Đại_học Quốc_gia Hà_Nội .', 'Bà Lan , vợ ông Chúc , cũng làm_việc tại đây .']


# Load data

In [5]:
def segment_feedback(sentence):
    """Word-segment one feedback text and return it as a single string.

    `rdrsegmenter.word_segment` returns a list with one segmented string per
    detected sentence. All entries are joined with a space so that feedback
    containing more than one sentence is kept in full (taking only element 0
    would drop everything after the first sentence, and would raise IndexError
    if the list were empty).
    """
    return ' '.join(rdrsegmenter.word_segment(sentence))


# Load the preprocessed feedback; the precomputed `tokens` column is discarded
# because the text is re-segmented with VnCoreNLP below.
df = pd.read_csv('/kaggle/input/processed-student-feedbacks/preprocessed_data.csv').drop(columns=['tokens'])
df['segmented_text'] = df['sentence'].apply(segment_feedback)

df.head()

Unnamed: 0,sentence,topic,sentiment,dataset,segmented_text
0,slide giáo trình đầy đủ .,1,2,train,slide giáo_trình đầy_đủ .
1,"nhiệt tình giảng dạy , gần gũi với sinh viên .",0,2,train,"nhiệt_tình giảng_dạy , gần_gũi với sinh_viên ."
2,đi học đầy đủ full điểm chuyên cần .,1,0,train,đi học đầy_đủ full điểm chuyên_cần .
3,chưa áp dụng công nghệ thông tin và các thiết ...,0,0,train,chưa áp_dụng công_nghệ_thông_tin và các thiết_...
4,"thầy giảng bài hay , có nhiều bài tập ví dụ ng...",0,2,train,"thầy giảng bài hay , có nhiều bài_tập ví_dụ ng..."


In [6]:
# Partition the frame using its `dataset` column. The raw sentence and the
# split marker itself are dropped from each partition.
def _take_split(split_name):
    """Return the rows of `df` tagged `split_name`, re-indexed from 0, without `sentence`/`dataset`."""
    rows = df.loc[df['dataset'] == split_name]
    return rows.reset_index(drop=True).drop(columns=['sentence', 'dataset'])


df_train = _take_split('train')
df_valid = _take_split('valid')
df_test = _take_split('test')

In [7]:
from datasets import Dataset, DatasetDict

# Convert each pandas split into a Hugging Face Dataset ...
ds_train, ds_valid, ds_test = (
    Dataset.from_pandas(frame) for frame in (df_train, df_valid, df_test)
)

# ... and bundle them under the split names used in the rest of the notebook.
dataset = DatasetDict(train=ds_train, validation=ds_valid, test=ds_test)

In [8]:
# Show the split sizes/features, followed by the first training example.
print(dataset, dataset['train'][0], sep='\n')

DatasetDict({
    train: Dataset({
        features: ['topic', 'sentiment', 'segmented_text'],
        num_rows: 11426
    })
    validation: Dataset({
        features: ['topic', 'sentiment', 'segmented_text'],
        num_rows: 1583
    })
    test: Dataset({
        features: ['topic', 'sentiment', 'segmented_text'],
        num_rows: 3166
    })
})
{'topic': 1, 'sentiment': 2, 'segmented_text': 'slide giáo_trình đầy_đủ .'}


# Load models

In [9]:
import torch

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Identifier of the pretrained checkpoint; the same name is passed to
# AutoModelForSequenceClassification.from_pretrained when the classifiers are built.
MODEL_NAME = "vinai/phobert-base-v2"
# Tokenizer belonging to that checkpoint; it is applied to the word-segmented text.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [11]:
# Tokenize text
def tokenize(row):
    """Encode the `segmented_text` field of `row` with the module-level tokenizer.

    `row` is a mapping with a `segmented_text` entry (a single example, or a
    batch when used with `map(..., batched=True)`). Every sequence is
    truncated and padded to exactly 128 tokens.
    """
    segmented = row['segmented_text']
    encoding_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': 128,
    }
    return tokenizer(segmented, **encoding_options)

In [12]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged precision/recall/F1 for an evaluation pass.

    `eval_pred` unpacks into `(logits, labels)`; the predicted class of each
    example is the argmax of its logits over the last axis. Returns a dict
    with the keys `accuracy`, `precision`, `recall` and `f1`.
    """
    logits, labels = eval_pred
    predicted = logits.argmax(axis=-1)

    macro_precision, macro_recall, macro_f1, _support = precision_recall_fscore_support(
        labels, predicted, average="macro"
    )

    return dict(
        accuracy=accuracy_score(labels, predicted),
        precision=macro_precision,
        recall=macro_recall,
        f1=macro_f1,
    )

## Train on topic classification

In [12]:
# Encode every split, expose `topic` as the `labels` column, and drop the
# columns that are not used for this task.
tokenized_dataset = (
    dataset
    .map(tokenize, batched=True)
    .rename_columns({'topic': 'labels'})
    .remove_columns(["sentiment", "segmented_text"])
)
print(tokenized_dataset["train"][0])

Map:   0%|          | 0/11426 [00:00<?, ? examples/s]

Map:   0%|          | 0/1583 [00:00<?, ? examples/s]

Map:   0%|          | 0/3166 [00:00<?, ? examples/s]

{'labels': 1, 'input_ids': [0, 48090, 12301, 997, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [13]:
# Have the listed columns returned as torch tensors when examples are accessed.
# NOTE(review): columns not listed here (e.g. token_type_ids) are presumably no
# longer returned by default — confirm against the `datasets` documentation.
tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

In [14]:
# Number of topic classes, i.e. the size of the classification head.
N_TOPICS = 4

# Load the pretrained encoder with a new N_TOPICS-way classification head,
# then move it to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=N_TOPICS,
)
model = model.to(device)

print(model.config)

2025-06-14 03:39:20.497161: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749872360.520949     214 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749872360.528708     214 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaConfig {
  "_attn_implementation_autoset": true,
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 258,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "tokenizer_class": "PhobertTokenizer",
  "torch_dtype": "float32",
  "transformers_version": "4.51.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 64001
}



In [16]:
from transformers import TrainingArguments, Trainer

# Fine-tuning configuration for the topic classifier: evaluate, log and save
# once per epoch, then reload the checkpoint with the best validation macro-F1.
training_args = TrainingArguments(
    # Task-specific directory: the sentiment run in this notebook writes to
    # "./results", so sharing that path would overwrite these checkpoints.
    output_dir="./results/topic",
    report_to=[],  # disables all external logging
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    weight_decay=0.01,
    # Without a limit one checkpoint per epoch (10 in total) stays on disk.
    # The best checkpoint is still retained for load_best_model_at_end.
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True
)

In [None]:
# Trainer for the topic task: trains on the train split and evaluates on the
# validation split with `compute_metrics`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    # `tokenizer=` is deprecated in the transformers release recorded in this
    # notebook (4.51.3); `processing_class=` is its replacement.
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

In [18]:
# Switch Weights & Biases off and silence it before training starts.
os.environ.update({
    "WANDB_DISABLED": "true",
    "WANDB_SILENT": "true",
})

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4536,0.30847,0.901453,0.865912,0.770756,0.799222
2,0.2777,0.318782,0.895136,0.85909,0.776043,0.801086
3,0.2239,0.329068,0.90019,0.853298,0.778762,0.804379
4,0.1797,0.387433,0.888187,0.817654,0.783363,0.795763
5,0.1405,0.40435,0.888819,0.810227,0.799679,0.802848
6,0.1072,0.438466,0.891977,0.811484,0.805131,0.807098
7,0.095,0.487957,0.893872,0.810277,0.788625,0.795971
8,0.0741,0.503226,0.894504,0.824558,0.793965,0.806604
9,0.0618,0.529781,0.892609,0.804696,0.792542,0.796179
10,0.0536,0.531742,0.894504,0.815003,0.79511,0.801682


TrainOutput(global_step=3580, training_loss=0.1667003631591797, metrics={'train_runtime': 1517.7355, 'train_samples_per_second': 75.283, 'train_steps_per_second': 2.359, 'total_flos': 7515902258442240.0, 'train_loss': 0.1667003631591797, 'epoch': 10.0})

In [19]:
# Evaluate on the held-out test set.
# training_args sets load_best_model_at_end=True with metric_for_best_model="f1",
# so this scores the checkpoint with the best validation F1 rather than the
# weights from the final epoch. The returned metric keys carry an `eval_` prefix.
trainer.evaluate(tokenized_dataset["test"])

{'eval_loss': 0.42474833130836487,
 'eval_accuracy': 0.8894504106127605,
 'eval_precision': 0.8031367893824529,
 'eval_recall': 0.8097007607306338,
 'eval_f1': 0.8063743816011124,
 'eval_runtime': 10.956,
 'eval_samples_per_second': 288.975,
 'eval_steps_per_second': 9.036,
 'epoch': 10.0}

## Train on sentiment classification

In [13]:
# Encode every split, expose `sentiment` as the `labels` column, and drop the
# columns that are not used for this task.
tokenized_dataset = (
    dataset
    .map(tokenize, batched=True)
    .rename_columns({'sentiment': 'labels'})
    .remove_columns(["topic", "segmented_text"])
)
print(tokenized_dataset["train"][0])

Map:   0%|          | 0/11426 [00:00<?, ? examples/s]

Map:   0%|          | 0/1583 [00:00<?, ? examples/s]

Map:   0%|          | 0/3166 [00:00<?, ? examples/s]

{'labels': 2, 'input_ids': [0, 48090, 12301, 997, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [14]:
# Number of sentiment classes, i.e. the size of the classification head.
N_SENTIMENTS = 3

# Load the pretrained encoder with a new N_SENTIMENTS-way classification head,
# then move it to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=N_SENTIMENTS,
)
model = model.to(device)

print(model.config)

2025-06-14 04:23:52.000628: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749875032.184434      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749875032.241157      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/540M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/540M [00:00<?, ?B/s]

RobertaConfig {
  "_attn_implementation_autoset": true,
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 258,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "tokenizer_class": "PhobertTokenizer",
  "torch_dtype": "float32",
  "transformers_version": "4.51.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 64001
}



In [15]:
from transformers import TrainingArguments, Trainer

# Fine-tuning configuration for the sentiment classifier: evaluate, log and save
# once per epoch, then reload the checkpoint with the best validation macro-F1.
training_args = TrainingArguments(
    # Task-specific directory: the topic run in this notebook writes to
    # "./results", so sharing that path would overwrite its checkpoints.
    output_dir="./results/sentiment",
    report_to=[],  # disables all external logging
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    weight_decay=0.01,
    # Without a limit one checkpoint per epoch (10 in total) stays on disk.
    # The best checkpoint is still retained for load_best_model_at_end.
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True
)

In [None]:
# Trainer for the sentiment task: trains on the train split and evaluates on
# the validation split with `compute_metrics`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    # `tokenizer=` is deprecated in the transformers release recorded in this
    # notebook (4.51.3); `processing_class=` is its replacement.
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

In [17]:
# Switch Weights & Biases off and silence it before training starts.
os.environ.update({
    "WANDB_DISABLED": "true",
    "WANDB_SILENT": "true",
})

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2817,0.208753,0.944409,0.888672,0.808288,0.838575
2,0.1658,0.200434,0.953253,0.910134,0.847889,0.873678
3,0.1285,0.187443,0.951358,0.884902,0.858986,0.870924
4,0.0962,0.205412,0.95199,0.905082,0.830041,0.859637
5,0.0722,0.22617,0.950095,0.87156,0.882424,0.876824
6,0.0586,0.257324,0.9482,0.877961,0.852528,0.864259
7,0.0523,0.277106,0.946304,0.851224,0.875612,0.862494
8,0.0396,0.267263,0.953253,0.892222,0.868297,0.879452
9,0.0305,0.289179,0.948831,0.876424,0.865046,0.870551
10,0.0276,0.292444,0.950726,0.876084,0.870558,0.873276


TrainOutput(global_step=3580, training_loss=0.0952922493385869, metrics={'train_runtime': 1509.6267, 'train_samples_per_second': 75.688, 'train_steps_per_second': 2.371, 'total_flos': 7515834777400320.0, 'train_loss': 0.0952922493385869, 'epoch': 10.0})

In [18]:
# Evaluate on the held-out test set.
# training_args sets load_best_model_at_end=True with metric_for_best_model="f1",
# so this scores the checkpoint with the best validation F1 rather than the
# weights from the final epoch. The returned metric keys carry an `eval_` prefix.
trainer.evaluate(tokenized_dataset["test"])

{'eval_loss': 0.319315105676651,
 'eval_accuracy': 0.941566645609602,
 'eval_precision': 0.8654858870090907,
 'eval_recall': 0.8188004687938335,
 'eval_f1': 0.8383469377030993,
 'eval_runtime': 11.3294,
 'eval_samples_per_second': 279.449,
 'eval_steps_per_second': 8.738,
 'epoch': 10.0}