In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for folder, _, files_here in os.walk('/kaggle/input'):
    for file_name in files_here:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Install py_vncorenlp, which provides the VnCoreNLP word segmenter used below to
# word-segment the Vietnamese text before it is passed to the PhoBERT tokenizer.
!pip install py_vncorenlp

In [None]:
import py_vncorenlp

# Folder that holds the VnCoreNLP files; /kaggle/temp is not kept after the session ends.
vncorenlp_path = "/kaggle/temp/vncorenlp"
os.makedirs(vncorenlp_path, exist_ok=True)

# Automatically download VnCoreNLP components from the original repository
# and save them in the folder above.
py_vncorenlp.download_model(save_dir=vncorenlp_path)

# Load the word and sentence segmentation component.
# NOTE(review): py_vncorenlp appears to chdir into `save_dir` while it starts up and not
# switch back — confirm against the installed version. The working directory is restored
# here so that relative paths used later in the notebook (e.g. "./results") keep
# resolving against the directory the notebook started in.
_cwd_before_vncorenlp = os.getcwd()
try:
    rdrsegmenter = py_vncorenlp.VnCoreNLP(annotators=["wseg"], save_dir=vncorenlp_path)
finally:
    os.chdir(_cwd_before_vncorenlp)

In [4]:
# Sanity check of the segmenter on a two-sentence sample: the result is a list with one
# segmented string per sentence, with the syllables of a word joined by underscores.
text = "Ông Nguyễn Khắc Chúc  đang làm việc tại Đại học Quốc gia Hà Nội. Bà Lan, vợ ông Chúc, cũng làm việc tại đây."

output = rdrsegmenter.word_segment(text)

print(output)

['Ông Nguyễn_Khắc_Chúc đang làm_việc tại Đại_học Quốc_gia Hà_Nội .', 'Bà Lan , vợ ông Chúc , cũng làm_việc tại đây .']


# Load data

In [5]:
# Load the preprocessed feedback table; its `tokens` column is not used in this notebook.
df = pd.read_csv('/kaggle/input/processed-student-feedbacks/preprocessed_data.csv').drop(columns=['tokens'])

# word_segment() returns a list with one segmented string per detected sentence, so a row
# that contains several sentences yields several strings. Join all of them: taking only
# element [0] silently drops everything after the first sentence and raises IndexError
# when the list is empty.
df['segmented_text'] = df['sentence'].apply(lambda x: ' '.join(rdrsegmenter.word_segment(x)))

df.head()

Unnamed: 0,sentence,topic,sentiment,dataset,segmented_text
0,slide giáo trình đầy đủ .,1,2,train,slide giáo_trình đầy_đủ .
1,"nhiệt tình giảng dạy , gần gũi với sinh viên .",0,2,train,"nhiệt_tình giảng_dạy , gần_gũi với sinh_viên ."
2,đi học đầy đủ full điểm chuyên cần .,1,0,train,đi học đầy_đủ full điểm chuyên_cần .
3,chưa áp dụng công nghệ thông tin và các thiết ...,0,0,train,chưa áp_dụng công_nghệ_thông_tin và các thiết_...
4,"thầy giảng bài hay , có nhiều bài tập ví dụ ng...",0,2,train,"thầy giảng bài hay , có nhiều bài_tập ví_dụ ng..."


In [6]:
# Build one frame per value of the `dataset` column, dropping the raw sentence and the
# split marker so that only the label columns and the segmented text remain.
_unused_columns = ['sentence', 'dataset']
df_train, df_valid, df_test = (
    df.loc[df['dataset'] == split_name].drop(columns=_unused_columns).reset_index(drop=True)
    for split_name in ('train', 'valid', 'test')
)

In [7]:
from datasets import Dataset, DatasetDict

# Convert each pandas split into a Hugging Face Dataset.
ds_train, ds_valid, ds_test = (
    Dataset.from_pandas(frame) for frame in (df_train, df_valid, df_test)
)

# Collect the three splits under the keys used by the rest of the notebook.
dataset = DatasetDict({'train': ds_train, 'validation': ds_valid, 'test': ds_test})

In [8]:
# Show the split sizes and columns, followed by the first training example.
print(dataset, dataset['train'][0], sep='\n')

DatasetDict({
    train: Dataset({
        features: ['topic', 'sentiment', 'segmented_text'],
        num_rows: 11426
    })
    validation: Dataset({
        features: ['topic', 'sentiment', 'segmented_text'],
        num_rows: 1583
    })
    test: Dataset({
        features: ['topic', 'sentiment', 'segmented_text'],
        num_rows: 3166
    })
})
{'topic': 1, 'sentiment': 2, 'segmented_text': 'slide giáo_trình đầy_đủ .'}


# Load models

In [9]:
import torch

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

In [10]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint name used for the tokenizer here and for the classifiers created later.
MODEL_NAME = "vinai/phobert-base-v2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [11]:
# Tokenize text
def tokenize(row):
    """Encode the `segmented_text` field of `row` with the module-level `tokenizer`.

    The tokenizer is asked to truncate and to pad to a fixed length of 128.

    Args:
        row: Mapping with a `segmented_text` entry (a single example, or a batch of
            examples when called through `Dataset.map(..., batched=True)`).

    Returns:
        Whatever the tokenizer call returns for that text.
    """
    encoding_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': 128,
    }
    return tokenizer(row['segmented_text'], **encoding_options)

In [12]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged precision/recall/F1 for one evaluation pass.

    Args:
        eval_pred: Pair `(logits, labels)`. `logits` is an array whose last axis holds
            the per-class scores, or a tuple whose first element is that array;
            `labels` holds the true class ids.

    Returns:
        Dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    logits, labels = eval_pred
    # Predictions may arrive as a tuple of arrays; this assumes the class scores are the
    # first element in that case.
    if isinstance(logits, tuple):
        logits = logits[0]
    preds = logits.argmax(axis=-1)

    # zero_division=0 scores an undefined precision/recall as 0 (the same value the
    # default produces) without emitting a warning every time a class is never predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="macro", zero_division=0
    )
    acc = accuracy_score(labels, preds)

    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

## Train on topic classification

In [13]:
# Tokenize every split, expose `topic` under the column name `labels`, and drop the
# columns that are not needed for this task.
tokenized_dataset = (
    dataset.map(tokenize, batched=True)
    .rename_columns({'topic': 'labels'})
    .remove_columns(["sentiment", "segmented_text"])
)

Map:   0%|          | 0/11426 [00:00<?, ? examples/s]

Map:   0%|          | 0/1583 [00:00<?, ? examples/s]

Map:   0%|          | 0/3166 [00:00<?, ? examples/s]

In [14]:
# Return only the model inputs and the labels, as torch tensors, when rows are accessed.
tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

In [None]:
# Number of topic classes (labels 0-3).
N_TOPICS = 4

# The classification head that from_pretrained() adds on top of the encoder is randomly
# initialised. Seed torch first so that this initialisation, and therefore the run, is
# repeatable. 42 is also the documented default of TrainingArguments.seed.
torch.manual_seed(42)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=N_TOPICS).to(device)

In [16]:
from transformers import TrainingArguments, Trainer

# Fine-tuning configuration for the topic classifier.
training_args = TrainingArguments(
    # Task-specific folder, so that another training run in this notebook cannot
    # overwrite or mix its checkpoints with these.
    output_dir="./results/topic",
    report_to=[],  # disables all external logging
    # Evaluate, checkpoint and log once per epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # Keep at most two checkpoints on disk instead of one per epoch; fifteen full
    # checkpoints would eat into the 20GB limit of the working directory. Per the
    # Transformers docs the best checkpoint is retained when load_best_model_at_end
    # is enabled.
    save_total_limit=2,
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=15,
    weight_decay=0.01,
    # After training, reload the checkpoint with the highest validation macro-F1
    # (the "f1" entry returned by compute_metrics).
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True
)

In [None]:
# Trainer that fine-tunes `model` on the train split and evaluates it on the validation
# split with `compute_metrics`, using the settings in `training_args`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    # NOTE(review): recent transformers releases deprecate `tokenizer=` in favour of
    # `processing_class=` — confirm against the installed version before changing it.
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [18]:
# Silence Weights & Biases.
# NOTE(review): these variables are set after `trainer` has already been constructed, and
# TrainingArguments is given report_to=[] — they are presumably redundant; confirm.
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_SILENT"] = "true"

# Run the fine-tuning loop configured in `training_args`.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.452,0.312821,0.901453,0.872484,0.770037,0.802105
2,0.2756,0.311037,0.902716,0.851942,0.801713,0.815876
3,0.2205,0.35669,0.896399,0.863744,0.777217,0.806029
4,0.1766,0.406591,0.883765,0.80182,0.787185,0.792118
5,0.1388,0.422025,0.890082,0.811011,0.806565,0.806094
6,0.1097,0.449023,0.891346,0.822418,0.802038,0.811213
7,0.0926,0.504552,0.895768,0.810906,0.817201,0.813081
8,0.0695,0.541742,0.895136,0.820147,0.80933,0.814211
9,0.0549,0.584085,0.893872,0.820187,0.805988,0.8114
10,0.0485,0.582328,0.901453,0.830866,0.813403,0.820601


TrainOutput(global_step=5370, training_loss=0.11848915941888394, metrics={'train_runtime': 2262.639, 'train_samples_per_second': 75.748, 'train_steps_per_second': 2.373, 'total_flos': 1.127385338766336e+16, 'train_loss': 0.11848915941888394, 'epoch': 15.0})

In [19]:
from sklearn.metrics import classification_report

# Evaluate on test set: predict, pick the highest-scoring class for each example, and
# print the per-class precision / recall / F1 table.
predictions = trainer.predict(tokenized_dataset["test"])
y_true = predictions.label_ids
y_pred = np.argmax(predictions.predictions, axis=1)
report = classification_report(y_true, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.93      0.94      0.94      2290
           1       0.76      0.76      0.76       572
           2       0.94      0.94      0.94       145
           3       0.61      0.54      0.57       159

    accuracy                           0.89      3166
   macro avg       0.81      0.79      0.80      3166
weighted avg       0.88      0.89      0.89      3166



## Train on sentiment classification

In [None]:
# Tokenize every split again, this time exposing `sentiment` under the column name
# `labels` and dropping the columns that are not needed for this task.
tokenized_dataset = (
    dataset.map(tokenize, batched=True)
    .rename_columns({'sentiment': 'labels'})
    .remove_columns(["topic", "segmented_text"])
)

In [None]:
# Number of sentiment classes (labels 0-2).
N_SENTIMENTS = 3

# The classification head that from_pretrained() adds on top of the encoder is randomly
# initialised. Seed torch first so that this initialisation, and therefore the run, is
# repeatable. 42 is also the documented default of TrainingArguments.seed.
torch.manual_seed(42)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=N_SENTIMENTS).to(device)

In [22]:
from transformers import TrainingArguments, Trainer

# Fine-tuning configuration for the sentiment classifier.
training_args = TrainingArguments(
    # Task-specific folder, so that this run cannot overwrite or mix its checkpoints
    # with those of another training run in this notebook.
    output_dir="./results/sentiment",
    report_to=[],  # disables all external logging
    # Evaluate, checkpoint and log once per epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # Keep at most two checkpoints on disk instead of one per epoch; fifteen full
    # checkpoints would eat into the 20GB limit of the working directory. Per the
    # Transformers docs the best checkpoint is retained when load_best_model_at_end
    # is enabled.
    save_total_limit=2,
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=15,
    weight_decay=0.01,
    # After training, reload the checkpoint with the highest validation macro-F1
    # (the "f1" entry returned by compute_metrics).
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True
)

In [None]:
# Trainer that fine-tunes `model` on the train split and evaluates it on the validation
# split with `compute_metrics`, using the settings in `training_args`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    # NOTE(review): recent transformers releases deprecate `tokenizer=` in favour of
    # `processing_class=` — confirm against the installed version before changing it.
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [24]:
# Silence Weights & Biases.
# NOTE(review): these variables are set after `trainer` has already been constructed, and
# TrainingArguments is given report_to=[] — they are presumably redundant; confirm.
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_SILENT"] = "true"

# Run the fine-tuning loop configured in `training_args`.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2896,0.202304,0.943146,0.867046,0.803484,0.828342
2,0.1689,0.192299,0.946304,0.861862,0.858944,0.860384
3,0.1333,0.196477,0.951358,0.880626,0.846471,0.861759
4,0.1023,0.219135,0.948831,0.902461,0.82803,0.857353
5,0.0791,0.241023,0.950726,0.880556,0.866641,0.873283
6,0.0629,0.257995,0.949463,0.905349,0.836865,0.864569
7,0.05,0.2677,0.946304,0.847531,0.871871,0.858757
8,0.0397,0.28334,0.954517,0.904498,0.857022,0.877604
9,0.0323,0.306832,0.950095,0.872711,0.861957,0.867146
10,0.0231,0.322636,0.95199,0.878451,0.867587,0.872816


TrainOutput(global_step=5370, training_loss=0.06885073686667217, metrics={'train_runtime': 2276.1221, 'train_samples_per_second': 75.299, 'train_steps_per_second': 2.359, 'total_flos': 1.127375216610048e+16, 'train_loss': 0.06885073686667217, 'epoch': 15.0})

In [25]:
from sklearn.metrics import classification_report

# Evaluate on test set: predict, pick the highest-scoring class for each example, and
# print the per-class precision / recall / F1 table.
predictions = trainer.predict(tokenized_dataset["test"])
y_true = predictions.label_ids
y_pred = np.argmax(predictions.predictions, axis=1)
report = classification_report(y_true, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.94      0.97      0.96      1409
           1       0.73      0.48      0.58       167
           2       0.95      0.96      0.96      1590

    accuracy                           0.94      3166
   macro avg       0.88      0.80      0.83      3166
weighted avg       0.94      0.94      0.94      3166

