In [7]:
# Print the Python version through a shell escape.
# NOTE(review): this reports whichever `python3` is first on PATH, which may
# differ from the interpreter backing the kernel — confirm if it matters.
!python3 --version

Python 3.12.3


In [1]:
!pip freeze > requirements.txt

In [14]:
# Standard library imports
import os
import sys

# Third party imports
import numpy as np
import pandas as pd
from transformers import (
    AutoModel,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)

from datasets import Dataset

from sklearn.model_selection import (
    train_test_split
)
from sklearn.metrics import (
    f1_score,
    accuracy_score,
    precision_score,
    recall_score
)

# Local application imports

# Constants
# Relative path of the CSV that the dataset-loading cell reads with pd.read_csv.
data_path = "./data/toxic.csv"

In [3]:
# Load the raw dataset from `data_path` and report its size and schema.
data = pd.read_csv(data_path)
shape = data.shape
print(f"Dataset shape: {shape}\n" + ('=' * 50))

# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
data.info()

Dataset shape: (28942, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28942 entries, 0 to 28941
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   text        28942 non-null  object
 1   label       28942 non-null  int64 
 2   word_count  28942 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 678.5+ KB
None


In [4]:
# Balance the two classes by undersampling the non-toxic rows
data_copy = data.copy()

# Keep every toxic row and remember how many there are
toxic = data_copy.loc[data_copy['label'].eq(1)]
toxic_len = toxic.shape[0]

# Draw an equally sized, seeded random subset of the non-toxic rows
non_toxic = data_copy.loc[data_copy['label'].eq(0)].sample(n=toxic_len, random_state=42)

# Stack both classes, then shuffle (seeded) and renumber the rows from 0
balanced = (
    pd.concat([toxic, non_toxic], ignore_index=True)
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
)

In [5]:
# Sanity check: the balanced frame should hold twice as many rows as the toxic
# class (all toxic rows plus an equally sized non-toxic sample).
balanced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10870 entries, 0 to 10869
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   text        10870 non-null  object
 1   label       10870 non-null  int64 
 2   word_count  10870 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 254.9+ KB


In [6]:
# Hold out 10% of the balanced data for validation. Stratifying on `label`
# keeps the class ratio the same in both parts, and the fixed seed makes the
# split repeatable.
train_set, validation_set = train_test_split(
    balanced,
    test_size= 0.1,
    random_state= 1,
    stratify = balanced['label']
)

In [7]:
# Load tokenizer
# Uses the same "vinai/phobert-base" checkpoint as the classification model
# loaded later, so the tokenizer and the model come from one source.
# NOTE(review): the PhoBERT model card recommends word-segmented input text —
# confirm that the `text` column is already segmented.
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

In [8]:
# Convert the pandas splits into Hugging Face datasets.
# After train_test_split the frames carry a shuffled, non-default index, which
# Dataset.from_pandas would otherwise store as an extra `__index_level_0__`
# column; preserve_index=False keeps only the real data columns.
train_dataset = Dataset.from_pandas(train_set, preserve_index=False)
val_dataset = Dataset.from_pandas(validation_set, preserve_index=False)

In [9]:
def process(batch, max_length=256):
    """Tokenize one batch of examples for the classifier.

    Args:
        batch: Mapping that holds the raw strings under the "text" key, as
            passed in by `Dataset.map(..., batched=True)`.
        max_length: Fixed sequence length. Longer inputs are truncated and
            shorter ones are padded to this length. Defaults to 256, the
            value previously hard-coded here.

    Returns:
        The tokenizer's output for the batch.
    """
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )


# Tokenize the training split in batches.
train_tokenized = train_dataset.map(
    process,
    batched=True,
)

# Tokenize the validation split with the same settings.
validation_tokenized = val_dataset.map(
    process,
    batched=True,
)

Map: 100%|██████████| 9783/9783 [00:02<00:00, 4038.86 examples/s]
Map: 100%|██████████| 1087/1087 [00:00<00:00, 5128.94 examples/s]


In [10]:
# Load the pretrained PhoBERT checkpoint with a 2-class sequence-classification
# head. The classifier weights are not part of the checkpoint and are newly
# initialised (see the warning in the output), so the model has to be
# fine-tuned before its predictions are meaningful.
model = AutoModelForSequenceClassification.from_pretrained(
    "vinai/phobert-base",
    num_labels=2
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
def evaluation(prediction: tuple, average: str = 'weighted') -> dict:
    """Compute classification metrics; used as the Trainer's `compute_metrics`.

    Args:
        prediction: A `(scores, true_labels)` pair. `scores` holds one row of
            per-class scores per example; the predicted class is the argmax
            along axis 1.
        average: Averaging mode forwarded to the F1, precision and recall
            scorers. The default 'weighted' reproduces the previous
            hard-coded behaviour. Note that support-weighted recall is
            mathematically equal to accuracy, so with the default the
            'recall' and 'accuracy' entries always coincide; pass another
            mode (e.g. via functools.partial) to obtain an independent
            recall figure.

    Returns:
        Dict with the keys 'f1', 'accuracy', 'precision' and 'recall'.
    """
    scores, true_label = prediction
    # Turn per-class scores into hard class predictions.
    predict_label = np.argmax(scores, axis=1)

    return {
        'f1': f1_score(true_label, predict_label, average=average),
        'accuracy': accuracy_score(true_label, predict_label),
        'precision': precision_score(true_label, predict_label, average=average),
        'recall': recall_score(true_label, predict_label, average=average),
    }

In [29]:
# Fine-tuning configuration handed to the Trainer.
training_args = TrainingArguments(
    output_dir="./phobert_toxic_result",

    # 2 examples per device per step, accumulated over 16 steps, gives
    # 32 examples per optimizer update on a single device.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=16,
    # NOTE(review): fp16 presumably requires a CUDA device — confirm on the
    # target hardware.
    fp16=True,

    num_train_epochs=5,
    learning_rate=3e-5,

    # Evaluate and checkpoint once per epoch; both strategies are set to
    # "epoch" so that the checkpoint with the best "f1" (as returned by the
    # metrics function) can be restored when training ends.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",

    logging_steps=100
)

In [30]:
# Wire the model, the training configuration, the tokenized splits and the
# metric function into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=validation_tokenized,
    compute_metrics=evaluation,
    # NOTE(review): `processing_class` appears to be the newer name of the
    # former `tokenizer` argument — confirm against the installed
    # transformers version.
    processing_class=tokenizer
)

In [31]:
# Run fine-tuning. In the recorded run, validation loss rises after epoch 2
# while training loss keeps falling (a sign of overfitting); validation F1
# peaks at epoch 4, so that is the checkpoint load_best_model_at_end=True is
# expected to restore.
trainer.train()

Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,0.3543,0.434864,0.810393,0.811408,0.818105,0.811408
2,0.2738,0.41736,0.830719,0.830727,0.830801,0.830727
3,0.2032,0.49358,0.827517,0.827967,0.831345,0.827967
4,0.1371,0.566849,0.842606,0.842686,0.84342,0.842686
5,0.089,0.643571,0.838065,0.838086,0.838248,0.838086


TrainOutput(global_step=1530, training_loss=0.19951781743492175, metrics={'train_runtime': 1870.221, 'train_samples_per_second': 26.155, 'train_steps_per_second': 0.818, 'total_flos': 6435038636467200.0, 'train_loss': 0.19951781743492175, 'epoch': 5.0})

In [22]:
from dotenv import load_dotenv
from huggingface_hub import login

In [26]:
# Load variables from a local .env file into the process environment.
load_dotenv()

# The token is read from the environment, so there is no need to write it back
# into os.environ afterwards (that assignment stored the value it had just read).
token = os.getenv("HUGGINGFACE_HUB_TOKEN")
if not token:
    raise RuntimeError("HUGGINGFACE_HUB_TOKEN not found in .env")

# Best-effort login: a failure is reported but deliberately not fatal, so the
# dataset download below is still attempted.
try:
    login(token=token)
except Exception as e:
    print("huggingface_hub.login() warning:", e)


# ViHSD split name -> CSV file inside the dataset repository.
splits = {'train': 'train.csv', 'validation': 'dev.csv', 'test': 'test.csv'}
# One DataFrame per split, in the insertion order of `splits`
# (index 0 = train, 1 = validation, 2 = test).
vi_hsd = [
    pd.read_csv("hf://datasets/sonlam1102/vihsd/" + filename)
    for filename in splits.values()
]

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [28]:
# 1. Normalise every ViHSD split to the training schema: `text` / `label`
#    columns, with label 2 folded into 1 so the task is binary. New frames are
#    built instead of mutating the loaded ones in place.
vi_hsd = [
    split_df
    .rename(columns={'free_text': 'text', 'label_id': 'label'})
    .assign(label=lambda df: df['label'].replace(2, 1))
    for split_df in vi_hsd
]

# `vi_hsd` follows the order of `splits` (train, validation, test).
test_df = vi_hsd[2]

test_dataset = Dataset.from_pandas(test_df)
# Reuse the training-time `process` tokenization (same truncation, padding and
# max_length) instead of repeating it in a second lambda.
tokenized_test = test_dataset.map(process, batched=True)

# 2. Run prediction
print("Đang đánh giá trên tập Test...")
test_results = trainer.predict(tokenized_test)

# 3. Print the V1 results
print("--- KẾT QUẢ BASELINE V1 (PHOBERT) ---")
print(test_results.metrics)

Map: 100%|██████████| 6680/6680 [00:01<00:00, 4776.94 examples/s]


Đang đánh giá trên tập Test...


--- KẾT QUẢ BASELINE V1 (PHOBERT) ---
{'test_loss': 0.450543612241745, 'test_f1': 0.8156071080119192, 'test_accuracy': 0.7961077844311377, 'test_precision': 0.8610184689422915, 'test_recall': 0.7961077844311377, 'test_runtime': 59.6863, 'test_samples_per_second': 111.918, 'test_steps_per_second': 13.99}


In [32]:
# Evaluation on the ViHSD test split with the current `trainer`.
# 1. Normalise every ViHSD split to the training schema: `text` / `label`
#    columns, with label 2 folded into 1 so the task is binary. New frames are
#    built instead of mutating the loaded ones in place; the step is a no-op
#    on frames that are already normalised.
vi_hsd = [
    split_df
    .rename(columns={'free_text': 'text', 'label_id': 'label'})
    .assign(label=lambda df: df['label'].replace(2, 1))
    for split_df in vi_hsd
]

# `vi_hsd` follows the order of `splits` (train, validation, test).
test_df = vi_hsd[2]

test_dataset = Dataset.from_pandas(test_df)
# Reuse the training-time `process` tokenization (same truncation, padding and
# max_length) instead of repeating it in a second lambda.
tokenized_test = test_dataset.map(process, batched=True)

# 2. Run prediction
print("Đang đánh giá trên tập Test...")
test_results = trainer.predict(tokenized_test)

# 3. Print the V1 results
print("--- KẾT QUẢ BASELINE V1 (PHOBERT) ---")
print(test_results.metrics)

Map: 100%|██████████| 6680/6680 [00:01<00:00, 3977.30 examples/s]


Đang đánh giá trên tập Test...


--- KẾT QUẢ BASELINE V1 (PHOBERT) ---
{'test_loss': 0.5575075745582581, 'test_f1': 0.8522600834227863, 'test_accuracy': 0.8410179640718562, 'test_precision': 0.875531916317784, 'test_recall': 0.8410179640718562, 'test_runtime': 44.6553, 'test_samples_per_second': 149.59, 'test_steps_per_second': 18.699}


In [36]:
# Read the three additional processed CSV files
toxic_new_politics, toxic_new_offensive, toxic_new_racist = (
    pd.read_csv(csv_name)
    for csv_name in (
        "politics_processed.csv",
        "offensive_processed.csv",
        "racist_processed.csv",
    )
)

# Append them to a copy of the original data, then shuffle (seeded) and
# renumber the rows from 0
data_new = (
    pd.concat(
        [data.copy(), toxic_new_politics, toxic_new_offensive, toxic_new_racist],
        ignore_index=True,
    )
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
)

In [37]:
# Check the class balance of the extended dataset.
data_new['label'].value_counts()

label
1    24102
0    23507
Name: count, dtype: int64

In [None]:
# Hold out 10% of the extended dataset for validation (stratified on `label`,
# fixed seed).
# NOTE(review): this rebinds `train_set` / `validation_set`, replacing the
# split made earlier from `balanced`. Objects already derived from the old
# split (e.g. `train_dataset`, `train_tokenized`) are not refreshed by this
# cell.
train_set, validation_set = train_test_split(
    data_new,
    test_size= 0.1,
    random_state= 1,
    stratify = data_new['label']
)