In [2]:
import os
import re
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, EarlyStoppingCallback
from underthesea import word_tokenize
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [None]:
def read_data(directory):
    data = []
    for category in os.listdir(directory):
        category_path = os.path.join(directory, category)
        if os.path.isdir(category_path):
            for filename in os.listdir(category_path):
                file_path = os.path.join(category_path, filename)
                with open(file_path, 'r', encoding='utf-8') as file:
                    text = file.read()
                    data.append((text, category))
    return data

In [3]:
train_data = read_data('data_train/train')
test_data = read_data('data_train/test')

In [None]:
df = pd.concat([train_df, test_df], ignore_index=True)

In [None]:
df = pd.concat([train_df, test_df], ignore_index=True)

In [4]:
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base", use_fast=False)
model = AutoModelForSequenceClassification.from_pretrained("vinai/phobert-base", num_labels=2)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.13M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
def remove_special_characters(text):
    text = re.sub(r'[^\w\s,]', '', text)
    text = re.sub(r'[:;]+[)]+', '', text)
    return text.strip()

In [6]:
df['text'] = df['text'].apply(remove_special_characters)

In [7]:
df['category'] = df['category'].apply(lambda x: 1 if x == 'neg' else 0)

In [8]:
df = df.sample(frac=1)
df

Unnamed: 0.1,Unnamed: 0,text,category
22791,22791,Nói tên quán là Con hẻm nhỏ nhưng thực_ra ko h...,0
38766,38766,Một nơi tuyệt_vời để thưởng_thức ẩm_thực Việt ...,0
20618,20618,"Quán decor khá ấn_tượng , nhìn trẻ_trung hiện ...",0
39273,39273,"Nhân_viên dể thương , chiều khách \nQuán đẹp \...",0
38666,38666,Trà sữa ở đây pha vừa uống k lạt cũg k béo u...,0
...,...,...,...
34704,34704,Chỗ này bán bánh cũ hay_sao mà vỏ bánh cứng kh...,1
13213,13213,Quán nằm trg hẻm nhỏ không_gian quán cũng khá...,1
37004,37004,"qua n kho i chê , thư c ăn vư a ngon vư a...",0
11632,11632,Hôm_qua đi ăn vơ i ba n kem cu ng râ t ngo...,1


In [9]:
dataset = Dataset.from_pandas(df)
dataset = dataset.shuffle(seed=42)

In [10]:
def tokenize_function(sentence):
    return tokenizer(sentence["text"], padding="max_length", truncation=True, max_length=140)


tokenized_datasets = dataset.map(tokenize_function, batched=True)

train_test_split = tokenized_datasets.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

In [11]:
train_dataset = train_dataset.rename_column("category", "labels")
test_dataset = test_dataset.rename_column("category", "labels")

In [12]:
train_dataset

Dataset({
    features: ['Unnamed: 0', 'text', 'labels', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 32000
})

In [13]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [14]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True
)



In [15]:
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(p.label_ids, preds, average='weighted')
    acc = accuracy_score(p.label_ids, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }



In [16]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
500,0.2913,0.241297,0.908375,0.908304,0.909293,0.908375
1000,0.2421,0.225858,0.913,0.912999,0.913001,0.913
1500,0.205,0.232769,0.912,0.91196,0.912495,0.912
2000,0.1918,0.234203,0.913375,0.913307,0.914327,0.913375
2500,0.1595,0.250742,0.914,0.913983,0.914177,0.914
3000,0.1516,0.25298,0.914,0.913983,0.914177,0.914


TrainOutput(global_step=3000, training_loss=0.20690624237060548, metrics={'train_runtime': 2968.0503, 'train_samples_per_second': 32.344, 'train_steps_per_second': 1.011, 'total_flos': 6906665203200000.0, 'train_loss': 0.20690624237060548, 'epoch': 3.0})

In [19]:
trainer.save_model("/content/drive/MyDrive/colab_notebook/Thư mục không có tiêu đề/model")

In [20]:
tokenizer.save_pretrained("/content/drive/MyDrive/colab_notebook/Thư mục không có tiêu đề/tokenizer")

('/content/drive/MyDrive/colab_notebook/Thư mục không có tiêu đề/tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/colab_notebook/Thư mục không có tiêu đề/tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/colab_notebook/Thư mục không có tiêu đề/tokenizer/vocab.txt',
 '/content/drive/MyDrive/colab_notebook/Thư mục không có tiêu đề/tokenizer/bpe.codes',
 '/content/drive/MyDrive/colab_notebook/Thư mục không có tiêu đề/tokenizer/added_tokens.json')