In [26]:
import os
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from underthesea import word_tokenize
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [27]:
def read_data(directory):
    data = []
    for category in os.listdir(directory):
        category_path = os.path.join(directory, category)
        if os.path.isdir(category_path):
            for filename in os.listdir(category_path):
                file_path = os.path.join(category_path, filename)
                with open(file_path, 'r', encoding='utf-8') as file:
                    text = file.read()
                    data.append((text, category))
    return data

In [28]:
train_data = read_data('data_train/train')
test_data = read_data('data_train/test')

In [29]:
train_df = pd.DataFrame(train_data, columns=['text', 'category'])
test_df = pd.DataFrame(test_data, columns=['text', 'category'])

In [30]:
df = pd.concat([train_df, test_df], ignore_index=True)
df['category'] = df['category'].apply(lambda x: 1 if x == 'neg' else 0)
df['text'] = df['text'].apply(lambda x: word_tokenize(x, format="text"))


In [31]:
dataset = Dataset.from_pandas(df)
dataset = dataset.shuffle(seed=42).select(range(700))

In [32]:
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)


tokenized_datasets = dataset.map(tokenize_function, batched=True)


train_test_split = tokenized_datasets.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']


train_dataset = train_dataset.rename_column("category", "labels")
test_dataset = test_dataset.rename_column("category", "labels")

Map:   0%|          | 0/700 [00:00<?, ? examples/s]

In [33]:
train_dataset

Dataset({
    features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 560
})

In [34]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [35]:
model = AutoModelForSequenceClassification.from_pretrained("vinai/phobert-base", num_labels=2)

training_args = TrainingArguments(
    output_dir="./my_results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Metrics function
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(p.label_ids, preds, average='weighted')
    acc = accuracy_score(p.label_ids, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/210 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

{'eval_loss': 0.37709933519363403, 'eval_accuracy': 0.8428571428571429, 'eval_f1': 0.8417743324720068, 'eval_precision': 0.8421328671328672, 'eval_recall': 0.8428571428571429, 'eval_runtime': 60.6997, 'eval_samples_per_second': 2.306, 'eval_steps_per_second': 0.297, 'epoch': 1.0}


  0%|          | 0/18 [00:00<?, ?it/s]

{'eval_loss': 0.31451988220214844, 'eval_accuracy': 0.9, 'eval_f1': 0.9, 'eval_precision': 0.9, 'eval_recall': 0.9, 'eval_runtime': 60.6601, 'eval_samples_per_second': 2.308, 'eval_steps_per_second': 0.297, 'epoch': 2.0}


  0%|          | 0/18 [00:00<?, ?it/s]

{'eval_loss': 0.40131276845932007, 'eval_accuracy': 0.8785714285714286, 'eval_f1': 0.8793036547637775, 'eval_precision': 0.8822992322058519, 'eval_recall': 0.8785714285714286, 'eval_runtime': 60.9772, 'eval_samples_per_second': 2.296, 'eval_steps_per_second': 0.295, 'epoch': 3.0}
{'train_runtime': 2511.3707, 'train_samples_per_second': 0.669, 'train_steps_per_second': 0.084, 'train_loss': 0.375930422828311, 'epoch': 3.0}


TrainOutput(global_step=210, training_loss=0.375930422828311, metrics={'train_runtime': 2511.3707, 'train_samples_per_second': 0.669, 'train_steps_per_second': 0.084, 'total_flos': 110506643251200.0, 'train_loss': 0.375930422828311, 'epoch': 3.0})

In [36]:
results = trainer.evaluate()
print("Evaluation results:", results)   

  0%|          | 0/18 [00:00<?, ?it/s]

Evaluation results: {'eval_loss': 0.40131276845932007, 'eval_accuracy': 0.8785714285714286, 'eval_f1': 0.8793036547637775, 'eval_precision': 0.8822992322058519, 'eval_recall': 0.8785714285714286, 'eval_runtime': 58.4413, 'eval_samples_per_second': 2.396, 'eval_steps_per_second': 0.308, 'epoch': 3.0}


In [41]:
def predict_sentiment(text):
    processed_text = word_tokenize(text, format="text")
    inputs = tokenizer(processed_text, return_tensors="pt", padding="max_length", truncation=True)
    
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        prediction = torch.argmax(logits, dim=-1).item()
        
    return "Positive" if prediction == 1 else "Negative"

print(predict_sentiment("Sản phẩm có thể nói là khá ổn"))
print(predict_sentiment("Dịch vụ khách hàng không thật sự tốt"))


Positive
Negative
