In [1]:
# !pip install transformers datasets torch scikit-learn

In [2]:
import numpy
print(numpy.__version__)

2.2.4


In [3]:
import pandas as pd
import torch
import hf_xet
import accelerate

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import transformers

In [5]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

In [6]:
from datasets.arrow_dataset import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import EarlyStoppingCallback

In [7]:
# Đọc dữ liệu
df = pd.read_csv('./data/balanced_neutral_sentiment_data.csv')

In [8]:
# Kiểm tra kiểu dữ liệu và xử lý missing values
df['cleaned_comment'] = df['cleaned_comment'].astype(str)
df['cleaned_comment'] = df['cleaned_comment'].fillna('')

# Chia dữ liệu thành train, validation, test
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.125, random_state=42)

train_dataset = Dataset.from_pandas(train_df[['cleaned_comment', 'sentiment']])
val_dataset = Dataset.from_pandas(val_df[['cleaned_comment', 'sentiment']])
test_dataset = Dataset.from_pandas(test_df[['cleaned_comment', 'sentiment']])

In [9]:
# Tải BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Hàm token hóa dữ liệu
def tokenize_function(examples):
    return tokenizer(examples['cleaned_comment'], padding="max_length", truncation=True, max_length=128)

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)


Map: 100%|██████████| 2715/2715 [00:01<00:00, 1680.04 examples/s]
Map: 100%|██████████| 388/388 [00:00<00:00, 1783.35 examples/s]
Map: 100%|██████████| 776/776 [00:00<00:00, 1738.14 examples/s]


In [10]:
# Mã hóa nhãn
label_mapping = {'Tốt': 2, 'Trung tính': 1, 'Tệ': 0}

def label_mapping_function(examples):
    examples['label'] = [label_mapping[sentiment] for sentiment in examples['sentiment']]
    return examples

train_dataset = train_dataset.map(label_mapping_function, batched=True)
val_dataset = val_dataset.map(label_mapping_function, batched=True)
test_dataset = test_dataset.map(label_mapping_function, batched=True)

# Kiểm tra kết quả
print(train_dataset[0])

Map: 100%|██████████| 2715/2715 [00:00<00:00, 126110.60 examples/s]
Map: 100%|██████████| 388/388 [00:00<00:00, 50439.81 examples/s]
Map: 100%|██████████| 776/776 [00:00<00:00, 97749.94 examples/s]

{'cleaned_comment': 'bênh viện còn khá mới, nhân viên tận tình nhẹ nhàng. nhưng điều quan truyền nhất là bác sỹ trẻ nhiều quá, bác sỹ thực hành nhưng lại không có thạc sỹ bác sỹ hay bác sỹ ckii phụ trách chính. bác sỹ trẻ thăm khám trực tiếp rất thiếu kinh nghiệm, phải có bác sỹ phụ trách khoa kèm cặp và ra quyết định.', 'sentiment': 'Trung tính', '__index_level_0__': 2899, 'input_ids': [101, 24429, 10237, 27805, 14674, 57205, 18652, 117, 14694, 15202, 109327, 23403, 93799, 13265, 10376, 119, 15662, 16391, 12522, 20967, 13346, 10331, 98709, 187, 66556, 38723, 13710, 27261, 117, 98709, 187, 66556, 11992, 13910, 15662, 13148, 11755, 10601, 77586, 31607, 187, 66556, 98709, 187, 66556, 13605, 98709, 187, 66556, 171, 70149, 28422, 59660, 12707, 119, 98709, 187, 66556, 38723, 89311, 57205, 10147, 34270, 16948, 18946, 54594, 21130, 42788, 117, 15723, 10601, 98709, 187, 66556, 28422, 59660, 11685, 179, 70958, 171, 75669, 10432, 11859, 27016, 15027, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,




In [11]:
# Kiểm tra số lượng mẫu cân bằng
print("Số lượng mẫu trong tập train:")
print(train_df['sentiment'].value_counts())
print("\nSố lượng mẫu trong tập validation:")
print(val_df['sentiment'].value_counts())
print("\nSố lượng mẫu trong tập test:")
print(test_df['sentiment'].value_counts())


Số lượng mẫu trong tập train:
sentiment
Tốt           1108
Tệ             895
Trung tính     712
Name: count, dtype: int64

Số lượng mẫu trong tập validation:
sentiment
Tốt           169
Tệ            117
Trung tính    102
Name: count, dtype: int64

Số lượng mẫu trong tập test:
sentiment
Tốt           333
Tệ            257
Trung tính    186
Name: count, dtype: int64


In [12]:
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=3)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

In [13]:
def compute_metrics(p):
    preds, labels = p
    preds = preds.argmax(axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }



NameError: name 'model' is not defined

In [14]:
print(transformers.__version__)

4.51.3


In [15]:
print("Accelerate version:", accelerate.__version__)
print("Transformers version:", transformers.__version__)

Accelerate version: 1.6.0
Transformers version: 4.51.3


In [16]:
training_args = TrainingArguments(
    output_dir='./results',                  
    eval_strategy="epoch",             
    save_strategy="epoch",                   
    learning_rate=2e-5,                      
    per_device_train_batch_size=16,          
    per_device_eval_batch_size=64,           
    num_train_epochs=10,                     
    weight_decay=0.01,                       
    logging_dir='./logs',                    
    load_best_model_at_end=True,             
    fp16=True,                               
    gradient_accumulation_steps=2,           
    save_steps=1000,                         
    eval_steps=1000,                         
    save_total_limit=3,                      
    logging_steps=500,                       
)
# Cài đặt Trainer
trainer = Trainer(
    model=model,                            
    args=training_args,                     
    train_dataset=train_dataset,            
    eval_dataset=val_dataset,               
    compute_metrics=compute_metrics,         
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)



In [17]:
# Huấn luyện mô hình
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.368307,0.878866,0.889189,0.878866,0.878393
2,No log,0.330908,0.891753,0.89977,0.891753,0.891023
3,No log,0.284252,0.886598,0.891818,0.886598,0.886334
4,No log,0.300786,0.876289,0.878361,0.876289,0.87651
5,No log,0.412529,0.873711,0.878168,0.873711,0.874263
6,0.297800,0.414189,0.886598,0.888197,0.886598,0.886019
7,0.297800,0.465927,0.891753,0.89123,0.891753,0.891004
8,0.297800,0.527854,0.89433,0.894352,0.89433,0.893703




TrainOutput(global_step=680, training_loss=0.2413575999877032, metrics={'train_runtime': 18689.2824, 'train_samples_per_second': 1.453, 'train_steps_per_second': 0.045, 'total_flos': 1428705858263040.0, 'train_loss': 0.2413575999877032, 'epoch': 8.0})

In [18]:
# Đánh giá mô hình
results = trainer.evaluate(test_dataset)
print(results)



{'eval_loss': 0.3382989466190338, 'eval_accuracy': 0.8827319587628866, 'eval_precision': 0.8873570877436858, 'eval_recall': 0.8827319587628866, 'eval_f1': 0.8826269630453241, 'eval_runtime': 125.4606, 'eval_samples_per_second': 6.185, 'eval_steps_per_second': 0.104, 'epoch': 8.0}


In [19]:
# Hàm dự đoán cảm xúc
def predict_sentiment(text):
    enc = tokenizer(text, truncation=True, padding=True, max_length=128, return_tensors='pt').to(device)
    with torch.no_grad():
        output = model(**enc)
    pred = torch.argmax(output.logits, dim=-1)
    return pred.item()

# Dự đoán một ví dụ
new_comment = "Bệnh viện sạch, y tá có thái độ với bệnh nhân"
prediction = predict_sentiment(new_comment)
sentiment_labels = ['Tệ', 'Trung tính', 'Tốt']
print(f"Cảm xúc dự đoán: {sentiment_labels[prediction]}")

Cảm xúc dự đoán: Tệ


In [20]:
def split_clauses(sentence):
    import re
    clauses = re.split(r'[,.;!?]', sentence)
    return [clauses.strip() for clauses in clauses if clauses.strip()]

def score_sentence(sentence):
    clauses = split_clauses(sentence)
    if not clauses:
        return None,
    
    scores = []
    for clause in clauses:
        pred = predict_sentiment(clause)
        scores.append(pred)
        
    overall_score = sum(scores) / len(scores)
    return scores, overall_score

sentence = "Bệnh viện sạch, y tá có thái độ với bệnh nhân"
overall_score = score_sentence(sentence)
print(overall_score)

([2, 0], 1.0)


In [21]:
# Giả sử mô hình đã được huấn luyện là model
model.save_pretrained("sentiment_model_bert")
tokenizer.save_pretrained("sentiment_model_bert")

('sentiment_model_bert\\tokenizer_config.json',
 'sentiment_model_bert\\special_tokens_map.json',
 'sentiment_model_bert\\vocab.txt',
 'sentiment_model_bert\\added_tokens.json')