In [15]:
import pandas as pd
import numpy as np
import re
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

In [16]:
# Both the tokenizer and the classification model come from the same Hub
# checkpoint, so the identifier is spelled out exactly once.
checkpoint = "5CD-AI/Vietnamese-Sentiment-visobert"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

In [17]:
# Load the three pre-split CSV files (train / valid / test) in one pass.
train_df, valid_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        './cleaned_data/train.csv',
        './cleaned_data/valid.csv',
        './cleaned_data/test.csv',
    )
)
# Clean the data
# The patterns are compiled once here instead of on every call.
_EMOJI_SHORTCODE_RE = re.compile(r":[a-zA-Z0-9_]+:")
_NON_WORD_NON_SPACE_RE = re.compile(r"[^\w\s]")
_SPACE_OR_UNDERSCORE_RE = re.compile(r"[\s_]+")


def clean_text(text):
    """Normalise one raw sentence before tokenization.

    Steps, in order:
      1. Missing values (``None`` or a float NaN, which is how pandas
         represents an empty CSV cell) become the empty string, so they are
         not turned into the literal words "None" / "nan".
      2. ``:shortcode:`` style emoji names are removed.
      3. Every character that is neither a word character nor whitespace
         (i.e. punctuation and symbols) is removed.
      4. Runs of whitespace and underscores collapse to a single space.
      5. Leading/trailing whitespace is stripped.

    Args:
        text: The raw value of a ``sentence`` cell; non-strings are passed
            through ``str()``.

    Returns:
        The cleaned sentence as a ``str`` (possibly empty).
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text)
    # NOTE(review): this also matches any ":word:" span that is not an emoji
    # code (e.g. the ":30:" in "10:30:45") — confirm that is acceptable.
    text = _EMOJI_SHORTCODE_RE.sub("", text)
    text = _NON_WORD_NON_SPACE_RE.sub("", text)
    text = _SPACE_OR_UNDERSCORE_RE.sub(" ", text)
    return text.strip()

# Overwrite the `sentence` column of every split with its cleaned version.
for split_frame in (train_df, valid_df, test_df):
    split_frame['sentence'] = split_frame['sentence'].apply(clean_text)
# dataset = [train_df, valid_df, test_df]
# for data in dataset:
#     for i, text in enumerate(data['sentence'].tolist()):
#         try:
#             encoding = tokenizer(text, padding="max_length", truncation=True, max_length=512, return_tensors="pt")
#             max_id = encoding['input_ids'].max().item()
#             if max_id >= 64001:
#                 print(f"Câu lỗi tại index {i}: {text}")
#         except Exception as e:
#             print(f"Lỗi tại index {i}: {text}, Chi tiết: {str(e)}")

In [18]:
# Preview the training frame after cleaning (the rendered output below is truncated).
train_df

Unnamed: 0,sentence,label
0,mua có mỗi bingsu thập cẩm 45k mà mình f đợi h...,0
1,thứ 6 nào ta cùng quẩy vuvuzela beer club chun...,0
2,mình đi với nhóm tổng cộng 4 người ăn chỉ có k...,0
3,nhân viên phục vụ không mấy tận tình đồ ăn ra ...,0
4,vào đây thì hết bàn nhưng mình vẫn ngồi đợi bì...,0
...,...,...
29732,29 mình đi với nhóm bạn tổng cộng là 8ngthiệt ...,1
29733,sushi bình dân mà chất lượng không bình dân ch...,1
29734,trời ơi từ bé đến lớn chưa thử món kem nào bằn...,1
29735,nge mn cũng ns ngon nên hni đến coi thế nào qu...,1


# **Tokenize**

In [19]:
def tokenize_function(texts, max_length=256):
    """Tokenize a list of sentences into fixed-length PyTorch tensors.

    Every sequence is truncated and padded to exactly ``max_length`` tokens,
    so the returned tensors all share the same second dimension.

    Args:
        texts: List of sentences to encode.
        max_length: Target sequence length after truncation/padding.
            Defaults to 256, the value this notebook has always used.
            NOTE(review): the model printed later in this notebook has 514
            position embeddings, so presumably this must stay <= 512 — confirm.

    Returns:
        Whatever the module-level ``tokenizer`` returns for
        ``return_tensors="pt"``.
    """
    return tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )

# Apply tokenization to each dataset split up front.
train_encodings, valid_encodings, test_encodings = (
    tokenize_function(split_frame['sentence'].tolist())
    for split_frame in (train_df, valid_df, test_df)
)

# **Prepare dataset**

In [20]:
class SentimentDataset(Dataset):
    """Map-style dataset pairing pre-computed encodings with their labels.

    ``encodings`` is a mapping from field name to an indexable container
    (one entry per example); ``labels`` is an indexable sequence of the same
    length. Each item is a dict holding every encoding field for that
    example plus a ``'labels'`` tensor.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of examples is defined by the label sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field_name, field_values in self.encodings.items():
            sample[field_name] = field_values[idx]
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [21]:
# Wrap each split's encodings together with its `label` column.
train_dataset, valid_dataset, test_dataset = (
    SentimentDataset(split_encodings, split_frame['label'].tolist())
    for split_encodings, split_frame in (
        (train_encodings, train_df),
        (valid_encodings, valid_df),
        (test_encodings, test_df),
    )
)

# **Config train**

In [22]:
# Batches of 16 everywhere; only the training split is reshuffled each epoch.
train_loader, valid_loader, test_loader = (
    DataLoader(split_dataset, batch_size=16, shuffle=reshuffle)
    for split_dataset, reshuffle in (
        (train_dataset, True),
        (valid_dataset, False),
        (test_dataset, False),
    )
)

In [23]:
# The scheduler is stepped once per batch, so the schedule spans
# (batches per epoch) x 3 epochs with no warm-up phase.
# Keep the 3 below in sync with `num_epochs` in the training cell.
total_training_steps = len(train_loader) * 3  # 3 epochs
optimizer = AdamW(model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_training_steps,
)

# **Training**

In [24]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# To force CPU execution, uncomment:
# device = torch.device("cpu")
print(device)
# Left as the last expression so the cell also displays the model architecture.
model.to(device)

cuda


XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(15004, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=7

In [25]:
from tqdm import tqdm

# Must stay equal to the epoch count baked into the scheduler's
# `num_training_steps` (len(train_loader) * 3).
num_epochs = 3


def _split_batch(batch, device):
    """Move one batch to `device` and separate model inputs from labels."""
    inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
    labels = batch['labels'].to(device)
    return inputs, labels


def _train_one_epoch(model, loader, optimizer, scheduler, device, desc=None):
    """Run one optimisation pass over `loader`.

    The scheduler is stepped after every optimizer step (once per batch).

    Returns:
        The mean of the per-batch training losses.
    """
    model.train()
    running_loss = 0
    for batch in tqdm(loader, desc=desc):
        optimizer.zero_grad()
        inputs, labels = _split_batch(batch, device)
        loss = model(**inputs, labels=labels).loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        running_loss += loss.item()
    return running_loss / len(loader)


def _evaluate(model, loader, device):
    """Score `model` on `loader` without tracking gradients.

    Returns:
        A ``(mean_batch_loss, accuracy)`` tuple, where accuracy is the
        fraction of examples whose arg-max logit equals the label.
    """
    model.eval()
    running_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in loader:
            inputs, labels = _split_batch(batch, device)
            outputs = model(**inputs, labels=labels)
            running_loss += outputs.loss.item()
            _, predicted = torch.max(outputs.logits, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    return running_loss / len(loader), correct / total


for epoch in range(num_epochs):
    # Training
    train_loss = _train_one_epoch(
        model, train_loader, optimizer, scheduler, device,
        desc=f"Epoch {epoch+1} - Training",
    )
    print(f"Epoch {epoch+1}, Train Loss: {train_loss}")

    # Evaluation on the validation split (runs once per epoch, inside the loop)
    valid_loss, accuracy = _evaluate(model, valid_loader, device)
    print(f"Epoch {epoch+1}, Valid Loss: {valid_loss}, Accuracy: {accuracy}")

Epoch 1 - Training: 100%|██████████| 1859/1859 [08:08<00:00,  3.81it/s]


Epoch 1, Train Loss: 0.2404482413284709
Epoch 1, Valid Loss: 0.23040283209120857, Accuracy: 0.9114000604777744


Epoch 2 - Training: 100%|██████████| 1859/1859 [08:09<00:00,  3.80it/s]


Epoch 2, Train Loss: 0.16454656555774738
Epoch 2, Valid Loss: 0.22777276060389484, Accuracy: 0.9184558008265296


Epoch 3 - Training: 100%|██████████| 1859/1859 [08:10<00:00,  3.79it/s]


Epoch 3, Train Loss: 0.0997015593318953
Epoch 3, Valid Loss: 0.272244840523681, Accuracy: 0.9110976716056849


# **Evaluate**

In [26]:
# Final hold-out evaluation: accuracy only, gradients disabled.
# NOTE(review): the test accuracy recorded for this cell equals the epoch-3
# validation accuracy to every digit — confirm that test.csv and valid.csv
# really are different splits.
model.eval()
test_correct, test_total = 0, 0
with torch.no_grad():
    for batch in test_loader:
        labels = batch['labels'].to(device)
        inputs = {
            field: tensor.to(device)
            for field, tensor in batch.items()
            if field != 'labels'
        }
        outputs = model(**inputs)
        # Index of the largest logit per row is the predicted class.
        predicted = torch.max(outputs.logits, 1).indices
        test_correct += (predicted == labels).sum().item()
        test_total += labels.size(0)

test_accuracy = test_correct / test_total
print(f"Test Accuracy: {test_accuracy}")

Test Accuracy: 0.9110976716056849


In [27]:
# Write the fine-tuned weights and the tokenizer files into the same folder
# so the pair can be reloaded together from one path.
save_dir = "./modelTransformer/"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

Non-default generation parameters: {'max_length': 256}


('./modelTransformer/tokenizer_config.json',
 './modelTransformer/special_tokens_map.json',
 './modelTransformer/tokenizer.json')

In [28]:
# Reload the saved artefacts from disk; this rebinds `model` and `tokenizer`
# to freshly constructed objects.
# NOTE(review): no `.to(device)` follows in the visible cells — move the
# reloaded model to the target device before running inference on GPU.
reload_dir = "./modelTransformer/"
tokenizer = AutoTokenizer.from_pretrained(reload_dir)
model = AutoModelForSequenceClassification.from_pretrained(reload_dir)