In [1]:
import os
import pandas as pd
import numpy as np
import re
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer,
    AutoModel,
    get_linear_schedule_with_warmup
)
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from tqdm import tqdm

In [2]:
train_file = "/content/Train_data.txt"
test_file = "/content/Test_data.txt"

In [3]:
def read_data(file):
    data = []
    with open(file, 'r', encoding='utf-8') as file:
        for line in file:
            data.append(line.strip())
    return data

In [4]:
train_data_preprocess = read_data(train_file)
test_data_preprocess = read_data(test_file)

In [5]:
df_train = pd.DataFrame(train_data_preprocess, columns=['text'])
df_test = pd.DataFrame(test_data_preprocess, columns=['text'])

df_train['label'] = df_train['text'].apply(lambda x: x.split()[0].replace('__label__', ''))
df_train['text'] = df_train['text'].apply(lambda x: ' '.join(x.split(' ')[1:]))

df_test['label'] = df_test['text'].apply(lambda x: x.split(' ')[0].replace('__label__', ''))
df_test['text'] = df_test['text'].apply(lambda x: ' '.join(x.split(' ')[1:]))

df_train

Unnamed: 0,text,label
0,"Theo h√†nh tr√¨nh tour du l·ªãch M·ªπ - B·ªù ƒê√¥ng, du ...",Du_lich
1,m√¨nh c·∫ßn t√¨m 1 ph√≤ng cho kho·∫£ng 3 ng∆∞·ªùi quanh...,Nha_dat
2,Cho thu√™ nh√† ri√™ng dt 60m/s√†n. C√≥ 4 ph√≤ng ng·ªß...,Nha_dat
3,"Cho thu√™ nh√† ·ªü t·∫ßng 4 kh√©p k√≠n, 4/295 Nguy·ªÖn K...",Nha_dat
4,‚ñ∫ Crumpler jackpack full photo ‚ñ∫ gi√° : 800.000...,Mua_sam
...,...,...
15995,C√ÅC M√ìN KIM CHI NGON CHO M√ôA THU -------------...,Do_an_va_do_uong
15996,C·∫ßn cho thu√™ Chung c∆∞ Greenstar 234 Ph·∫°m VƒÉn ƒê...,Nha_dat
15997,CH∆Ø∆†NG TR√åNH H·ªåC PH√ç TH√ÅNG 08/2016 T·∫∂NG NGAY ...,Kinh_doanh_va_Cong_nghiep
15998,B·ªë tr√≠ th√¥ng minh gi√∫p nh√† ·ªëng S√†i G√≤n kh√¥ng c...,Nha_va_vuon


In [6]:
df_test

Unnamed: 0,text,label
0,G·∫•p ; Hi·ªán b√™n em ƒëang c·∫ßn thu√™ 1 ph√≤ng c√≥ Di·ªá...,ÔªøNha_dat
1,üåà CH√ÄO NOEL ƒê√ìN M∆ØA QU√Ä T·∫∂NG . üòç Nh√¢n d·ªãp Noel...,Mang_internet_va_vien_thong
2,üì¢üì¢üì¢ KH·ªûI C√îNG X√ÇY D·ª∞NG 33 CƒÇN NH√Ä PH·ªê LI·ªÄN K·ªÄ ...,Kinh_doanh_va_Cong_nghiep
3,"S√°ng ng√†y h√¥m nay, BTC r·∫•t vui khi nh·∫≠n ƒë∆∞·ª£c s...",Sach
4,C·∫ßn cho thu√™ cƒÉn h·ªô chung c∆∞ d∆∞·ªõi s√†i ƒë·ªìng ƒë·ªëi...,Nha_dat
...,...,...
10012,[ T·ªîNG H·ª¢P NH·ªÆNG M√ìN NGON KHU V·ª∞C ƒê√ÄO T·∫§N - BA...,Do_an_va_do_uong
10013,B·∫£n tin t√†i ch√≠nh kinh doanh t·ªëi th·ª© s√°u (23/0...,Tai_chinh
10014,"Ngang nhi√™n v·ª´a hack v·ª´a stream, game th·ªß Over...",Giai_tri
10015,"5 TOUR N∆Ø·ªöC NGO√ÄI D·ªäP GI√ÅNG SINH, NƒÇM M·ªöI GI√Å ...",Du_lich


# Data Processing

In [7]:
def preprocess_text(text):
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[^\w\s]|_', '', text)
    text = re.sub(r'\d+', '', text)
    text = text.strip()
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()

    return text

In [8]:
test = preprocess_text('ƒêau m·ªèi vai g√°y n√™n ƒëi massage c·ªï, nh∆∞ng sau ƒë√≥ n·ªØ ca sƒ© ƒë√£ kh√¥ng th·ªÉ di chuy·ªÉn v√¨ g·∫∑p nhi·ªÅu tri·ªáu ch·ª©ng ƒëau nh·ª©c, kh√¥ng th·ªÉ n·∫±m th·∫≥ng l∆∞ng ƒë∆∞·ª£c. Sau g·∫ßn 1 th√°ng ƒëi·ªÅu tr·ªã, c√¥ ƒë√£ kh√¥ng th·ªÉ qua kh·ªèi. Theo: Vietnamnet')
test

'ƒëau m·ªèi vai g√°y n√™n ƒëi massage c·ªï nh∆∞ng sau ƒë√≥ n·ªØ ca sƒ© ƒë√£ kh√¥ng th·ªÉ di chuy·ªÉn v√¨ g·∫∑p nhi·ªÅu tri·ªáu ch·ª©ng ƒëau nh·ª©c kh√¥ng th·ªÉ n·∫±m th·∫≥ng l∆∞ng ƒë∆∞·ª£c sau g·∫ßn th√°ng ƒëi·ªÅu tr·ªã c√¥ ƒë√£ kh√¥ng th·ªÉ qua kh·ªèi theo vietnamnet'

In [9]:
def preprocess_data(df):
    df['text'] = df['text'].apply(lambda x: preprocess_text(x))
    df.drop_duplicates(subset=['text'], inplace=True)
    return df

In [10]:
df_train = preprocess_data(df_train)
df_test = preprocess_data(df_test)

In [11]:
print(df_train['text'][:5])
print(df_test['text'][:5])

0    theo h√†nh tr√¨nh tour du l·ªãch m·ªπ b·ªù ƒë√¥ng du kh√°...
1    m√¨nh c·∫ßn t√¨m ph√≤ng cho kho·∫£ng ng∆∞·ªùi quanh khu ...
2    cho thu√™ nh√† ri√™ng dt ms√†n c√≥ ph√≤ng ng·ªß p kh√°c...
3    cho thu√™ nh√† ·ªü t·∫ßng kh√©p k√≠n nguy·ªÖn kho√°i c√≥ b...
4    crumpler jackpack full photo gi√° vnƒë gi·∫£m c√≤n ...
Name: text, dtype: object
0    g·∫•p hi·ªán b√™n em ƒëang c·∫ßn thu√™ ph√≤ng c√≥ di·ªán t√≠...
1    ch√†o noel ƒë√≥n m∆∞a qu√† t·∫∑ng nh√¢n d·ªãp noel viett...
2    kh·ªüi c√¥ng x√¢y d·ª±ng cƒÉn nh√† ph·ªë li·ªÅn k·ªÅ ch·ªâ tri...
3    s√°ng ng√†y h√¥m nay btc r·∫•t vui khi nh·∫≠n ƒë∆∞·ª£c s√°...
4    c·∫ßn cho thu√™ cƒÉn h·ªô chung c∆∞ d∆∞·ªõi s√†i ƒë·ªìng ƒë·ªëi...
Name: text, dtype: object


In [12]:
print(df_train.dtypes)
print(df_test.dtypes)

text     object
label    object
dtype: object
text     object
label    object
dtype: object


In [13]:
label_to_encoded = {label: idx for idx, label in enumerate(df_train['label'].unique())}

df_train['label'] = df_train['label'].map(label_to_encoded)
df_test['label'] = df_test['label'].map(label_to_encoded)

In [14]:
label_to_encoded

{'Du_lich': 0,
 'Nha_dat': 1,
 'Mua_sam': 2,
 'Tai_chinh': 3,
 'Mang_internet_va_vien_thong': 4,
 'Nha_va_vuon': 5,
 'Kinh_doanh_va_Cong_nghiep': 6,
 'Nghe_thuat': 7,
 'Giao_duc': 8,
 'Lam_dep_va_the_hinh': 9,
 'Con_nguoi_va_xa_hoi': 10,
 'Sach': 11,
 'Chinh_tri': 12,
 'Do_an_va_do_uong': 13,
 'Giao_thong': 14,
 'Thoi_quen_va_so_thich': 15,
 'Giai_tri': 16,
 'Suc_khoe_va_benh_tat': 17,
 'Phap_luat': 18,
 'Khoa_hoc': 19,
 'May_tinh_va_thiet_bi_dien_tu': 20,
 'Cong_nghe_moi': 21,
 'The_thao': 22}

In [15]:
label_to_encoded_format = {
    'Du l·ªãch': 0,
    'Nh√† ƒë·∫•t': 1,
    'Mua s·∫Øm': 2,
    'T√†i ch√≠nh': 3,
    'M·∫°ng internet v√† vi·ªÖn th√¥ng': 4,
    'Nh√† v√† v∆∞·ªùn': 5,
    'Kinh doanh v√† c√¥ng nghi·ªáp': 6,
    'Ngh·ªá thu·∫≠t': 7,
    'Gi√°o d·ª•c': 8,
    'L√†m ƒë·∫πp v√† th·ªÉ h√¨nh': 9,
    'Con ng∆∞·ªùi v√† x√£ h·ªôi': 10,
    'S√°ch': 11,
    'Ch√≠nh tr·ªã': 12,
    'ƒê·ªì ƒÉn v√† ƒë·ªì u·ªëng': 13,
    'Giao th√¥ng': 14,
    'Th√≥i quen v√† s·ªü th√≠ch': 15,
    'Gi·∫£i tr√≠': 16,
    'S·ª©c kho·∫ª v√† b·ªánh t·∫≠t': 17,
    'Ph√°p lu·∫≠t': 18,
    'Khoa h·ªçc': 19,
    'M√°y t√≠nh v√† thi·∫øt b·ªã ƒëi·ªán t·ª≠': 20,
    'C√¥ng ngh·ªá m·ªõi': 21,
    'Th·ªÉ thao': 22
}

In [16]:
print(df_train[:5])
print(df_test[:5])

                                                text  label
0  theo h√†nh tr√¨nh tour du l·ªãch m·ªπ b·ªù ƒë√¥ng du kh√°...      0
1  m√¨nh c·∫ßn t√¨m ph√≤ng cho kho·∫£ng ng∆∞·ªùi quanh khu ...      1
2  cho thu√™ nh√† ri√™ng dt ms√†n c√≥ ph√≤ng ng·ªß p kh√°c...      1
3  cho thu√™ nh√† ·ªü t·∫ßng kh√©p k√≠n nguy·ªÖn kho√°i c√≥ b...      1
4  crumpler jackpack full photo gi√° vnƒë gi·∫£m c√≤n ...      2
                                                text  label
0  g·∫•p hi·ªán b√™n em ƒëang c·∫ßn thu√™ ph√≤ng c√≥ di·ªán t√≠...    NaN
1  ch√†o noel ƒë√≥n m∆∞a qu√† t·∫∑ng nh√¢n d·ªãp noel viett...    4.0
2  kh·ªüi c√¥ng x√¢y d·ª±ng cƒÉn nh√† ph·ªë li·ªÅn k·ªÅ ch·ªâ tri...    6.0
3  s√°ng ng√†y h√¥m nay btc r·∫•t vui khi nh·∫≠n ƒë∆∞·ª£c s√°...   11.0
4  c·∫ßn cho thu√™ cƒÉn h·ªô chung c∆∞ d∆∞·ªõi s√†i ƒë·ªìng ƒë·ªëi...    1.0


In [17]:
print("S·ªë l∆∞·ª£ng NaN trong df_train:")
print(df_train.isnull().sum())
print("\nS·ªë l∆∞·ª£ng NaN trong df_test:")
print(df_test.isnull().sum())

S·ªë l∆∞·ª£ng NaN trong df_train:
text     0
label    0
dtype: int64

S·ªë l∆∞·ª£ng NaN trong df_test:
text     0
label    1
dtype: int64


In [18]:
df_test = df_test.iloc[1:].reset_index(drop=True)

In [19]:
df_test['label'] = df_test['label'].astype(int)

In [20]:
print(df_test.head())

                                                text  label
0  ch√†o noel ƒë√≥n m∆∞a qu√† t·∫∑ng nh√¢n d·ªãp noel viett...      4
1  kh·ªüi c√¥ng x√¢y d·ª±ng cƒÉn nh√† ph·ªë li·ªÅn k·ªÅ ch·ªâ tri...      6
2  s√°ng ng√†y h√¥m nay btc r·∫•t vui khi nh·∫≠n ƒë∆∞·ª£c s√°...     11
3  c·∫ßn cho thu√™ cƒÉn h·ªô chung c∆∞ d∆∞·ªõi s√†i ƒë·ªìng ƒë·ªëi...      1
4  b√†i d·ª± thi c·ªßa ban nh·∫°c old mac donal band ban...      7


In [21]:
print(df_train.dtypes)
print(df_test.dtypes)

text     object
label     int64
dtype: object
text     object
label     int64
dtype: object


In [None]:
def save_csv_data(df, path):
  df.to_csv(path, encoding='utf-8')

# Finetune phoBERT

In [22]:
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.13M [00:00<?, ?B/s]

In [23]:
class VietnameseTextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        # Tokenize
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [24]:
class PhoBERTClassifier(nn.Module):
    def __init__(self, model_name, num_classes, dropout_rate=0.3):
        super(PhoBERTClassifier, self).__init__()
        self.phobert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(self.phobert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.phobert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        output = self.dropout(pooled_output)
        return self.classifier(output)

In [25]:
print("Preparing datasets...")
train_texts = df_train['text'].tolist()
train_labels = df_train['label'].tolist()
test_texts = df_test['text'].tolist()
test_labels = df_test['label'].tolist()

train_dataset = VietnameseTextDataset(train_texts, train_labels, tokenizer)
test_dataset = VietnameseTextDataset(test_texts, test_labels, tokenizer)

BATCH_SIZE = 16
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

Preparing datasets...


In [26]:
# Training function
def train_epoch(model, data_loader, optimizer, scheduler, criterion, device):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0

    progress_bar = tqdm(data_loader, desc="Training")

    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        correct_predictions += (predicted == labels).sum().item()
        total_predictions += labels.size(0)

        progress_bar.set_postfix({
            'loss': f'{loss.item():.4f}',
            'acc': f'{correct_predictions/total_predictions:.4f}'
        })

    return total_loss / len(data_loader), correct_predictions / total_predictions

In [27]:
def evaluate(model, data_loader, criterion, device):
    model.eval()
    total_loss = 0
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs, labels)

            total_loss += loss.item()
            _, predicted = torch.max(outputs, 1)

            predictions.extend(predicted.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(true_labels, predictions)
    return total_loss / len(data_loader), accuracy, predictions, true_labels

In [30]:
NUM_CLASSES = len(label_to_encoded)
MODEL_NAME = "vinai/phobert-base"
model = PhoBERTClassifier(MODEL_NAME, NUM_CLASSES)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model.to(device)

# Optimizer v√† scheduler
LEARNING_RATE = 2e-5
EPOCHS = 3
WARMUP_STEPS = 100

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=WARMUP_STEPS,
    num_training_steps=total_steps
)

# Loss function
criterion = nn.CrossEntropyLoss()

In [31]:
# Training loop
print("Starting training...")
best_accuracy = 0
train_losses = []
train_accuracies = []
val_losses = []
val_accuracies = []

for epoch in range(EPOCHS):
    print(f'\nEpoch {epoch + 1}/{EPOCHS}')
    print('-' * 50)

    # Training
    train_loss, train_acc = train_epoch(
        model, train_loader, optimizer, scheduler, criterion, device
    )

    # Evaluation
    val_loss, val_acc, predictions, true_labels = evaluate(
        model, test_loader, criterion, device
    )

    # Store metrics
    train_losses.append(train_loss)
    train_accuracies.append(train_acc)
    val_losses.append(val_loss)
    val_accuracies.append(val_acc)

    print(f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}')
    print(f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}')

    # Save best model
    if val_acc > best_accuracy:
        best_accuracy = val_acc
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'best_accuracy': best_accuracy,
            'label_to_encoded': label_to_encoded_format
        }, 'best_phobert_classifier.pth')
        print(f'New best model saved with accuracy: {best_accuracy:.4f}')

print(f'\nTraining completed! Best validation accuracy: {best_accuracy:.4f}')

Starting training...

Epoch 1/3
--------------------------------------------------


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 942/942 [11:03<00:00,  1.42it/s, loss=0.2369, acc=0.7751]
Evaluating: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 603/603 [02:10<00:00,  4.62it/s]


Train Loss: 0.9641, Train Acc: 0.7751
Val Loss: 0.3821, Val Acc: 0.8802
New best model saved with accuracy: 0.8802

Epoch 2/3
--------------------------------------------------


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 942/942 [11:01<00:00,  1.42it/s, loss=0.0904, acc=0.9009]
Evaluating: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 603/603 [02:10<00:00,  4.63it/s]


Train Loss: 0.3020, Train Acc: 0.9009
Val Loss: 0.3219, Val Acc: 0.8830
New best model saved with accuracy: 0.8830

Epoch 3/3
--------------------------------------------------


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 942/942 [11:01<00:00,  1.42it/s, loss=0.0112, acc=0.9213]
Evaluating: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 603/603 [02:18<00:00,  4.36it/s]


Train Loss: 0.2204, Train Acc: 0.9213
Val Loss: 0.3036, Val Acc: 0.8898
New best model saved with accuracy: 0.8898

Training completed! Best validation accuracy: 0.8898


In [32]:
# T·∫°o classification report
encoded_to_label = {v: k for k, v in label_to_encoded_format.items()}
target_names = [encoded_to_label[i] for i in range(len(label_to_encoded_format))]

print("\nClassification Report:")
print(classification_report(true_labels, predictions, target_names=target_names))


Classification Report:
                              precision    recall  f1-score   support

                     Du l·ªãch       0.96      0.98      0.97       604
                     Nh√† ƒë·∫•t       0.98      0.99      0.98      1583
                     Mua s·∫Øm       0.95      0.95      0.95       754
                   T√†i ch√≠nh       0.54      0.29      0.37       758
 M·∫°ng internet v√† vi·ªÖn th√¥ng       0.99      0.98      0.98       396
                 Nh√† v√† v∆∞·ªùn       0.97      0.91      0.94       158
   Kinh doanh v√† c√¥ng nghi·ªáp       0.62      0.79      0.70      1250
                  Ngh·ªá thu·∫≠t       0.99      0.99      0.99       398
                    Gi√°o d·ª•c       0.96      0.94      0.95       460
         L√†m ƒë·∫πp v√† th·ªÉ h√¨nh       0.95      0.96      0.95       181
         Con ng∆∞·ªùi v√† x√£ h·ªôi       0.92      0.94      0.93       204
                        S√°ch       0.94      0.96      0.95       245
                 

In [33]:
def predict_text(model, tokenizer, text, label_to_encoded, device, max_length=256):
    model.eval()

    encoding = tokenizer(
        text,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        probabilities = torch.nn.functional.softmax(outputs, dim=-1)
        predicted_class_id = torch.argmax(probabilities, dim=-1).item()
        confidence = probabilities[0][predicted_class_id].item()

    encoded_to_label = {v: k for k, v in label_to_encoded.items()}
    predicted_label = encoded_to_label[predicted_class_id]

    return predicted_label, confidence

In [35]:
test_text = "T√¥i mu·ªën mua m·ªôt chi·∫øc ƒëi·ªán tho·∫°i m·ªõi v·ªõi camera t·ªët"
test_text_preprocessed = preprocess_text(test_text)
predicted_label, confidence = predict_text(
    model, tokenizer, test_text_preprocessed, label_to_encoded_format, device
)
print(f"\nSample prediction:")
print(f"Text: {test_text}")
print(f"Predicted label: {predicted_label}")
print(f"Confidence: {confidence:.4f}")


Sample prediction:
Text: T√¥i mu·ªën mua m·ªôt chi·∫øc ƒëi·ªán tho·∫°i m·ªõi v·ªõi camera t·ªët
Predicted label: M·∫°ng internet v√† vi·ªÖn th√¥ng
Confidence: 0.7746
