In [92]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/topic-dataset/train_markov.csv
/kaggle/input/topic-dataset/train_cleaned.csv
/kaggle/input/topic-dataset/test_cleaned.csv
/kaggle/input/facebook-data/Train_data.txt
/kaggle/input/facebook-data/Train_bigdata.txt
/kaggle/input/vietnamstopword/vietnamese-stopwords.txt


In [93]:
import re
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_scheduler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd
import matplotlib.pyplot as plt

In [94]:
train_path = "/kaggle/input/facebook-data/Train_bigdata.txt"


In [95]:
data = []
with open(train_path, 'r', encoding='utf-8-sig') as file:
    for line in file:
        data.append(line.strip())

# Tạo DataFrame từ dữ liệu
df = pd.DataFrame(data, columns=['text'])

df['label'] = df['text'].apply(lambda x: x.split(' ')[0].replace('__label__', ''))
df['text'] = df['text'].apply(lambda x: ' '.join(x.split(' ')[1:]))

In [96]:
stopwords_path = '/kaggle/input/vietnamstopword/vietnamese-stopwords.txt'
with open(stopwords_path, 'r', encoding='utf-8') as file:
    vietnamese_stopwords = set(line.strip() for line in file)

In [97]:
def preprocess_text(text, stopwords):
    # Loại bỏ đường link
    text = re.sub(r'http\S+', '', text)
    # Loại bỏ các ký tự đặc biệt, emojis và số
    text = re.sub(r'[^\w\s]|_', '', text)
    text = re.sub(r'\d+', '', text)
    text = text.strip()
    # Chuyển về chữ thường
    text = text.lower()

    # Tách từ và loại bỏ stopwords
    words = text.split()
    words = [word for word in words if word not in stopwords]
    return ' '.join(words)

In [98]:
df['clean_text'] = df['text'].apply(lambda x: preprocess_text(x, vietnamese_stopwords))
df = df.drop_duplicates('text', keep='first').reset_index(drop=True)
df

Unnamed: 0,text,label,clean_text
0,"Theo hành trình tour du lịch Mỹ - Bờ Đông, du ...",Du_lich,hành trình tour du lịch mỹ bờ đông du lữ hành ...
1,mình cần tìm 1 phòng cho khoảng 3 người quanh...,Nha_dat,phòng quanh khu vực hồ tùng mậu phòng khép kín...
2,Cho thuê nhà riêng dt 60m/sàn. Có 4 phòng ngủ...,Nha_dat,thuê dt msàn phòng ngủ p thoáng mát an ninh ng...
3,"Cho thuê nhà ở tầng 4 khép kín, 4/295 Nguyễn K...",Nha_dat,thuê tầng khép kín nguyễn khoái bếp tủ lạnh lò...
4,► Crumpler jackpack full photo ► giá : 800.000...,Mua_sam,crumpler jackpack full photo giá vnđ vnđ đựng ...
...,...,...,...
15218,CÓ GÌ TRONG HÀNH TRÌNH ĐẾN NHẬT NGẮM HOA TỬ ĐẰ...,Du_lich,hành trình nhật ngắm hoa tử đằng nhật hành trì...
15219,CÁC MÓN KIM CHI NGON CHO MÙA THU -------------...,Do_an_va_do_uong,món kim chi ngon mùa thu ực ựcựcthật chảy miến...
15220,Cần cho thuê Chung cư Greenstar 234 Phạm Văn Đ...,Nha_dat,thuê cư greenstar phạm văn đồng bắc liêm hà nộ...
15221,Bố trí thông minh giúp nhà ống Sài Gòn không c...,Nha_va_vuon,bố trí thông minh giúp ống sài gòn chỗ tối bố ...


In [99]:
label_map = {label: i for i, label in enumerate(sorted(df['label'].unique()))}

In [100]:
X_train, X_val, y_train, y_val = train_test_split(
    df['clean_text'], df['label'], test_size=0.2, random_state=42
)

In [101]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [102]:
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        label = label_map[self.labels.iloc[idx]]

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )

        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long),
        }


In [103]:
train_dataset = CustomDataset(X_train, y_train, tokenizer)
val_dataset = CustomDataset(X_val, y_val, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

In [104]:
class BERTClassifier(nn.Module):
    def __init__(self, num_labels):
        super(BERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.pooler_output
        cls_output = self.dropout(cls_output)
        logits = self.classifier(cls_output)
        return logits

num_labels = len(label_map)
model = BERTClassifier(num_labels)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BERTClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [105]:
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
loss_fn = nn.CrossEntropyLoss()
num_training_steps = len(train_loader) * 3  # 3 epochs
lr_scheduler = get_scheduler("linear", optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)




In [106]:
def train_epoch(model, train_loader, optimizer, loss_fn, scheduler):
    model.train()
    total_loss = 0
    correct = 0
    total = 0

    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = loss_fn(logits, labels)
        total_loss += loss.item()

        preds = torch.argmax(logits, dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

        loss.backward()
        optimizer.step()
        scheduler.step()

    avg_loss = total_loss / len(train_loader)
    accuracy = correct / total
    return avg_loss, accuracy

In [107]:
def evaluate(model, val_loader, loss_fn):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            logits = model(input_ids, attention_mask)
            loss = loss_fn(logits, labels)
            total_loss += loss.item()

            preds = torch.argmax(logits, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    avg_loss = total_loss / len(val_loader)
    accuracy = correct / total
    return avg_loss, accuracy


In [108]:
epochs = 20
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    train_loss, train_acc = train_epoch(model, train_loader, optimizer, loss_fn, lr_scheduler)
    val_loss, val_acc = evaluate(model, val_loader, loss_fn)

    print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_acc:.4f}")


Epoch 1/20
Train Loss: 1.5057, Train Accuracy: 0.5908, Val Loss: 0.7254, Val Accuracy: 0.8016
Epoch 2/20
Train Loss: 0.6108, Train Accuracy: 0.8272, Val Loss: 0.5341, Val Accuracy: 0.8404
Epoch 3/20
Train Loss: 0.4311, Train Accuracy: 0.8717, Val Loss: 0.4850, Val Accuracy: 0.8522
Epoch 4/20
Train Loss: 0.3832, Train Accuracy: 0.8851, Val Loss: 0.4850, Val Accuracy: 0.8522
Epoch 5/20
Train Loss: 0.3812, Train Accuracy: 0.8847, Val Loss: 0.4850, Val Accuracy: 0.8522
Epoch 6/20
Train Loss: 0.3796, Train Accuracy: 0.8849, Val Loss: 0.4850, Val Accuracy: 0.8522
Epoch 7/20
Train Loss: 0.3841, Train Accuracy: 0.8813, Val Loss: 0.4850, Val Accuracy: 0.8522
Epoch 8/20
Train Loss: 0.3830, Train Accuracy: 0.8881, Val Loss: 0.4850, Val Accuracy: 0.8522
Epoch 9/20
Train Loss: 0.3813, Train Accuracy: 0.8836, Val Loss: 0.4850, Val Accuracy: 0.8522
Epoch 10/20
Train Loss: 0.3820, Train Accuracy: 0.8845, Val Loss: 0.4850, Val Accuracy: 0.8522
Epoch 11/20
Train Loss: 0.3828, Train Accuracy: 0.8840, Val

In [127]:
model_save_path = "bert_classifier_model.pth"
torch.save(model.state_dict(), model_save_path)
print(f"Model saved to {model_save_path}")

Model saved to bert_classifier_model.pth


In [109]:
def predict(model, data_loader):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            logits = model(input_ids, attention_mask)
            preds = torch.argmax(logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    return all_preds, all_labels

val_preds, val_labels = predict(model, val_loader)
print("\nClassification Report:\n")
print(classification_report(val_labels, val_preds, target_names=label_map.keys()))


Classification Report:

                              precision    recall  f1-score   support

                   Chinh_tri       0.83      0.93      0.88       161
         Con_nguoi_va_xa_hoi       0.71      0.86      0.78        78
               Cong_nghe_moi       1.00      0.83      0.91         6
            Do_an_va_do_uong       0.95      0.99      0.97       473
                     Du_lich       0.88      0.91      0.89       191
                    Giai_tri       0.94      0.98      0.96        48
                    Giao_duc       0.86      0.95      0.90       138
                  Giao_thong       0.81      1.00      0.89        25
                    Khoa_hoc       0.86      0.17      0.28        36
   Kinh_doanh_va_Cong_nghiep       0.63      0.80      0.71       388
         Lam_dep_va_the_hinh       0.84      0.92      0.88        53
 Mang_internet_va_vien_thong       0.97      0.92      0.95       116
May_tinh_va_thiet_bi_dien_tu       0.95      0.88      0.91     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [110]:
test_path = "/kaggle/input/facebook-data/Train_data.txt"

In [122]:
test_data = []
with open(test_path, 'r', encoding='utf-8-sig') as file:
    for line in file:
        test_data.append(line.strip())

test_df = pd.DataFrame(test_data, columns=['text'])

# Tách nhãn và văn bản
test_df['label'] = test_df['text'].apply(lambda x: x.split(' ')[0].replace('__label__', ''))
test_df['text'] = test_df['text'].apply(lambda x: ' '.join(x.split(' ')[1:]))


In [123]:
test_df['clean_text'] = test_df['text'].apply(lambda x: preprocess_text(x, vietnamese_stopwords))

In [124]:
test_dataset = CustomDataset(test_df['clean_text'], test_df['label'], tokenizer)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [125]:
test_preds, test_labels = predict(model, test_loader)

# === Hiển thị kết quả ===
print("\nClassification Report (Test Data):\n")
print(classification_report(test_labels, test_preds, target_names=label_map.keys()))


Classification Report (Test Data):

                              precision    recall  f1-score   support

                   Chinh_tri       0.83      0.94      0.88       472
         Con_nguoi_va_xa_hoi       0.69      0.88      0.77       205
               Cong_nghe_moi       1.00      0.35      0.52        20
            Do_an_va_do_uong       0.96      0.98      0.97      1425
                     Du_lich       0.85      0.90      0.87       606
                    Giai_tri       0.96      0.99      0.97       114
                    Giao_duc       0.85      0.94      0.89       460
                  Giao_thong       0.80      0.94      0.86        50
                    Khoa_hoc       0.75      0.16      0.27        92
   Kinh_doanh_va_Cong_nghiep       0.57      0.83      0.67      1407
         Lam_dep_va_the_hinh       0.91      0.89      0.90       181
 Mang_internet_va_vien_thong       0.96      0.95      0.95       401
May_tinh_va_thiet_bi_dien_tu       0.93      0.95   

In [126]:
# Tạo DataFrame kết quả
test_df['predicted_label'] = [list(label_map.keys())[pred] for pred in test_preds]
print(test_df[['text', 'label', 'predicted_label']].head())

                                                text  \
0  Gấp ; Hiện bên em đang cần thuê 1 phòng có Diệ...   
1  🌈 CHÀO NOEL ĐÓN MƯA QUÀ TẶNG . 😍 Nhân dịp Noel...   
2  📢📢📢 KHỞI CÔNG XÂY DỰNG 33 CĂN NHÀ PHỐ LIỀN KỀ ...   
3  Sáng ngày hôm nay, BTC rất vui khi nhận được s...   
4  Cần cho thuê căn hộ chung cư dưới sài đồng đối...   

                         label              predicted_label  
0                      Nha_dat                      Nha_dat  
1  Mang_internet_va_vien_thong  Mang_internet_va_vien_thong  
2    Kinh_doanh_va_Cong_nghiep                      Nha_dat  
3                         Sach                         Sach  
4                      Nha_dat                      Nha_dat  
