# Import thư viện

In [1]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer
import pandas as pd
import numpy as np

# Load data

In [2]:
# Load data
df = pd.read_csv("/kaggle/input/preprocessed-data/preprocessed_data.csv")

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base-v2")
df["Texts"] = df["Paragraph"].apply(
    lambda x: tokenizer.encode(x, truncation=True, max_length=256, padding="max_length")
)

config.json:   0%|          | 0.00/678 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.13M [00:00<?, ?B/s]

# Define Dataset class

In [3]:
# Define Dataset class
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return torch.tensor(self.texts[idx]), torch.tensor(self.labels[idx])

In [4]:
# Prepare data
texts = df["Texts"].tolist()
labels = df["Topic"].tolist()

# Split data

In [5]:
# Split data
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels, test_size=0.5, random_state=42
)

In [6]:
# Create datasets
train_dataset = TextDataset(train_texts, train_labels)
val_dataset = TextDataset(val_texts, val_labels)
test_dataset = TextDataset(test_texts, test_labels)

# Create dataloaders

In [7]:
# Create dataloaders
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Define TransformerClassifier class

In [8]:
# Define TransformerClassifier class
class TransformerClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, hidden_dim, num_classes, max_len, dropout_rate=0.3):
        super(TransformerClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.positional_encoding = self._create_positional_encoding(max_len, embed_dim)
        self.dropout_rate = dropout_rate
        self.dropout = nn.Dropout(dropout_rate)  # Embedding and attention outputs dropout
        self.attention = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)
        self.fc = nn.Sequential(
            nn.Linear(embed_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout_rate),  # Fully connected layer dropout
            nn.Linear(hidden_dim, num_classes)
        )
        # Move positional_encoding to the device during initialization
        self.positional_encoding = self.positional_encoding.to(device)

    def _create_positional_encoding(self, max_len, embed_dim):
        pe = torch.zeros(max_len, embed_dim)
        for pos in range(max_len):
            for i in range(0, embed_dim, 2):
                pe[pos, i] = np.sin(pos / (10000 ** (2 * i / embed_dim)))
                if i + 1 < embed_dim:
                    pe[pos, i + 1] = np.cos(pos / (10000 ** (2 * i / embed_dim)))
        return pe.unsqueeze(0)

    def forward(self, input_ids):
        # Apply dropout after embedding and positional encoding
        x = self.embedding(input_ids) + self.positional_encoding[:, :input_ids.size(1), :]
        x = self.dropout(x)
        x, _ = self.attention(x, x, x)
        # Apply dropout after attention
        x = self.dropout(x)
        x = x.mean(dim=1)  # Pooling
        return self.fc(x)

# Khởi tạo mô hình và các tham số

In [9]:

# Initialize model and hyperparameters
vocab_size = len(tokenizer)
embed_dim = 128
num_heads = 8
hidden_dim = 256
num_classes = len(set(labels))
max_len = 256
dropout_rate = 0.3
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = TransformerClassifier(vocab_size, embed_dim, num_heads, hidden_dim, num_classes, max_len, dropout_rate)
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model

In [10]:
# Train the model
model = model.to(device)
epochs = 50
for epoch in range(epochs):
    model.train()
    train_loss = 0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Validation
    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss/len(train_loader):.4f}, \
          Val Loss: {val_loss/len(val_loader):.4f}, Val Accuracy: {correct/total:.4f}")

Epoch 1/50, Train Loss: 2.1171,           Val Loss: 1.5531, Val Accuracy: 0.5243
Epoch 2/50, Train Loss: 1.5270,           Val Loss: 1.1900, Val Accuracy: 0.6337
Epoch 3/50, Train Loss: 1.2722,           Val Loss: 1.0337, Val Accuracy: 0.6863
Epoch 4/50, Train Loss: 1.1186,           Val Loss: 0.9869, Val Accuracy: 0.7023
Epoch 5/50, Train Loss: 1.0108,           Val Loss: 0.8758, Val Accuracy: 0.7333
Epoch 6/50, Train Loss: 0.9149,           Val Loss: 0.8346, Val Accuracy: 0.7483
Epoch 7/50, Train Loss: 0.8426,           Val Loss: 0.7890, Val Accuracy: 0.7730
Epoch 8/50, Train Loss: 0.7643,           Val Loss: 0.7785, Val Accuracy: 0.7800
Epoch 9/50, Train Loss: 0.7152,           Val Loss: 0.7257, Val Accuracy: 0.7893
Epoch 10/50, Train Loss: 0.6575,           Val Loss: 0.7258, Val Accuracy: 0.7903
Epoch 11/50, Train Loss: 0.6158,           Val Loss: 0.7538, Val Accuracy: 0.7847
Epoch 12/50, Train Loss: 0.5681,           Val Loss: 0.7025, Val Accuracy: 0.8083
Epoch 13/50, Train Loss: 

# Evaluation on test set

In [11]:

# Evaluation on test set
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        outputs = model(inputs)
        preds = torch.argmax(outputs, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

accuracy = accuracy_score(all_labels, all_preds)
print("Test Accuracy:", accuracy)
print("Classification Report on Test Set:")
print(classification_report(all_labels, all_preds))

Test Accuracy: 0.848
Classification Report on Test Set:
              precision    recall  f1-score   support

           0       0.84      0.77      0.81       172
           1       0.93      0.89      0.91       222
           2       0.77      0.74      0.75       201
           3       0.87      0.88      0.87       211
           4       0.87      0.85      0.86       200
           5       0.93      0.83      0.88       195
           6       0.96      0.95      0.96       216
           7       0.78      0.86      0.82       221
           8       0.81      0.89      0.85       195
           9       0.91      0.91      0.91       170
          10       0.74      0.72      0.73       199
          11       0.88      0.84      0.86       185
          12       0.78      0.85      0.81       239
          13       0.91      0.91      0.91       194
          14       0.78      0.83      0.81       180

    accuracy                           0.85      3000
   macro avg       0.85 

# Save the model

In [12]:
# Save the model
torch.save(model.state_dict(), "model.pth")