In [1]:
# Importing the necessary libraries
import pandas as pd
import torch
from torch import nn, optim
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

In [None]:
# Load data
df = pd.read_csv("data/preprocessed_data.csv")

tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base-v2")
df["InputIDs"] = df["Paragraph"].apply(
    lambda x: tokenizer.encode(x, truncation=True, max_length=256, padding="max_length")
)

config.json:   0%|          | 0.00/678 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.13M [00:00<?, ?B/s]

In [3]:
# Define the model
class GRUClassifier(nn.Module):
    def __init__(
        self,
        vocab_size,
        embedding_dim,
        padding_idx,
        hidden_dim,
        num_layers,
        num_classes,
        dropout,
    ):
        super(GRUClassifier, self).__init__()
        self.embedding = nn.Embedding(
            vocab_size, embedding_dim, padding_idx=padding_idx
        )
        self.dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(
            embedding_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout
        )
        self.bn = nn.BatchNorm1d(hidden_dim)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        x = self.dropout(x)
        out, _ = self.gru(x)
        out = self.bn(out[:, -1, :])
        out = self.fc(out)
        return out

model = GRUClassifier(
    vocab_size=tokenizer.vocab_size,
    embedding_dim=256,
    padding_idx=tokenizer.pad_token_id,
    hidden_dim=256,
    num_layers=4,
    num_classes=15,
    dropout=0.5,
)

In [4]:
# Split data into train, validation, and test sets
inputs = torch.tensor(df["InputIDs"].values.tolist())
labels = torch.tensor(df["Topic"].values.tolist())

train_inputs, test_inputs, train_labels, test_labels = train_test_split(
    inputs, labels, test_size=len(inputs) // 10, random_state=42
)
train_inputs, val_inputs, train_labels, val_labels = train_test_split(
    train_inputs, train_labels, test_size=len(inputs) // 10, random_state=42
)

In [5]:
# Create data loaders, optimizer, and loss function
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=1e-3)

train_dataset = torch.utils.data.TensorDataset(train_inputs, train_labels)
val_dataset = torch.utils.data.TensorDataset(val_inputs, val_labels)
test_dataset = torch.utils.data.TensorDataset(test_inputs, test_labels)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
criterion.to(device)

CrossEntropyLoss()

In [6]:
# Train the model
for epoch in range(30):
    model.train()
    running_loss = 0.0
    for i, (inputs, labels) in enumerate(train_loader):
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f"Epoch {epoch + 1}, Loss: {running_loss / len(train_loader)}")

    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    print(f"Validation Accuracy: {correct / total}")

Epoch 1, Loss: 2.5499294346173604
Validation Accuracy: 0.33266666666666667
Epoch 2, Loss: 1.6326103251775106
Validation Accuracy: 0.6136666666666667
Epoch 3, Loss: 1.2153741349379221
Validation Accuracy: 0.678
Epoch 4, Loss: 0.9977275927066803
Validation Accuracy: 0.7443333333333333
Epoch 5, Loss: 0.852398206392924
Validation Accuracy: 0.752
Epoch 6, Loss: 0.754019831140836
Validation Accuracy: 0.7726666666666666
Epoch 7, Loss: 0.6779570437471072
Validation Accuracy: 0.7876666666666666
Epoch 8, Loss: 0.6275034290353457
Validation Accuracy: 0.8016666666666666
Epoch 9, Loss: 0.5754840432107449
Validation Accuracy: 0.7913333333333333
Epoch 10, Loss: 0.5316531060039997
Validation Accuracy: 0.7966666666666666
Epoch 11, Loss: 0.4873463716904322
Validation Accuracy: 0.8026666666666666
Epoch 12, Loss: 0.45561233545343083
Validation Accuracy: 0.8136666666666666
Epoch 13, Loss: 0.4293518770088752
Validation Accuracy: 0.8143333333333334
Epoch 14, Loss: 0.4030472201357285
Validation Accuracy: 0.82

In [7]:
# Compute test accuracy, f1 and confusion matrix
model.eval()
predicted_labels = []
true_labels = []
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        predicted_labels.extend(predicted.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

accuracy = accuracy_score(true_labels, predicted_labels)
print(f"Accuracy: {accuracy}")

f1 = f1_score(true_labels, predicted_labels, average="weighted")
print(f"F1 Score: {f1}")

cm = confusion_matrix(true_labels, predicted_labels)
print(cm)

Accuracy: 0.843
F1 Score: 0.8439286730181444
[[149   1   3   2   5   1   1   4   3   1   3   1   3   0   2]
 [  0 187   7   0   2   1   0   5   1   0   2   2   2   0   1]
 [  7   6 144   9   2   0   0   5   0   0   7   2   8   2   2]
 [  9   0   7 167   1   1   0   5   2   0   0   3   1   0   5]
 [  3   3   6   1 168   1   0   1   7   2   3   4  11   1   0]
 [  0   1   2   1   2 190   1   4   1   1  16   1   4   0   3]
 [  0   3   0   0   0   0 184   2   2   0   2   0   0   0   1]
 [  9   3   4   3   0   0   1 173   3   0   8   1   0   5  10]
 [  3   1   2   0   1   3   0   2 181   1   6   0   0   0   3]
 [  1   0   1   0   2   0   1   0   0 157   8   1   0   0   2]
 [  0   0   3   0   0  10   1   4   5  11 162   1   6   0   9]
 [  1   3   1   2   0   0   0   2   0   0   8 175   1   0   2]
 [  2   2   7   0   5   4   2   1   4   1   7   1 161   0   1]
 [  0   0   3   0   0   0   1   7   1   2   2   2   3 159   5]
 [  2   1   2   2   0   0   1   0   2   1  10   4   1   0 172]]


In [8]:
# Save the model
torch.save(model.state_dict(), "model.pth")