In [1]:
import pandas as pd
from collections import Counter
import ast

def get_rare_tokens(df: pd.DataFrame, threshold: int=2) -> set:
    all_sentences = df['tokens'].tolist()
    all_tokens = [token for sentence in all_sentences for token in sentence]
    token_counts = Counter(all_tokens)
    rare_tokens = {token for token, count in token_counts.items() if count < threshold}
    return rare_tokens

In [2]:
df = pd.read_csv("data/preprocessed_data.csv")
df['tokens'] = df['tokens'].apply(ast.literal_eval)

df_train = df[df['dataset'] == 'train']
df_train.head()

Unnamed: 0,sentence,topic,sentiment,dataset,tokens
0,slide giáo trình đầy đủ .,1,2,train,"[slide, giáo_trình, đầy_đủ]"
1,"nhiệt tình giảng dạy , gần gũi với sinh viên .",0,2,train,"[nhiệt_tình, giảng_dạy, gần_gũi, sinh_viên]"
2,đi học đầy đủ full điểm chuyên cần .,1,0,train,"[đi, học, đầy_đủ, full, chuyên_cần]"
3,chưa áp dụng công nghệ thông tin và các thiết ...,0,0,train,"[áp_dụng, công_nghệ, thông_tin, thiết_bị, giản..."
4,"thầy giảng bài hay , có nhiều bài tập ví dụ ng...",0,2,train,"[thầy, giảng, bài_tập, ví_dụ, lớp]"


In [3]:
rare_tokens = get_rare_tokens(df_train, 2)

print(f"The number of rare tokens is: {len(rare_tokens)}")
print(f"Some rare tokens are: {list(rare_tokens)[:5]}")

The number of rare tokens is: 1623
Some rare tokens are: ['ngang', 'ht1', 'wzjwz37', 'thoại', 'max']


In [4]:
vocab = {"<PAD>": 0, "<UNK>": 1}
for token in df_train['tokens'].explode().unique():
    if token not in rare_tokens:
        vocab[token] = len(vocab)
print(f"The vocabulary size is: {len(vocab)}")

The vocabulary size is: 1697


In [5]:
# Properly map unknown tokens to <UNK>, not <PAD>
def encode(tokens):
    return [vocab.get(tok, vocab["<UNK>"]) for tok in tokens]

df['input_ids'] = df['tokens'].apply(encode)
df.head()

Unnamed: 0,sentence,topic,sentiment,dataset,tokens,input_ids
0,slide giáo trình đầy đủ .,1,2,train,"[slide, giáo_trình, đầy_đủ]","[2, 3, 4]"
1,"nhiệt tình giảng dạy , gần gũi với sinh viên .",0,2,train,"[nhiệt_tình, giảng_dạy, gần_gũi, sinh_viên]","[5, 6, 7, 8]"
2,đi học đầy đủ full điểm chuyên cần .,1,0,train,"[đi, học, đầy_đủ, full, chuyên_cần]","[9, 10, 4, 1, 11]"
3,chưa áp dụng công nghệ thông tin và các thiết ...,0,0,train,"[áp_dụng, công_nghệ, thông_tin, thiết_bị, giản...","[12, 13, 14, 15, 6]"
4,"thầy giảng bài hay , có nhiều bài tập ví dụ ng...",0,2,train,"[thầy, giảng, bài_tập, ví_dụ, lớp]","[16, 17, 18, 19, 20]"


In [6]:
import torch

def split_dataset(label_column):
    df_train = df[df['dataset'] == 'train']
    df_val = df[df['dataset'] == 'valid']
    df_test = df[df['dataset'] == 'test']

    def prepare(df_split):
        X = [torch.tensor(seq) for seq in df_split['input_ids']]
        y = df_split[label_column].tolist()
        return X, y

    X_train, y_train = prepare(df_train)
    X_val, y_val = prepare(df_val)
    X_test, y_test = prepare(df_test)

    return (X_train, y_train), (X_val, y_val), (X_test, y_test)


In [7]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset

class TextDataset(Dataset):
    def __init__(self, sequences, labels):
        self.sequences = pad_sequence(sequences, batch_first=True, padding_value=vocab["<PAD>"])
        self.labels = torch.tensor(labels)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.sequences[idx], self.labels[idx]

In [8]:
import torch.nn as nn
import torch.nn.functional as f

class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, out_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, out_dim)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        x = self.embedding(x)
        _, (h, _) = self.lstm(x)
        return self.fc(self.dropout(h[-1]))

In [14]:
def train_model(model, train_loader, val_loader, epochs=50, patience=5):
    loss_fn = nn.CrossEntropyLoss()
    opt = torch.optim.Adam(model.parameters(), lr=3e-4)

    best_loss = float("inf")
    best_state = None
    wait = 0

    for epoch in range(epochs):
        model.train()
        for x, y in train_loader:
            opt.zero_grad()
            out = model(x)
            loss = loss_fn(out, y)
            loss.backward()
            opt.step()

        model.eval()
        total = 0
        with torch.no_grad():
            for x, y in val_loader:
                out = model(x)
                total += loss_fn(out, y).item()
        avg = total / len(val_loader)
        print(f"Epoch {epoch+1}, val loss: {avg:.4f}")

        if avg < best_loss:
            best_loss = avg
            best_state = model.state_dict()
            wait = 0
        else:
            wait += 1
            if wait >= patience:
                print("Early stopping")
                break

    model.load_state_dict(best_state)


In [15]:
from sklearn.metrics import classification_report

def evaluate_model(model, loader):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for x, y in loader:
            out = model(x)
            pred = out.argmax(dim=1)
            all_preds.extend(pred.tolist())
            all_labels.extend(y.tolist())
    print(classification_report(all_labels, all_preds))

In [21]:
from torch.utils.data import DataLoader

def run_for_task(label_column, **kwargs):
    (X_train, y_train), (X_val, y_val), (X_test, y_test) = split_dataset(label_column)

    train_ds = TextDataset(X_train, y_train)
    val_ds = TextDataset(X_val, y_val)
    test_ds = TextDataset(X_test, y_test)

    train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=32)
    test_loader = DataLoader(test_ds, batch_size=32)

    model = LSTMClassifier(
        vocab_size=len(vocab),
        emb_dim=128,
        hidden_dim=64,
        out_dim=len(set(y_train + y_val + y_test)),
    )

    epochs = kwargs.get("epochs", 50)
    patience = kwargs.get("patience", epochs // 10)

    train_model(model, train_loader, val_loader, epochs=epochs, patience=patience)
    evaluate_model(model, test_loader)

In [17]:
run_for_task("topic")

Epoch 1, val loss: 0.6558
Epoch 2, val loss: 0.6688
Epoch 3, val loss: 0.5831
Epoch 4, val loss: 0.5859
Epoch 5, val loss: 0.5211
Epoch 6, val loss: 0.5032
Epoch 7, val loss: 0.4869
Epoch 8, val loss: 0.5183
Epoch 9, val loss: 0.5255
Epoch 10, val loss: 0.4775
Epoch 11, val loss: 0.5633
Epoch 12, val loss: 0.5058
Epoch 13, val loss: 0.5393
Epoch 14, val loss: 0.5257
Epoch 15, val loss: 0.5145
Epoch 16, val loss: 0.5614
Epoch 17, val loss: 0.5253
Epoch 18, val loss: 0.5351
Epoch 19, val loss: 0.5783
Epoch 20, val loss: 0.5609
Early stopping
              precision    recall  f1-score   support

           0       0.93      0.87      0.90      2290
           1       0.52      0.77      0.62       572
           2       0.73      0.92      0.81       145
           3       0.15      0.01      0.02       159

    accuracy                           0.81      3166
   macro avg       0.59      0.64      0.59      3166
weighted avg       0.81      0.81      0.80      3166



In [24]:
run_for_task("sentiment", epochs=100, patience=30)

Epoch 1, val loss: 0.8459
Epoch 2, val loss: 0.8462
Epoch 3, val loss: 0.8455
Epoch 4, val loss: 0.8466
Epoch 5, val loss: 0.8456
Epoch 6, val loss: 0.8454
Epoch 7, val loss: 0.8459
Epoch 8, val loss: 0.8456
Epoch 9, val loss: 0.8457
Epoch 10, val loss: 0.8465
Epoch 11, val loss: 0.8453
Epoch 12, val loss: 0.8457
Epoch 13, val loss: 0.8453
Epoch 14, val loss: 0.8457
Epoch 15, val loss: 0.8457
Epoch 16, val loss: 0.8452
Epoch 17, val loss: 0.8390
Epoch 18, val loss: 0.8358
Epoch 19, val loss: 0.8326
Epoch 20, val loss: 0.8399
Epoch 21, val loss: 0.8372
Epoch 22, val loss: 0.8338
Epoch 23, val loss: 0.8344
Epoch 24, val loss: 0.8377
Epoch 25, val loss: 0.8349
Epoch 26, val loss: 0.8345
Epoch 27, val loss: 0.8340
Epoch 28, val loss: 0.8370
Epoch 29, val loss: 0.8344
Epoch 30, val loss: 0.8351
Epoch 31, val loss: 0.8350
Epoch 32, val loss: 0.8349
Epoch 33, val loss: 0.8351
Epoch 34, val loss: 0.8339
Epoch 35, val loss: 0.8355
Epoch 36, val loss: 0.8359
Epoch 37, val loss: 0.8356
Epoch 38, 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
