In [1]:
from datasets import load_dataset

ds = load_dataset("uitnlp/vietnamese_students_feedback")
train_ds = ds['train']
val_ds = ds['validation']
test_ds = ds['test']

### Tokenization:

### TF-IDF:

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(train_ds['sentence'])

xgboost = XGBClassifier()
xgboost.fit(X_train_tfidf, train_ds['sentiment'])

X_val_tfidf = tfidf.transform(val_ds['sentence'])
val_preds = xgboost.predict(X_val_tfidf)

print('Validation set:')
print(classification_report(val_ds['sentiment'], val_preds))

X_test_tfidf = tfidf.transform(test_ds['sentence'])
test_preds = xgboost.predict(X_test_tfidf)
print("Test set:")
print(classification_report(test_ds['sentiment'], test_preds))

Validation set:
              precision    recall  f1-score   support

           0       0.90      0.93      0.91       705
           1       0.56      0.21      0.30        73
           2       0.91      0.93      0.92       805

    accuracy                           0.90      1583
   macro avg       0.79      0.69      0.71      1583
weighted avg       0.89      0.90      0.89      1583

Test set:
              precision    recall  f1-score   support

           0       0.87      0.94      0.90      1409
           1       0.59      0.16      0.25       167
           2       0.91      0.91      0.91      1590

    accuracy                           0.89      3166
   macro avg       0.79      0.67      0.69      3166
weighted avg       0.88      0.89      0.87      3166



### Word2Vec:

In [None]:
from gensim import utils

batch = [utils.simple_preprocess(doc) for doc in train_ds['sentence']]

# train word2vec
from gensim.models import Word2Vec
w2v = Word2Vec(sentences=batch, vector_size=100, window=5, min_count=2, workers=4, epochs=50)

w2v.save("word2vec.model")

In [4]:
import numpy as np
def w2v_transform(sentences, w2v_model):
    vectors = []
    for sent in sentences:
        word_embeddings = [w2v_model.wv[word] if word in w2v_model.wv else np.zeros(w2v_model.vector_size) for word in sent]
        if word_embeddings:
            sent_vector = sum(word_embeddings) / len(word_embeddings)
        else:
            sent_vector = np.zeros(w2v_model.vector_size)
        vectors.append(sent_vector)
    return np.array(vectors)

In [6]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

train_sentences = [utils.simple_preprocess(doc) for doc in train_ds['sentence']]
train_vectorized = w2v_transform(train_sentences, w2v)

xgboost_w2v = XGBClassifier()
xgboost_w2v.fit(train_vectorized, train_ds['sentiment'])

val_sentences = [utils.simple_preprocess(doc) for doc in val_ds['sentence']]
val_vectorized = w2v_transform(val_sentences, w2v)
val_preds_w2v = xgboost_w2v.predict(val_vectorized)
print('Validation set with Word2Vec:')
print(classification_report(val_ds['sentiment'], val_preds_w2v))

test_sentences = [utils.simple_preprocess(doc) for doc in test_ds['sentence']]
test_vectorized = w2v_transform(test_sentences, w2v)
test_preds_w2v = xgboost_w2v.predict(test_vectorized)
print("Test set with Word2Vec:")
print(classification_report(test_ds['sentiment'], test_preds_w2v))

Validation set with Word2Vec:
              precision    recall  f1-score   support

           0       0.86      0.93      0.90       705
           1       0.71      0.16      0.27        73
           2       0.91      0.91      0.91       805

    accuracy                           0.89      1583
   macro avg       0.83      0.67      0.69      1583
weighted avg       0.88      0.89      0.87      1583

Test set with Word2Vec:
              precision    recall  f1-score   support

           0       0.84      0.93      0.88      1409
           1       0.49      0.11      0.18       167
           2       0.91      0.89      0.90      1590

    accuracy                           0.87      3166
   macro avg       0.74      0.64      0.65      3166
weighted avg       0.85      0.87      0.85      3166



### FastText:

In [7]:
from gensim.models import FastText

train_sentences = [utils.simple_preprocess(doc) for doc in train_ds['sentence']]
fasttext = FastText(sentences=train_sentences, vector_size=128, window=5, min_count=2, workers=4, epochs=50)
fasttext.save("fasttext.model")

def fasttext_transform(sentences, ft_model):
    vectors = []
    for sent in sentences:
        word_embeddings = [ft_model.wv[word] if word in ft_model.wv else np.zeros(ft_model.vector_size) for word in sent]
        if word_embeddings:
            sent_vector = sum(word_embeddings) / len(word_embeddings)
        else:
            sent_vector = np.zeros(ft_model.vector_size)
        vectors.append(sent_vector)
    return np.array(vectors)

train_vectorized_ft = fasttext_transform(train_sentences, fasttext)
xgboost_ft = XGBClassifier()
xgboost_ft.fit(train_vectorized_ft, train_ds['sentiment'])

val_sentences = [utils.simple_preprocess(doc) for doc in val_ds['sentence']]
val_vectorized_ft = fasttext_transform(val_sentences, fasttext)
val_preds = xgboost_ft.predict(val_vectorized_ft)
print('Validation set:')
print(classification_report(val_ds['sentiment'], val_preds))

test_sentences = [utils.simple_preprocess(doc) for doc in test_ds['sentence']]
test_vectorized_ft = fasttext_transform(test_sentences, fasttext)
test_preds = xgboost_ft.predict(test_vectorized_ft)
print("Test set:")
print(classification_report(test_ds['sentiment'], test_preds))

Validation set:
              precision    recall  f1-score   support

           0       0.86      0.93      0.89       705
           1       0.59      0.14      0.22        73
           2       0.92      0.91      0.91       805

    accuracy                           0.88      1583
   macro avg       0.79      0.66      0.68      1583
weighted avg       0.87      0.88      0.87      1583

Test set:
              precision    recall  f1-score   support

           0       0.83      0.92      0.87      1409
           1       0.56      0.09      0.15       167
           2       0.89      0.89      0.89      1590

    accuracy                           0.86      3166
   macro avg       0.76      0.63      0.64      3166
weighted avg       0.85      0.86      0.84      3166



### LSTM/GRU:

In [36]:
from gensim import utils
from gensim.models import FastText

train_sentences = [utils.simple_preprocess(doc) for doc in train_ds['sentence']]
val_sentences = [utils.simple_preprocess(doc) for doc in val_ds['sentence']]
test_sentences = [utils.simple_preprocess(doc) for doc in test_ds['sentence']]

In [64]:
from gensim import utils
import torch

words = []
for sent in train_sentences:
    words.extend(sent)
words = list(set(sorted(words)))
word2ind = {w:idx+2 for idx, w in enumerate(words)} # index 0 for pad token, index 0 for unknown token
ind2word = {idx+2:w for idx, w in enumerate(words)}

pad_token = '<PAD>'
pad_token_id = 0
word2ind[pad_token] = pad_token_id
ind2word[pad_token_id] = pad_token

unk_token = '<UNK>'
unk_token_id = 1
word2ind[unk_token] = unk_token_id
ind2word[unk_token_id] = unk_token

vocab_size = len(word2ind.keys())

def tokenize(input_sentences, word2ind, unk_token='<UNK>'):
    processed_sents = [utils.simple_preprocess(sent) for sent in input_sentences]
    sentence_ids = []
    unk_token_id = word2ind[unk_token]
    for processed_sent in processed_sents:
        ids = [word2ind.get(word, unk_token_id) for word in processed_sent]
        sentence_ids.append(ids)
    return sentence_ids

def pad_sequence(input_sequences, word2ind, pad_token):
    max_len = max([len(seq) for seq in input_sequences])
    pad_token_id = word2ind[pad_token]
    padded_sequences = [seq + [pad_token_id]*(max_len - len(seq)) for seq in input_sequences]
    return padded_sequences

def collate_fn(batch):
    sentences = [sample['sentence'] for sample in batch]
    labels = [sample['sentiment'] for sample in batch]
    tokenized_sents = tokenize(input_sentences=sentences, word2ind=word2ind, unk_token=unk_token)
    padded_sequences = pad_sequence(input_sequences=tokenized_sents, word2ind=word2ind, pad_token=pad_token)
    return torch.tensor(padded_sequences, dtype=torch.int32), torch.tensor(labels, dtype=torch.int32)

In [55]:
samples = train_ds['sentence'][:4]
tokenized_sents = tokenize(input_sentences=samples, word2ind=word2ind, unk_token=unk_token)
print("Tokenized seq:")
print(tokenized_sents)

padded_sequences = pad_sequence(input_sequences=tokenized_sents, word2ind=word2ind, pad_token='<PAD>')
print('Padded seq:')
print(padded_sequences)

Tokenized seq:
[[1126, 388, 1028, 2125, 1847], [1672, 1087, 810, 1815, 409, 655, 1516, 306, 643], [1320, 966, 2125, 1847, 2123, 545, 843, 1246], [1323, 2144, 610, 1946, 575, 1093, 1624, 1782, 523, 693, 109, 457, 246, 1908, 1338, 810, 1815]]
Padded seq:
[[1126, 388, 1028, 2125, 1847, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1672, 1087, 810, 1815, 409, 655, 1516, 306, 643, 0, 0, 0, 0, 0, 0, 0, 0], [1320, 966, 2125, 1847, 2123, 545, 843, 1246, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1323, 2144, 610, 1946, 575, 1093, 1624, 1782, 523, 693, 109, 457, 246, 1908, 1338, 810, 1815]]


In [None]:
import torch
import torch.nn as nn

class Encoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers=1, dropout_p=0.1, pad_token_id=0):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        # embedding
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_token_id)

        # GRU
        self.gru = nn.LSTM(
            embedding_dim,
            hidden_size,
            num_layers,
            dropout=(0 if num_layers==1 else dropout_p),
            batch_first=True,
            bidirectional=True # bidirection để nắm bắt ngữ nghĩa tốt hơn
        )
        
        self.dropout = dropout_p

    def forward(self, input_seqs):
        embedded = self.dropout(self.embedding(input_seqs))
        # embedded: [B, seq_len, embedding_dim]
        outputs, hidden = self.gru(embedded)

        return outputs, hidden

class SentimentClassifier(nn.Module):
    def __init__(self, num_classes, vocab_size, embedding_dim, hidden_size, num_layers=1, dropout_p=0.1):
        self.encoder = Encoder(vocab_size, embedding_dim, hidden_size, num_layers, dropout_p)
        self.mlp = nn.Sequential(
            nn.Linear(embedding_dim, embedding_dim // 2),
            nn.ReLU(),
            nn.Linear(embedding_dim // 2, num_classes)
        )

    def forward(self, input_seqs):
        features = self.encoder(input_seqs)
        logits = self.mlp(features)
        return torch.softmax(logits)

In [65]:
import torch
from torch.utils.data import DataLoader

generator = torch.Generator().manual_seed(42)
train_dataloader = DataLoader(train_ds, batch_size=4, collate_fn=collate_fn, shuffle=True, num_workers=0, generator=generator)
val_dataloader = DataLoader(val_ds, batch_size=4, collate_fn=collate_fn, shuffle=False, num_workers=0)
test_dataloader = DataLoader(test_ds, batch_size=4, collate_fn=collate_fn, shuffle=False, num_workers=0)

In [68]:
test = nn.Sequential(
    nn.Embedding(num_embeddings=vocab_size, embedding_dim=128, padding_idx=0),
    nn.LSTM(input_size=128, hidden_size=10, num_layers=2, batch_first=True)
)

In [78]:
for batch in train_dataloader:
    seqs, labels = batch
    outputs, (hiddens, cells) = test(seqs)
    print(outputs.shape)
    print(hiddens.shape)
    print(cells.shape)
    break

torch.Size([4, 57, 10])
torch.Size([2, 4, 10])
torch.Size([2, 4, 10])


In [None]:
device = "cpu"
model = SentimentClassifier(
    num_classes=3,
    vocab_size=
)

In [None]:
import torch.nn as nn
import torch.optim as optim

criterion = nn.BCELoss()
optimizer = optim.AdamW(m)