In [None]:
pip install -q datasets underthesea scikit-learn pandas joblib

In [38]:
%pip install -q tensorflow

In [47]:
import os
import re
import string
from pathlib import Path

import tensorflow as tf
from tensorflow.keras import layers, losses

In [42]:
if not os.path.exists('data/'):
  os.makedirs('data/')


In [30]:
from datasets import load_dataset

dataset = load_dataset("anotherpolarbear/vietnamese-sentiment-analysis")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['comment', 'label'],
        num_rows: 7786
    })
    test: Dataset({
        features: ['comment', 'label'],
        num_rows: 2224
    })
})


In [31]:
import re
import pandas as pd

def filter_map_labels(split):
  '''
  Only choose range 1–2 as negative (0) and 4–5 as positive (1)
  omitting label 3 to make the binary problem clear.
  '''
  df = pd.DataFrame(split)
  df = df[df['label'] != 3]
  df['label'] = df['label'].apply(lambda x: 1 if x>=4 else 0)
  return df

train_df = filter_map_labels(dataset['train'])
test_df = filter_map_labels(dataset['test'])
print(train_df['label'].value_counts())

def clean_text(text):
  text = text.lower()
  text = re.sub(r'[^a-z0-9à-ỹ\s]', ' ', text)
  text = re.sub(r'\s+', ' ', text).strip()
  return text

train_df['comment'] = train_df['comment'].apply(clean_text)
test_df['comment'] = test_df['comment'].apply(clean_text)
print("Train DataFrame after resetting index:")
display(train_df.head())
print("\nTest DataFrame after resetting index:")
display(test_df.head())

label
1    4936
0    1838
Name: count, dtype: int64
Train DataFrame after resetting index:


Unnamed: 0,comment,label
0,mới mua máy này tại thegioididong thốt nốt cảm...,1
1,pin kém còn lại miễn chê mua 8 3 2019 tình trạ...,1
4,mới mua sài được 1 tháng thấy pin rất trâu sài...,1
5,xài tốt rất mượt pin trâu nếu các bạn để độ sá...,1
6,mình mới xài được 7 tháng xuống 7 pin chả hiểu...,0



Test DataFrame after resetting index:


Unnamed: 0,comment,label
0,điện thoải ổn facelock cực nhanh vân tay ôk mà...,1
1,mình mới mua vivo91c tải ứng dụng games nhanh ...,1
2,xấu đẹp gì ko biết nhưng rất ưng tgdđ phục vụ ...,1
3,màn hình hơi lác khi chơi game game nặng thì m...,1
4,nói chung máy đẹp với màn amoled ổn trong tầm ...,1


In [64]:
review_cols = [c for c in train_df.columns if "comment" in c]
text_col = review_cols[0]
output_root = Path("data/")

def custom_standardization(input_data: tf.Tensor) -> tf.Tensor:
  lowercase = tf.strings.lower(input_data)
  stripped_html = tf.strings.regex_replace(lowercase, "<br />", " ")
  return tf.strings.regex_replace(
      stripped_html, "[%s]" % re.escape(string.punctuation), ""
  )

def write_split(split_df: pd.DataFrame, split_name: str) -> Path:
  # Standardize text for this split and write one review per txt under pos/neg
  standardized = custom_standardization(
      tf.constant(split_df[text_col].astype(str).values)
  ).numpy()
  split_df = split_df.copy()
  split_df[text_col] = [s.decode("utf-8", errors="ignore") for s in standardized]

  split_dir = output_root / split_name
  for label_value, group in split_df.groupby(split_df["label"]):
    if label_value not in [0, 1]:
      continue

    label_dir = split_dir / ("pos" if label_value == 1 else "neg")
    label_dir.mkdir(parents=True, exist_ok=True)

    for idx, text in enumerate(group[text_col], start=1):
      file_path = label_dir / f"{split_name}_{label_value}_{idx}.txt"
      file_path.write_text(str(text).strip(), encoding="utf-8")

  return split_dir

train_dir = write_split(train_df, "train")
test_dir = write_split(test_df, "test")

In [66]:
# Datasets
def setup_dataset(train_dir, test_dir):
    batch_size = 32
    seed = 42
    raw_train_ds = tf.keras.utils.text_dataset_from_directory(
        train_dir,
        batch_size=batch_size,
        validation_split=0.2,
        subset="training",
        seed=seed,
    )
    amount_batches = tf.data.experimental.cardinality(raw_train_ds)
    print(f'Number of batches in the training dataset: {amount_batches}')

    raw_val_ds = tf.keras.utils.text_dataset_from_directory(
        train_dir,
        batch_size=batch_size,
        validation_split=0.2,
        subset="validation",
        seed=seed,
    )
    amount_batches = tf.data.experimental.cardinality(raw_val_ds)
    print(f'Number of batches in the validation dataset: {amount_batches}')

    raw_test_ds = tf.keras.utils.text_dataset_from_directory(
        test_dir,
        batch_size=batch_size,
    )
    amount_batches = tf.data.experimental.cardinality(raw_test_ds)
    print(f'Number of batches in the test dataset: {amount_batches}')

    return raw_train_ds, raw_val_ds, raw_test_ds

raw_train_ds, raw_val_ds, raw_test_ds = setup_dataset(train_dir, test_dir)
text_batch, label_batch = next(iter(raw_train_ds))
print("Length of text batch: ", len(text_batch))


Found 6774 files belonging to 2 classes.
Using 5420 files for training.
Number of batches in the training dataset: 170
Found 6774 files belonging to 2 classes.
Using 1354 files for validation.
Number of batches in the validation dataset: 43
Found 1966 files belonging to 2 classes.
Number of batches in the test dataset: 62
Length of text batch:  32


In [None]:
from sentence_transformers import SentenceTransformer

# Use SBERT for vectorization with UTF-8 decoding to avoid ASCII decode issues
sbert = SentenceTransformer("keepitreal/vietnamese-sbert")
embedding_dim = sbert.get_sentence_embedding_dimension()


def encode_text_batch(text_batch: tf.Tensor) -> tf.Tensor:
    # Decode bytes -> utf-8 (ignore errors) then encode with SBERT
    sentences = [s.decode("utf-8", errors="ignore") for s in text_batch.numpy().tolist()]
    embeddings = sbert.encode(sentences, convert_to_numpy=True, show_progress_bar=False)
    return embeddings.astype("float32")


def vectorize_dataset(dataset: tf.data.Dataset) -> tf.data.Dataset:
    def _map(text, label):
        embeddings = tf.py_function(func=encode_text_batch, inp=[text], Tout=tf.float32)
        embeddings.set_shape((None, embedding_dim))  # batch, embedding_dim
        label = tf.cast(label, tf.float32)
        return embeddings, label

    return dataset.map(_map, num_parallel_calls=tf.data.AUTOTUNE).prefetch(tf.data.AUTOTUNE)


vectorized_train_ds = vectorize_dataset(raw_train_ds)
vectorized_val_ds = vectorize_dataset(raw_val_ds)
vectorized_test_ds = vectorize_dataset(raw_test_ds)

In [70]:
# MLP model on SBERT embeddings (shape uses embedding_dim)
mlp_model = tf.keras.Sequential(
    [
        layers.Input(shape=(embedding_dim,)),
        layers.Dense(256, activation="relu"),
        layers.Dropout(0.3),
        layers.Dense(64, activation="relu"),
        layers.Dropout(0.2),
        layers.Dense(1, activation="sigmoid"),
    ]
)

mlp_model.compile(
    loss=losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(1e-3),
    metrics=["accuracy", tf.keras.metrics.AUC(name="auc")],
)

print(mlp_model.summary())

history = mlp_model.fit(
    vectorized_train_ds,
    validation_data=vectorized_val_ds,
    epochs=10,
    verbose=1,
)

eval_results = mlp_model.evaluate(vectorized_test_ds, verbose=1)
print("Test results (loss, accuracy, auc):", eval_results)

None
Epoch 1/10
[1m170/170[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 261ms/step - accuracy: 0.8186 - auc: 0.8659 - loss: 0.3916 - val_accuracy: 0.8722 - val_auc: 0.9451 - val_loss: 0.2605
Epoch 2/10
[1m170/170[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 241ms/step - accuracy: 0.8777 - auc: 0.9365 - loss: 0.2833 - val_accuracy: 0.8730 - val_auc: 0.9502 - val_loss: 0.2605
Epoch 3/10
[1m170/170[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 237ms/step - accuracy: 0.8893 - auc: 0.9477 - loss: 0.2597 - val_accuracy: 0.8936 - val_auc: 0.9480 - val_loss: 0.2487
Epoch 4/10
[1m170/170[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 241ms/step - accuracy: 0.8954 - auc: 0.9566 - loss: 0.2353 - val_accuracy: 0.8914 - val_auc: 0.9502 - val_loss: 0.2464
Epoch 5/10
[1m170/170[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 250ms/step - accuracy: 0.9028 - auc: 0.9580 - loss: 0.2330 - val_accuracy: 0.8744 - val_auc: 0.9486 - val_loss: 0.2745
Epoch 6/10

In [74]:
os.makedirs("models", exist_ok=True)
model_path = "models/vn_review.keras"
mlp_model.save(model_path)
print(f"Saved model to {model_path}")

sample_texts = [
  "Pin kém còn lại miễn chê mua 8/3/2019 tình trạng pin còn 88% có ai giống tôi không",
  "Mọi người cập nhật phần mềm lại , nó sẽ bớt tốn pin, mình đã thử rồi, mọi thứ cũng ok, nhưng vân tay ko nhạy",
  "Con Iphone X 17 PRO này quá ngon!!!",
  "Mình mua dc 1 tuần máy đang phát trực tiếp tự nhiên tắt nguồn, bắt sống wifi quá yêu, hối hận"
]
sample_embeddings = sbert.encode(sample_texts, convert_to_numpy=True, show_progress_bar=False).astype("float32")
pred_probs = mlp_model.predict(sample_embeddings)
print("Sample predictions (probability of positive):")
for text, prob in zip(sample_texts, pred_probs.flatten()):
    print(f"{prob:.4f}\t{text}")

Saved model to models/vn_review.keras
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
Sample predictions (probability of positive):
0.1196	Pin kém còn lại miễn chê mua 8/3/2019 tình trạng pin còn 88% có ai giống tôi không
1.0000	Mọi người cập nhật phần mềm lại , nó sẽ bớt tốn pin, mình đã thử rồi, mọi thứ cũng ok, nhưng vân tay ko nhạy
0.7347	Con Iphone X 17 PRO này quá ngon!!!
0.0058	Mình mua dc 1 tuần máy đang phát trực tiếp tự nhiên tắt nguồn, bắt sống wifi quá yêu, hối hận
