In [39]:
import pandas as pd
from pyvi import ViTokenizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

In [40]:
train_df = pd.read_csv(r'./cleaned_data/train.csv', quotechar='"', encoding='utf-8')
valid_df = pd.read_csv(r'./cleaned_data/valid.csv', quotechar='"', encoding='utf-8')
test_df = pd.read_csv(r'./cleaned_data/test.csv', quotechar='"', encoding='utf-8')

In [41]:
df = pd.concat([train_df, valid_df, test_df], ignore_index=True)
sentences = df['sentence'].values
labels = df['label'].values

In [42]:
def tokenize_vietnamese(text):
    return ' '.join(ViTokenizer.tokenize(text).split())

In [43]:
from sklearn.feature_extraction.text import CountVectorizer

tokenized_sentences = [tokenize_vietnamese(sentence) for sentence in sentences]
count_vectorizer = CountVectorizer(max_features=500)
X = count_vectorizer.fit_transform(tokenized_sentences).toarray()

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

In [38]:
from sklearn.naive_bayes import MultinomialNB

nb_model = MultinomialNB(alpha=1, fit_prior=False)
nb_model.fit(X_train, y_train)

y_pred_nb = nb_model.predict(X_test)
print("\nNaive Bayes Test Accuracy:", accuracy_score(y_test, y_pred_nb))
print("\nNaive Bayes Classification Report:\n", classification_report(y_test, y_pred_nb))


Naive Bayes Test Accuracy: 0.8251311012505043

Naive Bayes Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.78      0.82      4993
           1       0.79      0.88      0.83      4923

    accuracy                           0.83      9916
   macro avg       0.83      0.83      0.82      9916
weighted avg       0.83      0.83      0.82      9916



In [30]:
def predict_sentiment(sentence, vectorizer, model):
    tokenized = tokenize_vietnamese(sentence)
    vector = vectorizer.transform([tokenized]).toarray()
    prediction = model.predict(vector)
    return prediction[0]

new_sentence = "Đồ ăn đóng gói cẩn thận nhưng không ngon lắm."
print("\nSentiment of new sentence:", predict_sentiment(new_sentence, count_vectorizer, nb_model))


Sentiment of new sentence: 1
