In [37]:
import pandas as pd
import numpy as np
from pyvi import ViTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV

In [38]:
train_df = pd.read_csv(r'./cleaned_data/train.csv', quotechar='"', encoding='utf-8')
valid_df = pd.read_csv(r'./cleaned_data/valid.csv', quotechar='"', encoding='utf-8')
test_df = pd.read_csv(r'./cleaned_data/test.csv', quotechar='"', encoding='utf-8')

In [39]:
df = pd.concat([train_df, valid_df, test_df], ignore_index=True)
sentences = df['sentence'].values
labels = df['label'].values

In [40]:
def tokenize_vietnamese(text):
    return ' '.join(ViTokenizer.tokenize(text).split())

In [41]:
tokenized_sentences = [tokenize_vietnamese(sentence) for sentence in sentences]

tfidf_vectorizer = TfidfVectorizer(max_features=500)
X = tfidf_vectorizer.fit_transform(tokenized_sentences).toarray()

In [42]:
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

In [43]:
best_model = SVC(C=1, kernel='linear', gamma='scale', random_state=42)
best_model.fit(X_train, y_train)

y_pred = best_model.predict(X_test)
print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Test Accuracy: 0.8698063735377168

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.86      0.87      4993
           1       0.86      0.88      0.87      4923

    accuracy                           0.87      9916
   macro avg       0.87      0.87      0.87      9916
weighted avg       0.87      0.87      0.87      9916



In [44]:
def predict_sentiment(sentence, vectorizer, model):
    tokenized = tokenize_vietnamese(sentence)
    vector = vectorizer.transform([tokenized]).toarray()
    prediction = model.predict(vector)
    return prediction[0]

# Ví dụ dự đoán một câu
new_sentence = "Sản phẩm này rất tốt và dễ sử dụng!"
print("\nSentiment of new sentence:", predict_sentiment(new_sentence, tfidf_vectorizer, best_model))


Sentiment of new sentence: 1
