In [167]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split
import joblib
from sklearn.linear_model import LogisticRegression
from sentence_transformers import SentenceTransformer
from sklearn.metrics import accuracy_score

In [156]:
def normalize(text):
    text = text.lower()
    # text = re.sub(r"[^0-9a-zàáâãèéêìíòóôõùúăđĩũơưỳýỷỹạảấầẩẫậắằẳẵặẹẻẽềểễệọỏốồổỗộớờởỡợụủứừửữự ]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

Tiền sử lý dữ liệu

In [None]:
# dataset = pd.read_json("./dataset.json")
dataset = pd.read_json("./dataset.json")
training_data = []  
responses_map = {}

for item in dataset.to_dict('records'):
    tag = item["tag"]
    patterns = item['patterns']
    responses_map[tag] = item['responses']
    for pattern in patterns:
        training_data.append({'text': normalize(pattern), 'tags': tag})
df = pd.DataFrame(training_data)
y = df['tags']
X= df['text']

Train Test Split

In [172]:
X_train, X_test, y_train, y_test = train_test_split(
        X, y, 
        test_size=0.2,
        random_state=42,
        stratify=y 
    )

In [200]:
word_test =  "xin chào"

# Mô hình ngữ nghĩa (embedding-based models)

In [158]:
# Load embedding model
emb = SentenceTransformer("keepitreal/vietnamese-sbert")

In [182]:
X_train_emb = emb.encode(list(X_train), normalize_embeddings=True)
X_test_emb = emb.encode(list(X_test), normalize_embeddings=True)

In [207]:
clf = LogisticRegression(max_iter=1000, class_weight='balanced')
clf.fit(X_train_emb, y_train)

In [208]:
y_pred = clf.predict(X_test_emb)
acc = accuracy_score(y_test, y_pred)
print("Độ chính xác:", acc)

Độ chính xác: 0.6974789915966386


Kiểm tra

In [227]:
test_emb = emb.encode(list(word_test), normalize_embeddings=True)

single_prediction = clf.predict(test_emb)
single_proba = clf.predict_proba(test_emb)

tag = single_prediction[0]
confidence = single_proba.max()

print(f"Câu: '{word_test}'")
print(f"➡️ Dự đoán Tag: {tag}")
print(f"➡️ Độ tự tin: {confidence:.2%}")

Câu: 'xin chào'
➡️ Dự đoán Tag: cam_on
➡️ Độ tự tin: 23.40%


# Thuật toán rừng ngẫu nhiên

In [210]:
vectorizer = TfidfVectorizer(
        analyzer='char_wb',
        ngram_range=(1, 3), 
        min_df=1,
        max_features=40000,
        lowercase=True,
        sublinear_tf=True 
    )
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [211]:
model = RandomForestClassifier(
        n_estimators=150,        
        max_depth=20,            
        min_samples_split=6,    
        min_samples_leaf=2,      
        max_features='sqrt',
        bootstrap=True,
        oob_score=True,
        random_state=42,
        class_weight='balanced',
        n_jobs=-1
    )

In [212]:
model.fit(X_train_vectorized, y_train)

# Đánh giá
train_accuracy = model.score(X_train_vectorized, y_train)
test_accuracy = model.score(X_test_vectorized, y_test)

print(f"Train Accuracy: {train_accuracy:.2f}")
print(f"Test Accuracy: {test_accuracy:.2f}")

Train Accuracy: 0.95
Test Accuracy: 0.76


Kiểm tra

In [None]:
test_rdf = vectorizer.transform(list(normalize(word_test)))
predicted_tag = model.predict(test_rdf)
predicted_proba = model.predict_proba(test_rdf)

tag_rdf = predicted_tag[0]      
confidence_rdf = predicted_proba.max()

print(f"Câu: '{word_test}'")
print(f"➡️ Dự đoán Tag: {tag_rdf}")
print(f"➡️ Độ tự tin: {confidence_rdf:.2%}")

Câu: 'xin chào'
➡️ Dự đoán Tag: chao_hoi
➡️ Độ tự tin: 17.71%


In [213]:
model_package = {
    'vectorizer': vectorizer,
    'model': model,
    'responses_map': responses_map,
    'classes': model.classes_,
    'training_info': {
        'train_accuracy': model.score(X_train_vectorized, y_train),
        'test_accuracy': model.score(X_test_vectorized, y_test),
        'timestamp': pd.Timestamp.now()
    }
}
joblib.dump(model_package, 'chatbot_model.pkl')

['chatbot_model.pkl']

# Kiểm tra model

In [None]:
import joblib
import random
# 1. TẢI MODEL
model_data = joblib.load('chatbot_model.pkl')

# 2. HÀM GỌI NHANH
def quick_chat(text):
    vec = model_data['vectorizer']
    model = model_data['model']
    responses = model_data['responses_map']
    
    # Predict
    text_vec = vec.transform([text.lower()])
    tag = model.predict(text_vec)[0]
    conf = model.predict_proba(text_vec).max()
    print(conf)
    
    # Get response
    if tag in responses and conf > 0.1:
        return random.choice(responses[tag])
    else:
        return "Xin lỗi, tôi không hiểu."

# 3. SỬ DỤNG
word = "điểm thi"
print(quick_chat(word))

0.37092047400179845
Bạn đăng nhập vào cổng thông tin để tra cứu điểm cá nhân nhé.
