# Phân loại Văn bản với Mạng Nơ-ron Hồi quy (RNN/LSTM)

# Phần 1: Nền tảng lý thuyết

- Mô hình Bag-of-Words: Biểu diễn mỗi văn bản bằng một vector tần suất từ (TF-IDF), sau đó dùng các thuật toán Machine Learning cổ điển như Logistic Regression, SVM

- Mô hình Word2Vec + Dense Layer: Biểu diễn mỗi từ bằng 1 vector dày đặc, sau đó tính vector trung bình cho cả câu và đưa vào mạng nơ-ron đơn giản.

## Phần 2: Lab thực hành

### Bước 0: Thiết lập Môi trường và Tải dữ liệu

In [1]:
# Lệnh shell để giải nén file
!tar -xzvf ../Data/hwu.tar.gz

x hwu/
x hwu/categories.json
x hwu/train_5.csv
x hwu/train_10.csv
x hwu/val.csv
x hwu/test.csv
x hwu/train.csv


In [36]:
import pandas as pd

# Dữ liệu có thể được phân tách bằng tab và không có header
df_train = pd.read_csv('hwu/train.csv', header=None, names=['text', 'intent'])
df_val = pd.read_csv('hwu/val.csv', header=None, names=['text', 'intent'])
df_test = pd.read_csv('hwu/test.csv', header=None, names=['text', 'intent'])

print("Train shape:", df_train.shape)
print("Validation shape:", df_val.shape)
print("Test shape:", df_test.shape)
df_train.head()

Train shape: (8955, 2)
Validation shape: (1077, 2)
Test shape: (1077, 2)


Unnamed: 0,text,intent
0,text,category
1,what alarms do i have set right now,alarm_query
2,checkout today alarm of meeting,alarm_query
3,report alarm settings,alarm_query
4,see see for me the alarms that you have set to...,alarm_query


In [37]:
from sklearn.preprocessing import LabelEncoder
# ... (Code để fit LabelEncoder trên toàn bộ tập intent và transform các tập train/val/test)
intents = (
    df_train['intent'].tolist() +
    df_val['intent'].tolist() +
    df_test['intent'].tolist()
)
classes = list(set(intents))

label_encoder = LabelEncoder()
label_encoder.fit(intents)
df_train["intent"] = label_encoder.transform(df_train["intent"])
df_val["intent"] = label_encoder.transform(df_val["intent"])
df_test["intent"] = label_encoder.transform(df_test["intent"])
num_classes = len(set(intents))

### Nhiệm vụ: Pipeline TF-IDF + Logistic Regression

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, log_loss

# 1. Tạo một pipeline với TfidfVectorizer và LogisticRegression
tfidf_lr_pipeline = make_pipeline(
    TfidfVectorizer(max_features=5000),
    LogisticRegression(max_iter=1000)
)

# 2. Huấn luyện pipeline trên tập train
tfidf_lr_pipeline.fit(df_train["text"], df_train["intent"])

# 3. Đánh giá trên tập test
y_pred = tfidf_lr_pipeline.predict(df_test["text"])
loss_tfidf = log_loss(df_test["intent"], tfidf_lr_pipeline.predict_proba(df_test["text"]), labels=list(range(num_classes)))

print(classification_report(y_true=df_test["intent"], y_pred=y_pred))
print("Test loss", loss_tfidf)

              precision    recall  f1-score   support

           0       0.90      0.95      0.92        19
           1       1.00      0.73      0.84        11
           2       0.81      0.89      0.85        19
           3       1.00      0.75      0.86         8
           4       0.92      0.80      0.86        15
           5       0.93      1.00      0.96        13
           6       0.48      0.53      0.50        19
           7       0.89      0.89      0.89        19
           8       0.82      0.74      0.78        19
           9       0.00      0.00      0.00         1
          10       0.59      0.68      0.63        19
          11       0.67      0.75      0.71         8
          12       0.74      0.89      0.81        19
          13       0.78      0.88      0.82         8
          14       0.83      0.79      0.81        19
          15       0.92      0.63      0.75        19
          16       0.77      0.89      0.83        19
          17       1.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


### Nhiệm vụ 2: Pipeline Word2Vec (Trung bình) + Dense Layer

In [5]:
import numpy as np
from gensim.models import Word2Vec
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical

In [6]:
# 1. Huấn luyện mô hình Word2Vec trên dữ liệu text của bạn
sentences = [text.split() for text in df_train['text']]
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# 2. Viết hàm để chuyển mỗi câu thành vector trung bình
def sentence_to_avg_vector(text, model):
# ... (Implement logic)
    words = text.split()
    vectors = []
    vectors = [model.wv[word] if word in model.wv else np.zeros(100) for word in words]   
    avg_vector = np.mean(vectors ,axis=0)
    return avg_vector

# 3. Tạo dữ liệu train/val/test X_train_avg, X_val_avg, X_test_avg
X_train_avg = np.array([sentence_to_avg_vector(t, w2v_model) for t in df_train['text']])
X_val_avg = np.array([sentence_to_avg_vector(t, w2v_model) for t in df_val['text']])
X_test_avg = np.array([sentence_to_avg_vector(t, w2v_model) for t in df_test['text']])

y_train = df_train["intent"].values
y_val   = df_val["intent"].values
y_test  = df_test["intent"].values

# 4. Xây dựng mô hình Sequential của Keras
model = Sequential([
    Dense(128, activation='relu', input_shape=(w2v_model.vector_size,)),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

# 5. Compile, huấn luyện và đánh giá mô hình
model.compile(
    optimizer='adam',
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

history = model.fit(
    X_train_avg, y_train,
    validation_data=(X_val_avg, y_val), 
    epochs=100,
    batch_size=32
)

y_pred_probs = model.predict(X_test_avg)
y_pred = np.argmax(y_pred_probs, axis=1)
print(classification_report(y_true=y_test, y_pred=y_pred))
print("Test loss:", model.evaluate(X_test_avg, y_test, verbose=1)[0])

Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.0208 - loss: 4.1538 - val_accuracy: 0.0446 - val_loss: 4.1174
Epoch 2/100
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.0303 - loss: 4.1157 - val_accuracy: 0.0622 - val_loss: 4.0802
Epoch 3/100
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.0450 - loss: 4.0646 - val_accuracy: 0.0687 - val_loss: 4.0065
Epoch 4/100
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.0523 - loss: 3.9845 - val_accuracy: 0.0604 - val_loss: 3.9021
Epoch 5/100
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.0715 - loss: 3.8777 - val_accuracy: 0.0975 - val_loss: 3.7843
Epoch 6/100
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.0823 - loss: 3.7805 - val_accuracy: 0.1049 - val_loss: 3.6840
Epoch 7/100
[1m280/280[0m [32m━

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


### Nhiệm vụ 3: Mô hình Nâng cao (Embedding Pre-trained + LSTM)

In [7]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM
from tensorflow.keras.callbacks import EarlyStopping

# 1. Tiền xử lý cho mô hình chuỗi
# a. Tokenizer: Tạo vocab và chuyển text thành chuỗi chỉ số

tokenizer = Tokenizer(num_words=10000, oov_token="<UNK>")
tokenizer.fit_on_texts(df_train["text"])

# Chuyển text sang sequences
train_sequences = tokenizer.texts_to_sequences(df_train["text"])
val_sequences   = tokenizer.texts_to_sequences(df_val["text"])
test_sequences  = tokenizer.texts_to_sequences(df_test["text"])

# b. Padding: Đảm bảo các chuỗi có cùng độ dài
max_len = 50
X_train_pad = pad_sequences(train_sequences, maxlen=max_len, padding='post')
X_val_pad   = pad_sequences(val_sequences, maxlen=max_len, padding='post')
X_test_pad  = pad_sequences(test_sequences, maxlen=max_len, padding='post')

# 2. Tạo ma trận trọng số cho Embedding Layer từ Word2Vec
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = w2v_model.vector_size
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]

# 3. Xây dựng mô hình Sequential với LSTM
lstm_model_pretrained = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix], # Khởi tạo trọng số
        input_length=max_len,
        trainable=False # Đóng băng lớp Embedding
    ),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(num_classes, activation='softmax')
    ])

# 4. Compile, huấn luyện (sử dụng EarlyStopping) và đánh giá
lstm_model_pretrained.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',  # Nhãn dạng số nguyên
    metrics=['accuracy']
)

early_stop = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True
)

history = lstm_model_pretrained.fit(
    X_train_pad, df_train["intent"].values,
    validation_data=(X_val_pad, df_val["intent"].values),
    epochs=50,
    batch_size=32,
    callbacks=[early_stop]
)

y_pred_probs = lstm_model_pretrained.predict(X_test_pad)
y_pred = np.argmax(y_pred_probs, axis=1)
print(classification_report(y_true=y_test, y_pred=y_pred))
print("Test loss:", lstm_model_pretrained.evaluate(X_test_pad, y_test)[0])

Epoch 1/50




[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 28ms/step - accuracy: 0.0163 - loss: 4.1508 - val_accuracy: 0.0186 - val_loss: 4.1410
Epoch 2/50
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 28ms/step - accuracy: 0.0247 - loss: 4.0748 - val_accuracy: 0.0409 - val_loss: 3.9291
Epoch 3/50
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 25ms/step - accuracy: 0.0424 - loss: 3.9368 - val_accuracy: 0.0511 - val_loss: 3.9190
Epoch 4/50
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 23ms/step - accuracy: 0.0518 - loss: 3.8937 - val_accuracy: 0.0539 - val_loss: 3.8002
Epoch 5/50
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 23ms/step - accuracy: 0.0494 - loss: 3.8615 - val_accuracy: 0.0622 - val_loss: 3.7571
Epoch 6/50
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 23ms/step - accuracy: 0.0554 - loss: 3.8319 - val_accuracy: 0.0594 - val_loss: 3.7388
Epoch 7/50
[1m280/280[0m [32m

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.1105 - loss: 3.4021
Test loss: 3.4021434783935547


### Nhiệm vụ 4: Mô hình Nâng cao (Embedding học từ đầu + LSTM)

In [8]:
# Dữ liệu đã được tiền xử lý (tokenized, padded) từ nhiệm vụ 3
# 1. Xây dựng mô hình
lstm_model_scratch = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=100, # Chọn một chiều embedding, ví dụ 100
        input_length=max_len
        # Không có weights, trainable=True (mặc định)
    ),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(num_classes, activation='softmax')
])
# 2. Compile, huấn luyện và đánh giá mô hình
lstm_model_scratch.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

history = lstm_model_scratch.fit(
    X_train_pad, y_train,
    validation_data=(X_val_pad, y_val),
    epochs=50,
    batch_size=32,
    callbacks=[early_stop]
)

y_pred_probs = lstm_model_scratch.predict(X_test_pad)
y_pred = np.argmax(y_pred_probs, axis=1)
print(classification_report(y_test, y_pred))
print("Test loss:", lstm_model_scratch.evaluate(X_test_pad, y_test, verbose=1))

Epoch 1/50




[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 28ms/step - accuracy: 0.0168 - loss: 4.1506 - val_accuracy: 0.0176 - val_loss: 4.1339
Epoch 2/50
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 26ms/step - accuracy: 0.0163 - loss: 4.1385 - val_accuracy: 0.0176 - val_loss: 4.1312
Epoch 3/50
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 25ms/step - accuracy: 0.0143 - loss: 4.1368 - val_accuracy: 0.0176 - val_loss: 4.1331
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        19
           1       0.00      0.00      0.00        11
           2       0.00      0.00      0.00        19
           3       0.00      0.00      0.00         8
           4       0.00      0.00      0.00        15
           5       0.00      0.00      0.00        13
           6       0.00      0.00      0.00        19
           7  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.0176 - loss: 4.1339
Test loss: [4.133893966674805, 0.017641596496105194]


### Nhiệm vụ 5: Đánh giá, So sánh và Phân tích

In [38]:
# Câu kiểm tra
test_sentences = [
    "can you remind me to not call my mom",
    "is it going to be sunny or rainy tomorrow",
    "find a flight from new york to london but not through paris"
]

# 1. TF-IDF + Logistic Regression
pred_tfidf = tfidf_lr_pipeline.predict(test_sentences)

# 2. Word2Vec Avg + Dense
def sentence_to_avg_vector(text, model):
    words = text.split()
    vectors = [model.wv[word] if word in model.wv else np.zeros(model.vector_size) for word in words]
    return np.mean(vectors, axis=0)

X_w2v = np.array([sentence_to_avg_vector(s, w2v_model) for s in test_sentences])
pred_w2v = np.argmax(model.predict(X_w2v), axis=1)

# 3. LSTM pretrained embedding

sequences = tokenizer.texts_to_sequences(test_sentences)
X_pad = pad_sequences(sequences, maxlen=max_len, padding='post')
pred_lstm_pretrained = np.argmax(lstm_model_pretrained.predict(X_pad), axis=1)

# 4. LSTM scratch
pred_lstm_scratch = np.argmax(lstm_model_scratch.predict(X_pad), axis=1)

# In kết quả
for i, sent in enumerate(test_sentences):
    print(f"Sentence: {sent}")
    print(f"TF-IDF + LR: {classes[pred_tfidf[i]]}")
    print(f"Word2Vec Avg + Dense: {classes[pred_w2v[i]]}")
    print(f"LSTM Pretrained: {classes[pred_lstm_pretrained[i]]}")
    print(f"LSTM Scratch: {classes[pred_lstm_scratch[i]]}")
    print("#"*70)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
Sentence: can you remind me to not call my mom
TF-IDF + LR: play_podcasts
Word2Vec Avg + Dense: datetime_query
LSTM Pretrained: iot_hue_lightoff
LSTM Scratch: transport_taxi
######################################################################
Sentence: is it going to be sunny or rainy tomorrow
TF-IDF + LR: iot_wemo_on
Word2Vec Avg + Dense: alarm_remove
LSTM Pretrained: iot_hue_lightchange
LSTM Scratch: transport_taxi
######################################################################
Sentence: find a flight from new york to london but not through paris
TF-IDF + LR: cooking_recipe
Word2Vec Avg + Dense: general_praise
LSTM Pretrained: general_repeat
LSTM Scratch: transport_taxi
######################################################################
