### Thiết lập môi trường và tải dữ liệu

In [None]:
import pandas as pd
import os

# Resolve the three CSV splits under the working directory.
data_path = r'/content'
train_path, val_path, test_path = (
    os.path.join(data_path, file_name)
    for file_name in ('train.csv', 'val.csv', 'test.csv')
)

# The files are read with the pandas defaults (comma separator, first row
# used as the header).
df_train, df_val, df_test = (
    pd.read_csv(csv_path) for csv_path in (train_path, val_path, test_path)
)

# Report the size of every split, then preview the training data.
for split_name, split_frame in (
    ("Train", df_train),
    ("Validation", df_val),
    ("Test", df_test),
):
    print(f"{split_name} shape:", split_frame.shape)
df_train.head()

Train shape: (8954, 2)
Validation shape: (1076, 2)
Test shape: (1076, 2)


Unnamed: 0,text,category
0,what alarms do i have set right now,alarm_query
1,checkout today alarm of meeting,alarm_query
2,report alarm settings,alarm_query
3,see see for me the alarms that you have set to...,alarm_query
4,is there an alarm for ten am,alarm_query


In [None]:
# label encoding
from sklearn.preprocessing import LabelEncoder

# Learn the category -> integer mapping from the training split only, then
# apply that same mapping to every split so the ids agree across splits.
label_encoder = LabelEncoder()
label_encoder.fit(df_train['category'])
for split_frame in (df_train, df_val, df_test):
    split_frame['label_encoded'] = label_encoder.transform(split_frame['category'])

# One output unit per distinct category seen in the training split.
num_classes = len(label_encoder.classes_)
print("Number of classes:", num_classes)

Number of classes: 64


### Pipeline TF-IDF + Logistic regression

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report
# 1. Build a pipeline: TF-IDF features (vocabulary capped at 5000 terms)
#    feeding a logistic-regression classifier.
tfidf_lr_pipeline = make_pipeline(
    TfidfVectorizer(max_features=5000),
    LogisticRegression(max_iter=1000)
)
# 2. Fit the pipeline on the training split.
tfidf_lr_pipeline.fit(df_train['text'], df_train['label_encoded'])

# 3. Evaluate on the test split.
y_pred = tfidf_lr_pipeline.predict(df_test['text'])
# Report per-class scores under the original category names instead of the
# opaque integer ids. `labels` is passed explicitly so it always lines up with
# `target_names`, even if some class is absent from the test split, and
# `zero_division=0` keeps such classes from raising warnings.
print(classification_report(
    df_test['label_encoded'],
    y_pred,
    labels=list(range(len(label_encoder.classes_))),
    target_names=label_encoder.classes_,
    zero_division=0,
))

              precision    recall  f1-score   support

           0       0.90      0.95      0.92        19
           1       1.00      0.73      0.84        11
           2       0.77      0.89      0.83        19
           3       1.00      0.75      0.86         8
           4       0.92      0.80      0.86        15
           5       0.93      1.00      0.96        13
           6       0.45      0.53      0.49        19
           7       0.89      0.89      0.89        19
           8       0.87      0.68      0.76        19
           9       0.59      0.68      0.63        19
          10       0.67      0.75      0.71         8
          11       0.74      0.89      0.81        19
          12       0.78      0.88      0.82         8
          13       0.83      0.79      0.81        19
          14       0.92      0.63      0.75        19
          15       0.81      0.89      0.85        19
          16       1.00      1.00      1.00        19
          17       1.00    

### Pipeline Word2Vec (mean) + Dense layer

In [None]:
import numpy as np
from gensim.models import Word2Vec
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# 1. Train a Word2Vec model on the training text (whitespace tokenisation).
W2V_SEED = 42    # fixed seed so the embeddings can be reproduced
W2V_EPOCHS = 20  # tunable; more passes than the gensim default for this small corpus

sentences = [text.split() for text in df_train['text']]
# workers=1 together with a fixed seed removes thread-scheduling
# nondeterminism from training.
# NOTE(review): gensim additionally asks for PYTHONHASHSEED to be set for
# fully deterministic runs - confirm if exact reproducibility is required.
w2v_model = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=1,
    seed=W2V_SEED,
    epochs=W2V_EPOCHS,
)

# 2. Helper that turns a sentence into the mean of its word vectors.
def sentence_to_avg_vector(text, model):
    """Return the average word vector of ``text``.

    Args:
        text: Sentence to embed; it is tokenised with ``str.split()``.
        model: Object exposing ``wv`` (supports ``word in wv`` and
            ``wv[word]``) and ``vector_size``, e.g. a trained Word2Vec model.

    Returns:
        A 1-D float32 array of length ``model.vector_size``: the element-wise
        mean of the vectors of all in-vocabulary tokens, or all zeros when no
        token of ``text`` is in the vocabulary.
    """
    vectors = [model.wv[word] for word in text.split() if word in model.wv]
    if not vectors:
        # No known token: fall back to a zero vector. float32 keeps the dtype
        # identical to the normal path so stacking sentences never upcasts.
        return np.zeros(model.vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0).astype(np.float32, copy=False)

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and
# batch shuffling are repeatable from run to run.
tf.keras.utils.set_random_seed(42)

# 3. Build the train/val/test feature matrices: one averaged vector per row.
#    dtype is pinned so every split has the same float32 layout regardless of
#    what the per-sentence helper returns for out-of-vocabulary sentences.
X_train_avg = np.array(
    [sentence_to_avg_vector(text, w2v_model) for text in df_train['text']],
    dtype=np.float32,
)
X_val_avg = np.array(
    [sentence_to_avg_vector(text, w2v_model) for text in df_val['text']],
    dtype=np.float32,
)
X_test_avg = np.array(
    [sentence_to_avg_vector(text, w2v_model) for text in df_test['text']],
    dtype=np.float32,
)
y_train = df_train['label_encoded'].values
y_val = df_val['label_encoded'].values
y_test = df_test['label_encoded'].values

# 4. Build the Keras Sequential model. The input shape is declared with an
#    explicit Input object rather than `input_shape=` on the first Dense
#    layer, which recent Keras versions warn about.
model = Sequential([
    tf.keras.Input(shape=(w2v_model.vector_size,)),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

# 5. Compile, train and evaluate. Labels are integer class ids, hence the
#    sparse categorical cross-entropy loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
history = model.fit(X_train_avg, y_train, validation_data=(X_val_avg, y_val), epochs=50, batch_size=32)
test_loss, test_acc = model.evaluate(X_test_avg, y_test)
print(f"Test accuracy: {test_acc}")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 11ms/step - accuracy: 0.0171 - loss: 4.1596 - val_accuracy: 0.0372 - val_loss: 4.0762
Epoch 2/50
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.0447 - loss: 4.0638 - val_accuracy: 0.0651 - val_loss: 3.9617
Epoch 3/50
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.0478 - loss: 3.9577 - val_accuracy: 0.0725 - val_loss: 3.8432
Epoch 4/50
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.0638 - loss: 3.8479 - val_accuracy: 0.0799 - val_loss: 3.7579
Epoch 5/50
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.0703 - loss: 3.7833 - val_accuracy: 0.0836 - val_loss: 3.6878
Epoch 6/50
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.0767 - loss: 3.7114 - val_accuracy: 0.1078 - val_loss: 3.6195
Epoch 7/50
[1m280/280[0m 

### Mô hình nâng cao (Embedding pre-trained + LSTM)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM

# 1. Preprocessing for the sequence models
# a. Tokenizer: build the vocabulary on the training text and map each text
#    to a sequence of integer word ids. No `num_words` cap is set: the cap
#    used to be derived from the Word2Vec vocabulary size, which is unrelated
#    to this tokenizer's own index (id 1 is taken by the OOV token) and could
#    silently map the rarest word to <UNK>.
tokenizer = Tokenizer(oov_token="<UNK>")
tokenizer.fit_on_texts(df_train['text'])
train_sequences = tokenizer.texts_to_sequences(df_train['text'])
val_sequences = tokenizer.texts_to_sequences(df_val['text'])
test_sequences = tokenizer.texts_to_sequences(df_test['text'])

# b. Padding: bring every sequence to the same length. Shorter sequences are
#    filled with 0 at the end ('post'); id 0 is never assigned to a word.
max_len = 50
X_train_pad = pad_sequences(train_sequences, maxlen=max_len, padding='post')
X_val_pad = pad_sequences(val_sequences, maxlen=max_len, padding='post')
X_test_pad = pad_sequences(test_sequences, maxlen=max_len, padding='post')

# 2. Build the Embedding-layer weight matrix from the Word2Vec vectors.
#    Row i holds the vector of the word whose tokenizer id is i; rows for
#    words missing from Word2Vec (and row 0) stay all-zero.
embedding_dim = w2v_model.vector_size
vocab_size = 1 + len(tokenizer.word_index)
embedding_matrix = np.zeros((vocab_size, embedding_dim))
known_words = (
    (token, token_id)
    for token, token_id in tokenizer.word_index.items()
    if token in w2v_model.wv
)
for token, token_id in known_words:
    embedding_matrix[token_id] = w2v_model.wv[token]

# Seed Python, NumPy and TensorFlow so initialisation, dropout and shuffling
# are repeatable from run to run.
tf.keras.utils.set_random_seed(42)

# 3. Build the Sequential model with an LSTM on top of frozen embeddings.
lstm_model_pretrained = Sequential([
    # Explicit input spec; replaces the deprecated `input_length` argument.
    tf.keras.Input(shape=(max_len,), dtype="int32"),
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix],  # initialise from the Word2Vec matrix
        # Inputs are zero-padded up to max_len. Masking id 0 makes the LSTM
        # skip the pad steps, so its final state comes from the last real
        # token instead of from a long run of padding.
        mask_zero=True,
        trainable=False  # freeze the embedding layer
    ),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(num_classes, activation='softmax')
])

# 4. Compile, train (with EarlyStopping on validation loss) and evaluate.
lstm_model_pretrained.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history = lstm_model_pretrained.fit(
    X_train_pad, y_train,
    validation_data=(X_val_pad, y_val),
    epochs=50,
    batch_size=32,
    callbacks=[early_stopping]
)
test_loss, test_acc = lstm_model_pretrained.evaluate(X_test_pad, y_test)
print(f"Test accuracy with LSTM and pretrained embeddings: {test_acc}")

Epoch 1/50




[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 147ms/step - accuracy: 0.0171 - loss: 4.1441 - val_accuracy: 0.0344 - val_loss: 4.0009
Epoch 2/50
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 146ms/step - accuracy: 0.0397 - loss: 4.0113 - val_accuracy: 0.0465 - val_loss: 3.8727
Epoch 3/50
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 144ms/step - accuracy: 0.0408 - loss: 3.9230 - val_accuracy: 0.0511 - val_loss: 3.8630
Epoch 4/50
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 146ms/step - accuracy: 0.0505 - loss: 3.8627 - val_accuracy: 0.0623 - val_loss: 3.7638
Epoch 5/50
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 142ms/step - accuracy: 0.0473 - loss: 3.8253 - val_accuracy: 0.0716 - val_loss: 3.7439
Epoch 6/50
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 142ms/step - accuracy: 0.0635 - loss: 3.8074 - val_accuracy: 0.0725 - val_loss: 3.7206
Epoch 7/50
[1m280/28

### Mô hình nâng cao (Embedding học từ đầu + LSTM)

In [None]:
# Reuses the tokenised, padded inputs (X_*_pad), vocab_size and max_len
# produced by the preprocessing for the previous sequence model.

# Seed Python, NumPy and TensorFlow so initialisation, dropout and shuffling
# are repeatable from run to run.
tf.keras.utils.set_random_seed(42)

# 1. Build the model: embeddings are learned from scratch (no initial
#    weights; trainable by default).
lstm_model_scratch = Sequential([
    # Explicit input spec; replaces the deprecated `input_length` argument.
    tf.keras.Input(shape=(max_len,), dtype="int32"),
    Embedding(
        input_dim=vocab_size,
        output_dim=100,  # embedding dimension
        # Inputs are zero-padded up to max_len. Masking id 0 makes the LSTM
        # skip the pad steps, so its final state comes from the last real
        # token instead of from a long run of padding.
        mask_zero=True
    ),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(num_classes, activation='softmax')
])

# 2. Compile, train (with EarlyStopping on validation loss) and evaluate.
lstm_model_scratch.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history = lstm_model_scratch.fit(
    X_train_pad, y_train,
    validation_data=(X_val_pad, y_val),
    epochs=50,
    batch_size=16,
    callbacks=[early_stopping]
)
test_loss, test_acc = lstm_model_scratch.evaluate(X_test_pad, y_test)
print(f"Test accuracy with LSTM and embeddings learned from scratch: {test_acc}")

Epoch 1/50
[1m560/560[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m90s[0m 155ms/step - accuracy: 0.0175 - loss: 4.1488 - val_accuracy: 0.0177 - val_loss: 4.1319
Epoch 2/50
[1m560/560[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 157ms/step - accuracy: 0.0159 - loss: 4.1350 - val_accuracy: 0.0177 - val_loss: 4.1268
Epoch 3/50
[1m560/560[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m88s[0m 156ms/step - accuracy: 0.0150 - loss: 4.1346 - val_accuracy: 0.0177 - val_loss: 4.1268
Epoch 4/50
[1m560/560[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 159ms/step - accuracy: 0.0171 - loss: 4.1352 - val_accuracy: 0.0177 - val_loss: 4.1266
Epoch 5/50
[1m560/560[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m88s[0m 157ms/step - accuracy: 0.0173 - loss: 4.1361 - val_accuracy: 0.0177 - val_loss: 4.1263
Epoch 6/50
[1m560/560[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m88s[0m 157ms/step - accuracy: 0.0159 - loss: 4.1333 - val_accuracy: 0.0177 - val_loss: 4.1254
Epoch 7/5