### Thiết lập môi trường và tải dữ liệu

In [1]:
import pandas as pd
import os

data_path = r'/content'
train_path = os.path.join(data_path, 'train.csv')
val_path =  os.path.join(data_path, 'val.csv')
test_path = os.path.join(data_path, 'test.csv')

# Dữ liệu có thể được phân tách bằng tab và không có header
df_train = pd.read_csv(train_path)
df_val = pd.read_csv(val_path)
df_test = pd.read_csv(test_path)
print("Train shape:", df_train.shape)
print("Validation shape:", df_val.shape)
print("Test shape:", df_test.shape)
df_train.head()

Train shape: (8954, 2)
Validation shape: (1076, 2)
Test shape: (1076, 2)


Unnamed: 0,text,category
0,what alarms do i have set right now,alarm_query
1,checkout today alarm of meeting,alarm_query
2,report alarm settings,alarm_query
3,see see for me the alarms that you have set to...,alarm_query
4,is there an alarm for ten am,alarm_query


In [2]:
# label encoding
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df_train['label_encoded'] = label_encoder.fit_transform(df_train['category'])
df_val['label_encoded'] = label_encoder.transform(df_val['category'])
df_test['label_encoded'] = label_encoder.transform(df_test['category'])

num_classes = len(label_encoder.classes_)
print("Number of classes:", num_classes)

Number of classes: 64


### Pipeline TF-IDF + Logistic regression

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report
# 1. Tạo một pipeline với TfidfVectorizer và LogisticRegression
tfidf_lr_pipeline = make_pipeline(
    TfidfVectorizer(max_features=5000),
    LogisticRegression(max_iter=1000)
)
# 2. Huấn luyện pipeline trên tập train
tfidf_lr_pipeline.fit(df_train['text'], df_train['label_encoded'])

# 3. Đánh giá trên tập test
y_pred = tfidf_lr_pipeline.predict(df_test['text'])
print(classification_report(df_test['label_encoded'], y_pred))

              precision    recall  f1-score   support

           0       0.90      0.95      0.92        19
           1       1.00      0.73      0.84        11
           2       0.77      0.89      0.83        19
           3       1.00      0.75      0.86         8
           4       0.92      0.80      0.86        15
           5       0.93      1.00      0.96        13
           6       0.45      0.53      0.49        19
           7       0.89      0.89      0.89        19
           8       0.87      0.68      0.76        19
           9       0.59      0.68      0.63        19
          10       0.67      0.75      0.71         8
          11       0.74      0.89      0.81        19
          12       0.78      0.88      0.82         8
          13       0.83      0.79      0.81        19
          14       0.92      0.63      0.75        19
          15       0.81      0.89      0.85        19
          16       1.00      1.00      1.00        19
          17       1.00    

### Pipeline Word2Vec (mean) + Dense layer

In [3]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m83.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from gensim.models import Word2Vec

# 1. Huấn luyện mô hình Word2Vec trên dữ liệu text của bạn
sentences = [text.split() for text in df_train['text']]
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4, epochs=100)

# 2. Viết hàm để chuyển mỗi câu thành vector trung bình
def sentence_to_avg_vector(text, model):
    # ... (Implement logic)
    vectors = [model.wv[word] for word in text.split() if word in model.wv]
    avg_vector = np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)
    return avg_vector

# 3. Tạo dữ liệu train/val/test X_train_avg, X_val_avg, X_test_avg
X_train_avg = np.array([sentence_to_avg_vector(text, w2v_model) for text in df_train['text']])
X_val_avg = np.array([sentence_to_avg_vector(text, w2v_model) for text in df_val['text']])
X_test_avg = np.array([sentence_to_avg_vector(text, w2v_model) for text in df_test['text']])
y_train = df_train['label_encoded'].values
y_val = df_val['label_encoded'].values
y_test = df_test['label_encoded'].values

# 4. Xây dựng mô hình Sequential của Keras
model = Sequential([
    Dense(128, activation='relu', input_shape=(w2v_model.vector_size,)),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

# 5. Compile, huấn luyện và đánh giá mô hình
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
history = model.fit(
    X_train_avg, y_train,
    validation_data=(X_val_avg, y_val),
    epochs=100, batch_size=32,
    callbacks=[early_stopping]
)
y_pred = model.predict(X_test_avg)
y_pred_classes = np.argmax(y_pred, axis=1)
print(classification_report(y_test, y_pred_classes))

Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - accuracy: 0.1986 - loss: 3.4423 - val_accuracy: 0.6970 - val_loss: 1.4045
Epoch 2/100
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5880 - loss: 1.5604 - val_accuracy: 0.7528 - val_loss: 0.9813
Epoch 3/100
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6679 - loss: 1.2089 - val_accuracy: 0.7695 - val_loss: 0.8538
Epoch 4/100
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6980 - loss: 1.0580 - val_accuracy: 0.7955 - val_loss: 0.7801
Epoch 5/100
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7294 - loss: 0.9479 - val_accuracy: 0.8020 - val_loss: 0.7333
Epoch 6/100
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7447 - loss: 0.9092 - val_accuracy: 0.8048 - val_loss: 0.7016
Epoch 7/100
[1m280/280[0m [32m

In [None]:
# tính toán loss trên tập test
y_pred_proba = model.predict(X_test_avg)
log_loss = -np.mean(np.log(y_pred_proba[np.arange(len(y_test)), y_test]))
print(f"Log Loss on test set: {log_loss}")

[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
Log Loss on test set: 0.6701321601867676


### Mô hình nâng cao (Embedding pre-trained + LSTM)

In [5]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM

# 1. Tiền xử lý cho mô hình chuỗi
# a. Tokenizer: Tạo vocab và chuyển text thành chuỗi chỉ số
vocab_size = len(w2v_model.wv.index_to_key) + 1
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<UNK>")
tokenizer.fit_on_texts(df_train['text'])
train_sequences = tokenizer.texts_to_sequences(df_train['text'])
# b. Padding: Đảm bảo các chuỗi có cùng độ dài
max_len = 100
X_train_pad = pad_sequences(train_sequences, maxlen=max_len, padding='post')
val_sequences = tokenizer.texts_to_sequences(df_val['text'])
X_val_pad = pad_sequences(val_sequences, maxlen=max_len, padding='post')
test_sequences = tokenizer.texts_to_sequences(df_test['text'])
X_test_pad = pad_sequences(test_sequences, maxlen=max_len, padding='post')

# 2. Tạo ma trận trọng số cho Embedding Layer từ Word2Vec
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = w2v_model.vector_size
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]

# 3. Xây dựng mô hình Sequential với LSTM
lstm_model_pretrained = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix], # Khởi tạo trọng số
        input_length=max_len,
        mask_zero=True,
        trainable=False # Đóng băng lớp Embedding
    ),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(num_classes, activation='softmax')
])

# 4. Compile, huấn luyện (sử dụng EarlyStopping) và đánh giá
lstm_model_pretrained.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
history = lstm_model_pretrained.fit(
    X_train_pad, y_train,
    validation_data=(X_val_pad, y_val),
    epochs=100,
    batch_size=32,
    callbacks=[early_stopping]
)



Epoch 1/100
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 349ms/step - accuracy: 0.3748 - loss: 2.9687 - val_accuracy: 0.7686 - val_loss: 0.9300
Epoch 2/100
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 341ms/step - accuracy: 0.7778 - loss: 0.9050 - val_accuracy: 0.8160 - val_loss: 0.6865
Epoch 3/100
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 347ms/step - accuracy: 0.8195 - loss: 0.6701 - val_accuracy: 0.8327 - val_loss: 0.6053
Epoch 4/100
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 346ms/step - accuracy: 0.8510 - loss: 0.5603 - val_accuracy: 0.8401 - val_loss: 0.5527
Epoch 5/100
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 347ms/step - accuracy: 0.8627 - loss: 0.5020 - val_accuracy: 0.8485 - val_loss: 0.5401
Epoch 6/100
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 347ms/step - accuracy: 0.8691 - loss: 0.4608 - val_accuracy: 0.8485 - val_loss: 0.5187
Ep

NameError: name 'classification_report' is not defined

In [6]:
from sklearn.metrics import classification_report
y_pred = lstm_model_pretrained.predict(X_test_pad)
y_pred_classes = np.argmax(y_pred, axis=1)
print(classification_report(y_test, y_pred_classes))

[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 100ms/step
              precision    recall  f1-score   support

           0       1.00      0.95      0.97        19
           1       0.79      1.00      0.88        11
           2       0.89      0.89      0.89        19
           3       0.75      0.75      0.75         8
           4       0.86      0.80      0.83        15
           5       0.75      0.92      0.83        13
           6       0.57      0.63      0.60        19
           7       1.00      0.95      0.97        19
           8       0.89      0.84      0.86        19
           9       0.92      0.58      0.71        19
          10       0.80      1.00      0.89         8
          11       0.82      0.95      0.88        19
          12       0.88      0.88      0.88         8
          13       0.88      0.79      0.83        19
          14       0.71      0.63      0.67        19
          15       0.82      0.95      0.88        19
      

In [7]:
# tính toán loss trên tập test
y_pred_proba = lstm_model_pretrained.predict(X_test_pad)
log_loss = -np.mean(np.log(y_pred_proba[np.arange(len(y_test)), y_test]))
print(f"Log Loss on test set: {log_loss}")

[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 59ms/step
Log Loss on test set: 0.5620812773704529


### Mô hình nâng cao (Embedding học từ đầu + LSTM)

In [8]:
# Dữ liệu đã được tiền xử lý (tokenized, padded) từ nhiệm vụ 3
# 1. Xây dựng mô hình
lstm_model_scratch = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=100, # Chọn một chiều embedding, ví dụ 100
        input_length=max_len,
        mask_zero=True,
        # Không có weights, trainable=True (mặc định)
    ),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(num_classes, activation='softmax')
])

# 2. Compile, huấn luyện và đánh giá mô hình
lstm_model_scratch.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
history = lstm_model_scratch.fit(
    X_train_pad, y_train,
    validation_data=(X_val_pad, y_val),
    epochs=100,
    batch_size=32,
    callbacks=[early_stopping]
)
y_pred = lstm_model_scratch.predict(X_test_pad)
y_pred_classes = np.argmax(y_pred, axis=1)
print(classification_report(y_test, y_pred_classes))

Epoch 1/100




[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 393ms/step - accuracy: 0.1372 - loss: 3.7876 - val_accuracy: 0.5781 - val_loss: 1.8261
Epoch 2/100
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 387ms/step - accuracy: 0.6381 - loss: 1.5643 - val_accuracy: 0.7779 - val_loss: 0.9589
Epoch 3/100
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 384ms/step - accuracy: 0.8259 - loss: 0.7729 - val_accuracy: 0.8262 - val_loss: 0.6904
Epoch 4/100
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 401ms/step - accuracy: 0.9030 - loss: 0.4482 - val_accuracy: 0.8411 - val_loss: 0.5988
Epoch 5/100
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 382ms/step - accuracy: 0.9391 - loss: 0.3049 - val_accuracy: 0.8606 - val_loss: 0.5322
Epoch 6/100
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 386ms/step - accuracy: 0.9520 - loss: 0.2210 - val_accuracy: 0.8559 - val_loss: 0.5248
Epoch 7/10

In [9]:
# tính toán loss trên tập test
y_pred_proba = lstm_model_scratch.predict(X_test_pad)
log_loss = -np.mean(np.log(y_pred_proba[np.arange(len(y_test)), y_test]))
print(f"Log Loss on test set: {log_loss}")

[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 59ms/step
Log Loss on test set: 0.6144747138023376
