# BUILD

### Import libraries

In [None]:
import json
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, GRU, Dense, Attention, Concatenate
from tensorflow.keras.models import Model
import torch
from transformers import AutoTokenizer
from tensorflow.keras.layers import Concatenate
import json

### Load dataset

In [None]:
from google.colab import drive

# Mount Google Drive; the paths used later in this notebook live under it.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Read the training set from Drive and parse it as JSON.
TRAIN_JSON_PATH = "/content/drive/MyDrive/model_build/train.json"
with open(TRAIN_JSON_PATH, mode="r", encoding="utf-8") as train_file:
    dataset = json.loads(train_file.read())

### Tokenize

In [None]:
# Load the PhoBERT tokenizer by checkpoint name
PHOBERT_CHECKPOINT = "vinai/phobert-base"
tokenizer = AutoTokenizer.from_pretrained(PHOBERT_CHECKPOINT)

# Tokenization helper
def tokenize_data(data, tokenizer, max_length=50):
    """Encode texts into a fixed-width batch of token ids.

    All texts are passed to the tokenizer in a single batched call (instead
    of one call per sample followed by a concat) and are padded/truncated to
    ``max_length``.

    Args:
        data: Iterable of strings to encode.
        tokenizer: Callable tokenizer that returns a mapping with an
            ``"input_ids"`` entry when called with ``return_tensors="tf"``.
        max_length: Number of token ids per row after padding/truncation.

    Returns:
        The ``"input_ids"`` tensor with one row per text. When ``data`` is
        empty, an empty ``(0, max_length)`` tensor is returned.
    """
    # Materialize once so generators/iterators are accepted as well as lists.
    texts = list(data)

    if not texts:
        # Nothing to encode; return an empty batch rather than failing.
        # NOTE(review): int32 is assumed to match the tokenizer's TF output
        # dtype -- confirm if the empty case matters downstream.
        return tf.zeros((0, max_length), dtype=tf.int32)

    encoded = tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="tf",
    )
    return encoded["input_ids"]

# Tokenize the prompts (encoder side) and the responses (decoder side)
prompt_texts = [sample["prompt"] for sample in dataset]
input_ids = tokenize_data(prompt_texts, tokenizer)

response_texts = [sample["response"] for sample in dataset]
target_ids = tokenize_data(response_texts, tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.13M [00:00<?, ?B/s]

In [None]:
import numpy as np

# Convert TensorFlow tensors to NumPy arrays; anything else passes through
# untouched.
input_ids_np, target_ids_np = (
    ids.numpy() if isinstance(ids, tf.Tensor) else ids
    for ids in (input_ids, target_ids)
)

### Lớp Seq2Seq với Attention

In [None]:

class Seq2SeqAttention(tf.keras.Model):
    """LSTM encoder-decoder with attention over the encoder outputs.

    A single embedding layer is shared by encoder and decoder inputs. The
    decoder LSTM is initialised with the encoder's final states; its outputs
    query the encoder outputs through ``tf.keras.layers.Attention`` and the
    attended result is concatenated with the decoder outputs before a
    softmax projection over the vocabulary.

    Args:
        vocab_size: Size of the embedding table and of the output layer.
        embedding_dim: Width of the token embeddings.
        hidden_size: Number of units in each LSTM.
        **kwargs: Forwarded to ``tf.keras.Model``.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size, **kwargs):
        super().__init__(**kwargs)

        # Both LSTMs return the full output sequence plus their final states.
        lstm_options = dict(return_sequences=True, return_state=True)

        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.encoder_lstm = tf.keras.layers.LSTM(hidden_size, **lstm_options)
        self.decoder_lstm = tf.keras.layers.LSTM(hidden_size, **lstm_options)
        self.attention = tf.keras.layers.Attention()
        self.dense = tf.keras.layers.Dense(vocab_size, activation="softmax")

        # Keep the constructor arguments so get_config() can report them.
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_size = hidden_size

    def call(self, inputs):
        """Run the forward pass.

        Args:
            inputs: A pair ``(encoder_inputs, decoder_inputs)`` of token-id
                batches.

        Returns:
            The output of the final softmax Dense layer, one distribution
            over the vocabulary per decoder position.
        """
        source_tokens, target_tokens = inputs

        # Encoder: embed, then keep both the sequence and the final states.
        enc_seq, enc_h, enc_c = self.encoder_lstm(self.embedding(source_tokens))

        # Decoder: starts from the encoder's final hidden/cell state.
        dec_seq, _, _ = self.decoder_lstm(
            self.embedding(target_tokens), initial_state=[enc_h, enc_c]
        )

        # Attention inputs are [query, value, key]: the decoder outputs query
        # the encoder outputs.
        attended = self.attention([dec_seq, enc_seq, enc_seq])
        merged = tf.concat([dec_seq, attended], axis=-1)

        return self.dense(merged)

    def get_config(self):
        """Return the base config extended with the constructor arguments."""
        base_config = super().get_config()
        return {
            **base_config,
            "vocab_size": self.vocab_size,
            "embedding_dim": self.embedding_dim,
            "hidden_size": self.hidden_size,
        }

    @classmethod
    def from_config(cls, config):
        """Rebuild an instance by passing ``config`` entries as keyword arguments."""
        return cls(**config)



### Training

In [None]:
# Model hyperparameters
VOCAB_SIZE = tokenizer.vocab_size
EMBEDDING_DIM = 256
HIDDEN_SIZE = 512

# Build and compile the model
model = Seq2SeqAttention(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_SIZE)
model.compile(
    optimizer="adam",
    # Seq2SeqAttention's final Dense layer applies softmax, so the loss
    # receives probabilities, not logits. from_logits must be False to match
    # (from_logits=True made Keras emit a warning during fit).
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    metrics=["accuracy"]
)

In [None]:
# Sanity-check the data shapes before training
for _label, _array in (("input_ids_np", input_ids_np), ("target_ids_np", target_ids_np)):
    print(f"{_label} shape:", _array.shape)

input_ids_np shape: (5006, 50)
target_ids_np shape: (5006, 50)


In [None]:
# Teacher forcing: the decoder sees the target without its last token and is
# trained to predict the same sequence shifted left by one position.
decoder_inputs = target_ids_np[:, :-1]   # drop the last token
decoder_targets = target_ids_np[:, 1:]   # shifted by one step

# Per-timestep loss weights: 0 on padding positions, 1 elsewhere, so the
# padded tail of each response does not contribute to the loss.
# Note: the "accuracy" entry in `metrics` is not weighted by sample_weight,
# so the reported accuracy still counts padding positions.
target_weights = (decoder_targets != tokenizer.pad_token_id).astype("float32")

model.fit(
    [input_ids_np, decoder_inputs],
    decoder_targets,
    sample_weight=target_weights,
    batch_size=16,
    epochs=20
)

Epoch 1/20


  output, from_logits = _get_logits(


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 248ms/step - accuracy: 0.4816 - loss: 3.9081
Epoch 2/20
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 256ms/step - accuracy: 0.6421 - loss: 1.9695
Epoch 3/20
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 260ms/step - accuracy: 0.7606 - loss: 1.2463
Epoch 4/20
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 260ms/step - accuracy: 0.8039 - loss: 0.9342
Epoch 5/20
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 260ms/step - accuracy: 0.8242 - loss: 0.7889
Epoch 6/20
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 260ms/step - accuracy: 0.8425 - loss: 0.6713
Epoch 7/20
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 260ms/step - accuracy: 0.8526 - loss: 0.6111
Epoch 8/20
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 260ms/step - accuracy: 0.8641 - loss: 0.5348
Epoch 9/20
[1m313/313[0m 

<keras.src.callbacks.history.History at 0x7f3ddf823390>

In [None]:
# Show the Keras summary of the model.
model.summary()

In [None]:
# Response generation
def generate_response(prompt, tokenizer, model, max_length=150):
    """Greedily decode a response to ``prompt``.

    The prompt is tokenized and padded/truncated to ``max_length``. Decoding
    starts from the tokenizer's BOS id; at each step the model is called on
    the prompt and the tokens generated so far, and the arg-max token at the
    last position is appended. Decoding stops at the EOS id or after
    ``max_length`` steps.

    Args:
        prompt: Input text.
        tokenizer: Tokenizer providing ``bos_token_id``, ``eos_token_id``,
            ``decode`` and a call returning ``"input_ids"``.
        model: Callable taking ``[encoder_ids, decoder_ids]`` and returning
            per-position scores over the vocabulary.
        max_length: Used both as the prompt padding/truncation length and as
            the maximum number of generated tokens.

    Returns:
        The decoded text of the generated token ids (EOS excluded).
    """
    # NOTE(review): the prompt is padded to max_length (150 by default) while
    # tokenize_data pads the training data to 50 by default -- confirm that
    # this train/inference length mismatch is intended.
    encoded_prompt = tokenizer(
        prompt,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="tf",
    )
    input_ids = encoded_prompt["input_ids"]

    decoder_input = tf.convert_to_tensor([[tokenizer.bos_token_id]])
    response = []

    for _ in range(max_length):
        predictions = model([input_ids, decoder_input])
        # Greedy choice at the last decoder position; cast to a plain int so
        # the id list and the tensor built below do not carry NumPy scalars.
        predicted_id = int(tf.argmax(predictions[:, -1, :], axis=-1).numpy()[0])

        if predicted_id == tokenizer.eos_token_id:
            break

        response.append(predicted_id)
        decoder_input = tf.concat(
            [decoder_input, tf.convert_to_tensor([[predicted_id]])], axis=1
        )

    return tokenizer.decode(response)




# TEST

In [None]:
# Example usage
prompt = "Đất công nghiệp là gì?"
response = generate_response(prompt=prompt, tokenizer=tokenizer, model=model)
print(f"Chatbot: {response}")

Chatbot: Đất công nghiệp là loại đất được sử dụng cho mục đích xây dựng các công trình phục vụ hoạt động thu nhập để nhà ở xã hội như nhà hát, sử dụng vào mục đích công cộng như nhà hát, công viên nhà hát, công viên và các công trình phục vụ sản xuất công nghiệp.


# SAVE

In [None]:
import os
import joblib  # Convenience library for serializing Python objects

# Create the output directory if it does not exist yet
save_dir = "/content/drive/MyDrive/model_build/saved_model"
os.makedirs(save_dir, exist_ok=True)

# Save the model
model.save(os.path.join(save_dir, "seq2seq_model.keras"))

# Save the tokenizer.
# The pickle is kept so that anything already loading tokenizer.pkl keeps
# working, but a pickled tokenizer is tied to the library version that wrote
# it and must only be loaded from trusted locations.
tokenizer_path = os.path.join(save_dir, "tokenizer.pkl")
joblib.dump(tokenizer, tokenizer_path)

# Also write the tokenizer in its native format, which can be reloaded with
# AutoTokenizer.from_pretrained(<this directory>).
tokenizer.save_pretrained(os.path.join(save_dir, "tokenizer"))

print(" Mô hình & tokenizer đã được lưu!")
