In [15]:
import tensorflow as tf
from tensorflow import keras
from keras.layers import Input
from tensorflow.keras import layers
import pandas as pd
import numpy as np
import re
import pickle

# Record the TensorFlow version used for this run alongside the notebook outputs.
print("Using TensorFlow version:", tf.__version__)

Using TensorFlow version: 2.10.1


In [34]:
# Load dataset: one row per (prompt, reply) pair.
# Rows with a missing prompt or reply are dropped first; otherwise astype(str)
# would turn the missing value into the literal text "nan" and the model would
# be trained on it as if it were a real utterance.
df = (
    pd.read_csv("../dataset/tiny_chat_dataset.csv")
    .dropna(subset=['input_text', 'reply_text'])
    .reset_index(drop=True)
)
inputs = df['input_text'].astype(str).tolist()
targets = df['reply_text'].astype(str).tolist()

# Text normalisation shared by training data and inference input
def clean_text(text):
    """Lower-case `text`, delete every character that is not a-z, 0-9,
    an apostrophe or a space, and trim surrounding whitespace.

    Removed characters are deleted outright (not replaced by a space), so
    words joined only by punctuation are merged.
    """
    disallowed = re.compile(r"[^a-z0-9' ]+")
    return disallowed.sub('', text.lower()).strip()

# Normalise the prompts as-is.
inputs = list(map(clean_text, inputs))
# Normalise the replies, then wrap each one in the start/end markers used for decoding.
targets = ['<start> ' + cleaned + ' <end>' for cleaned in map(clean_text, targets)]

In [35]:
# Tokenization
# VOCAB_SIZE is the single upper bound on token ids: it caps the tokenizer here
# and is also the size passed to the Embedding and output Dense layers of the model.
VOCAB_SIZE = 1000
# The cap used to be a separate hard-coded 800; tying it to VOCAB_SIZE keeps the
# tokenizer and the model layers from drifting apart when the constant is tuned.
tokenizer = keras.preprocessing.text.Tokenizer(num_words=VOCAB_SIZE, oov_token='OOV')
# One shared vocabulary for prompts and replies.
tokenizer.fit_on_texts(inputs + targets)

input_seq = tokenizer.texts_to_sequences(inputs)
target_seq = tokenizer.texts_to_sequences(targets)

# Pad every sequence to the longest one seen in the data.
max_len_in = max(len(seq) for seq in input_seq)
max_len_out = max(len(seq) for seq in target_seq)

# Post-padding: real tokens first, zeros at the end.
input_seq = keras.preprocessing.sequence.pad_sequences(input_seq, maxlen=max_len_in, padding='post')
target_seq = keras.preprocessing.sequence.pad_sequences(target_seq, maxlen=max_len_out, padding='post')

# Lookup tables used at inference time: word -> id and id -> word.
word_index = tokenizer.word_index
index_word = {v: k for k, v in word_index.items()}

print(f"Vocabulary size: {len(word_index)}")
print(f"Max input length: {max_len_in}, max output length: {max_len_out}")

Vocabulary size: 139
Max input length: 7, max output length: 14


In [36]:
# Create Seq2Seq Model (Encoder Decoder GRU)
EMBED_DIM = 64
HIDDEN_DIM = 128
SEED = 42

# Seed the Python, NumPy and TensorFlow RNGs so that weight initialisation,
# dropout and batch shuffling are repeatable across "Restart & Run All".
keras.utils.set_random_seed(SEED)

# Encoder: embeds the prompt and keeps only the final GRU state (state_h),
# which is handed to the decoder as its initial state.
encoder_inputs = keras.Input(shape=(None,))
enc_emb = layers.Embedding(VOCAB_SIZE, EMBED_DIM, mask_zero=True)(encoder_inputs)
encoder_outputs, state_h = layers.GRU(HIDDEN_DIM, return_state = True, dropout=0.2)(enc_emb)

# Decoder: the layer objects (decoder_gru, decoder_dense) and the dec_emb tensor
# are kept as module-level names because the inference cell reuses them.
decoder_inputs = keras.Input(shape=(None,))
dec_emb = layers.Embedding(VOCAB_SIZE, EMBED_DIM, mask_zero=True)(decoder_inputs)
decoder_gru = layers.GRU(HIDDEN_DIM, return_sequences=True, return_state=True)
decoder_outputs, _ = decoder_gru(dec_emb, initial_state=state_h)
decoder_dense = layers.Dense(VOCAB_SIZE, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

model.summary()

# Prepare shifted target for teacher forcing:
# the decoder sees tokens [0 .. n-2] and is trained to predict tokens [1 .. n-1].
target_input = target_seq[:, :-1]
target_output = target_seq[:, 1:]
target_output = np.expand_dims(target_output, -1)


# Train the model
history = model.fit(
    [input_seq, target_input],
    target_output,
    batch_size=32,
    epochs=200,
    validation_split=0.1,
    verbose=1
)

# Save the model and Tokenizer
model.save('tiny_bot.h5')
# NOTE: pickle files can execute arbitrary code when loaded; only load
# tokenizer.pkl from a location you trust.
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

print("Model training completed and saved.")

Model: "model_13"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_18 (InputLayer)          [(None, None)]       0           []                               
                                                                                                  
 input_19 (InputLayer)          [(None, None)]       0           []                               
                                                                                                  
 embedding_12 (Embedding)       (None, None, 64)     64000       ['input_18[0][0]']               
                                                                                                  
 embedding_13 (Embedding)       (None, None, 64)     64000       ['input_19[0][0]']               
                                                                                           

### Test Inference

In [37]:
# Cache for the inference-time encoder/decoder models, so they are built once
# instead of on every generate_reply() call.
_inference_cache = {}


def _get_inference_models():
    """Return `(encoder_model, decoder_model)` for step-by-step decoding.

    Both models are assembled from the layers created in the training cell, so
    they share the trained weights. They are rebuilt only when the training
    cell has been re-run (detected by `state_h` being a different object).
    """
    if _inference_cache.get("source") is not state_h:
        enc_model = keras.Model(encoder_inputs, state_h)

        # Single-step decoder: takes the previous token and the previous GRU
        # state, returns the next-token distribution and the updated state.
        dec_state_input = keras.Input(shape=(HIDDEN_DIM,))
        dec_outputs, dec_state = decoder_gru(dec_emb, initial_state=dec_state_input)
        dec_out_tokens = decoder_dense(dec_outputs)
        dec_model = keras.Model(
            [decoder_inputs, dec_state_input], [dec_out_tokens, dec_state]
        )

        _inference_cache["source"] = state_h
        _inference_cache["models"] = (enc_model, dec_model)
    return _inference_cache["models"]


def generate_reply(input_text, max_reply_len=20):
    """Generate a reply to `input_text` by greedy decoding.

    Args:
        input_text: Raw user text; it is normalised with `clean_text` and
            tokenised with the fitted `tokenizer` before encoding.
        max_reply_len: Maximum number of decoding steps, and therefore the
            maximum number of words in the reply.

    Returns:
        The generated words joined by single spaces (possibly an empty string).

    Raises:
        KeyError: if the word 'start' is missing from `word_index`.
    """
    text = clean_text(input_text)
    seq = tokenizer.texts_to_sequences([text])
    seq = keras.preprocessing.sequence.pad_sequences(seq, maxlen=max_len_in, padding='post')

    enc_model, dec_infer_model = _get_inference_models()

    # Encode the prompt; the encoder's final state seeds the decoder.
    dec_state = enc_model.predict(seq, verbose=0)

    # The markers are looked up as 'start' / 'end' (without angle brackets)
    # because that is how they appear in the tokenizer's word_index.
    target_seq_test = np.array([[word_index['start']]])
    reply_words = []

    # A bounded loop guarantees termination even if the model keeps emitting an
    # id with no word attached (e.g. the padding id 0); the previous
    # word-count-based stop condition could spin forever in that case.
    for _ in range(max_reply_len):
        output_tokens, dec_state = dec_infer_model.predict(
            [target_seq_test, dec_state], verbose=0
        )
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_word = index_word.get(sampled_token_index, '')

        if sampled_word == 'end':
            break
        if sampled_word:
            reply_words.append(sampled_word)
        # Feed the chosen token back in as the next decoder input.
        target_seq_test = np.array([[sampled_token_index]])

    return ' '.join(reply_words)

# ----------------------------------------------
# Smoke test: one prompt through the full inference path
# ----------------------------------------------
print("\nExample:", "You: turn on the fan", sep="\n")
print("Bot:", generate_reply("turn on the fan"))


Example:
You: turn on the fan
Bot: okay turning on the curtain


In [38]:
# Probe: plain greeting.
generate_reply("Hi")



'hi there'

In [39]:
# Probe: capability question (clean_text removes the trailing '?').
generate_reply("What can you do?")



'i can help control your smart home devices'

In [40]:
# Probe: device command.
# NOTE(review): the recorded reply names 'cctv' rather than the requested 'fridge' — confirm whether 'fridge' appears in the training data.
generate_reply("Turn on the fridge")



'okay turning on the cctv'

In [41]:
# Probe: indirect request that names no device.
generate_reply("It's so hot")



'i can help control your smart home devices'

In [42]:
# Probe: indirect request that names no device.
generate_reply("Plants are dry")



'i can help control your smart home devices'

In [43]:
# Probe: indirect request that names no device.
generate_reply("it's dark")



'just hanging out ready to help you'

In [44]:
# Probe: device command.
# NOTE(review): the recorded reply names 'light' rather than the requested 'cctv' — the model appears to confuse device names; verify against the training data.
generate_reply("Turn on the CCTV")



'okay turning on the light'