<a href="https://colab.research.google.com/github/iiCellxx/CSST-102-ALMARIO/blob/main/PracticeChatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [62]:
import opendatasets as od
import pandas as pd

# Step 1: Download the dataset
od.download("https://www.kaggle.com/datasets/thedevastator/viggo-video-game-chatbot-dataset")

# Step 2: Load and preview the dataset
train_df = pd.read_csv("/content/viggo-video-game-chatbot-dataset/train.csv")
print(train_df.head())  # View the first few rows to understand its structure


Skipping, found downloaded files in "./viggo-video-game-chatbot-dataset" (use force=True to force download)
          gem_id                             meaning_representation  \
0  viggo-train-0  inform(name[Dirt: Showdown], release_year[2012...   
1  viggo-train-1  inform(name[Dirt: Showdown], release_year[2012...   
2  viggo-train-2  inform(name[Dirt: Showdown], release_year[2012...   
3  viggo-train-3   request(release_year[2014], specifier[terrible])   
4  viggo-train-4   request(release_year[2014], specifier[terrible])   

                                              target  \
0  Dirt: Showdown from 2012 is a sport racing gam...   
1  Dirt: Showdown is a sport racing game that was...   
2  Dirt: Showdown is a driving/racing sport game ...   
3        Were there even any terrible games in 2014?   
4  What's the most terrible game that you played ...   

                                          references  
0  ["Dirt: Showdown from 2012 is a sport racing g...  
1  ['Dirt: Showdow

In [64]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Define the start and end tokens
start_token = '<starttoken>'
end_token = '<endtoken>'

# Prepare the texts for the model by adding start and end tokens
input_texts = ['<starttoken> ' + text for text in train_df['meaning_representation'].values]
output_texts = ['<starttoken> ' + text + ' <endtoken>' for text in train_df['target'].values]

# Initialize the tokenizer and fit it on the texts
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')

# First, fit the tokenizer with the texts including the special tokens
tokenizer.fit_on_texts(input_texts + output_texts)

# Now, explicitly add the start and end tokens to the tokenizer's word index
if start_token not in tokenizer.word_index:
    tokenizer.word_index[start_token] = len(tokenizer.word_index) + 1
if end_token not in tokenizer.word_index:
    tokenizer.word_index[end_token] = len(tokenizer.word_index) + 2

# Convert the texts to sequences
input_sequences = tokenizer.texts_to_sequences(input_texts)
output_sequences = tokenizer.texts_to_sequences(output_texts)

# Set the maximum sequence lengths
max_input_length = max([len(seq) for seq in input_sequences])
max_output_length = max([len(seq) for seq in output_sequences])

# Pad sequences to ensure uniform length
encoder_input_data = pad_sequences(input_sequences, maxlen=max_input_length, padding='post')
decoder_input_data = pad_sequences(output_sequences, maxlen=max_output_length, padding='post')

# Prepare the decoder target data (shifted by one time step)
decoder_target_data = np.zeros_like(decoder_input_data)
decoder_target_data[:, :-1] = decoder_input_data[:, 1:]


In [65]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding

# Define vocabulary size (number of unique tokens)
vocab_size = len(tokenizer.word_index) + 1  # +1 for padding token

# Encoder model
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(input_dim=vocab_size, output_dim=100, input_length=max_input_length)(encoder_inputs)
encoder_lstm = LSTM(256, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder model
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(input_dim=vocab_size, output_dim=100, input_length=max_output_length)(decoder_inputs)
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Full Seq2Seq model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])




In [67]:
from tensorflow.keras.callbacks import ModelCheckpoint

# Define a checkpoint callback to save the best model
checkpoint_callback = ModelCheckpoint(
    'chatbot_model_checkpoint.keras',
    save_best_only=True,
    monitor='val_loss',
    mode='min',
    verbose=1
)

# Train the model
model.fit([encoder_input_data, decoder_input_data], np.expand_dims(decoder_target_data, -1),
          epochs=1,
          batch_size=64,
          validation_split=0.2,
          callbacks=[checkpoint_callback])


[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy: 0.5752 - loss: 4.0657
Epoch 1: val_loss improved from inf to 1.86003, saving model to chatbot_model_checkpoint.keras
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 1s/step - accuracy: 0.5763 - loss: 4.0453 - val_accuracy: 0.6733 - val_loss: 1.8600


<keras.src.callbacks.history.History at 0x7ba5f54c7880>

In [68]:
# Load the best model
model.load_weights('chatbot_model_checkpoint.keras')

# Define encoder model for inference
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder model for inference
from tensorflow.keras.layers import Input

decoder_state_input_h = Input(shape=(256,))
decoder_state_input_c = Input(shape=(256,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_embedding_inf = Embedding(input_dim=vocab_size, output_dim=100)
decoder_lstm_inf = LSTM(256, return_sequences=True, return_state=True)
decoder_outputs_inf, state_h_inf, state_c_inf = decoder_lstm_inf(decoder_embedding_inf(decoder_inputs), initial_state=decoder_states_inputs)
decoder_states_inf = [state_h_inf, state_c_inf]
decoder_outputs_inf = decoder_dense(decoder_outputs_inf)

decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs_inf, decoder_states_inf])


In [69]:
# Function to decode a sequence
def decode_sequence(input_seq):
    # Predict the states from the encoder
    states_value = encoder_model.predict(input_seq)

    # Start sequence with the start token
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tokenizer.word_index[start_token]  # Start token

    stop_condition = False
    decoded_sentence = ''

    # Generate the output sequence
    while not stop_condition:
        output_tokens, states = decoder_model.predict([target_seq] + states_value)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = tokenizer.index_word.get(sampled_token_index, '')  # Retrieve word from index

        decoded_sentence += ' ' + sampled_word

        # Stop condition if the end token is found or the sentence is too long
        if sampled_word == end_token or len(decoded_sentence.split()) > max_output_length:
            stop_condition = True

        target_seq[0, 0] = sampled_token_index
        states_value = states

    return decoded_sentence


In [71]:
# Test the chatbot with an example input
test_input = "Dirt"
test_input_seq = tokenizer.texts_to_sequences([test_input])
test_input_seq = pad_sequences(test_input_seq, maxlen=max_input_length, padding='post')

response = decode_sequence(test_input_seq)
print(response)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22