# Chatbot Model Training Notebook

This notebook demonstrates how to preprocess text data, train an LSTM-based model for intent classification, and save the model for later use in a FastAPI chatbot application.

The training data is loaded from `../data/intents.json` and includes a few sample intents.

In [None]:
# Import necessary libraries for data manipulation and model building
import json
import os
import pickle  # For saving the tokenizer and label encoder

import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.preprocessing import LabelEncoder


In [None]:
# Load the intents JSON file which contains the training data.
# Since the notebook is inside the 'notebooks/' folder, '../data/intents.json' navigates to the data folder.
# The encoding is passed explicitly: JSON text is UTF-8, and relying on the platform's
# locale default can mis-decode (or fail on) non-ASCII patterns, e.g. on Windows.
with open('../data/intents.json', encoding='utf-8') as file:
    data = json.load(file)

# Display the loaded data (optional)
print(data)

In [None]:
# Flatten the intents into two parallel lists: example utterances and their tags

# Every pattern of every intent becomes one (utterance, tag) training pair
training_pairs = [
    (pattern, intent['tag'])
    for intent in data['intents']
    for pattern in intent['patterns']
]

# Split the pairs back into the model inputs and their labels (same order, same length)
sentences = [utterance for utterance, _ in training_pairs]
labels = [tag for _, tag in training_pairs]

# Quick sanity check on the first few examples
print('Sentences:', sentences[:5])
print('Labels:', labels[:5])

In [None]:
# Turn every sentence into a fixed-length vector of word indices

# Build the vocabulary from the training sentences; words not seen here
# are represented by the '<OOV>' token when texts are encoded later
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index  # word -> integer index lookup

# Integer-encode each sentence
sequences = tokenizer.texts_to_sequences(sentences)

# The longest encoded sentence defines the common length;
# shorter ones are padded at the end ('post') up to that length
max_length = max(map(len, sequences))
padded_sequences = pad_sequences(sequences, padding='post', maxlen=max_length)

# Show one padded example
print('Example sequence:', padded_sequences[0])

In [None]:
# Convert the intent tags into numeric targets for the classifier

# Learn the tag -> integer mapping and apply it in a single step
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# One-hot encode the integer ids; one column per distinct tag
num_classes = len(label_encoder.classes_)
one_hot_labels = tf.keras.utils.to_categorical(encoded_labels, num_classes=num_classes)

# Peek at the first few integer-encoded labels
print('Encoded labels:', encoded_labels[:5])

In [None]:
# Define and compile the LSTM model for intent classification

# Build a Sequential model with an Embedding layer, LSTM layers, and Dense layers
model = Sequential([
    # Declare the input shape explicitly: one padded sequence of max_length word indices.
    # The Embedding `input_length` argument is deprecated in recent Keras releases; an
    # explicit Input keeps the model built so model.summary() can report output shapes.
    tf.keras.Input(shape=(max_length,)),

    # The Embedding layer converts each word (integer) into a dense vector of fixed size.
    # input_dim is the vocabulary size + 1 because index 0 is not assigned to any word.
    Embedding(input_dim=len(word_index) + 1, output_dim=64),

    # The first LSTM layer processes the sequence data and passes the full sequence on
    LSTM(64, return_sequences=True),
    Dropout(0.5),  # Dropout helps prevent overfitting

    # A second LSTM layer to further capture sequential patterns
    LSTM(32),
    Dropout(0.5),

    # A Dense layer to learn intermediate features
    Dense(32, activation='relu'),

    # Output layer with softmax activation for multi-class classification
    Dense(num_classes, activation='softmax')
])

# Compile the model with categorical crossentropy loss (targets are one-hot) and the Adam optimizer
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Display the model summary
model.summary()

In [None]:
# Fit the classifier on the padded sequences and their one-hot encoded labels

# Training hyperparameters: 30 epochs with a batch size of 8.
# Both are starting points; adjust them as needed for your dataset.
EPOCHS = 30
BATCH_SIZE = 8

history = model.fit(
    padded_sequences,
    one_hot_labels,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    verbose=1,
)

In [None]:
# Make sure the output directory exists first; otherwise saving fails
# after training has already completed and the run has to be repeated.
os.makedirs('../model', exist_ok=True)

# Save the trained model to the '../model/' directory
model.save('../model/chatbot_model.h5')

# Also, save the tokenizer and label encoder for later use during inference
with open('../model/tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('../model/label_encoder.pickle', 'wb') as enc_file:
    pickle.dump(label_encoder, enc_file, protocol=pickle.HIGHEST_PROTOCOL)

print('Model and preprocessing objects saved successfully!')

### Next Steps

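After training the model, you can move on to building the FastAPI app (`app/main.py`) which will load the model, tokenizer, and label encoder to serve the chatbot in real time. When preprocessing user input there, tokenize it with the saved tokenizer and pad it with `padding='post'` to the same sequence length used in this notebook, so that inference inputs match what the model saw during training.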