In [39]:
!pip install tensorflow nltk numpy



In [40]:
import json
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load your dataset
with open('data.json', 'r') as file:
    data = json.load(file)

data_with_tokens = []
for pair in data:
    modified_output = f'start {pair["output"]} end'
    data_with_tokens.append({'input': pair['input'], 'output': modified_output})

# Prepare tokenizer
tokenizer = Tokenizer()
corpus = [item['input'] + ' ' + item['output'] for item in data_with_tokens]
tokenizer.fit_on_texts(corpus)
VOCAB_SIZE = len(tokenizer.word_index) + 1

# Convert texts to sequences
sequences = [tokenizer.texts_to_sequences([item['input'], item['output']]) for item in data]
input_sequences, output_sequences = zip(*[(seq[0], seq[1]) for seq in sequences])
input_padded = pad_sequences(input_sequences, padding='post')
output_padded = pad_sequences(output_sequences, padding='post')


In [41]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding

# Define model parameters
embedding_dim = 256
lstm_units = 256

# Define the encoder component
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(VOCAB_SIZE, embedding_dim)(encoder_inputs)
encoder_outputs, state_h, state_c = LSTM(lstm_units, return_state=True)(encoder_embedding)
encoder_states = [state_h, state_c]

# Define the decoder component
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(VOCAB_SIZE, embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(VOCAB_SIZE, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the seq2seq model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()


Model: "model_13"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_19 (InputLayer)       [(None, None)]               0         []                            
                                                                                                  
 input_20 (InputLayer)       [(None, None)]               0         []                            
                                                                                                  
 embedding_10 (Embedding)    (None, None, 256)            11264     ['input_19[0][0]']            
                                                                                                  
 embedding_11 (Embedding)    (None, None, 256)            11264     ['input_20[0][0]']            
                                                                                           

In [42]:
import numpy as np
# Assuming output_padded is shifted by one timestep for decoder input
decoder_input_data = pad_sequences(output_padded, padding='post', maxlen=output_padded.shape[1] + 1)
decoder_target_data = np.hstack([output_padded, np.zeros((output_padded.shape[0], 1))]).astype(np.int32)

model.fit([input_padded, decoder_input_data], np.expand_dims(decoder_target_data, -1), batch_size=32, epochs=10)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x79f337588400>

In [43]:
# Setup the encoder inference model
encoder_model = Model(encoder_inputs, encoder_states)

# Setup the decoder inference model
decoder_state_input_h = Input(shape=(lstm_units,))
decoder_state_input_c = Input(shape=(lstm_units,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_outputs, state_h, state_c = decoder_lstm(decoder_embedding, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)


In [44]:
def beam_search_decoder(predictions, top_k=3):
    output_sequences = [([], 0)]

    for token_probs in predictions:
        new_sequences = []
        # Append new tokens to old sequences and re-score
        for old_seq, old_score in output_sequences:
            for char_index in range(len(token_probs)):
                new_seq = old_seq + [char_index]
                # Consider log probability to prevent underflow
                new_score = old_score + np.log(token_probs[char_index])
                new_sequences.append((new_seq, new_score))

        # Sort all new sequences in the beam by score
        output_sequences = sorted(new_sequences, key=lambda val: val[1], reverse=True)
        # Select the top k based on score
        output_sequences = output_sequences[:top_k]

    return output_sequences

# Modify the decoding section in the respond_to_user function to use beam search
def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tokenizer.word_index['start']

    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        output_tokens = output_tokens[0, -1, :]  # Get the last token's probabilities

        # Use beam search to get top_k sequences; here we consider only top 1 for simplicity
        sequences = beam_search_decoder([output_tokens], top_k=1)
        sampled_token_index = sequences[0][0][0]

        sampled_char = tokenizer.index_word.get(sampled_token_index, '?')
        if (sampled_char == 'end' or len(decoded_sentence) > 50):
            stop_condition = True
        else:
            decoded_sentence += ' ' + sampled_char
            target_seq = np.zeros((1, 1))
            target_seq[0, 0] = sampled_token_index
            states_value = [h, c]

    return decoded_sentence.strip()


In [45]:
def respond_to_user(input_text):
    # Convert the user input text to a sequence of integers
    input_seq = tokenizer.texts_to_sequences([input_text])
    input_padded = pad_sequences(input_seq, maxlen=max([len(seq) for seq in input_sequences]))

    # Decode the sequence to find the response
    response = decode_sequence(input_padded)
    return response

In [46]:
user_input = "What should I eat during pregnancy?"
response = respond_to_user(user_input)
print("Chatbot response:", response)

Chatbot response: it's it's it's to to to to to a a a ? ? ? ? ? ? ? ?
