In [1]:
import pandas as pd

In [2]:
# df = pd.read_csv('./dialogs.txt' , sep='\t' , names=['Question' , 'Answer'])
# df

In [3]:
df = pd.read_csv('./final_data.csv' , sep=',' )
df

# Assuming df is the DataFrame
df.rename(columns={"response": "Question", "context1": "Answer"}, inplace=True)


In [4]:
df = df[["Question", "Answer"]]

In [5]:
df

Unnamed: 0,Question,Answer
0,Do you think it wise to trust Hagrid with some...,Hagrid is bringing him.
1,"Ah, Professor, I would trust Hagrid with my life.",Do you think it wise to trust Hagrid with some...
2,"Professor Dumbledore, sir.","Ah, Professor, I would trust Hagrid with my life."
3,Professor McGonagall.,"Professor Dumbledore, sir."
4,"No problems, I trust, Hagrid?",Professor McGonagall.
...,...,...
4913,"How fast is it, Harry?","Yeah, let's see."
4914,Lumos.,"How fast is it, Harry?"
4915,I solemnly swear that I am up to no good.,Lumos.
4916,Mischief managed.,I solemnly swear that I am up to no good.


In [6]:
# Check for null values
null_question = df['Question'].isnull().sum()
null_answer = df['Answer'].isnull().sum()

if null_question > 0:
    print("There are", null_question, "null values in the 'Question' column.")
else:
    print("There are no null values in the 'Question' column.")

if null_answer > 0:
    print("There are", null_answer, "null values in the 'Answer' column.")
else:
    print("There are no null values in the 'Answer' column.")

# Check for whitespace values
whitespace_question = df['Question'].apply(lambda x: x.isspace()).sum()
whitespace_answer = df['Answer'].apply(lambda x: x.isspace()).sum()

if whitespace_question > 0:
    print("There are", whitespace_question, "whitespace values in the 'Question' column.")
else:
    print("There are no whitespace values in the 'Question' column.")

if whitespace_answer > 0:
    print("There are", whitespace_answer, "whitespace values in the 'Answer' column.")
else:
    print("There are no whitespace values in the 'Answer' column.")

There are no null values in the 'Question' column.
There are no null values in the 'Answer' column.
There are no whitespace values in the 'Question' column.
There are no whitespace values in the 'Answer' column.


In [7]:
import re

def clean_text(text):
    text = text.lower()
    text = re.sub(r'\d+', ' ', text)  # Replace all digits with spaces
    text = re.sub(r'([^\w\s])', r' \1 ', text)  # Add a space before and after each punctuation character
    text = re.sub(r'\s+', ' ', text)  # Replace all consecutive spaces with a single space
    text = text.strip()  # Remove leading and trailing spaces
    return text

df['Encoder Inputs']=df['Question'].apply(clean_text)
df['Decoder Inputs']="<sos> " + df['Answer'].apply(clean_text) + ' <eos>'
df["Decoder Targets"] = df['Answer'].apply(clean_text) + ' <eos>'

df.head()

Unnamed: 0,Question,Answer,Encoder Inputs,Decoder Inputs,Decoder Targets
0,Do you think it wise to trust Hagrid with some...,Hagrid is bringing him.,do you think it wise to trust hagrid with some...,<sos> hagrid is bringing him . <eos>,hagrid is bringing him . <eos>
1,"Ah, Professor, I would trust Hagrid with my life.",Do you think it wise to trust Hagrid with some...,"ah , professor , i would trust hagrid with my ...",<sos> do you think it wise to trust hagrid wit...,do you think it wise to trust hagrid with some...
2,"Professor Dumbledore, sir.","Ah, Professor, I would trust Hagrid with my life.","professor dumbledore , sir .","<sos> ah , professor , i would trust hagrid wi...","ah , professor , i would trust hagrid with my ..."
3,Professor McGonagall.,"Professor Dumbledore, sir.",professor mcgonagall .,"<sos> professor dumbledore , sir . <eos>","professor dumbledore , sir . <eos>"
4,"No problems, I trust, Hagrid?",Professor McGonagall.,"no problems , i trust , hagrid ?",<sos> professor mcgonagall . <eos>,professor mcgonagall . <eos>


In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Define the maximum number of words to keep based on word frequency
num_words = 10000

# Define the maximum sequence length
max_seq_length = 10

# Create a tokenizer and fit it on the 'Encoder Inputs' and 'Decoder Inputs' columns of the DataFrame
tokenizer = Tokenizer(num_words=num_words, oov_token='<unk>')
tokenizer.fit_on_texts(df['Encoder Inputs'].tolist() + df['Decoder Inputs'].tolist())

# Convert the text data to sequences of integers using the tokenizer
encoder_inputs = tokenizer.texts_to_sequences(df['Encoder Inputs'].tolist())
decoder_inputs = tokenizer.texts_to_sequences(df['Decoder Inputs'].tolist())
decoder_targets = tokenizer.texts_to_sequences(df['Decoder Targets'].tolist())

# Pad the sequences to ensure they all have the same length
encoder_inputs = pad_sequences(encoder_inputs, maxlen=max_seq_length, padding='post', truncating='post')
decoder_inputs = pad_sequences(decoder_inputs, maxlen=max_seq_length, padding='post', truncating='post')
decoder_targets = pad_sequences(decoder_targets, maxlen=max_seq_length, padding='post', truncating='post')

2024-03-27 20:19:37.069352: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-03-27 20:19:37.917427: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2024-03-27 20:19:37.917537: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


In [9]:
decoder_targets[1:3]

array([[ 30,   5,  61,  10, 790,   8, 424, 108,  51, 101],
       [241,  69,   7,  81, 424, 108,  51,  36, 304,   3]], dtype=int32)

In [10]:
df['Decoder Targets'][1:3]

1    do you think it wise to trust hagrid with some...
2    ah , professor , i would trust hagrid with my ...
Name: Decoder Targets, dtype: object

In [11]:
# Get the vocabulary size of the tokenizer
vocab_size = len(tokenizer.word_index)
print('Vocabulary Size: %d' % vocab_size)

Vocabulary Size: 3207


In [12]:
print(encoder_inputs.shape , "\n" , decoder_inputs.shape , "\n" , decoder_targets.shape)

(4918, 10) 
 (4918, 10) 
 (4918, 10)


In [13]:
from sklearn.model_selection import train_test_split

# Split the data into train and test sets
encoder_inputs_train, encoder_inputs_test, decoder_inputs_train, decoder_inputs_test, decoder_targets_train, decoder_targets_test = train_test_split(encoder_inputs, decoder_inputs, decoder_targets, test_size=0.2, random_state=42)

# Print the shapes of the train and test sets
print("Train set shapes:", encoder_inputs_train.shape, decoder_inputs_train.shape, decoder_targets_train.shape)
print("Test set shapes:", encoder_inputs_test.shape, decoder_inputs_test.shape, decoder_targets_test.shape)

Train set shapes: (3934, 10) (3934, 10) (3934, 10)
Test set shapes: (984, 10) (984, 10) (984, 10)


In [14]:
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.models import Model

num_encoder_tokens = len(tokenizer.word_index) + 1
num_decoder_tokens = len(tokenizer.word_index) + 1
latent_dim = 64
embedding_dim = 50

# Define the input sequence
encoder_inputs = Input(shape=(max_seq_length,))

#_________________Embedding________________________

encoder_embedding = Embedding(num_encoder_tokens, embedding_dim , mask_zero=True)
encoder_inputs_embedded = encoder_embedding(encoder_inputs)


#_________________Encoder________________________

# Encoder - LSTM1
encoder_lstm1 = LSTM(latent_dim,return_sequences=True,return_state=True,dropout=0.4,recurrent_dropout=0.4)
encoder_output1, state_h1, state_c1 = encoder_lstm1(encoder_inputs_embedded)

# Encoder - LSTM2
encoder_lstm2 = LSTM(latent_dim,return_sequences=True,return_state=True,dropout=0.4,recurrent_dropout=0.4)
encoder_output2, state_h2, state_c2 = encoder_lstm2(encoder_output1)

# Encoder - LSTM2
encoder_lstm3=LSTM(latent_dim, return_state=True, return_sequences=True,dropout=0.4,recurrent_dropout=0.4)
encoder_outputs, state_h, state_c= encoder_lstm3(encoder_output2)
#______________________________________________

# Discard the encoder outputs and only keep the states
encoder_states = [state_h, state_c]

# Define the decoder input sequence
decoder_inputs = Input(shape=(max_seq_length,))

# Add an embedding layer
decoder_embedding = Embedding(num_decoder_tokens, embedding_dim , mask_zero=True)
decoder_inputs_embedded = decoder_embedding(decoder_inputs)

#_________________Decoder________________________

decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)

# Get the decoder outputs and states
decoder_outputs, _, _ = decoder_lstm(decoder_inputs_embedded, initial_state=encoder_states)

# Define the decoder output layer
decoder_dense = Dense(num_decoder_tokens, activation='softmax')

# Get the decoder outputs
decoder_outputs = decoder_dense(decoder_outputs)

# Define the Seq2Seq model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.summary()

2024-03-27 20:19:39.318898: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-03-27 20:19:39.360680: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory
2024-03-27 20:19:39.360710: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1934] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2024-03-27 20:19:39.361102: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neur

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 10)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 10, 50)       160400      ['input_1[0][0]']                
                                                                                                  
 lstm (LSTM)                    [(None, 10, 64),     29440       ['embedding[0][0]']              
                                 (None, 64),                                                      
                                 (None, 64)]                                                      
                                                                                              

In [15]:
from tensorflow.keras.utils import to_categorical
import tensorflow as tf

# early_stopping = tf.keras.callbacks.EarlyStopping(monitor='accuracy', patience=15)


batch_size = 32
epochs = 300

# One-hot encode the decoder targets
decoder_targets_train = to_categorical(decoder_targets_train, num_decoder_tokens)
decoder_targets_test = to_categorical(decoder_targets_test, num_decoder_tokens)

# Define the Seq2Seq model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'] , sample_weight_mode='temporal')

# Train the model
model.fit([encoder_inputs_train, decoder_inputs_train], decoder_targets_train,
          validation_data=([encoder_inputs_test, decoder_inputs_test], decoder_targets_test),
          batch_size=batch_size, epochs=epochs )
          # batch_size=batch_size, epochs=epochs , callbacks=[early_stopping])

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78

<keras.callbacks.History at 0x7ffac8068950>

In [16]:
import numpy as np

In [17]:
from tensorflow.keras.models import Model

# Define encoder model to get encoder states
encoder_model = Model(encoder_inputs, encoder_states)

# Define decoder model with encoder states as initial state
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_inputs_single = Input(shape=(1,))
decoder_inputs_single_embedded = decoder_embedding(decoder_inputs_single)

decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs_single_embedded, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model([decoder_inputs_single] + decoder_states_inputs, [decoder_outputs] + decoder_states)

# Helper function to generate a response given an input sequence
def generate_response(input_seq):
    # Encode the input sequence to get the initial decoder states
    states_value = encoder_model.predict(input_seq)

    # Initialize the target sequence with a start token
    target_seq = np.array([[tokenizer.word_index['sos']]])

    stop_condition = False
    response = []

    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token from the output distribution
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        
        # If the predicted word index is 0, use a period instead
        if sampled_token_index == 0:
            sampled_token = '.'
        else:
            sampled_token = tokenizer.index_word[sampled_token_index]
        
        response.append(sampled_token)

        # Exit condition: either hit max length or find stop token
        if sampled_token == 'eos' or len(response) > max_seq_length:
            stop_condition = True

        # Update the target sequence with the sampled token
        target_seq = np.array([[sampled_token_index]])

        # Update the decoder states
        states_value = [h, c]

    return ' '.join(response)

In [19]:
# Test the response generation
input_sequence = tokenizer.texts_to_sequences(["Little tyke fell asleep just as we were flying over Bristol."])
input_sequence = pad_sequences(input_sequence, maxlen=max_seq_length, padding='post', truncating='post')
response = generate_response(input_sequence)
print("Input:", f'{input_sequence}')
print("Response:", response)


Input: [[ 138 1708  589  791   55   71   23  102  681  179]]
Response: it ' s not a term one usually hears in the
