In [2]:
#imports from libraries
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from string import punctuation
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
import string
import tensorflow as tf


# Download NLTK resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
pip install nltk




In [4]:
pip install tensorflow

Note: you may need to restart the kernel to use updated packages.


In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Attention, Concatenate
from tensorflow.keras.models import Model

In [6]:
# Reading
nRowsRead = None # specify 'None' if want to read whole file
data_train = pd.read_csv('train.csv', delimiter=',', nrows = nRowsRead)
data_train.dataframeName = 'train.csv'
nRow, nCol = data_train.shape
print(f'There are {nRow} rows and {nCol} columns')
select_data_train = data_train[['Context', 'Response']]
select_data_train.head()

There are 3512 rows and 2 columns


Unnamed: 0,Context,Response
0,I'm going through some things with my feelings...,"If everyone thinks you're worthless, then mayb..."
1,I'm going through some things with my feelings...,"Hello, and thank you for your question and see..."
2,I'm going through some things with my feelings...,First thing I'd suggest is getting the sleep y...
3,I'm going through some things with my feelings...,Therapy is essential for those that are feelin...
4,I'm going through some things with my feelings...,I first want to let you know that you are not ...


In [7]:
# Convert to DataFrame
df = pd.DataFrame(data_train)
# Check for missing values in 'Context' and 'Response' columns
missing_context = df['Context'].isnull().sum()
missing_response = df['Response'].isnull().sum()

print(f'Missing values in Context: {missing_context}')
print(f'Missing values in Response: {missing_response}')

# Drop rows with missing 'Context' or 'Response'
df.dropna(subset=['Context', 'Response'], inplace=True)

Missing values in Context: 0
Missing values in Response: 4


In [8]:
#Lemmatisation
stop_words = set(stopwords.words('english') + list(string.punctuation))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

lemmatized_questions = []
lemmatized_answers = []

for index, row in df.iterrows():
    if pd.isnull(row['Context']) or pd.isnull(row['Response']):
        continue
        
    # Tokenize and lemmatise question
    question_tokens = word_tokenize(row['Context'].lower())
    question_tokens = [word for word in question_tokens if word not in stop_words]
    lemmatized_question = [lemmatizer.lemmatize(word) for word in question_tokens]
    
    # Tokenize and lemmatise answer
    answer_tokens = word_tokenize(row['Response'].lower())
    answer_tokens = [word for word in answer_tokens if word not in stop_words]
    lemmatized_answer = [lemmatizer.lemmatize(word) for word in answer_tokens]
    
    # Append lemmatised question and answer to lists
    lemmatized_questions.append(lemmatized_question)
    lemmatized_answers.append(lemmatized_answer)

# Print lengths of lemmatised questions and answers
print("Number of lemmatised questions:", len(lemmatized_questions))
print("Number of lemmatised answers:", len(lemmatized_answers))

Number of lemmatised questions: 3508
Number of lemmatised answers: 3508


In [14]:
# Tokenize input and output sequences
context_tokenizer = Tokenizer()
context_tokenizer.fit_on_texts(lemmatized_questions)
context_seq = context_tokenizer.texts_to_sequences(lemmatized_questions)
context_seq = pad_sequences(context_seq, padding='post')

response_tokenizer = Tokenizer()
response_tokenizer.fit_on_texts(lemmatized_answers)
response_seq = response_tokenizer.texts_to_sequences(lemmatized_answers)
response_seq = pad_sequences(response_seq, padding='post')




print("Shape of context_seq:", context_seq.shape)
print("Shape of response_seq:", response_seq.shape)
print("Shape of context_seq (sliced for input):", context_seq[:,:-1].shape)
print("Shape of response_seq (sliced for output):", response_seq[:,1:].shape)

max_question_length = max(len(seq) for seq in context_seq)
max_response_length = max(len(seq) for seq in response_seq)
sequence_length = max(max_question_length, max_response_length)  # Using the maximum found length
context_seq = pad_sequences(context_seq, maxlen=sequence_length, padding='post')
response_seq = pad_sequences(response_seq, maxlen=sequence_length, padding='post')





# Define your neural network architecture
vocab_size = len(context_tokenizer.word_index) + 1
embed_size = 128
latent_dim = 256

# Encoder
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(vocab_size, embed_size, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(vocab_size, embed_size, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

# Attention mechanism
attention_out = Attention()([decoder_outputs, encoder_outputs])
print("Shape of attention out:", attention_out.shape)



# Concatenate attention output and decoder output
decoder_concat_input = Concatenate(axis=-1)([decoder_outputs, attention_out])

decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_concat_input)





print("Shape of decoder_outputs:", decoder_outputs.shape)
print("Shape of encoder_outputs:", encoder_outputs.shape)
print("Shape of decoder_concat_input:", decoder_concat_input.shape)







# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Print the model summary
model.summary()

# Train the model (assuming you have train_context and train_response after preprocessing)
model.fit([context_seq, response_seq[:,:-1]], response_seq[:,1:], batch_size=64, epochs=10, validation_split=0.2)

# After training, the model for inference
# Define encoder model
encoder_model = Model(encoder_inputs, [encoder_outputs, encoder_states])

# Define decoder model
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_outputs, state_h, state_c = decoder_lstm(decoder_embedding, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]

decoder_attention_out, decoder_attention_states = attention_layer([decoder_outputs, encoder_outputs])
decoder_concat_out = Concatenate(axis=-1)([decoder_outputs, decoder_attention_out])

decoder_states_outputs = [decoder_states, decoder_attention_states]

decoder_outputs = decoder_dense(decoder_concat_out)

decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states_outputs
)

# Define functions for inference
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    encoder_output, states_value = encoder_model.predict(input_seq)
    
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1,1))
    
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0] = response_tokenizer.word_index['<start>']
    
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        
        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_token = response_tokenizer.index_word[sampled_token_index]
        
        if sampled_token != '<end>':
            decoded_sentence += ' ' + sampled_token
        
        # Exit condition: either hit max length or find stop character.
        if (sampled_token == '<end>' or len(decoded_sentence.split()) > max_response_length):
            stop_condition = True
        
        # Update the target sequence (of length 1).
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index
        
        # Update states
        states_value = [h, c]
    
    return decoded_sentence


Shape of context_seq: (3508, 244)
Shape of response_seq: (3508, 466)
Shape of context_seq (sliced for input): (3508, 243)
Shape of response_seq (sliced for output): (3508, 465)
Shape of attention out: (None, None, 256)
Shape of decoder_outputs: (None, None, 3256)
Shape of encoder_outputs: (None, None, 256)
Shape of decoder_concat_input: (None, None, 512)


Epoch 1/10


InvalidArgumentError: Graph execution error:

Detected at node functional_9_1/attention_4_1/sub defined at (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main

  File "<frozen runpy>", line 88, in _run_code

  File "C:\Users\Dell\anaconda3\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>

  File "C:\Users\Dell\anaconda3\Lib\site-packages\traitlets\config\application.py", line 992, in launch_instance

  File "C:\Users\Dell\anaconda3\Lib\site-packages\ipykernel\kernelapp.py", line 736, in start

  File "C:\Users\Dell\anaconda3\Lib\site-packages\tornado\platform\asyncio.py", line 195, in start

  File "C:\Users\Dell\anaconda3\Lib\asyncio\base_events.py", line 607, in run_forever

  File "C:\Users\Dell\anaconda3\Lib\asyncio\base_events.py", line 1922, in _run_once

  File "C:\Users\Dell\anaconda3\Lib\asyncio\events.py", line 80, in _run

  File "C:\Users\Dell\anaconda3\Lib\site-packages\ipykernel\kernelbase.py", line 516, in dispatch_queue

  File "C:\Users\Dell\anaconda3\Lib\site-packages\ipykernel\kernelbase.py", line 505, in process_one

  File "C:\Users\Dell\anaconda3\Lib\site-packages\ipykernel\kernelbase.py", line 412, in dispatch_shell

  File "C:\Users\Dell\anaconda3\Lib\site-packages\ipykernel\kernelbase.py", line 740, in execute_request

  File "C:\Users\Dell\anaconda3\Lib\site-packages\ipykernel\ipkernel.py", line 422, in do_execute

  File "C:\Users\Dell\anaconda3\Lib\site-packages\ipykernel\zmqshell.py", line 546, in run_cell

  File "C:\Users\Dell\anaconda3\Lib\site-packages\IPython\core\interactiveshell.py", line 3024, in run_cell

  File "C:\Users\Dell\anaconda3\Lib\site-packages\IPython\core\interactiveshell.py", line 3079, in _run_cell

  File "C:\Users\Dell\anaconda3\Lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner

  File "C:\Users\Dell\anaconda3\Lib\site-packages\IPython\core\interactiveshell.py", line 3284, in run_cell_async

  File "C:\Users\Dell\anaconda3\Lib\site-packages\IPython\core\interactiveshell.py", line 3466, in run_ast_nodes

  File "C:\Users\Dell\anaconda3\Lib\site-packages\IPython\core\interactiveshell.py", line 3526, in run_code

  File "C:\Users\Dell\AppData\Local\Temp\ipykernel_29536\792910659.py", line 84, in <module>

  File "C:\Users\Dell\anaconda3\Lib\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "C:\Users\Dell\anaconda3\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 329, in fit

  File "C:\Users\Dell\anaconda3\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 122, in one_step_on_iterator

  File "C:\Users\Dell\anaconda3\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 110, in one_step_on_data

  File "C:\Users\Dell\anaconda3\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 57, in train_step

  File "C:\Users\Dell\anaconda3\Lib\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "C:\Users\Dell\anaconda3\Lib\site-packages\keras\src\layers\layer.py", line 826, in __call__

  File "C:\Users\Dell\anaconda3\Lib\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "C:\Users\Dell\anaconda3\Lib\site-packages\keras\src\ops\operation.py", line 48, in __call__

  File "C:\Users\Dell\anaconda3\Lib\site-packages\keras\src\utils\traceback_utils.py", line 156, in error_handler

  File "C:\Users\Dell\anaconda3\Lib\site-packages\keras\src\models\functional.py", line 199, in call

  File "C:\Users\Dell\anaconda3\Lib\site-packages\keras\src\ops\function.py", line 151, in _run_through_graph

  File "C:\Users\Dell\anaconda3\Lib\site-packages\keras\src\models\functional.py", line 583, in call

  File "C:\Users\Dell\anaconda3\Lib\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "C:\Users\Dell\anaconda3\Lib\site-packages\keras\src\layers\layer.py", line 826, in __call__

  File "C:\Users\Dell\anaconda3\Lib\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "C:\Users\Dell\anaconda3\Lib\site-packages\keras\src\ops\operation.py", line 48, in __call__

  File "C:\Users\Dell\anaconda3\Lib\site-packages\keras\src\utils\traceback_utils.py", line 156, in error_handler

  File "C:\Users\Dell\anaconda3\Lib\site-packages\keras\src\layers\attention\attention.py", line 227, in call

  File "C:\Users\Dell\anaconda3\Lib\site-packages\keras\src\layers\attention\attention.py", line 175, in _apply_scores

Incompatible shapes: [64,465,466] vs. [64,466]
	 [[{{node functional_9_1/attention_4_1/sub}}]] [Op:__inference_one_step_on_iterator_15422]

In [None]:
from keras.models import Model
from keras.layers import Input, LSTM, Embedding, Dense, Concatenate, Attention
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np

# Assuming vocabulary size, embedding size, and latent dimensions are defined
vocab_size = 10000  # Adjust based on your tokenizer
embed_size = 128
latent_dim = 256
max_seq_length = 466  # Adjust based on your data

# Encoder
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(vocab_size, embed_size, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(vocab_size, embed_size, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Training (Simulated without teacher forcing)
# Start with the start-of-sequence token as input and generate each next token
for epoch in range(10):  # Number of epochs
    for seq_index in range(len(context_seq)):  # Iterate over each sequence
        input_seq = context_seq[seq_index:seq_index+1]
        target_seq = np.zeros((1, 1))  # Start with start-of-sequence token
        target_seq[0, 0] = response_tokenizer.word_index['<sos>']  # Assuming <sos> is your start token
        
        for i in range(max_seq_length - 1):  # Generate sequence
            output_tokens = model.predict([input_seq, target_seq])
            sampled_token_index = np.argmax(output_tokens[0, -1, :])
            target_seq = np.zeros((1, 1))
            target_seq[0, 0] = sampled_token_index
            
            if sampled_token_index == response_tokenizer.word_index['<eos>']:  # Assuming <eos> is your end token
                break

# This simplified loop shows the concept and isn't a full training loop as it doesn't update weights based on error.
