In [1]:
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Concatenate
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


2025-04-20 11:11:34.046021: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745147494.268084      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745147494.331886      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
import json
import os
import urllib.request
import random
from sklearn.model_selection import train_test_split

# ---------------------------------------------------------------------------
# Download SQuAD v1.1, keep one (question, answer) pair per unique context,
# sample a fixed-size subset and write an 80/10/10 train/val/test split.
# ---------------------------------------------------------------------------

# One seed drives the shuffle AND both train_test_split calls, so the sampled
# subset and the resulting splits are identical after "Restart & Run All".
SEED = 42
# Upper bound on the number of unique contexts kept after shuffling.
MAX_CONTEXTS = 20000

url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json"
filename = "train-v1.1.json"

if not os.path.exists(filename):
    print("Downloading SQuAD v1.1 dataset...")
    urllib.request.urlretrieve(url, filename)
    print("Download complete!")
else:
    print("SQuAD dataset already downloaded.")

# Read via `filename` (not a duplicated literal) and decode explicitly as
# UTF-8 so the result does not depend on the platform's default encoding.
with open(filename, "r", encoding="utf-8") as f:
    squad_data = json.load(f)

context_to_qa = {}

# Group one QA per unique context: the first question that has at least one
# answer wins. The answer text is wrapped in '<' ... '>' which serve as the
# start / end markers for the character-level decoder.
for article in squad_data['data']:
    for paragraph in article['paragraphs']:
        context = paragraph['context']
        if context not in context_to_qa:
            for qa in paragraph['qas']:
                if qa['answers']:
                    context_to_qa[context] = (qa['question'], f"<{qa['answers'][0]['text']}>")
                    break

unique_contexts = list(context_to_qa.items())
# Seeded, private RNG: reproducible and does not disturb the global `random` state.
random.Random(SEED).shuffle(unique_contexts)

sampled_contexts = unique_contexts[:MAX_CONTEXTS]

contexts = [c for c, _ in sampled_contexts]
questions = [q for _, (q, _) in sampled_contexts]
answers = [a for _, (_, a) in sampled_contexts]

# 80% train, 20% held out ...
contexts_train, contexts_temp, questions_train, questions_temp, answers_train, answers_temp = train_test_split(
    contexts, questions, answers, test_size=0.2, random_state=SEED
)

# ... and the held-out part is halved into validation and test (10% each).
contexts_val, contexts_test, questions_val, questions_test, answers_val, answers_test = train_test_split(
    contexts_temp, questions_temp, answers_temp, test_size=0.5, random_state=SEED
)

# Save training, validation and testing sets, one JSON file per split.
for split_path, (split_contexts, split_questions, split_answers) in (
    ("train_data.json", (contexts_train, questions_train, answers_train)),
    ("val_data.json", (contexts_val, questions_val, answers_val)),
    ("test_data.json", (contexts_test, questions_test, answers_test)),
):
    with open(split_path, "w") as f:
        json.dump({"contexts": split_contexts, "questions": split_questions, "answers": split_answers}, f)

print("Data split completed.")


Downloading SQuAD v1.1 dataset...
Download complete!
Data split completed.


In [3]:
import json
import pickle


def _pad_post(sequences, maxlen):
    """Right-pad integer sequences with zeros to exactly ``maxlen`` steps.

    NOTE: ``pad_sequences`` is left at its default ``truncating='pre'``, so a
    sequence longer than ``maxlen`` (possible for validation/test data, whose
    lengths were not used to compute the maxima) loses its leading tokens.
    """
    return pad_sequences(sequences, maxlen=maxlen, padding='post')


def _encode_split(tokenizer, texts, maxlen):
    """Convert ``texts`` to id sequences with ``tokenizer``; return ``(sequences, padded)``."""
    sequences = tokenizer.texts_to_sequences(texts)
    return sequences, _pad_post(sequences, maxlen)


def _decoder_io(padded_answers):
    """Build the teacher-forcing pair from padded answers.

    Returns ``(decoder_input, decoder_target)``: the input drops the last time
    step, the target drops the first one (i.e. is shifted one step ahead) and
    gets a trailing axis of size 1.
    """
    return padded_answers[:, :-1], np.expand_dims(padded_answers[:, 1:], -1)


# Use char-level tokenizers (case-sensitive), fitted on the training split only.
context_tokenizer = Tokenizer(char_level=True, lower=False)
question_tokenizer = Tokenizer(char_level=True, lower=False)
answer_tokenizer = Tokenizer(char_level=True, lower=False, filters='')

context_tokenizer.fit_on_texts(contexts_train)
question_tokenizer.fit_on_texts(questions_train)
answer_tokenizer.fit_on_texts(answers_train)

# ----- Training split: its longest sequences define the padded lengths -----
context_sequences = context_tokenizer.texts_to_sequences(contexts_train)
question_sequences = question_tokenizer.texts_to_sequences(questions_train)
answer_sequences = answer_tokenizer.texts_to_sequences(answers_train)

max_context_len = max(len(seq) for seq in context_sequences)
max_question_len = max(len(seq) for seq in question_sequences)
max_answer_len = max(len(seq) for seq in answer_sequences)

context_padded = _pad_post(context_sequences, max_context_len)
question_padded = _pad_post(question_sequences, max_question_len)
answer_padded = _pad_post(answer_sequences, max_answer_len)

decoder_input_data, decoder_target_data = _decoder_io(answer_padded)

# ----- Validation split (same tokenizers, same lengths) -----
context_sequences_val, context_padded_val = _encode_split(context_tokenizer, contexts_val, max_context_len)
question_sequences_val, question_padded_val = _encode_split(question_tokenizer, questions_val, max_question_len)
answer_sequences_val, answer_padded_val = _encode_split(answer_tokenizer, answers_val, max_answer_len)

decoder_input_data_val, decoder_target_data_val = _decoder_io(answer_padded_val)

# ----- Test split (same tokenizers, same lengths) -----
context_sequences_test, context_padded_test = _encode_split(context_tokenizer, contexts_test, max_context_len)
question_sequences_test, question_padded_test = _encode_split(question_tokenizer, questions_test, max_question_len)
answer_sequences_test, answer_padded_test = _encode_split(answer_tokenizer, answers_test, max_answer_len)

decoder_input_data_test, decoder_target_data_test = _decoder_io(answer_padded_test)

# Save tokenizers so inference code can rebuild the exact same vocabularies.
for tokenizer_path, tokenizer_obj in (
    ('context_tokenizer.pkl', context_tokenizer),
    ('question_tokenizer.pkl', question_tokenizer),
    ('answer_tokenizer.pkl', answer_tokenizer),
):
    with open(tokenizer_path, 'wb') as f:
        pickle.dump(tokenizer_obj, f)

# Save the padded lengths alongside the tokenizers.
max_lengths = {
    'max_context_len': max_context_len,
    'max_question_len': max_question_len,
    'max_answer_len': max_answer_len
}

with open('max_lengths.json', 'w') as f:
    json.dump(max_lengths, f)


In [4]:
import os
import matplotlib.pyplot as plt
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Concatenate
from tensorflow.keras.optimizers import Adam

# Build the training graph: two LSTM encoders (context, question) whose final
# states are concatenated and used to initialise a single LSTM decoder that is
# trained with teacher forcing.
#
# Assuming tokenizers are already fitted.
# Vocabulary sizes are len(word_index) + 1 because Keras Tokenizer ids start
# at 1; index 0 is the padding value, which the Embedding layers below mask
# out via mask_zero=True.
vocab_context = len(context_tokenizer.word_index) + 1
vocab_question = len(question_tokenizer.word_index) + 1
vocab_answer = len(answer_tokenizer.word_index) + 1

# Used as the embedding size, the encoder LSTM width and the hidden Dense
# width; the decoder LSTM uses twice this value (see below).
dense_units = 256

# ===== Encoder =====
# Input lengths are taken from the padded training arrays.
context_input = Input(shape=(context_padded.shape[1],), name="context_input")
question_input = Input(shape=(question_padded.shape[1],), name="question_input")

context_embedding = Embedding(vocab_context, dense_units, mask_zero=True)(context_input)
question_embedding = Embedding(vocab_question, dense_units, mask_zero=True)(question_input)

# Only the final hidden/cell states are kept; the per-step output is discarded.
context_lstm = LSTM(dense_units, return_state=True,dropout=0.3)
_, context_h, context_c = context_lstm(context_embedding)

question_lstm = LSTM(dense_units, return_state=True,dropout=0.3)
_, question_h, question_c = question_lstm(question_embedding)

# Context and question states are joined feature-wise, giving states of
# width dense_units * 2 that seed the decoder.
state_h = Concatenate()([context_h, question_h])  # width: dense_units * 2
state_c = Concatenate()([context_c, question_c])
encoder_states = [state_h, state_c]

# ===== Decoder (Training) =====
# Variable-length input (shape=(None,)) so the same layers can also be driven
# one token at a time by the inference decoder.
decoder_input = Input(shape=(None,), name="decoder_input")
decoder_embedding_layer = Embedding(vocab_answer, dense_units, mask_zero=True)
decoder_embedding = decoder_embedding_layer(decoder_input)

# dense_units * 2 units so the decoder state matches the concatenated encoder states.
decoder_lstm = LSTM(dense_units * 2, return_sequences=True, return_state=True,dropout=0.3)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

# Shared Dense layers: kept as named layer objects (not inlined) because the
# inference decoder model reuses them, together with decoder_embedding_layer
# and decoder_lstm.
decoder_dense_hidden_layer = Dense(dense_units, activation='relu', name='decoder_dense_hidden')
decoder_dense_output_layer = Dense(vocab_answer, activation='softmax', name='decoder_dense_output')

decoder_dense_hidden = decoder_dense_hidden_layer(decoder_outputs)
decoder_dense_output = decoder_dense_output_layer(decoder_dense_hidden)

# ===== Final Training Model =====
# Inputs: [context ids, question ids, decoder input ids]; output: a softmax
# over the answer vocabulary at every decoder step. Targets are integer ids,
# hence the sparse categorical cross-entropy loss.
model = Model([context_input, question_input, decoder_input], decoder_dense_output)
model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])




I0000 00:00:1745147514.324309      19 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1745147514.325031      19 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


In [5]:
# Train with teacher forcing: the encoders see the padded context/question,
# the decoder sees the answer without its last step and is scored against the
# answer shifted one step ahead. The validation split is evaluated every epoch.
train_inputs = [context_padded, question_padded, decoder_input_data]
val_inputs = [context_padded_val, question_padded_val, decoder_input_data_val]

history = model.fit(
    x=train_inputs,
    y=decoder_target_data,
    validation_data=(val_inputs, decoder_target_data_val),
    epochs=80,
    batch_size=32,
    verbose=2,
)


Epoch 1/80


I0000 00:00:1745147524.900663      62 cuda_dnn.cc:529] Loaded cuDNN version 90300


473/473 - 68s - 143ms/step - accuracy: 0.0240 - loss: 2.6212 - val_accuracy: 0.0296 - val_loss: 2.3186
Epoch 2/80
473/473 - 59s - 126ms/step - accuracy: 0.0319 - loss: 2.2206 - val_accuracy: 0.0342 - val_loss: 2.1330
Epoch 3/80
473/473 - 61s - 129ms/step - accuracy: 0.0363 - loss: 2.0519 - val_accuracy: 0.0383 - val_loss: 1.9896
Epoch 4/80
473/473 - 62s - 131ms/step - accuracy: 0.0400 - loss: 1.9180 - val_accuracy: 0.0414 - val_loss: 1.8808
Epoch 5/80
473/473 - 63s - 132ms/step - accuracy: 0.0429 - loss: 1.8065 - val_accuracy: 0.0434 - val_loss: 1.8009
Epoch 6/80
473/473 - 63s - 133ms/step - accuracy: 0.0451 - loss: 1.7173 - val_accuracy: 0.0443 - val_loss: 1.7578
Epoch 7/80
473/473 - 63s - 134ms/step - accuracy: 0.0468 - loss: 1.6447 - val_accuracy: 0.0455 - val_loss: 1.7134
Epoch 8/80
473/473 - 63s - 134ms/step - accuracy: 0.0482 - loss: 1.5811 - val_accuracy: 0.0461 - val_loss: 1.6942
Epoch 9/80
473/473 - 64s - 134ms/step - accuracy: 0.0494 - loss: 1.5272 - val_accuracy: 0.0467 - va

In [6]:
# Build the two models used for step-by-step decoding. Both reuse the layers
# (and therefore the weights) of the training graph.

# ===== Encoder Inference Model =====
# Maps (context, question) to the concatenated [state_h, state_c] that seed the decoder.
encoder_model = Model([context_input, question_input], encoder_states)

# ===== Decoder Inference Model =====
# NOTE(review): a URL had been pasted into the middle of the header above
# (https://www.kaggle.com/code/kikamagdii/fork-of-nlp-project-trials);
# kept here in case it was meant as a reference.
decoder_inputs_inf = Input(shape=(1,), name="decoder_input_inf")  # one token at a time
# The previous step's states are fed in explicitly; their width matches the
# decoder LSTM (dense_units * 2).
decoder_state_input_h = Input(shape=(dense_units * 2,), name="decoder_state_input_h")
decoder_state_input_c = Input(shape=(dense_units * 2,), name="decoder_state_input_c")
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Same embedding layer object as in the training decoder.
decoder_embedding_inf = decoder_embedding_layer(decoder_inputs_inf)

# Same LSTM layer object; the updated states are returned so the caller can
# feed them back in on the next step.
decoder_outputs_inf, state_h_inf, state_c_inf = decoder_lstm(
    decoder_embedding_inf, initial_state=decoder_states_inputs
)

# Same Dense stack as in training: hidden ReLU layer, then softmax over the answer vocabulary.
decoder_hidden_inf = decoder_dense_hidden_layer(decoder_outputs_inf)
decoder_outputs_final = decoder_dense_output_layer(decoder_hidden_inf)

# Inputs: [token, h, c]; outputs: [token probabilities, new h, new c].
decoder_model = Model(
    [decoder_inputs_inf, decoder_state_input_h, decoder_state_input_c],
    [decoder_outputs_final, state_h_inf, state_c_inf]
)

# Save encoder model
encoder_model.save("encoder_model.h5")

# Save decoder model
decoder_model.save("decoder_model.h5")

In [7]:

def generate_answers(input_context, input_question):
    """Greedily decode an answer string for one (context, question) pair.

    The context and question are tokenized and post-padded exactly like the
    training data, encoded once with ``encoder_model``, and then
    ``decoder_model`` is run one character at a time, starting from the
    ``'<'`` start marker and always feeding back the most probable character.

    Decoding stops at the first of:
      * the ``'>'`` end marker,
      * the padding index ``0`` (it maps to no character, so nothing further
        can be decoded from it),
      * ``max_answer_len + 1`` decoding steps.

    The step bound guarantees termination. Bounding only the length of the
    decoded string is not enough: an index with no entry in ``index_word``
    decodes to ``''`` and never makes the string longer.

    Args:
        input_context: Raw context paragraph (str).
        input_question: Raw question (str).

    Returns:
        The decoded answer with the start/end markers removed and surrounding
        whitespace stripped; at most ``max_answer_len + 1`` characters.
    """
    context_seq = context_tokenizer.texts_to_sequences([input_context])
    context_seq = pad_sequences(context_seq, maxlen=max_context_len, padding='post')

    question_seq = question_tokenizer.texts_to_sequences([input_question])
    question_seq = pad_sequences(question_seq, maxlen=max_question_len, padding='post')

    # [h, c] from the encoder; list() so it can be concatenated with the token input below.
    states_value = list(encoder_model.predict([context_seq, question_seq], verbose=0))

    # Seed the decoder with the start-of-answer marker.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = answer_tokenizer.word_index['<']

    decoded_chars = []
    # Bound the number of *steps*, not the number of emitted characters.
    for _ in range(max_answer_len + 1):
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value, verbose=0)
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))

        if sampled_token_index == 0:
            # Padding index: treat as end of answer.
            break

        sampled_char = answer_tokenizer.index_word.get(sampled_token_index, '')
        if sampled_char == '>':
            break
        decoded_chars.append(sampled_char)

        # Feed the chosen character and the updated states into the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return ''.join(decoded_chars).strip()


In [8]:
!pip install nltk python-Levenshtein
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import Levenshtein

nltk.download('punkt')  # only if you need it for tokenization

def evaluate_model(data_contexts, data_questions, data_answers, dataset_name="Test", max_samples=300):
    """Print average character-level BLEU and Levenshtein distance for a dataset.

    For each of the first ``max_samples`` examples an answer is generated with
    ``generate_answers`` and compared with the reference answer (with its
    ``'<'`` / ``'>'`` markers stripped). The first five examples are printed
    in full, followed by the two averages.

    The averages are taken over the number of examples that were actually
    evaluated, not over the size of the whole dataset.

    Args:
        data_contexts: Sequence of context strings.
        data_questions: Sequence of question strings, aligned with the contexts.
        data_answers: Sequence of reference answers wrapped in ``<...>``.
        dataset_name: Label used in the printed report.
        max_samples: Maximum number of examples to evaluate (default 300,
            the previously hard-coded value). ``None`` evaluates everything.

    Returns:
        None. Results are printed.
    """
    total_bleu = 0.0
    total_levenshtein = 0.0
    smooth = SmoothingFunction().method4  # for short sequences

    # Never index past the shortest input, and honour the optional cap.
    available = min(len(data_contexts), len(data_questions), len(data_answers))
    num_samples = available if max_samples is None else min(max_samples, available)

    print(f"\nEvaluating on {dataset_name} Set...\n")

    if num_samples <= 0:
        print(f"No samples to evaluate for {dataset_name} Set.")
        return

    for i in range(num_samples):
        pred = generate_answers(data_contexts[i], data_questions[i])
        true = data_answers[i].strip('<> ')  # Remove start/end tokens

        # Character-level BLEU: each character is treated as a token.
        reference = [list(true)]
        candidate = list(pred)
        bleu = sentence_bleu(reference, candidate, smoothing_function=smooth)
        total_bleu += bleu

        # Levenshtein distance
        lev_distance = Levenshtein.distance(pred, true)
        total_levenshtein += lev_distance

        # Optional: print first few results
        if i < 5:
            print(f"Context    : {data_contexts[i]}")
            print(f"Question   : {data_questions[i]}")
            print(f"True Answer: {true}")
            print(f"Predicted  : {pred}")
            print(f"BLEU Score : {bleu:.4f}")
            print(f"Levenshtein Distance: {lev_distance}")
            print("-" * 60)

    # Divide by the number of evaluated examples (the loop length above).
    avg_bleu = total_bleu / num_samples
    avg_lev = total_levenshtein / num_samples
    print(f"\n{dataset_name} Set Average BLEU Score       : {avg_bleu:.4f}")
    print(f"{dataset_name} Set Average Levenshtein Distance: {avg_lev:.2f}")


Collecting python-Levenshtein
  Downloading python_levenshtein-0.27.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.27.1 (from python-Levenshtein)
  Downloading levenshtein-0.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.27.1->python-Levenshtein)
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading python_levenshtein-0.27.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (161 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m161.7/161.7 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m45.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collecte

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
# Report the held-out test split first, then the training split for comparison.
evaluation_splits = (
    ("Test", contexts_test, questions_test, answers_test),
    ("Train", contexts_train, questions_train, answers_train),
)
for split_name, split_contexts, split_questions, split_answers in evaluation_splits:
    evaluate_model(split_contexts, split_questions, split_answers, split_name)



Evaluating on Test Set...

Context    : North Carolina provides a large range of recreational activities, from swimming at the beach to skiing in the mountains. North Carolina offers fall colors, freshwater and saltwater fishing, hunting, birdwatching, agritourism, ATV trails, ballooning, rock climbing, biking, hiking, skiing, boating and sailing, camping, canoeing, caving (spelunking), gardens, and arboretums. North Carolina has theme parks, aquariums, museums, historic sites, lighthouses, elegant theaters, concert halls, and fine dining.
Question   : Fishing, hunting, and birdwatching are what kind of activities that are provided in North Carolina?
True Answer: recreational
Predicted  : Sanskrit dramas
BLEU Score : 0.0270
Levenshtein Distance: 13
------------------------------------------------------------
Context    : Two days later, it was announced that Luis Enrique would return to Barcelona as head coach, after he agreed to a two-year deal. He was recommended by sporting directo