In [1]:
import itertools

import numpy as np
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer


2025-04-15 18:57:36.981730: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744743457.239053      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744743457.313754      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
import json
import os
import urllib.request
import random
from sklearn.model_selection import train_test_split

# Download SQuAD v1.1 (once), keep one question/answer pair per unique context,
# and write reproducible train/validation/test splits (80/10/10) to JSON files.
url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json"
filename = "train-v1.1.json"

SEED = 42             # drives the shuffle and both splits so reruns yield the same data
MAX_CONTEXTS = 20000  # upper bound on how many unique contexts are kept

if not os.path.exists(filename):
    print("Downloading SQuAD v1.1 dataset...")
    # Download to a side file and rename afterwards, so an interrupted transfer
    # never leaves a truncated file that the next run mistakes for a finished one.
    partial_download = filename + ".part"
    urllib.request.urlretrieve(url, partial_download)
    os.replace(partial_download, filename)
    print("Download complete!")
else:
    print("SQuAD dataset already downloaded.")

# JSON is UTF-8; do not depend on the platform's default encoding.
with open(filename, "r", encoding="utf-8") as f:
    squad_data = json.load(f)

context_to_qa = {}

# Group one QA per unique context: the first question that has at least one answer.
# The answer text is wrapped in "<" / ">" which act as start/end markers.
for article in squad_data['data']:
    for paragraph in article['paragraphs']:
        context = paragraph['context']
        if context in context_to_qa:
            continue
        for qa in paragraph['qas']:
            if qa['answers']:
                context_to_qa[context] = (qa['question'], f"<{qa['answers'][0]['text']}>")
                break

unique_contexts = list(context_to_qa.items())
# A dedicated, seeded generator keeps the shuffle reproducible without touching
# the global `random` state used elsewhere.
random.Random(SEED).shuffle(unique_contexts)

sampled_contexts = unique_contexts[:MAX_CONTEXTS]

contexts = [c for c, _ in sampled_contexts]
questions = [q for _, (q, _) in sampled_contexts]
answers = [a for _, (_, a) in sampled_contexts]

# 80% train, then the remaining 20% is halved into validation and test.
contexts_train, contexts_temp, questions_train, questions_temp, answers_train, answers_temp = train_test_split(
    contexts, questions, answers, test_size=0.2, random_state=SEED
)

contexts_val, contexts_test, questions_val, questions_test, answers_val, answers_test = train_test_split(
    contexts_temp, questions_temp, answers_temp, test_size=0.5, random_state=SEED
)

# Save training set
with open("train_data.json", "w", encoding="utf-8") as f:
    json.dump({"contexts": contexts_train, "questions": questions_train, "answers": answers_train}, f)

# Save validation set
with open("val_data.json", "w", encoding="utf-8") as f:
    json.dump({"contexts": contexts_val, "questions": questions_val, "answers": answers_val}, f)

# Save testing set
with open("test_data.json", "w", encoding="utf-8") as f:
    json.dump({"contexts": contexts_test, "questions": questions_test, "answers": answers_test}, f)

print("Data split completed.")


Downloading SQuAD v1.1 dataset...
Download complete!
Data split completed.


In [3]:

# Use char-level tokenizers, fitted on the training split only so that no
# information from validation/test leaks into the vocabularies.
# The answer tokenizer disables filtering so the "<" / ">" markers are kept.
context_tokenizer = Tokenizer(char_level=True, lower=False)
question_tokenizer = Tokenizer(char_level=True, lower=False)
answer_tokenizer = Tokenizer(char_level=True, lower=False, filters='')

context_tokenizer.fit_on_texts(contexts_train)
question_tokenizer.fit_on_texts(questions_train)
answer_tokenizer.fit_on_texts(answers_train)


def _encode_split(texts, tokenizer, maxlen=None, truncating='pre'):
    """Convert texts to integer sequences and post-pad them to a fixed width.

    Args:
        texts: list of strings to encode.
        tokenizer: a fitted Keras ``Tokenizer``.
        maxlen: target width; when ``None`` the longest sequence in ``texts``
            defines it (used for the training split).
        truncating: which end ``pad_sequences`` cuts when a sequence exceeds
            ``maxlen`` ('pre' is the Keras default).

    Returns:
        ``(sequences, padded)`` - the raw list of sequences and the padded array.
    """
    sequences = tokenizer.texts_to_sequences(texts)
    if maxlen is None:
        maxlen = max(len(seq) for seq in sequences)
    padded = pad_sequences(sequences, maxlen=maxlen, padding='post', truncating=truncating)
    return sequences, padded


def _teacher_forcing_pair(padded_answers):
    """Split padded answers into decoder inputs and one-step-shifted targets.

    The targets get a trailing axis of size 1, matching what
    ``sparse_categorical_crossentropy`` is fed later in the notebook.
    """
    return padded_answers[:, :-1], np.expand_dims(padded_answers[:, 1:], -1)


# --- Training split: its longest sequences define the padded widths. ---
context_sequences, context_padded = _encode_split(contexts_train, context_tokenizer)
question_sequences, question_padded = _encode_split(questions_train, question_tokenizer)
answer_sequences, answer_padded = _encode_split(answers_train, answer_tokenizer)

max_context_len = context_padded.shape[1]
max_question_len = question_padded.shape[1]
max_answer_len = answer_padded.shape[1]

decoder_input_data, decoder_target_data = _teacher_forcing_pair(answer_padded)

# --- Validation split (reuses the training widths). ---
# Answers longer than the training maximum are cut at the END: the default
# front truncation would drop the leading "<" start marker and misalign the
# decoder input with its target.
context_sequences_val, context_padded_val = _encode_split(contexts_val, context_tokenizer, max_context_len)
question_sequences_val, question_padded_val = _encode_split(questions_val, question_tokenizer, max_question_len)
answer_sequences_val, answer_padded_val = _encode_split(
    answers_val, answer_tokenizer, max_answer_len, truncating='post')

decoder_input_data_val, decoder_target_data_val = _teacher_forcing_pair(answer_padded_val)

# --- Test split (same treatment as validation). ---
context_sequences_test, context_padded_test = _encode_split(contexts_test, context_tokenizer, max_context_len)
question_sequences_test, question_padded_test = _encode_split(questions_test, question_tokenizer, max_question_len)
answer_sequences_test, answer_padded_test = _encode_split(
    answers_test, answer_tokenizer, max_answer_len, truncating='post')

decoder_input_data_test, decoder_target_data_test = _teacher_forcing_pair(answer_padded_test)


In [4]:
import matplotlib.pyplot as plt
import os
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Concatenate, Dropout
from tensorflow.keras.optimizers import RMSprop, Adam

# Grid search over learning rate, optimizer, batch size and layer width.
# For every combination: train the seq2seq model, save separate encoder and
# decoder inference models, and save the loss/accuracy curves as a PNG.

# Vocabulary sizes; +1 accounts for index 0, which the tokenizers never assign
# and which the embeddings below treat as padding (mask_zero=True).
vocab_context = len(context_tokenizer.word_index) + 1
vocab_question = len(question_tokenizer.word_index) + 1
vocab_answer = len(answer_tokenizer.word_index) + 1

learning_rates = [0.001, 0.0005, 0.0001]
# Factories rather than instances: each run needs a fresh optimizer object.
optimizers = {
    "rmsprop": lambda lr: RMSprop(learning_rate=lr),
    "adam": lambda lr: Adam(learning_rate=lr)
}
batch_sizes = [32, 64, 128]
dense_units_list = [vocab_answer//2,vocab_answer] 

os.makedirs("saved_models", exist_ok=True)
os.makedirs("training_curves", exist_ok=True)


def _save_training_curves(history, model_name, graph_path):
    """Plot training/validation loss and accuracy side by side and save to graph_path."""
    fig, (loss_ax, accuracy_ax) = plt.subplots(1, 2, figsize=(12, 6))

    # Loss Plot
    loss_ax.plot(history.history['loss'], label='Training Loss')
    loss_ax.plot(history.history['val_loss'], label='Validation Loss')
    loss_ax.set_title(f"Loss Curve ({model_name})")
    loss_ax.set_xlabel('Epochs')
    loss_ax.set_ylabel('Loss')
    loss_ax.legend()
    loss_ax.grid(True)

    # Accuracy Plot
    accuracy_ax.plot(history.history['accuracy'], label='Training Accuracy')
    accuracy_ax.plot(history.history['val_accuracy'], label='Validation Accuracy')
    accuracy_ax.set_title(f"Accuracy Curve ({model_name})")
    accuracy_ax.set_xlabel('Epochs')
    accuracy_ax.set_ylabel('Accuracy')
    accuracy_ax.legend()
    accuracy_ax.grid(True)

    fig.tight_layout()
    fig.savefig(graph_path)
    # Close explicitly: many figures are created in the loop below.
    plt.close(fig)


i = 1
# itertools.product iterates in the same order as the equivalent nested loops
# (learning rate outermost, dense_units innermost).
hyperparameter_grid = itertools.product(learning_rates, optimizers.items(), batch_sizes, dense_units_list)
for lr, (opt_name, opt), batch_size, dense_units in hyperparameter_grid:
    # Drop the Keras global state left by the previous run so memory does not
    # grow with every model built in this loop.
    K.clear_session()

    print(f"\nTraining model {i}: learning_rate={lr}, optimizer={opt_name}, batch_size={batch_size}, dense_units={dense_units}")

    # Encoder: context and question are encoded by separate LSTMs and their
    # final states are concatenated to initialise the decoder.
    context_input = Input(shape=(context_padded.shape[1],), name="context_input")
    question_input = Input(shape=(question_padded.shape[1],), name="question_input")

    context_embedding = Embedding(vocab_context, dense_units, mask_zero=True)(context_input)
    question_embedding = Embedding(vocab_question, dense_units, mask_zero=True)(question_input)

    context_lstm = LSTM(dense_units, return_state=True, dropout=0.3)
    _, context_h, context_c = context_lstm(context_embedding)

    question_lstm = LSTM(dense_units, return_state=True, dropout=0.3)
    _, question_h, question_c = question_lstm(question_embedding)

    state_h = Concatenate()([context_h, question_h])
    state_c = Concatenate()([context_c, question_c])
    encoder_states = [state_h, state_c]

    # Decoder: twice as wide as each encoder because it receives the
    # concatenation of both encoders' states.
    decoder_input = Input(shape=(None,), name="decoder_input")
    decoder_embedding_layer = Embedding(vocab_answer, dense_units, mask_zero=True)
    decoder_embedding = decoder_embedding_layer(decoder_input)

    decoder_lstm = LSTM(dense_units * 2, return_sequences=True, return_state=True, dropout=0.3)
    decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

    # Keep the layer objects: the inference decoder below must reuse these
    # exact (trained) layers rather than create new ones.
    decoder_hidden_layer = Dense(dense_units, activation='relu')
    decoder_output_layer = Dense(vocab_answer, activation='softmax')
    decoder_dense_hidden = decoder_hidden_layer(decoder_outputs)
    decoder_dense_output = decoder_output_layer(decoder_dense_hidden)

    model = Model([context_input, question_input, decoder_input], decoder_dense_output)
    model.compile(optimizer=opt(lr), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    history = model.fit(
        [context_padded, question_padded, decoder_input_data],
        decoder_target_data,
        batch_size=batch_size,
        epochs=10,
        validation_data=([context_padded_val, question_padded_val, decoder_input_data_val], decoder_target_data_val),
        verbose=2
    )

    model_name = f"model_lr{lr}_opt{opt_name}_bs{batch_size}_du{dense_units}"
    model_path = f"saved_models/{model_name}"

    # Save encoder inference model
    encoder_model = Model([context_input, question_input], encoder_states)
    encoder_model.save(f"{model_path}_encoder.h5")

    # Save decoder inference model: one step at a time, with the LSTM state
    # passed in and returned explicitly.
    decoder_state_input_h = Input(shape=(dense_units * 2,))
    decoder_state_input_c = Input(shape=(dense_units * 2,))
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

    decoder_inputs_inf = Input(shape=(1,))
    decoder_embedded_inf = decoder_embedding_layer(decoder_inputs_inf)

    decoder_outputs_inf, state_h_inf, state_c_inf = decoder_lstm(
        decoder_embedded_inf, initial_state=decoder_states_inputs)

    # Reuse the trained projection layers. Instantiating new Dense layers here
    # would save a decoder whose output head is randomly initialised.
    decoder_dense_hidden_inf = decoder_hidden_layer(decoder_outputs_inf)
    decoder_outputs_final = decoder_output_layer(decoder_dense_hidden_inf)

    decoder_model = Model(
        [decoder_inputs_inf, decoder_state_input_h, decoder_state_input_c],
        [decoder_outputs_final, state_h_inf, state_c_inf]
    )

    decoder_model.save(f"{model_path}_decoder.h5")

    # Plot training & validation loss and accuracy
    graph_path = f"training_curves/loss_accuracy_curve_{model_name}.png"
    _save_training_curves(history, model_name, graph_path)

    i += 1



Training model 1: learning_rate=0.001, optimizer=rmsprop, batch_size=32, dense_units=97


I0000 00:00:1744743478.485046      19 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1744743478.485771      19 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


Epoch 1/10


I0000 00:00:1744743488.277861      63 cuda_dnn.cc:529] Loaded cuDNN version 90300


473/473 - 43s - 91ms/step - accuracy: 0.0253 - loss: 2.9803 - val_accuracy: 0.0295 - val_loss: 2.6481
Epoch 2/10
473/473 - 33s - 69ms/step - accuracy: 0.0333 - loss: 2.5023 - val_accuracy: 0.0353 - val_loss: 2.4683
Epoch 3/10
473/473 - 33s - 69ms/step - accuracy: 0.0367 - loss: 2.3829 - val_accuracy: 0.0390 - val_loss: 2.3710
Epoch 4/10
473/473 - 32s - 69ms/step - accuracy: 0.0390 - loss: 2.3053 - val_accuracy: 0.0403 - val_loss: 2.3120
Epoch 5/10
473/473 - 33s - 69ms/step - accuracy: 0.0408 - loss: 2.2463 - val_accuracy: 0.0406 - val_loss: 2.2906
Epoch 6/10
473/473 - 33s - 69ms/step - accuracy: 0.0425 - loss: 2.1964 - val_accuracy: 0.0445 - val_loss: 2.1952
Epoch 7/10
473/473 - 33s - 69ms/step - accuracy: 0.0442 - loss: 2.1501 - val_accuracy: 0.0459 - val_loss: 2.1671
Epoch 8/10
473/473 - 33s - 70ms/step - accuracy: 0.0455 - loss: 2.1087 - val_accuracy: 0.0475 - val_loss: 2.1247
Epoch 9/10
473/473 - 33s - 70ms/step - accuracy: 0.0470 - loss: 2.0704 - val_accuracy: 0.0484 - val_loss: 2