In [None]:
import json
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
import urllib.request
import os

2025-04-21 09:14:08.425556: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745226848.920774      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745226849.062094      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json"
filename = "train-v1.1.json"

if not os.path.exists(filename):
    print("Downloading SQuAD v1.1 dataset...")
    urllib.request.urlretrieve(url, filename)
    print("Download complete!")
else:
    print("SQuAD dataset already downloaded.")

Downloading SQuAD v1.1 dataset...
Download complete!


In [None]:
import random
from sklearn.model_selection import train_test_split

with open("train-v1.1.json", "r") as f:
    squad_data = json.load(f)


context_to_qa = {}
for article in squad_data['data']:
    for paragraph in article['paragraphs']:
        context = paragraph['context']
        if context not in context_to_qa:
            for qa in paragraph['qas']:
                if qa['answers']:
                    context_to_qa[context] = (qa['question'], qa['answers'][0]['text'])
                    break 

unique_contexts = list(context_to_qa.items())
random.shuffle(unique_contexts)
sampled_contexts = unique_contexts[:20000]

contexts = [c for c, _ in sampled_contexts]
questions = [q for _, (q, _) in sampled_contexts]
answers = [f"<start> {a} <end>" for _, (_, a) in sampled_contexts]

contexts_train, contexts_temp, questions_train, questions_temp, answers_train, answers_temp = train_test_split(
    contexts, questions, answers, test_size=0.2, random_state=42
)

contexts_val, contexts_test, questions_val, questions_test, answers_val, answers_test = train_test_split(
    contexts_temp, questions_temp, answers_temp, test_size=0.5, random_state=42
)

In [None]:
tokenizer = Tokenizer(oov_token="<OOV>",filters='')
tokenizer.fit_on_texts(contexts_train + questions_train + answers_train)
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1

context_sequences = tokenizer.texts_to_sequences(contexts_train)
context_sequences_val = tokenizer.texts_to_sequences(contexts_val)
context_sequences_test = tokenizer.texts_to_sequences(contexts_test)

question_sequences = tokenizer.texts_to_sequences(questions_train)
question_sequences_val = tokenizer.texts_to_sequences(questions_val)
question_sequences_test = tokenizer.texts_to_sequences(questions_test)

answer_sequences = tokenizer.texts_to_sequences(answers_train)
answer_sequences_val = tokenizer.texts_to_sequences(answers_val)
answer_sequences_test = tokenizer.texts_to_sequences(answers_test)

max_context_len = max(len(c.split()) for c in contexts)
max_question_len = max(len(q.split()) for q in questions)
max_answer_len = max(len(a.split()) for a in answers)

encoder_input_context_data = pad_sequences(context_sequences, maxlen=max_context_len, padding='post')
encoder_input_question_data = pad_sequences(question_sequences, maxlen=max_question_len, padding='post')
decoder_input_data = pad_sequences(answer_sequences, maxlen=max_answer_len, padding='post')

context_data_val = pad_sequences(context_sequences_val, maxlen=max_context_len, padding='post')
question_data_val = pad_sequences(question_sequences_val, maxlen=max_question_len, padding='post')
answer_data_val = pad_sequences(answer_sequences_val, maxlen=max_answer_len, padding='post')

context_data_test = pad_sequences(context_sequences_test, maxlen=max_context_len, padding='post')
question_data_test = pad_sequences(question_sequences_test, maxlen=max_question_len, padding='post')
answer_data_test = pad_sequences(answer_sequences_test, maxlen=max_answer_len, padding='post')

decoder_target_data = decoder_input_data[:, 1:]
decoder_target_data = np.pad(decoder_target_data, ((0, 0), (0, 1)), mode='constant')

In [None]:
import matplotlib.pyplot as plt
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Concatenate, Bidirectional
from tensorflow.keras.models import Model
import json


embedding_dims = [128]
latent_dims = [64]
dense_units = [vocab_size]
optimizers = ['adam']
learning_rates = [0.001]
batch_sizes = [32]

results = []

for embedding_dim in embedding_dims:
    for latent_dim in latent_dims:
        for dense_unit in dense_units:
            for optimizer_name in optimizers:
                for learning_rate in learning_rates:
                    for batch_size in batch_sizes:

                        if optimizer_name == 'adam':
                            optimizer = Adam(learning_rate=learning_rate, clipnorm = 1.0)
                        elif optimizer_name == 'rmsprop':
                            optimizer = RMSprop(learning_rate=learning_rate, clipnorm = 1.0)

                        context_inputs = Input(shape=(max_context_len,))
                        question_inputs = Input(shape=(max_question_len,))
                        decoder_inputs = Input(shape=(max_answer_len,))

                        embedding_layer = Embedding(vocab_size, embedding_dim, mask_zero=True)
                        context_emb = embedding_layer(context_inputs)
                        question_emb = embedding_layer(question_inputs)
                        decoder_emb = embedding_layer(decoder_inputs)

                        _, forward_context_h, forward_context_c, backward_context_h, backward_context_c = Bidirectional(
                            LSTM(latent_dim, return_state=True)
                        )(context_emb)
                        _, forward_question_h, forward_question_c, backward_question_h, backward_question_c = Bidirectional(
                            LSTM(latent_dim, return_state=True)
                        )(question_emb)

                        merged_h = Concatenate()([forward_context_h, backward_context_h, forward_question_h, backward_question_h])
                        merged_c = Concatenate()([forward_context_c, backward_context_c, forward_question_c, backward_question_c])

                        decoder_lstm = LSTM(latent_dim * 4, return_sequences=True, return_state=True)
                        decoder_outputs, _, _ = decoder_lstm(decoder_emb, initial_state=[merged_h, merged_c])

                        decoder_dense = Dense(dense_unit, activation='softmax')
                        decoder_outputs = decoder_dense(decoder_outputs)

                        model = Model([context_inputs, question_inputs, decoder_inputs], decoder_outputs)
                        model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])

                        history = model.fit(
                            [encoder_input_context_data, encoder_input_question_data, decoder_input_data],
                            decoder_target_data,
                            epochs=35,
                            batch_size=batch_size,
                            validation_split=0.1,
                            verbose=2
                        )

                        results.append({
                            'embedding_dim': embedding_dim,
                            'latent_dim': latent_dim,
                            'dense_unit': dense_unit,
                            'optimizer': optimizer_name,
                            'learning_rate': learning_rate,
                            'batch_size': batch_size,
                            'history': history.history
                        })

                        plt.figure()
                        plt.plot(history.history['sparse_categorical_accuracy'], label='Train Accuracy')
                        plt.plot(history.history['val_sparse_categorical_accuracy'], label='Validation Accuracy')
                        plt.title(f"Accuracy (Emb={embedding_dim}, Lat={latent_dim}, Dense={dense_unit}, Opt={optimizer_name}, LR={learning_rate}, Batch={batch_size})")
                        plt.xlabel('Epochs')
                        plt.ylabel('Accuracy')
                        plt.legend()
                        graph_path = f"accuracy_emb{embedding_dim}_lat{latent_dim}_dense{dense_unit}_opt{optimizer_name}_lr{learning_rate}_batch{batch_size}.png"
                        plt.savefig(graph_path)
                        plt.close()

with open('hyperparameter_results.json', 'w') as f:
    json.dump(results, f)

I0000 00:00:1745226878.520202      19 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1745226878.520987      19 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


Epoch 1/35


I0000 00:00:1745226890.592519      62 cuda_dnn.cc:529] Loaded cuDNN version 90300


425/425 - 217s - 511ms/step - loss: 6.4756 - sparse_categorical_accuracy: 0.2231 - val_loss: 5.7443 - val_sparse_categorical_accuracy: 0.0469
Epoch 2/35
425/425 - 210s - 495ms/step - loss: 5.1265 - sparse_categorical_accuracy: 0.0477 - val_loss: 5.7948 - val_sparse_categorical_accuracy: 0.0474
Epoch 3/35
425/425 - 210s - 494ms/step - loss: 4.7682 - sparse_categorical_accuracy: 0.0480 - val_loss: 5.9040 - val_sparse_categorical_accuracy: 0.0474
Epoch 4/35
425/425 - 210s - 495ms/step - loss: 4.4265 - sparse_categorical_accuracy: 0.0491 - val_loss: 6.1200 - val_sparse_categorical_accuracy: 0.0459
Epoch 5/35
425/425 - 210s - 494ms/step - loss: 4.0936 - sparse_categorical_accuracy: 0.0505 - val_loss: 6.3170 - val_sparse_categorical_accuracy: 0.0436
Epoch 6/35
425/425 - 210s - 495ms/step - loss: 3.7532 - sparse_categorical_accuracy: 0.0517 - val_loss: 6.5139 - val_sparse_categorical_accuracy: 0.0441
Epoch 7/35
425/425 - 210s - 494ms/step - loss: 3.4145 - sparse_categorical_accuracy: 0.0531 -

In [None]:
encoder_model_inf = Model(
    [context_inputs, question_inputs],
    [merged_h, merged_c]
)

decoder_input_inf = Input(shape=(1,))

decoder_state_input_h = Input(shape=(latent_dim * 4,)) 
decoder_state_input_c = Input(shape=(latent_dim * 4,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_emb_inf = embedding_layer(decoder_input_inf)

decoder_outputs_inf, state_h_inf, state_c_inf = decoder_lstm(
    decoder_emb_inf, initial_state=decoder_states_inputs
)
decoder_outputs_inf = decoder_dense(decoder_outputs_inf)

decoder_model_inf = Model(
    [decoder_input_inf] + decoder_states_inputs,
    [decoder_outputs_inf, state_h_inf, state_c_inf]
)

In [None]:
def generate_answers(context_text, question_text, max_len=50):
    context_seq = tokenizer.texts_to_sequences([context_text])
    question_seq = tokenizer.texts_to_sequences([question_text])
    
    context_seq = pad_sequences(context_seq, maxlen=max_context_len, padding='post')
    question_seq = pad_sequences(question_seq, maxlen=max_question_len, padding='post')

    states_value = encoder_model_inf.predict([context_seq, question_seq])

    target_seq = np.array([[tokenizer.word_index['<start>']]])
    
    stop_condition = False
    decoded_sentence = ""
    
    while not stop_condition:

        output_tokens, h, c = decoder_model_inf.predict([target_seq] + states_value)
        
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = tokenizer.index_word.get(sampled_token_index, "")
        
        if sampled_word == "<end>" or len(decoded_sentence.split()) > max_len:
            stop_condition = True
        else:
            decoded_sentence += " " + sampled_word
            
            target_seq = np.array([[sampled_token_index]])
            states_value = [h, c]
    
    return decoded_sentence.strip()

In [None]:
%pip install nltk python-Levenshtein

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import Levenshtein


def evaluate_model(data_contexts, data_questions, data_answers, dataset_name="Test"):
    total_bleu = 0.0
    total_levenshtein = 0.0
    num_samples = len(data_contexts)
    smooth = SmoothingFunction().method4 

    print(f"\nEvaluating on {dataset_name} Set...\n")
    
    for i in range(300):
        pred = generate_answers(data_contexts[i], data_questions[i])
        true = data_answers[i].replace('<start>', '').replace('<end>', '').strip()

        reference = [list(true)]
        candidate = list(pred)
        bleu = sentence_bleu(reference, candidate, smoothing_function=smooth)
        total_bleu += bleu

        lev_distance = Levenshtein.distance(pred, true)
        total_levenshtein += lev_distance

        if i < 5:
            print(f"Context    : {data_contexts[i]}")
            print(f"Question   : {data_questions[i]}")
            print(f"True Answer: {true}")
            print(f"Predicted  : {pred}")
            print(f"BLEU Score : {bleu:.4f}")
            print(f"Levenshtein Distance: {lev_distance}")
            print("-" * 60)

    avg_bleu = total_bleu / num_samples
    avg_lev = total_levenshtein / num_samples
    print(f"\n{dataset_name} Set Average BLEU Score       : {avg_bleu:.4f}")
    print(f"{dataset_name} Set Average Levenshtein Distance: {avg_lev:.2f}")

Collecting python-Levenshtein
  Downloading python_levenshtein-0.27.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.27.1 (from python-Levenshtein)
  Downloading levenshtein-0.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.27.1->python-Levenshtein)
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading python_levenshtein-0.27.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (161 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m161.7/161.7 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m34.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collecte

In [11]:
evaluate_model(contexts_test, questions_test, answers_test, "Test")
evaluate_model(contexts_train, questions_train, answers_train, "Train")


Evaluating on Test Set...

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 634ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 236ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
Context    : The Atlantic Ocean has less influence on the climate of the Piedmont region, which has hotter summers and colder winters than in the coast. Daytime highs in the Piedmont often reach over 90 °F (32 °C) in the summer. While it is not common for the temperature to reach over 100 °F (38 °C) in the state, such temperatures, when they occur, typically are found only in the lower-elevation areas of the Piedmont and far-inland areas of the coastal plain. The weaker influence of the Atlantic Ocean also means that temperatures in the Piedmont often fluctuate more widely than in the coast.
Question   : What region of North Carolina has hotter summers and colder winters th