In [17]:
# CS4990 Prompt Engineering
# Assignment 4 - A Simple Question Answer Language Model

# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import tensorflow as tf
Tokenizer = tf.keras.preprocessing.text.Tokenizer
# Correct import statements
# Access keras models and layers through TensorFlow
Dropout = tf.keras.layers.Dropout
BatchNormalization = tf.keras.layers.BatchNormalization
Sequential = tf.keras.models.Sequential
Dense = tf.keras.layers.Dense
Embedding = tf.keras.layers.Embedding
Bidirectional = tf.keras.layers.Bidirectional
LSTM = tf.keras.layers.LSTM
pad_sequences = tf.keras.preprocessing.sequence.pad_sequences
to_categorical = tf.keras.utils.to_categorical
Adam = tf.keras.optimizers.Adam
EarlyStopping = tf.keras.callbacks.EarlyStopping



In [18]:
# Open and read the content of the Python file

file_path = 'dataset.py'

# Initialize an empty dictionary to hold variables defined in the file
namespace = {}

# Execute the file's contents within the provided namespace
with open(file_path, 'r') as file:
    exec(file.read(), namespace)

# Access the list from the namespace where it was executed
scientific_facts_part1 = namespace.get('scientific_facts_part1', [])

# Check if the list was loaded correctly
print(f"Loaded {len(scientific_facts_part1)} sentences.")
print(scientific_facts_part1[:5])  # Display the first 5 sentences



Loaded 178 sentences.
['The Earth revolves around the Sun.', 'Water is made up of two hydrogen atoms and one oxygen atom.', 'Humans have 23 pairs of chromosomes.', 'The force of gravity keeps us on the ground.', 'Plants produce oxygen through a process called photosynthesis.']


In [29]:
# Initialize CountVectorizer to convert text into numerical format
vectorizer = CountVectorizer()

# Split each sentence into input (all words except the last) and output (the last word)
input_data = []
output_data = []

for sentence in scientific_facts_part1:
    words = sentence.split()
    if len(words) > 1:  # Ensure that there's at least one word to predict
        input_data.append(' '.join(words[:-1]))  # All words except the last
        output_data.append(words[-1])            # The last word

# Verify that input and output data are prepared correctly
print("Sample input data:", input_data[:3])  # Check a few input sentences
print("Sample output data:", output_data[:3])  # Check corresponding target words

# Convert input sentences into feature vectors
X = vectorizer.fit_transform(input_data).toarray()
# Transform output data into numerical format using a simple approach
# For SVM, we will convert the target words to integer indices
unique_words = list(set(output_data))
word_to_index = {word: idx for idx, word in enumerate(unique_words)}
y = np.array([word_to_index[word] for word in output_data])

# Convert output words into feature vectors
# y = vectorizer.transform(output_data).toarray()

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check vectorizer vocabulary
print("Vectorizer vocabulary size:", len(vectorizer.vocabulary_))
print("Sample words from vocabulary:", list(vectorizer.vocabulary_.keys())[:10])

# Ensure correct transformation of labels
sample_word = output_data[0]  # Take a sample word from output data
encoded_sample = vectorizer.transform([sample_word]).toarray()  # Encode it
decoded_sample = vectorizer.inverse_transform(encoded_sample)  # Decode it back
print(f"Encoded sample: {encoded_sample}, Decoded sample: {decoded_sample}")


Sample input data: ['The Earth revolves around the', 'Water is made up of two hydrogen atoms and one oxygen', 'Humans have 23 pairs of']
Sample output data: ['Sun.', 'atom.', 'chromosomes.']
Vectorizer vocabulary size: 597
Sample words from vocabulary: ['the', 'earth', 'revolves', 'around', 'water', 'is', 'made', 'up', 'of', 'two']
Encoded sample: [[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

In [30]:
# Initialize the SVM model with a linear kernel
svm_model = SVC(kernel='linear', probability=True)

# Train the SVM model on the training data
# svm_model.fit(X_train, y_train.argmax(axis=1))
svm_model.fit(X, y)

# Test the SVM model predictions on a sample input
test_sentence = "The process of breaking down food into energy in the human body is called "  # Example test sentence
test_vector = vectorizer.transform([test_sentence]).toarray()
# print("Test Vector:", test_vector)


# Test the SVM model's predictions on the test set
svm_pred = svm_model.predict(test_vector)

# Correctly interpret the predicted index
predicted_index = svm_pred[0]

# Map the predicted index back to the corresponding word using the index-to-word mapping
index_to_word = {idx: word for word, idx in word_to_index.items()}

# Check if the predicted index is valid
if predicted_index in index_to_word:
    predicted_word_svm = index_to_word[predicted_index]
    print("SVM Prediction:", predicted_word_svm)
else:
    print("Predicted index is out of range.")

# Predict using the SVM model
svm_pred = svm_model.predict(test_vector)
print("Predicted Index by SVM:", svm_pred)

# # Calculate and print the accuracy of the SVM model
# svm_accuracy = accuracy_score(y_test.argmax(axis=1), svm_predictions)
# print("SVM Accuracy:", svm_accuracy)

SVM Prediction: metabolism.
Predicted Index by SVM: [87]


In [46]:
#Neural network
import numpy as np
# Step 1: Prepare the tokenizer and encode the sentences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(scientific_facts_part1)
total_words = len(tokenizer.word_index) + 1

# Create sequences for training (predicting only the last word)
input_sequences = []
target_words = []

for line in scientific_facts_part1:
    token_list = tokenizer.texts_to_sequences([line])[0]
    if len(token_list) > 1:  # Ensure there is more than one word in the sentence
        input_sequences.append(token_list[:-1])  # All words except the last one
        target_words.append(token_list[-1])  # Only the last word

# Pad input sequences to the same length
max_sequence_len = max(len(x) for x in input_sequences)
X = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

# Convert the target words into categorical format
y = to_categorical(target_words, num_classes=total_words)

# Step 2: Define a robust neural network model
nn_model = Sequential([
    Embedding(total_words, 128),  # Embedding layer
    LSTM(128),  # Single LSTM layer
    Dense(total_words, activation='softmax')  # Output layer
])

# Compile the model
nn_model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.005), metrics=['accuracy'])

# Step 3: Add early stopping and train the model
# early_stopping = EarlyStopping(monitor='val_loss', patience=7, restore_best_weights=True)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
history = nn_model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_val, y_val))

# Step 4: Testing the model with a sample input
test_sentence = "The Earth revolves around the"
test_sequence = tokenizer.texts_to_sequences([test_sentence])[0]
test_padded = pad_sequences([test_sequence], maxlen=max_sequence_len, padding='pre')
prediction = np.argmax(nn_model.predict(test_padded), axis=-1)
predicted_word = tokenizer.index_word[prediction[0]]

print("Neural Network Prediction:", predicted_word)

Epoch 1/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 220ms/step - accuracy: 0.0000e+00 - loss: 6.5312 - val_accuracy: 0.0278 - val_loss: 6.3918
Epoch 2/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 101ms/step - accuracy: 0.0262 - loss: 5.7161 - val_accuracy: 0.0278 - val_loss: 7.0815
Epoch 3/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 99ms/step - accuracy: 0.0213 - loss: 4.8727 - val_accuracy: 0.0278 - val_loss: 8.1111
Epoch 4/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 91ms/step - accuracy: 0.0327 - loss: 4.6808 - val_accuracy: 0.0278 - val_loss: 8.7362
Epoch 5/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - accuracy: 0.0306 - loss: 4.6994 - val_accuracy: 0.0278 - val_loss: 9.0693
Epoch 6/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - accuracy: 0.0362 - loss: 4.6456 - val_accuracy: 0.0278 - val_loss: 9.1477
Epoch 7/20
[1m5/5[0m [32m━━━━━━━━━━━━

In [47]:
# # Make predictions with the neural network
# nn_pred = nn_model.predict(test_vector_padded)
# predicted_index = np.argmax(nn_pred, axis=1)[0]

# # Map the predicted index to the corresponding word in the filtered output vocabulary
# predicted_word_nn = index_to_output_word.get(predicted_index, "Unknown")
# print("Neural Network Prediction:", predicted_word_nn)
def predict_last_word(sentence):
    # Tokenize and pad the sentence
    token_list = tokenizer.texts_to_sequences([sentence])[0]
    token_list_padded = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')

    # Make prediction for the sentence
    predicted = nn_model.predict(token_list_padded)
    predicted_index = np.argmax(predicted, axis=-1)[0]

    # Map the predicted index to the corresponding word
    predicted_word = tokenizer.index_word.get(predicted_index, "Unknown")

    return predicted_word

# Example sentences to test
test_sentences = [
    "The Earth revolves around the",
    "Water is made up of two hydrogen",
    "Humans have 23 pairs of",
    "The chemical formula for table salt is",
    "DNA stands for deoxyribonucleic",
    "The largest planet in our solar system is",
    "The process of converting light energy into chemical energy is",
    "The greenhouse effect traps heat in the Earth's",
    "A herbivore is an animal that feeds on",
    "A parallelogram is a four-sided shape with opposite sides that are",
    "The human respiratory system is responsible for breathing and gas"
]

# Predict the last word for each sentence
for sentence in test_sentences:
    predicted_word = predict_last_word(sentence)
    print(f"Sentence: '{sentence}' -> Predicted Last Word: '{predicted_word}'")




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 178ms/step
Sentence: 'The Earth revolves around the' -> Predicted Last Word: 'sun'
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
Sentence: 'Water is made up of two hydrogen' -> Predicted Last Word: 'atom'
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
Sentence: 'Humans have 23 pairs of' -> Predicted Last Word: 'chromosomes'
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
Sentence: 'The chemical formula for table salt is' -> Predicted Last Word: 'nitrogen'
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
Sentence: 'DNA stands for deoxyribonucleic' -> Predicted Last Word: 'acid'
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
Sentence: 'The largest planet in our solar system is' -> Predicted Last Word: 'oxygen'
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
Sentence: 'The process of

In [48]:
# Print a sample of the target words to verify correctness
print("Sample output words:", output_data)

# Ensure the set of unique output words matches your expectations
unique_output_words = set(output_data)
print("Unique output words:", unique_output_words)

Sample output words: ['Sun.', 'atom.', 'chromosomes.', 'ground.', 'photosynthesis.', 'second.', 'water.', 'oxygen.', 'cell.', 'NaCl.', 'gas.', 'System.', 'level.', 'acid.', 'Jupiter.', 'oxygen.', 'days.', 'bones.', 'sound.', 'cell.', 'particles.', 'old.', 'tissues.', 'system.', 'hydrogen.', 'nickel.', 'Sun.', 'chlorophyll.', 'oxygen.', 'year.', 'neurons.', 'gravitation.', 'blue.', 'temperature.', 'is.', 'matter.', 'element.', 'Planet.', 'system.', 'atoms.', 'charge.', 'Einstein.', "'Au'.", 'light.', 'photosynthesis.', 'phenomenon.', 'Sun.', 'microorganisms.', 'angles.', 'Earth.', 'obscured.', 'number.', 'fur.', 'solute.', 'cells.', 'colors.', 'mitosis.', '2006.', 'compound.', 'environment.', 'phenomena.', 'earthquakes.', 'motion.', 'reaction.', 'atmosphere.', 'ammonia.', 'insulators.', 'point.', 'world.', 'plants.', 'minerals.', 'atmosphere.', 'equation.', 'carbon.', 'years.', 'orbit.', 'chambers.', 'mass.', 'transpiration.', 'Way.', 'nucleus.', 'gases.', 'animals.', 'space.', 'power.'