In [None]:
import pandas as pd # Import the pandas library, commonly aliased as 'pd', for data manipulation and analysis.

df = pd.read_csv('../Data/100_Unique_QA_Dataset.csv') # Load the question/answer pairs; the path is relative to the notebook's working directory.

df.head() # Preview the first five rows to confirm the 'question' and 'answer' columns loaded as expected.

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [26]:
def tokenize(text):
  """Lowercase ``text``, delete every '?' and apostrophe, and split on whitespace.

  Returns a list of word tokens; an empty or all-whitespace string yields [].
  """
  # One translate() pass removes both punctuation characters.
  stripped = text.lower().translate(str.maketrans('', '', "?'"))
  return stripped.split()

In [27]:
tokenize('What is the capital of France?') # Quick check: expect lowercase words with the '?' removed.

['what', 'is', 'the', 'capital', 'of', 'france']

In [28]:
# Token -> integer id mapping; build_vocab adds entries to it.
vocab = {'<UNK>':0} # Id 0 is reserved for '<UNK>', the fallback text_to_indices uses for words that are not in the vocabulary.

In [29]:
def build_vocab(row):
  """Register every unseen token of one question/answer row in the global ``vocab``.

  ``row`` must support ``row['question']`` and ``row['answer']`` (both strings).
  Each new token is assigned the current size of ``vocab`` as its id, so ids
  are handed out in first-seen order. Returns None; the result is the side
  effect on ``vocab``.
  """
  for token in tokenize(row['question']) + tokenize(row['answer']):
    # setdefault keeps the id of a known token; len(vocab) is evaluated
    # before any insertion, so a new token gets the next free id.
    vocab.setdefault(token, len(vocab))

In [30]:
df.apply(build_vocab, axis=1) # Run build_vocab on every row (axis=1) for its side effect on vocab. build_vocab returns nothing, so the Series of empty values displayed below is expected.

Unnamed: 0,0
0,
1,
2,
3,
4,
...,...
85,
86,
87,
88,


In [31]:
len(vocab) # Vocabulary size, counting the '<UNK>' entry; this value is later passed to SimpleRNN as vocab_size.

324

In [32]:
def text_to_indices(text, vocab):
  """Encode ``text`` as a list of vocabulary ids, one per token.

  The text is split with ``tokenize``. A token present in ``vocab`` maps to
  its id; any other token maps to ``vocab['<UNK>']``.
  """
  return [
    # '<UNK>' is only looked up when it is needed for an unknown token.
    vocab[token] if token in vocab else vocab['<UNK>']
    for token in tokenize(text)
  ]

In [33]:
text_to_indices("What is campusx", vocab) # 'campusx' is not in the vocabulary, so it is encoded as 0, the '<UNK>' id.

[1, 2, 0]

In [34]:
import torch # Import the PyTorch library, a popular open-source machine learning framework.
from torch.utils.data import Dataset, DataLoader # Import 'Dataset' and 'DataLoader' classes from PyTorch's data utilities. 'Dataset' is an abstract class representing a dataset, and 'DataLoader' provides an iterable over a dataset.

In [35]:
class QADataset(Dataset):
  """Map-style dataset serving (question, answer) pairs as tensors of vocabulary ids.

  Items are encoded on demand: the 'question' and 'answer' strings of one
  DataFrame row are each passed through ``text_to_indices`` with the stored
  vocabulary.
  """

  def __init__(self, df, vocab):
    # Only references are kept; nothing is encoded up front.
    self.vocab = vocab
    self.df = df

  def __len__(self):
    """Number of samples, i.e. one per DataFrame row."""
    return len(self.df)

  def __getitem__(self, index):
    """Return ``(question_ids, answer_ids)`` for the row at position ``index``."""
    row = self.df.iloc[index] # positional lookup, independent of the index labels
    question_ids = text_to_indices(row['question'], self.vocab)
    answer_ids = text_to_indices(row['answer'], self.vocab)
    return tuple(torch.tensor(ids) for ids in (question_ids, answer_ids))

In [36]:
dataset = QADataset(df, vocab) # Wrap the DataFrame and vocabulary; each sample is encoded to tensors when it is indexed, not here.

In [37]:
dataloader = DataLoader(dataset, batch_size=1, shuffle=True) # One sample per batch, in shuffled order. NOTE(review): batch_size=1 presumably avoids padding, since the encoded questions printed below have different lengths.

In [38]:
for question, answer in dataloader: # Each iteration yields one (question, answer) batch holding a single sample.
  print(question, answer[0]) # question is shown with its leading batch axis; answer[0] strips that axis from the answer.

tensor([[ 10,  29, 130, 131]]) tensor([132])
tensor([[ 42,   2,   3, 210, 137, 168, 211, 169]]) tensor([113])
tensor([[1, 2, 3, 4, 5, 8]]) tensor([9])
tensor([[ 42, 255,   2, 256,  83, 257, 258]]) tensor([259])
tensor([[ 42,  18, 118,   3, 186, 187]]) tensor([188])
tensor([[  1,   2,   3,  37, 133,   5,  26]]) tensor([134])
tensor([[ 10,  75, 111]]) tensor([112])
tensor([[10, 96,  3, 97]]) tensor([98])
tensor([[  1,   2,   3, 146,  86,  19, 192, 193]]) tensor([194])
tensor([[  1,   2,   3,   4,   5, 206]]) tensor([207])
tensor([[  1,   2,   3,  37,  38,  39, 161]]) tensor([162])
tensor([[10, 29,  3, 30, 31]]) tensor([32])
tensor([[  1,   2,   3,   4,   5, 286]]) tensor([287])
tensor([[10, 75, 76]]) tensor([77])
tensor([[10,  2,  3, 66,  5, 67]]) tensor([68])
tensor([[ 1,  2,  3, 33, 34,  5, 35]]) tensor([36])
tensor([[ 42,  86,  87, 241, 242,  19,  39, 243]]) tensor([244])
tensor([[ 10, 140,   3, 141, 171,   5,   3,  70, 172]]) tensor([173])
tensor([[ 1,  2,  3, 17, 18, 19, 20, 21, 22]

In [39]:
import torch.nn as nn # Import the 'torch.nn' module, which contains classes for building neural networks in PyTorch (e.g., layers, activation functions).

In [40]:
class SimpleRNN(nn.Module):
  """Embedding -> single-layer RNN -> linear layer producing one logit per vocabulary entry.

  Args:
    vocab_size: number of rows in the embedding table and number of output logits.
    embedding_dim: size of each token embedding. Defaults to 50, the value
      that used to be hard-coded.
    hidden_size: size of the RNN hidden state. Defaults to 64, the value
      that used to be hard-coded.
  """

  def __init__(self, vocab_size, embedding_dim=50, hidden_size=64):
    super().__init__() # required so nn.Module can register the sub-layers below
    self.embedding = nn.Embedding(vocab_size, embedding_dim=embedding_dim)
    self.rnn = nn.RNN(embedding_dim, hidden_size, batch_first=True) # batch_first: inputs are (batch, seq_len, features)
    self.fc = nn.Linear(hidden_size, vocab_size)

  def forward(self, question):
    """Map a (batch, seq_len) tensor of token ids to (batch, vocab_size) logits."""
    embedded_question = self.embedding(question) # (batch, seq_len, embedding_dim)
    # nn.RNN returns (per-step outputs, final hidden state); only the final state is used.
    _, final = self.rnn(embedded_question)
    # final is (num_layers=1, batch, hidden_size): squeeze(0) removes the leading
    # layer axis, not the batch axis, leaving (batch, hidden_size).
    output = self.fc(final.squeeze(0))

    return output # raw logits; softmax is applied by the loss or by the caller

In [41]:
# Shape walkthrough: push one question through stand-alone copies of the three
# layers used by SimpleRNN and print the tensor shape after each step.
x = nn.Embedding(len(vocab), embedding_dim=50) # sized from the vocabulary instead of a hard-coded 324
y = nn.RNN(50, 64, batch_first=True)
z = nn.Linear(64, len(vocab))

a = dataset[0][0].unsqueeze(0) # add a batch axis -> (1, seq_len), whatever the length of the first question
print("shape of a:", a.shape)
b = x(a) # (batch, seq_len, embedding_dim)
print("shape of b:", b.shape)
c, d = y(b) # c: output at every time step; d: final hidden state
print("shape of c:", c.shape) # (batch, seq_len, hidden_size)
print("shape of d:", d.shape) # (num_layers, batch, hidden_size)

e = z(d.squeeze(0)) # squeeze(0) drops the leading num_layers axis so the linear layer sees (batch, hidden_size)

print("shape of e:", e.shape) # (batch, vocab_size) logits

shape of a: torch.Size([1, 6])
shape of b: torch.Size([1, 6, 50])
shape of c: torch.Size([1, 6, 64])
shape of d: torch.Size([1, 1, 64])
shape of e: torch.Size([1, 324])


In [42]:
learning_rate = 0.001 # Step size passed to the Adam optimizer.
epochs = 20 # Number of full passes over the dataset made by the training loop.

In [43]:
model = SimpleRNN(len(vocab)) # The vocabulary size sets both the number of embedding rows and the number of output logits.

In [44]:
criterion = nn.CrossEntropyLoss() # Takes raw logits and class-index targets; it combines LogSoftmax and NLLLoss, so the model must not apply softmax itself.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) # Adam over every parameter of the model, using the learning rate set above.

In [45]:
# Training loop: one optimizer step per question/answer pair, repeated for `epochs` passes over the data.

for epoch in range(epochs):

  total_loss = 0 # running SUM (not mean) of the per-sample losses in this epoch

  for question, answer in dataloader: # batch_size=1, so each batch holds exactly one sample

    optimizer.zero_grad() # gradients accumulate by default; reset them before each backward pass

    # forward pass
    output = model(question) # logits, shape (1, vocab_size)

    # loss -> output shape (1,324) - (1)
    # answer[0] strips the batch axis; the shapes only line up when the answer is a single token.
    loss = criterion(output, answer[0])

    # gradients
    loss.backward()

    # update
    optimizer.step()

    total_loss += loss.item() # .item() converts the 0-d loss tensor to a Python float

  # '.4f' = four decimal places (the previous '4f' was a minimum width, not a precision).
  print(f"Epoch: {epoch+1}, Loss: {total_loss:.4f}") # summed loss for the epoch

Epoch: 1, Loss: 526.866436
Epoch: 2, Loss: 455.015809
Epoch: 3, Loss: 376.643435
Epoch: 4, Loss: 317.801134
Epoch: 5, Loss: 268.262268
Epoch: 6, Loss: 221.662940
Epoch: 7, Loss: 179.073948
Epoch: 8, Loss: 141.565675
Epoch: 9, Loss: 109.871347
Epoch: 10, Loss: 84.316274
Epoch: 11, Loss: 65.380416
Epoch: 12, Loss: 51.709098
Epoch: 13, Loss: 40.875442
Epoch: 14, Loss: 32.925297
Epoch: 15, Loss: 27.025303
Epoch: 16, Loss: 22.561723
Epoch: 17, Loss: 18.951467
Epoch: 18, Loss: 16.034594
Epoch: 19, Loss: 13.633483
Epoch: 20, Loss: 11.900106


In [46]:
def predict(model, question, threshold=0.5):
  """Print the model's one-word answer to ``question``, or "I don't know".

  Args:
    model: callable mapping a (1, seq_len) tensor of token ids to (1, vocab_size) logits.
    question: raw question string; it is encoded with the global ``vocab``.
    threshold: minimum softmax probability the top prediction must reach
      for it to be printed.

  Prints exactly one line and returns None. "I don't know" is printed when the
  question contains no tokens or when the top probability is below ``threshold``.
  """

  # convert question to numbers
  numerical_question = text_to_indices(question, vocab)

  # An empty token list would become a float tensor and crash the embedding layer.
  if not numerical_question:
    print("I don't know")
    return

  # tensor
  question_tensor = torch.tensor(numerical_question).unsqueeze(0) # add the batch axis -> (1, seq_len)

  # send to model (inference only, so gradient tracking is switched off)
  with torch.no_grad():
    output = model(question_tensor)

  # convert logits to probs
  probs = torch.nn.functional.softmax(output, dim=1)

  # find index of max prob
  value, index = torch.max(probs, dim=1)

  if value < threshold:
    print("I don't know")
    return # stop here so the low-confidence guess is not printed as well

  # Ids were assigned in insertion order, so position in the key list equals the id.
  print(list(vocab.keys())[index.item()])

In [47]:
predict(model, "What is the largest planet in our solar system?") # Query the trained model with a question that appears in the training data (row 3 of the df.head() preview).

jupiter


In [48]:
list(vocab.keys())[7] # Reverse lookup (id -> token): dicts keep insertion order and build_vocab assigns ids as len(vocab), so position in the key list equals the id.

'paris'