<a href="https://colab.research.google.com/github/jonash-chataut/PyTorch-learnings-/blob/main/qa_system_RNN_based_pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
# Load the question/answer pairs; the path points at the Colab /content directory.
df=pd.read_csv('/content/100_Unique_QA_Dataset.csv')
# Preview the first rows to confirm the `question` and `answer` columns are present.
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [6]:
# tokenize
def tokenize(text):
  """Lower-case `text`, drop apostrophes, commas and question marks, then split on whitespace.

  Returns the list of resulting tokens (an empty list when nothing remains).
  """
  # One translation table deletes all three punctuation characters in a single pass.
  strip_table=str.maketrans('','',"',?")
  return text.lower().translate(strip_table).split()


In [7]:
# Quick check: the text is lower-cased, the '?' is stripped, and words are split on whitespace.
tokenize('what is capital of france?')

['what', 'is', 'capital', 'of', 'france']

In [8]:
# Vocabulary mapping token -> integer id; id 0 is reserved for unknown words ('<UNK>').
vocab={'<UNK>':0}

In [16]:
def build_vocab(row):
  """Register every token of one dataset row in the module-level `vocab`.

  `row` must support `row['question']` and `row['answer']` (e.g. a DataFrame row).
  Unseen tokens receive the next free integer id; known tokens keep their id.
  Mutates `vocab` in place and returns None.
  """
  # Tokenize both fields up front, question tokens first, then the answer tokens.
  row_tokens=[*tokenize(row['question']),*tokenize(row['answer'])]

  for word in row_tokens:
    # setdefault inserts only unseen words, so every word gets exactly one id
    vocab.setdefault(word,len(vocab))


In [17]:
# Run build_vocab on every row purely for its side effect of filling `vocab`;
# the displayed Series just holds build_vocab's return value (None) for each row.
df.apply(build_vocab,axis=1)

Unnamed: 0,0
0,
1,
2,
3,
4,
...,...
85,
86,
87,
88,


In [19]:
# Vocabulary size, counting the '<UNK>' entry.
len(vocab)

324

In [20]:
# convert words to numerical indices
def text_to_indices(text,vocab):
  """Encode `text` as a list of vocabulary ids, one per token produced by `tokenize`.

  Tokens that are not keys of `vocab` are replaced by the id stored under '<UNK>'.
  """
  return [
    vocab[word] if word in vocab else vocab['<UNK>']
    for word in tokenize(text)
  ]

In [22]:
# Sanity check: 'laptop' is not in the vocabulary, so it falls back to the '<UNK>' id (0).
text_to_indices("What is the laptop",vocab)

[1, 2, 3, 0]

In [29]:
import torch
from torch.utils.data import Dataset, DataLoader

In [71]:
class QADataset(Dataset):
  """Dataset yielding one (question, answer) pair of token-id tensors per row of `df`."""

  def __init__(self,df,vocab):
    # df: table with 'question' and 'answer' columns; vocab: token -> id mapping
    self.df=df
    self.vocab=vocab

  def __len__(self):
    """Number of question/answer rows in the underlying frame."""
    return len(self.df)

  def __getitem__(self,index):
    """Return the encoded question and answer at positional `index` as 1-D tensors."""
    row=self.df.iloc[index]
    question_ids=text_to_indices(row['question'],self.vocab)
    answer_ids=text_to_indices(row['answer'],self.vocab)
    return torch.tensor(question_ids),torch.tensor(answer_ids)

In [72]:
# Wrap the DataFrame and vocabulary so samples can be fetched as encoded tensors.
dataset=QADataset(df,vocab)

In [73]:
dataloader=DataLoader(dataset,batch_size=1,shuffle=True) # no padding required because batch_size is 1 (one sample per batch)

In [74]:
# Inspect the batches: each printed tensor carries a leading batch dimension of size 1.
for question,answer in dataloader:
  print(question,answer)

tensor([[  1,   2,   3,   4,   5, 286]]) tensor([[287]])
tensor([[10, 11, 12, 13, 14, 15]]) tensor([[16]])
tensor([[ 42, 174,   2,  62,  39, 175, 176,  12, 177, 178]]) tensor([[179]])
tensor([[1, 2, 3, 4, 5, 6]]) tensor([[7]])
tensor([[ 1,  2,  3, 17, 18, 19, 20, 21, 22]]) tensor([[23]])
tensor([[ 42, 137,   2, 226,  12,   3, 227, 228]]) tensor([[155]])
tensor([[  1,  87, 229, 230, 231, 232]]) tensor([[233]])
tensor([[  1,   2,   3, 141, 117,  83,   3, 277, 278]]) tensor([[121]])
tensor([[ 42,  18,   2,   3, 281,  12,   3, 282]]) tensor([[205]])
tensor([[  1,   2,   3, 122, 123,  19,   3,  45]]) tensor([[124]])
tensor([[ 10,  11, 157, 158, 159]]) tensor([[160]])
tensor([[ 10, 308,   3, 309, 310]]) tensor([[311]])
tensor([[ 1,  2,  3, 92, 93, 94]]) tensor([[95]])
tensor([[42, 18,  2, 62, 63,  3, 64, 18]]) tensor([[65]])
tensor([[10, 75, 76]]) tensor([[77]])
tensor([[ 42,  18, 118,   3, 186, 187]]) tensor([[188]])
tensor([[ 10,  29, 130, 131]]) tensor([[132]])
tensor([[ 42, 299, 300, 118

In [75]:
import torch.nn as nn

In [76]:
class SimpleRNN(nn.Module):
  """Single-layer RNN that scores every vocabulary entry as the answer to a question.

  Args:
    vocab_size: number of entries in the vocabulary; sizes both the embedding
      table and the output layer.
    embedding_dim: size of each token embedding (default 50).
    hidden_dim: size of the RNN hidden state (default 64).
  """

  def __init__(self, vocab_size, embedding_dim=50, hidden_dim=64):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim=embedding_dim)
    self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
    self.fc = nn.Linear(hidden_dim, vocab_size)

  def forward(self, question):
    """Return unnormalised scores (logits) over the vocabulary.

    Args:
      question: integer tensor of token ids with shape (batch, seq_len).

    Returns:
      Tensor of shape (batch, vocab_size).
    """
    embedded_question = self.embedding(question)  # (batch, seq_len, embedding_dim)
    # nn.RNN returns (per-step outputs, final hidden state); only the final state is used.
    _, final = self.rnn(embedded_question)        # (1, batch, hidden_dim)
    # Drop the leading num_layers axis (size 1) before the output projection.
    output = self.fc(final.squeeze(0))            # (batch, vocab_size)

    return output

In [77]:
# Training hyperparameters
learning_rate=0.001  # step size handed to the Adam optimizer
epochs=20            # number of full passes over the dataloader

In [78]:
# Seed PyTorch's global RNG so the weight initialisation (and the DataLoader's
# shuffle order, which draws from the same generator) is reproducible on a fresh run.
torch.manual_seed(42)
model=SimpleRNN(len(vocab))

In [79]:
# Cross-entropy over the vocabulary: the model emits raw logits and the target is a token id.
criterion=nn.CrossEntropyLoss()
# Adam updates every model parameter using the learning rate configured above.
optimizer=torch.optim.Adam(model.parameters(),lr=learning_rate)

In [80]:
# training loop

for epoch in range(epochs):
  train_loss=0 # running sum of the per-sample losses for this epoch
  for question,answer in dataloader:
    # clear gradients left over from the previous step
    optimizer.zero_grad()

    # forward pass -> logits of shape (1, vocab_size)
    output=model(question)

    # answer arrives as (1, 1); drop the token axis so the target has shape (1,)
    target=answer.squeeze(1)
    loss=criterion(output,target)

    # backpropagate, then apply the parameter update
    loss.backward()
    optimizer.step()

    train_loss+=loss.item()

  # Note: the reported value is the epoch total, not the mean per sample.
  print(f'Epoch {epoch+1}/{epochs}, Loss: {train_loss:.4f}')

Epoch 1/20, Loss: 528.0455
Epoch 2/20, Loss: 454.8772
Epoch 3/20, Loss: 375.3415
Epoch 4/20, Loss: 316.7942
Epoch 5/20, Loss: 265.1108
Epoch 6/20, Loss: 216.5082
Epoch 7/20, Loss: 172.6286
Epoch 8/20, Loss: 134.1099
Epoch 9/20, Loss: 103.4653
Epoch 10/20, Loss: 78.8007
Epoch 11/20, Loss: 61.1160
Epoch 12/20, Loss: 47.5570
Epoch 13/20, Loss: 38.0829
Epoch 14/20, Loss: 30.8669
Epoch 15/20, Loss: 25.5217
Epoch 16/20, Loss: 21.7555
Epoch 17/20, Loss: 18.1802
Epoch 18/20, Loss: 16.0260
Epoch 19/20, Loss: 14.1487
Epoch 20/20, Loss: 12.0503


In [87]:
def predict(model,question,threshold=0.5):
  """Print the model's single-token answer to `question`.

  Prints "I am not sure" instead of an answer when the question contains no
  tokens or when the highest softmax probability is below `threshold`.

  Args:
    model: trained network mapping a (1, seq_len) tensor of token ids to
      (1, vocab_size) logits.
    question: raw question text; it is encoded with the module-level `vocab`.
    threshold: minimum probability required to print an answer.

  Returns:
    None; the result is printed.
  """
  numerical_question=text_to_indices(question,vocab)

  # An empty sequence would become a float tensor and cannot be embedded.
  if not numerical_question:
    print("I am not sure")
    return

  # add the batch dimension: (seq_len,) -> (1, seq_len)
  question_tensor=torch.tensor(numerical_question,dtype=torch.long).unsqueeze(0)

  # inference only, so skip building the autograd graph
  with torch.no_grad():
    output=model(question_tensor)

  # turn logits into probabilities and pick the most likely token
  probabilities=nn.functional.softmax(output,dim=1)
  value,index=torch.max(probabilities,dim=1)

  if value.item()<threshold:
    print("I am not sure")
    return  # do not also print the low-confidence guess

  # build_vocab assigns ids in insertion order, so position in the key list == token id
  print(list(vocab.keys())[index.item()])


In [91]:
# Try the trained model on a question phrased differently from the dataset's wording.
predict(model,"Capital of germany")

berlin
