In [41]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [42]:
device = 'cuda' if torch.cuda.is_available else 'cpu'
device

'cuda'

In [43]:
import pandas as pd
import numpy as np

df = pd.read_csv('/content/100_Unique_QA_Dataset.csv')
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [44]:
# tokenize

def tokenize(sent):
  sent = sent.lower()
  sent = sent.replace('?', '')
  sent = sent.replace("'", '')
  return sent.split()

In [45]:
# vocab
vocab = {'<UNK>': 0}

def build_vocab(row):
  ques = tokenize(row['question'])
  ans = tokenize(row['answer'])

  merged = ques + ans
  for token in merged:
    if token not in vocab:
      vocab[token] = len(vocab)


df.apply(build_vocab, axis=1)

Unnamed: 0,0
0,
1,
2,
3,
4,
...,...
85,
86,
87,
88,


In [46]:
# convert words to numerial indices
def text_to_indices(text, vocab):
  indexed_text = []
  for token in tokenize(text):
    if token in vocab:
      indexed_text.append(vocab[token])
    else:
      indexed_text.append(vocab['<UNK>'])
  return indexed_text

text_to_indices('What is france Harshit', vocab)

[1, 2, 6, 0]

In [56]:
class QADataset(Dataset):
  def __init__(self, df, vocab):
    self.df = df
    self.vocab = vocab

  def __len__(self):
    return self.df.shape[0]

  def __getitem__(self, index):
    ques = text_to_indices(self.df.iloc[index]['question'], self.vocab)
    ans = text_to_indices(self.df.iloc[index]['answer'], self.vocab)
    return torch.tensor(ques), torch.tensor(ans)


In [57]:
dataset = QADataset(df, vocab)

dataset[5]

(tensor([10, 29,  3, 30, 31]), tensor([32]))

In [58]:
dataloader = DataLoader(dataset, 1, shuffle=True)

In [63]:
%%capture

for q, a in dataloader:
  print(q, a)

In [127]:
class SimpleRNN(nn.Module):
  def __init__(self, vocab):
    super().__init__()
    self.embedding = nn.Embedding(len(vocab), 50)
    self.rnn = nn.RNN(50, 64, batch_first=True)
    self.fc = nn.Linear(64, len(vocab))

  def forward(self, x):
    embedded_question = self.embedding(x)
    hidden, final = self.rnn(embedded_question)
    out = self.fc(final.squeeze(0))

    return out


In [128]:
x = nn.Embedding(324, embedding_dim=50)
a = x(dataset[15][0])

y = nn.RNN(50, 64)
l = y(a)[0]
m = y(a)[1]

l.shape, m.shape

(torch.Size([8, 64]), torch.Size([1, 64]))

In [129]:
z = nn.Linear(64, 324)
z(m).shape

torch.Size([1, 324])

In [135]:
lr = 0.001
epochs = 200

model = SimpleRNN(vocab)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [None]:
for epoch in range(epochs):
  total_loss = 0
  for question, answer in dataloader:
    optimizer.zero_grad()
    output = model(question)
    loss = criterion(output, answer[0])
    loss.backward()
    optimizer.step()
    total_loss += loss.item()
  print(f'Epoch: {epoch + 1}, Loss: {total_loss}')

In [165]:
def predict(model, question, threshold=0.5):
  model.eval()
  with torch.no_grad():
    question = text_to_indices(question, vocab)
    question = torch.tensor(question).unsqueeze(0)
    output = model(question)
    probs = nn.functional.softmax(output, dim=1)
    top_prob, top_indices = torch.max(probs, dim=1)
    if top_prob < 0.5:
      print("I don't know")
    return top_prob, list(vocab.keys())[top_indices]

In [167]:
predict(model, "capital of india?")

(tensor([1.0000]), 'delhi')