In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [2]:
# Load the question/answer pairs. The path is relative, so it is resolved
# against the current working directory of the kernel.
# NOTE(review): the displayed frame has an "Unnamed: 0" column, which looks like
# an index that was saved into the CSV — confirm whether it should be dropped.
df = pd.read_csv("../Datasets/100_Unique_QA_Dataset.csv")
df

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100
...,...,...
85,Who directed the movie 'Titanic'?,JamesCameron
86,Which superhero is also known as the Dark Knight?,Batman
87,What is the capital of Brazil?,Brasilia
88,Which fruit is known as the king of fruits?,Mango


In [3]:
def tokenize(text):
    """Turn a sentence into a list of lower-case, whitespace-separated tokens.

    The text is lower-cased, every ``?`` and ``'`` character is removed, and
    the remainder is split on runs of whitespace. Other punctuation (for
    example ``-``) is left inside the tokens.

    Args:
        text: The string to tokenize.

    Returns:
        A list of token strings; empty when ``text`` has no non-whitespace
        characters left after cleaning.
    """
    cleaned = text.lower()
    for unwanted in ("?", "'"):
        cleaned = cleaned.replace(unwanted, "")
    return cleaned.split()

In [4]:
# Quick sanity check of the tokenizer on a sample sentence.
print(tokenize("My Name is Iztihad"))

['my', 'name', 'is', 'iztihad']


In [5]:
# Token -> integer id mapping. Id 0 is taken by "<UNK>", which text_to_indices
# uses for any token that is not in the mapping.
vocab = {"<UNK>": 0}

In [6]:
def build_vocab(row):
    """Register every token of one question/answer row in the global ``vocab``.

    Tokens are taken from ``row["question"]`` followed by ``row["answer"]``.
    Each token not yet present is assigned the next free id, i.e. the current
    size of ``vocab``. Tokens already present are left untouched, so calling
    this again on the same row changes nothing.

    Args:
        row: A mapping-like row with ``"question"`` and ``"answer"`` strings.

    Returns:
        None. The module-level ``vocab`` dict is mutated in place.
    """
    for tok in tokenize(row["question"]) + tokenize(row["answer"]):
        if tok in vocab:
            continue
        vocab[tok] = len(vocab)
    

In [7]:
# Run build_vocab on every row (axis = 1) purely for its side effect of filling
# `vocab`; build_vocab returns nothing, hence the Series of None shown as output.
df.apply(build_vocab, axis = 1)

0     None
1     None
2     None
3     None
4     None
      ... 
85    None
86    None
87    None
88    None
89    None
Length: 90, dtype: object

In [8]:
# Number of entries in the vocabulary, including the "<UNK>" entry.
print(len(vocab))

324


In [37]:


def text_to_indices(text, vocab):
    """Encode ``text`` as a list of vocabulary ids.

    The text is split with ``tokenize``; each token is replaced by its id in
    ``vocab``, and tokens missing from ``vocab`` are replaced by the id stored
    under ``"<UNK>"``.

    Args:
        text: The string to encode.
        vocab: Mapping from token to integer id; must contain ``"<UNK>"`` if
            any token of ``text`` is unknown.

    Returns:
        A list of integer ids, one per token, in token order.
    """
    return [
        vocab[tok] if tok in vocab else vocab["<UNK>"]
        for tok in tokenize(text)
    ]

In [38]:
# Sanity check: encode a sample sentence with the vocabulary built so far.
print(text_to_indices("The capital of Italy is Rome", vocab))

[3, 4, 5, 135, 2, 136]


In [39]:
class QADataset(Dataset):
    """Dataset of (question, answer) pairs encoded as vocabulary-id tensors.

    Args:
        df: DataFrame with ``"question"`` and ``"answer"`` string columns.
        vocab: Mapping from token to integer id, passed to ``text_to_indices``.
    """

    def __init__(self, df, vocab):
        self.df = df
        self.vocab = vocab

    def __len__(self):
        """Return the number of rows (QA pairs) in the DataFrame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return the ``index``-th pair as ``(question_ids, answer_ids)``.

        Both elements are 1-D ``torch.long`` tensors whose lengths equal the
        number of tokens in the question and the answer respectively.
        """
        # Fetch the row once instead of once per column.
        row = self.df.iloc[index]
        numerical_ques = text_to_indices(row["question"], self.vocab)
        numerical_ans = text_to_indices(row["answer"], self.vocab)

        # Pin the dtype: torch.tensor([]) would otherwise be inferred as
        # float32, which cannot be used as embedding indices or class targets.
        return (
            torch.tensor(numerical_ques, dtype=torch.long),
            torch.tensor(numerical_ans, dtype=torch.long),
        )
    

In [40]:
# Wrap the DataFrame and the vocabulary in the QADataset defined earlier.
dataset = QADataset(df, vocab)

In [41]:
# One QA pair per batch, visited in a shuffled order.
# NOTE(review): batch_size=1 presumably avoids having to pad questions of
# different lengths — confirm before increasing it.
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

In [42]:
# Inspect the batches: `question` keeps the leading batch dimension added by
# the DataLoader, while answer[0] selects the single sample's answer ids.
for question, answer in dataloader:
    print(question, answer[0])

tensor([[ 10,   2,  62,  63,   3, 283,   5, 284]]) tensor([285])
tensor([[  1,   2,   3,  69,   5, 155]]) tensor([156])
tensor([[ 10,  75, 208]]) tensor([209])
tensor([[ 10,  75, 111]]) tensor([112])
tensor([[ 42, 101,   2,   3,  17]]) tensor([102])
tensor([[ 42, 318,   2,  62,  63,   3, 319,   5, 320]]) tensor([321])
tensor([[ 42, 137,   2, 138,  39, 175, 269]]) tensor([99])
tensor([[  1,   2,   3, 180, 181, 182, 183]]) tensor([184])
tensor([[ 42, 137,   2,  62,  39,   3, 322, 323]]) tensor([6])
tensor([[ 42, 167,   2,   3,  17, 168, 169]]) tensor([170])
tensor([[ 42, 263, 264,  14, 265, 266, 158, 267]]) tensor([268])
tensor([[  1,   2,   3,   4,   5, 113]]) tensor([114])
tensor([[ 78,  79, 261, 151,  14, 262, 153]]) tensor([36])
tensor([[ 42, 312,   2, 313,  62,  63,   3, 314, 315]]) tensor([316])
tensor([[42, 43, 44, 45, 46, 47, 48]]) tensor([49])
tensor([[ 78,  79, 150, 151,  14, 152, 153]]) tensor([154])
tensor([[ 10, 140,   3, 141, 270,  93, 271,   5,   3, 272]]) tensor([273])
te

In [43]:
class SimpleRNN(nn.Module):
    """Single-layer RNN that maps a token-id sequence to vocabulary logits.

    Pipeline: embedding -> ``nn.RNN`` (``batch_first=True``) -> linear layer
    applied to the final hidden state.

    Args:
        vocab_size: Number of rows in the embedding table and number of
            output logits.
        embedding_dim: Size of each token embedding (default 50).
        hidden_size: Size of the RNN hidden state (default 64).
    """

    def __init__(self, vocab_size, embedding_dim=50, hidden_size=64):

        super().__init__()

        # Attribute names are kept as-is so existing state_dict keys still load.
        self.embedded = nn.Embedding(vocab_size, embedding_dim=embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_size, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)

    def forward(self, question):
        """Return logits over the vocabulary for each sequence in ``question``.

        Args:
            question: Integer tensor of token ids; with a batch dimension it
                is ``(batch, seq_len)`` because the RNN uses ``batch_first``.

        Returns:
            Tensor of logits with ``vocab_size`` entries per sequence.
        """
        embedded_ques = self.embedded(question)
        # nn.RNN returns (per-step outputs, final hidden state); only the
        # final hidden state is used for classification.
        _, final_hidden = self.rnn(embedded_ques)
        # final_hidden has a leading layer dimension; take the last layer.
        output = self.linear(final_hidden[-1])

        return output


In [44]:
# Training hyperparameters: the optimizer's learning rate and the number of
# full passes over the dataloader.
learning_rate = 0.001
epochs = 20

In [45]:
# The vocabulary size determines both the embedding table and the output layer.
model = SimpleRNN(len(vocab))


In [46]:
# Adam over all model parameters; cross-entropy treats the model output as
# logits over the vocabulary and the answer id as the target class.
optimizer = optim.Adam(model.parameters(), lr = learning_rate)
criterion = nn.CrossEntropyLoss()

In [48]:
# Train for `epochs` passes over the dataloader, printing the summed loss of
# each pass.
for epoch in range(epochs):

    # Sum (not mean) of the per-batch losses seen during this epoch.
    total_loss = 0

    for question, answer in dataloader:

        optimizer.zero_grad()

        # Call the module itself rather than .forward(): __call__ is the
        # supported entry point and also runs any registered hooks.
        output = model(question)

        # answer[0] selects the single sample's answer ids from the batch.
        loss = criterion(output, answer[0])

        loss.backward()

        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch: {epoch + 1}, Loss: {total_loss}")



Epoch: 1, Loss: 520.1810736656189
Epoch: 2, Loss: 454.8496232032776
Epoch: 3, Loss: 377.600772857666
Epoch: 4, Loss: 313.1642470359802
Epoch: 5, Loss: 260.8204460144043
Epoch: 6, Loss: 211.26364409923553
Epoch: 7, Loss: 167.89968848228455
Epoch: 8, Loss: 130.50466805696487
Epoch: 9, Loss: 100.42884701490402
Epoch: 10, Loss: 76.91246449947357
Epoch: 11, Loss: 59.77621926367283
Epoch: 12, Loss: 47.07108788192272
Epoch: 13, Loss: 37.720104210078716
Epoch: 14, Loss: 30.54422239214182
Epoch: 15, Loss: 25.097416043281555
Epoch: 16, Loss: 20.73392879962921
Epoch: 17, Loss: 17.56129292398691
Epoch: 18, Loss: 14.94973099604249
Epoch: 19, Loss: 12.799005810171366
Epoch: 20, Loss: 11.045778054744005


In [50]:
def predict(model, question, threshold = 0.5):
    """Print the model's answer to ``question``, or "I don't know".

    The question is encoded with the global ``vocab``, run through ``model``,
    and the most probable vocabulary entry is printed. If the question has no
    tokens, or the top softmax probability is below ``threshold``,
    "I don't know" is printed instead.

    Args:
        model: Module mapping a ``(1, seq_len)`` id tensor to ``(1, vocab)`` logits.
        question: The question text.
        threshold: Minimum top probability required to print an answer.

    Returns:
        None. The result is printed.
    """
    numerical_ques = text_to_indices(question, vocab)

    # With no tokens there is nothing to feed the model (an empty list would
    # also become a float tensor, which cannot be used as embedding indices).
    if not numerical_ques:
        print("I don't know")
        return

    question_tensor = torch.tensor(numerical_ques, dtype=torch.long).unsqueeze(0)

    # Inference only: skip autograd bookkeeping, and call the module itself
    # (not .forward) so registered hooks run.
    with torch.no_grad():
        output = model(question_tensor)
        probability = nn.functional.softmax(output, dim=1)
        value, index = torch.max(probability, dim=1)

    if value.item() < threshold:
        print("I don't know")

    else:
        # Ids were assigned in insertion order, so position in the key list
        # equals the id.
        print(list(vocab.keys())[index.item()])

In [54]:
# A question that appears in the dataset preview.
predict(model, "What is the capital of Germany?")

berlin


In [53]:
# A question that is not in the dataset preview; the recorded output shows the
# top probability fell below the threshold.
# NOTE(review): presumably "bangladesh" is out of vocabulary — confirm.
predict(model, "What is the capital of Bangladesh?")

I don't know
