In [70]:
import pandas as pd 

df = pd.read_csv('100_Unique_QA_Dataset.csv')
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [71]:
df.sample(5)

Unnamed: 0,question,answer
21,Who invented the telephone?,Alexander-Graham-Bell
47,What is the longest-running animated TV show?,Simpsons
14,Who is the author of '1984'?,George-Orwell
36,Which country is famous for sushi?,Japan
16,What is the capital of India?,Delhi


In [73]:
df.shape

(90, 2)

In [74]:
df.shape

(90, 2)

In [75]:
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [76]:
## tokenize 
def tokenize(text):
    """Lowercase ``text``, remove every '?' and '"', and split on whitespace.

    Returns the list of remaining word tokens (an empty list for blank input).
    """
    cleaned = text.lower()
    for unwanted in ('?', '"'):
        cleaned = cleaned.replace(unwanted, '')
    return cleaned.split()

In [77]:
## vocabulary : maps each unique token to an integer id; id 0 is reserved for the '<UNK>' (unknown token) entry
vocabs = {'<UNK>' : 0}

In [78]:
def build_vocab(row):
    """Register every unseen token of one dataset row in the global ``vocabs``.

    ``row`` must provide 'question' and 'answer' text. Each new token is given
    the next free integer id (the current size of ``vocabs``). Nothing is
    returned; the function is used for its side effect on ``vocabs``.
    """
    row_tokens = [*tokenize(row['question']), *tokenize(row['answer'])]
    for word in row_tokens:
        # setdefault keeps the id of an already-known token untouched.
        vocabs.setdefault(word, len(vocabs))

In [79]:
df.apply(build_vocab,axis=1)

0     None
1     None
2     None
3     None
4     None
      ... 
85    None
86    None
87    None
88    None
89    None
Length: 90, dtype: object

In [80]:
len(vocabs)

326

In [81]:
# convert words to indices 

def text_to_indices(text,vocab):
    """Translate ``text`` into the list of vocabulary ids of its tokens.

    The text is split with ``tokenize``; any token that is not a key of
    ``vocab`` is replaced by the id stored under '<UNK>'.
    """
    return [
        vocab[word] if word in vocab else vocab['<UNK>']
        for word in tokenize(text)
    ]

In [82]:
text_to_indices('what is movie susan',vocabs)

[1, 2, 311, 0]

In [83]:
import torch
from torch.utils.data import Dataset,DataLoader

In [84]:
class CustomDataset(Dataset):
    """Map-style dataset yielding (question ids, answer ids) tensor pairs.

    Wraps a DataFrame with 'question' and 'answer' columns together with the
    vocabulary used to turn their text into integer ids.
    """

    def __init__(self,df,vocabs):
        self.vocabs = vocabs
        self.df = df

    def __len__(self):
        """Number of rows (question/answer pairs) in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return the row at position ``index`` as two ``torch.long`` tensors."""
        row = self.df.iloc[index]
        question_ids = text_to_indices(row['question'], self.vocabs)
        answer_ids = text_to_indices(row['answer'], self.vocabs)
        return (
            torch.tensor(question_ids, dtype=torch.long),
            torch.tensor(answer_ids, dtype=torch.long),
        )

In [85]:
dataset = CustomDataset(df,vocabs)

In [86]:
dataset[0]

(tensor([1, 2, 3, 4, 5, 6]), tensor([7]))

In [87]:
# Iterate over `dataset` in shuffled order.
# NOTE(review): batch_size is not passed, so the DataLoader default applies;
# the questions have different lengths, so larger batches would presumably
# need padding or a custom collate function — confirm before changing it.
dataloader = DataLoader(
    dataset,
    shuffle=True,
    pin_memory=True  # NOTE(review): no host-to-GPU copy is visible in this notebook — confirm pinning is needed
)

In [88]:
for question,answer in dataloader:
    print(question,answer)

tensor([[ 10,   2,  62,  63,   3, 285,   5, 286]]) tensor([[287]])
tensor([[ 42, 217, 118, 218, 219,  19,  14, 220,  43]]) tensor([[221]])
tensor([[ 10,  75, 111]]) tensor([[112]])
tensor([[ 10, 140,   3, 141, 172,   5,   3,  70, 173]]) tensor([[174]])
tensor([[ 78,  79, 263, 152,  14, 264, 154]]) tensor([[36]])
tensor([[  1,   2,   3,  17, 115,  83,  84]]) tensor([[116]])
tensor([[ 42, 137,   2, 227, 143,   3, 228, 229]]) tensor([[156]])
tensor([[ 42, 137,   2, 138,  39, 139]]) tensor([[53]])
tensor([[ 1,  2,  3,  4,  5, 53]]) tensor([[54]])
tensor([[ 42, 137,   2,  62,  39,   3, 324, 325]]) tensor([[6]])
tensor([[10, 96,  3, 97]]) tensor([[98]])
tensor([[ 10,  29, 130, 131]]) tensor([[132]])
tensor([[78, 79, 80, 81, 82, 83, 84]]) tensor([[85]])
tensor([[  1,   2,   3,  33,  34,   5, 247]]) tensor([[248]])
tensor([[ 1,  2,  3, 69,  5, 53]]) tensor([[262]])
tensor([[ 1,  2,  3, 33, 34,  5, 35]]) tensor([[36]])
tensor([[ 1,  2,  3, 59, 25,  5, 26, 19, 60]]) tensor([[61]])
tensor([[ 42, 

In [89]:
import torch.nn as nn

In [90]:
class SimpleRNN(nn.Module):
    """Single-layer RNN that maps a sequence of token ids to vocabulary logits.

    Args:
        vocab_size: Number of entries in the vocabulary; sizes both the
            embedding table and the output layer.
        embedding_dim: Width of each token embedding. Defaults to 50.
        hidden_size: Width of the RNN hidden state. Defaults to 128.
    """

    def __init__(self,vocab_size,embedding_dim=50,hidden_size=128):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size,embedding_dim=embedding_dim)
        self.rnn = nn.RNN(embedding_dim,hidden_size,batch_first=True)
        self.out = nn.Linear(hidden_size,vocab_size)

    def forward(self,question):
        """Return logits over the vocabulary for each sequence in ``question``.

        Args:
            question: Integer tensor of token ids shaped (batch, seq_len).

        Returns:
            Tensor of shape (batch, vocab_size).
        """
        embedded_question = self.embedding(question)
        # nn.RNN returns (per-step outputs, final hidden state); only the
        # final state is used to predict the answer token.
        _, final = self.rnn(embedded_question)
        # The final state carries a leading num_layers axis of size 1; drop it
        # so the linear layer sees (batch, hidden_size).
        output = self.out(final.squeeze(0))

        return output
        

In [91]:
# Training hyperparameters.
learning_rate = 0.001  # step size handed to the Adam optimizer
epochs = 20  # number of full passes over the dataloader

In [92]:
model = SimpleRNN(len(vocabs))

In [93]:
# Cross-entropy loss on the model's vocabulary logits; Adam updates all model parameters.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),lr=learning_rate)

In [94]:
# Train for `epochs` passes over `dataloader`, printing the summed loss of each pass.
for epoch in range(epochs):
    total_loss = 0  # running sum of the per-step losses in this epoch
    for question,answer in dataloader:
        optimizer.zero_grad()  # clear gradients left over from the previous step
        output = model(question)
        # NOTE(review): answer[0] takes the first row of the answer batch; this
        # appears to rely on one sample per batch with a single-token answer —
        # confirm before changing the DataLoader batch size or the answer format.
        loss = criterion(output,answer[0])
        loss.backward()
        optimizer.step()
        total_loss = total_loss + loss.item()
    # Reports the sum (not the mean) of the step losses for the epoch.
    print(f'Epoch : {epoch+1} , Loss : {total_loss:.4f}')


Epoch : 1 , Loss : 527.0996
Epoch : 2 , Loss : 407.8316
Epoch : 3 , Loss : 297.6416
Epoch : 4 , Loss : 208.7935
Epoch : 5 , Loss : 132.9678
Epoch : 6 , Loss : 80.1363
Epoch : 7 , Loss : 49.1645
Epoch : 8 , Loss : 32.8893
Epoch : 9 , Loss : 22.1256
Epoch : 10 , Loss : 16.1213
Epoch : 11 , Loss : 11.9592
Epoch : 12 , Loss : 9.0985
Epoch : 13 , Loss : 7.3446
Epoch : 14 , Loss : 5.9939
Epoch : 15 , Loss : 5.0384
Epoch : 16 , Loss : 4.2730
Epoch : 17 , Loss : 3.6775
Epoch : 18 , Loss : 3.2079
Epoch : 19 , Loss : 2.8128
Epoch : 20 , Loss : 2.5014


In [95]:
def predict(model,question,threshold=0.5):
    """Print the model's answer to ``question``, or "I don't know".

    Args:
        model: Trained network; called with a (1, seq_len) tensor of token
            ids and expected to return logits of shape (1, vocab_size).
        question: Question text; converted to ids with ``text_to_indices``
            using the global ``vocabs``.
        threshold: Minimum softmax probability the best candidate must reach.
            Below it, "I don't know" is printed instead of an answer.

    Returns:
        None. The outcome is printed.
    """
    # convert question to numbers
    numerical_question = text_to_indices(question,vocabs)
    if not numerical_question:
        # A blank question yields no tokens and cannot be fed to the RNN.
        print("I don't know")
        return

    # unsqueeze adds the batch dimension; dtype is explicit because the
    # embedding layer needs integer ids.
    question_tensor = torch.tensor(numerical_question,dtype=torch.long).unsqueeze(0)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        output = model(question_tensor)

    # convert logits to probabilities using softmax
    probab = torch.nn.functional.softmax(output,dim=1)
    # probability and vocabulary id of the most likely token
    value,index = torch.max(probab,dim=1)

    if value.item() < threshold:
        # Stop here so a low-confidence guess is not printed after the
        # "I don't know" message.
        print("I don't know")
        return

    # Ids were assigned in insertion order, so the id is also the key's position.
    print(list(vocabs.keys())[index.item()])
            


In [96]:
predict(model,"What is the largest planed in our solar system ?")

jupiter


In [98]:
predict(model,"What is the capital of france ?")

paris


In [99]:
predict(model,"Who discovered gravity ?")

newton
