In [1]:
import pandas as pd
import numpy as np
import torch 
import torch.nn as nn

In [2]:
# Load the question/answer pairs from CSV; later cells read the
# 'question' and 'answer' columns of this frame.
df = pd.read_csv("../../../Data/100_Unique_QA_Dataset.csv")
# Bare expression so the notebook renders the frame as the cell output.
df

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100
...,...,...
85,Who directed the movie 'Titanic'?,JamesCameron
86,Which superhero is also known as the Dark Knight?,Batman
87,What is the capital of Brazil?,Brasilia
88,Which fruit is known as the king of fruits?,Mango


# 1. Tokenization

In [3]:
def tokenize(text):
    """Split ``text`` into lower-case, whitespace-separated tokens.

    Question marks and single quotes are removed before splitting; all other
    characters (including hyphens) are kept, so "Harper-Lee" stays one token.

    Args:
        text: The raw string to tokenise.

    Returns:
        A list of token strings (empty if ``text`` has no non-space content).
    """
    cleaned = text.lower()
    # Strip the only two punctuation marks this notebook treats as noise.
    for mark in ("?", "'"):
        cleaned = cleaned.replace(mark, "")
    return cleaned.split()

In [4]:
# Sanity check: the text is lower-cased and the quotes / question mark are stripped.
tokenize("Who wrote 'To Kill a Mockingbird'?")

['who', 'wrote', 'to', 'kill', 'a', 'mockingbird']

# 2. Build vocab

In [5]:
# Token -> integer id mapping, filled in by build_vocab.
# Id 0 is reserved for "UNK"; text_to_numeric_rep emits 0 for any token
# that is not present in this mapping.
vocab = {
    "UNK":0
}

In [6]:
def build_vocab(row):
    """Register every unseen token of one dataset row in the global ``vocab``.

    Both the 'question' and the 'answer' field of ``row`` are tokenised; each
    token that is not yet a key of ``vocab`` is assigned the next free id
    (the current size of ``vocab``). Tokens already present keep their id.

    Args:
        row: A mapping-like object with 'question' and 'answer' string entries
            (e.g. one row of the DataFrame).

    Returns:
        None. ``vocab`` is updated in place.
    """
    row_tokens = tokenize(row['question']) + tokenize(row['answer'])

    for tok in row_tokens:
        # len(vocab) is evaluated before insertion, so a new token receives
        # the next consecutive id; setdefault leaves known tokens untouched.
        vocab.setdefault(tok, len(vocab))

In [7]:
# Smoke-test on the first row. build_vocab only adds unseen tokens, so
# processing this row again later does not create duplicate entries.
build_vocab(df.iloc[0,:])

In [8]:
# Run build_vocab on every row purely for its side effect on `vocab`.
# build_vocab returns None, so the resulting Series of None carries no information.
df.apply(build_vocab, axis=1)

0     None
1     None
2     None
3     None
4     None
      ... 
85    None
86    None
87    None
88    None
89    None
Length: 90, dtype: object

In [9]:
# Inspect the populated token -> id mapping.
vocab

{'UNK': 0,
 'what': 1,
 'is': 2,
 'the': 3,
 'capital': 4,
 'of': 5,
 'france': 6,
 'paris': 7,
 'germany': 8,
 'berlin': 9,
 'who': 10,
 'wrote': 11,
 'to': 12,
 'kill': 13,
 'a': 14,
 'mockingbird': 15,
 'harper-lee': 16,
 'largest': 17,
 'planet': 18,
 'in': 19,
 'our': 20,
 'solar': 21,
 'system': 22,
 'jupiter': 23,
 'boiling': 24,
 'point': 25,
 'water': 26,
 'celsius': 27,
 '100': 28,
 'painted': 29,
 'mona': 30,
 'lisa': 31,
 'leonardo-da-vinci': 32,
 'square': 33,
 'root': 34,
 '64': 35,
 '8': 36,
 'chemical': 37,
 'symbol': 38,
 'for': 39,
 'gold': 40,
 'au': 41,
 'which': 42,
 'year': 43,
 'did': 44,
 'world': 45,
 'war': 46,
 'ii': 47,
 'end': 48,
 '1945': 49,
 'longest': 50,
 'river': 51,
 'nile': 52,
 'japan': 53,
 'tokyo': 54,
 'developed': 55,
 'theory': 56,
 'relativity': 57,
 'albert-einstein': 58,
 'freezing': 59,
 'fahrenheit': 60,
 '32': 61,
 'known': 62,
 'as': 63,
 'red': 64,
 'mars': 65,
 'author': 66,
 '1984': 67,
 'george-orwell': 68,
 'currency': 69,
 'united

# 3. Text to numeric representation

In [10]:
def text_to_numeric_rep(text, vocab):
    """Convert raw text into a list of vocabulary ids.

    The text is split with ``tokenize``; each token found in ``vocab`` is
    replaced by its id, and every token missing from ``vocab`` by 0.

    Args:
        text: The raw string to encode.
        vocab: Mapping from token string to integer id.

    Returns:
        A list of integer ids, one per token, in token order.
    """
    # dict.get performs the membership test and the lookup in one step.
    return [vocab.get(token, 0) for token in tokenize(text)]

In [11]:
# print(text_to_numeric_rep("What is the capital of Germany?", vocab))

# 4. Dataloader

In [12]:
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    """Serve (question, answer) pairs of a DataFrame as tensors of token ids.

    Args:
        df: Frame with 'question' and 'answer' text columns.
        vocab: Mapping from token string to integer id used for encoding.
    """

    def __init__(self, df, vocab):
        self.df, self.vocab = df, vocab

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return the encoded question and answer of the row at ``index``.

        Returns:
            A ``(question_tensor, answer_tensor)`` tuple built with
            ``torch.tensor`` from the id lists of the two text fields.
        """
        row = self.df.iloc[index]
        # Encode the question first, then the answer, with the same vocabulary.
        question_ids, answer_ids = (
            text_to_numeric_rep(row[column], self.vocab)
            for column in ('question', 'answer')
        )
        return torch.tensor(question_ids), torch.tensor(answer_ids)

In [13]:
# Wrap the frame so each row is encoded to token-id tensors on access.
dataset = CustomDataset(df,vocab)

In [14]:
# One sample per batch, visited in random order each epoch.
# NOTE(review): no collate_fn/padding is defined, so larger batches would
# presumably fail to stack questions of different lengths — confirm before
# raising batch_size. The training loop's `answer[0]` also relies on batch_size=1.
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

In [15]:
# Peek at a single batch to check the tensor layout, then stop iterating.
for ques, ans in dataloader:
    print(ques,ans)
    break

tensor([[ 42,  86,  87, 241, 242,  19,  39, 243]]) tensor([[244]])


# 5. Model Build

In [19]:
class MyRNN(nn.Module):
    """Embedding -> RNN -> linear model that scores every vocabulary entry.

    Args:
        vocab_size: Number of vocabulary entries; sets both the number of
            embedding rows and the number of output scores.
        embedding_dim: Size of each token embedding (default 50).
        hidden_size: Size of the RNN hidden state (default 64).
    """

    def __init__(self, vocab_size, embedding_dim=50, hidden_size=64):
        super().__init__()

        # The three layers must agree on embedding_dim / hidden_size, so the
        # sizes are taken from one place instead of repeated literals.
        self.embedding = nn.Embedding(vocab_size, embedding_dim=embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, question):
        """Return unnormalised vocabulary scores for each question.

        Args:
            question: Integer tensor of token ids, shape (batch, seq_len).

        Returns:
            Tensor of shape (batch, vocab_size) holding raw logits
            (no softmax is applied here).
        """
        embedded_ques = self.embedding(question)
        # nn.RNN returns (per-step outputs, final hidden state); only the
        # final hidden state, shape (num_layers, batch, hidden_size), is used.
        _, final_hidden = self.rnn(embedded_ques)
        # Select the last (here: only) layer -> (batch, hidden_size).
        return self.fc(final_hidden[-1])

# 6. Train Model

In [20]:
# Training hyperparameters: optimizer step size and number of passes over the data.
learning_rate = 0.001
epochs = 50

In [21]:
# The model's embedding table and output layer are both sized to the vocabulary.
model = MyRNN(len(vocab))

# Cross-entropy over the vocabulary logits: the answer token id is the class label.
criterion = nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [22]:
# Train for `epochs` passes over the dataloader, one optimizer step per batch.
for epoch in range(epochs):
    epochs_loss = 0  # running sum of per-batch losses for this epoch
    for question, answer in dataloader:
        # clear gradients left over from the previous step
        optimizer.zero_grad()
        
        # raw logits from the model's final linear layer
        y_pred = model(question)
        
        # answer[0] drops the batch dimension added by the DataLoader so the
        # target is 1-D. NOTE(review): this relies on batch_size=1 and on every
        # answer tokenising to exactly one id — confirm before changing the data.
        loss = criterion(y_pred, answer[0])
        
        loss.backward()
        
        optimizer.step()
        
        epochs_loss += loss.item()
    # mean loss per batch over this epoch
    avg_epoch_loss = epochs_loss/len(dataloader)
        
    print(f"Epoch {epoch+1}: Loss {avg_epoch_loss}")

Epoch 1: Loss 5.835732126235962
Epoch 2: Loss 5.074254258473714
Epoch 3: Loss 4.1657337692048815
Epoch 4: Loss 3.4849736319647895
Epoch 5: Loss 2.9205668873257107
Epoch 6: Loss 2.3967658506499396
Epoch 7: Loss 1.9206106238894993
Epoch 8: Loss 1.5023357066843244
Epoch 9: Loss 1.157876263724433
Epoch 10: Loss 0.8862549202309714
Epoch 11: Loss 0.6799574772516886
Epoch 12: Loss 0.5270513297783004
Epoch 13: Loss 0.4203493810362286
Epoch 14: Loss 0.33833623445696304
Epoch 15: Loss 0.27879472788837223
Epoch 16: Loss 0.23149375046292942
Epoch 17: Loss 0.19881039361159006
Epoch 18: Loss 0.17142770571841134
Epoch 19: Loss 0.14795433688494894
Epoch 20: Loss 0.12859844813744228
Epoch 21: Loss 0.11277186013758182
Epoch 22: Loss 0.09929669325550397
Epoch 23: Loss 0.08816642533573839
Epoch 24: Loss 0.07856650207605627
Epoch 25: Loss 0.06997940374745262
Epoch 26: Loss 0.06305696583456463
Epoch 27: Loss 0.05694458790951305
Epoch 28: Loss 0.051871440766586195
Epoch 29: Loss 0.04721116423606873
Epoch 30:

# 7. Evaluation

In [38]:
def Predict(model, question, threshold=0.5):
    """Print the model's answer to ``question``, or a fallback when unsure.

    The question is encoded with the notebook-level ``vocab``, passed through
    ``model`` as a batch of one, and the highest-probability vocabulary entry
    is printed if its softmax probability exceeds ``threshold``.

    Args:
        model: Callable mapping a (1, seq_len) tensor of token ids to
            (1, vocab_size) logits.
        question: Raw question text.
        threshold: Minimum softmax probability required to print the answer.

    Returns:
        None. The answer token, or "I  dont Know", is printed.
    """
    # convert text to number
    numerical_ques = text_to_numeric_rep(question, vocab)

    # An empty or punctuation-only question yields no tokens. torch.tensor([])
    # would be a float tensor that the embedding lookup rejects, so answer
    # with the fallback instead of raising.
    if not numerical_ques:
        print("I  dont Know")
        return

    # make it a (1, seq_len) batch of integer token ids
    tensor_ques = torch.tensor(numerical_ques, dtype=torch.long).unsqueeze(0)

    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        logits = model(tensor_ques)

    # turn the raw logits into probabilities over the vocabulary
    probs = torch.nn.functional.softmax(logits, dim=1)

    max_prob, index = torch.max(probs, dim=1)

    if max_prob.item() > threshold:
        # vocab preserves insertion order, so position == token id
        print(list(vocab.keys())[index.item()])
    else:
        print("I  dont Know")

In [39]:
# Query the trained model with a question that appears in the dataset (row 0).
Predict(model, "What is the capital of France?")

paris


In [40]:
# Cross-check the id -> token lookup that Predict performs, here for id 7.
list(vocab.keys())[7]

'paris'