In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/uniqueqa/UniqueQA.xlsx


In [2]:
import pandas as pd

import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

import torch.nn as nn
import torch.optim as optim

In [3]:
df = pd.read_excel("/kaggle/input/uniqueqa/UniqueQA.xlsx")

df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [4]:
# tokenize

def tokenize(text):
    text = str(text)
    text = text.lower()
    text = text.replace("?", "")
    text = text.replace("'", "")
    #print(text)
    return text.split()

In [5]:
# create vocab

vocab = {'<UNK>': 0}



In [6]:

def build_vocab(row):

    #print(row)
    #print(row['question'], " : ", row['answer'])

    ques_tokenized = tokenize(row['question'])
    ans_tokenized = tokenize(row['answer'])

    merged_tokens = ques_tokenized + ans_tokenized

    #print(merged_tokens)

    for token in merged_tokens:
        if token not in vocab:
            vocab[token] = len(vocab)



    

In [7]:
df.apply(build_vocab, axis=1)

0     None
1     None
2     None
3     None
4     None
      ... 
85    None
86    None
87    None
88    None
89    None
Length: 90, dtype: object

In [8]:
# convert to numerical indices
def text_to_indices(text, vocab):
    indexed_text = []

    for token in tokenize(text):
        if token in vocab:
            index = vocab[token]
            indexed_text.append(index)
        else:
            indexed_text.append(vocab['<UNK>'])
    
    return indexed_text

In [9]:
text_to_indices('What is CampusX', vocab)

[1, 2, 0]

In [10]:
class QADataset(Dataset):

    def __init__(self, df, vocab):

        self.df = df
        self.vocab = vocab


    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):

        numerical_ques = text_to_indices(self.df.iloc[idx]['question'], self.vocab)
        numerical_ans = text_to_indices(self.df.iloc[idx]['answer'], self.vocab)

        return torch.tensor(numerical_ques), torch.tensor(numerical_ans)
        

In [11]:
dataset = QADataset(df, vocab)

In [12]:
def collate_batch(batch):

    #print(batch)
    data, labels = zip(*batch)
    batch = [torch.tensor(d) if not isinstance(d, torch.Tensor) else d for d in data  ]
    
    padded_batch = pad_sequence(batch, batch_first=True, padding_value = 0)
    #print(f"padded_batch : {len(padded_batch)}")
    return padded_batch, torch.tensor(labels)
dataloader = DataLoader(dataset, batch_size=1, shuffle=True) #, collate_fn = collate_batch)   
#dataloader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn = collate_batch)

In [13]:
# for ques, ans in dataloader:
#     print(ques, ans)
# print(len(dataloader))
# for idx, (ques, ans) in enumerate(dataloader):
#     print(f" idx: {idx}")
#     print(f" ques: {ques}")
#     print(f" ans: {ans}")
    
        

# 23
#  idx: 0
#  ques: tensor([[  1,   2,   3, 234,   5, 235,   0,   0],
#         [  1,   2,   3,   4,   5,  73,   0,   0],
#         [ 42,  18,   2,   3, 281,  12,   3, 282],
#         [  1,   2,   3,   4,   5, 286,   0,   0]])
#  ans: tensor([131,  74, 205, 287])


# Creating NN Module

In [14]:
class SimpleRNN(nn.Module):

    def __init__(self, vocab_size, embedding_dim, no_of_neurons):
        
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(vocab_size, embedding_dim=embedding_dim)
        self.rnn = nn.RNN(embedding_dim, no_of_neurons, batch_first=True)
        self.output = nn.Linear(no_of_neurons, vocab_size)


    def forward(self, question):
        embedded_ques = self.embedding(question)
        # print(f"embedded_ques : {embedded_ques}, shape: {embedded_ques.shape}")
        hidden, final = self.rnn(embedded_ques)
        # print(f"hidden : {hidden}, shape: {hidden.shape}")
        # print(f"final : {final}, shape: {final.shape}")
        final_output = self.output(final)
        # print(f"final_output : {final_output}, shape: {final_output.shape}")
        return final_output.squeeze(0)

# Training

In [15]:
learning_rate = 0.005
epochs = 50

vocab_size, embedding_dim, no_of_neurons = len(vocab), 50, 64
print(f"vocab_size, embedding_dim, no_of_neurons : {vocab_size}, {embedding_dim}, {no_of_neurons}")
model = SimpleRNN(vocab_size, embedding_dim, no_of_neurons)

vocab_size, embedding_dim, no_of_neurons : 324, 50, 64


In [16]:
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr = learning_rate)

In [17]:
for epoch in range(epochs):
    total_loss = []

    for ques, ans in dataloader:
    # for idx, (ques, ans) in enumerate(dataloader):

        # clear grads
        optimizer.zero_grad()


        # forward pass
        y_pred = model(ques)

        #y_pred = y_pred.squeeze(0)
        #print(f"ques: {ques}, {ques.shape}")
        # print(f"y_pred:  {y_pred.shape}")
        # print(f"ans:  {ans.shape}")

        # loss calculation
        loss = loss_function(y_pred, ans[0])

        # calculating gradients
        loss.backward()

        # updating gradients
        optimizer.step()
        
        
        total_loss.append(loss.item())
        # if epoch > 22:
        #     print(f"epoch: {epoch}")
        #     print(f"idx: {idx}")
        #     print(f"ques: {ques}")
        #     print(f"loss.item(): {loss.item()}")
        #     print(f"total_loss: {total_loss}")
        #     print(f"total_loss len: {len(total_loss)}")
            
            
    print(f"for epoch : {epoch}/{epochs}, avg_loss: {sum(total_loss)/len(total_loss)}")

for epoch : 0/50, avg_loss: 5.8994513034820555
for epoch : 1/50, avg_loss: 3.512935147020552
for epoch : 2/50, avg_loss: 1.6862750411033631
for epoch : 3/50, avg_loss: 0.6423856594496303
for epoch : 4/50, avg_loss: 0.3301770228892565
for epoch : 5/50, avg_loss: 0.19584685613711675
for epoch : 6/50, avg_loss: 0.14397725572602615
for epoch : 7/50, avg_loss: 0.09815231391953097
for epoch : 8/50, avg_loss: 0.05986206933028168
for epoch : 9/50, avg_loss: 0.03833759959994091
for epoch : 10/50, avg_loss: 0.026423521381285456
for epoch : 11/50, avg_loss: 0.02130416307805313
for epoch : 12/50, avg_loss: 0.018020256701856852
for epoch : 13/50, avg_loss: 0.015577957277289695
for epoch : 14/50, avg_loss: 0.013547546975314618
for epoch : 15/50, avg_loss: 0.012001551118575864
for epoch : 16/50, avg_loss: 0.010652241364328397
for epoch : 17/50, avg_loss: 0.009522325980166594
for epoch : 18/50, avg_loss: 0.00857429867092934
for epoch : 19/50, avg_loss: 0.007734550370110406
for epoch : 20/50, avg_loss:

In [18]:
def predict(model, ques, threshold=0.5):

    numerical_ques = text_to_indices(ques, vocab)

    ques_tensor = torch.tensor(numerical_ques).unsqueeze(0)

    logits = model(ques_tensor)

    print(logits.shape)

    probab = nn.functional.softmax(logits, dim=1)

    #print(f"probab: {probab}")

    value, index = torch.max(probab, dim=1)
    #print(probab)
    print(value, index)
    print(list(vocab.keys())[index])

    if value < threshold:
        return "I dont know"
    else:
        return list(vocab.keys())[index]
    

    

    
    

In [19]:
answer = predict(model, "President of India")

print(answer)

torch.Size([1, 324])
tensor([0.9175], grad_fn=<MaxBackward0>) tensor([74])
delhi
delhi
