<a href="https://colab.research.google.com/github/laya7171/pytorch/blob/main/RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import torch

In [2]:
df = pd.read_csv("/content/100_Unique_QA_Dataset.csv")

In [3]:
df.head(5)

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [4]:
def tokenize(text):
  text = text.lower()  #preprocessing of the data
  text = text.replace("?", "") #removing the ?, ''
  text = text.replace("'","")
  return text.split()

In [5]:
answer = tokenize("What is the capital of France?")

In [6]:
answer

['what', 'is', 'the', 'capital', 'of', 'france']

In [7]:
vocab = {'<UNK>':0}

In [8]:
def build_vocab(row):
  tokenized_question = tokenize(row['question'])
  tokenized_answer = tokenize(row['answer'])

  merged_token = tokenized_question + tokenized_answer

  for token in merged_token:

    if token != vocab:
      vocab[token] = len(vocab)


In [9]:
df.apply(build_vocab, axis = 1)

Unnamed: 0,0
0,
1,
2,
3,
4,
...,...
85,
86,
87,
88,


In [10]:
vocab

{'<UNK>': 0,
 'what': 317,
 'is': 322,
 'the': 322,
 'capital': 317,
 'of': 320,
 'france': 324,
 'paris': 7,
 'germany': 8,
 'berlin': 9,
 'who': 308,
 'wrote': 189,
 'to': 282,
 'kill': 13,
 'a': 301,
 'mockingbird': 15,
 'harper-lee': 16,
 'largest': 168,
 'planet': 281,
 'in': 297,
 'our': 20,
 'solar': 130,
 'system': 130,
 'jupiter': 23,
 'boiling': 24,
 'point': 60,
 'water': 134,
 'celsius': 27,
 '100': 28,
 'painted': 130,
 'mona': 30,
 'lisa': 31,
 'leonardo-da-vinci': 32,
 'square': 245,
 'root': 245,
 '64': 35,
 '8': 263,
 'chemical': 161,
 'symbol': 161,
 'for': 322,
 'gold': 40,
 'au': 41,
 'which': 322,
 'year': 220,
 'did': 44,
 'world': 185,
 'war': 46,
 'ii': 47,
 'end': 48,
 '1945': 49,
 'longest': 50,
 'river': 51,
 'nile': 52,
 'japan': 260,
 'tokyo': 54,
 'developed': 55,
 'theory': 56,
 'relativity': 57,
 'albert-einstein': 58,
 'freezing': 59,
 'fahrenheit': 60,
 '32': 61,
 'known': 322,
 'as': 319,
 'red': 64,
 'mars': 65,
 'author': 66,
 '1984': 67,
 'george-o

In [11]:
len(vocab)

324

In [12]:
#convert word to numerical values
def text_to_indices(text, vocab):
  indexed_text = []

  for token in tokenize(text):
    if token in vocab:
      indexed_text.append(vocab[token]) #will return the text like [1,2,3]
    else:
      indexed_text.append(vocab['<UNK>'])

  return indexed_text



In [13]:
text_to_indices("what is living stone?", vocab)

[317, 322, 222, 0]

In [14]:
import torch
from torch.utils.data import DataLoader , Dataset

In [15]:
class CustomData(Dataset):

  def __init__(self, df, vocab):
    self.df = df
    self.vocab = vocab


  def __len__(self):
    return self.df.shape[0]


  def __getitem__(self, idx):
    numerical_question = text_to_indices(df.iloc[idx]['question'], self.vocab)
    numerical_answer = text_to_indices(df.iloc[idx]['answer'], self.vocab)

    return torch.tensor(numerical_question), torch.tensor(numerical_answer)

In [16]:
dataset = CustomData(df, vocab)

In [17]:
dataset[0]

(tensor([317, 322, 322, 317, 320, 324]), tensor([7]))

In [18]:
dataloader = DataLoader(dataset, batch_size = 1, shuffle = True)

In [19]:
for question, answer in dataloader:
  print(question, answer)

tensor([[317, 322, 322,  50,  51, 297, 322, 185]]) tensor([[52]])
tensor([[322, 250, 251, 301, 252, 253]]) tensor([[254]])
tensor([[322, 277, 301, 322, 119, 120, 120]]) tensor([[279]])
tensor([[322, 281, 301, 322, 186, 187]]) tensor([[188]])
tensor([[317, 322, 322, 161, 133, 320, 134]]) tensor([[134]])
tensor([[317, 322, 322, 234, 320, 235]]) tensor([[236]])
tensor([[322, 318, 322, 322, 319, 322, 319, 320, 320]]) tensor([[321]])
tensor([[308, 130, 130, 236]]) tensor([[132]])
tensor([[322, 241, 241, 241, 242, 297, 322, 243]]) tensor([[244]])
tensor([[317, 322, 322, 317, 320, 260]]) tensor([[54]])
tensor([[317, 322, 322, 317, 320,  73]]) tensor([[74]])
tensor([[322, 220,  44, 185,  46,  47,  48]]) tensor([[49]])
tensor([[308, 322, 322, 319, 322, 283, 320, 284]]) tensor([[285]])
tensor([[308, 130, 322,  30,  31]]) tensor([[32]])
tensor([[322, 290, 291, 301, 292, 303, 293, 294]]) tensor([[295]])
tensor([[317, 322, 322, 260, 320, 229]]) tensor([[156]])
tensor([[322, 322, 301, 322, 247, 320,

In [20]:
import torch.nn as nn

In [24]:
class Model(nn.Module):

  def __init__(self, vocab_size):
    super().__init__()

    self.embedding = nn.Embedding(vocab_size, embedding_dim=50)
    self.rnn = nn.RNN(50, 64)
    self.fc = nn.Linear(64, vocab_size)

  def forward(self,x):
    output = self.embedding(x)
    hidden, final = self.embedding(x)
    output = self.fc(final)
    return output

In [25]:
lr = 0.001
epochs = 20

In [26]:
model = Model(len(vocab))

In [27]:
criteria = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = lr)

In [28]:
for epoch in range(epochs):
  total_loss = 0

  for question, answer in dataloader:

    optimizer.zero_grad()

    output = model(question)

    loss = criteria(output, answer)

    loss.backward()

    optimizer.step()

    total_loss += loss.item()

  print(f"Epoch {epoch+1}, total loss {total_loss:4f}")


ValueError: not enough values to unpack (expected 2, got 1)