In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import pandas as pd
import numpy as np

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download required NLTK data.
# 'punkt_tab' is the tokenizer resource that word_tokenize loads on newer NLTK
# releases (3.9 and later); the legacy 'punkt' package is still fetched so that
# older NLTK installations keep working.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
def preprocess_text(text):
    """Normalize raw text into a space-separated string of content words.

    The text is lowercased, stripped of punctuation and digits, has its
    whitespace collapsed, is tokenized with NLTK's ``word_tokenize``, and is
    filtered against NLTK's English stop-word list.

    Args:
        text: The string to clean.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # Cleaning passes applied in order: (pattern, replacement).
    cleanup_passes = (
        (r'[^\w\s]', ''),  # punctuation and special characters
        (r'\d+', ''),      # digits
        (r'\s+', ' '),     # runs of whitespace -> one space
    )

    cleaned = text.lower()
    for pattern, replacement in cleanup_passes:
        cleaned = re.sub(pattern, replacement, cleaned)
    cleaned = cleaned.strip()

    tokens = word_tokenize(cleaned)
    english_stops = frozenset(stopwords.words('english'))

    return ' '.join(token for token in tokens if token not in english_stops)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
from torch.nn.utils.rnn import pad_sequence

In [None]:
class Encoder(nn.Module):
    """Single-layer, batch-first LSTM that encodes an embedded input sequence."""

    def __init__(self, input_size, hidden_size):
        """Build the LSTM.

        Args:
            input_size: Size of each input feature vector.
            hidden_size: Size of the LSTM hidden and cell states.
        """
        super().__init__()
        self.enc = nn.LSTM(input_size, hidden_size, num_layers=1, batch_first=True)

    def forward(self, x):
        """Run ``x`` through the LSTM.

        Returns:
            A tuple ``(output, hidden, cell)``: the per-step outputs followed
            by the final hidden and cell states.
        """
        output, final_state = self.enc(x)
        hidden, cell = final_state
        return output, hidden, cell

In [None]:
class Decoder(nn.Module):
    """One LSTM decoding step followed by a projection onto the vocabulary."""

    def __init__(self, input_size, hidden_size, vocab_len):
        """Build the LSTM and the vocabulary projection.

        Args:
            input_size: Size of each input (embedded token) vector.
            hidden_size: Size of the LSTM hidden and cell states.
            vocab_len: Number of entries in the output vocabulary.
        """
        super().__init__()
        self.dec_lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                                num_layers=1, batch_first=True)
        # Normalize over the LAST axis, which is always the vocabulary axis of
        # the Linear output. dim=1 only coincided with it for an unbatched
        # (num_layers, hidden) state; for a batched (num_layers, batch, hidden)
        # state it normalized across the batch instead.
        self.dec_contain = nn.Sequential(nn.Linear(hidden_size, vocab_len),
                                         nn.Softmax(dim=-1))

    def forward(self, x, hidden, cell):
        """Advance the LSTM by the steps in ``x`` and score the vocabulary.

        Args:
            x: Decoder input for this call.
            hidden: Previous hidden state.
            cell: Previous cell state.

        Returns:
            A tuple ``(hidden, cell, word_probs)`` where ``word_probs`` is the
            softmax over the vocabulary computed from the new hidden state.

        Note:
            ``word_probs`` are probabilities, not logits. A loss such as
            ``nn.CrossEntropyLoss`` expects unnormalized logits and will apply
            its own log-softmax on top of these values.
        """
        # The per-step LSTM outputs are not needed; only the final state is used.
        _, (hidden, cell) = self.dec_lstm(x, (hidden, cell))

        word_probs = self.dec_contain(hidden)

        return hidden, cell, word_probs

In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper mapping a source sentence to per-step word probabilities.

    The encoder, decoder and both embedding layers are stored as attributes, so
    their parameters are exposed through ``parameters()``.
    """

    def __init__(self, encoder, decoder, enc_embed, dec_embed, enc_vocab, dec_vocab):
        """Store the sub-modules and vocabularies.

        Args:
            encoder: Module called as ``encoder(x)`` returning ``(output, hidden, cell)``.
            decoder: Module called as ``decoder(x, hidden, cell)`` returning
                ``(hidden, cell, word_probs)``.
            enc_embed: Embedding layer for source-side word indices.
            dec_embed: Embedding layer for target-side word indices.
            enc_vocab: Sequence of source words; a word's position is its index.
            dec_vocab: Sequence of target words; a word's position is its index.
        """
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.enc_embed = enc_embed
        self.dec_embed = dec_embed
        self.enc_vocab = enc_vocab
        self.dec_vocab = dec_vocab

    def get_embeddings(self, embedding, sentence, vocab):
        """Embed every whitespace-separated word of ``sentence``.

        Returns:
            A list with one embedding tensor per word, in sentence order.

        Raises:
            ValueError: If a word is not present in ``vocab`` (from ``vocab.index``).
        """
        return [embedding(torch.tensor(vocab.index(word))) for word in sentence.split()]

    def one_hot_encode(self, word, vocab):
        """One-hot encodes a word given a vocabulary."""
        vector = np.zeros(len(vocab))
        vector[vocab.index(word)] = 1
        return vector

    def forward(self, input_sent, output_sent):
        """Run one teacher-forced pass and return the decoder's word probabilities.

        The source sentence is encoded once; the decoder is then fed the target
        tokens one at a time, starting from the first and stopping before
        ``'<EOS>'``. Row ``t`` of the result is the decoder's distribution after
        consuming target token ``t``, i.e. its prediction for token ``t + 1``.

        No optimization happens here. Running ``backward()`` repeatedly inside
        ``forward`` while reusing one encoder graph and the carried-over decoder
        state is what raised "Trying to backward through the graph a second
        time"; build the loss from the returned tensor in the training loop and
        call this method again for every step so a fresh graph is created.

        Args:
            input_sent: Source sentence as whitespace-separated words.
            output_sent: Target sentence as whitespace-separated words.

        Returns:
            The per-step ``word_probs`` concatenated along dimension 0. With a
            decoder that returns a ``(1, vocab)`` tensor per step this is a
            ``(steps, vocab)`` tensor.

        Raises:
            ValueError: If ``input_sent`` has no words, if ``output_sent`` has
                no token before ``'<EOS>'``, or if a word is missing from its
                vocabulary.
        """
        self.input_vector = self.get_embeddings(self.enc_embed, input_sent, self.enc_vocab)
        self.output_vector = self.get_embeddings(self.dec_embed, output_sent, self.dec_vocab)
        if not self.input_vector:
            raise ValueError("input_sent must contain at least one word")

        _, hidden, cell = self.encoder(torch.stack(self.input_vector))

        step_probs = []
        # split() (not split(' ')) keeps tokens aligned with get_embeddings even
        # when the sentence contains repeated spaces.
        for token, token_embedding in zip(output_sent.split(), self.output_vector):
            if token == '<EOS>':
                break
            hidden, cell, word_probs = self.decoder(token_embedding.unsqueeze(0), hidden, cell)
            step_probs.append(word_probs)

        if not step_probs:
            raise ValueError("output_sent must contain at least one token before '<EOS>'")

        return torch.cat(step_probs, dim=0)

In [None]:
# why this
# loss = loss_fn(probs.view(-1), self.get_one_hot_vector(output_sentence[i]))
# decoder_loss += loss

In [None]:
# The single (source, target) pair used below; the target is wrapped in
# <SOS>/<EOS> markers.
input_sent, output_sent = 'hi how you doing', '<SOS> i am good thank you <EOS>'

In [None]:
# (question, reply) pairs; every reply is wrapped in <SOS>/<EOS> markers.
_sentence_pairs = [
    ("hi how you doing", "<SOS> i am good thank you <EOS>"),
    ("what is your name?", "<SOS> my name is chatbot <EOS>"),
    ("where do you live?", "<SOS> i live on the internet <EOS>"),
    ("what time is it?", "<SOS> it is time to learn <EOS>"),
    ("can you help me?", "<SOS> sure how can i assist you? <EOS>"),
    ("do you like music?", "<SOS> yes i enjoy listening to music. <EOS>"),
]

# Parallel lists: input_sentences[k] is answered by output_sentences[k].
input_sentences = [question for question, _ in _sentence_pairs]
output_sentences = [reply for _, reply in _sentence_pairs]

In [None]:
# Input vocabulary: the distinct words of `input_sent` in first-seen order.
# dict.fromkeys preserves insertion order while discarding repeats.
inp_vocab = list(dict.fromkeys(input_sent.split()))

In [None]:
inp_vocab

['hi', 'how', 'you', 'doing']

In [None]:
# Output vocabulary: the distinct words of `output_sent` in first-seen order.
# dict.fromkeys preserves insertion order while discarding repeats.
out_vocab = list(dict.fromkeys(output_sent.split()))

In [None]:
out_vocab

['<SOS>', 'i', 'am', 'good', 'thank', 'you', '<EOS>']

In [None]:
# Model dimensions.
embedding_size = 50
enc_hidden_size = dec_hidden_size = 100
enc_input_size = dec_input_size = 50

# One embedding table per vocabulary, each with one row per vocabulary word.
inp_embedding_layer = nn.Embedding(len(inp_vocab), embedding_size)
out_embedding_layer = nn.Embedding(len(out_vocab), embedding_size)

In [None]:
print(inp_embedding_layer)

Embedding(4, 50)


In [None]:
print(inp_embedding_layer.weight.detach().numpy())

[[ 2.13076806e+00 -9.33988631e-01  1.21974874e+00  1.28016710e-01
  -1.60798717e+00 -1.95194706e-01 -5.88075258e-02  1.30081868e+00
  -2.13420391e-02 -1.43376380e-01  1.15112531e+00  2.28009343e+00
   1.30249655e+00  5.70573449e-01  6.74722254e-01  4.23277058e-02
   2.76317656e-01  2.39894700e+00 -5.95420182e-01 -2.12876177e+00
   4.73678201e-01 -2.15748653e-01 -2.16076159e+00  1.65899113e-01
  -7.03141689e-01  1.24811184e+00 -4.01742250e-01 -1.85715914e-01
  -5.19189954e-01  1.34843662e-01  8.41690719e-01  8.34490061e-01
   8.72783005e-01  2.92702705e-01  1.05150843e+00 -1.32221782e+00
  -1.18941516e-01 -1.18107140e+00 -2.73309678e-01  2.29983285e-01
   1.78908920e+00 -7.18628764e-01  5.00304937e-01 -1.27007079e+00
   9.45923448e-01 -1.25248659e+00  2.55595624e-01 -5.58273256e-01
   9.77232695e-01  1.95764005e-01]
 [-6.28940821e-01  7.20330656e-01 -5.59481800e-01 -5.21588147e-01
  -3.16611025e-03  1.07826161e+00  6.94356740e-01 -4.65836138e-01
   5.84785044e-01  6.13555074e-01  1.6686

In [None]:
# Instantiate the encoder with the configured input and hidden sizes.
encoder = Encoder(input_size=enc_input_size, hidden_size=enc_hidden_size)

In [None]:
# Embed each word of `input_sent` on its own: a list with one embedding tensor
# per word, in sentence order.
input_vector = [
    inp_embedding_layer(torch.tensor(inp_vocab.index(token)))
    for token in input_sent.split()
]

In [None]:
# Embed each word of `output_sent` on its own: a list with one embedding tensor
# per word, in sentence order.
output_vector = [
    out_embedding_layer(torch.tensor(out_vocab.index(token)))
    for token in output_sent.split()
]

In [None]:
output_vector

[tensor([-0.1284, -0.6546,  1.6615, -1.6890, -0.2014,  0.2810,  0.5544, -1.1776,
         -0.6192,  0.2470, -0.1544,  1.3186,  0.9725,  0.8846,  0.1095, -0.0104,
         -0.6786,  0.7825, -0.4731, -0.3256,  0.4379, -1.3537,  0.2509,  0.5786,
          1.0986,  0.8935, -0.1647,  0.2491,  0.0614,  0.2159, -0.7320,  1.1594,
         -0.0112, -0.5136, -0.2857,  0.2529,  1.6472, -0.3020,  1.1507, -0.5187,
         -1.8043,  1.2281, -1.2050, -0.3544,  0.4508,  0.3842, -1.2760, -1.2438,
          1.2391,  0.6777], grad_fn=<EmbeddingBackward0>),
 tensor([ 5.3754e-01, -1.2638e+00, -3.3719e-01, -1.6494e-01,  4.4901e-01,
         -1.1164e-01, -1.9729e+00, -2.2187e-02, -1.7742e+00,  4.7445e-01,
         -1.1080e+00, -3.7965e-01,  4.0331e-01,  6.6508e-01, -2.0861e+00,
          3.7146e-01,  8.9876e-01,  1.1195e+00,  9.3256e-02, -3.3874e-01,
         -1.0177e-01, -7.1229e-01,  1.0532e+00, -7.5843e-01,  5.0591e-01,
         -1.5059e+00,  4.3468e-01, -5.3279e-02,  8.2676e-01, -1.5547e+00,
          1

In [None]:
input_vector

[tensor([ 2.1308, -0.9340,  1.2197,  0.1280, -1.6080, -0.1952, -0.0588,  1.3008,
         -0.0213, -0.1434,  1.1511,  2.2801,  1.3025,  0.5706,  0.6747,  0.0423,
          0.2763,  2.3989, -0.5954, -2.1288,  0.4737, -0.2157, -2.1608,  0.1659,
         -0.7031,  1.2481, -0.4017, -0.1857, -0.5192,  0.1348,  0.8417,  0.8345,
          0.8728,  0.2927,  1.0515, -1.3222, -0.1189, -1.1811, -0.2733,  0.2300,
          1.7891, -0.7186,  0.5003, -1.2701,  0.9459, -1.2525,  0.2556, -0.5583,
          0.9772,  0.1958], grad_fn=<EmbeddingBackward0>),
 tensor([-0.6289,  0.7203, -0.5595, -0.5216, -0.0032,  1.0783,  0.6944, -0.4658,
          0.5848,  0.6136,  0.1669, -0.1351, -1.7730, -0.2589,  0.0876,  1.4684,
          0.4889,  0.5943,  1.1865, -0.2900, -0.3549, -0.0453, -0.0140, -0.4783,
          1.9465,  0.1940,  0.8903,  0.1170,  0.6455,  0.8125,  0.6040, -1.0522,
         -0.0529, -1.3983, -0.2364,  0.8449,  2.1243, -0.1362,  0.6104, -0.2816,
         -0.7518, -1.4539,  1.5865,  0.7418, -1.65

In [None]:
# output, hidden, cell = encoder(torch.stack(input_vector))

In [None]:
# dec_input_size, dec_hidden_size, len(out_vocab)

In [None]:
# out_vocab

In [None]:
# Instantiate the decoder; its output layer has one unit per output-vocabulary word.
decoder = Decoder(input_size=dec_input_size, hidden_size=dec_hidden_size, vocab_len=len(out_vocab))

In [None]:
# output_vector[0].unsqueeze(0).shape

In [None]:
# decoder(torch.stack(output_vector), hidden, cell)

In [None]:
# prompt: get one hot encoded values of out_vocab

# Assuming 'out_vocab' is a list of unique output vocabulary words
import numpy as np

def one_hot_encode(word, vocab):
  """Return a float vector of length ``len(vocab)`` with a 1 at ``word``'s position.

  Raises ``ValueError`` (from ``vocab.index``) when ``word`` is not in ``vocab``.
  """
  encoded = np.zeros(len(vocab))
  encoded[vocab.index(word)] = 1
  return encoded

# Example usage: encode every vocabulary word against the vocabulary itself.
one_hot_encodings = []
for vocab_word in out_vocab:
  one_hot_encodings.append(one_hot_encode(vocab_word, out_vocab))

# Show each one-hot vector on its own line.
for row in one_hot_encodings:
  print(row)


[1. 0. 0. 0. 0. 0. 0.]
[0. 1. 0. 0. 0. 0. 0.]
[0. 0. 1. 0. 0. 0. 0.]
[0. 0. 0. 1. 0. 0. 0.]
[0. 0. 0. 0. 1. 0. 0.]
[0. 0. 0. 0. 0. 1. 0.]
[0. 0. 0. 0. 0. 0. 1.]


In [None]:
# one_hot_encodings

In [None]:
# # prompt: find what word is where in one hot encoding

# import numpy as np
# # Assuming 'out_vocab' is a list of unique output vocabulary words
# # and 'one_hot_encodings' contains the corresponding one-hot encodings

def find_word_from_one_hot(one_hot_vector, vocab):
  """Return the vocabulary entry at the position of the vector's largest value."""
  return vocab[np.argmax(one_hot_vector)]

# # Example usage:
# for encoding in one_hot_encodings:
#   word = find_word_from_one_hot(encoding, out_vocab)
#   print(encoding, "-->", word)


In [None]:
# Integer class indices 0 through 6.
target = torch.arange(7)
print(target)

tensor([0, 1, 2, 3, 4, 5, 6])


In [None]:
input_sentences[0]

'hi how you doing'

In [None]:
output_sentences[0]

'<SOS> i am good thank you <EOS>'

In [None]:
inp_vocab

['hi', 'how', 'you', 'doing']

In [None]:
# Bundle the encoder, decoder, embedding layers and vocabularies into one module.
seq2seq = Seq2Seq(
    encoder=encoder,
    decoder=decoder,
    enc_embed=inp_embedding_layer,
    dec_embed=out_embedding_layer,
    enc_vocab=inp_vocab,
    dec_vocab=out_vocab,
)

In [None]:

# Optimization setup: cross-entropy loss and Adam over every parameter that
# seq2seq exposes.
learning_rate = 0.001
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(seq2seq.parameters(), lr=learning_rate)


In [None]:
# Recover the class index of every one-hot vector.
target = np.array(list(map(np.argmax, one_hot_encodings)))

In [None]:
torch.tensor(target)

tensor([0, 1, 2, 3, 4, 5, 6])

In [None]:
input_vector

[tensor([ 0.2720, -0.3124, -0.4127, -0.2861, -1.2205,  0.6570,  0.4520, -1.6402,
          0.7862,  1.0680,  1.4405,  0.9481,  1.3281,  1.0168, -0.2595,  2.6585,
         -1.1429, -0.0527, -0.7941,  0.5246,  0.6063, -2.2283, -0.3867, -0.9152,
         -0.7451,  0.5235,  0.6943,  1.7494, -0.7467,  0.0274,  0.2308, -0.4509,
         -1.5400, -1.6771,  0.3777, -1.4154,  0.3567,  2.1284, -1.3562, -1.1239,
         -1.1784, -0.5821,  0.0455, -1.2773, -0.4938, -0.5078, -1.3935, -0.5019,
         -0.5406, -0.7392], grad_fn=<EmbeddingBackward0>),
 tensor([ 0.4347,  2.1043, -0.2836,  1.0755,  1.0242, -0.0472,  0.4805, -0.5692,
         -0.4540,  0.3467, -0.5508, -0.4473,  0.7816,  0.8689, -0.5304, -0.8568,
         -0.8221,  0.6589,  1.3224, -0.1596,  0.6737,  0.1054, -0.1699,  0.3529,
          0.9251, -1.7029, -0.9851, -1.1845,  1.1419,  0.8968,  0.1494, -0.9110,
          1.0235, -0.8580, -0.4716, -0.9589, -0.8099, -0.4303,  0.9083,  0.2684,
          0.5482, -1.5455, -0.5920, -0.0971,  0.91

In [None]:
# Run the model on the sample sentence pair.
output = seq2seq(input_sent=input_sent, output_sent=output_sent)

<SOS> i am good thank you <EOS>
['<SOS>', 'i', 'am', 'good', 'thank', 'you', '<EOS>']
<SOS>
i
am
good
thank
you
tensor(11.4741, grad_fn=<AddBackward0>)
<SOS> i am good thank you <EOS>
['<SOS>', 'i', 'am', 'good', 'thank', 'you', '<EOS>']
<SOS>
i
am
good
thank
you
tensor(11.4750, grad_fn=<AddBackward0>)


RuntimeError: Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved tensors after calling backward.

In [None]:
output

In [None]:
# loss = criterion(output, torch.tensor(target))


In [None]:
# print(loss)

In [None]:
# loss.backward()
# optimizer.step()
# optimizer.zero_grad()

In [None]:
# torch.stack(input_vector).shape

In [None]:
# Train on the single (input_sent, output_sent) pair with teacher forcing.
#
# Every epoch rebuilds the whole graph (embeddings -> encoder -> decoder), so
# backward() never revisits a graph that a previous epoch already freed.
# NOTE(review): the Decoder defined above ends in a Softmax while
# nn.CrossEntropyLoss expects unnormalized logits; training still runs, but the
# loss is computed on probabilities. Confirm whether that is intended.
num_epochs = 100
target_tokens = output_sent.split()
source_indices = torch.tensor([inp_vocab.index(token) for token in input_sent.split()])
target_indices = torch.tensor([out_vocab.index(token) for token in target_tokens])

for epoch in range(num_epochs):
    optimizer.zero_grad()

    # Fresh encoder pass; its final state seeds the decoder.
    _, hidden, cell = encoder(inp_embedding_layer(source_indices))
    decoder_inputs = out_embedding_layer(target_indices)

    total_loss = 0
    word = 0
    # Stop at <EOS>, or at the last token if the sentence has no <EOS>.
    while word + 1 < len(target_tokens) and target_tokens[word] != '<EOS>':
        # Feed token `word`; the decoder returns (hidden, cell, word_probs).
        hidden, cell, word_probs = decoder(decoder_inputs[word].unsqueeze(0), hidden, cell)
        # The prediction made after seeing token `word` is scored against the
        # NEXT token, not the one that was just fed in.
        total_loss = total_loss + criterion(word_probs, target_indices[word + 1].unsqueeze(0))
        word += 1

    print(total_loss)
    total_loss.backward()
    # Update weights.
    optimizer.step()

In [None]:
word

In [None]:
output_sent#[word]

In [None]:
# Run the model again on the sample sentence pair.
output = seq2seq(input_sent=input_sent, output_sent=output_sent)

In [None]:
output.shape

In [None]:
def softmax_to_ohe(softmax_tensor):
    """Turn each row of a 2-D score tensor into a one-hot row marking its maximum.

    Returns a new tensor shaped like ``softmax_tensor`` that is 1.0 at every
    row's arg-max column and 0 elsewhere.
    """
    # Column index of the largest entry per row, kept as a column for scatter_.
    winners = softmax_tensor.argmax(dim=1, keepdim=True)

    # Start from zeros and write a single 1.0 into each row.
    return torch.zeros_like(softmax_tensor).scatter_(1, winners, 1.0)

In [None]:
# Collapse every predicted distribution to its most likely word.
output_ohe = softmax_to_ohe(softmax_tensor=output)

In [None]:
# Decode row by row: skip <SOS>, stop at the first <EOS>, and follow every kept
# word with a single space.
decoded_words = []
for ohe in output_ohe:
  curr_word = find_word_from_one_hot(ohe, out_vocab)
  if curr_word == '<EOS>':
    break
  if curr_word != '<SOS>':
    decoded_words.append(curr_word + ' ')
sent = ''.join(decoded_words)

In [None]:
sent