# LSTM Bot

## Project Overview

In this project, you will build a chatbot that can converse with you on a variety of questions. The chatbot uses a sequence-to-sequence (Seq2Seq) text generation model with an LSTM as its memory unit. You will also learn to use pretrained word embeddings to improve the performance of the model. At the conclusion of the project, you will be able to show your chatbot to potential employers.

Additionally, you have the option to use pretrained word embeddings in your model. We have loaded Brown Embeddings from Gensim in the starter code below. Compare the performance of your model with pre-trained embeddings versus without.



---



A sequence to sequence model (Seq2Seq) has two components:
- An Encoder consisting of an embedding layer and LSTM unit.
- A Decoder consisting of an embedding layer, LSTM unit, and linear output unit.

The Seq2Seq model works by accepting an input into the Encoder, passing the hidden state from the Encoder to the Decoder, which the Decoder uses to output a series of token predictions.

## Dependencies

- PyTorch
- torchtext
- NumPy
- pandas
- NLTK
- scikit-learn
- gzip (Python standard library)
- Gensim


Please choose a dataset from the Torchtext website. We recommend looking at the Multi30k and SQuAD datasets first. Here is a link to the website where you can view your options:

- https://pytorch.org/text/stable/datasets.html





In [None]:
pip install torchdata==0.3.0 torchvision==0.12.0 torchtext==0.12.0 torch

In [None]:
import gensim
import nltk
import numpy as np
import pandas as pd
import gzip
import torch
from nltk.corpus import brown

# Fetch the NLTK resources used in this notebook: the Brown corpus (the
# training text for the embeddings below) and the Punkt tokenizer data.
nltk.download('brown')
nltk.download('punkt')

# Output, save, and load brown embeddings

# Train a Word2Vec model on the Brown corpus sentences using gensim's default
# settings, then persist it to the working directory.
model = gensim.models.Word2Vec(brown.sents())
model.save('brown.embedding')

# Reload the saved embeddings. `w2v` is the handle to keep: the name `model`
# is rebound to the Seq2Seq network further down in this file.
w2v = gensim.models.Word2Vec.load('brown.embedding')

In [None]:
from torchtext import datasets

In [None]:
def loadDF():
    """Build a question/answer DataFrame from the SQuAD 2.0 training split.

    Each example yielded by the dataset is a
    ``(context, question, answers, indices)`` tuple.  Only examples whose
    first listed answer is non-empty are kept; for those, the question and
    that first answer are recorded.

    Returns:
        pandas.DataFrame: one row per kept example, with the columns
        ``question`` and ``answer``.
    """
    # SQuAD2() returns the train and dev splits; only the train split is used.
    train_iter, _ = datasets.SQuAD2()

    data = {"question": [], "answer": []}
    for _context, question, answers, _indices in train_iter:
        # Guard against an empty answer list as well as an empty first answer.
        if answers and answers[0]:
            data["question"].append(question)
            data["answer"].append(answers[0])

    return pd.DataFrame.from_dict(data)
#### note: this function is from a comment on the forum here - https://knowledge.udacity.com/questions/888774

In [None]:
# Materialize the question/answer DataFrame (iterates the whole training split).
data = loadDF()

In [None]:
import nltk
from nltk.tokenize import RegexpTokenizer

def prepare_text(sentence):
    """Turn a raw sentence into a list of lowercase word tokens.

    The sentence is split with NLTK's ``RegexpTokenizer`` into runs of word
    characters, so punctuation and whitespace are dropped.  Each token is
    lowercased after splitting.
    https://www.nltk.org/api/nltk.tokenize.html

    Returns:
        list of lowercase token strings, in their original order.
    """
    word_tokenizer = RegexpTokenizer(r'\w+')
    return [word.lower() for word in word_tokenizer.tokenize(sentence)]

In [None]:
# Add tokenized (lowercased, punctuation-free) versions of both text columns.
data['question_tokens'] = data['question'].apply(prepare_text)
data['answer_tokens'] = data['answer'].apply(prepare_text)

In [None]:
# Notebook display: preview the DataFrame, including the new token columns.
data

In [None]:
from sklearn.model_selection import train_test_split
def split(SRC, TRG):
    """Split questions and responses into aligned train and test sets.

    Input: SRC, our list of questions from the dataset
            TRG, our list of responses from the dataset

    Output: a 4-tuple ``(SRC_train, SRC_test, TRG_train, TRG_test)``.
            20% of the rows are held out for testing, and the shuffle is
            seeded (random_state=42) so the split is reproducible.
    """
    partitions = train_test_split(
        SRC,
        TRG,
        test_size=0.2,
        random_state=42,
    )
    src_train, src_test, trg_train, trg_test = partitions
    return src_train, src_test, trg_train, trg_test


In [None]:
# Questions are the source (SRC) sequences, answers the target (TRG) sequences.
SRC_train_dataset, SRC_test_dataset, TRG_train_dataset, TRG_test_dataset = split(data['question_tokens'], data['answer_tokens'])

In [None]:
# Notebook display: tokenized training questions.
SRC_train_dataset

In [None]:
# Notebook display: tokenized training answers.
TRG_train_dataset

In [None]:
class Vocabulary:
    """Two-way mapping between tokens and integer ids, with usage counts.

    A fresh vocabulary contains only the unknown-token placeholder
    ``'<UNK>'`` (id 0).  Ids are handed out in first-seen order, and
    ``token_to_index`` maps any token that is not in the vocabulary to the
    ``'<UNK>'`` id.

    Invariant maintained by every method: the ids in use are exactly
    ``0 .. len(vocab) - 1``, so ``len(vocab)`` is a valid embedding-table
    size for the ids this object produces.
    """

    # Placeholder that stands in for tokens missing from the vocabulary.
    UNK_TOKEN = '<UNK>'

    def __init__(self):
        self.word2index = {}   # token -> id
        self.word2count = {}   # token -> number of times it was added
        self.index2word = {}   # id -> token
        self.num_words = 0     # number of distinct tokens (incl. '<UNK>')

        self.add_token(self.UNK_TOKEN)

    def add_token(self, token):
        """Register one occurrence of ``token``, assigning a new id if unseen."""
        if token in self.word2index:
            self.word2count[token] += 1
            return

        self.word2index[token] = self.num_words
        self.word2count[token] = 1
        self.index2word[self.num_words] = token
        self.num_words += 1

    def add_tokens(self, tokens):
        """Register one occurrence of every token in the iterable ``tokens``."""
        for token in tokens:
            self.add_token(token)

    def discard_rare_words(self, min_count):
        """Remove every token that was added fewer than ``min_count`` times.

        ``'<UNK>'`` is always kept, whatever its count, because
        ``token_to_index`` relies on it for unknown tokens.  The surviving
        tokens are renumbered ``0 .. n-1`` in their original order so that
        ids stay contiguous.  Any ids obtained before this call are therefore
        stale and must be recomputed.
        """
        survivors = [
            token
            for token in self.word2index
            if token == self.UNK_TOKEN or self.word2count[token] >= min_count
        ]

        self.word2count = {token: self.word2count[token] for token in survivors}
        self.word2index = {token: index for index, token in enumerate(survivors)}
        self.index2word = {index: token for token, index in self.word2index.items()}
        self.num_words = len(self.word2index)

    def __len__(self):
        return self.num_words

    def __str__(self):
        return f"Vocabulary size: {self.num_words}"

    def token_to_index(self, token):
        """Return the id of ``token``, or the ``'<UNK>'`` id if it is unknown."""
        # Look up the fallback lazily: it is only needed for unknown tokens.
        try:
            return self.word2index[token]
        except KeyError:
            return self.word2index[self.UNK_TOKEN]

    def index_to_token(self, index):
        """Return the token stored under ``index``, or ``'<UNK>'`` if none is."""
        return self.index2word.get(index, self.UNK_TOKEN)

    def get_token_count(self, token):
        """Return how many times ``token`` was added (0 if it is not present)."""
        return self.word2count.get(token, 0)

In [None]:
# Three vocabularies: one shared by questions and answers, one for questions
# only (src) and one for answers only (trg).
vocabulary = Vocabulary()
vocabulary_src = Vocabulary()
vocabulary_trg = Vocabulary()

In [None]:
# Register every training-question token in the shared and the src vocabulary.
for row in SRC_train_dataset:
    vocabulary.add_tokens(row)
    vocabulary_src.add_tokens(row)

In [None]:
# Register every training-answer token in the shared and the trg vocabulary.
for row in TRG_train_dataset:
    vocabulary.add_tokens(row)
    vocabulary_trg.add_tokens(row)

In [None]:
# Smoke-test the shared vocabulary: its size, the id of 'how', the token
# stored under id 3, and how often 'how' was seen.
print(vocabulary)
print(vocabulary.token_to_index('how'))
print(vocabulary.index_to_token(3))
print(vocabulary.get_token_count('how'))

In [None]:
# Drop tokens seen fewer than 2 times from the shared vocabulary
# (vocabulary_src and vocabulary_trg are not pruned), then print the new size.
vocabulary.discard_rare_words(2)
print(vocabulary)

In [None]:
# Notebook display: number of entries in the question-only vocabulary.
len(vocabulary_src)

In [None]:
import torch.nn as nn
class Encoder(nn.Module):
    """Embed a source sequence and summarize it with a single-layer LSTM.

    Args:
        input_size: number of rows in the embedding table, i.e. the number
            of distinct source token ids.
        hidden_size: width of both the embeddings and the LSTM state.
    """

    def __init__(self, input_size, hidden_size):

        super(Encoder, self).__init__()

        self.hidden_size = hidden_size
        self.input_size = input_size

        # No initial state is stored on the module: nn.LSTM starts from
        # zeros on its own when no state is passed to it.

        # self.embedding provides a vector representation of the inputs to our model
        self.embedding = nn.Embedding(self.input_size, self.hidden_size)

        # self.lstm, accepts the vectorized input and passes a hidden state
        self.lstm = nn.LSTM(self.hidden_size, self.hidden_size, 1)

    def forward(self, i):
        '''
        Inputs: i, the src tensor of token ids, time-major: (src_len, batch)
        Outputs: o, the encoder outputs, (src_len, batch, hidden_size)
                h, the final hidden state, (1, batch, hidden_size)
                c, the final cell state, (1, batch, hidden_size)
        '''
        embedded = self.embedding(i)
        o, (h, c) = self.lstm(embedded)

        return o, h, c


class Decoder(nn.Module):
    """Predict the next target token from the previous token and LSTM state.

    Args:
        hidden_size: width of both the embeddings and the LSTM state.
        output_size: number of distinct target token ids (size of the
            embedding table and of the logits returned per step).
    """

    def __init__(self, hidden_size, output_size):

        super(Decoder, self).__init__()

        self.hidden_size = hidden_size
        self.output_size = output_size

        # self.embedding provides a vector representation of the target to our model
        self.embedding = nn.Embedding(output_size, self.hidden_size)

        # self.lstm, accepts the embeddings and outputs a hidden state
        self.lstm = nn.LSTM(self.hidden_size, self.hidden_size)

        # self.output, predicts on the hidden state via a linear output layer
        self.output = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, i, h):
        '''
        Runs a single decoding step.

        Inputs: i, the target token ids for this step, shape (batch,)
                h, the previous state: either an (h, c) tuple as returned by
                   this method, or a bare hidden-state tensor of shape
                   (1, batch, hidden_size), in which case the cell state
                   starts from zeros
        Outputs: o, the unnormalized token scores, (batch, output_size)
                h, the new LSTM state as an (h, c) tuple, ready to be passed
                   back in on the next step
        '''
        if isinstance(h, tuple):
            state = h
        else:
            state = (h, torch.zeros_like(h))

        # (batch, hidden) -> (1, batch, hidden): a sequence of length one.
        embedded = self.embedding(i).unsqueeze(0)

        o, state = self.lstm(embedded, state)

        # Drop the length-one time axis before projecting to the vocabulary.
        o = self.output(o.squeeze(0))

        return o, state


class Seq2Seq(nn.Module):
    """Encoder-decoder model: encode ``src``, then decode ``trg`` step by step.

    Args:
        encoder_input_size: number of distinct source token ids.
        encoder_hidden_size: hidden width of the encoder.
        decoder_hidden_size: hidden width of the decoder; must equal
            ``encoder_hidden_size`` because the encoder state is handed to
            the decoder unchanged.
        decoder_output_size: number of distinct target token ids.

    Raises:
        ValueError: if the two hidden sizes differ.
    """

    def __init__(self, encoder_input_size, encoder_hidden_size, decoder_hidden_size, decoder_output_size):

        super(Seq2Seq, self).__init__()

        if encoder_hidden_size != decoder_hidden_size:
            raise ValueError("Hidden dimensions of encoder and decoder must be equal!")

        self.input_size = encoder_input_size
        self.hidden_size = encoder_hidden_size
        self.output_size = decoder_output_size

        self.encoder = Encoder(self.input_size, self.hidden_size)
        self.decoder = Decoder(decoder_hidden_size, self.output_size)

    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        '''
        Inputs: src, source token ids, time-major: (src_len, batch)
                trg, target token ids, time-major: (trg_len, batch);
                     trg[0] is only used as the first decoder input
                teacher_forcing_ratio, probability (per step) of feeding the
                     ground-truth token instead of the model's own prediction
        Outputs: o, token scores of shape (trg_len, batch, output_size);
                 o[0] is left as zeros because no prediction is made for the
                 first position
        '''
        import random  # local import: the module is not imported at file level

        _, hidden, cell = self.encoder(src)
        # Hand over both parts of the encoder state, not just the hidden state.
        decoder_state = (hidden, cell)

        trg_len, batch_size = trg.shape[0], trg.shape[1]
        o = torch.zeros(trg_len, batch_size, self.decoder.output_size, device=trg.device)

        decoder_input = trg[0]

        for t in range(1, trg_len):
            decoder_output, decoder_state = self.decoder(decoder_input, decoder_state)
            o[t] = decoder_output

            use_teacher_forcing = random.random() < teacher_forcing_ratio

            if use_teacher_forcing:
                decoder_input = trg[t]
            else:
                # Feed back the most likely token; no gradient flows through it.
                decoder_input = decoder_output.argmax(dim=1).detach()

        return o

    



In [None]:
# turn_to_indices() encodes both questions and answers with the shared
# `vocabulary`, so the embedding and output tables have to cover the ids of
# that vocabulary; the sizes of vocabulary_src / vocabulary_trg do not bound
# the ids the model will actually receive.
# Size by the largest assigned id rather than by len(): ids are not guaranteed
# to be contiguous once rare words have been discarded. One extra row is
# reserved for the '<UNK>' id, which is (re)assigned right before
# turn_to_indices is defined.
VOCAB_DIM = max(vocabulary.index2word, default=0) + 2

INPUT_DIM = VOCAB_DIM
OUTPUT_DIM = VOCAB_DIM
HID_DIM = 512

In [None]:
# Stand-alone encoder/decoder instances. The Seq2Seq model created below
# constructs its own Encoder and Decoder, so these two objects are not part
# of it.
enc = Encoder(INPUT_DIM,HID_DIM)
dec = Decoder(HID_DIM, OUTPUT_DIM)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Rebinds `model`, which until here referred to the gensim Word2Vec model.
model = Seq2Seq(INPUT_DIM, HID_DIM, HID_DIM, OUTPUT_DIM).to(device)

In [None]:
# Notebook display: print the module tree of the Seq2Seq model.
model

In [None]:
def init_weights(m):
    """Re-initialize every parameter of module ``m`` in place.

    Each parameter reachable from ``m`` (including those of its submodules)
    is overwritten with values drawn uniformly from [-0.08, 0.08].
    """
    lower, upper = -0.08, 0.08
    for weight in m.parameters():
        nn.init.uniform_(weight.data, lower, upper)
        
# Module.apply calls init_weights on the model and on each of its submodules.
model.apply(init_weights)

In [None]:
def count_parameters(model):
    """Return the total number of trainable scalar values in ``model``.

    Parameters whose ``requires_grad`` flag is off (frozen) are not counted.
    """
    total = 0
    for tensor in model.parameters():
        if tensor.requires_grad:
            total += tensor.numel()
    return total

# Report the model size with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

In [None]:
import torch.optim as optim
# Adam over all model parameters, with the optimizer's default settings.
optimizer = optim.Adam(model.parameters())

In [None]:
# Cross-entropy between the predicted token scores and the target token ids.
# NOTE(review): no ignore_index is set, so target positions padded with id 0
# count towards the loss like real tokens - confirm this is intended.
criterion = nn.CrossEntropyLoss()

In [None]:
# Notebook display: training questions, still as token lists (not yet ids).
SRC_train_dataset

In [None]:
# token_to_index() falls back to the '<UNK>' entry for every unknown token, so
# that entry has to exist before anything is encoded. If it is missing (e.g.
# because pruning rare words removed it), register it under an id that no
# other token uses and keep the reverse mapping and the size in sync.
if '<UNK>' not in vocabulary.word2index:
    unk_index = max(vocabulary.word2index.values(), default=-1) + 1
    vocabulary.word2index['<UNK>'] = unk_index
    vocabulary.index2word[unk_index] = '<UNK>'
    vocabulary.word2count.setdefault('<UNK>', 1)
    vocabulary.num_words = len(vocabulary.word2index)
def turn_to_indices(dataset, vocab=None):
    """Encode a dataset of token lists as lists of integer ids.

    Args:
        dataset: iterable of rows, each row an iterable of tokens.
        vocab: object with a ``token_to_index(token)`` method used for the
            lookup.  Defaults to the module-level ``vocabulary``.

    Returns:
        list of lists of ids, with the same nesting and order as ``dataset``.
    """
    if vocab is None:
        # Resolved at call time so the current global vocabulary is used.
        vocab = vocabulary

    return [[vocab.token_to_index(token) for token in row] for row in dataset]

In [None]:
# Encode all four tokenized splits as lists of integer ids.
questions_list_train = turn_to_indices(SRC_train_dataset)
questions_list_test = turn_to_indices(SRC_test_dataset)
answers_list_train = turn_to_indices(TRG_train_dataset)
answers_list_test = turn_to_indices(TRG_test_dataset)

In [None]:
def get_median(list_of_lists):
    """Return the median length of the inner lists.

    Args:
        list_of_lists: iterable of sized sequences.

    Returns:
        The middle length when the number of inner lists is odd (an int), or
        the mean of the two middle lengths when it is even (a float).

    Raises:
        ValueError: if ``list_of_lists`` is empty, since the median is then
            undefined.
    """
    from statistics import median

    lengths = [len(inner_list) for inner_list in list_of_lists]
    if not lengths:
        raise ValueError("get_median() needs at least one inner list")

    return median(lengths)


In [None]:
# Notebook display: median number of tokens per training question.
get_median(questions_list_train)

In [None]:
# Notebook display: median number of tokens per training answer.
get_median(answers_list_train)

In [None]:
def trim_inner_lists(list_of_lists, max_length):
    """Cut every inner list down to at most ``max_length`` leading items.

    Inner lists that are already short enough are copied unchanged; the
    input is not modified.

    Returns:
        A new list holding the truncated copies, in the original order.
    """
    shortened = []
    for sequence in list_of_lists:
        shortened.append(sequence[:max_length])
    return shortened

In [None]:
# Fixed sequence lengths (in tokens) used for trimming and padding below.
# NOTE(review): presumably chosen from the median lengths computed above -
# confirm.
max_length_question = 13
max_length_answer = 4

In [None]:
# Truncate the training sequences to the fixed lengths (test lists are not
# trimmed here).
questions_list_train = trim_inner_lists(questions_list_train, max_length_question)
answers_list_train = trim_inner_lists(answers_list_train, max_length_answer)

In [None]:
# Right-pad every training sequence with id 0 up to its fixed length so that
# all sequences of one kind have the same size. No dedicated padding token is
# registered in the vocabulary; 0 is used as the filler directly.
padded_questions_list_train = [seq + [0] * (max_length_question - len(seq)) for seq in questions_list_train]
padded_answers_list_train = [seq + [0] * (max_length_answer - len(seq)) for seq in answers_list_train]

In [None]:
# Pair each question with its answer. train_data uses the trimmed and padded
# lists; test_data uses the encoded lists as they are, so its sequences vary
# in length.
train_data = list(zip(padded_questions_list_train, padded_answers_list_train))
test_data = list(zip(questions_list_test, answers_list_test))

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torchtext.data.utils import get_tokenizer

import torch
from torch.utils.data import Dataset

class MyDataset(Dataset):
    """Map-style dataset over a sequence of ``(sequence, label)`` pairs.

    Items are converted to tensors lazily, one pair per lookup.
    """

    def __init__(self, data):
        # Indexable collection of (sequence, label) pairs.
        self.data = data

    def __len__(self):
        """Number of pairs in the dataset."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the pair stored at ``index`` as two tensors."""
        features, target = self.data[index]
        feature_tensor = torch.tensor(features)
        target_tensor = torch.tensor(target)
        return feature_tensor, target_tensor

In [None]:
# Wrap the padded training pairs and serve them in shuffled batches of 32.
# With default collation each batch is a (questions, answers) pair of tensors
# whose first dimension is the batch.
train_dataset = MyDataset(train_data)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [None]:
model.train()
num_epochs = 3
log_interval = 100  # print the running loss every this many batches

for epoch in range(num_epochs):
    for batch_idx, (src, trg) in enumerate(train_dataloader):
        # The DataLoader yields (batch, seq_len) tensors, while
        # Seq2Seq.forward treats dim 0 as the time axis (it reads trg[0] and
        # loops over trg.shape[0]), so switch to (seq_len, batch) first.
        src = src.t().to(device)
        trg = trg.t().to(device)

        optimizer.zero_grad()

        output = model(src, trg)

        # The decoder loop starts at step 1, so position 0 carries no
        # prediction and is left out of the loss. CrossEntropyLoss expects
        # (N, classes) scores against (N,) class ids, hence the flattening of
        # the time and batch axes.
        vocab_size = output.shape[-1]
        loss = criterion(output[1:].reshape(-1, vocab_size), trg[1:].reshape(-1))

        loss.backward()
        optimizer.step()

        if batch_idx % log_interval == 0:
            print(f"Epoch: {epoch+1}, Batch: {batch_idx+1}/{len(train_dataloader)}, Loss: {loss.item()}")

In [None]:
# Debug cell: walk through the loader and print the raw batches. No forward
# or backward pass is run here; model.train() and num_epochs only repeat the
# values set by the training cell.
model.train()
num_epochs = 3

for epoch in range(num_epochs):
    for batch_idx, (src, trg) in enumerate(train_dataloader):
        print(src)
        print(trg)