# Introduction

<div class="alert alert-warning">
<font color=black>

**What?** Building an LSTM network

</font>
</div>

# Import python modules

In [67]:
from torchtext.data import Field
from torchtext.data import TabularDataset
import torch
from torchtext.data import Iterator, BucketIterator
from torchtext import vocab
import torch.nn as nn

# Tokenization

In [2]:
def tokenizer(words):
    """Split *words* on runs of whitespace and return the list of tokens.

    Written as a named function rather than a lambda bound to a name, so the
    callable shows up as ``tokenizer`` in tracebacks and can be pickled.
    """
    return words.split()

In [3]:
# Quick sanity check: the sentence should come back as a list of its whitespace-separated words
tokenizer("This is a test for tokenizer")

['This', 'is', 'a', 'test', 'for', 'tokenizer']

# Creating fields

In [None]:
"""
The Field class lets us perform common text processing tasks and holds the vocabulary of the data at hand. 
"""

In [None]:
# Field for the review text: sequential input, tokenized with `tokenizer` and lower-cased
Review = Field(sequential=True, tokenize=tokenizer, lower=True)

In [5]:
# We then define the field for labels.
# sequential=False: each label is a single value rather than a token sequence.
# use_vocab=False: no vocabulary is built for this field
# (presumably the label columns in the CSV are already numeric — confirm).
Label = Field(sequential=False, use_vocab=False)

In [6]:
# Start-of-sequence and end-of-sequence markers can be added around every input string
SequenceField = Field(
    tokenize=tokenizer,
    init_token='<sos>',
    eos_token='<eos>',
    lower=True,
)

In [7]:
# fix_length gives every sequence the same fixed length (50 here)
SequenceField = Field(
    tokenize=tokenizer,
    init_token='<sos>',
    eos_token='<eos>',
    lower=True,
    fix_length=50,
)

In [8]:
# unk_token names the placeholder used for an unknown token
SequenceField = Field(
    tokenize=tokenizer,
    init_token='<sos>',
    eos_token='<eos>',
    unk_token='<unk>',
)

In [9]:
# batch_first=True puts the batch dimension first in the produced tensors
SequenceField = Field(
    tokenize=tokenizer,
    init_token='<sos>',
    eos_token='<eos>',
    unk_token='<unk>',
    batch_first=True,
)

# Load the news classification dataset

In [10]:
"""
This is just a news classification dataset
"""

In [15]:
# (column name, Field) pairs, one per CSV column and in column order.
# "id" is paired with None instead of a Field; the four category columns all share Label.
train_datafields = [
    ("id", None),
    ("content", Review),
    *((column, Label) for column in ("Business", "SciTech", "Sports", "World")),
]

In [23]:
# Column layout for the test file: only the text column gets a Field, "id" is paired with None
test_datafields = [
    ("id", None),
    ("content", Review),
]

In [53]:
# The original code used an old keyword, `valid`; the keyword to use here is `validation`.
# Both CSV files are read from the 'reviewAnalysis' directory, skipping their header row,
# and parsed with the column layout declared in train_datafields.
train, valid = TabularDataset.splits(path='reviewAnalysis', 
                                    train='train.csv',
                                    validation='valid.csv',
                                    format='csv',
                                    skip_header=True,
                                    fields=train_datafields)

In [54]:
# The test split is loaded separately because it uses its own column layout (test_datafields)
test = TabularDataset(path="reviewAnalysis/test.csv",
                        format='csv',
                        skip_header=True,
                        fields=test_datafields)

In [None]:
"""
Finally, we call the build_vocab() method on the Field object to build the vocabulary from the words that 
appear at least twice in the training set. A word that is not in the vocabulary is assigned the unknown 
token in the validation and test sets.
"""

In [50]:
# Build the Review vocabulary from the training split only, with a minimum token frequency of 2
Review.build_vocab(train, min_freq=2)

# Developing iterators

In [None]:
"""
Convert the dataset into iterators so that we have the appropriate batches ready to iterate in each epoch.

We used iterators to build training, testing, and validation batches and moved the datasets into an 
appropriate CPU or GPU device. The Iterators make it super elegant to do these tasks. We used a specialized 
iterator class called BucketIterator, which groups the input sequences into sequences of similar length and
shuffles them automatically. We defined the batch size and found the device that was available on the machine.
"""

In [56]:
# Number of examples per batch; passed as batch_size when the iterators are created
BATCH_SIZE = 128

In [65]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cpu


In [66]:
# Build one iterator per split, using the length of the text as the sort key.
# The key must read the attribute named after the text column declared in the
# datafields ("content"). The previous `x.comment_text` does not exist on these
# examples and raises AttributeError once an iterator is consumed.
train_iter, valid_iter, test_iter = BucketIterator.splits(
    (train, valid, test),
    batch_size=BATCH_SIZE,
    device=device,
    sort_key=lambda x: len(x.content),
    sort_within_batch=False,
)

# Exploring word embeddings

In [32]:
"""
TorchText has a vocab module that deals with embeddings. We can download pretrained embeddings by mentioning 
the name of the embedding that we need in this recipe. We used a pretrained GloVe (GloVe is a word vector 
technique) model that is trained on 6 billion tokens with 50-dimensional embedding vectors—glove.6B.50d.
"""

In [69]:
# Load the GloVe vectors from the local file, using ./vec/glove_embedding/ as the cache directory
vec = vocab.Vectors(
    name='glove.6B.50d.txt',
    cache='./vec/glove_embedding/',
)

100%|█████████▉| 399999/400000 [00:11<00:00, 33557.58it/s]


In [70]:
# Rebuild the vocabulary with the same settings, this time passing the loaded GloVe vectors
Review.build_vocab(train, min_freq=2, vectors=vec)

# Building an LSTM network

In [None]:
"""
Long short-term memory (LSTM) networks are a type of recurrent neural network that has internal gates that help 
in better information persistence. These gates are tiny neural networks that control when information needs to 
be saved and when it can be erased or forgotten. 

RNNs suffer from vanishing and exploding gradients, making it 
difficult to learn long-term dependencies. LSTMs are more resistant to exploding and vanishing gradients, although 
both problems are still mathematically possible.
"""

In [76]:
class LSTMClassifier(nn.Module):
    """Single-layer LSTM classifier: embedding -> LSTM -> dropout -> linear.

    Args:
        embedding_dim: size of each token embedding vector.
        hidden_dim: size of the LSTM hidden state.
        output_dim: number of values produced by the final linear layer.
        dropout: dropout probability applied to the final hidden state.
    """

    def __init__(self, embedding_dim, hidden_dim, output_dim, dropout):
        super().__init__()
        # The embedding table has one row per entry of the Review vocabulary.
        vocab_size = len(Review.vocab)
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Project the LSTM's final hidden state for the token indices *x*.

        The result keeps the leading axis of the LSTM hidden state
        (size 1 for this single-layer, unidirectional LSTM).
        """
        embedded = self.embedding(x)
        # Only the final hidden state is used; per-step outputs and the cell state are discarded.
        _, (final_hidden, _) = self.rnn(embedded)
        return self.fc(self.dropout(final_hidden))

In [77]:
# Hyperparameters passed to the classifier constructor
# NOTE(review): the GloVe file loaded earlier is glove.6B.50d (50-dimensional); if those vectors
# are meant to initialise the embedding layer, EMBEDDING_DIM probably needs to be 50 — confirm.
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
DROPOUT = 0.5

In [78]:
# Instantiate the single-layer LSTM classifier with the hyperparameters defined above
model = LSTMClassifier(EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, DROPOUT)

# Multilayer LSTMs

In [82]:
class MultiLSTMClassifier(nn.Module):
    """Stacked (multilayer) LSTM classifier: embedding -> LSTM x num_layers -> dropout -> linear.

    Args:
        embedding_dim: size of each token embedding vector.
        hidden_dim: size of the LSTM hidden state.
        output_dim: number of values produced by the final linear layer.
        dropout: dropout probability applied to the final hidden states.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self, embedding_dim, hidden_dim, output_dim, dropout, num_layers):
        super().__init__()
        # The embedding table has one row per entry of the Review vocabulary.
        vocab_size = len(Review.vocab)
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # num_layers is what distinguishes this model from the single-layer one
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Project the last layer's final hidden state for the token indices *x*."""
        embedded = self.embedding(x)
        _, (final_hidden, _) = self.rnn(embedded)
        # Dropout is applied to every layer's final state before the last one is selected.
        dropped = self.dropout(final_hidden)
        last_layer = dropped[-1]
        return self.fc(last_layer)

In [83]:
# Hyperparameters for the multilayer model.
# The output size is named OUTPUT_DIM (it was misspelled OUTPUT here) so that it matches
# the name the model-construction cells actually read.
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
DROPOUT = 0.5
NUM_LAYERS = 2

In [84]:
# Instantiate the multilayer classifier; NUM_LAYERS is passed as its num_layers argument
model = MultiLSTMClassifier(EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, DROPOUT, NUM_LAYERS)

# Bidirectional LSTMs

In [None]:
"""
This recipe builds on the multilayer LSTM recipe. In a normal LSTM, the LSTM reads the input sequence from first 
to last; however, in a bidirectional LSTM, there is a second LSTM that reads the sequence from last to first—that 
is, a backward RNN. This type of LSTM improves the model performance when the prediction at the current time step 
depends on inputs further on in the sequence.
"""

In [85]:
class BiLSTMClassifier(nn.Module):
    """Bidirectional multilayer LSTM classifier.

    Pipeline: embedding -> bidirectional LSTM -> concatenate the last two
    final hidden states -> dropout -> linear.

    Args:
        embedding_dim: size of each token embedding vector.
        hidden_dim: size of the LSTM hidden state (per direction).
        output_dim: number of values produced by the final linear layer.
        dropout: dropout probability applied to the concatenated hidden state.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self, embedding_dim, hidden_dim, output_dim, dropout, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(len(Review.vocab), embedding_dim)
        # That is the new bit -> bidirectional=True
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, bidirectional=True)
        # The classifier sees both directions concatenated, hence 2 * hidden_dim inputs.
        self.fc = nn.Linear(2 * hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return one row of ``output_dim`` values per batch element for token indices *x*."""
        embedded = self.embedding(x)
        _, (hidden, _) = self.rnn(embedded)
        # The last two entries of the final hidden state are the top layer's
        # forward and backward states; join them along the feature axis.
        combined = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        combined = self.dropout(combined)
        # The concatenation is already (batch, 2 * hidden_dim). The former
        # squeeze(0) was a no-op for batch > 1 but dropped the batch axis
        # for a batch of one, so it is not applied.
        return self.fc(combined)

In [86]:
# Instantiate the bidirectional classifier with the same hyperparameters as the multilayer model
model = BiLSTMClassifier(EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, DROPOUT, NUM_LAYERS)

# References

<div class="alert alert-warning">
<font color=black>

- Jibin Mathew, PyTorch Artificial Intelligence Fundamentals

</font>
</div>