In [None]:
import nltk
from nltk.tokenize import RegexpTokenizer, sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchsummary import summary
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader

from torch.utils.tensorboard import SummaryWriter

from sklearn.decomposition import PCA
import numpy as np
import plotly.graph_objs as go
from datetime import datetime

In [None]:
# One TensorBoard log directory per run, suffixed with the start time
# formatted as day-month-year-hour-minute-second.
run_timestamp = datetime.now().strftime("%d-%m-%Y-%H-%M-%S")
writer = SummaryWriter('runs/encoderdecoder_model_log_' + run_timestamp)

In [None]:
# Download the 'punkt' NLTK resource; sent_tokenize / word_tokenize are called in later cells.
# NOTE(review): some newer NLTK releases look for 'punkt_tab' instead — confirm against the installed version.
nltk.download('punkt')

In [None]:
# Download the 'wordnet' NLTK resource; a WordNetLemmatizer is instantiated in the next cell.
nltk.download('wordnet')

In [None]:
# Shared lemmatizer instance, used both when building the vocabulary and inside process_words.
wordnet_lemmatizer = WordNetLemmatizer()

In [None]:
# hyperparameters
# Size of each word-embedding vector; passed as embedding_dim to both Encoder and Decoder.
EMBEDDING_DIM = 300

In [None]:
# Create the vocabulary: every distinct lemma of a purely alphabetic token in text.txt.
with open('text.txt', 'r') as f:
    text = f.read().lower()

# The file is only needed for the read above; tokenization works on the in-memory string.
sentences = sent_tokenize(text)

vocabulary = {
    wordnet_lemmatizer.lemmatize(token)
    for sent in sentences
    for token in nltk.word_tokenize(sent)
    if token.isalpha()  # drop numbers, punctuation and mixed tokens
}

In [None]:
# Sanity check: show the first three sentences produced by sent_tokenize.
print(sentences[:3])

In [None]:
# Peek at three vocabulary entries; vocabulary is a set, so which three are shown is arbitrary.
print(list(vocabulary)[:3])

In [None]:
# Map every vocabulary word to an integer id (used as embedding index and as class label).
# The vocabulary is a set of strings, and string hashing is randomized per interpreter
# run, so enumerating the set directly would hand out different ids after every kernel
# restart and make a saved checkpoint unusable. Sorting makes the mapping reproducible.
word_to_ix = {word: i for i, word in enumerate(sorted(vocabulary))}

In [None]:
def process_words(words):
    """Return the lemmas of the purely alphabetic tokens in ``words``.

    Tokens for which ``str.isalpha()`` is false (numbers, punctuation, mixed
    tokens, empty strings) are dropped. The remaining tokens are lemmatized
    with the module-level ``wordnet_lemmatizer`` and returned as a new list
    in their original order.
    """
    return [wordnet_lemmatizer.lemmatize(token) for token in words if token.isalpha()]

In [None]:
# Create the dataset for guessing the next word from the 5 previous words.
# Each example is (list of `number_of_previous_words` context words, target word),
# built with a sliding window that never crosses a sentence boundary.
number_of_previous_words = 5
dataset = []
for sentence in sentences:
    words_processed = process_words(nltk.word_tokenize(sentence))
    # A window starting at `start` needs its target at index start + n to exist, so
    # valid starts are 0 .. len - n - 1. This includes the window whose target is the
    # last word of the sentence; sentences with at most n words produce no examples
    # (range() of a non-positive number is empty).
    for start in range(len(words_processed) - number_of_previous_words):
        end = start + number_of_previous_words
        pretext = words_processed[start:end]
        target = words_processed[end]
        dataset.append((pretext, target))

In [None]:
# Sanity check: show the first three (context words, target word) pairs.
print(dataset[:3])

In [None]:
class MyDataset(Dataset):
    """Map-style dataset over ``(context_words, target_word)`` pairs.

    Words are translated to integer ids through the module-level ``word_to_ix``
    mapping each time an item is fetched.
    """

    def __init__(self, data):
        # Keep a reference to the raw pairs and cache their count for __len__.
        self.data = data
        self.length = len(data)

    def __len__(self):
        """Number of pairs, as counted when the dataset was constructed."""
        return self.length

    def __getitem__(self, index):
        """Return ``(context_ids, target_id)`` as int64 tensors for the pair at ``index``."""
        pair = self.data[index]
        context_ids = torch.tensor([word_to_ix[word] for word in pair[0]], dtype=torch.int64)
        target_id = torch.tensor(word_to_ix[pair[1]], dtype=torch.int64)
        return (context_ids, target_id)

In [None]:
class Encoder(nn.Module):
    """Embed a batch of word-id sequences and compress it with three chained LSTM stacks.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each embedding vector (input size of the first LSTM).
        num_layers1, num_layers2, num_layers3: layer count of each LSTM stack.
        hidden_size1, hidden_size2, hidden_size3: hidden size of each LSTM stack.
        dropout: dropout probability between the layers of a multi-layer stack.
    """

    def __init__(self, vocab_size, embedding_dim, num_layers1, num_layers2, num_layers3, hidden_size1, hidden_size2, hidden_size3, dropout):
        super(Encoder, self).__init__()
        self.num_layers1 = num_layers1
        self.num_layers2 = num_layers2
        self.num_layers3 = num_layers3
        self.hidden_size1 = hidden_size1
        self.hidden_size2 = hidden_size2
        self.hidden_size3 = hidden_size3
        self.dropout = dropout

        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm1 = nn.LSTM(input_size=embedding_dim, num_layers=self.num_layers1, hidden_size=self.hidden_size1, dropout=self._stack_dropout(self.num_layers1))
        self.lstm2 = nn.LSTM(input_size=self.hidden_size1, num_layers=self.num_layers2, hidden_size=self.hidden_size2, dropout=self._stack_dropout(self.num_layers2))
        self.lstm3 = nn.LSTM(input_size=self.hidden_size2, num_layers=self.num_layers3, hidden_size=self.hidden_size3, dropout=self._stack_dropout(self.num_layers3))

    def _stack_dropout(self, num_layers):
        """Dropout value to hand to nn.LSTM for a stack with ``num_layers`` layers.

        nn.LSTM applies dropout only between stacked layers, so for a single-layer
        stack a non-zero value has no effect and just triggers a UserWarning.
        """
        return self.dropout if num_layers > 1 else 0.0

    def forward(self, inputs):
        """Encode ``inputs`` and return the last stack's final ``(hidden, cell)`` state.

        Args:
            inputs: integer tensor of word ids shaped (batch, seq_len).

        Returns:
            Tuple ``(hidden, cell)``, each shaped (num_layers3, batch, hidden_size3).
        """
        # nn.LSTM defaults to (seq_len, batch, feature). Use a real transpose here:
        # a view/reshape to (seq_len, batch) would keep the flat memory order and
        # interleave tokens belonging to different batch rows.
        inputs = inputs.transpose(0, 1)
        embeds = self.embeddings(inputs)
        out, (hidden, cell) = self.lstm1(embeds)
        # NOTE(review): each later stack is fed the previous stack's final hidden state
        # (h_n), not its per-step output, so the layer axis of h_n is consumed as the
        # time axis. Kept as in the original design — confirm this is intended.
        out, (hidden, cell) = self.lstm2(hidden)
        out, (hidden, cell) = self.lstm3(hidden)
        return hidden, cell

In [None]:
class Decoder(nn.Module):
    """Single-step decoder: one input word id per batch row -> log-probabilities over the vocabulary.

    Args:
        vocab_size: number of rows in the embedding table and size of the output layer.
        embedding_dim: size of each embedding vector (input size of the first LSTM).
        num_layers1, num_layers2: layer count of each LSTM stack.
        hidden_size1, hidden_size2: hidden size of each LSTM stack; the first stack is
            initialised from the encoder state, so its shape must match that state.
        dropout: dropout probability between the layers of a multi-layer stack.
    """

    def __init__(self, vocab_size, embedding_dim, num_layers1, num_layers2, hidden_size1, hidden_size2, dropout):
        super(Decoder, self).__init__()
        self.num_layers1 = num_layers1
        self.num_layers2 = num_layers2
        self.hidden_size1 = hidden_size1
        self.hidden_size2 = hidden_size2
        self.dropout = dropout

        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm1 = nn.LSTM(input_size=embedding_dim, num_layers=self.num_layers1, hidden_size=self.hidden_size1, dropout=self._stack_dropout(self.num_layers1))
        self.lstm2 = nn.LSTM(input_size=self.hidden_size1, num_layers=self.num_layers2, hidden_size=self.hidden_size2, dropout=self._stack_dropout(self.num_layers2))
        self.linear = nn.Linear(self.hidden_size2, vocab_size) # output size is the vocabulary size

    def _stack_dropout(self, num_layers):
        """Dropout value to hand to nn.LSTM for a stack with ``num_layers`` layers.

        nn.LSTM applies dropout only between stacked layers, so for a single-layer
        stack a non-zero value has no effect and just triggers a UserWarning.
        """
        return self.dropout if num_layers > 1 else 0.0

    def forward(self, inputs, encoder_hidden_state, encoder_cell_state):
        """Run one decoding step.

        Args:
            inputs: integer tensor with one word id per batch row (``inputs.shape[0]`` elements).
            encoder_hidden_state: initial hidden state for the first LSTM stack.
            encoder_cell_state: initial cell state for the first LSTM stack.

        Returns:
            Tensor shaped (batch, vocab_size) of log-probabilities; each row
            exponentiates to a distribution that sums to 1.
        """
        # Present the ids as a length-1 sequence: (seq_len=1, batch).
        # reshape (rather than view) also accepts non-contiguous slices.
        inputs = inputs.reshape(1, inputs.shape[0])
        embeds = self.embeddings(inputs)
        out, (hidden, cell) = self.lstm1(embeds, (encoder_hidden_state, encoder_cell_state))
        # The second stack consumes the first stack's final hidden state as its input sequence.
        out, (hidden, cell) = self.lstm2(hidden)
        out = self.linear(out)
        # Normalise over the vocabulary axis (the last one). dim=1 would be the batch
        # axis of the (seq_len, batch, vocab) tensor and would not give per-example
        # distributions for NLLLoss.
        log_probs = F.log_softmax(out, dim=-1)
        return log_probs[0]

In [None]:
class EncoderDecoder(nn.Module):
    """Next-word model: encode the context words, then decode one step into vocabulary log-probabilities."""

    def __init__(self, encoder, decoder):
        super(EncoderDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, inputs):
        """Predict the word following each context window.

        Args:
            inputs: sequence whose first element is the context word-id tensor shaped
                (batch, seq_len). A second element (the target ids) is still accepted
                so existing callers keep working, but it is not read.

        Returns:
            Whatever the decoder returns for one step (log-probabilities per batch row).
        """
        input_words = inputs[0]

        encoder_hidden, encoder_cell = self.encoder(input_words)
        # Seed the decoder with the last context word, never with the target: feeding
        # the label as input lets the model copy the answer during training and is
        # impossible at inference time, when the target is unknown.
        decoder_input = input_words[:, -1].contiguous()
        decoder_out = self.decoder(decoder_input, encoder_hidden, encoder_cell)
        return decoder_out

In [None]:
# Encoder with LSTM stacks of 1, 2 and 1 layers and hidden sizes 1024 -> 512 -> 256.
encoder = Encoder(
    vocab_size=len(vocabulary),
    embedding_dim=EMBEDDING_DIM,
    num_layers1=1,
    num_layers2=2,
    num_layers3=1,
    hidden_size1=1024,
    hidden_size2=512,
    hidden_size3=256,
    dropout=0.2,
)
print(encoder)

In [None]:
# Decoder whose first stack (1 layer, hidden size 256) matches the encoder's final
# state, followed by a 2-layer stack of hidden size 512.
decoder = Decoder(
    vocab_size=len(vocabulary),
    embedding_dim=EMBEDDING_DIM,
    num_layers1=1,
    num_layers2=2,
    hidden_size1=256,
    hidden_size2=512,
    dropout=0.2,
)
print(decoder)

In [None]:
# Wrap both halves in one module so a single optimizer and state_dict cover all parameters.
encoder_decoder = EncoderDecoder(encoder=encoder, decoder=decoder)
print(encoder_decoder)

In [None]:
# hyperparameters
# NLLLoss is paired with the log-probabilities returned by the decoder; plain SGD
# updates every parameter of the combined encoder/decoder module.
losses = []  # not referenced in the cells visible here
loss_function = nn.NLLLoss()
optimizer = optim.SGD(params=encoder_decoder.parameters(), lr=1e-3)

In [None]:
# prepare data
# Wrap the (context, target) pairs and serve them in shuffled mini-batches.
batch_size = 4
train_data = MyDataset(data=dataset)
train_dataloader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)

In [None]:
# model training
NUM_EPOCHS = 1000  # full passes over train_dataloader
LOG_EVERY = 100    # number of batches averaged into one TensorBoard point

# Make sure dropout is active even if the model was switched to eval mode earlier in this kernel.
encoder_decoder.train()

# running_loss accumulates across epoch boundaries on purpose: it is reset only when a point is logged.
running_loss = 0.0
for epoch in range(NUM_EPOCHS):
    total_loss = 0
    for i, (input_tensor, target_tensor) in enumerate(train_dataloader):
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        log_probs = encoder_decoder((input_tensor, target_tensor))
        loss = loss_function(log_probs, target_tensor)

        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        total_loss += batch_loss
        running_loss += batch_loss
        if i % LOG_EVERY == LOG_EVERY - 1:
            # ...log the mean loss of the last LOG_EVERY batches at the global step index
            writer.add_scalar('training loss',
                              running_loss / LOG_EVERY,
                              epoch * len(train_dataloader) + i)
            running_loss = 0.0
    # total_loss is the sum (not the mean) of the batch losses of this epoch.
    print('Epoch: ', str(epoch+1), '/' + str(NUM_EPOCHS) + ' ', ', Loss: ', str(total_loss))

# Push any buffered scalars to disk; the writer stays open so the cell can be re-run.
writer.flush()

In [None]:
# Persist only the learned parameters (state_dict), not the module objects. Loading them back
# requires rebuilding the model with the same constructor arguments, and the word ids the
# embeddings were trained on come from word_to_ix, so the same mapping is needed as well.
torch.save(encoder_decoder.state_dict(), 'encoderdecoder_model.pth')