In [None]:
# Prerequisite
! pip install pytorch_lightning
! pip install nltk
!python -m nltk.downloader punkt

In [1]:
import pytorch_lightning as pl
import torch
import math
from torch import nn
from torch import optim
import pytorch_lightning.loggers as pl_loggers

from nltk.tokenize import word_tokenize, sent_tokenize
import random
import operator
import re

In [2]:
class Prep:
    """Load the corpus splits, tokenize them and count word frequencies.

    Attributes:
        test, train, valid: raw text read from ./data/wiki.{test,train,valid}.txt.
        word_freqs: dict mapping token -> count. It is pre-seeded with the
            "<oov>" token (count 1) and grows on every building_vocab() call.
    """
    def __init__(self):
        # Decode explicitly as UTF-8 so loading does not depend on the
        # platform's default locale encoding.
        with open("./data/wiki.test.txt", encoding="utf-8") as f:
            self.test = f.read()
        with open("./data/wiki.train.txt", encoding="utf-8") as f:
            self.train = f.read()
        with open("./data/wiki.valid.txt", encoding="utf-8") as f:
            self.valid = f.read()
        self.word_freqs = {"<oov>": 1}

    def tokenize(self, corpus):
        """Turn raw text into one flat list of lowercase tokens.

        The text is split into sentences, each sentence into words, and the
        sentences are shuffled with the global `random` state (seed `random`
        beforehand for a reproducible order). Every sentence is lowercased and
        wrapped in "<s>" ... "</s>"; when its last token is not alphanumeric
        (typically the closing punctuation) that token is dropped.

        Args:
            corpus: raw text (str).

        Returns:
            list[str]: all sentences concatenated, boundary markers included.
        """
        sentences = [word_tokenize(t) for t in sent_tokenize(corpus)]
        random.shuffle(sentences)

        flat_tokens = []
        for sentence in sentences:
            # A sentence without tokens has no last token to inspect; skip it
            # instead of failing with IndexError.
            if not sentence:
                continue
            words = [w.lower() for w in sentence]
            if not words[-1].isalnum():
                words = words[:-1]
            flat_tokens.append("<s>")
            flat_tokens.extend(words)
            flat_tokens.append("</s>")
        return flat_tokens

    def building_vocab(self, corpus):
        """Add the token counts of `corpus` to `self.word_freqs` (in place).

        Args:
            corpus: iterable of tokens, e.g. the list returned by tokenize().
        """
        for w in corpus:
            self.word_freqs[w] = self.word_freqs.get(w, 0) + 1

In [3]:
class Vocab(object):
    """Converts word tokens to indices and back, and serves fixed-size windows.

    The instance behaves like a map-style dataset: len() is the number of
    complete windows and indexing returns an (input, target) tensor pair in
    which the target is the input shifted by one token.

    Args:
        freqs: mapping (or iterable) of tokens; iteration order defines the
            token indices.
        corpus: flat sequence of tokens to encode.
        window_size: number of tokens per window; each window yields
            window_size - 1 input tokens and window_size - 1 target tokens.

    Attributes:
        indix2token: tuple of tokens, position == index (spelling kept as-is
            because it is a public attribute name).
        token2index: dict token -> index.
        encoded_list: list of index windows, each exactly window_size long.
        data, target: per-window inputs (window[:-1]) and targets (window[1:]).
    """

    def __init__(self, freqs, corpus, window_size):
        super().__init__()
        self.indix2token = tuple(freqs)
        self.token2index = {tok: idx for idx, tok in enumerate(self.indix2token)}
        self.corpus = corpus
        self.window_size = window_size
        self.encoded_list = []
        self.data, self.target = self.encoding()

    def __len__(self):
        """Number of complete windows."""
        return len(self.encoded_list)

    def __getitem__(self, key):
        """Return (input, target) index tensors for window `key`."""
        return torch.tensor(self.data[key]), torch.tensor(self.target[key])

    def encoding(self):
        """Encode the corpus and cut it into non-overlapping windows.

        Tokens missing from the vocabulary are mapped to the "<oov>" index;
        if the vocabulary has no "<oov>" entry a KeyError is raised for them.
        A trailing chunk shorter than window_size is discarded.

        Returns:
            (data, target): two lists of index lists, see class docstring.
        """
        oov_index = self.token2index.get("<oov>")

        def retrieve(token):
            index = self.token2index.get(token, oov_index)
            if index is None:  # index 0 is valid, so test against None explicitly
                raise KeyError(token)
            return index

        ids = [retrieve(token) for token in self.corpus]
        size = self.window_size
        windows = (ids[start:start + size] for start in range(0, len(ids), size))
        self.encoded_list = [w for w in windows if len(w) == size]
        data = [w[:-1] for w in self.encoded_list]
        target = [w[1:] for w in self.encoded_list]
        return data, target

    def decoding(self):
        """Map the encoded windows back to tokens.

        Returns:
            list[list[token]]: same shape as `encoded_list`.
        """
        return [[self.indix2token[idx] for idx in window] for window in self.encoded_list]

In [4]:
# Read the raw train/valid/test text.
p = Prep()

# Tokenize the splits in train, valid, test order, then fold each split's
# token counts into the shared frequency table in that same order.
train_corpus, valid_corpus, test_corpus = (
    p.tokenize(raw_text) for raw_text in (p.train, p.valid, p.test)
)
for split_tokens in (train_corpus, valid_corpus, test_corpus):
    p.building_vocab(split_tokens)

word_freqs = p.word_freqs

# Wrap every split as a dataset of 31-token windows over the shared vocabulary.
train, valid, test = (
    Vocab(word_freqs, split_tokens, 31)
    for split_tokens in (train_corpus, valid_corpus, test_corpus)
)

In [5]:
class TextDateModule(pl.LightningDataModule):
    """PyTorch Lightning data module serving the train/valid/test datasets.

    Args:
        train_corpus, valid_corpus, test_corpus: datasets handed to
            torch.utils.data.DataLoader for the respective stage.
        batch_size: sequences per batch (default 20).
        num_workers: DataLoader worker processes (default 16); lower it on
            machines with fewer cores, or use 0 to load in the main process.
    """
    def __init__(self, train_corpus, valid_corpus, test_corpus, batch_size=20, num_workers=16):
        super().__init__()
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.train = train_corpus
        self.valid = valid_corpus
        self.test = test_corpus

    def _loader(self, dataset, shuffle):
        """Build a DataLoader; drop_last=True keeps every batch exactly batch_size long."""
        return torch.utils.data.DataLoader(
            dataset,
            self.batch_size,
            num_workers=self.num_workers,
            shuffle=shuffle,
            drop_last=True,
        )

    def train_dataloader(self):
        """Shuffled loader over the training dataset."""
        return self._loader(self.train, shuffle=True)

    def val_dataloader(self):
        """Sequential loader over the validation dataset."""
        return self._loader(self.valid, shuffle=False)

    def test_dataloader(self):
        """Sequential loader over the test dataset."""
        return self._loader(self.test, shuffle=False)

In [7]:
class TextLightningModule(pl.LightningModule):
    """RNN language model: embedding -> dropout -> 2-layer nn.RNN -> linear.

    Args:
        vocab_size: number of distinct tokens; sizes both the embedding table
            and the output projection.
    """
    def __init__(self, vocab_size):
        super().__init__()
        self.num_layers = 2
        self.hidden_size = 100 #200
        self.embedding_size = 100
        self.vocab_size = vocab_size

        # embedding
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_size)
        nn.init.uniform_(self.embedding.weight, -0.1, 0.1)
        # layers
        self.rnn = nn.RNN(self.embedding_size, self.hidden_size, self.num_layers, batch_first=True)
        self.out_fc = nn.Linear(self.hidden_size, vocab_size)
        # loss function
        self.loss = nn.CrossEntropyLoss()

        self.dropout = nn.Dropout(0.25)

    def forward(self, data, hidden):
        """Score the next token at every position.

        Args:
            data: token indices, batch first (the RNN is built with batch_first=True).
            hidden: initial hidden state passed to nn.RNN.

        Returns:
            (logits, hidden): logits flattened to (-1, vocab_size) and the
            final hidden state returned by the RNN.
        """
        embedding = self.dropout(self.embedding(data))
        output, hidden = self.rnn(embedding, hidden)
        output = self.out_fc(output)
        return output.view(-1, self.vocab_size), hidden

    def configure_optimizers(self):
        """Plain SGD with learning rate 0.5."""
        return optim.SGD(self.parameters(), lr=5e-1)

    def _shared_step(self, batch, stage):
        """Compute and log loss/perplexity for one batch of the given stage."""
        x, y = batch
        y = y.view(-1)

        # Size the state from the batch actually received rather than assuming 20.
        hidden = self.init_hidden(x.size(0))
        output, _ = self(x, hidden)
        loss = self.loss(output, y)
        # torch.exp stays on-device (no .item() sync) and yields inf instead of
        # raising OverflowError if the loss diverges.
        # NOTE: with on_epoch=True the logged value is the epoch reduction of
        # per-batch perplexities (Lightning's default reduction is the mean),
        # which is not the same as exp(mean loss).
        perplexity = torch.exp(loss.detach())

        self.log(f"loss/{stage}", loss, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        self.log(f"perplexity/{stage}", perplexity, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        return {"loss": loss}

    def training_step(self, batch, batch_idx):
        """One optimisation step; logs loss/train and perplexity/train."""
        return self._shared_step(batch, "train")

    def validation_step(self, batch, batch_idx):
        """One validation batch; logs loss/valid and perplexity/valid."""
        return self._shared_step(batch, "valid")

    def test_step(self, batch, batch_idx):
        """One test batch; logs loss/test and perplexity/test."""
        return self._shared_step(batch, "test")

    def init_hidden(self, batch_size = 20):
        """Zero hidden state of shape (num_layers, batch_size, hidden_size) on the module's device."""
        return torch.zeros(self.num_layers, batch_size, self.hidden_size, device=self.device)

In [8]:
# Train RNN
vocab_size = len(word_freqs)
data_module = TextDateModule(train, valid, test)
model = TextLightningModule(vocab_size)

tb_logger = pl_loggers.TensorBoardLogger("./lightning_logs/", name="network_1")
# Use one GPU when CUDA is available, otherwise fall back to CPU so the
# notebook still runs on machines without a GPU.
trainer = pl.Trainer(
    logger=tb_logger,
    max_epochs=20,
    gpus=1 if torch.cuda.is_available() else 0,
)
trainer.fit(model, datamodule=data_module)

result = trainer.test(model, datamodule=data_module)
print(result)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Missing logger folder: ./lightning_logs/network_1

  | Name      | Type             | Params
-----------------------------------------------
0 | embedding | Embedding        | 2.9 M 
1 | rnn       | RNN              | 40.4 K
2 | out_fc    | Linear           | 2.9 M 
3 | loss      | CrossEntropyLoss | 0     
4 | dropout   | Dropout          | 0     
-----------------------------------------------
5.8 M     Trainable params
0         Non-trainable params
5.8 M     Total params
23.400    Total estimated model params size (MB)


Epoch 0:  90%|█████████ | 3687/4091 [00:35<00:03, 104.84it/s, loss=5.8, v_num=0]
Validating: 0it [00:00, ?it/s][A
Validating:   0%|                                       | 0/404 [00:00<?, ?it/s][A
Epoch 0:  90%|█████████ | 3692/4091 [00:35<00:03, 103.31it/s, loss=5.8, v_num=0][A
Epoch 0:  91%|█████████ | 3716/4091 [00:35<00:03, 103.69it/s, loss=5.8, v_num=0][A
Epoch 0:  91%|█████████▏| 3742/4091 [00:35<00:03, 104.12it/s, loss=5.8, v_num=0][A
Epoch 0:  92%|█████████▏| 3768/4091 [00:36<00:03, 104.54it/s, loss=5.8, v_num=0][A
Epoch 0:  93%|█████████▎| 3794/4091 [00:36<00:02, 104.96it/s, loss=5.8, v_num=0][A
Epoch 0:  93%|█████████▎| 3820/4091 [00:36<00:02, 105.38it/s, loss=5.8, v_num=0][A
Epoch 0:  94%|█████████▍| 3846/4091 [00:36<00:02, 105.79it/s, loss=5.8, v_num=0][A
Epoch 0:  95%|█████████▍| 3872/4091 [00:36<00:02, 106.20it/s, loss=5.8, v_num=0][A
Epoch 0:  95%|█████████▌| 3898/4091 [00:36<00:01, 106.60it/s, loss=5.8, v_num=0][A
Epoch 0:  96%|█████████▌| 3924/4091 [00:36<00

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing:  97%|██████████████████████████████▏| 451/463 [00:02<00:00, 245.98it/s]--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'loss/test': 4.6080121994018555, 'perplexity/test': 102.39926147460938}
--------------------------------------------------------------------------------
Testing: 100%|███████████████████████████████| 463/463 [00:02<00:00, 183.32it/s]
[{'loss/test': 4.6080121994018555, 'perplexity/test': 102.39926147460938}]


In [9]:
class TextLSTMModule(pl.LightningModule):
    """LSTM language model: embedding -> dropout -> 2-layer nn.LSTM -> linear.

    Args:
        vocab_size: number of distinct tokens; sizes both the embedding table
            and the output projection.
    """
    def __init__(self, vocab_size):
        super().__init__()
        self.num_layers = 2
        self.hidden_size = 100 #200
        self.embedding_size = 100
        self.vocab_size = vocab_size

        # embedding
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_size)
        nn.init.uniform_(self.embedding.weight, -0.1, 0.1)
        # layers
        self.lstm = nn.LSTM(self.embedding_size, self.hidden_size, self.num_layers, batch_first=True)
        self.out_fc = nn.Linear(self.hidden_size, vocab_size)
        # loss function
        self.loss = nn.CrossEntropyLoss()

        self.dropout = nn.Dropout(0.25)

    def forward(self, data, hidden, cell):
        """Score the next token at every position.

        Args:
            data: token indices, batch first (the LSTM is built with batch_first=True).
            hidden: initial hidden state passed to nn.LSTM.
            cell: initial cell state passed to nn.LSTM.

        Returns:
            (logits, (hidden, cell)): logits flattened to (-1, vocab_size) and
            the final hidden and cell states returned by the LSTM.
        """
        embedding = self.dropout(self.embedding(data))
        # nn.LSTM returns (output, (h_n, c_n)); unpack both states so the
        # updated cell state is returned instead of the input one.
        output, (hidden, cell) = self.lstm(embedding, (hidden, cell))
        output = self.out_fc(output)
        return output.view(-1, self.vocab_size), (hidden, cell)

    def configure_optimizers(self):
        """Plain SGD with learning rate 5."""
        return optim.SGD(self.parameters(), lr=5)

    def _shared_step(self, batch, stage):
        """Compute and log loss/perplexity for one batch of the given stage."""
        x, y = batch
        y = y.view(-1)

        # Size the state from the batch actually received rather than assuming 20.
        hidden, cell = self.init_hidden(x.size(0))
        output, _ = self(x, hidden, cell)
        loss = self.loss(output, y)
        # torch.exp stays on-device (no .item() sync) and yields inf instead of
        # raising OverflowError if the loss diverges.
        # NOTE: with on_epoch=True the logged value is the epoch reduction of
        # per-batch perplexities (Lightning's default reduction is the mean),
        # which is not the same as exp(mean loss).
        perplexity = torch.exp(loss.detach())

        self.log(f"loss/{stage}", loss, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        self.log(f"perplexity/{stage}", perplexity, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        return {"loss": loss}

    def training_step(self, batch, batch_idx):
        """One optimisation step; logs loss/train and perplexity/train."""
        return self._shared_step(batch, "train")

    def validation_step(self, batch, batch_idx):
        """One validation batch; logs loss/valid and perplexity/valid."""
        return self._shared_step(batch, "valid")

    def test_step(self, batch, batch_idx):
        """One test batch; logs loss/test and perplexity/test."""
        return self._shared_step(batch, "test")

    def init_hidden(self, batch_size = 20):
        """Zero (hidden, cell) states, each (num_layers, batch_size, hidden_size), on the module's device."""
        hidden = torch.zeros(self.num_layers, batch_size, self.hidden_size, device=self.device)
        cell = torch.zeros(self.num_layers, batch_size, self.hidden_size, device=self.device)
        return hidden, cell

In [10]:
# Train LSTM
lstm_data_module = TextDateModule(train, valid, test)
lstm_model = TextLSTMModule(vocab_size)

tb_logger = pl_loggers.TensorBoardLogger("./lightning_logs/", name="network_2")
# Use one GPU when CUDA is available, otherwise fall back to CPU so the
# notebook still runs on machines without a GPU.
trainer = pl.Trainer(
    logger=tb_logger,
    gradient_clip_val=0.5,
    max_epochs=20,
    gpus=1 if torch.cuda.is_available() else 0,
)
# Use the data module created in this cell rather than the one from the RNN cell.
trainer.fit(lstm_model, datamodule=lstm_data_module)

result = trainer.test(lstm_model, datamodule=lstm_data_module)
print(result)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
  rank_zero_deprecation(
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Missing logger folder: ./lightning_logs/network_2

  | Name      | Type             | Params
-----------------------------------------------
0 | embedding | Embedding        | 2.9 M 
1 | lstm      | LSTM             | 161 K 
2 | out_fc    | Linear           | 2.9 M 
3 | loss      | CrossEntropyLoss | 0     
4 | dropout   | Dropout          | 0     
-----------------------------------------------
6.0 M     Trainable params
0         Non-trainable params
6.0 M     Total params
23.884    Total estimated model params size (MB)


Epoch 0:  90%|█████████ | 3687/4091 [00:37<00:04, 99.30it/s, loss=5.33, v_num=0]
Validating: 0it [00:00, ?it/s][A
Validating:   0%|                                       | 0/404 [00:00<?, ?it/s][A
Epoch 0:  90%|█████████ | 3694/4091 [00:37<00:04, 97.91it/s, loss=5.33, v_num=0][A
Epoch 0:  91%|█████████ | 3721/4091 [00:37<00:03, 98.36it/s, loss=5.33, v_num=0][A
Epoch 0:  92%|█████████▏| 3748/4091 [00:37<00:03, 98.79it/s, loss=5.33, v_num=0][A
Epoch 0:  92%|█████████▏| 3775/4091 [00:38<00:03, 99.21it/s, loss=5.33, v_num=0][A
Epoch 0:  93%|█████████▎| 3802/4091 [00:38<00:02, 99.63it/s, loss=5.33, v_num=0][A
Epoch 0:  94%|████████▍| 3829/4091 [00:38<00:02, 100.05it/s, loss=5.33, v_num=0][A
Epoch 0:  94%|████████▍| 3856/4091 [00:38<00:02, 100.47it/s, loss=5.33, v_num=0][A
Epoch 0:  95%|████████▌| 3883/4091 [00:38<00:02, 100.87it/s, loss=5.33, v_num=0][A
Epoch 0:  96%|████████▌| 3910/4091 [00:38<00:01, 101.28it/s, loss=5.33, v_num=0][A
Validating:  55%|███████████████▌            

  rank_zero_deprecation(
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing:  97%|██████████████████████████████ | 449/463 [00:02<00:00, 245.32it/s]--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'loss/test': 4.3150153160095215, 'perplexity/test': 76.4348373413086}
--------------------------------------------------------------------------------
Testing: 100%|███████████████████████████████| 463/463 [00:02<00:00, 182.15it/s]
[{'loss/test': 4.3150153160095215, 'perplexity/test': 76.4348373413086}]


In [None]:
# Show results
# Load the TensorBoard notebook extension, then open a dashboard on the log
# directory that both TensorBoardLogger instances above write to.
%load_ext tensorboard
%tensorboard --logdir ./lightning_logs/