In [1]:
%cd /home/bap/hana/Basic-NLP-RNN/rnn/rnn

/home/bap/hana/Basic-NLP-RNN/rnn/rnn


In [2]:
import numpy as np
import io
import torch
from torch import nn
import torch.nn.functional as F

In [3]:
class Config:
    '''
    Config class defines dataset path and hyperparameters.
    '''
    data_train_url = 'dataset/shakespeare_train.txt'
    data_val_url = 'dataset/shakespeare_valid.txt'
    n_hidden = 512
    n_layers = 2
    epochs = 25 
    n_seqs = 128
    n_steps = 100
    lr = 0.001
    clip = 5
    cuda = False
    dropout = 0.5

In [4]:
class Dataset:
    '''
    Load data from data path, preprocess (tokenize & one-hot encode) and get data in array type.
    '''
    def __init__(self, data_train_url = Config.data_train_url, data_val_url = Config.data_val_url):
        with io.open (data_train_url, 'r') as f:
            self.text_train = f.read()
        with io.open (data_val_url, 'r') as f:
            self.text_val = f.read()

    def char_tokenize(self):
        self.chars = tuple(set(self.text_train))
        self.int2char = dict(enumerate(self.chars))
        self.char2int = {ch: ii for ii, ch in self.int2char.items()}
        self.train_data = np.array([self.char2int[ch] for ch in self.text_train])
        self.val_data = np.array([self.char2int[ch] for ch in self.text_val])

    def one_hot_encode(self, arr, n_labels):
        one_hot = np.zeros((np.multiply(*arr.shape), n_labels), dtype=np.float32)
        one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1.
        one_hot = one_hot.reshape((*arr.shape, n_labels))
        return one_hot

    def get_data(self):
        self.char_tokenize()
        return self.train_data, self.val_data

In [5]:
data = Dataset()
train_data, val_data = data.get_data()
print("Encoded chars in train:", train_data[:100])
print("Number of chars in vocab: ", len(data.chars))
print("Train text: ", data.text_train[:100])

Encoded chars in train: [49 51 41 14  3 25 64 51  3 51  5 57 53 37 44  4 57 66 29 41 57 25 33 57
 25  6 41 29 21 57 57 42 25 36 53 31 25 66 11 41  3  7 57 41 20 25  7 57
 36 41 25 13 57 25 14  6 57 36 10 38 44 44 22 23 23 37 44 30  6 57 36 10
 20 25 14  6 57 36 10 38 44 44 49 51 41 14  3 25 64 51  3 51  5 57 53 37
 44 28 29 11]
Number of chars in vocab:  67
Train text:  First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [7]:
class DataLoader:
    '''
    Load data from dataset in batches (batches = n_seqs * n_steps)
    '''
    def __init__(self, train, val):
        self.train = train
        self.val = val

    def __call__(self, arr, n_seqs, n_steps):
        '''
        Create a generator that returns batches of size
        n_seqs x n_steps from arr.
        
        Arguments
        ---------
        arr: np.array
            Array you want to make batches from
        n_seqs: int
            Batch size, the number of sequences per batch
        n_steps: int
            Number of sequence steps per batch
        '''
        batch_size = n_seqs * n_steps
        n_batches = len(arr) // batch_size
        arr = arr[:n_batches * batch_size]
        arr = arr.reshape((n_seqs, -1))
        
        for n in range(0, arr.shape[1], n_steps):
            x = arr[:, n: n + n_steps]
            y = np.zeros_like(x)
            try:
                y[:, :-1], y[:, -1] = x[:, 1:], arr[:, n + n_steps]
            except IndexError:
                y[:, :-1], y[:, -1] = x[:, 1:], arr[:, 0]
            yield x, y

In [9]:
data_loader = DataLoader(train_data, val_data)
next(data_loader(train_data, 1, 5))

(array([[49, 51, 41, 14,  3]]), array([[51, 41, 14,  3, 25]]))

In [14]:
class RNN(nn.Module):
    def __init__(self, vocab_size, n_steps=Config.n_steps, n_hidden=Config.n_hidden, n_layers=Config.n_layers,
                    drop_prob=Config.dropout, lr=Config.lr):
        super().__init__()
        self.vocab_size = vocab_size
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr      
        self.lstm = nn.LSTM(vocab_size, n_hidden, n_layers, 
                            dropout=drop_prob, batch_first=True)        
        self.dropout = nn.Dropout(drop_prob)      
        self.fc = nn.Linear(n_hidden, vocab_size)
        self.init_weights()
    
    def init_weights(self):
        ''' 
        Initialize weights for fully connected layer 
        '''
        self.fc.bias.data.fill_(0)
        self.fc.weight.data.uniform_(-1, 1)
        
    def init_hidden(self, n_seqs):
        ''' 
        Initializes hidden state 
        '''
        weight = next(self.parameters()).data
        return (weight.new(self.n_layers, n_seqs, self.n_hidden).zero_(),
                weight.new(self.n_layers, n_seqs, self.n_hidden).zero_())

    def forward(self, x, hc):
        ''' 
        Forward pass through the network. 
        These inputs are x, and the hidden/cell state `hc`. 
        '''
        x, (h, c) = self.lstm(x, hc)
        x = self.dropout(x)
        x = x.reshape(x.size()[0] * x.size()[1], self.n_hidden)
        x = self.fc(x)
        return x, (h, c)

In [18]:
def train(net, train_data, val_data, epochs=Config.epochs, n_seqs=Config.n_seqs, 
          n_steps=Config.n_steps, lr=Config.lr, clip=Config.clip, cuda=Config.cuda):
    ''' 
        Training a network 
    
        Arguments
        ----------------
        net: RNN network
        train_data: text data to train the network
        val_data: text data to validate the network
        epochs: Number of epochs to train
        n_seqs: Number of mini-sequences per mini-batch, aka batch size
        n_steps: Number of character steps per mini-batch
        lr: learning rate
        clip: gradient clipping
        cuda: Train with CUDA on a GPU
    '''
    net.train()
    opt = torch.optim.Adam(net.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    
    if cuda:
        net.cuda()
    
    counter = 0
    for e in range(epochs):
        h = net.init_hidden(n_seqs)
        for x, y in data_loader(train_data, n_seqs, n_steps):
            counter += 1
            
            # One-hot encode our data and make them Torch tensors
            x = data.one_hot_encode(x, net.vocab_size)
            inputs, targets = torch.from_numpy(x), torch.from_numpy(y)
            
            if cuda:
                inputs, targets = inputs.cuda(), targets.cuda()
            h = tuple([each.data for each in h])

            net.zero_grad()
            
            output, h = net.forward(inputs, h)
            loss = criterion(output, targets.view(n_seqs*n_steps))

            loss.backward()
            
            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(net.parameters(), clip)

            opt.step()
            
            if counter % 10 == 0:
                
                val_h = net.init_hidden(n_seqs)
                val_losses = []
                for x, y in data_loader(val_data, n_seqs, n_steps):
                    x = data.one_hot_encode(x, net.vocab_size)
                    inputs, targets = torch.from_numpy(x), torch.from_numpy(y)
                    
                    if cuda:
                        inputs, targets = inputs.cuda(), targets.cuda()
                    val_h = tuple([each.data for each in val_h])

                    output, val_h = net.forward(inputs, val_h)
                    val_loss = criterion(output, targets.view(n_seqs*n_steps))
                
                    val_losses.append(val_loss.item())
                
                print("Epoch: {}/{}...".format(e+1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.4f}...".format(loss.item()),
                      "Val Loss: {:.4f}".format(np.mean(val_losses)))

In [19]:
# define and print the net
net = RNN(vocab_size=len(data.chars))
print(net)

RNN(
  (lstm): LSTM(67, 512, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=512, out_features=67, bias=True)
)


In [20]:
train(net=net, train_data=train_data, val_data=val_data, epochs=1, n_seqs=128, n_steps=100, lr=0.001)

Epoch: 1/1... Step: 10... Loss: 3.4232... Val Loss: 3.4073
Epoch: 1/1... Step: 20... Loss: 3.2957... Val Loss: 3.2593
Epoch: 1/1... Step: 30... Loss: 3.1133... Val Loss: 3.0987
Epoch: 1/1... Step: 40... Loss: 2.9358... Val Loss: 2.9181
Epoch: 1/1... Step: 50... Loss: 2.8185... Val Loss: 2.7483
Epoch: 1/1... Step: 60... Loss: 2.6611... Val Loss: 2.6403
Epoch: 1/1... Step: 70... Loss: 2.5618... Val Loss: 2.5690
Epoch: 1/1... Step: 80... Loss: 2.5486... Val Loss: 2.5128
Epoch: 1/1... Step: 90... Loss: 2.4703... Val Loss: 2.4704
Epoch: 1/1... Step: 100... Loss: 2.4520... Val Loss: 2.4362
Epoch: 1/1... Step: 110... Loss: 2.3872... Val Loss: 2.4088
Epoch: 1/1... Step: 120... Loss: 2.3563... Val Loss: 2.3763
Epoch: 1/1... Step: 130... Loss: 2.3285... Val Loss: 2.3553
Epoch: 1/1... Step: 140... Loss: 2.3048... Val Loss: 2.3438
Epoch: 1/1... Step: 150... Loss: 2.2897... Val Loss: 2.3164
Epoch: 1/1... Step: 160... Loss: 2.2828... Val Loss: 2.2957
Epoch: 1/1... Step: 170... Loss: 2.2815... Val Lo