In [1]:
%cd /home/bap/hana/Basic-NLP-RNN/rnn/rnn

/home/bap/hana/Basic-NLP-RNN/rnn/rnn


In [2]:
import numpy as np
import io
import torch
from torch import nn
import torch.nn.functional as F

In [3]:
class Config:
    '''
    Config class defines dataset path and hyperparameters.
    '''
    data_train_url = 'dataset/shakespeare_train.txt'
    data_val_url = 'dataset/shakespeare_valid.txt'
    n_hidden = 512
    n_layers = 2
    epochs = 25 
    n_seqs = 128
    n_steps = 100
    lr = 0.001
    clip = 5
    cuda = False
    dropout = 0.5

In [4]:
class Dataset:
    '''
    Load data from data path, preprocess (tokenize & one-hot encode) and get data in array type.
    '''
    def __init__(self, data_train_url = Config.data_train_url, data_val_url = Config.data_val_url):
        with io.open (data_train_url, 'r') as f:
            self.text_train = f.read()
        with io.open (data_val_url, 'r') as f:
            self.text_val = f.read()

    def char_tokenize(self):
        self.chars = tuple(set(self.text_train))
        self.int2char = dict(enumerate(self.chars))
        self.char2int = {ch: ii for ii, ch in self.int2char.items()}
        self.train_data = np.array([self.char2int[ch] for ch in self.text_train])
        self.val_data = np.array([self.char2int[ch] for ch in self.text_val])

    def one_hot_encode(self, arr, n_labels):
        one_hot = np.zeros((np.multiply(*arr.shape), n_labels), dtype=np.float32)
        one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1.
        one_hot = one_hot.reshape((*arr.shape, n_labels))
        return one_hot

    def get_data(self):
        self.char_tokenize()
        return self.train_data, self.val_data

In [5]:
data = Dataset()
train_data, val_data = data.get_data()
print("Encoded chars in train:", train_data[:100])
print("Number of chars in vocab: ", len(data.chars))
print("Train text: ", data.text_train[:100])

Encoded chars in train: [49 51 41 14  3 25 64 51  3 51  5 57 53 37 44  4 57 66 29 41 57 25 33 57
 25  6 41 29 21 57 57 42 25 36 53 31 25 66 11 41  3  7 57 41 20 25  7 57
 36 41 25 13 57 25 14  6 57 36 10 38 44 44 22 23 23 37 44 30  6 57 36 10
 20 25 14  6 57 36 10 38 44 44 49 51 41 14  3 25 64 51  3 51  5 57 53 37
 44 28 29 11]
Number of chars in vocab:  67
Train text:  First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [7]:
class DataLoader:
    '''
    Load data from dataset in batches (batches = n_seqs * n_steps)
    '''
    def __init__(self, train, val):
        self.train = train
        self.val = val

    def __call__(self, arr, n_seqs, n_steps):
        '''
        Create a generator that returns batches of size
        n_seqs x n_steps from arr.
        
        Arguments
        ---------
        arr: np.array
            Array you want to make batches from
        n_seqs: int
            Batch size, the number of sequences per batch
        n_steps: int
            Number of sequence steps per batch
        '''
        batch_size = n_seqs * n_steps
        n_batches = len(arr) // batch_size
        arr = arr[:n_batches * batch_size]
        arr = arr.reshape((n_seqs, -1))
        
        for n in range(0, arr.shape[1], n_steps):
            x = arr[:, n: n + n_steps]
            y = np.zeros_like(x)
            try:
                y[:, :-1], y[:, -1] = x[:, 1:], arr[:, n + n_steps]
            except IndexError:
                y[:, :-1], y[:, -1] = x[:, 1:], arr[:, 0]
            yield x, y

In [9]:
data_loader = DataLoader(train_data, val_data)
next(data_loader(train_data, 1, 5))

(array([[49, 51, 41, 14,  3]]), array([[51, 41, 14,  3, 25]]))