# Character Level LSTM in PyTorch

__Statistical Language Model__: A trained model to predict the next word/character given all previous words/characters.

__Character-Level Language Model__: The main task of the char-level language model is to predict the next character given all previous characters in a sequence of data, i.e. generates text character by character. 


In [1]:
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F

In [2]:
torch.cuda.is_available()

False

In [3]:
with open('data/anna.txt', 'r') as f:
    text = f.read()

In [4]:
text[:1000]

"Chapter 1\n\n\nHappy families are all alike; every unhappy family is unhappy in its own\nway.\n\nEverything was in confusion in the Oblonskys' house. The wife had\ndiscovered that the husband was carrying on an intrigue with a French\ngirl, who had been a governess in their family, and she had announced to\nher husband that she could not go on living in the same house with him.\nThis position of affairs had now lasted three days, and not only the\nhusband and wife themselves, but all the members of their family and\nhousehold, were painfully conscious of it. Every person in the house\nfelt that there was no sense in their living together, and that the\nstray people brought together by chance in any inn had more in common\nwith one another than they, the members of the family and household of\nthe Oblonskys. The wife did not leave her own room, the husband had not\nbeen at home for three days. The children ran wild all over the house;\nthe English governess quarreled with the housekeep

### Encoding the Text

In [5]:
## Encoding the text ## 
chars = tuple(set(text))
int2char = dict(enumerate(chars))
char2int = {ch:ii for ii,ch in int2char.items()}
encoded = np.array([char2int[ch] for ch in text])

In [6]:
encoded[:100]

array([ 0,  3, 33, 65, 68,  4, 22, 60, 47, 55, 55, 55, 62, 33, 65, 65, 76,
       60, 57, 33, 48, 32, 18, 32,  4, 67, 60, 33, 22,  4, 60, 33, 18, 18,
       60, 33, 18, 32, 58,  4, 23, 60,  4, 44,  4, 22, 76, 60, 46, 36,  3,
       33, 65, 65, 76, 60, 57, 33, 48, 32, 18, 76, 60, 32, 67, 60, 46, 36,
        3, 33, 65, 65, 76, 60, 32, 36, 60, 32, 68, 67, 60, 25, 80, 36, 55,
       80, 33, 76,  9, 55, 55, 69, 44,  4, 22, 76, 68,  3, 32, 36])

### Data Pre-Processing

In [7]:
def one_hot_encode(arr):

    n_labels = max(arr.flatten()) + 1
    
    one_hot = np.zeros(shape = (np.multiply(*arr.shape) , n_labels))
    
    one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1
    
    one_hot = one_hot.reshape((*arr.shape, n_labels))
    
    return one_hot

In [8]:
test_seq = np.array([[1,2,3,7],[5,3,2,8]])
one_hot = one_hot_encode(test_seq)

In [9]:
one_hot

array([[[0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0.]],

       [[0., 0., 0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1.]]])

### Making Training mini-batches

In [10]:
# N: batch size
# M: sequence length
# K: total number of batches

def get_batches(arr, batch_size, seq_length):
    
    # Number of matches we can make from the input array
    n_batches = len(arr) // (batch_size * seq_length)
    
    # keeping enoough character to make full batches
    arr = arr[:n_batches * batch_size * seq_length]
    
    arr = arr.reshape((batch_size, -1))
    
    # iterating over the batches
    for n in range(0, arr.shape[1] , seq_length):
        x = arr[:, n:n+seq_length]
        y = np.zeros_like(x)
        try:
            y[:,:-1], y[:,-1] = x[:, 1:], arr[:, n+seq_length]
        except IndexError:
            y[:,:-1], y[:,-1] = x[:, 1:], arr[:,0]
        
        yield x,y

In [22]:
# testing the get_batch function

batches = get_batches(encoded, 8, 50)
x, y = next(batches)

In [23]:
x, y

(array([[ 0,  3, 33, 65, 68,  4, 22, 60, 47, 55, 55, 55, 62, 33, 65, 65,
         76, 60, 57, 33, 48, 32, 18, 32,  4, 67, 60, 33, 22,  4, 60, 33,
         18, 18, 60, 33, 18, 32, 58,  4, 23, 60,  4, 44,  4, 22, 76, 60,
         46, 36],
        [67, 25, 36, 60, 68,  3, 33, 68, 60, 33, 68, 68, 22, 33, 20, 68,
          4,  6, 60,  3,  4, 22, 60, 33, 68, 68,  4, 36, 68, 32, 25, 36,
         60, 80, 33, 67, 60,  3,  4, 22, 60,  3, 46, 67, 31, 33, 36,  6,
          9, 60],
        [ 4, 36,  6, 60, 25, 22, 60, 33, 60, 57, 25,  4, 78, 60,  3,  4,
         60, 33, 44, 25, 32,  6,  4,  6, 60,  3, 32, 67, 60, 57, 33, 68,
          3,  4, 22,  9, 60, 62,  4, 55, 18, 25, 25, 58,  4,  6, 60, 22,
         25, 46],
        [67, 60, 68,  3,  4, 60, 20,  3, 32,  4, 57, 60, 68,  3, 25, 46,
         64,  3, 60,  3, 32,  6,  6,  4, 36, 55, 32, 36, 68,  4, 22,  4,
         67, 68, 60, 25, 57, 60,  3, 32, 67, 60, 18, 32, 57,  4, 78, 60,
         25, 57],
        [60, 67, 33, 80, 60,  3,  4, 22, 60, 68,  4,

### Defining the LSTM Network

In [24]:
train_on_gpu = torch.cuda.is_available()

In [25]:
train_on_gpu

False

In [28]:
class CharRNN(nn.Module):
    def __init__(self, chars, n_hidden = 256, n_layers=2, drop_prob=0.5, lr=0.001):
        
        super().__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr
        self.chars = chars
        
        self.int2chars = dict(enumerate(self.chars))
        self.char2int = {ch : ii for ii,ch in self.int2chars.items()}
        
        self.lstm = nn.LSTM(input_size = len(self.chars), hidden_size = n_hidden,  num_layers = n_layers, \
                             dropout = self.drop_prob, batch_first = True)
        
        self.dropout = nn.Dropout(self.drop_prob)
        
        self.fc = nn.Linear(self.n_hidden, len(self.chars))
    
    def forward(sel, x, hidden):
        
        r_output, hidden = self.lstm(x, hidden)
        out = self.dropout(r_output)
        out = out.contiguous().view(-1, self.n_hidden)
        out = self.fc(out)
        
        return out, hidden
        
    def init_hidden(self, batch_size):
        # creating two new tensors with size n_layers * batch_size * n_hidden,
        # initialized to zero, for hidden state and cell state of the LSTM
        
        weight = next(self.parameters()).data
        
        if(train_on_gpu):
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda(),
                      weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda())
            
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_(),
                      weight.new(self.n_layers, batch_size, self.n_hidden).zero_())
        
        return hidden
            
        

### Training 

In [29]:
def train(net, data, epochs=10, batch_size=10, seq_length=50, lr=0.001, clip=5, \
          val_frac=0.1, print_every=10):
    '''
        Arguments:
            net: CharRNN network
            data: text data to train the network
            epochs: number of epochs
            batch_size: number of mini-sequences per mini-batch
            seq_length: Number of character steps per mini-batch
            lr: learning rate
            clip: gradient clipping
            val_frac: Fraction of data to hold out for validation
            print_every: number of steps for printing training and validation
    '''
    
    net.train()
    
    opt = torch.optim.Adam(net.parameters(), lr=lr)
    criterio = nn.CrossEntropyLoss()
    
    # creating training and validation data
    val_idx = int(len(data) * (1 - val_frac))
    data , val_data = data[:val_idx], data[val_idx]
    
    if train_on_gpu:
        net.cuda()
        
    counter = 0
    n_chars = len(net.chars)
    for e in range(epochs):
        h = net.init_hidden(batch_size)
        
        for x, y in get_batches(data, batch_size, seq_length):
            counter += 1
            x = one_hot_encode(x, n_chars)
            inputs, targets = torch.from_numpy(x), torch.from_numpy(y)
            
            if train_on_gpu:
                inputs , targets = inputs.cuda(), targets.cuda()
                
            #creating new variables for the hidden state, otherwise
            # we'd backprop through the entire training history
            h = tuple([each.data for each in h])
            
            net.zero_grad()
            
            output, h = net(inputs, h)
            
            loss = criterion(output, targets.view(batch_size*seq_length))
            loss.backward()
            
            nn.utils.clip_grad_norm_(net.parameters(), clip)
            opt.step()
            
            if counter % print_every == 0:
                val_h = net.init_hidden(batch_size)
                val_losses=[]
                net.eval()
                for x,y in get_batches(val_data, batch_size, seq_length):
                    x = one_hot_encode(x, n_chars)
                    x,y = torch.from_numpy(x), torch.from_numpy(y)
                    
                    val_h = tuple([each.data for each in val_h])
                    
                    inputs, targets = x,y
                    if(train_on_gpu):
                        inputs, targets = inputs.cuda(), targets.cuda()
                        
                    output, val_h = net(inputs, val_h)
                    val_loss = criterion0(output, targets.view(batch_size*seq_length))
                    
                    val_losses.append(val_loss.item())
            
            net.train()
            
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.4f}...".format(loss.item()),
                  "Val Loss: {:.4f}".format(np.mean(val_losses)))
            
            

In [30]:
n_hidden = 512
n_layers=2

net = CharRNN(chars, n_hidden, n_layers)
print(net)

CharRNN(
  (lstm): LSTM(83, 512, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=512, out_features=83, bias=True)
)


In [31]:
batch_size = 128
seq_length = 100
n_epochs = 20

In [None]:

train(net, encoded, epochs=n_epochs, batch_size= batch_size, \
      seq_length= seq_length, lr=0.001, print_every=10)
