# Character Level LSTM in PyTorch

__Statistical Language Model__: A trained model to predict the next word/character given all previous words/characters.

__Character-Level Language Model__: The main task of the char-level language model is to predict the next character given all previous characters in a sequence of data, i.e. generates text character by character. 


In [1]:
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F

In [2]:
torch.cuda.is_available()

True

In [3]:
with open('data/anna.txt', 'r') as f:
    text = f.read()

In [4]:
text[:1000]

"Chapter 1\n\n\nHappy families are all alike; every unhappy family is unhappy in its own\nway.\n\nEverything was in confusion in the Oblonskys' house. The wife had\ndiscovered that the husband was carrying on an intrigue with a French\ngirl, who had been a governess in their family, and she had announced to\nher husband that she could not go on living in the same house with him.\nThis position of affairs had now lasted three days, and not only the\nhusband and wife themselves, but all the members of their family and\nhousehold, were painfully conscious of it. Every person in the house\nfelt that there was no sense in their living together, and that the\nstray people brought together by chance in any inn had more in common\nwith one another than they, the members of the family and household of\nthe Oblonskys. The wife did not leave her own room, the husband had not\nbeen at home for three days. The children ran wild all over the house;\nthe English governess quarreled with the housekeep

### Encoding the Text

In [5]:
## Encoding the text ## 
chars = tuple(set(text))
int2char = dict(enumerate(chars))
char2int = {ch:ii for ii,ch in int2char.items()}
encoded = np.array([char2int[ch] for ch in text])

In [6]:
encoded[:100]

array([77, 32, 79, 26, 46, 52, 82, 20, 37, 27, 27, 27, 47, 79, 26, 26, 74,
       20, 71, 79, 53, 45, 80, 45, 52, 62, 20, 79, 82, 52, 20, 79, 80, 80,
       20, 79, 80, 45, 49, 52,  2, 20, 52, 72, 52, 82, 74, 20, 19, 64, 32,
       79, 26, 26, 74, 20, 71, 79, 53, 45, 80, 74, 20, 45, 62, 20, 19, 64,
       32, 79, 26, 26, 74, 20, 45, 64, 20, 45, 46, 62, 20, 40, 75, 64, 27,
       75, 79, 74,  7, 27, 27, 10, 72, 52, 82, 74, 46, 32, 45, 64])

### Data Pre-Processing

In [7]:
def one_hot_encode(arr, n_labels):

    #n_labels = max(arr.flatten()) + 1
    
    one_hot = np.zeros(shape = (np.multiply(*arr.shape) , n_labels))
    
    one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1
    
    one_hot = one_hot.reshape((*arr.shape, n_labels))
    
    return one_hot

In [8]:
test_seq = np.array([[1,2,3,7],[5,3,2,8]])
one_hot = one_hot_encode(test_seq,9)

In [9]:
one_hot

array([[[0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0.]],

       [[0., 0., 0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1.]]])

### Making Training mini-batches

In [10]:
# N: batch size
# M: sequence length
# K: total number of batches

def get_batches(arr, batch_size, seq_length):
    
    # Number of matches we can make from the input array
    n_batches = len(arr) // (batch_size * seq_length)
    
    # keeping enoough character to make full batches
    arr = arr[:n_batches * batch_size * seq_length]
    
    arr = arr.reshape((batch_size, -1))
    
    # iterating over the batches
    for n in range(0, arr.shape[1] , seq_length):
        x = arr[:, n:n+seq_length]
        y = np.zeros_like(x)
        try:
            y[:,:-1], y[:,-1] = x[:, 1:], arr[:, n+seq_length]
        except IndexError:
            y[:,:-1], y[:,-1] = x[:, 1:], arr[:,0]
        
        yield x,y

In [11]:
# testing the get_batch function

batches = get_batches(encoded, 8, 50)
x, y = next(batches)

In [12]:
x, y

(array([[77, 32, 79, 26, 46, 52, 82, 20, 37, 27, 27, 27, 47, 79, 26, 26,
         74, 20, 71, 79, 53, 45, 80, 45, 52, 62, 20, 79, 82, 52, 20, 79,
         80, 80, 20, 79, 80, 45, 49, 52,  2, 20, 52, 72, 52, 82, 74, 20,
         19, 64],
        [62, 40, 64, 20, 46, 32, 79, 46, 20, 79, 46, 46, 82, 79,  5, 46,
         52, 17, 20, 32, 52, 82, 20, 79, 46, 46, 52, 64, 46, 45, 40, 64,
         20, 75, 79, 62, 20, 32, 52, 82, 20, 32, 19, 62, 68, 79, 64, 17,
          7, 20],
        [52, 64, 17, 20, 40, 82, 20, 79, 20, 71, 40, 52, 28, 20, 32, 52,
         20, 79, 72, 40, 45, 17, 52, 17, 20, 32, 45, 62, 20, 71, 79, 46,
         32, 52, 82,  7, 20, 47, 52, 27, 80, 40, 40, 49, 52, 17, 20, 82,
         40, 19],
        [62, 20, 46, 32, 52, 20,  5, 32, 45, 52, 71, 20, 46, 32, 40, 19,
         60, 32, 20, 32, 45, 17, 17, 52, 64, 27, 45, 64, 46, 52, 82, 52,
         62, 46, 20, 40, 71, 20, 32, 45, 62, 20, 80, 45, 71, 52, 28, 20,
         40, 71],
        [20, 62, 79, 75, 20, 32, 52, 82, 20, 46, 52,

### Defining the LSTM Network

In [13]:
train_on_gpu = torch.cuda.is_available()

In [14]:
train_on_gpu

True

In [15]:
class CharRNN(nn.Module):
    def __init__(self, chars, n_hidden = 256, n_layers=2, drop_prob=0.5, lr=0.001):
        
        super().__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr
        self.chars = chars
        
        self.int2chars = dict(enumerate(self.chars))
        self.char2int = {ch : ii for ii,ch in self.int2chars.items()}
        
        self.lstm = nn.LSTM(input_size = len(self.chars), hidden_size = n_hidden,  num_layers = n_layers, \
                             dropout = self.drop_prob, batch_first = True)
        
        self.dropout = nn.Dropout(self.drop_prob)
        
        self.fc = nn.Linear(self.n_hidden, len(self.chars))
    
    def forward(self, x, hidden):
        
        r_output, hidden = self.lstm(x, hidden)
        out = self.dropout(r_output)
        out = out.contiguous().view(-1, self.n_hidden)
        out = self.fc(out)
        
        return out, hidden
        
    def init_hidden(self, batch_size):
        # creating two new tensors with size n_layers * batch_size * n_hidden,
        # initialized to zero, for hidden state and cell state of the LSTM
        
        weight = next(self.parameters()).data
        
        if(train_on_gpu):
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda(),
                      weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda())
            
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_(),
                      weight.new(self.n_layers, batch_size, self.n_hidden).zero_())
        
        return hidden
            
        

### Training 

In [20]:
def train(net, data, epochs=10, batch_size=10, seq_length=50, lr=0.001, clip=5, \
          val_frac=0.1, print_every=10, epoch_offset = 0,save_path = "model.pt"):
    '''
        Arguments:
            net: CharRNN network
            data: text data to train the network
            epochs: number of epochs
            batch_size: number of mini-sequences per mini-batch
            seq_length: Number of character steps per mini-batch
            lr: learning rate
            clip: gradient clipping
            val_frac: Fraction of data to hold out for validation
            print_every: number of steps for printing training and validation
    '''
    
    # Making all parameters of the model Double
    net.double()
    
    net.train()
    
    opt = torch.optim.Adam(net.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    
    # creating training and validation data
    val_idx = int(len(data) * (1 - val_frac))
    data , val_data = data[:val_idx], data[val_idx:]
    
    valid_loss_min = np.Inf
    
    if train_on_gpu:
        net.cuda()
        
    counter = 0
    n_chars = len(net.chars)
    for e in range(epoch_offset,  epoch_offset + epochs):
        h = net.init_hidden(batch_size)
        
        for x, y in get_batches(data, batch_size, seq_length):
            counter += 1
            x = one_hot_encode(x, n_chars)
            inputs, targets = torch.from_numpy(x), torch.from_numpy(y)
            
            if train_on_gpu:
                inputs , targets = inputs.cuda(), targets.cuda()
                
            #creating new variables for the hidden state, otherwise
            # we'd backprop through the entire training history
            h = tuple([each.data for each in h])
            
            net.zero_grad()
            
            output, h = net(inputs, h)
            
            loss = criterion(output, targets.view(batch_size*seq_length))
            loss.backward()
            
            nn.utils.clip_grad_norm_(net.parameters(), clip)
            opt.step()
            
            if counter % print_every == 0:
                val_h = net.init_hidden(batch_size)
                val_losses=[]
                net.eval()
                for x,y in get_batches(val_data, batch_size, seq_length):
                    x = one_hot_encode(x, n_chars)
                    x,y = torch.from_numpy(x), torch.from_numpy(y)
                    
                    val_h = tuple([each.data for each in val_h])
                    
                    inputs, targets = x,y
                    if(train_on_gpu):
                        inputs, targets = inputs.cuda(), targets.cuda()
                        
                    output, val_h = net(inputs, val_h)
                    val_loss = criterion(output, targets.view(batch_size*seq_length))
                    
                    val_losses.append(val_loss.item())
            
                net.train()
                
                new_val_loss = np.mean(val_losses)
                
                if new_val_loss < valid_loss_min:
                    valid_loss_min = new_val_loss
                    torch.save(net.state_dict(), save_path)
                
                print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.4f}...".format(loss.item()),
                  "Val Loss: {:.4f}".format(new_val_loss))
            
            

In [21]:
n_hidden = 512
n_layers=2

net = CharRNN(chars, n_hidden, n_layers)
print(net)

CharRNN(
  (lstm): LSTM(83, 512, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=512, out_features=83, bias=True)
)


In [22]:
batch_size = 128
seq_length = 100
n_epochs = 20

In [23]:

train(net, encoded, epochs=n_epochs, batch_size= batch_size, \
      seq_length= seq_length, lr=0.001, print_every=10)


Epoch: 1/20... Step: 10... Loss: 3.2934... Val Loss: 3.2490
Epoch: 1/20... Step: 20... Loss: 3.1470... Val Loss: 3.1393
Epoch: 1/20... Step: 30... Loss: 3.1443... Val Loss: 3.1257
Epoch: 1/20... Step: 40... Loss: 3.1156... Val Loss: 3.1191
Epoch: 1/20... Step: 50... Loss: 3.1471... Val Loss: 3.1178
Epoch: 1/20... Step: 60... Loss: 3.1196... Val Loss: 3.1162
Epoch: 1/20... Step: 70... Loss: 3.1104... Val Loss: 3.1154
Epoch: 1/20... Step: 80... Loss: 3.1254... Val Loss: 3.1130
Epoch: 1/20... Step: 90... Loss: 3.1240... Val Loss: 3.1071
Epoch: 1/20... Step: 100... Loss: 3.1062... Val Loss: 3.0989
Epoch: 1/20... Step: 110... Loss: 3.0865... Val Loss: 3.0667
Epoch: 1/20... Step: 120... Loss: 2.9947... Val Loss: 2.9856
Epoch: 1/20... Step: 130... Loss: 2.9743... Val Loss: 2.9341
Epoch: 2/20... Step: 140... Loss: 2.8703... Val Loss: 2.8226
Epoch: 2/20... Step: 150... Loss: 2.7537... Val Loss: 2.7147
Epoch: 2/20... Step: 160... Loss: 2.6497... Val Loss: 2.6166
Epoch: 2/20... Step: 170... Loss:

Epoch: 10/20... Step: 1350... Loss: 1.4076... Val Loss: 1.4387
Epoch: 10/20... Step: 1360... Loss: 1.4167... Val Loss: 1.4364
Epoch: 10/20... Step: 1370... Loss: 1.3937... Val Loss: 1.4366
Epoch: 10/20... Step: 1380... Loss: 1.4342... Val Loss: 1.4304
Epoch: 10/20... Step: 1390... Loss: 1.4441... Val Loss: 1.4278
Epoch: 11/20... Step: 1400... Loss: 1.4534... Val Loss: 1.4299
Epoch: 11/20... Step: 1410... Loss: 1.4605... Val Loss: 1.4258
Epoch: 11/20... Step: 1420... Loss: 1.4489... Val Loss: 1.4208
Epoch: 11/20... Step: 1430... Loss: 1.4122... Val Loss: 1.4230
Epoch: 11/20... Step: 1440... Loss: 1.4362... Val Loss: 1.4172
Epoch: 11/20... Step: 1450... Loss: 1.3694... Val Loss: 1.4203
Epoch: 11/20... Step: 1460... Loss: 1.3932... Val Loss: 1.4161
Epoch: 11/20... Step: 1470... Loss: 1.3848... Val Loss: 1.4191
Epoch: 11/20... Step: 1480... Loss: 1.4043... Val Loss: 1.4135
Epoch: 11/20... Step: 1490... Loss: 1.4001... Val Loss: 1.4142
Epoch: 11/20... Step: 1500... Loss: 1.3797... Val Loss:

Epoch: 20/20... Step: 2660... Loss: 1.2602... Val Loss: 1.3032
Epoch: 20/20... Step: 2670... Loss: 1.2596... Val Loss: 1.3002
Epoch: 20/20... Step: 2680... Loss: 1.2579... Val Loss: 1.2985
Epoch: 20/20... Step: 2690... Loss: 1.2449... Val Loss: 1.3035
Epoch: 20/20... Step: 2700... Loss: 1.2532... Val Loss: 1.2979
Epoch: 20/20... Step: 2710... Loss: 1.2266... Val Loss: 1.2982
Epoch: 20/20... Step: 2720... Loss: 1.2219... Val Loss: 1.3026
Epoch: 20/20... Step: 2730... Loss: 1.2182... Val Loss: 1.3017
Epoch: 20/20... Step: 2740... Loss: 1.2190... Val Loss: 1.3026
Epoch: 20/20... Step: 2750... Loss: 1.2259... Val Loss: 1.3020
Epoch: 20/20... Step: 2760... Loss: 1.2165... Val Loss: 1.3010
Epoch: 20/20... Step: 2770... Loss: 1.2574... Val Loss: 1.2999
Epoch: 20/20... Step: 2780... Loss: 1.2761... Val Loss: 1.2996


In [24]:
torch.save(net.state_dict(),"char_level_lstm.pt")