# Character Level LSTM in PyTorch

__Statistical Language Model__: A trained model to predict the next word/character given all previous words/characters.

__Character-Level Language Model__: The main task of the char-level language model is to predict the next character given all previous characters in a sequence of data, i.e. generates text character by character. 


In [1]:
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F

In [2]:
torch.cuda.is_available()

True

In [3]:
with open('data/anna.txt', 'r') as f:
    text = f.read()

In [4]:
text[:1000]

"Chapter 1\n\n\nHappy families are all alike; every unhappy family is unhappy in its own\nway.\n\nEverything was in confusion in the Oblonskys' house. The wife had\ndiscovered that the husband was carrying on an intrigue with a French\ngirl, who had been a governess in their family, and she had announced to\nher husband that she could not go on living in the same house with him.\nThis position of affairs had now lasted three days, and not only the\nhusband and wife themselves, but all the members of their family and\nhousehold, were painfully conscious of it. Every person in the house\nfelt that there was no sense in their living together, and that the\nstray people brought together by chance in any inn had more in common\nwith one another than they, the members of the family and household of\nthe Oblonskys. The wife did not leave her own room, the husband had not\nbeen at home for three days. The children ran wild all over the house;\nthe English governess quarreled with the housekeep

### Encoding the Text

In [5]:
## Encoding the text ## 
chars = tuple(set(text))
int2char = dict(enumerate(chars))
char2int = {ch:ii for ii,ch in int2char.items()}
encoded = np.array([char2int[ch] for ch in text])

In [6]:
encoded[:100]

array([65, 22,  0,  1, 37, 76, 25, 34, 13, 57, 57, 57, 33,  0,  1,  1, 50,
       34, 51,  0, 38, 21, 36, 21, 76, 23, 34,  0, 25, 76, 34,  0, 36, 36,
       34,  0, 36, 21, 41, 76, 52, 34, 76, 18, 76, 25, 50, 34, 74, 15, 22,
        0,  1,  1, 50, 34, 51,  0, 38, 21, 36, 50, 34, 21, 23, 34, 74, 15,
       22,  0,  1,  1, 50, 34, 21, 15, 34, 21, 37, 23, 34, 42, 70, 15, 57,
       70,  0, 50, 43, 57, 57, 17, 18, 76, 25, 50, 37, 22, 21, 15])

### Data Pre-Processing

In [7]:
def one_hot_encode(arr, n_labels):

    #n_labels = max(arr.flatten()) + 1
    
    one_hot = np.zeros(shape = (np.multiply(*arr.shape) , n_labels))
    
    one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1
    
    one_hot = one_hot.reshape((*arr.shape, n_labels))
    
    return one_hot

In [8]:
test_seq = np.array([[1,2,3,7],[5,3,2,8]])
one_hot = one_hot_encode(test_seq,9)

In [9]:
one_hot

array([[[0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0.]],

       [[0., 0., 0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1.]]])

### Making Training mini-batches

In [10]:
# N: batch size
# M: sequence length
# K: total number of batches

def get_batches(arr, batch_size, seq_length):
    
    # Number of matches we can make from the input array
    n_batches = len(arr) // (batch_size * seq_length)
    
    # keeping enoough character to make full batches
    arr = arr[:n_batches * batch_size * seq_length]
    
    arr = arr.reshape((batch_size, -1))
    
    # iterating over the batches
    for n in range(0, arr.shape[1] , seq_length):
        x = arr[:, n:n+seq_length]
        y = np.zeros_like(x)
        try:
            y[:,:-1], y[:,-1] = x[:, 1:], arr[:, n+seq_length]
        except IndexError:
            y[:,:-1], y[:,-1] = x[:, 1:], arr[:,0]
        
        yield x,y

In [11]:
# testing the get_batch function

batches = get_batches(encoded, 8, 50)
x, y = next(batches)

In [12]:
x, y

(array([[65, 22,  0,  1, 37, 76, 25, 34, 13, 57, 57, 57, 33,  0,  1,  1,
         50, 34, 51,  0, 38, 21, 36, 21, 76, 23, 34,  0, 25, 76, 34,  0,
         36, 36, 34,  0, 36, 21, 41, 76, 52, 34, 76, 18, 76, 25, 50, 34,
         74, 15],
        [23, 42, 15, 34, 37, 22,  0, 37, 34,  0, 37, 37, 25,  0, 63, 37,
         76, 81, 34, 22, 76, 25, 34,  0, 37, 37, 76, 15, 37, 21, 42, 15,
         34, 70,  0, 23, 34, 22, 76, 25, 34, 22, 74, 23, 24,  0, 15, 81,
         43, 34],
        [76, 15, 81, 34, 42, 25, 34,  0, 34, 51, 42, 76, 56, 34, 22, 76,
         34,  0, 18, 42, 21, 81, 76, 81, 34, 22, 21, 23, 34, 51,  0, 37,
         22, 76, 25, 43, 34, 33, 76, 57, 36, 42, 42, 41, 76, 81, 34, 25,
         42, 74],
        [23, 34, 37, 22, 76, 34, 63, 22, 21, 76, 51, 34, 37, 22, 42, 74,
         71, 22, 34, 22, 21, 81, 81, 76, 15, 57, 21, 15, 37, 76, 25, 76,
         23, 37, 34, 42, 51, 34, 22, 21, 23, 34, 36, 21, 51, 76, 56, 34,
         42, 51],
        [34, 23,  0, 70, 34, 22, 76, 25, 34, 37, 76,

### Defining the LSTM Network

In [13]:
train_on_gpu = torch.cuda.is_available()

In [14]:
train_on_gpu

True

In [15]:
class CharRNN(nn.Module):
    def __init__(self, chars, n_hidden = 256, n_layers=2, drop_prob=0.5, lr=0.001):
        
        super().__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr
        self.chars = chars
        
        self.int2chars = dict(enumerate(self.chars))
        self.char2int = {ch : ii for ii,ch in self.int2chars.items()}
        
        self.lstm = nn.LSTM(input_size = len(self.chars), hidden_size = n_hidden,  num_layers = n_layers, \
                             dropout = self.drop_prob, batch_first = True)
        
        self.dropout = nn.Dropout(self.drop_prob)
        
        self.fc = nn.Linear(self.n_hidden, len(self.chars))
    
    def forward(self, x, hidden):
        
        r_output, hidden = self.lstm(x, hidden)
        out = self.dropout(r_output)
        out = out.contiguous().view(-1, self.n_hidden)
        out = self.fc(out)
        
        return out, hidden
        
    def init_hidden(self, batch_size):
        # creating two new tensors with size n_layers * batch_size * n_hidden,
        # initialized to zero, for hidden state and cell state of the LSTM
        
        weight = next(self.parameters()).data
        
        if(train_on_gpu):
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda(),
                      weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda())
            
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_(),
                      weight.new(self.n_layers, batch_size, self.n_hidden).zero_())
        
        return hidden
            
        

### Training 

In [16]:
def train(net, data, epochs=10, batch_size=10, seq_length=50, lr=0.001, clip=5, \
          val_frac=0.1, print_every=10, epoch_offset = 0,save_path = "model.pt"):
    '''
        Arguments:
            net: CharRNN network
            data: text data to train the network
            epochs: number of epochs
            batch_size: number of mini-sequences per mini-batch
            seq_length: Number of character steps per mini-batch
            lr: learning rate
            clip: gradient clipping
            val_frac: Fraction of data to hold out for validation
            print_every: number of steps for printing training and validation
    '''
    
    # Making all parameters of the model Double
    net.double()
    
    net.train()
    
    opt = torch.optim.Adam(net.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    
    # creating training and validation data
    val_idx = int(len(data) * (1 - val_frac))
    data , val_data = data[:val_idx], data[val_idx:]
    
    valid_loss_min = np.Inf
    
    if train_on_gpu:
        net.cuda()
        
    counter = 0
    n_chars = len(net.chars)
    for e in range(epoch_offset,  epoch_offset + epochs):
        h = net.init_hidden(batch_size)
        
        for x, y in get_batches(data, batch_size, seq_length):
            counter += 1
            x = one_hot_encode(x, n_chars)
            inputs, targets = torch.from_numpy(x), torch.from_numpy(y)
            
            if train_on_gpu:
                inputs , targets = inputs.cuda(), targets.cuda()
                
            #creating new variables for the hidden state, otherwise
            # we'd backprop through the entire training history
            h = tuple([each.data for each in h])
            
            net.zero_grad()
            
            output, h = net(inputs, h)
            
            loss = criterion(output, targets.view(batch_size*seq_length))
            loss.backward()
            
            nn.utils.clip_grad_norm_(net.parameters(), clip)
            opt.step()
            
            if counter % print_every == 0:
                val_h = net.init_hidden(batch_size)
                val_losses=[]
                net.eval()
                for x,y in get_batches(val_data, batch_size, seq_length):
                    x = one_hot_encode(x, n_chars)
                    x,y = torch.from_numpy(x), torch.from_numpy(y)
                    
                    val_h = tuple([each.data for each in val_h])
                    
                    inputs, targets = x,y
                    if(train_on_gpu):
                        inputs, targets = inputs.cuda(), targets.cuda()
                        
                    output, val_h = net(inputs, val_h)
                    val_loss = criterion(output, targets.view(batch_size*seq_length))
                    
                    val_losses.append(val_loss.item())
            
                net.train()
                
                new_val_loss = np.mean(val_losses)
                
                if new_val_loss < valid_loss_min:
                    valid_loss_min = new_val_loss
                    torch.save(net.state_dict(), save_path)
                
                print("Epoch: {}...".format(e+1),
                  "Step: {}...".format(counter),
                  "Loss: {:.4f}...".format(loss.item()),
                  "Val Loss: {:.4f}".format(new_val_loss))
            
            

In [17]:
n_hidden = 512
n_layers=2

net = CharRNN(chars, n_hidden, n_layers)
print(net)

CharRNN(
  (lstm): LSTM(83, 512, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=512, out_features=83, bias=True)
)


In [18]:
batch_size = 128
seq_length = 100
n_epochs = 20

In [23]:

train(net, encoded, epochs=n_epochs, batch_size= batch_size, \
      seq_length= seq_length, lr=0.001, print_every=10)


Epoch: 1/20... Step: 10... Loss: 3.2934... Val Loss: 3.2490
Epoch: 1/20... Step: 20... Loss: 3.1470... Val Loss: 3.1393
Epoch: 1/20... Step: 30... Loss: 3.1443... Val Loss: 3.1257
Epoch: 1/20... Step: 40... Loss: 3.1156... Val Loss: 3.1191
Epoch: 1/20... Step: 50... Loss: 3.1471... Val Loss: 3.1178
Epoch: 1/20... Step: 60... Loss: 3.1196... Val Loss: 3.1162
Epoch: 1/20... Step: 70... Loss: 3.1104... Val Loss: 3.1154
Epoch: 1/20... Step: 80... Loss: 3.1254... Val Loss: 3.1130
Epoch: 1/20... Step: 90... Loss: 3.1240... Val Loss: 3.1071
Epoch: 1/20... Step: 100... Loss: 3.1062... Val Loss: 3.0989
Epoch: 1/20... Step: 110... Loss: 3.0865... Val Loss: 3.0667
Epoch: 1/20... Step: 120... Loss: 2.9947... Val Loss: 2.9856
Epoch: 1/20... Step: 130... Loss: 2.9743... Val Loss: 2.9341
Epoch: 2/20... Step: 140... Loss: 2.8703... Val Loss: 2.8226
Epoch: 2/20... Step: 150... Loss: 2.7537... Val Loss: 2.7147
Epoch: 2/20... Step: 160... Loss: 2.6497... Val Loss: 2.6166
Epoch: 2/20... Step: 170... Loss:

Epoch: 10/20... Step: 1350... Loss: 1.4076... Val Loss: 1.4387
Epoch: 10/20... Step: 1360... Loss: 1.4167... Val Loss: 1.4364
Epoch: 10/20... Step: 1370... Loss: 1.3937... Val Loss: 1.4366
Epoch: 10/20... Step: 1380... Loss: 1.4342... Val Loss: 1.4304
Epoch: 10/20... Step: 1390... Loss: 1.4441... Val Loss: 1.4278
Epoch: 11/20... Step: 1400... Loss: 1.4534... Val Loss: 1.4299
Epoch: 11/20... Step: 1410... Loss: 1.4605... Val Loss: 1.4258
Epoch: 11/20... Step: 1420... Loss: 1.4489... Val Loss: 1.4208
Epoch: 11/20... Step: 1430... Loss: 1.4122... Val Loss: 1.4230
Epoch: 11/20... Step: 1440... Loss: 1.4362... Val Loss: 1.4172
Epoch: 11/20... Step: 1450... Loss: 1.3694... Val Loss: 1.4203
Epoch: 11/20... Step: 1460... Loss: 1.3932... Val Loss: 1.4161
Epoch: 11/20... Step: 1470... Loss: 1.3848... Val Loss: 1.4191
Epoch: 11/20... Step: 1480... Loss: 1.4043... Val Loss: 1.4135
Epoch: 11/20... Step: 1490... Loss: 1.4001... Val Loss: 1.4142
Epoch: 11/20... Step: 1500... Loss: 1.3797... Val Loss:

Epoch: 20/20... Step: 2660... Loss: 1.2602... Val Loss: 1.3032
Epoch: 20/20... Step: 2670... Loss: 1.2596... Val Loss: 1.3002
Epoch: 20/20... Step: 2680... Loss: 1.2579... Val Loss: 1.2985
Epoch: 20/20... Step: 2690... Loss: 1.2449... Val Loss: 1.3035
Epoch: 20/20... Step: 2700... Loss: 1.2532... Val Loss: 1.2979
Epoch: 20/20... Step: 2710... Loss: 1.2266... Val Loss: 1.2982
Epoch: 20/20... Step: 2720... Loss: 1.2219... Val Loss: 1.3026
Epoch: 20/20... Step: 2730... Loss: 1.2182... Val Loss: 1.3017
Epoch: 20/20... Step: 2740... Loss: 1.2190... Val Loss: 1.3026
Epoch: 20/20... Step: 2750... Loss: 1.2259... Val Loss: 1.3020
Epoch: 20/20... Step: 2760... Loss: 1.2165... Val Loss: 1.3010
Epoch: 20/20... Step: 2770... Loss: 1.2574... Val Loss: 1.2999
Epoch: 20/20... Step: 2780... Loss: 1.2761... Val Loss: 1.2996


In [24]:
torch.save(net.state_dict(),"char_level_lstm.pt")

In [26]:
net.load_state_dict(torch.load("char_level_lstm.pt"))

<All keys matched successfully>

In [29]:
train(net, encoded, epochs=n_epochs, batch_size=batch_size, seq_length=seq_length, lr=0.001, \
           print_every=10, epoch_offset = 20,save_path = "char_level_lstm.pt")

Epoch: 21... Step: 10... Loss: 1.2595... Val Loss: 1.2996
Epoch: 21... Step: 20... Loss: 1.2836... Val Loss: 1.3003
Epoch: 21... Step: 30... Loss: 1.2748... Val Loss: 1.2965
Epoch: 21... Step: 40... Loss: 1.2438... Val Loss: 1.2938
Epoch: 21... Step: 50... Loss: 1.2566... Val Loss: 1.2989
Epoch: 21... Step: 60... Loss: 1.1864... Val Loss: 1.2957
Epoch: 21... Step: 70... Loss: 1.2219... Val Loss: 1.2964
Epoch: 21... Step: 80... Loss: 1.2092... Val Loss: 1.2990
Epoch: 21... Step: 90... Loss: 1.2404... Val Loss: 1.2961
Epoch: 21... Step: 100... Loss: 1.2278... Val Loss: 1.2972
Epoch: 21... Step: 110... Loss: 1.2098... Val Loss: 1.2976
Epoch: 21... Step: 120... Loss: 1.1938... Val Loss: 1.2982
Epoch: 21... Step: 130... Loss: 1.2347... Val Loss: 1.2939
Epoch: 22... Step: 140... Loss: 1.2773... Val Loss: 1.2944
Epoch: 22... Step: 150... Loss: 1.2365... Val Loss: 1.2990
Epoch: 22... Step: 160... Loss: 1.2368... Val Loss: 1.2908
Epoch: 22... Step: 170... Loss: 1.2598... Val Loss: 1.2930
Epoch:

Epoch: 31... Step: 1400... Loss: 1.1585... Val Loss: 1.2497
Epoch: 31... Step: 1410... Loss: 1.1677... Val Loss: 1.2505
Epoch: 31... Step: 1420... Loss: 1.1787... Val Loss: 1.2510
Epoch: 31... Step: 1430... Loss: 1.1523... Val Loss: 1.2495
Epoch: 31... Step: 1440... Loss: 1.1641... Val Loss: 1.2493
Epoch: 31... Step: 1450... Loss: 1.1041... Val Loss: 1.2485
Epoch: 31... Step: 1460... Loss: 1.1354... Val Loss: 1.2454
Epoch: 31... Step: 1470... Loss: 1.1097... Val Loss: 1.2490
Epoch: 31... Step: 1480... Loss: 1.1401... Val Loss: 1.2533
Epoch: 31... Step: 1490... Loss: 1.1267... Val Loss: 1.2530
Epoch: 31... Step: 1500... Loss: 1.1219... Val Loss: 1.2470
Epoch: 31... Step: 1510... Loss: 1.1160... Val Loss: 1.2456
Epoch: 31... Step: 1520... Loss: 1.1378... Val Loss: 1.2520
Epoch: 32... Step: 1530... Loss: 1.2244... Val Loss: 1.2439
Epoch: 32... Step: 1540... Loss: 1.1479... Val Loss: 1.2502
Epoch: 32... Step: 1550... Loss: 1.1438... Val Loss: 1.2475
Epoch: 32... Step: 1560... Loss: 1.1544.

Epoch: 40... Step: 2770... Loss: 1.1082... Val Loss: 1.2434
Epoch: 40... Step: 2780... Loss: 1.1383... Val Loss: 1.2422


### Checkpoint

In [19]:
net.load_state_dict(torch.load("char_level_lstm.pt"))

<All keys matched successfully>

After training, we'll save the model so we can load it again later if we need to. Here, I'm saving the parameters needed to create the same architecture, the hidden layer hyperparameters, and the next characters.

In [20]:
model_name = 'char_level_lstm.net'

checkpoint = {
    'n_hidden': net.n_hidden,
    'n_layer': net.n_layers,
    'state_dict': net.state_dict(),
    'tokens': net.chars
}

In [25]:
with open(model_name, 'wb') as f:
    torch.save(checkpoint, f)

### Loading a checkpoint

In [26]:
with open('char_level_lstm.net', 'rb') as f:
    checkpoint = torch.load(f)
    
loaded_net = CharRNN(checkpoint['tokens'], n_hidden=checkpoint['n_hidden'],\
                     n_layers=checkpoint['n_layer'])
loaded_net.load_state_dict(checkpoint['state_dict'])

<All keys matched successfully>

### Making Predictions

The output of our RNN is from a fully-connected layer and it outputs a distribution of next-character scores. To actually get the next character, we apply a softmax function, which gives us a probability distribution that we can then sample to predict the next character.

In [49]:
def predict(net, char, h=None, top_k=None):
        ''' Given a character, predict the next character.
            Returns the predicted character and the hidden state.
        '''
        
        # tensor inputs
        x = np.array([[net.char2int[char]]])
        x = one_hot_encode(x, len(net.chars))
        inputs = torch.from_numpy(x)
        
        if(train_on_gpu):
            inputs = inputs.cuda()
        
        # detach hidden state from history
        h = tuple([each.data for each in h])
        # get the output of the model
        out, h = net(inputs, h)

        # get the character probabilities
        p = F.softmax(out, dim=1).data
        if(train_on_gpu):
            p = p.cpu() # move to cpu
        
        # get top characters
        if top_k is None:
            top_ch = np.arange(len(net.chars))
        else:
            p, top_ch = p.topk(top_k)
            top_ch = top_ch.numpy().squeeze()
        
        # select the likely next character with some element of randomness
        p = p.numpy().squeeze()
        char = np.random.choice(top_ch, p=p/p.sum())
        
        # return the encoded value of the predicted char and the hidden state
        return net.int2chars[char], h

In [50]:
def sample(net, size, prime='The', top_k=None):
        
    if(train_on_gpu):
        net.cuda()
    else:
        net.cpu()
    
    net.eval() # eval mode
    
    # First off, run through the prime characters
    chars = [ch for ch in prime]
    h = net.init_hidden(1)
    for ch in prime:
        char, h = predict(net, ch, h, top_k=top_k)

    chars.append(char)
    
    # Now pass in the previous character and get a new one
    for ii in range(size):
        char, h = predict(net, chars[-1], h, top_k=top_k)
        chars.append(char)

    return ''.join(chars)

In [51]:
loaded_net.double()
print(sample(loaded_net, 2000, prime='Anna Levin said', top_k=2))

Anna Levin said;!;NIS;EGU;YMGI7ER@UQV87`GU8@GU;IEGI7EG@I8EGSUISGU;GVQ(!EG7QSGUIN;G@;;7GU;YMGI7EGU;GVI@RI!!GSU;GPY87W8PI!G@Q(7EGQ7GSU;GWQ(7S87`GUQ(@;GQgGSU;GPYQN87W;MGI7EGSUISRSU8@GVI@GIG@SYI7`;GSUI7GI7uQ7;GVUQGVI@GIG@SY;7`SUMGSUISGU;GUIEGK;;7G@QG*(WURSU;G@I*;GS8*;GI7EGSU;GPY87W;@@MGVUQGUIEMGSU;G@SY;7`SUGQgGU8@GU;IYSRSUISGUIEGK;;7G@;;7GSUISGU;GVI@G@QG*(WUGIGWQ7@W8Q(@GSUISGU;GVQ(!EG7QSRUIN;GK;;7G@QG@8*P!;MGI7EGUIEG@Q*;SU87`G@QG*IYY8;EGSUISGU;GVI@G7QSGIG@;WQ7ERSQGU8@GV8g;MGI7EGSQGK;G@QG*IYYuI7`!uGSUISGSU;GPYQg;@@QYGVI@G7;N;YGI7`Y8!uRSQQGSU;G*Q@SG@QYSGQgGSY8`USGSUISGU;GUIEGK;`(7GSUISGSU;G@I*;G@QYSG87GSUISRPQ@8S8Q7GQgGSU;G@I*;MGSUISGSU8@GVI@GIG@87`!;G*I7GVUQGUI@GK;;7GI!!GISGQ7W;GSUISRU8@G@Q(!GVI@GIG!QN;!uG@UI*;GQgGSU;8YG@;7@;GQgGSU;G@I*;G@Q(!EGQgGS8*;%GAU;RPY87W;@@GV;Y;GIG@8!;7W;MGI7EGU;YGU(@KI7E @GgIW;GVI@G@SYI8`USGSQGSU;G@SIS;RQgGSU;G@SY;I*MGVU;Y;GSU;uGUIEGK;;7GIGWU8!EMGI7EGSUISGU;GUIEGK;;7G@QG@SYI8`USRSUISGU8@G@S(E8QGVI@GIG@87W;YGSQGU;I!GU8*@;!gMGSU;G*Q@SGPQYSYI8SMGSQGSU;GWQ(7S87`GUQ(@;R