## Character-Level LSTM in PyTorch

This network is based off of Andrej Karpathy's post on RNNs and implementation in Torch.The network will train character by character on some text, then generate new text character by character. As an example, I will train on Anna Karenina.  Below is the general architecture of the character-wise RNN.

<img src="assets/charseq.jpeg" width="500">

In [1]:
import torch
import numpy as np
from torch import nn
import torch.nn.functional as F
from collections import Counter

In [2]:
with open('data/anna.txt', 'r') as f:
    text = f.read()
##text = text.lower()

In [3]:
## There are only 83 unique charecters here in my text including caps and lower case
## So I am not using any dimension reduction here 
len(set(text))

83

In [4]:
words = tuple(set(text))
words_count = Counter(words)
##sorted_words = sorted(words_count, key= words_count.get, reverse = True)

## sorting not required here since this is a char RNN

int2char = dict(enumerate(words_count))
char2int = {ch:ii for ii, ch in int2char.items()}

In [5]:
## encode the text

encoded = []
for ch in text:
    encoded.append(char2int[ch])
encoded = np.array(encoded)

In [6]:
## Converting each charecter into one hot vector

In [7]:
def one_hot(arr,n_labels):
    ##rows = np.multiply(arr.shape[0],arr.shape[1])
    rows = np.multiply(*arr.shape)
    one_hot = np.zeros((rows , n_labels ),dtype = np.int32)
    one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1
    one_hot= one_hot.reshape(arr.shape[0],arr.shape[1],n_labels)
    return one_hot

In [8]:
test_seq = np.array([[1,2,3]])
one_hot(test_seq, 5)

array([[[0, 1, 0, 0, 0],
        [0, 0, 1, 0, 0],
        [0, 0, 0, 1, 0]]])

###### We just want batches to fill all the batches. So the total length should be N * M * K
###### where N = Batch_size , M = Seq_len K = No of Batches


In [9]:
def get_batches(arr, batch_size, seq_len):
    total_batches = len(arr)//(batch_size*seq_len)
    total_arr_length = batch_size*seq_len*total_batches  ## N * M * K
    
    arr =  arr [ : total_arr_length]
    
    ## We will use Batch First =True going forward
    ## This would mean the no of columns is equal to M*K
    ## So in one loop we will just yield M columns...and this would happen K times
    arr = arr.reshape(batch_size, -1)
    
    for n in (0, arr.shape[1], seq_len):
        x = arr[:,n:n+seq_len]
        y = np.zeros_like(x)
        print(y.shape)
        try:
            y[:,:-1] = x[:,1:]
            y[:,-1] = arr[:,n+seq_len]
        except IndexError: ## for the last Seq_length
            y[:,:-1] = x[:,1:]
            y[:,-1] = arr[:,0]
            
        yield x,y

In [10]:
def get_batches(arr, batch_size, seq_length):
    '''Create a generator that returns batches of size
       batch_size x seq_length from arr.
       
       Arguments
       ---------
       arr: Array you want to make batches from
       batch_size: Batch size, the number of sequences per batch
       seq_length: Number of encoded chars in a sequence
    '''
    
    batch_size_total = batch_size * seq_length
    # total number of batches we can make
    n_batches = len(arr)//batch_size_total
    
    # Keep only enough characters to make full batches
    arr = arr[:n_batches * batch_size_total]
    # Reshape into batch_size rows
    arr = arr.reshape((batch_size, -1))
    
    # iterate through the array, one sequence at a time
    for n in range(0, arr.shape[1], seq_length):
        # The features
        x = arr[:, n:n+seq_length]
        # The targets, shifted by one
        y = np.zeros_like(x)
        try:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, n+seq_length]
        except IndexError:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, 0]
        yield x, y

In [11]:
##test get_batches 

batches = get_batches(encoded, 8,100)
x,y = next(batches)

# printing out the first 10 items in a sequence
print('x\n', x[:10, :10])
print('\ny\n', y[:10, :10])

x
 [[11 50 43 19 33 44  5 51 27 28]
 [52 51 33 50 44 51 33  5 43 82]
 [66 82 58  8 51 50 82 68 51 23]
 [76 58 44 23 51 43 58 17 51 33]
 [44 26 51 60 37 33 51 50 44 51]
 [51 18 66 44 59 43 58 17  5 76]
 [51  3 76 33 50 51  3 44 31 43]
 [52 43  8 44 51 82 58 51 53 50]]

y
 [[50 43 19 33 44  5 51 27 28 28]
 [51 33 50 44 51 33  5 43 82 58]
 [82 58  8 51 50 82 68 51 23 76]
 [58 44 23 51 43 58 17 51 33 76]
 [26 51 60 37 33 51 50 44 51 50]
 [18 66 44 59 43 58 17  5 76 69]
 [ 3 76 33 50 51  3 44 31 43 37]
 [43  8 44 51 82 58 51 53 50 82]]


In [12]:
gpu = torch.cuda.is_available()
if gpu:
    print('Training on GPU')
else:
    print('Not Training on GPU')

Training on GPU


In [13]:
#### Defining the network with PyTorch

In [14]:
class CharRNN(nn. Module):
    def __init__(self,tokens, n_hidden=256, n_layers=2,drop_prob = 0.5,lr = 0.001):
        super().__init__()
        ## Class variables
        self.drop_prob = drop_prob
        self.lr = lr
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        
        ## create class dictionaries 
        self.chars = tokens
        self.int2char = dict(enumerate(self.chars))
        self.char2int = {ch:ii for ii,ch in self.int2char.items()}
        
        ## Create the layers 
        ## I am not using any embedding here 
        input_size = len(self.chars)
        output_size = len(self.chars)
        self.lstm = nn.LSTM(input_size, self.n_hidden, self.n_layers, 
                           dropout = drop_prob, batch_first = True)
        self.dropout = nn.Dropout(drop_prob)
        self.FC = nn.Linear(self.n_hidden, output_size)

    def forward(self,x,hidden):
        
        r_out,hidden = self.lstm(x,hidden)
        r_out = self.dropout(r_out)
        
        r_out = r_out.contiguous().reshape(-1,self.n_hidden)
        
        out = self.FC(r_out)
        
        return out,hidden
    def init_hidden(self,batch_size):
        
        ### initialized to zero, for hidden state and cell state of LSTM
        
        weights = next(self.parameters()).data
        
        if gpu: ## Create Cuda tensors
            hidden = (weights.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda(),
                  weights.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda())
        else:
            hidden = (weights.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda(),
                  weights.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda())
        
        return hidden

In [15]:
## Build the train function

In [16]:
def train(model,data,epochs =1,batch_size = 10,seq_len =10,lr = 0.001,clip = 5, val_frac= 0.1, print_freq = 10):
    
    if gpu:
        model.cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr = lr)
    ##optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    model_name = 'char_rnn.net'
    
    
    ##Create train and Validation set 
    val_idx = int(len(data)*(1- val_frac))
    train_data, valid_data = data[:val_idx], data[val_idx:]
    
    #### counter and vocab size
    vocab_size = len(model.chars)
    minimum_val_loss =np.inf
    ##minimum_val_loss =3.1185
    
    
    
    for epoch in range(epochs):
        counter = 0
        train
        
        h = model.init_hidden(batch_size)
        for x,y in get_batches(train_data,batch_size,seq_len):
            model.train()
            counter +=1
            x = one_hot(x,vocab_size)
            inputs = torch.from_numpy(x)
            inputs = inputs.type(torch.FloatTensor)
            labels = torch.from_numpy(y)
            labels = labels.type(torch.FloatTensor)
            if gpu:
                inputs = inputs.cuda()
                labels = labels.cuda()
            h = tuple([each.data for each in h]) ## this is already in cuda if GPU is available
            
            model.zero_grad()
            output,h = model(inputs,h)
            labels = labels.type(torch.cuda.LongTensor)
            loss = criterion(output,labels.view(batch_size*seq_len))
            loss.backward()            
            nn.utils.clip_grad_norm_(model.parameters(), clip)

            optimizer.step()
            
            if counter% print_freq == 0:
                model.eval()
                val_losses =[]
                val_h = model.init_hidden(batch_size)
                for val_x,val_y in get_batches(valid_data,batch_size,seq_len):
                    val_x= one_hot(val_x,vocab_size)
                    inputs, labels = torch.from_numpy(val_x), torch.from_numpy(val_y)
                    inputs = inputs.type(torch.FloatTensor)
                    labels = labels.type(torch.FloatTensor)
                    inputs,labels = inputs.cuda(),labels.cuda()
                    val_h = tuple([each.data for each in val_h])
                    output,val_h = model(inputs,val_h)
                    labels = labels.type(torch.cuda.LongTensor)
                    loss= criterion(output,labels.view(batch_size*seq_len))
                    val_losses.append(loss.item())
  
                print("Epoch: {}/{}...".format(epoch+1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.4f}...".format(loss.item()),
                      "Val Loss: {:.4f}".format(np.mean(val_losses)))
    
                ## Save Model if validation loss goes down
                if (np.mean(val_losses) < minimum_val_loss):
                    minimum_val_loss = np.mean(val_losses)
                    checkpoint = {'n_hidden' : model.n_hidden,
                                  'n_layers' : model.n_layers,
                                   'state_dict': model.state_dict(),
                                   'tokens' :model.chars}
                    print('Saving Model...')
        
                with open(model_name, 'wb') as f:
                    torch.save(checkpoint,f)


In [17]:
##Hyperparameters

n_hidden= 512
n_layers=2
drop_prob = 0.5
lr = 0.001

model = CharRNN(words,n_hidden,n_layers,drop_prob,lr)
print(model)

CharRNN(
  (lstm): LSTM(83, 512, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5)
  (FC): Linear(in_features=512, out_features=83, bias=True)
)


In [29]:
data= encoded
epochs = 20
batch_size = 128
seq_len = 100
lr = 0.001
clip = 5
val_frac= 0.1
##print_freq = 10

##Train the model
train(model,data,epochs,batch_size,seq_len,lr,clip, val_frac= 0.1 )


Epoch: 1/20... Step: 10... Loss: 1.5611... Val Loss: 1.5153
Saving Model...
Epoch: 1/20... Step: 20... Loss: 1.5503... Val Loss: 1.4999
Saving Model...
Epoch: 1/20... Step: 30... Loss: 1.5450... Val Loss: 1.4926
Saving Model...
Epoch: 1/20... Step: 40... Loss: 1.5386... Val Loss: 1.4889
Saving Model...
Epoch: 1/20... Step: 50... Loss: 1.5375... Val Loss: 1.4872
Saving Model...
Epoch: 1/20... Step: 60... Loss: 1.5360... Val Loss: 1.4828
Saving Model...
Epoch: 1/20... Step: 70... Loss: 1.5350... Val Loss: 1.4835
Epoch: 1/20... Step: 80... Loss: 1.5373... Val Loss: 1.4822
Saving Model...
Epoch: 1/20... Step: 90... Loss: 1.5262... Val Loss: 1.4763
Saving Model...
Epoch: 1/20... Step: 100... Loss: 1.5266... Val Loss: 1.4743
Saving Model...
Epoch: 1/20... Step: 110... Loss: 1.5292... Val Loss: 1.4740
Saving Model...
Epoch: 1/20... Step: 120... Loss: 1.5269... Val Loss: 1.4703
Saving Model...
Epoch: 1/20... Step: 130... Loss: 1.5204... Val Loss: 1.4689
Saving Model...
Epoch: 2/20... Step: 10.

Epoch: 10/20... Step: 30... Loss: 1.3652... Val Loss: 1.3046
Saving Model...
Epoch: 10/20... Step: 40... Loss: 1.3716... Val Loss: 1.3087
Epoch: 10/20... Step: 50... Loss: 1.3786... Val Loss: 1.3111
Epoch: 10/20... Step: 60... Loss: 1.3806... Val Loss: 1.3138
Epoch: 10/20... Step: 70... Loss: 1.3746... Val Loss: 1.3108
Epoch: 10/20... Step: 80... Loss: 1.3796... Val Loss: 1.3130
Epoch: 10/20... Step: 90... Loss: 1.3746... Val Loss: 1.3073
Epoch: 10/20... Step: 100... Loss: 1.3723... Val Loss: 1.3051
Epoch: 10/20... Step: 110... Loss: 1.3691... Val Loss: 1.3034
Saving Model...
Epoch: 10/20... Step: 120... Loss: 1.3722... Val Loss: 1.3027
Saving Model...
Epoch: 10/20... Step: 130... Loss: 1.3677... Val Loss: 1.3004
Saving Model...
Epoch: 11/20... Step: 10... Loss: 1.3715... Val Loss: 1.3029
Epoch: 11/20... Step: 20... Loss: 1.3721... Val Loss: 1.3024
Epoch: 11/20... Step: 30... Loss: 1.3683... Val Loss: 1.3011
Epoch: 11/20... Step: 40... Loss: 1.3757... Val Loss: 1.3079
Epoch: 11/20... S

Epoch: 19/20... Step: 120... Loss: 1.3386... Val Loss: 1.2627
Epoch: 19/20... Step: 130... Loss: 1.3310... Val Loss: 1.2568
Epoch: 20/20... Step: 10... Loss: 1.3253... Val Loss: 1.2554
Saving Model...
Epoch: 20/20... Step: 20... Loss: 1.3376... Val Loss: 1.2598
Epoch: 20/20... Step: 30... Loss: 1.3383... Val Loss: 1.2600
Epoch: 20/20... Step: 40... Loss: 1.3346... Val Loss: 1.2572
Epoch: 20/20... Step: 50... Loss: 1.3317... Val Loss: 1.2582
Epoch: 20/20... Step: 60... Loss: 1.3340... Val Loss: 1.2559
Epoch: 20/20... Step: 70... Loss: 1.3359... Val Loss: 1.2606
Epoch: 20/20... Step: 80... Loss: 1.3403... Val Loss: 1.2621
Epoch: 20/20... Step: 90... Loss: 1.3348... Val Loss: 1.2582
Epoch: 20/20... Step: 100... Loss: 1.3368... Val Loss: 1.2600
Epoch: 20/20... Step: 110... Loss: 1.3414... Val Loss: 1.2638
Epoch: 20/20... Step: 120... Loss: 1.3377... Val Loss: 1.2595
Epoch: 20/20... Step: 130... Loss: 1.3305... Val Loss: 1.2574


##### Loading the saved model


In [32]:
 
with open('char_rnn.net', 'rb') as f:
    checkpoint = torch.load(f)
model = CharRNN(checkpoint['tokens'],n_hidden =checkpoint['n_hidden'],n_layers=checkpoint['n_layers'])
model.load_state_dict(checkpoint['state_dict'])

In [33]:
## Prediction using this model

In [34]:
def predict(model, char,h = None, top_k = None):
    ## char is the input charecter here
    ##model.eval() (while call eval during priming)
    vocab_size = len(model.chars)
    x = np.array([[char2int[char]]]) ## multipli dimentional
    x = one_hot(x,vocab_size)
    x = torch.from_numpy(x)
    x = x.type(torch.FloatTensor)
    if gpu:
        x= x.cuda()
    h = tuple([each.data for each in h])
    
    output, h = model(x,h)
    p= F.softmax(output,dim =1).data
    if gpu:
        p = p.cpu()
    if top_k ==None:
        top_ch = np.arange(len(net.chars))
    else:
        p, top_ch = p.topk(top_k)
        top_ch= top_ch.numpy().squeeze()
    p = p.numpy().squeeze()
    ch = np.random.choice(top_ch, p = p/p.sum())
    character = int2char[ch]
    return character,h

In [35]:
## Priming and generating Text

In [36]:
def generate_text(model, size, prime='The', top_k=None):
    if gpu:
        model.cuda()
    chars = [ch for ch in prime]
    h = model.init_hidden(1)  ## Batch size is 1 
    model.eval()
    for ch in prime:
        char,h = predict(model, ch,h , top_k = top_k)
    chars.append(char) ## just add the last one after prime
    
    for i in range(size):
        char,h = predict(model, chars[-1],h , top_k = top_k)
        chars.append(char)
        
    return ''.join(chars)
    

In [49]:
generated_text =generate_text(model, size=1000, prime='You are', top_k=5)

In [50]:
print(generated_text)

You are to blame in
the morning and happened, and what has said: "You will be delighted at your
acquainteninc."

"I see it in her eyes as I see anything."

"Yes, I should have been to be to care to see, to speak of him. But
I see it's a care. Take their marshal to the thing of it."

"Yes," he said. "I have not meant. You see it, are they have so partional of
me, then I settled him about her in work in the carrying myself of
all sours, and have that to make out when this is so much. I so good
to be a look of still. And these all mind towards the story of his marriage,
when I don't understand him. I've been an arm or the child, but what a discussion with
the there there is in those though they did never come or work
in his brother's feeling at the book as I should have so much," said
Levin, with the secretary who came to her trough, and as a conditions of
those chest--that was saying that he could not say in a sincerity, who show
them, and without standing that he had terror and that she

##### HAHA!! It was fun :)