In [1]:
## Most of the code in this notebook is taken from https://www.kaggle.com/francescapaulin/character-level-lstm-in-pytorch/notebook
## and modified as required


import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
import os
import re


In [2]:
# open text file and read in data as `text`
with open('/data/shakespeare.txt', 'r') as f:
    text = f.read()

In [3]:
def text_lowercase(text): 
    return text.lower() 

def remove_numbers(text): 
    result = re.sub(r'\d+', '', text) 
    return result 

def remove_punctuation(text): 
    punctuation_set = ':' + ';' + ',' + '.' + '!' + '?' + '(' + ')'
    translator = str.maketrans('', '', punctuation_set)
    return text.translate(translator).replace("' ", " ")


text = remove_numbers(text_lowercase(text))
text = text.replace("\n\n                   \n", "")
text = text.replace("                   \n", "")
text = text.replace("  ", "")

seq_length =40

In [4]:
# encode the text and map each character to an integer and vice versa

# we create two dictionaries:
# 1. int2char, which maps integers to characters
# 2. char2int, which maps characters to unique integers
chars = tuple(set(text))
int2char = dict(enumerate(chars))
char2int = {ch: ii for ii, ch in int2char.items()}

# encode the text
encoded = np.array([char2int[ch] for ch in text])

In [5]:
def one_hot_encode(arr, n_labels):
    
    # Initialize the the encoded array
    one_hot = np.zeros((np.multiply(*arr.shape), n_labels), dtype=np.float32)
    
    # Fill the appropriate elements with ones
    one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1.
    
    # Finally reshape it to get back to the original array
    one_hot = one_hot.reshape((*arr.shape, n_labels))
    
    return one_hot

In [12]:
def get_batches(arr, batch_size, seq_length, skip_size=1):
    '''Create a generator that returns batches of size
       batch_size x seq_length from arr.
       
       Arguments
       ---------
       arr: Array you want to make batches from
       batch_size: Batch size, the number of sequences per batch
       seq_length: Number of encoded chars in a sequence
    '''

#     batch_size_total = batch_size * seq_length
    
#     # total number of batches we can make, // integer division, round down
#     n_batches = len(arr)//batch_size_total
#     print(n_batches)
    
#     # Keep only enough characters to make full batches
#     arr = arr[:n_batches * batch_size_total]
#     # Reshape into batch_size rows, n. of first row is the batch size, the other lenght is inferred
#     arr = arr.reshape((batch_size, -1))

#     # iterate through the array, one sequence at a time
#     for n in range(0, arr.shape[1], seq_length):
#         # The features
#         x = arr[:, n:n+seq_length]
#         # The targets, shifted by one
#         y = np.zeros_like(x)
#         try:
#             y[:, :-1], y[:, -1] = x[:, 1:], arr[:, n+seq_length]
#         except IndexError:
#             y[:, :-1], y[:, -1] = x[:, 1:], arr[:, 0]
#         yield x, y 
        
    

    n_batches = (len(arr)-seq_length-1) // batch_size
    i=0
    
    for bb in range(n_batches):
        
        x = np.zeros([batch_size,seq_length],np.int64)
        y = np.zeros([batch_size,seq_length],np.int64)
        
        for rr in range(batch_size):
            x[rr,:] = arr[i:i+seq_length]
            y[rr,:] = arr[i+1:i+seq_length+1]
            i = i+1
        
        yield x ,y
        
# #when we call get batches we are going 
#to create a generator that iteratest through our array and returns x, y with yield command

In [13]:
# check if GPU is available
train_on_gpu = torch.cuda.is_available()
if(train_on_gpu):
    print('Training on GPU!')
else: 
    print('No GPU available, training on CPU; consider making n_epochs very small.')

No GPU available, training on CPU; consider making n_epochs very small.


In [14]:
class CharRNN(nn.Module):
    
    def __init__(self, tokens, n_hidden=150, n_layers=1,
                               drop_prob=0.5, lr=0.001):
        super().__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr
        
        # creating character dictionaries
        self.chars = tokens
        self.int2char = dict(enumerate(self.chars))
        self.char2int = {ch: ii for ii, ch in self.int2char.items()}
        
        self.lstm = nn.LSTM(len(self.chars), n_hidden, n_layers, 
                            dropout=drop_prob, batch_first=True)
        
        self.dropout = nn.Dropout(drop_prob)
        
        self.fc = nn.Linear(n_hidden, len(self.chars))
      
    
    def forward(self, x, hidden):
        ''' Forward pass through the network. 
            These inputs are x, and the hidden/cell state `hidden`. '''
                
        r_output, hidden = self.lstm(x, hidden)
        
        out = self.dropout(r_output)
        
        out = out.contiguous().view(-1, self.n_hidden)
        
        out = self.fc(out)
                
        # return the final output and the hidden state
        return out, hidden
    
    
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x n_hidden,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        
        if (train_on_gpu):
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda(),
                  weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_(),
                      weight.new(self.n_layers, batch_size, self.n_hidden).zero_())
        
        return hidden

In [15]:
def train(net, data, epochs=10, batch_size=10, seq_length=50, lr=0.001, clip=5, val_frac=0.1, print_every=10):
    ''' Training a network 
    
        Arguments
        ---------
        
        net: CharRNN network
        data: text data to train the network
        epochs: Number of epochs to train
        batch_size: Number of mini-sequences per mini-batch, aka batch size
        seq_length: Number of character steps per mini-batch
        lr: learning rate
        clip: gradient clipping
        val_frac: Fraction of data to hold out for validation
        print_every: Number of steps for printing training and validation loss
    
    '''
    net.train()
    
    opt = torch.optim.Adam(net.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    
    # create training and validation data
#     val_idx = int(len(data)*(1-val_frac))
#     data, val_data = data[:val_idx], data[val_idx:]
#     print(data.shape)
#     print(val_data.shape)
    
    counter = 0
    n_chars = len(net.chars)
    for e in range(epochs):
        net.train()
        # initialize hidden state
        h = net.init_hidden(batch_size)
        
        for x, y in get_batches(data, batch_size, seq_length):
            counter += 1
            # One-hot encode our data and make them Torch tensors
            x = one_hot_encode(x, n_chars)
            inputs, targets = torch.from_numpy(x), torch.from_numpy(y)

            
            # Creating new variables for the hidden state, otherwise
            # we'd backprop through the entire training history
            h = tuple([each.data for each in h])

            # zero accumulated gradients
            net.zero_grad()
            
            # get the output from the model
            output, h = net(inputs, h)
            
            # calculate the loss and perform backprop
            loss = criterion(output, targets.view(batch_size*seq_length))
            loss.backward()
            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(net.parameters(), clip)
            opt.step()
            
        
        net.eval()

        correct = 0

        # Turning off automatic differentiation
        with torch.no_grad():
            for x, y in get_batches(data, batch_size, seq_length):
                x = one_hot_encode(x, n_chars)
                inputs, targets = torch.from_numpy(x), torch.from_numpy(y)
                h = tuple([each.data for each in h])
                output, h = net(inputs, h)

                pred = output.argmax(dim=1, keepdim=True)  # Get the index of the max class score
                correct += pred.eq(targets.view_as(pred)).sum().item()

#         test_loss /= len(test_loader.dataset)
            
        
        print("Epoch: {}/{}...".format(e+1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.4f}...".format(loss.item()),
                      "Acc: {:.4f}...".format(correct/(1463*64*40)))
            
            # loss stats
#             if counter % print_every == 0:
#                 # Get validation loss
#                 val_h = net.init_hidden(batch_size)
#                 val_losses = []
#                 net.eval()
#                 for x, y in get_batches(val_data, batch_size, seq_length):
#                     # One-hot encode our data and make them Torch tensors
#                     x = one_hot_encode(x, n_chars)
#                     x, y = torch.from_numpy(x), torch.from_numpy(y)
                    
#                     # Creating new variables for the hidden state, otherwise
#                     # we'd backprop through the entire training history
#                     val_h = tuple([each.data for each in val_h])
                    
#                     inputs, targets = x, y
#                     output, val_h = net(inputs, val_h)
#                     val_loss = criterion(output, targets.view(batch_size*seq_length))
                
#                     val_losses.append(val_loss.item())
                
#                 net.train() # reset to train mode after iterationg through validation data
                
#                 print("Epoch: {}/{}...".format(e+1, epochs),
#                       "Step: {}...".format(counter),
#                       "Loss: {:.4f}...".format(loss.item()),
#                       "Val Loss: {:.4f}".format(np.mean(val_losses)))


In [16]:
# define and print the net
n_hidden=200
n_layers=1

net = CharRNN(chars, n_hidden, n_layers)
print(net)

CharRNN(
  (lstm): LSTM(38, 200, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=200, out_features=38, bias=True)
)


In [17]:
batch_size = 64
seq_length = 40 #max length verses
n_epochs = 50 # start smaller if you are just testing initial behavior

# train the model
train(net, encoded, epochs=n_epochs, batch_size=batch_size, seq_length=seq_length, lr=0.001, print_every=50)

Epoch: 1/50... Step: 1463... Loss: 1.8565... Acc: 0.3836...
Epoch: 2/50... Step: 2926... Loss: 1.7550... Acc: 0.4260...
Epoch: 3/50... Step: 4389... Loss: 1.6935... Acc: 0.4547...
Epoch: 4/50... Step: 5852... Loss: 1.6585... Acc: 0.4718...
Epoch: 5/50... Step: 7315... Loss: 1.6591... Acc: 0.4845...
Epoch: 6/50... Step: 8778... Loss: 1.6211... Acc: 0.4948...
Epoch: 7/50... Step: 10241... Loss: 1.6061... Acc: 0.5041...
Epoch: 8/50... Step: 11704... Loss: 1.6188... Acc: 0.5126...
Epoch: 9/50... Step: 13167... Loss: 1.5871... Acc: 0.5200...
Epoch: 10/50... Step: 14630... Loss: 1.5796... Acc: 0.5273...
Epoch: 11/50... Step: 16093... Loss: 1.5799... Acc: 0.5334...
Epoch: 12/50... Step: 17556... Loss: 1.5868... Acc: 0.5386...
Epoch: 13/50... Step: 19019... Loss: 1.5791... Acc: 0.5435...
Epoch: 14/50... Step: 20482... Loss: 1.5495... Acc: 0.5479...
Epoch: 15/50... Step: 21945... Loss: 1.5374... Acc: 0.5517...
Epoch: 16/50... Step: 23408... Loss: 1.5497... Acc: 0.5552...
Epoch: 17/50... Step: 2

In [18]:
checkpoint = {'n_hidden': net.n_hidden,
              'n_layers': net.n_layers,
              'state_dict': net.state_dict(),
              'tokens': net.chars}

with open('lstm_dense_50_epoch.net', 'wb') as f:
    torch.save(checkpoint, f)

In [19]:
def predict(net, char, T, h=None, top_k=None):
        ''' Given a character, predict the next character.
            Returns the predicted character and the hidden state.
        '''
        
        # tensor inputs
        x = np.array([[net.char2int[char]]])
        x = one_hot_encode(x, len(net.chars))
        inputs = torch.from_numpy(x)

        # detach hidden state from history
        h = tuple([each.data for each in h])
        # get the output of the model
        out, h = net(inputs, h)

        # get the character probabilities
        # apply softmax to get p probabilities for the likely next character giving x
        p = F.softmax(out/T, dim=1).data

        # get top characters
        # considering the k most probable characters with topk method
        if top_k is None:
            top_ch = np.arange(len(net.chars))
        else:
            p, top_ch = p.topk(top_k)
            top_ch = top_ch.numpy().squeeze()
        
        # select the likely next character with some element of randomness
        p = p.numpy().squeeze()
        char = np.random.choice(top_ch, p=p/p.sum())
        
        # return the encoded value of the predicted char and the hidden state
        return net.int2char[char], h

In [20]:
def sample(net, size, T, prime='Il', top_k=None):
        
    net.cpu()
    
    net.eval() # eval mode
    
    # First off, run through the prime characters
    chars = [ch for ch in prime]
    h = net.init_hidden(1)
    for ch in prime:
        char, h = predict(net, ch, T, h, top_k=top_k)

    chars.append(char)
    
    # Now pass in the previous character and get a new one
    for ii in range(size):
        char, h = predict(net, chars[-1], T, h, top_k=top_k)
        chars.append(char)

    return ''.join(chars)

In [21]:
print(sample(net, 1000, T = 1, prime="shall i compare thee to a summer's day?\n"))


shall i compare thee to a summer's day?
then tell my love is forget sicker eyes,
then whoule my proud heart to so thy subject thee out.
her by my poor not thy sweet words if vialing,
who leaves knows use of thine be in gazed how to me with alterporion call growd thine ede him of thee, thy more be,
that thy herst excestoned know sway,
threw twed forth which loke pass thy rest
a viigh and that then art a doth ever be,
to more word aase,
to mare are groan,
then thou acted that i foon,
the can my bedies ride,
thy love part that prike,
that that that wet the unks to me:
thus that i see doth tround though eyes and forswild,
in flestest of not i in u,
doth heart per love ow overst by thy self belight she soar blessed, and bear,
my confeet,
and worls bitter swornst in reatures mive for bording esextrence.
wo the liest a facollong inward have own spure of that by a manting born date.
riving mine kniget.
that i well love recome that,
love of thich all my love thee fair thy fortuen best,
that des

In [22]:
print(sample(net, 1000,T = 0.75, prime="shall i compare thee to a summer's day?\n"))


shall i compare thee to a summer's day?
thy pingured my firet thoughts, or when i am dost spet me in thee i in thy hime be olden be unforned of thy friend me wet race,
but thou thou lov'st that be.
so that beauty of thy fingers and in thee that stere you dost live hath placue the fairest wards these liss in one,
and therefore was of the best and thy ingarded  bitter scanted words thou sense nd thou have the found, that heaven's grest,
and though thy self disgrace be the sin thy fair dare my love, and thee and her forged fair last,
who all thee thrie in and thee is not be food:
so mo thou dost thou with me that thou shaltering thy self i form hath thou bities thou thou to mine own love sufat more thou art and wanting thy constancies when i not beauty,
and thue i not then me doth thou thy hand,
to love her with he proud heart that my breath i roses,
thy dicaily in such a both sweet love in that beauty there is mine in tangle is to mine eyes doth not lease my will,
and thy mournard of my 

In [23]:
print(sample(net, 1000,T = 0.25, prime="shall i compare thee to a summer's day?\n"))


shall i compare thee to a summer's day?
the can my self a falles i restill,
and therefore to the self art beauty still,
and i am my love that in my love that thought to me so the thy sweet self so thoughts doth see thee,
which in thy sweet self all thy self art thought to thee as the world is to the self-love to concest to me that the world with thy brand thee,
when i be better that my heart to thee most be to the sun the thing thee that i am thy self art that i have sweet self to thee i be not in thy thing on thee as thou art all that i do be the sun the world thou most and thee,
where is thy proud heart to me that the beauty of thy self thy self art in thee as the confounds,
that i say they so that which thou thy self alone.
the store thee and thee for my love thee that i with touch she thou thy sweet self be thee and thee,
when i best that thou mayst in the fairest thoughts in thee,
and therefore to my self the world and thee,
when i bate that thou thy self alone.
that i am thy self

In [24]:
print(sample(net, 1000, prime="shall i compare thee to a summer's day?\n",T = 1e-10))


shall i compare thee to a summer's day?
the world to the sun the fairest fair that thou art thoughts in thee,
and therefore to thee my self to thee that i am forsworn,
but thou thy self a thee,
which in thy strong beauty speak in thee,
and thou art to thee the world to be thee that i am forst enceeded with that which thy sweet self alone,
that i am thy self thy self art to my self a faire,
that i have sweet self to thee more than the world and thee,
which thou thy self thy self art to my self a faire,
that i have sweet self to thee more than the world and thee,
which thou thy self thy self art to my self a faire,
that i have sweet self to thee more than the world and thee,
which thou thy self thy self art to my self a faire,
that i have sweet self to thee more than the world and thee,
which thou thy self thy self art to my self a faire,
that i have sweet self to thee more than the world and thee,
which thou thy self thy self art to my self a faire,
that i have sweet self to thee more t