In [66]:
import numpy as np
import torch 
from torch import nn
import torch.nn.functional as F
from torch.autograd import Variable

In [67]:
with open('anna.txt', 'r') as f:
    text = f.read()

In [68]:
chars = tuple(set(text))
print(chars)

('"', '&', 'P', 'A', 'L', 'h', 'b', 'T', '`', 'y', 'm', '-', 'U', '_', 'a', '.', 'c', 'd', 'B', 'w', 'R', '4', '6', 'e', ':', 's', '9', 'X', 'K', 'g', 'z', 'N', '0', 'E', 'M', 'r', 'H', 'O', '1', '3', '*', '@', '$', 't', 'x', ' ', 'G', 'I', 'q', 'i', "'", 'C', 'p', ';', 'v', 'j', 'V', '7', 'J', 'l', 'n', '2', 'F', 'Z', '\n', 'S', 'D', 'f', ')', ',', '8', '?', 'o', '(', 'Q', '%', 'W', '/', 'Y', '5', 'u', 'k', '!')


In [69]:
print(len(chars))

83


In [70]:
int2char = dict(enumerate(chars))
char2int = { ch : ii for ii,ch in int2char.items()}

In [71]:
encoded = np.array([char2int[ch] for ch in text])

In [72]:
print(len(encoded))

1985223


In [73]:
def one_hot_encode(arr, n_labels):
    
    # Initialize the the encoded array
    one_hot = np.zeros((np.multiply(*arr.shape), n_labels), dtype=np.float32)
    
    # Fill the appropriate elements with ones
    one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1.
    
    # Finally reshape it to get back to the original array
    one_hot = one_hot.reshape((*arr.shape, n_labels))
    
    return one_hot

In [74]:
def get_batches(arr, n_seqs, n_steps):
    '''Create a generator that returns mini-batches of size
       n_seqs x n_steps from arr.
    '''
    
    batch_size = n_seqs * n_steps
    n_batches = len(arr)//batch_size
    
    # Keep only enough characters to make full batches
    arr = arr[:n_batches * batch_size]
    # Reshape into n_seqs rows
    arr = arr.reshape((n_seqs, -1))
    
    for n in range(0, arr.shape[1], n_steps):
        # The features
        x = arr[:, n:n+n_steps]
        # The targets, shifted by one
        y = np.zeros_like(x)
        try:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, n+n_steps]
        except IndexError:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, 0]
        yield x, y

In [75]:
class CharRNN(nn.Module):
    def __init__(self, tokens, n_steps=100, n_hidden=256, n_layers=2,
                               drop_prob=0.5, lr=0.001):
        super().__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr
        
        self.chars = tokens
        self.int2char = dict(enumerate(self.chars))
        self.char2int = {ch: ii for ii, ch in self.int2char.items()}
        
        self.dropout = nn.Dropout(drop_prob)
        self.lstm = nn.LSTM(len(self.chars), n_hidden, n_layers, 
                            dropout=drop_prob, batch_first=True)
        self.fc = nn.Linear(n_hidden, len(self.chars))
        
        self.init_weights()
        
    def forward(self, x, hc):
        ''' Forward pass through the network '''
        
        x, (h, c) = self.lstm(x, hc)
        x = self.dropout(x)
        
        # Stack up LSTM outputs
        x = x.view(x.size()[0]*x.size()[1], self.n_hidden)
        
        x = self.fc(x)
        
        return x, (h, c)
    
    def predict(self, char, h=None, cuda=False, top_k=None):
        ''' Given a character, predict the next character.
        
            Returns the predicted character and the hidden state.
        '''
        if cuda:
            self.cuda()
        else:
            self.cpu()
        
        if h is None:
            h = self.init_hidden(1)
        
        x = np.array([[self.char2int[char]]])
        x = one_hot_encode(x, len(self.chars))
        inputs = Variable(torch.from_numpy(x), volatile=True)
        if cuda:
            inputs = inputs.cuda()
        
        h = tuple([Variable(each.data, volatile=True) for each in h])
        out, h = self.forward(inputs, h)

        p = F.softmax(out).data
        if cuda:
            p = p.cpu()
        
        if top_k is None:
            top_ch = np.arange(len(self.chars))
        else:
            p, top_ch = p.topk(top_k)
            top_ch = top_ch.numpy().squeeze()
        
        p = p.numpy().squeeze()
        char = np.random.choice(top_ch, p=p/p.sum())
            
        return self.int2char[char], h
    
    def init_weights(self):
        ''' Initialize weights for fully connected layer '''
        initrange = 0.1
        
        # Set bias tensor to all zeros
        self.fc.bias.data.fill_(0)
        # FC weights as random uniform
        self.fc.weight.data.uniform_(-1, 1)
        
    def init_hidden(self, n_seqs):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x n_seqs x n_hidden,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        return (Variable(weight.new(self.n_layers, n_seqs, self.n_hidden).zero_()),
                Variable(weight.new(self.n_layers, n_seqs, self.n_hidden).zero_()))
        

In [76]:
def train(net, data, epochs=10, n_seqs=10, n_steps=50, lr=0.001, clip=5, val_frac=0.1, cuda=False, print_every=10):
    ''' Traing a network 
    
        Arguments
        ---------
        
        net: CharRNN network
        data: text data to train the network
        epochs: Number of epochs to train
        n_seqs: Number of mini-sequences per mini-batch, aka batch size
        n_steps: Number of character steps per mini-batch
        lr: learning rate
        clip: gradient clipping
        val_frac: Fraction of data to hold out for validation
        cuda: Train with CUDA on a GPU
        print_every: Number of steps for printing training and validation loss
    
    '''
    
    net.train()
    opt = torch.optim.Adam(net.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    
    # create training and validation data
    val_idx = int(len(data)*(1-val_frac))
    data, val_data = data[:val_idx], data[val_idx:]
    
    if cuda:
        net.cuda()
    
    counter = 0
    n_chars = len(net.chars)
    for e in range(epochs):
        h = net.init_hidden(n_seqs)
        for x, y in get_batches(data, n_seqs, n_steps):
            counter += 1
            
            # One-hot encode our data and make them Torch tensors
            x = one_hot_encode(x, n_chars)
            x, y = torch.from_numpy(x), torch.from_numpy(y)
            
            inputs, targets = Variable(x), Variable(y.long())
            if cuda:
                inputs, targets = inputs.cuda(), targets.cuda()

            # Creating new variables for the hidden state, otherwise
            # we'd backprop through the entire training history
            h = tuple([Variable(each.data) for each in h])

            net.zero_grad()
            
            output, h = net.forward(inputs, h)
            loss = criterion(output, targets.view(n_seqs*n_steps))

            loss.backward()
            
            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(net.parameters(), clip)

            opt.step()
            
            if counter % print_every == 0:
                
                # Get validation loss
                val_h = net.init_hidden(n_seqs)
                val_losses = []
                for x, y in get_batches(val_data, n_seqs, n_steps):
                    # One-hot encode our data and make them Torch tensors
                    x = one_hot_encode(x, n_chars)
                    x, y = torch.from_numpy(x), torch.from_numpy(y)
                    
                    # Creating new variables for the hidden state, otherwise
                    # we'd backprop through the entire training history
                    val_h = tuple([Variable(each.data, volatile=True) for each in val_h])
                    
                    inputs, targets = Variable(x, volatile=True), Variable(y.long(), volatile=True)
                    if cuda:
                        inputs, targets = inputs.cuda(), targets.cuda()

                    output, val_h = net.forward(inputs, val_h)
                    val_loss = criterion(output, targets.view(n_seqs*n_steps))
                
                    val_losses.append(val_loss.data[0])
                
                print("Epoch: {}/{}...".format(e+1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.4f}...".format(loss.data[0]),
                      "Val Loss: {:.4f}".format(np.mean(val_losses)))

In [77]:
net = CharRNN(chars, n_hidden=512, n_layers=2)

In [78]:
n_seqs, n_steps = 128, 100
train(net, encoded, epochs=25, n_seqs=n_seqs, n_steps=n_steps, lr=0.001, cuda=True, print_every=10)



Epoch: 1/25... Step: 10... Loss: 3.3152... Val Loss: 3.3121
Epoch: 1/25... Step: 20... Loss: 3.1955... Val Loss: 3.2105
Epoch: 1/25... Step: 30... Loss: 3.1042... Val Loss: 3.0896
Epoch: 1/25... Step: 40... Loss: 2.9229... Val Loss: 2.9301
Epoch: 1/25... Step: 50... Loss: 2.7912... Val Loss: 2.7533
Epoch: 1/25... Step: 60... Loss: 2.6062... Val Loss: 2.6364
Epoch: 1/25... Step: 70... Loss: 2.5378... Val Loss: 2.5584
Epoch: 1/25... Step: 80... Loss: 2.4679... Val Loss: 2.5025
Epoch: 1/25... Step: 90... Loss: 2.4489... Val Loss: 2.4611
Epoch: 1/25... Step: 100... Loss: 2.3942... Val Loss: 2.4232
Epoch: 1/25... Step: 110... Loss: 2.3392... Val Loss: 2.3899
Epoch: 1/25... Step: 120... Loss: 2.2875... Val Loss: 2.3584
Epoch: 1/25... Step: 130... Loss: 2.3015... Val Loss: 2.3379
Epoch: 2/25... Step: 140... Loss: 2.2583... Val Loss: 2.3077
Epoch: 2/25... Step: 150... Loss: 2.2499... Val Loss: 2.2854
Epoch: 2/25... Step: 160... Loss: 2.2271... Val Loss: 2.2545
Epoch: 2/25... Step: 170... Loss:

Epoch: 10/25... Step: 1350... Loss: 1.3521... Val Loss: 1.5118
Epoch: 10/25... Step: 1360... Loss: 1.3509... Val Loss: 1.5095
Epoch: 10/25... Step: 1370... Loss: 1.3561... Val Loss: 1.5133
Epoch: 10/25... Step: 1380... Loss: 1.3709... Val Loss: 1.5050
Epoch: 10/25... Step: 1390... Loss: 1.3885... Val Loss: 1.5142
Epoch: 11/25... Step: 1400... Loss: 1.3919... Val Loss: 1.5097
Epoch: 11/25... Step: 1410... Loss: 1.4026... Val Loss: 1.4982
Epoch: 11/25... Step: 1420... Loss: 1.3841... Val Loss: 1.5051
Epoch: 11/25... Step: 1430... Loss: 1.3574... Val Loss: 1.5127
Epoch: 11/25... Step: 1440... Loss: 1.3839... Val Loss: 1.5018
Epoch: 11/25... Step: 1450... Loss: 1.3122... Val Loss: 1.4986
Epoch: 11/25... Step: 1460... Loss: 1.3336... Val Loss: 1.4986
Epoch: 11/25... Step: 1470... Loss: 1.3246... Val Loss: 1.4952
Epoch: 11/25... Step: 1480... Loss: 1.3488... Val Loss: 1.4943
Epoch: 11/25... Step: 1490... Loss: 1.3344... Val Loss: 1.4919
Epoch: 11/25... Step: 1500... Loss: 1.3303... Val Loss:

Epoch: 20/25... Step: 2660... Loss: 1.2075... Val Loss: 1.4052
Epoch: 20/25... Step: 2670... Loss: 1.2201... Val Loss: 1.3996
Epoch: 20/25... Step: 2680... Loss: 1.1977... Val Loss: 1.4032
Epoch: 20/25... Step: 2690... Loss: 1.1861... Val Loss: 1.4024
Epoch: 20/25... Step: 2700... Loss: 1.2032... Val Loss: 1.3985
Epoch: 20/25... Step: 2710... Loss: 1.1679... Val Loss: 1.3981
Epoch: 20/25... Step: 2720... Loss: 1.1654... Val Loss: 1.4067
Epoch: 20/25... Step: 2730... Loss: 1.1676... Val Loss: 1.3982
Epoch: 20/25... Step: 2740... Loss: 1.1685... Val Loss: 1.4027
Epoch: 20/25... Step: 2750... Loss: 1.1802... Val Loss: 1.3997
Epoch: 20/25... Step: 2760... Loss: 1.1648... Val Loss: 1.4061
Epoch: 20/25... Step: 2770... Loss: 1.2017... Val Loss: 1.3996
Epoch: 20/25... Step: 2780... Loss: 1.2382... Val Loss: 1.3947
Epoch: 21/25... Step: 2790... Loss: 1.2119... Val Loss: 1.3967
Epoch: 21/25... Step: 2800... Loss: 1.2232... Val Loss: 1.3970
Epoch: 21/25... Step: 2810... Loss: 1.2166... Val Loss:

In [79]:
checkpoint = {'n_hidden': net.n_hidden,
              'n_layers': net.n_layers,
              'state_dict': net.state_dict(),
              'tokens': net.chars}
with open('rnn.net', 'wb') as f:
    torch.save(checkpoint, f)

In [84]:
with open('rnn.net', 'rb') as f:
    checkpoint = torch.load(f)
    
loaded = CharRNN(checkpoint['tokens'], n_hidden=checkpoint['n_hidden'], n_layers=checkpoint['n_layers'])
loaded.load_state_dict(checkpoint['state_dict'])

In [85]:
def sample(net, size, prime='The', top_k=None, cuda=False):
        
    if cuda:
        net.cuda()
    else:
        net.cpu()

    net.eval()
    
    # First off, run through the prime characters
    chars = [ch for ch in prime]
    h = net.init_hidden(1)
    for ch in prime:
        char, h = net.predict(ch, h, cuda=cuda, top_k=top_k)

    chars.append(char)
    
    # Now pass in the previous character and get a new one
    for ii in range(size):
        char, h = net.predict(chars[-1], h, cuda=cuda, top_k=top_k)
        chars.append(char)

    return ''.join(chars)

In [86]:
print(sample(net, 2000, prime='Anna', top_k=5, cuda=False))



Anna, that the
mirst were completely turned with thotger, and at the beauty of hos
its wearted and sancage and collacted with all the mother with an old painter
of words, some art, a smint.

"I've not thought to be in the subject," said the old man,
and still made the conversation, but his secretary and talking of
his fingers, he still. He sat down in the did not care to see her attitude
on the first room.

Sht liked the sick smart still seemed in his father when he had
to go, sure the shoulders to bring a changing soft of case, and would not have
seemed to give if he would stop his house.

And was all the part to her them. The first man all went in, and
he seemed that the more said that it was the footman so important.
She had so disapproved of them, and had not thrusted on the sease,
as that she had never saying a perceasable position in her
face and a little work, but to go to his betray, but there was
not time to disleap in his counting house. Thene where when he was
not stupid, an