## Part 1: Importing libraries


In [2]:
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F

import glob
import errno
import sys, re, os
from ipy_table import *
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *
init_notebook_mode(connected=True)


## Part 2: Data Loading and Preprocessing

We take the corpus of Shakespeare plays and try to generate Shakespearan English

In [1]:
with open('./Data/shakespeare.txt', 'r') as f:
    text = f.read()    

text = text.replace('\n','')
text = text.replace('\r','')
text = text.replace('\t','')

In [None]:
text[:5000]

#### Embeddings: One-hot vectors
- We take characters and convert them to integers
- These integers are encoded as one-hot vectors
- Character embeddings can be used for better results

In [4]:
chars = tuple(set(text))
int2char = dict(enumerate(chars))
char2int = {ch: ii for ii, ch in int2char.items()}

encoded = np.array([char2int[ch] for ch in text])

encoded[:100]


array([68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68,
       68, 68, 68, 68,  9, 68, 68, 76, 78, 40, 59, 68, 43, 16, 65, 78, 74,
       20, 13, 68, 75, 78, 74, 16, 13, 70, 78, 74, 20, 68, 60, 74, 68, 45,
       74, 20, 65, 78, 74, 68, 65, 54, 75, 78, 74, 16, 20, 74, 73, 68, 68,
        8, 28, 16, 13, 68, 13, 28, 74, 78, 74, 64, 17, 68, 64, 74, 16, 70,
       13, 17, 46, 20, 68, 78, 40, 20, 74, 68, 59, 65, 55, 28, 13])

In [5]:
def one_hot_encode(arr, n_labels):
    one_hot = np.zeros((np.multiply(*arr.shape), n_labels), dtype=np.float32)
    one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1.
    one_hot = one_hot.reshape((*arr.shape, n_labels))
    return one_hot

# Generator based on character
def get_batches(arr, batch_size, seq_length):
    batch_size_total = batch_size * seq_length
    n_batches = len(arr)//batch_size_total
    arr = arr[:n_batches * batch_size_total]
    arr = arr.reshape((batch_size, -1))
    for n in range(0, arr.shape[1], seq_length):
        x = arr[:, n:n+seq_length]
        y = np.zeros_like(x)
        try:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, n+seq_length]
        except IndexError:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, 0]
        yield x, y


## Part 3: Defining the Model

- We define an LSTM (1 LSTM unit, 1 fully connected layer) to capture character information
- The LSTM during training learns a "language model" on characters
- It is then able to predict the next character "fluently"

In [6]:
# Check if GPU is available
train_on_gpu = torch.cuda.is_available()
if(train_on_gpu):
    print('Training on GPU!')
else: 
    print('No GPU available')
    
#The model
class CharRNN(nn.Module):
    
    def __init__(self, tokens, n_hidden=256, n_layers=2,
                               drop_prob=0.5, lr=0.001):
        super().__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr
        
        self.chars = tokens
        self.int2char = dict(enumerate(self.chars))
        self.char2int = {ch: ii for ii, ch in self.int2char.items()}
        
        #define the LSTM
        self.lstm = nn.LSTM(len(self.chars), n_hidden, n_layers, 
                            dropout=drop_prob, batch_first=True)
        
        self.dropout = nn.Dropout(drop_prob)
        
        self.fc = nn.Linear(n_hidden, len(self.chars))
      
    
    def forward(self, x, hidden):
        r_output, hidden = self.lstm(x, hidden)
        out = self.dropout(r_output)
        out = out.contiguous().view(-1, self.n_hidden)
        out = self.fc(out)
        return out, hidden
    
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        
        if (train_on_gpu):
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda(),
                  weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_(),
                      weight.new(self.n_layers, batch_size, self.n_hidden).zero_())
        
        return hidden
       


No GPU available, training on CPU; consider making n_epochs very small.


## Part 4: Training
- Optimizer: Adam
- Loss: Cross Entropy

In [10]:
loss_array = []
def train(net, data, epochs=10, batch_size=10, seq_length=50, lr=0.001, clip=5, val_frac=0.1, print_every=10):

    net.train()
    
    opt = torch.optim.Adam(net.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    
    val_idx = int(len(data)*(1-val_frac))
    data, val_data = data[:val_idx], data[val_idx:]
    
    if(train_on_gpu):
        net.cuda()
    
    counter = 0
    n_chars = len(net.chars)
    for e in range(epochs):
        h = net.init_hidden(batch_size)
        
        for x, y in get_batches(data, batch_size, seq_length):
            counter += 1
            
            x = one_hot_encode(x, n_chars)
            inputs, targets = torch.from_numpy(x), torch.from_numpy(y)
            
            if(train_on_gpu):
                inputs, targets = inputs.cuda(), targets.cuda()

            h = tuple([each.data for each in h])

            net.zero_grad()

            output, h = net(inputs, h)
            
            loss = criterion(output, targets.view(batch_size*seq_length).long())
            loss.backward()
            nn.utils.clip_grad_norm_(net.parameters(), clip)
            opt.step()
            
            if counter % print_every == 0:
                val_h = net.init_hidden(batch_size)
                val_losses = []
                net.eval()
                for x, y in get_batches(val_data, batch_size, seq_length):
                    x = one_hot_encode(x, n_chars)
                    x, y = torch.from_numpy(x), torch.from_numpy(y)
                    val_h = tuple([each.data for each in val_h])
                    
                    inputs, targets = x, y
                    if(train_on_gpu):
                        inputs, targets = inputs.cuda(), targets.cuda()

                    output, val_h = net(inputs, val_h)
                    val_loss = criterion(output, targets.view(batch_size*seq_length).long())
                
                    val_losses.append(val_loss.item())
                
                net.train() 
                loss_array.append(loss.item())
                print("Epoch: {}/{}...".format(e+1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.4f}...".format(loss.item()),
                      "Val Loss: {:.4f}".format(np.mean(val_losses)))

In [None]:
n_hidden=256
n_layers=2

net = CharRNN(chars, n_hidden, n_layers)
print(net)

batch_size = 32
seq_length = 100
n_epochs = 3

train(net, encoded, epochs=n_epochs, batch_size=batch_size, seq_length=seq_length, lr=0.001, print_every=2)

model_name = 'rnn_20_epoch.net'

checkpoint = {'n_hidden': net.n_hidden,
              'n_layers': net.n_layers,
              'state_dict': net.state_dict(),
              'tokens': net.chars}

with open(model_name, 'wb') as f:
    torch.save(checkpoint, f)
    
def predict(net, char, h=None, top_k=None):

        x = np.array([[net.char2int[char]]])
        x = one_hot_encode(x, len(net.chars))
        inputs = torch.from_numpy(x)
        
        if(train_on_gpu):
            inputs = inputs.cuda()
        
        h = tuple([each.data for each in h])
        out, h = net(inputs, h)

        p = F.softmax(out, dim=1).data
        if(train_on_gpu):
            p = p.cpu() 
        
        if top_k is None:
            top_ch = np.arange(len(net.chars))
        else:
            p, top_ch = p.topk(top_k)
            top_ch = top_ch.numpy().squeeze()
        
        p = p.numpy().squeeze()
        char = np.random.choice(top_ch, p=p/p.sum())
        
        return net.int2char[char], h
        
def sample(net, size, prime='The', top_k=None):
        
    if(train_on_gpu):
        net.cuda()
    else:
        net.cpu()
    
    net.eval()
    chars = [ch for ch in prime]
    h = net.init_hidden(1)
    for ch in prime:
        char, h = predict(net, ch, h, top_k=top_k)

    chars.append(char)
    
    for ii in range(size):
        char, h = predict(net, chars[-1], h, top_k=top_k)
        chars.append(char)

    return ''.join(chars)
    
# Generating new text
print(sample(net, 1000, prime='T', top_k=10))

CharRNN(
  (lstm): LSTM(83, 256, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5)
  (fc): Linear(in_features=256, out_features=83, bias=True)
)
Epoch: 1/3... Step: 2... Loss: 4.3904... Val Loss: 4.3610
Epoch: 1/3... Step: 4... Loss: 4.3227... Val Loss: 4.2575
Epoch: 1/3... Step: 6... Loss: 4.1257... Val Loss: 3.8788
Epoch: 1/3... Step: 8... Loss: 3.5666... Val Loss: 3.4522
Epoch: 1/3... Step: 10... Loss: 3.5318... Val Loss: 3.3275
Epoch: 1/3... Step: 12... Loss: 3.2911... Val Loss: 3.2493
Epoch: 1/3... Step: 14... Loss: 3.2398... Val Loss: 3.2440
Epoch: 1/3... Step: 16... Loss: 3.2334... Val Loss: 3.2389
Epoch: 1/3... Step: 18... Loss: 3.3169... Val Loss: 3.2203
Epoch: 1/3... Step: 20... Loss: 3.2534... Val Loss: 3.2004
Epoch: 1/3... Step: 22... Loss: 3.3613... Val Loss: 3.1927
Epoch: 1/3... Step: 24... Loss: 3.3150... Val Loss: 3.1937
Epoch: 1/3... Step: 26... Loss: 3.2331... Val Loss: 3.1951
Epoch: 1/3... Step: 28... Loss: 3.2094... Val Loss: 3.1928
Epoch: 1/3

In [None]:
x = list(range(len(loss_array)))
y = loss_array[:]

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np

fig, ax = plt.subplots()
ax.plot(x, y)

ax.set(xlabel='Training step', ylabel='loss',
       title='Training loss')
ax.grid()
# fig.savefig("test.png")
plt.show()