In [1]:
# !wget https://github.com/udacity/deep-learning/blob/master/tensorboard/anna.txt
# enwik8: http://prize.hutter1.net/

In [2]:
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch
import pdb
import time
import string

In [4]:
from pathlib import Path
path = Path("enwik8/enwik8")
text = path.open(encoding="utf8").read()

In [5]:
printable = set(string.printable)
text_clean = list(filter(lambda x: x in printable, text))

# get the set of all characters
characters = tuple(set(text_clean))

# use enumeration to give the characters integer values
int2char = dict(enumerate(characters))

# create the look up dictionary from characters to the assigned integers
char2int = {char: index for index, char in int2char.items()}

# encode the text, using the character to integer dictionary
encoded = np.array([char2int[char] for char in text_clean])

In [6]:
to_gpu = True
def gpu(m):
    if to_gpu:
        return m.cuda()
    return m

In [7]:
def get_batches(arr, n_seqs_in_a_batch, n_characters):
    '''Create a generator that returns batches of size
       n_seqs x n_steps from arr.
       
       Arguments
       ---------
       arr: Array you want to make batches from
       n_seqs: Batch size, the number of sequences per batch
       n_steps: Number of sequence steps per batch
    '''
    
    batch_size = n_seqs_in_a_batch * n_characters
    n_batches = len(arr)//batch_size
    
    # Keep only enough characters to make full batches
    arr = arr[:n_batches * batch_size]
    # Reshape into n_seqs rows
    arr = arr.reshape((n_seqs_in_a_batch, -1))
    
    for n in range(0, arr.shape[1], n_characters):
        # The features
        x = arr[:, n:n+n_characters]
        # The targets, shifted by one
        y = np.zeros_like(x)
        try:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, n+n_characters]
        except IndexError:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, 0]
        yield x, y

In [8]:
def ints_to_tensor(ints):
    return gpu(torch.tensor(ints).long().transpose(1, 0))

def xy_to_tensor(x, y):
    x = ints_to_tensor(x)
    y = torch.tensor(y.T).long()
    return x, gpu(y)

In [9]:
# build the model using the pytorch nn module
class CharLSTM(nn.Module):
    def __init__(self, vocab_size, hidden_dim, batch_size, embedding_dim):
        super(CharLSTM, self).__init__()
        
        # init the meta parameters
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        
        self.emb = nn.Embedding(vocab_size, embedding_dim)
        
        self.lstm_1 = nn.LSTMCell(input_size=embedding_dim, hidden_size=hidden_dim)
        self.lstm_2 = nn.LSTMCell(input_size=hidden_dim, hidden_size=hidden_dim) 
        
        self.dropout = nn.Dropout(p=0.5)

        # fully connected layer to connect the output of the LSTM cell to the output
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)
        
    def forward(self, x, hc, return_hc=False):
        seq_len = x.shape[0]
        batch_size = x.shape[1]
        
        # empty tensor for the output of the lstm
        output_seq = torch.empty((seq_len, batch_size, self.vocab_size))
        output_seq = gpu(output_seq)
        hc1, hc2 = hc, hc

        # for every step in the sequence
        for t in range(seq_len):
            out_t, hc1, hc2 = self.feed_one_x_t(x[t], hc1, hc2)
            output_seq[t] = out_t
        
        if return_hc:
            return output_seq, hc1, hc2
        return output_seq
            
    def init_hidden(self, bs=None):
        if bs is None:
            bs = self.batch_size
        # initialize the <hidden state> and the <cell state> to zeros
        return (gpu(torch.zeros(bs, self.hidden_dim)), gpu(torch.zeros(bs, self.hidden_dim)))
    
    def feed_one_x_t(self, x_t, hc1, hc2):
        # convert batch of single ints to batch of embeddings
        xt_emb = self.emb(x_t) # returns (batch_size, embedding_dim)

        # get the hidden and cell states from the first layer cell
        hc1 = self.lstm_1(xt_emb, hc1)
        h1, c1 = hc1 # unpack the hidden and the cell states from the first layer

        # pass the hidden state from the first layer to the cell in the second layer
        hc2 = self.lstm_2(h1, hc2)
        h2, c2 = hc2 # unpack the hidden and cell states from the second layer cell

        # form the output of the fc
        out_t = self.fc(self.dropout(h2))
        
        return out_t, hc1, hc2
    
    def feed_one_char(self, char, hc1, hc2):
        ints = [char2int[char]] # sequence of ints 
        ints = [ints] # a 1-batch of seqs
        x = ints_to_tensor(ints) # shape of (seq_len, batch_size)
        x_t = x[0] # take the first (single) part of the sequence
        
        return self.feed_one_x_t(x_t, hc1, hc2)
    
    def warm_up(self, base_str):
        hc = net.init_hidden(bs=1)
        ints = [char2int[c] for c in base_str]  # sequence of ints 
        ints = [ints] # a 1-batch of seqs
        x = ints_to_tensor(ints) # shape of (seq_len, batch_size)
        
        out, hc1, hc2 = self.forward(x, hc, return_hc=True)
        return out, hc1, hc2
    
    def sample_char(self, out_t, top_k=5):
        # apply the softmax to the output to get the probabilities of the characters
        out_t = F.softmax(out_t, dim=1)

        # out_t now holds the vector of predictions (1, vocab_size)
        # we want to sample 5 top characters
        p, top_char = out_t.topk(top_k) # returns tuple (top_values, top_indices)

        # get the top k characters by their probabilities
        top_char = top_char.cpu().squeeze().numpy()

        # sample a character using its probability
        p = p.detach().cpu().squeeze().numpy()
        char_int = np.random.choice(top_char, p = p/p.sum())
        
        return int2char[char_int]
        
    def predict(self, base_str, top_k=5, seq_len=128):
        self.eval()

        res = np.empty(seq_len+len(base_str), dtype="object")
        for i, c in enumerate(base_str):
            res[i] = c
        
        out_warm, hc1, hc2 = self.warm_up(base_str)
        out_t = out_warm[-1]

        for i in range(seq_len):
            char = self.sample_char(out_t, top_k)
            out_t, hc1, hc2 = self.feed_one_char(char, hc1, hc2)
            res[i + len(base_str)] = char
        
        return ''.join(res)
        

In [10]:
BS = 500 # 500
embedding_dim = 100
vocab_size=len(char2int)
hidden_dim = 512 # 512
seq_len = 128 # 128
seq_len_BS = seq_len * BS

In [11]:
print(f'BS: {BS}')
print(f'embedding_dim: {embedding_dim}')
print(f'vocab_size: {vocab_size}')
print(f'hidden_dim: {hidden_dim}')
print(f'seq_len: {seq_len}')
print(f'seq_len_BS: {seq_len_BS}')

BS: 500
embedding_dim: 100
vocab_size: 97
hidden_dim: 512
seq_len: 128
seq_len_BS: 64000


In [12]:
# get the validation and the training data
val_idx = int(len(encoded) * (1 - 0.1))
data, val_data = encoded[:val_idx], encoded[val_idx:]

In [13]:
# compile the network - sequence_len, vocab_size, hidden_dim, batch_size
net = CharLSTM(vocab_size=len(char2int), hidden_dim=hidden_dim, batch_size=BS, embedding_dim=embedding_dim)
net = gpu(net)

# define the loss and the optimizer
optimizer = optim.Adam(net.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

# quick sanity

In [14]:
x, y = next(get_batches(data, BS, seq_len))

In [15]:
x, y = xy_to_tensor(x, y)

In [16]:
x.shape, y.shape

(torch.Size([128, 500]), torch.Size([128, 500]))

In [17]:
hc = net.init_hidden()
out = net(x, hc)
out.shape # (seq_len, batch_size, vocab_size)

torch.Size([128, 500, 97])

In [18]:
net.sample_char(out[-1, 0:1])

'7'

In [19]:
net.predict("The")

"TheMC'''('fM''''9\n(fU(f}Mf MfC f}77]fM7Cf''(}A}(MC700]]. 9]C7''ff7 MC''''9Eff.( } }ff77f7]]77Af}}f7fMf M}fo00 C77 7'0f77 0]f]M977f7"

# Train

In [None]:
val_losses = list() # empty list for the validation losses
net.eval()

for epoch in range(10):
    
    # reinit the hidden and cell states
    hc = net.init_hidden()
    
    for i, (x, y) in enumerate(get_batches(data, BS, seq_len)):
        x_train, y_true = xy_to_tensor(x, y)
        optimizer.zero_grad() # zero out the gradients
    
        # forward pass
        t0 = time.time()
        y_pred = net(x_train, hc)
        if i == 0:
            print(f'elapsed forward: {time.time() - t0}')
    
        # calculate the loss
        # we need to calculate the loss across all batches, so we have to flat the y_true tensor
        loss = criterion(y_pred.view(BS*seq_len, -1), y_true.view(BS*seq_len)) # .contiguous()?
        
        # calculate the gradients
        t0 = time.time()
        loss.backward()
        if i == 0:
            print(f'elapsed backward: {time.time() - t0}')
        
        # update the parameters of the model
        optimizer.step()
    
        print("Epoch: {}, Batch: {}, Train Loss: {:.6f}".format(epoch, i, loss.item()))

        # feedback every 10 batches
        with torch.no_grad():
            if i % 10 == 0: 
                net.eval()
                
                x, y = next(get_batches(val_data, BS, seq_len))
                x_val, y_val = xy_to_tensor(x, y)
                hc_val = net.init_hidden()
                y_pred = net(x_val, hc_val)
                loss_val = criterion(y_pred.view(BS*seq_len, -1), y_val.view(BS*seq_len)) # .contiguous()?
                
                net.train()
                print("Epoch: {}, Batch: {}, Train Loss: {:.6f}, Validation Loss: {:.6f}".format(epoch, i, loss.item(), loss_val.item()))

                sample = net.predict("The", seq_len=100)
                print(sample)
                print()


elapsed forward: 0.12067842483520508
elapsed backward: 0.4368586540222168
Epoch: 0, Batch: 0, Train Loss: 1.522233
Epoch: 0, Batch: 0, Train Loss: 1.522233, Validation Loss: 1.487424
The succession]], although the [[Conserve]], and arolitically at the cources and according important as

Epoch: 0, Batch: 1, Train Loss: 1.506478
Epoch: 0, Batch: 2, Train Loss: 1.507504
Epoch: 0, Batch: 3, Train Loss: 1.488862
Epoch: 0, Batch: 4, Train Loss: 1.483703
Epoch: 0, Batch: 5, Train Loss: 1.507480
Epoch: 0, Batch: 6, Train Loss: 1.497443
Epoch: 0, Batch: 7, Train Loss: 1.504932
Epoch: 0, Batch: 8, Train Loss: 1.489003
Epoch: 0, Batch: 9, Train Loss: 1.484921
Epoch: 0, Batch: 10, Train Loss: 1.507722
Epoch: 0, Batch: 10, Train Loss: 1.507722, Validation Loss: 1.485062
The film ship in 1984. The coup the found of case.  The could be common implement is natural teachers a

Epoch: 0, Batch: 11, Train Loss: 1.489842
Epoch: 0, Batch: 12, Train Loss: 1.482748
Epoch: 0, Batch: 13, Train Loss: 1.499490
E

Epoch: 0, Batch: 135, Train Loss: 1.430912
Epoch: 0, Batch: 136, Train Loss: 1.440560
Epoch: 0, Batch: 137, Train Loss: 1.467538
Epoch: 0, Batch: 138, Train Loss: 1.457695
Epoch: 0, Batch: 139, Train Loss: 1.462981
Epoch: 0, Batch: 140, Train Loss: 1.429020
Epoch: 0, Batch: 140, Train Loss: 1.429020, Validation Loss: 1.454768
The Century Chicago and the Aughers of the United States, Analas is the [[American City of the Arabian]

Epoch: 0, Batch: 141, Train Loss: 1.413854
Epoch: 0, Batch: 142, Train Loss: 1.412826
Epoch: 0, Batch: 143, Train Loss: 1.410467
Epoch: 0, Batch: 144, Train Loss: 1.418755
Epoch: 0, Batch: 145, Train Loss: 1.427979
Epoch: 0, Batch: 146, Train Loss: 1.418987
Epoch: 0, Batch: 147, Train Loss: 1.416317
Epoch: 0, Batch: 148, Train Loss: 1.414020
Epoch: 0, Batch: 149, Train Loss: 1.433620
Epoch: 0, Batch: 150, Train Loss: 1.408010
Epoch: 0, Batch: 150, Train Loss: 1.408010, Validation Loss: 1.453947
The [[Canadian]] in [[1639 in Chip II|A Branders]] it is a song on 

The [[Californism]] and [[Sports of Encyclopedia of Chile Alexander]].

=== Chick ===

* [[Antarcticali

Epoch: 0, Batch: 271, Train Loss: 1.409552
Epoch: 0, Batch: 272, Train Loss: 1.397348
Epoch: 0, Batch: 273, Train Loss: 1.397550
Epoch: 0, Batch: 274, Train Loss: 1.405614
Epoch: 0, Batch: 275, Train Loss: 1.383740
Epoch: 0, Batch: 276, Train Loss: 1.385788
Epoch: 0, Batch: 277, Train Loss: 1.401356
Epoch: 0, Batch: 278, Train Loss: 1.381449
Epoch: 0, Batch: 279, Train Loss: 1.402495
Epoch: 0, Batch: 280, Train Loss: 1.367679
Epoch: 0, Batch: 280, Train Loss: 1.367679, Validation Loss: 1.427359
The music and problem of about 180, the southeast of the [[Belinus Christmas]]. He was advantage into t

Epoch: 0, Batch: 281, Train Loss: 1.404460
Epoch: 0, Batch: 282, Train Loss: 1.406722
Epoch: 0, Batch: 283, Train Loss: 1.415472
Epoch: 0, Batch: 284, Train Loss: 1.400039
Epoch: 0, Batch: 285, Train Loss: 1.394577
Epoch: 0, Batch: 286, Train Loss: 1.387163
Epoch: 0, Batch: 287, Train Loss

Epoch: 0, Batch: 407, Train Loss: 1.396941
Epoch: 0, Batch: 408, Train Loss: 1.390780
Epoch: 0, Batch: 409, Train Loss: 1.395821
Epoch: 0, Batch: 410, Train Loss: 1.400208
Epoch: 0, Batch: 410, Train Loss: 1.400208, Validation Loss: 1.408961
Their champson as ''[[The Alley of the Corid Constantina]]'. The southeast of [[Albaniani]] [[Romanic s

Epoch: 0, Batch: 411, Train Loss: 1.416994
Epoch: 0, Batch: 412, Train Loss: 1.394904
Epoch: 0, Batch: 413, Train Loss: 1.396074
Epoch: 0, Batch: 414, Train Loss: 1.413489
Epoch: 0, Batch: 415, Train Loss: 1.385878
Epoch: 0, Batch: 416, Train Loss: 1.392540
Epoch: 0, Batch: 417, Train Loss: 1.391145
Epoch: 0, Batch: 418, Train Loss: 1.406274
Epoch: 0, Batch: 419, Train Loss: 1.395228
Epoch: 0, Batch: 420, Train Loss: 1.376374
Epoch: 0, Batch: 420, Train Loss: 1.376374, Validation Loss: 1.408531
The Council of the [[USB organizationatory]], and that the come of the first story of [[Malaia]] in [[1

Epoch: 0, Batch: 421, Train Loss: 1.387296
Epoch

Epoch: 0, Batch: 541, Train Loss: 1.346828
Epoch: 0, Batch: 542, Train Loss: 1.356484
Epoch: 0, Batch: 543, Train Loss: 1.363039
Epoch: 0, Batch: 544, Train Loss: 1.368152
Epoch: 0, Batch: 545, Train Loss: 1.360947
Epoch: 0, Batch: 546, Train Loss: 1.354164
Epoch: 0, Batch: 547, Train Loss: 1.367691
Epoch: 0, Batch: 548, Train Loss: 1.377237
Epoch: 0, Batch: 549, Train Loss: 1.385081
Epoch: 0, Batch: 550, Train Loss: 1.384031
Epoch: 0, Batch: 550, Train Loss: 1.384031, Validation Loss: 1.392416
The Computer America]], the [[Aragon]] in 1640, was following [[particles|Antonese]]. Although he had s

Epoch: 0, Batch: 551, Train Loss: 1.362809
Epoch: 0, Batch: 552, Train Loss: 1.361419
Epoch: 0, Batch: 553, Train Loss: 1.360672
Epoch: 0, Batch: 554, Train Loss: 1.354573
Epoch: 0, Batch: 555, Train Loss: 1.371560
Epoch: 0, Batch: 556, Train Loss: 1.371862
Epoch: 0, Batch: 557, Train Loss: 1.369039
Epoch: 0, Batch: 558, Train Loss: 1.364101
Epoch: 0, Batch: 559, Train Loss: 1.392281
Epoch: 0

In [None]:
net.predict("God i")

In [31]:
state = dict(net=net, char2int=char2int, int2char=int2char)

In [32]:
torch.save(state, "save_1")

In [23]:
net2 = torch.load("save_1")

In [28]:
net2.predict("The ")

'The sold of the [[United States and]].  These provides to be resting team at the sere of the same that a charaction as as a stabou d'