# The Unreasonable Effectiveness of RNNs
The [liftothers.org lab](http://liftothers.org/dokuwiki/doku.php?id=cs501r_f2018:lab6)  
Andrej Karpathy's [blog](http://karpathy.github.io/2015/05/21/rnn-effectiveness/)

Are we passing in one character at a time or one sentence at a time?

In [9]:
import glob
import matplotlib.pyplot as plt
import numpy as np
import random
import re
import string
import sys
import time
import torch
import torch.nn as nn
import torch.optim as optim
import unidecode

torch.set_printoptions(precision=2)

In [4]:
!{sys.executable} -m pip install unidecode

Collecting unidecode
  Using cached https://files.pythonhosted.org/packages/59/ef/67085e30e8bbcdd76e2f0a4ad8151c13a2c5bce77c85f8cad6e1f16fb141/Unidecode-1.0.22-py2.py3-none-any.whl
Installing collected packages: unidecode
Successfully installed unidecode-1.0.22


## Read in the text, parse into sentences.

In [2]:
# filenames = glob.glob("shakespeare/*.txt")
# sentences = []
# for filename in filenames:
#     with open(filename) as file:
#         sentences = file.read().split('.')
#         break
        
filename = "alma.txt"
with open(filename) as file:
    sentences = file.read().split('.')
sentences = [sentence.strip() for sentence in sentences if len(sentence) > 0]

sentences = sentences[:1]
len(sentences)
print(sentences)

['The account of Alma, who was the son of Alma the first, and chief judge over the people of Nephi, and also the high priest over the Church']


## First, overfit to a single sentence
This works for the sentence `bbabb`, but fails on `bbbbbbbba`.
This is because we're predicting the next letter given the current letter.
90% of the time, the next letter is `b`. We need to somehow count the `b`s and then return `a` on every ninth `b`.

Using an SGD optimizer seems to learn the beginning part of the sentence first.

In [21]:
# sentence = " bbbbbbbbba"
# sentence = " a b c d e f g h"
# sentence = " a a a a e f g h"
# sentences = ["AAnders is just so silly...",
#             "JJared is incredibly smart."]

class CharRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        
        self.combined_size = input_size + hidden_size
        self.i2h = nn.Linear(self.combined_size, hidden_size)
        self.i2o = nn.Linear(self.combined_size, output_size)
        self.softmax = nn.Softmax(dim=1)
        self.tanh = nn.Tanh()
        
    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), dim=1)
        hidden = self.tanh(self.i2h(combined))
        output = self.softmax(self.i2o(combined))
        return output, hidden
    
    def init_hidden(self):
        return torch.zeros(self.hidden_size).view(1,self.hidden_size)
    
class CharGRU(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        pass

vocab = set()
for sentence in sentences:
    vocab.update(sentence)
vocab = list(vocab)

letter_idx = lambda letter: vocab.index(letter)
def letter_tensor(letter):
    t = torch.zeros(n_letters)
    i = letter_idx(letter)
    t[i] = 1
    return t.view(-1,n_letters).float()
n_letters = len(vocab)
print("n_letters:",n_letters)


rnn = CharRNN(input_size=n_letters, hidden_size=32, output_size=n_letters)
n_epochs = 10000
print_every = n_epochs//10
objective = nn.NLLLoss()
optimizer = optim.SGD(rnn.parameters(), lr=0.01)

def repackage_hidden(h):
    """Wraps hidden states in new Variables, to detach them from their history."""
    if type(h) == torch.Tensor:
        return torch.Tensor(h.data)
    else:
        return tuple(repackage_hidden(v) for v in h)
    
def get_batch():
    """
    Returns an index and a sentence.
    """
    i = np.random.randint(len(sentences))
    return i, sentences[i]

def train(n_epochs, print_every):
    for epoch in range(n_epochs):
        # Starting each batch, repackage the hidden variable so it only backprops so far.
        output_sentence = ""
        hidden = rnn.init_hidden()
        loss = 0
        i_s, sentence = get_batch()

        # Backprop wrt a single sentence
        for i in range(len(sentence)-1): 
            rnn.zero_grad()

            x = letter_tensor(sentence[i])
            y = letter_tensor(sentence[i+1])
            output, hidden = rnn(x, hidden) 
            loss += objective(output, y.long().argmax(dim=1))

            letter = vocab[output.argmax(dim=1)]
            output_sentence += letter
        loss.backward()
        optimizer.step()

        if epoch%print_every == 0:
            print("epoch {}:".format(epoch), output_sentence)
    print("Finished")

NameError: name 'sentences' is not defined

## Scaffolding Code

In [18]:
all_characters = string.printable
n_characters = len(all_characters)

file = unidecode.unidecode(open('alma.txt').read())
file_len = len(file)
print('file_len =', file_len)

chunk_len = 200

def random_chunk():
    start_index = random.randint(0, file_len - chunk_len)
    end_index = start_index + chunk_len + 1
    return file[start_index:end_index]

print(random_chunk())

# Turn string into list of longs
def char_tensor(string):
    tensor = torch.zeros(len(string)).long()
    for c in range(len(string)):
        tensor[c] = all_characters.index(string[c])
    return tensor

print(char_tensor('abcDEF'))

def random_training_set():
    """
    Returns a (200,n_characters) tensor.
    """
    chunk = random_chunk()
    inp = char_tensor(chunk[:-1])
    target = char_tensor(chunk[1:])
    return inp, target

file_len = 466656
tood in need.

 And thus they did prosper and become far more wealthy than those who did not belong to their church.

 For those who did not belong to their church did indulge themselves in sorceries, 
tensor([ 10,  11,  12,  39,  40,  41])


In [37]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, n_layers=1):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        
        self.embedding_encoder = nn.Embedding(input_size, hidden_size) # maps an ASCII char to a vector
        self.gru_encoder = nn.GRU(hidden_size, hidden_size, n_layers)
        
        self.embedding_decoder = nn.Embedding(output_size, hidden_size) # maps an ASCII char to a vector
        self.gru_decoder = nn.GRU(hidden_size, hidden_size, n_layers)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)
        
        
    def forward(self, input, hidden):
        """
        The encoder reads an input sequence and outputs a single vector,
        the decoder reads the vector and produces an output sequence.
        Parameters:
            input (batch, input_size)
            hidden (batch, hidden_size)
        Returns:
            output (batch, hidden_size)
        """
        embedded = self.embedding_encoder(input)
        print("embedded:",embedded.shape)
        #.view(1, 1, -1) # (1, 1, hidden_size)
        hidden = self.gru_encoder(embedded, hidden) # (1, hidden_size)
        # We use the last hidden state as the context vector.
        output = self.embedding_decoder(input).view(1, 1, self.hidden_size)
        output = F.relu(output)
        output, hidden = self.gru_decoder(output, hidden)
        output = self.softmax(self.out(output[0]))
        
        return output, hidden
    
    def init_hidden(self):
        return torch.zeros(self.n_layers, 1, self.hidden_size)

In [38]:
hidden_size = 100
n_layers = 1
lr = 0.005

rnn = RNN(n_characters, hidden_size, n_characters, n_layers)
optimizer = optim.Adam(rnn.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

def train(inp, target):
    """
    Parameters:
        inp (200,n_vocab)
        target (200,n_vocab)
    """
    hidden = decoder.init_hidden()
    optimizer.zero_grad()
    
    loss = 0
    for c in range(chunk_len):
        output, hidden = rnn(inp, hidden)
        loss += criterion(output, target[c].unsqueeze(0))
        
    loss.backward()
    optimizer.step()
    
    return loss.item() / chunk_len

In [39]:
n_epochs = 2000
print_every = 100
plot_every = 10

start = time.time()
all_losses = []
loss_avg = 0

for epoch in range(1, n_epochs+1):
    inp, target = random_training_set()
    print("inp:",inp.shape)
    print("target:",target.shape)
    loss_ = train(inp, target)
    loss_avg += loss_
    if epoch % print_every == 0:
        print("[{} ({} {}%) {:.4f}]".format(time.time()-start, epoch, 
                        epoch/n_epochs*100, loss_))
        print(evaluate('Wh', 100), '\n')
        
    if epoch % plot_every == 0:
        all_losses.append(loss_avg/plot_every)
        loss_avg = 0
        
    break

inp: torch.Size([200])
target: torch.Size([200])
embedded: torch.Size([200, 100])


RuntimeError: input must have 3 dimensions, got 2