# Recurrent Neural Networks

Recurrent neural networks are another kind of neural network. They're especially good at dealing with sequential data. At each step of the sequence the network takes two inputs: the encoding of the appropriate element of the sequence, and the hidden state of the network from the previous element. This allows sequential relationships to have bearing on the output of the network.

Recurrent neural networks are great for a wide variety of tasks, and especially for "sequence to representation" and "sequence to sequence" tasks. A "sequence to representation" task might be something like producing a sentiment score for a sentence. A "sequence to sequence" task might be producing a new sentence given an old one.

# Setup

In [1]:
import numpy as np

import sys
import os

import torch
import torch.nn as nn
import torch.optim  as optim

# Data

Now, let's read the data for the sequence-to-sequence Game of Thrones model.

Here, I've compiled a folder with transcripts of each Game of Thrones episode. I'll read the scripts into a list of all the lines, and then embed each line as a sequence of one-hot vectors, one per character.

In [2]:
# Directory expected to hold the transcript files, one file per entry.
scripts_path = os.path.abspath('//Users/jeremiahsafe/Documents/Data/GOTScripts')
script_files = []
for entry in os.listdir(scripts_path):
    script_files.append(os.path.join(scripts_path, entry))

# The alphabet starts with the two sentence-boundary markers and grows with
# every distinct (lower-cased) character seen in the transcripts.
alphabet = {'<s>', '</s>'}
got_lines = []
for sf in script_files:
    with open(sf, 'r') as fp:
        for line in fp:
            lowered = line.lower()
            # Updating a set with a string adds each of its characters.
            alphabet.update(lowered)
            got_lines.append(lowered)

class Embedder(object):
    """Translate glyphs of a fixed alphabet to one-hot row vectors and back.

    Attributes:
        alphabet: the collection of glyphs passed to the constructor.
        alphabet_to_idx: glyph -> column index of its one-hot vector.
        idx_to_alphabet: column index -> glyph (inverse mapping).
        character_embedding: glyph -> one-hot tensor of shape
            ``(1, len(alphabet))``.
    """

    def __init__(self, alphabet):
        """Build the index tables and the one-hot vector of every glyph.

        Args:
            alphabet: sized collection of hashable glyphs. A ``set`` or
                ``frozenset`` is indexed in sorted order so the mapping is
                reproducible; any other collection keeps its own order.
        """
        self.alphabet = alphabet
        self.character_embedding = {}
        self.alphabet_to_idx = {}
        self.idx_to_alphabet = {}

        # Set iteration order for strings depends on per-process hash
        # randomization, so indices would change from run to run (making any
        # saved model unusable). Sorting pins the assignment.
        if isinstance(alphabet, (set, frozenset)):
            try:
                glyphs = sorted(alphabet)
            except TypeError:
                # Glyphs of mutually unorderable types: keep the set's order.
                glyphs = list(alphabet)
        else:
            glyphs = list(alphabet)

        width = len(self.alphabet)
        for i, glyph in enumerate(glyphs):
            self.alphabet_to_idx[glyph] = i
            self.idx_to_alphabet[i] = glyph

        for glyph in glyphs:
            emb = torch.zeros(1, width)
            emb[0][self.alphabet_to_idx[glyph]] = 1
            self.character_embedding[glyph] = emb

    def embed_sentence(self, sentence):
        """Return the one-hot encoding of *sentence*, one row per glyph.

        Args:
            sentence: iterable of glyphs, each present in the alphabet.

        Returns:
            Tensor of shape ``(len(sentence), len(alphabet))``; an empty
            sentence yields a tensor with zero rows.

        Raises:
            KeyError: if a glyph is not part of the alphabet.
        """
        embs = [self.character_embedding[glyph] for glyph in sentence]
        if not embs:
            # torch.cat rejects an empty list, so build the empty result here.
            return torch.zeros(0, len(self.alphabet))
        return torch.cat(embs, 0)

    def unembed_scores(self, scores):
        """Return, for each row of *scores*, the glyph with the highest score.

        Args:
            scores: 2-D tensor whose columns are indexed like the alphabet.

        Returns:
            List of glyphs, one per row of *scores*.
        """
        _, idxs = torch.max(scores, dim=1)
        return [self.idx_to_alphabet[int(i)] for i in idxs]
        
embedder = Embedder(alphabet)

# One one-hot matrix per transcript line, wrapped in the start/end markers.
sequences = [
    embedder.embed_sentence(['<s>', *line, '</s>'])
    for line in got_lines
]

# Pair every sequence with itself shifted left by one step, so row t of the
# second element is the glyph that follows row t of the first.
sequence_pairs = [(seq, seq[1:]) for seq in sequences]

# Model: Language Generation

In [3]:
class LSTM(nn.Module):
    """Character model: an LSTM layer, a linear projection, then log-softmax.

    Takes a one-hot encoded sequence and returns, for every position, a row of
    log-probabilities over the alphabet.
    """

    def __init__(self, alphabet_size, hidden_dim):
        """Create the layers.

        Args:
            alphabet_size: width of the one-hot input and of the output scores.
            hidden_dim: size of the LSTM hidden state.
        """
        super().__init__()
        self.hidden_dim = hidden_dim
        self.lstm = nn.LSTM(input_size=alphabet_size, hidden_size=hidden_dim)
        self.fc1 = nn.Linear(in_features=hidden_dim, out_features=alphabet_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sentence):
        """Score every position of *sentence*.

        Args:
            sentence: tensor whose first dimension is the sequence length;
                the remaining dimensions are flattened into the feature axis.

        Returns:
            Tensor of shape ``(len(sentence), alphabet_size)`` holding
            log-probabilities along dimension 1.
        """
        steps = len(sentence)
        # Insert a batch axis of size one between sequence and feature axes.
        batched = sentence.view(steps, 1, -1)
        hidden_states, _ = self.lstm(batched)
        # Drop the batch axis again before projecting back to the alphabet.
        flat_states = hidden_states.view(steps, -1)
        return self.softmax(self.fc1(flat_states))

# The alphabet size is both the input width and the number of output classes.
k = len(alphabet)
model = LSTM(alphabet_size=k, hidden_dim=32)

In [None]:
# Sanity check: show the input and target lengths of the first ten pairs.
for x, target in sequence_pairs[:10]:
    n_inputs, n_targets = len(x), len(target)
    print(n_inputs, n_targets)

In [None]:
# Negative log-likelihood over the log-probabilities the model returns.
loss_fn = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr = .1)

# Five passes over the first ten sequence pairs, one SGD step per sequence.
for n in range(5):
    running_loss = 0
    for i, (x, target) in enumerate(sequence_pairs[:10]):
        optimizer.zero_grad()
        # The target is the input shifted by one step, so the prediction made
        # at the final position has nothing to be compared against.
        scores = model(x.view(len(x), -1))[:-1]
        # NLLLoss takes class indices, not one-hot rows: recover the index of
        # the hot column for every target position.
        target_idx = target.argmax(dim=1)
        loss = loss_fn(scores, target_idx)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        # Report the accumulated loss every 500 sequences. This must be a
        # modulo test: the previous bitwise `i & 500` could never equal 499.
        # NOTE: with only ten sequences per pass this still never fires.
        if i % 500 == 499:
            print("[{} {}] loss = {}".format(n+1, i+1, running_loss))
            running_loss = 0

# Results