<a href="https://colab.research.google.com/github/jonasmue/nlp-playground/blob/master/20190527-084916-RNNs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import os
import itertools
import numpy as np
from google.colab import drive
from tqdm import tqdm
from time import time
from datetime import datetime

In [0]:
import torch

from torch import optim
from torch import nn

In [0]:
dev = torch.device('cuda')

In [0]:
HIMYM = True

# Data Preprocessing

In [59]:
# Mount Google Drive (for now)
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
if HIMYM:
  file_path = os.path.join("drive", "My Drive", "data", "himym.txt")
  with open(file_path) as f:
    lines = f.readlines()

In [0]:
if not HIMYM:
  # Load shakespeare:
  file_path = os.path.join("drive", "My Drive", "data", "shakespeare")
  lines = []
  for file_name in os.listdir(file_path):
    with open(os.path.join(file_path, file_name)) as f:
      lines += f.readlines()

In [62]:
len(lines)

45558

In [0]:
text = "".join(lines)
#text = "Dummy Text"

In [64]:
print(text[:1000])




01x01 - Pilot


Pilot
Scene One
[Title: The Year 2030]
Narrator: Kids, I'm going to tell you an incredible story. The story of how I met your mother
Son: Are we being punished for something?
Narrator: No
Daughter: Yeah, is this going to take a while?
Narrator: Yes. (Kids are annoyed) Twenty-five years ago, before I was dad, I had this whole other life.
(Music Plays, Title "How I Met Your Mother" appears)
Narrator: It was way back in 2005. I was twenty-seven just starting to make it as an architect and living in New York with my friend Marshall, my best friend from college. My life was good and then Uncle Marshall went and screwed the whole thing up.
Marshall: (Opens ring) Will you marry me.
Ted: Yes, perfect! And then you're engaged, you pop the champagne! You drink a toast! You have s*x on the kitchen floor... Don't have s*x on our kitchen floor.
Marshall: Got it. Thanks for helping me plan this out, Ted.
Ted: Dude, are you kidding? It's you and Lily! I've been there for all the bi

In [65]:
print(repr(text[:256]))

"\n\n\n01x01 - Pilot\n\n\nPilot\nScene One\n[Title: The Year 2030]\nNarrator: Kids, I'm going to tell you an incredible story. The story of how I met your mother\nSon: Are we being punished for something?\nNarrator: No\nDaughter: Yeah, is this going to take a while?\nNa"


In [0]:
char_set = list(set(text))
char2id = {c:i for i,c in enumerate(char_set)}
id2char = {i:c for i,c in enumerate(char_set)}

In [0]:
text_labels = [char2id[c] for c in text]
num_characters = len(char2id)

In [68]:
print(num_characters)

93


In [0]:
def to_one_hot(X_text_labels, num_characters):
  eye = torch.eye(num_characters)
  X = torch.zeros((X_text_labels.shape[0], X_text_labels.shape[1], num_characters))
  for i, sentence_labels in enumerate(X_text_labels):
    X[i] = eye[sentence_labels]
  return X.to(dev)

In [0]:
# Outputs tensor of X with shape [batch_size, seq_len, num_chars] and y with shape [batch_size, seq_len]
def get_next_training_batch(labels, num_chars, seq_len=128, batch_size=32):  
  for batch_offset in range(0, len(labels), batch_size * (seq_len + 1)):
    if len(labels) < batch_offset + batch_size * (seq_len + 1):
      return
    batch = labels[batch_offset:batch_offset + batch_size * (seq_len + 1)]
    X_text_labels = torch.Tensor([batch[i:i+seq_len] for i in range(0, len(batch), seq_len + 1)]).long()
    X_one_hot = to_one_hot(X_text_labels, num_characters)
    y_text_labels = torch.Tensor([batch[i+1:i+seq_len+1] for i in range(0, len(batch), seq_len + 1)]).long().to(dev)
    yield X_one_hot, y_text_labels

In [71]:
%time X, y = next(itertools.islice(get_next_training_batch(text_labels, 1), 0, None))

CPU times: user 2.41 ms, sys: 1 ms, total: 3.41 ms
Wall time: 3.78 ms


# Implementation of GRU and LSTM using PyTorch


#### GRU

In [0]:
class GRU_Layer(nn.Module):
  def __init__(self, num_characters, hidden_size, batch_size, initial_state=None):
    super().__init__()
    self.hidden_size = hidden_size
    self.num_characters = num_characters
    self.batch_size = batch_size

    # -- Init weight matrices
    self.W_z = nn.Parameter(torch.randn((self.num_characters, self.hidden_size)))
    self.W_r = nn.Parameter(torch.randn((self.num_characters, self.hidden_size)))
    self.W_h = nn.Parameter(torch.randn((self.num_characters, self.hidden_size)))

    self.U_z = nn.Parameter(torch.randn((self.hidden_size, self.hidden_size)))
    self.U_r = nn.Parameter(torch.randn((self.hidden_size, self.hidden_size)))
    self.U_h = nn.Parameter(torch.randn((self.hidden_size, self.hidden_size)))

    self.W_y = nn.Parameter(torch.randn((self.hidden_size, self.num_characters)))

    # -- Init biases
    self.b_z = nn.Parameter(torch.zeros(self.hidden_size))
    self.b_r = nn.Parameter(torch.zeros(self.hidden_size))
    self.b_h = nn.Parameter(torch.zeros(self.hidden_size))
    self.b_y = nn.Parameter(torch.zeros(self.num_characters))

    self.h = initial_state

    self._init_weights()

  def _init_weights(self):
        for param in self.parameters():
            param.requires_grad_(True)
            
            if param.data.ndimension() >= 2:
                nn.init.xavier_uniform_(param.data)
            else:
                nn.init.zeros_(param.data)

  def init_hidden(self):
    self.h = torch.zeros((self.batch_size, self.hidden_size)).to(dev)

  def forward(self, x_t):
    # x_t is [batch_size, num_characters]      

    z_t = torch.sigmoid(x_t.mm(self.W_z) + self.h.mm(self.U_z) + self.b_z)
    r_t = torch.sigmoid(x_t.mm(self.W_r) + self.h.mm(self.U_r) + self.b_r)

    h_tilde = torch.tanh(x_t.mm(self.W_h) + (r_t * self.h).mm(self.U_h) + self.b_h)
    self.h = z_t * self.h + (1 - z_t) * h_tilde

    y_hat = self.h.mm(self.W_y) + self.b_y
    return y_hat, self.h

In [0]:
class GRU(nn.Module):
  def __init__(self, num_characters, n_layers=2, hidden_size=1024, batch_size=32, initial_states=None):
    super().__init__()
    if initial_states is not None:
      assert len(initial_states) == n_layers

    self.n_layers = n_layers
    self.batch_size = batch_size
    self.num_characters = num_characters
    layers_temp = []
    
    for i in range(n_layers):
      initial_state = initial_states[i] if initial_states is not None else None
      layer = GRU_Layer(num_characters, hidden_size, batch_size, initial_state)
      layers_temp.append(layer)
    
    self.layers = nn.ModuleList(layers_temp)

  def forward(self, X):
    # X is [seq_len, batch_size, num_characters]
    Y_hat = torch.zeros_like(X)
    hidden_states = []
    
    for layer in self.layers:
      layer.init_hidden()

    for i, x_t in enumerate(X): 
      # enumerate over sequence -> x_t is [batch_size, num_characters]
      assert len(x_t.shape) == 2 and x_t.shape[0] == self.batch_size and x_t.shape[1] == self.num_characters
      for layer in self.layers:
        x_t, hidden_state = layer(x_t)
        hidden_states.append(hidden_state)
      Y_hat[i] = x_t
    return torch.log_softmax(Y_hat, dim=-1), hidden_states


  def predict(self, initial_character_encoding, length_of_sequence=100):
    curr_char = initial_character_encoding
    predicted = [curr_char]

    for layer in self.layers:
      layer.init_hidden()

    for i in range(length_of_sequence):
      for layer in self.layers:
        curr_char, _ = layer(curr_char)
      predicted.append(curr_char[0])  
    
    return predicted

---

#### LSTM

In [0]:
class LSTM_Layer(nn.Module):
  def __init__(self, num_characters, hidden_size, initial_h=None, initial_C=None):
    super().__init__()
    self.hidden_size = hidden_size
    self.num_characters = num_characters

    # -- Init weight matrices
    self.W_f = nn.Parameter(torch.randn((self.hidden_size, self.num_characters)))
    self.W_i = nn.Parameter(torch.randn((self.hidden_size, self.num_characters)))
    self.W_o = nn.Parameter(torch.randn((self.hidden_size, self.num_characters)))
    self.W_C = nn.Parameter(torch.randn((self.hidden_size, self.num_characters)))

    self.U_f = nn.Parameter(torch.randn((self.hidden_size, self.hidden_size)))
    self.U_i = nn.Parameter(torch.randn((self.hidden_size, self.hidden_size)))
    self.U_o = nn.Parameter(torch.randn((self.hidden_size, self.hidden_size)))
    self.U_C = nn.Parameter(torch.randn((self.hidden_size, self.hidden_size)))

    self.W_y = nn.Parameter(torch.randn((self.num_characters, self.hidden_size)))

    # -- Init biases
    self.b_f = nn.Parameter(torch.zeros(self.hidden_size))
    self.b_i = nn.Parameter(torch.zeros(self.hidden_size))
    self.b_o = nn.Parameter(torch.zeros(self.hidden_size))
    self.b_C = nn.Parameter(torch.zeros(self.hidden_size))
    self.b_y = nn.Parameter(torch.zeros(self.num_characters))

    self.h = initial_h
    self.C = initial_C

    self._init_weights()

  def _init_weights(self):
        for param in self.parameters():
            param.requires_grad_(True)
            
            if param.data.ndimension() >= 2:
                nn.init.xavier_uniform_(param.data)
            else:
                nn.init.zeros_(param.data)

  def forward(self, x_t):
    if self.h is None:
      self.h = torch.zeros(self.hidden_size).cuda()
      self.C = torch.zeros(self.hidden_size).cuda()
    
    f_t = torch.sigmoid(self.W_f.matmul(x_t) + self.U_f.matmul(self.h) + self.b_f)
    i_t = torch.sigmoid(self.W_i.matmul(x_t) + self.U_i.matmul(self.h) + self.b_i)
    o_t = torch.sigmoid(self.W_o.matmul(x_t) + self.U_o.matmul(self.h) + self.b_o)

    C_tilde_t = torch.tanh(self.W_C.matmul(x_t) + self.U_C.matmul(self.h) + self.b_C)
    self.C = self.C * f_t + C_tilde_t * i_t
    self.h = o_t * torch.tanh(self.C)

    y_hat = self.W_y.matmul(self.h) + self.b_y
    return y_hat, self.h, self.C

  

In [0]:
class LSTM(nn.Module):
  def __init__(self, num_characters, n_layers=2, hidden_size=1024, initial_states=None):
    super().__init__()
    if initial_states is not None:
      assert len(initial_states) == n_layers

    self.n_layers = n_layers
    layers_temp = []
    
    for i in range(n_layers):
      initial_state = initial_states[i] if initial_states is not None else None
      layer = LSTM_Layer(num_characters, hidden_size, initial_state)
      layers_temp.append(layer)
    
    self.layers = nn.ModuleList(layers_temp)

  def forward(self, X):
    Y_hat = torch.zeros_like(X)
    hidden_states = []

    for i, x_t in enumerate(X):
      for layer in self.layers:
        x_t, hidden_state, cell_state = layer(x_t)
        hidden_states.append(hidden_state)
        hidden_states.append(cell_state)
      Y_hat[i] = x_t
    return torch.log_softmax(Y_hat, dim=1), hidden_states


  def predict(self, initial_character_encoding, length_of_sequence=100):
    curr_char = initial_character_encoding
    predicted = [curr_char]

    for i in range(length_of_sequence):
      for layer in self.layers:
        curr_char, _, _ = layer(curr_char, i==0, i==0)
      predicted.append(curr_char)  
    
    return predicted

# Training

In [0]:
training_data = text_labels

In [0]:
n_epochs = 1000
batch_size = 64
seq_len = 128
hidden_size = 512
num_layers = 2

In [0]:
rnn = GRU(num_characters, num_layers, hidden_size, batch_size).to(dev)
#rnn = nn.GRU(num_characters, 512, 2).to(dev)

In [0]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(rnn.parameters(), lr=0.0003)

In [0]:
def prediction_to_string(prediction):
  result = ""
  for t in prediction:
    result += id2char[torch.argmax(torch.softmax(t, dim=-1)).item()]
  return result

In [87]:
start = time()
outname = os.path.join("drive", "My Drive", "results", str(datetime.now()) + ".txt")
with open(outname, "w") as f:
  f.write("GRU Training {}, num layers: {}, hidden size: {}, batch size: {}, sequence length: {}".format(str(datetime.now()), num_layers, hidden_size, batch_size, seq_len))

for epoch in range(n_epochs):
  running_loss = 0
  for i, batch in enumerate(get_next_training_batch(training_data, num_characters, seq_len, batch_size)):
    optimizer.zero_grad()
    X, y = batch
    # X is of shape [batch_size, seq_len, num_characters] -> we want it to be [seq_len, batch_size, num_characters]
    X = X.permute(1,0, -1)
    outputs, hidden_states = rnn(X)
    loss = criterion(outputs.permute(1,2,0), y)
    loss.backward()

    # Gradient Clipping
    for param in rnn.parameters():
      if param.grad is None:
          continue
      grad_val = torch.clamp(param.grad, -5, 5)
    optimizer.step()
    
    for hidden in hidden_states:
      hidden.detach_()

    running_loss += loss.item()

  with torch.no_grad():
    print("\n--------------------\nLoss after epoch {}: {}".format(epoch + 1, running_loss))
    print("Took {} seconds".format(time() - start))
    print("--------------------")
    start = time()
    initial_char = "\n"
    initial_char_encoding = torch.Tensor(torch.eye(num_characters)[char2id[initial_char]]).view(1, num_characters).to(dev)
    prediction = rnn.predict(initial_char_encoding, 256)
    print("--------------------")
    pred_string = prediction_to_string(prediction)
    print("Sample Prediction:", pred_string)
    with open(outname, "a") as f:
      f.write("\n\n--------------------\nLoss after epoch {}: {}\n".format(epoch + 1, running_loss))
      f.write(pred_string)
    running_loss = 0



--------------------
Loss after epoch 1: 837.3366882801056
Took 76.60003638267517 seconds
--------------------
--------------------
Sample Prediction: 
Trintttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttt

--------------------
Loss after epoch 2: 622.1644778251648
Took 77.20012426376343 seconds
--------------------
--------------------
Sample Prediction: 
TS000000000000000000000000000000000000000000000000000000000000000
































































































































































































--------------------
Loss after epoch 3: 558.660736322403
Took 76.53534245491028 seconds
--------------------
--------------------
Sample Prediction: 
TS0000IIT




















KeyboardInterrupt: ignored