<a href="https://colab.research.google.com/github/jonasmue/nlp-playground/blob/master/Char-RNNs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import torch
import os
import numpy as np
from torch import nn
from google.colab import drive
from torch import optim
from tqdm import tqdm
from time import time

# Data Preprocessing

In [0]:
# Mount Google Drive (for now)
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
file_path = os.path.join("drive", "My Drive", "data", "himym.txt")
with open(file_path) as f:
  lines = f.readlines()

In [0]:
text = "".join(lines)
#text = "Dummy Text"

In [0]:
char_set = list(set(text))
char2id = {c:i for i,c in enumerate(char_set)}
id2char = {i:c for i,c in enumerate(char_set)}

In [0]:
text_labels = [char2id[c] for c in text]
num_characters = len(char2id)

In [0]:
num_characters = len(char2id)

In [0]:
def get_next_training_batch(labels, num_chars, batch_size=32):
  eye = np.eye(num_chars)
  for i in range(0, len(labels) - 1, batch_size):
    X = torch.Tensor(eye[labels[i:i + batch_size]]).cuda()
    y = torch.Tensor(labels[i + 1:i + 1 + batch_size]).long().cuda()
    if X.shape[0] < batch_size or y.shape[0] < batch_size:
      return
    yield X, y

# Implementation of GRU and LSTM using PyTorch


In [0]:
class GRU_Layer(nn.Module):
  def __init__(self, num_characters, hidden_size, initial_state=None):
    super().__init__()
    self.hidden_size = hidden_size
    self.num_characters = num_characters

    # -- Init weight matrices
    self.W_z = nn.Parameter(torch.randn((self.hidden_size, self.num_characters)))
    self.W_r = nn.Parameter(torch.randn((self.hidden_size, self.num_characters)))
    self.W_h = nn.Parameter(torch.randn((self.hidden_size, self.num_characters)))

    self.U_z = nn.Parameter(torch.randn((self.hidden_size, self.hidden_size)))
    self.U_r = nn.Parameter(torch.randn((self.hidden_size, self.hidden_size)))
    self.U_h = nn.Parameter(torch.randn((self.hidden_size, self.hidden_size)))

    self.W_y = nn.Parameter(torch.randn((self.num_characters, self.hidden_size)))

    # -- Init biases
    self.b_z = nn.Parameter(torch.zeros(self.hidden_size))
    self.b_r = nn.Parameter(torch.zeros(self.hidden_size))
    self.b_h = nn.Parameter(torch.zeros(self.hidden_size))
    self.b_y = nn.Parameter(torch.zeros(self.num_characters))

    self.h = initial_state

    self._init_weights()

  def _init_weights(self):
        for param in self.parameters():
            param.requires_grad_(True)
            
            if param.data.ndimension() >= 2:
                nn.init.xavier_uniform_(param.data)
            else:
                nn.init.zeros_(param.data)

  def forward(self, x_t):
    if self.h is None:
      self.h = torch.zeros(self.hidden_size).cuda()
    
    z_t = torch.sigmoid(self.W_z.matmul(x_t) + self.U_z.matmul(self.h) + self.b_z)
    r_t = torch.sigmoid(self.W_r.matmul(x_t) + self.U_r.matmul(self.h) + self.b_r)
    h_tilde = torch.tanh(self.W_h.matmul(x_t) + self.U_h.matmul(r_t * self.h) + self.b_h)
    self.h = z_t * self.h + (1 - z_t) * h_tilde
    y_hat = self.W_y.matmul(self.h) + self.b_y
    return y_hat, self.h

  

In [0]:
class GRU(nn.Module):
  def __init__(self, num_characters, n_layers=2, hidden_size=1024, initial_states=None):
    super().__init__()
    if initial_states is not None:
      assert len(initial_states) == n_layers

    self.n_layers = n_layers
    layers_temp = []
    
    for i in range(n_layers):
      initial_state = initial_states[i] if initial_states is not None else None
      layer = GRU_Layer(num_characters, hidden_size, initial_state)
      layers_temp.append(layer)
    
    self.layers = nn.ModuleList(layers_temp)

  def forward(self, X):
    Y_hat = torch.zeros_like(X)
    hidden_states = []

    for i, x_t in enumerate(X):
      for layer in self.layers:
        x_t, hidden_state = layer(x_t)
        hidden_states.append(hidden_state)
      Y_hat[i] = x_t
    return torch.log_softmax(Y_hat, dim=1), hidden_states


  def predict(self, initial_character_encoding, length_of_sequence=100):
    curr_char = initial_character_encoding
    predicted = [curr_char]

    for i in range(length_of_sequence):
      for layer in self.layers:
        curr_char, _ = layer(curr_char)
      predicted.append(curr_char)  
    
    return predicted

---

In [0]:
class LSTM_Layer(nn.Module):
  def __init__(self, num_characters, hidden_size, initial_h=None, initial_C=None):
    super().__init__()
    self.hidden_size = hidden_size
    self.num_characters = num_characters

    # -- Init weight matrices
    self.W_f = nn.Parameter(torch.randn((self.hidden_size, self.num_characters)))
    self.W_i = nn.Parameter(torch.randn((self.hidden_size, self.num_characters)))
    self.W_o = nn.Parameter(torch.randn((self.hidden_size, self.num_characters)))
    self.W_C = nn.Parameter(torch.randn((self.hidden_size, self.num_characters)))

    self.U_f = nn.Parameter(torch.randn((self.hidden_size, self.hidden_size)))
    self.U_i = nn.Parameter(torch.randn((self.hidden_size, self.hidden_size)))
    self.U_o = nn.Parameter(torch.randn((self.hidden_size, self.hidden_size)))
    self.U_C = nn.Parameter(torch.randn((self.hidden_size, self.hidden_size)))

    self.W_y = nn.Parameter(torch.randn((self.num_characters, self.hidden_size)))

    # -- Init biases
    self.b_f = nn.Parameter(torch.zeros(self.hidden_size))
    self.b_i = nn.Parameter(torch.zeros(self.hidden_size))
    self.b_o = nn.Parameter(torch.zeros(self.hidden_size))
    self.b_C = nn.Parameter(torch.zeros(self.hidden_size))
    self.b_y = nn.Parameter(torch.zeros(self.num_characters))

    self.h = initial_h
    self.C = initial_C

    self._init_weights()

  def _init_weights(self):
        for param in self.parameters():
            param.requires_grad_(True)
            
            if param.data.ndimension() >= 2:
                nn.init.xavier_uniform_(param.data)
            else:
                nn.init.zeros_(param.data)

  def forward(self, x_t):
    if self.h is None:
      self.h = torch.zeros(self.hidden_size).cuda()
      self.C = torch.zeros(self.hidden_size).cuda()
    
    f_t = torch.sigmoid(self.W_f.matmul(x_t) + self.U_f.matmul(self.h) + self.b_f)
    i_t = torch.sigmoid(self.W_i.matmul(x_t) + self.U_i.matmul(self.h) + self.b_i)
    o_t = torch.sigmoid(self.W_o.matmul(x_t) + self.U_o.matmul(self.h) + self.b_o)

    C_tilde_t = torch.tanh(self.W_C.matmul(x_t) + self.U_C.matmul(self.h) + self.b_C)
    self.C = self.C * f_t + C_tilde_t * i_t
    self.h = o_t * torch.tanh(self.C)

    y_hat = self.W_y.matmul(self.h) + self.b_y
    return y_hat, self.h, self.C

  

In [0]:
class LSTM(nn.Module):
  def __init__(self, num_characters, n_layers=2, hidden_size=1024, initial_states=None):
    super().__init__()
    if initial_states is not None:
      assert len(initial_states) == n_layers

    self.n_layers = n_layers
    layers_temp = []
    
    for i in range(n_layers):
      initial_state = initial_states[i] if initial_states is not None else None
      layer = LSTM_Layer(num_characters, hidden_size, initial_state)
      layers_temp.append(layer)
    
    self.layers = nn.ModuleList(layers_temp)

  def forward(self, X):
    Y_hat = torch.zeros_like(X)
    hidden_states = []

    for i, x_t in enumerate(X):
      for layer in self.layers:
        x_t, hidden_state, cell_state = layer(x_t)
        hidden_states.append(hidden_state)
        hidden_states.append(cell_state)
      Y_hat[i] = x_t
    return torch.log_softmax(Y_hat, dim=1), hidden_states


  def predict(self, initial_character_encoding, length_of_sequence=100):
    curr_char = initial_character_encoding
    predicted = [curr_char]

    for i in range(length_of_sequence):
      for layer in self.layers:
        curr_char, _, _ = layer(curr_char)
      predicted.append(curr_char)  
    
    return predicted

In [0]:
training_data = text_labels[:len(text_labels) // 2]
len(training_data)//32

41026

In [0]:
#training_data = text_labels

In [0]:
rnn = LSTM(num_characters, 1, 512).cuda()
#rnn = nn.LSTM(input_size=num_characters, hidden_size=512)

In [0]:
n_epochs = 5
batch_size = 32
criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(rnn.parameters(), lr=0.001)

In [0]:
def prediction_to_string(prediction):
  result = ""
  for t in prediction:
    result += id2char[torch.argmax(torch.softmax(t, dim=-1)).item()]
  return result

In [0]:
for epoch in range(n_epochs):
  start = time()
  running_loss = 0
  for i, batch in enumerate(get_next_training_batch(training_data, num_characters, batch_size)):
    optimizer.zero_grad()
    X, y = batch
    #X = X.view([batch_size, 1, num_characters])
    outputs, hidden_states = rnn(X)
    loss = criterion(outputs, y)
    loss.backward()

    # Gradient Clipping
    for param in rnn.parameters():
      if param.grad is None:
          continue
      grad_val = torch.clamp(param.grad, -5, 5)
    optimizer.step()
    
    for hidden in hidden_states:
      hidden.detach_()

    running_loss += loss.item()

    if i and not i % 100:
      print("\n--------------------\nLoss after iteration {}: {}".format(i, running_loss))
      print("Took {} seconds".format(time() - start))
      print("--------------------")
      print("Training Text:", prediction_to_string(X))
      start = time()
      running_loss = 0
      initial_char = "D"
      initial_char_encoding = torch.Tensor(np.eye(num_characters)[char2id[initial_char]]).cuda()
      prediction = rnn.predict(initial_char_encoding, 256)
      print("--------------------")
      print("Sample Prediction:", prediction_to_string(prediction))


--------------------
Loss after iteration 100: 150.93116801977158
Took 3.675058364868164 seconds
--------------------
Training Text: ame called... "Have you met Ted?
--------------------
Sample Prediction: D.ttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttt

--------------------
Loss after iteration 200: 157.99454182386398
Took 3.9156711101531982 seconds
--------------------
Training Text: l: (covers mouth) OH!
Ted: Why a
--------------------
Sample Prediction: Ddttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttt

--------------------
Loss after iteration 300: 172.36598533391953
To

KeyboardInterrupt: ignored