In [None]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from transformers import BertTokenizer, BertForSequenceClassification, BertModel

In [None]:
# Run on a CUDA GPU when PyTorch can see one, otherwise on the CPU, and print
# which backend was chosen ("CUDA" or "CPU").
# A disabled Apple-silicon branch used to sit between the two cases:
#   elif torch.backends.mps.is_available(): print("M1-mps"); device = torch.device("mps")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device.type.upper())

In [None]:
# Character-level vocabulary: ids 0..127 are raw code points (id 0 doubles as
# padding), followed by two special markers.
VOCAB_SIZE = 128 + 2
EOS, SOS, PAD = 128, 129, 0

def preprocess(df, is_source=True) :
    """Encode every string in df['text'] as a fixed-length tensor of character ids.

    Each text becomes ``[ord(c) for c in text] + [EOS]``, right-padded with PAD
    to the length of the longest text. When ``is_source`` is False an SOS id is
    prepended, so those tensors are one element longer than source tensors.

    Args:
        df: object indexable by 'text' that yields strings (e.g. a DataFrame).
        is_source: True for encoder inputs, False for decoder targets.

    Returns:
        (out, max_len): list of 1-D integer tensors, and the length in
        characters of the longest text (excluding EOS/SOS). An empty input
        gives ``([], 0)``.

    NOTE(review): ord(c) is used unchecked. Code points 128 and 129 collide
    with EOS/SOS, and anything >= VOCAB_SIZE is outside an embedding table of
    size VOCAB_SIZE. Confirm the corpus is pure ASCII.
    """
    texts = df['text']
    # default=0 keeps an empty frame from raising "max() arg is an empty sequence".
    max_len = max((len(t) for t in texts), default=0)
    out = []
    for text in texts :
        encoded = [ord(c) for c in text] + [EOS] + [PAD] * (max_len - len(text))
        if not is_source :
            encoded = [SOS] + encoded
        out.append(torch.tensor(encoded))
    return out, max_len

In [None]:
# Fractions of the corpus used for training and for testing (they sum to 0.99).
train_rate = 0.9
test_rate = 0.09

# Loop settings. `itr` and `p_itr` are not referenced by the cells shown here.
itr, p_itr = 1, 100
epochs, batch = 5, 5

In [None]:
# Read the augmented corpus, drop rows containing any missing value, and keep
# only the text and its label.
total_df = pd.read_csv('augmented_data/Dataset_aug_complex_10424_.csv', sep=',')
total_df.dropna(inplace=True)
total_df = total_df[["text", "label"]]

# Binary label: 1 for "nothate", 0 for every other value.
total_df["label"] = [int(lbl == "nothate") for lbl in total_df["label"]]

# Encoder inputs: padded character ids plus the unpadded character count
# (the count excludes the EOS id that preprocess appends).
out, max_len = preprocess(total_df)
total_df['preprocessed'] = out
total_df['valid_len'] = list(map(len, total_df['text']))

# Collects the longest-text length of each corpus.
maxlens = [max_len]

In [None]:
# Read the original corpus and attach it to total_df as decoder targets.
# The new columns are assigned positionally, so target_df must have exactly as
# many rows as total_df has after its dropna().
target_df = pd.read_csv('augmented_data/Dataset_aug_complex_10424_original.csv', sep=',')
out, max_len = preprocess(target_df, is_source=False)
total_df['target_preprocessed'] = out
total_df['target_len'] = list(map(len, target_df['text']))

# Longest text across both corpora, then a quick sanity print of the column count.
maxlens.append(max_len)
max_len = max(maxlens)
print(max_len)
print(len(total_df.columns))

In [None]:
class TestDataset(Dataset) :
  """Map-style dataset that serves DataFrame rows by position.

  Columns are read by index, not by name. In this notebook the frame is built
  as (text, label, preprocessed, valid_len, target_preprocessed, target_len),
  so the column order matters.
  """

  def __init__(self, df) :
    self.df = df

  def __len__(self) :
    return len(self.df)

  def __getitem__(self, idx):
    """Return (column 0, tuple of columns 1..5) for row `idx`."""
    frame = self.df
    text = frame.iloc[idx, 0]
    item = tuple(frame.iloc[idx, col] for col in range(1, 6))
    return text, item

In [None]:
# Contiguous, unshuffled split: the first `test_rate` share of rows is the test
# set, the next `train_rate` share is the training set, and any remainder is
# left unused. Plain iloc slices give the same boundaries as
# np.split(total_df, [test_end, train_end]) without going through
# DataFrame.swapaxes, which pandas has deprecated.
n_total = len(total_df)
test_end = int(test_rate*n_total)
train_end = int(test_rate*n_total + train_rate*n_total)
test_df = total_df.iloc[:test_end]
train_df = total_df.iloc[test_end:train_end]
print(len(test_df), len(train_df))

train_dataset = TestDataset(train_df)
train_loader = DataLoader(train_dataset, batch_size=batch, shuffle=True)
test_dataset = TestDataset(test_df)
test_loader = DataLoader(test_dataset, batch_size=batch, shuffle=True)

# LSTM Encoder-Decoder

In [None]:
def masked_softmax(X, valid_length):
  """
  Softmax over the last axis of X, ignoring positions at or beyond valid_length.

  inputs:
    X: 3-D tensor (N, n, m); a 2-D tensor (N, m) is treated as (N, 1, m)
    valid_length: None, a 1-D tensor (N,) applied to every one of the n rows of
      an example, or a 2-D tensor (N, n) giving one length per row

  outputs:
    tensor of shape (N, n, m); masked positions get (numerically) zero weight.
    With valid_length=None this is an ordinary softmax.

  The caller's X is not modified.
  """
  mask_value = -1e7

  if len(X.shape) == 2:
    X = X.unsqueeze(1)

  # Nothing to mask: DotProductAttention passes None by default.
  if valid_length is None:
    return torch.softmax(X, dim=-1)

  N, n, m = X.shape

  valid_length = valid_length.to(X.device)
  if len(valid_length.shape) == 1:
    valid_length = valid_length.repeat_interleave(n, dim=0)
  else:
    valid_length = valid_length.reshape((-1,))

  # mask[r, j] is True where column j lies beyond the valid length of flattened row r.
  mask = torch.arange(m, device=X.device)[None, :] >= valid_length[:, None]
  # Out-of-place fill, so the caller's tensor is left intact and
  # non-contiguous inputs are accepted.
  masked = X.reshape(-1, m).masked_fill(mask, mask_value).reshape(N, n, m)

  return torch.softmax(masked, dim=-1)

In [None]:
class DotProductAttention(nn.Module):
  """Scaled dot-product attention: masked_softmax(Q K^T / sqrt(d)) V, loop-free."""

  def __init__(self):
    super().__init__()

  def forward(self, query, key, value, valid_length=None):
    """
    inputs:
      query: tensor of size (B, n, d)
      key: tensor of size (B, m, d)
      value: tensor of size (B, m, dim_v)
      valid_length: (B, ), forwarded unchanged to masked_softmax

      B is the batch size, n the number of queries, m the number of
      <key, value> pairs, d the query feature dimension and dim_v the value
      feature dimension.

    outputs:
      attention: tensor of size (B, n, dim_v), weighted sum of values
    """
    _, _, feat_dim = query.shape
    # Built from `query` so the scale shares its dtype and device.
    scale = torch.sqrt(query.new_tensor([feat_dim]))
    scores = torch.div(torch.bmm(query, key.transpose(1, 2)), scale)
    weights = masked_softmax(scores, valid_length)
    return torch.bmm(weights, value)

In [None]:
class Encoder(nn.Module):
  """Embeds token ids and runs them through a single-layer bidirectional LSTM.

  Args:
    vocab_size: number of rows in the embedding table.
    embedding_dim: size of each token embedding.
    hidden_size: LSTM hidden size per direction.
    device: accepted for signature compatibility; not used by this class.
  """
  def __init__(self, vocab_size, embedding_dim, hidden_size, device=None):
    super(Encoder, self).__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    # batch_first=True -> inputs/outputs are (N, L, feature); bidirectional -> 2 directions.
    self.enc = nn.LSTM(embedding_dim, hidden_size, batch_first=True, bidirectional=True)
    self.hidden_size = hidden_size

  def forward(self, sources, valid_len):
    """Encode a padded batch.

    Args:
      sources: (N, max_len) integer token ids.
      valid_len: per-example unpadded lengths, as a tensor on any device or a
        list of ints.

    Returns:
      (outputs, (h, c)): outputs is (N, max_len, 2*hidden_size) with zeros at
      padded steps; h and c are the final states, each (2, N, hidden_size).
    """
    word_embedded = self.embedding(sources)   # (N, max_len, embedding_dim)
    N, max_len = word_embedded.shape[:2]

    # pack_padded_sequence requires lengths on the CPU; callers in this
    # notebook move them to the training device first.
    lengths = torch.as_tensor(valid_len).cpu()
    packed_input = pack_padded_sequence(word_embedded, lengths, batch_first=True, enforce_sorted=False)

    # Zero initial states, shape (num_directions * num_layers, N, hidden_size),
    # created with the embeddings' dtype and device.
    h = word_embedded.new_zeros(2, N, self.hidden_size)
    c = word_embedded.new_zeros(2, N, self.hidden_size)

    packed_output, (h, c) = self.enc(packed_input, (h, c))
    # total_length restores the original padded width even when the longest
    # sequence in the batch is shorter than max_len.
    outputs, _ = pad_packed_sequence(packed_output, padding_value=0, batch_first=True, total_length=max_len)

    return outputs, (h, c)

In [None]:
class Decoder(nn.Module):
  """Teacher-forced LSTM decoder that scores next-token predictions.

  Args:
    vocab_size: size of the embedding table and of the output distribution.
    embedding_dim: size of each token embedding.
    hidden_size: LSTM hidden size per direction.
    device: accepted for signature compatibility; not used by this class.

  NOTE(review): the LSTM is bidirectional and is run over the whole target
  sequence at once, so the output at step t also depends on target tokens after
  t, including the token it is scored against. Confirm this is intended.
  """
  def __init__(self, vocab_size, embedding_dim, hidden_size, device):
    super(Decoder, self).__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.enc = nn.LSTM(embedding_dim, hidden_size, batch_first=True, bidirectional=True)
    self.output_emb = nn.Linear(2*hidden_size, vocab_size)
    self.hidden_size = hidden_size

  def forward(self, state, target, valid_len):
    """Run the decoder over `target` and compute the sequence loss.

    Args:
      state: (enc_output, (h, c), src_len). Only (h, c) is used, as the initial
        LSTM state; the other two entries are ignored by this implementation.
      target: (N, T) integer token ids.
      valid_len: per-example lengths fed to pack_padded_sequence, as a tensor on
        any device or a list of ints.

    Returns:
      (loss, preds): loss is a scalar, the per-example sum of token negative
      log-likelihoods averaged over the batch, with id 0 ignored; preds is the
      (N, T) argmax token id at every step.
    """
    _, (h, c), _ = state

    target_embedded = self.embedding(target)
    max_len = target_embedded.shape[1]   # T: padded target length

    # pack_padded_sequence requires lengths on the CPU; callers in this
    # notebook move them to the training device first.
    lengths = torch.as_tensor(valid_len).cpu()
    packed_input = pack_padded_sequence(target_embedded, lengths, batch_first=True, enforce_sorted=False)

    dec_output, (h, c) = self.enc(packed_input, (h, c))
    dec_output, _ = pad_packed_sequence(dec_output, padding_value=0, batch_first=True, total_length=max_len)

    logits = self.output_emb(dec_output)   # (N, T, vocab_size)

    # The output at step t is scored against target[:, t+1]. nll_loss wants the
    # class axis second, hence the transpose to (N, vocab_size, T-1).
    log_probs = F.log_softmax(logits[:, :max_len-1].transpose(1, 2), dim=1)
    loss = F.nll_loss(log_probs, target[:, 1:], ignore_index=0, reduction='none')
    loss = loss.sum(1).mean()

    preds = logits.argmax(dim=-1)
    return loss, preds
  
  # def predict(self, state, target, valid_len):
  #   pred = None
  #   enc_output, (h, c), src_len = state
  #   enc_output = enc_output.to(device)

  #   target_embedded = self.embedding(target)
  #   N, max_len = target_embedded.shape[:2]  #T : MAX sequence-length

  #   dec_valid = valid_len.new_tensor([1 for k in valid_len])
  #   pred_prev = target_embedded[:, :1].reshape(N,1,-1)  #(N,1,embedding_dim)
  #   preds = []

  #   for i in range(max_len+1) :
  #     dec_input = pred_prev
  #     dec_words, (h, c) = self.enc((dec_input, dec_valid), (h, c))    #dec_words : (N,1,hidden_size)
  #     dec_words_output = self.output_emb(dec_words.to(device)).argmax(dim=-1)   #(hidden_size -> vocab_size)
  #     preds.append(dec_words_output)
  #     pred_prev = self.embedding(dec_words_output)    #(vocab_size -> emb_dim)
    
  #   pred = torch.cat(preds, dim=1).to(device)
  #   # END OF YOUR CODE

  #   return pred

In [None]:
# class Decoder(nn.Module):
#   def __init__(self, vocab_size, embedding_dim, hidden_size, device):
#     super(Decoder, self).__init__()
#     self.embedding = nn.Embedding(vocab_size, embedding_dim)
#     self.enc = nn.LSTM(embedding_dim, hidden_size, batch_first=True, bidirectional=True)
#     self.att = DotProductAttention()
#     self.output_emb = nn.Linear(hidden_size, vocab_size)
#     self.hidden_size = hidden_size
    
#   def forward(self, state, target, valid_len):
#     loss = 0
#     preds = []
#     enc_output, (h, c), src_len = state
#     enc_output = enc_output.to(device)

#     target_embedded = self.embedding(target)
#     N, T = target_embedded.shape[:2]  #T : MAX sequence-length

#     dec_output = enc_output.new_zeros(N,T,2*self.hidden_size).to(device)
#     dec_valid = valid_len.new_tensor([1 for k in valid_len])
#     #H : (2, N, hidden_size)
#     print(h.transpose(0,1).reshape(N,1,2*self.hidden_size).shape)
    
#     for i in range(T+1) :
#       context = self.att(h.transpose(0,1).reshape(N,1,2*self.hidden_size), enc_output, enc_output, src_len)
#       dec_input = torch.cat((target_embedded[:,i,:].reshape(N,1,-1), context), dim=2)
#       dec_words, (h, c) = self.enc((dec_input, dec_valid), (h, c))    #dec_words : (N,1,hidden_size)
#       dec_output[:,i,:] = dec_words.reshape(N,self.hidden_size)   #dec_output : decoded-predictions w/ attention, (N,T,hidden_size)
    
#     preds = self.output_emb(dec_output)   #preds : (N,T,vocab_size)
#     loss = F.nll_loss(F.log_softmax(preds[:, :T-1].transpose(1,2), dim = 1), target[:, 1:], ignore_index=0, reduction = 'none')
#     loss = loss.sum(1).mean()

#     preds = preds.argmax(dim=-1)
#     # END OF YOUR CODE
#     return loss, preds
  
#   def predict(self, state, target, valid_len):
#     pred = None
#     enc_output, (h, c), src_len = state
#     enc_output = enc_output.to(device)
#     target_embedded = self.embedding(target)
#     N, T = target_embedded.shape[:2]  #T : MAX sequence-length

#     dec_valid = valid_len.new_tensor([1 for k in valid_len])
#     pred_prev = target_embedded[:, :1].reshape(N,1,-1)  #(N,1,embedding_dim)
#     preds = []

#     for i in range(T+1) :
#       context = self.att(h.reshape(N,1,2*self.hidden_size), enc_output, enc_output, src_len)
#       dec_input = torch.cat((pred_prev, context), dim=2)
#       dec_words, (h, c) = self.enc((dec_input, dec_valid), (h, c))    #dec_words : (N,1,hidden_size)
#       dec_words_output = self.output_emb(dec_words.to(device)).argmax(dim=-1)   #(hidden_size -> vocab_size)
#       preds.append(dec_words_output)
#       pred_prev = self.embedding(dec_words_output)    #(vocab_size -> emb_dim)
    
#     pred = torch.cat(preds, dim=1).to(device)
#     # END OF YOUR CODE

#     return pred

In [None]:
class NMTLSTM(nn.Module):
  """Sequence-to-sequence model: an Encoder whose final state seeds a Decoder."""

  def __init__(self, src_vocab_size, tgt_vocab_size, embedding_dim, hidden_size, device):
    super().__init__()
    self.enc = Encoder(src_vocab_size, embedding_dim, hidden_size, device)
    self.dec = Decoder(tgt_vocab_size, embedding_dim, hidden_size, device)

  def forward(self, src, src_len, tgt, tgt_len):
    """Encode `src`, decode against `tgt`, and return the decoder's (loss, pred)."""
    enc_outputs, enc_state = self.enc(src, src_len)
    # The decoder takes the encoder outputs, its final (h, c) and the source
    # lengths bundled into one state tuple.
    dec_state = (enc_outputs, enc_state, src_len)
    loss, pred = self.dec(dec_state, tgt, tgt_len)
    return loss, pred

In [None]:
def train_lstm(net, train_loader, lr, epochs, device):
  """Train `net` with Adam and return the loss of every optimisation step.

  Args:
    net: module whose forward(data, valid_len, tgt, tgt_len) returns (loss, pred).
    train_loader: sized iterable of (text, item) batches, where item unpacks to
      (labels, data, valid_len, tgt, tgt_len) tensors.
    lr: Adam learning rate.
    epochs: number of passes over train_loader.
    device: device the model and all batch tensors are moved to.

  Returns:
    list with one detached scalar loss tensor per batch, in training order.
  """
  net = net.to(device)

  optimizer = torch.optim.Adam(net.parameters(), lr=lr)
  loss_list = []
  steps_per_epoch = len(train_loader)
  # Logging once per `steps_per_epoch` batches means once per epoch (first batch).
  print_interval = steps_per_epoch
  total_iter = epochs * steps_per_epoch
  for e in range(epochs):
    net.train()
    for i, (text, item) in enumerate(train_loader):
      # data: (B, M), valid_len: (B)
      labels, data, valid_len, tgt, tgt_len = (t.to(device) for t in item)

      loss, pred = net(data, valid_len, tgt, tgt_len)
      loss = loss.mean()

      optimizer.zero_grad()
      loss.backward()
      optimizer.step()
      loss_list.append(loss.detach())

      # Global step across epochs. The log used to print the per-epoch index
      # `i`, which is always 0 on the batches that are logged.
      step = i + e * steps_per_epoch
      if i % print_interval == 0:
        print('iter {} / {}\tLoss:\t{:.6f}'.format(step, total_iter, loss.detach()))
        print('pred:\t {}\n'.format(pred.detach().cpu()))
        print('tgt:\t {}\n'.format(tgt.cpu()))
  return loss_list

In [None]:
# Optimiser step size and model widths.
lr, embedding_dim, hidden_size = 1e-3, 128, 256

# Source and target share the same character vocabulary.
lstm_net = NMTLSTM(
    VOCAB_SIZE,
    VOCAB_SIZE,
    embedding_dim,
    hidden_size,
    device,
)

# Train and keep the per-batch losses returned by train_lstm.
lstm_loss_list = train_lstm(lstm_net, train_loader, lr, epochs, device)

In [None]:
def comp_acc(pred, gt, valid_len):
  """Per-example token accuracy over the first `valid_len` positions.

  Args:
    pred: (N, T) predicted token ids.
    gt: (N, T) reference token ids.
    valid_len: (N,) number of leading positions to score per example.

  Returns:
    (N,) float tensor: matches among the first valid_len positions divided by
    (valid_len - 1). The "- 1" discounts one start-of-sequence slot, so the
    result only stays within [0, 1] if position 0 never counts as a match.
    Examples with valid_len <= 1 use a denominator of 1, so they score a
    finite value instead of nan/inf.
  """
  N, T_gt = gt.shape[:2]
  _, T_pr = pred.shape[:2]
  assert T_gt == T_pr, 'Prediction and target should have the same length.'

  # Build the mask on gt's device, so inputs need not be on the CPU.
  valid_len = torch.as_tensor(valid_len, device=gt.device)
  positions = torch.arange(T_gt, device=gt.device).expand(N, T_gt)
  len_mask = positions < valid_len[:, None]   # True inside the scored prefix

  pred_crr = (pred == gt).float() * len_mask.float()   # hits inside the prefix only
  denom = (valid_len - 1).clamp(min=1).float()         # never divide by zero
  pred_acc = pred_crr.sum(dim=-1) / denom
  return pred_acc
  
def evaluate_lstm(net, test_loader, device):
  """Score `net` on every batch of `test_loader` with comp_acc.

  Args:
    net: trained model. If it defines predict(data, valid_len, tgt, tgt_len),
      that is used; otherwise the forward pass supplies the predictions.
    test_loader: iterable of (text, item) batches, where item unpacks to
      (labels, data, valid_len, tgt, tgt_len) tensors.
    device: device the batch tensors are moved to.

  Returns:
    (acc_list, acc_final): the per-batch accuracy tensors, and their mean over
    all examples as a scalar tensor (nan if the loader yields no batches).
  """
  acc_list = []

  # Evaluate in eval mode without autograd, then restore the previous mode.
  was_training = net.training
  net.eval()
  try:
    with torch.no_grad():
      for text, item in test_loader :
        labels, data, valid_len, tgt, tgt_len = (t.to(device) for t in item)

        if hasattr(net, "predict"):
          pred = net.predict(data, valid_len, tgt, tgt_len)
        else:
          # NMTLSTM defines no predict(), so use the teacher-forced forward
          # pass. Its decoder scores the output at step t against
          # tgt[:, t+1], so shift right by one to line up with tgt. Slot 0 is
          # filled with id 0 so the leading start token is never counted as a
          # hit (comp_acc divides by valid_len - 1).
          _, step_pred = net(data, valid_len, tgt, tgt_len)
          pred = torch.cat([torch.zeros_like(step_pred[:, :1]), step_pred[:, :-1]], dim=1)

        pred_acc = comp_acc(pred.detach().cpu(), tgt.detach().cpu(), tgt_len.cpu())
        acc_list.append(pred_acc)
  finally:
    net.train(was_training)

  print("Test complete")
  # torch.cat rejects an empty list, so report nan when nothing was evaluated.
  acc_final = torch.cat(acc_list).mean() if acc_list else torch.tensor(float('nan'))
  # acc_final used to be computed and then dropped (acc_list was returned twice).
  return acc_list, acc_final


# Run the evaluation on the held-out loader.
# NOTE: evaluate_lstm returns a 2-tuple, so `acc_final` is bound to that whole
# tuple rather than to a single accuracy value.
acc_final = evaluate_lstm(lstm_net, test_loader, device)