In [1]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [2]:
!git clone https://github.com/joseph1723/CS376_Final_Project.git

fatal: destination path 'CS376_Final_Project' already exists and is not an empty directory.


In [3]:
#device = torch.device("cpu")
if torch.cuda.is_available() :
  print("CUDA")
  device = torch.device("cuda")
else :
  print("CPU")
  device = torch.device("cpu")

CUDA


In [4]:
VOCAB_SIZE = 128 + 2
EOS, SOS, PAD = 128, 129, 0

def preprocess(df, column, max_len, is_source=True) :
    texts = df[column]
    #max_len = max([len(t) for t in texts])
    out = []
    for text in texts :
        encoded = [ord(c) for c in text] + [EOS] + [PAD] * (max_len - len(text))
        if not is_source :
            encoded = [SOS] + encoded
        out.append(torch.tensor(encoded))
    return out  #, max_len

In [5]:
train_rate, test_rate = 0.95, 0.05
batch = 10

In [6]:
total_df = pd.read_csv('/content/CS376_Final_Project/augmented_data/Dataset_aug_char_12612.csv', sep=',')[["text","original"]]

max_len = max([max([len(t) for t in total_df["text"]]), max([len(t) for t in total_df["original"]])])
print(max_len)

#out, max_len = preprocess(total_df)
out = preprocess(total_df, 'text', max_len)
total_df['preprocessed'] = out
total_df['valid_len'] = [(len(t)+1) for t in total_df['text']]

out = preprocess(total_df, 'original', max_len, is_source=False)
total_df['target_preprocessed'] = out
total_df['target_len'] = [(len(t)+2) for t in total_df['original']]
print(total_df[:10])

23
          text   original                                       preprocessed  \
0    shiteiXks  shitdicks  [tensor(115), tensor(104), tensor(105), tensor...   
1    sni6dicks  shitdicks  [tensor(115), tensor(110), tensor(105), tensor...   
2    shitdicr8  shitdicks  [tensor(115), tensor(104), tensor(105), tensor...   
3    8hitdicrs  shitdicks  [tensor(56), tensor(104), tensor(105), tensor(...   
4  shnitydicks  shitdicks  [tensor(115), tensor(104), tensor(110), tensor...   
5  shiDtdkicks  shitdicks  [tensor(115), tensor(104), tensor(105), tensor...   
6    sh8tdickJ  shitdicks  [tensor(115), tensor(104), tensor(56), tensor(...   
7    smiDdicks  shitdicks  [tensor(115), tensor(109), tensor(105), tensor...   
8    hstidicks  shitdicks  [tensor(104), tensor(115), tensor(116), tensor...   
9    shitdcisk  shitdicks  [tensor(115), tensor(104), tensor(105), tensor...   

   valid_len                                target_preprocessed  target_len  
0         10  [tensor(129), tensor(115

In [7]:
class TestDataset(Dataset) :
  #Dataset - English/typo-added/labeled
  def __init__(self, df) :
    self.df = df
  
  def __len__(self) :
    return len(self.df)
  
  def __getitem__(self, idx):
    text = self.df.iloc[idx, 0]
    item = (self.df.iloc[idx, 1], self.df.iloc[idx, 2], self.df.iloc[idx, 3], self.df.iloc[idx, 4], self.df.iloc[idx, 5])
    return text, item

In [8]:
test_df, train_df, _ = np.split(total_df, [int(test_rate*len(total_df)), int(test_rate*len(total_df) + train_rate*len(total_df))])
print(len(test_df), len(train_df))

train_dataset = TestDataset(train_df)
train_loader = DataLoader(train_dataset, batch_size=batch, shuffle=True)
test_dataset = TestDataset(test_df)
test_loader = DataLoader(test_dataset, batch_size=batch, shuffle=True)

630 11982


# LSTM Encoder-Decoder (w/ Attention)

In [9]:
def masked_softmax(X, valid_length):
  mask_value = -1e7 

  if len(X.shape) == 2:
    X = X.unsqueeze(1)

  N, n, m = X.shape

  if len(valid_length.shape) == 1:
    valid_length = valid_length.repeat_interleave(n, dim=0)
  else:
    valid_length = valid_length.reshape((-1,))

  mask = torch.arange(m)[None, :].to(X.device) >= valid_length[:, None]
  X.view(-1, m)[mask] = mask_value

  Y = torch.softmax(X, dim=-1)
  return Y

In [10]:
class DotProductAttention(nn.Module): 
  def __init__(self):
      super(DotProductAttention, self).__init__()

  def forward(self, query, key, value, valid_length=None):

    B, n, d = query.shape
    d_sqrt = torch.sqrt(query.new_tensor([d]))
    a = torch.div(torch.bmm(query, torch.transpose(key,1,2)), d_sqrt)

    b = masked_softmax(a, valid_length)
    #print(value.shape, value)
    #rint(b.shape, b)

    attention = torch.bmm(b, value)

    return attention

In [11]:
class Encoder(nn.Module):
  def __init__(self, vocab_size, embedding_dim, hidden_size, device=None):
    super(Encoder, self).__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.enc = nn.LSTM(embedding_dim, hidden_size, batch_first=True, bidirectional=False)  #input_size, hidden_size, num_layers, bias, batch_first(TRUE -> (B,MAX_LEN,emb_dim)), dropout, bi-directional
    self.hidden_size = hidden_size
    
  def forward(self, sources, valid_len):
    #(B,Max_len)
    #print(sources)
    word_embedded = self.embedding(sources)
    packed_input = pack_padded_sequence(word_embedded, valid_len, batch_first=True, enforce_sorted=False)

    N = word_embedded.shape[0]  #(N, Max_len, emb_dim)
    max_len = word_embedded.shape[1]
    
    #(D*num_layers), N, H_out / D=2(bi-directional), num_layers=1, N=batch_size, H_out=hidden_size 
    h = sources.new_zeros(1, N, self.hidden_size).float()
    c = sources.new_zeros(1, N, self.hidden_size).float()

    #output_size : (N, L, D*H_out) when batch_first=True
    outputs, (h, c) = self.enc(packed_input, (h, c))
    packed_output, _ = pad_packed_sequence(outputs, padding_value= 0, batch_first=True, total_length=max_len)

    return packed_output, (h, c)

In [12]:
from numpy.ma.core import nonzero
class Decoder(nn.Module):
  def __init__(self, vocab_size, embedding_dim, hidden_size, device):
    super(Decoder, self).__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.enc = nn.LSTM(embedding_dim+hidden_size, hidden_size, batch_first=True, bidirectional=False)
    self.output_emb = nn.Linear(hidden_size, vocab_size)
    self.att = DotProductAttention()
    self.embedding_dim = embedding_dim
    self.hidden_size = hidden_size
    self.device = device
    
  def forward(self, state, target, valid_len):
    device = self.device
    loss = 0
    preds = []
    enc_output, (h, c), src_len = state
    enc_output = enc_output.to(device)

    #print(target)
    target_embedded = self.embedding(target)
    N, max_len = target_embedded.shape[:2]  #T : MAX sequence-length

    dec_output = enc_output.new_zeros(N,max_len,self.hidden_size).to(device)
    for i in range(max_len) :
      context = self.att(h.transpose(0,1), enc_output, enc_output, valid_length=src_len.to(device))

      dec_input = torch.cat((target_embedded[:,i,:].reshape(N,1,-1), context), dim=2)
      dec_words, (h, c) = self.enc(dec_input, (h, c))    #dec_words : (N,1,hidden_size)
      dec_output[:,i,:] = dec_words.reshape(N,self.hidden_size)   #(N,T,hidden_size)

    preds = self.output_emb(dec_output)   #preds : (N,Max_len,vocab_size)

    loss = F.nll_loss(F.log_softmax(preds[:, :max_len-1].transpose(1,2), dim = 1), target[:, 1:], ignore_index=0, reduction = 'none')
    loss = loss.sum(1).mean()

    preds = preds.argmax(dim=-1)
    # END OF YOUR CODE
    return loss, preds
  
  def predict(self, state, target=None, valid_len=None):
    device = self.device
    pred = None
    enc_output, (h, c), src_len = state
    enc_output = enc_output.to(device)
    N, max_len = enc_output.shape[:2]  #T : MAX sequence-length

    preds = []
    pred_prev = self.embedding(torch.full((N,1,1),fill_value=SOS).to(device)).reshape(N,1,-1)

    for i in range(max_len) :
      context = self.att(h.transpose(0,1), enc_output, enc_output, valid_length=src_len.to(device))
      dec_input = torch.cat((pred_prev, context), dim=2)
      dec_words, (h, c) = self.enc(dec_input, (h, c))    #dec_words : (N,1,hidden_size)
      dec_words_output = self.output_emb(dec_words.to(device)).argmax(dim=-1)   #(hidden_size -> vocab_size)
      preds.append(dec_words_output)
      pred_prev = self.embedding(dec_words_output)    #(vocab_size -> emb_dim)
    
    pred = torch.cat(preds, dim=1).to(device)

    return pred

In [13]:
class NMTLSTM(nn.Module):
  def __init__(self, src_vocab_size, tgt_vocab_size, embedding_dim, hidden_size, device):
    super(NMTLSTM, self).__init__()
    self.enc = Encoder(src_vocab_size, embedding_dim, hidden_size, device)
    self.dec = Decoder(tgt_vocab_size, embedding_dim, hidden_size, device)
    
  def forward(self, src, src_len, tgt, tgt_len):
    outputs, (h, c) = self.enc(src, src_len)
    loss, pred = self.dec((outputs, (h, c), src_len), tgt, tgt_len)
    return loss, pred
  
  def predict(self, src, src_len, tgt=None, tgt_len=None):
    outputs, (h, c) = self.enc(src, src_len)
    pred = self.dec.predict((outputs, (h, c), src_len), tgt, tgt_len)
    return pred

In [14]:
def train_lstm(net, train_loader, lr, epochs, device):
  # training
  net = net.to(device)

  optimizer = torch.optim.Adam(net.parameters(), lr=lr)
  loss_list = []
  print_interval = int(len(train_loader) / 5)
  total_iter = epochs * len(train_loader)
  i = 0
  for e in range(epochs):
    net.train()
    for text, item in train_loader :
      labels, data, valid_len, tgt, tgt_len = item
      #data : (B, M), valid_len : (B)
      data, tgt = data.to(device), tgt.to(device)

      loss, pred = net(data, valid_len, tgt, tgt_len)

      loss_list.append(loss.mean().detach())
      optimizer.zero_grad()
      loss.mean().backward()
      optimizer.step()

      step = i + e * len(train_loader)
      if i % print_interval == 0:
        print('iter {} / {}\tLoss:\t{:.6f}'.format(i, total_iter, loss.mean().detach()))
        print('pred:\t {}\n'.format(pred.detach().cpu()))
        print('tgt:\t {}\n'.format(tgt[:,1:].cpu()))
      i += 1
  return loss_list

In [15]:
lr = 5e-4
embedding_dim = 512
hidden_size = 512
epochs = 20

lstm_net = NMTLSTM(VOCAB_SIZE, VOCAB_SIZE, embedding_dim, hidden_size, device)
lstm_loss_list = train_lstm(lstm_net, train_loader, lr, epochs, device)

iter 0 / 23980	Loss:	40.027039
pred:	 tensor([[ 40,  69, 126, 104,  65,  65,  80,  98,  52,  65,  14,  53,  53,  53,
          53,  53,  53,  53,  53,  53,  53,  53,  53,  53,  53],
        [ 69,  78,  49,  19,  19,  64,  64,  14,  53,  53,  53,  53,  53,  53,
          53,  53,  53,  53,  53,  53,  53,  53,  53,  53,  53],
        [ 40,  78,  20, 117,  18,  52,  65,  52,  56,  14,  53,  53,  53,  53,
          53,  53,  53,  53,  53,  53,  53,  53,  53,  53,  53],
        [ 44,  91,  68,  52,  52, 126,  78,  78,  78,  53,  53,  53,  53,  53,
          53,  53,  53,  53,  53,  53,  53,  53,  53,  53,  53],
        [ 69,  69, 126,  89,  65,  65,  98,  56,  35,  80,   4,  18,  52,  56,
          60,  60,  53,  53,  53,  53,  53,  53,  53,  53,  53],
        [ 69, 126, 126,  64,  64,  30,  14,  53,  53,  53,  53,  53,  53,  53,
          53,  53,  53,  53,  53,  53,  53,  53,  53,  53,  53],
        [ 44,  18,  38, 105,  75, 127,  51, 123,   6,  53,  53,  53,  53,  53,
          53,  53, 

In [16]:
!pip install fastDamerauLevenshtein

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fastDamerauLevenshtein
  Downloading fastDamerauLevenshtein-1.0.7.tar.gz (36 kB)
Building wheels for collected packages: fastDamerauLevenshtein
  Building wheel for fastDamerauLevenshtein (setup.py) ... [?25l[?25hdone
  Created wheel for fastDamerauLevenshtein: filename=fastDamerauLevenshtein-1.0.7-cp37-cp37m-linux_x86_64.whl size=59375 sha256=1a149a0c7f0e8af8dc896f1b3ba74b1b84fdbf57b0f3c949836683f1adf4644b
  Stored in directory: /root/.cache/pip/wheels/3d/f6/ac/23a63dea60e40b449efa973a54f5b1ba766a984be2d7b95964
Successfully built fastDamerauLevenshtein
Installing collected packages: fastDamerauLevenshtein
Successfully installed fastDamerauLevenshtein-1.0.7


In [17]:
from fastDamerauLevenshtein import damerauLevenshtein

def tensor_to_str(t) :
  #tensor t : (N,Max_len)
  N = t.shape[0]
  str_list = []
  for i in range(N) :
    row = t[i,:]
    decoded = "".join([chr(c) for c in row if c in range(1,128)])
    str_list.append(decoded)
  return str_list

In [18]:
def comp_acc(pred, gt, valid_len):
  N, T_gt = gt.shape[:2]
  _, T_pr = pred.shape[:2]
  assert T_gt == T_pr, 'Prediction and target should have the same length.'
  len_mask = torch.arange(T_gt-1).expand(N, T_gt-1)
  len_mask = len_mask < (valid_len[:, None] - 1)
  
  # print(pred.shape)
  pred_crr = (pred[:,:-1] == gt[:,1:]).float() * len_mask.float() # filter out the 'bos' token
  pred_acc = pred_crr.sum(dim=-1) / (valid_len - 1).float() # minus the 'bos' token
  return pred_acc
  
def evaluate_lstm(net, test_loader, device):
  acc_list = []

  i=0
  print_interval = int(len(test_loader)/10)
  for text, item in test_loader :
    labels, data, valid_len, tgt, tgt_len = item
    data, tgt = data.to(device), tgt.to(device)
      
    pred = net.predict(data, valid_len, tgt, tgt_len)

    #pred_acc = comp_acc(pred.detach().cpu(), tgt.detach().cpu(), tgt_len.cpu())
    #acc_list.append(pred_acc)

    pred_str = tensor_to_str(pred)
    dist_list = [damerauLevenshtein(p, l) for p, l in zip(pred_str, list(labels))]
    acc_list.append(torch.tensor(dist_list))

    if i%print_interval == 0 :
      print("preds :", pred_str)
      print("tgt :  ", list(labels))
    #print("levenshtein-Distances :", dist_list)
    i += 1
  
  print("Test complete")
  acc_final = torch.cat(acc_list).mean()
  return acc_final, acc_list


acc_final, acc_list = evaluate_lstm(lstm_net, test_loader, device)
print("Final Accuracy :", acc_final)

preds : ['nigga', 'jackoff', 'sheeet', 'motherfck', 'azzaz', 'jizzziesssss', 'nigs', 'motherfuccsasccscs', 'jagoff', 'knobeadszzz']
tgt :   ['nigra', 'jackoff', 'sheet', 'motherfxck', 'azz', 'jizzes', 'jiggs', 'motherfcks', 'jagoff', 'knobendz']
preds : ['wanker', 'jizzsacss', 'shitterfuckeruckerck', 'dickfaceac', 'shittyfuckerfucker', 'jiggaboo', 'shits', 'sheist', 'cum', 'jigga']
tgt :   ['wnker', 'jizzes', 'shitfudgefucker', 'dickface', 'shitfudgefucker', 'jiggabooboo', 'shit', 'sheet', 'cumz', 'jiggs']
preds : ['azzaz', 'ragreadsadss', 'mofucc', 'jigaboos', 'bitchsassssasss', 'jigaboos', 'dickface', 'reeckhead', 'spook', 'mofuck']
tgt :   ['azz', 'ragheads', 'mofuck', 'jigaboos', 'bitchasses', 'jigaboos', 'dickface', 'peckerhead', 'spook', 'mofuck']
preds : ['bitchty', 'phuckingsickssics', 'carpetmunchersingers', 'wanker', 'weiner', 'shitesicksickssicks', 'jiggass', 'wankiesick', 'shitte', 'cookk']
tgt :   ['bitchers', 'phvckings', 'carpetmunchers', 'wnker', 'wnker', 'shitdicks', '

# LSTM (Single-Directional) w/ Attention
Batch = 10, Epoch = 20

Accuracy (based on Levenshtein-Distance) : 0.6829

In [19]:
torch.save(lstm_net, "/content/spelling_lstm.model")

In [20]:
#example input : list of strings(words)
test_words = ["f@ck", "b!tcH", "moth3rf#ck*r"]

#Get model
model_loaded = torch.load("/content/spelling_lstm.model")
model_loaded.to(device)

#Preprocess input
max_len_input = max([len(i) for i in test_words])
src = preprocess(pd.DataFrame(test_words, columns=['text']), column='text', max_len=max_len_input)
src_len = torch.tensor([t.shape[0] for t in src], dtype=torch.int64)
src = torch.cat([torch.unsqueeze(s, dim=0) for s in src], dim=0).to(device)

#Do the prediction & print
prediction = tensor_to_str(model_loaded.predict(src, src_len))
print(prediction)

['fckkedcking', 'bitchlicker', 'motherfucker']


# Baseline : Bi-Directional RNN

In [24]:
class BaseRNN(nn.Module):
  def __init__(self, vocab_size, embedding_dim, hidden_size, device=None):
    super(BaseRNN, self).__init__()
    self.num_layers = 1
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.enc = nn.RNN(embedding_dim, hidden_size, num_layers=self.num_layers, batch_first=True, bidirectional=True)  #input_size, hidden_size, num_layers, bias, batch_first(TRUE -> (B,MAX_LEN,emb_dim)), dropout, bi-directional
    self.ln1 = nn.Linear(2*hidden_size, vocab_size)

    self.hidden_size = hidden_size
    self.embedding_dim = embedding_dim
    
  def forward(self, sources, valid_len):
    #(B,Max_len)
    word_embedded = self.embedding(sources)
    packed_input = pack_padded_sequence(word_embedded, valid_len, batch_first=True, enforce_sorted=False)

    N = word_embedded.shape[0]  #(N, Max_len, emb_dim)
    max_len = word_embedded.shape[1]
    
    #(D*num_layers), N, H_out / D=2(bi-directional), num_layers=1, N=batch_size, H_out=hidden_size 
    h = sources.new_zeros(2*self.num_layers, N, self.hidden_size).float()
    #c = sources.new_zeros(2*self.num_layers, N, self.hidden_size).float()

    #output_size : (N, L, D*H_out) when batch_first=True
    outputs, h = self.enc(packed_input, h)
    packed_output, _ = pad_packed_sequence(outputs, padding_value= 0, batch_first=True, total_length=max_len)

    #linear_output : (N, L, Vocab_size)
    lin_output = self.ln1(packed_output)

    #Final_Output : (N, L, 1)
    preds = lin_output.argmax(dim=-1)
    return preds, lin_output

In [25]:
def train_RNN(net, train_loader, lr, epochs, device):
  # training
  net = net.to(device)

  optimizer = torch.optim.Adam(net.parameters(), lr=lr)
  loss_list = []
  print_interval = int(len(train_loader) / 5)
  total_iter = epochs * len(train_loader)
  i = 0
  for e in range(epochs):
    net.train()
    for text, item in train_loader :
      labels, data, valid_len, tgt, tgt_len = item

      #data : (B, M), valid_len : (B)
      data, tgt = data.to(device), tgt.to(device)

      pred, output = net(data, valid_len)
      loss = F.nll_loss(F.log_softmax(output.transpose(1,2), dim = 1), tgt[:, 1:], ignore_index=0, reduction = 'none')
      loss = loss.sum(1).mean()
      loss_list.append(loss.mean().detach())
      optimizer.zero_grad()
      loss.mean().backward()
      optimizer.step()

      step = i + e * len(train_loader)
      if i % print_interval == 0:
        print('iter {} / {}\tLoss:\t{:.6f}'.format(i, total_iter, loss.mean().detach()))
        print('pred:\t {}\n'.format(pred.detach().cpu()))
        print('tgt:\t {}\n'.format(tgt[:,1:].cpu()))
      i += 1
  return loss_list

In [26]:
lr = 5e-4
embedding_dim = 512
hidden_size = 512
epochs = 20

base_net = BaseRNN(VOCAB_SIZE, embedding_dim, hidden_size, device)
base_loss_list = train_RNN(base_net, train_loader, lr, epochs, device)

iter 0 / 23980	Loss:	40.613979
pred:	 tensor([[ 88, 114,  80,  64,  38,  33, 112, 112, 112, 112, 112, 112, 112, 112,
         112, 112, 112, 112, 112, 112, 112, 112, 112, 112],
        [  8,  80,  65,  24,  78, 105,  76,  14,  30,  33, 112, 112, 112, 112,
         112, 112, 112, 112, 112, 112, 112, 112, 112, 112],
        [ 24,  30, 128,  37,  61,  93,  53, 112, 112, 112, 112, 112, 112, 112,
         112, 112, 112, 112, 112, 112, 112, 112, 112, 112],
        [ 38,  67,  55,  96,  55,  47,  56,  33, 112, 112, 112, 112, 112, 112,
         112, 112, 112, 112, 112, 112, 112, 112, 112, 112],
        [  4,   9,  30,  22,  29,  88, 116, 112, 112, 112, 112, 112, 112, 112,
         112, 112, 112, 112, 112, 112, 112, 112, 112, 112],
        [109,  85,  43, 125,  99, 112, 112, 112, 112, 112, 112, 112, 112, 112,
         112, 112, 112, 112, 112, 112, 112, 112, 112, 112],
        [ 88, 100,  85,  65,  88,  14,  88,  86,  33, 112, 112, 112, 112, 112,
         112, 112, 112, 112, 112, 112, 112, 112, 

In [27]:
def evaluate_RNN(net, test_loader, device):
  acc_list = []

  i = 0
  print_interval = int(len(test_loader)/10)
  for text, item in test_loader :
    labels, data, valid_len, tgt, tgt_len = item
    data, tgt = data.to(device), tgt.to(device)
    
    #pred : (N,L,1)
    pred, _ = net(data, valid_len)

    pred_str = tensor_to_str(pred)
    dist_list = [damerauLevenshtein(p, l) for p, l in zip(pred_str, list(labels))]
    acc_list.append(torch.tensor(dist_list))

    if i%print_interval == 0 :
      print("preds :", pred_str)
      print("tgt :  ", list(labels))
    #print("levenshtein-Distances :", dist_list)
    i += 1
  
  print("Test complete")
  acc_final = torch.cat(acc_list).mean()
  return acc_final, acc_list

base_acc_final, base_acc_list = evaluate_RNN(base_net, test_loader, device)
print("Final Accuracy :", base_acc_final)

preds : ['wanking', 'peckhharsl', 'motherfkc', 'shiter', 'spiok', 'ccosucka', 'MFers', 'slusterfucking', 'clusterfucking', 'coctits']
tgt :   ['wanking', 'peckerhead', 'motherfxck', 'shite', 'spook', 'coksucka', 'MFers', 'clusterfucking', 'clusterfucking', 'cooties']
preds : ['ladboy', 'jagofff', 'wankkn', 'carretmunchers', 'packerfuugerzle', 'kkobend', 'shittydiiks', 'MFers', 'scieeterfuckface', 'shittydnipdss']
tgt :   ['ladiboy', 'jagoff', 'wanking', 'carpetmunchers', 'packerfudgehead', 'knobendz', 'shittydicks', 'MFers', 'shiesterfuckface', 'shittydicks']
preds : ['cocklicker', 'barsesscicps', 'shitdicks', 'wanking', 'nigaa', 'breastitccss', 'shithiadss', 'dopsh', 'cumz', 'phickings']
tgt :   ['cocklicker', 'breasticles', 'shitdicks', 'wanking', 'nigra', 'breasticles', 'shitheads', 'doosh', 'cumz', 'phvckings']
preds : ['carredmunchers', 'aasseadds', 'dotgshi', 'clusterfucciin', 'shitee', 'clusterfucking', 'wanker', 'bitchers', 'peter', 'seete']
tgt :   ['carpetmunchers', 'ragheads

In [28]:
torch.save(base_net, "/content/spelling_base_rnn.model")

In [29]:
#example input : list of strings(words)
test_words = ["f@ck", "b!tcH", "moth3rf#ck*r"]

#Get model
model_loaded = torch.load("/content/spelling_base_rnn.model")
model_loaded.to(device)

#Preprocess input
max_len_input = max([len(i) for i in test_words])
src = preprocess(pd.DataFrame(test_words, columns=['text']), column='text', max_len=max_len_input)
src_len = torch.tensor([t.shape[0] for t in src], dtype=torch.int64)
src = torch.cat([torch.unsqueeze(s, dim=0) for s in src], dim=0).to(device)

#Do the prediction & print
prediction = tensor_to_str(model_loaded(src, src_len)[0])
print(prediction)

['fuckbbst', 'botchbbh', 'motherfkcker']


# RNN (Bi-Directional)
Batch = 10, Epoch = 20

Accuracy (based on Levenshtein-Distance) : 0.7919