In [None]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
import torch.nn as nn
import torch.nn.functional as F

In [None]:
# Select the compute device: CUDA when torch reports a usable GPU, otherwise the CPU.
# An Apple-silicon branch is intentionally left disabled:
#   torch.backends.mps.is_available() -> print("M1-mps"); device = torch.device("mps")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("CUDA" if device.type == "cuda" else "CPU")

In [None]:
class TestDataset(Dataset) :
  """Map-style dataset that serves ``(text, label)`` pairs from a DataFrame.

  Dataset - English/typo-added/labeled
  """

  def __init__(self, df) :
    # The frame is stored as-is; __getitem__ reads column 0 as the text and
    # column 1 as the label, so the column order of ``df`` matters.
    self.df = df

  def __len__(self) :
    """Number of rows in the wrapped frame."""
    return self.df.shape[0]

  def __getitem__(self, idx):
    """Return the ``(text, label)`` tuple stored at positional row ``idx``."""
    return tuple(self.df.iloc[idx, column] for column in (0, 1))

In [None]:
# Split fractions and loop settings used by the cells below.
train_rate = 0.9   # fraction of the rows that goes to the training split
test_rate = 0.09   # fraction of the rows that goes to the test split
itr = 1            # not referenced in the visible cells - presumably an iteration counter; confirm
p_itr = 100        # not referenced in the visible cells - presumably a logging interval; confirm
epochs = 5
batch = 10         # batch size handed to every DataLoader

In [None]:
# Load the dataset and reduce it to the two columns the notebook uses.
total_df = pd.read_csv('augmented_data/Dataset_aug_complex_10424_.csv', sep=',')
#total_df = pd.read_csv('augmented_data/Dataset_aug_complex_10424_original.csv', sep=',')

# Built as one non-mutating chain: assigning into the column-subset frame in place
# can raise SettingWithCopyWarning, and .assign() always returns a fresh frame.
# Order is kept from the original cell: rows with a missing value in ANY column are
# dropped before the frame is narrowed to "text" and "label".
total_df = (
    total_df
    .dropna()
    .loc[:, ["text", "label"]]
    # Binary target: 1 for "nothate", 0 for every other label value.
    .assign(label=lambda frame: (frame["label"] == "nothate").astype("int64"))
)
print(total_df)
total_dataset = TestDataset(total_df)
total_loader = DataLoader(total_dataset, batch_size=batch, shuffle=True)

In [None]:
# Positional split of total_df in its current row order (no shuffling here):
# the first `test_rate` share of the rows is the test set, the following
# `train_rate` share is the training set, and any remaining rows are unused.
# Plain iloc slicing replaces np.split(DataFrame, ...), which goes through the
# deprecated DataFrame.swapaxes; the resulting frames are the same.
n_total = len(total_df)
test_end = int(test_rate * n_total)
train_end = int(test_rate * n_total + train_rate * n_total)
test_df = total_df.iloc[:test_end]
train_df = total_df.iloc[test_end:train_end]
print(len(test_df), len(train_df))
train_dataset = TestDataset(train_df)
train_loader = DataLoader(train_dataset, batch_size=batch, shuffle=True)
test_dataset = TestDataset(test_df)
test_loader = DataLoader(test_dataset, batch_size=batch, shuffle=True)

# Application of SpellChecker-Seq2Seq Model

In [None]:
# Pipeline: sentence -> word tokenization (whitespace tokenization) -> out-of-vocabulary check for each word.
# Every out-of-vocabulary word of a sentence is collected,
# the collected words are passed through the correction model as one batch to get a list of output words,
# and the original words are then replaced with the output words.
import nltk, re, string
from nltk.tokenize import WhitespaceTokenizer, TreebankWordTokenizer
from nltk.corpus import wordnet as wn
from random import sample

# Character-level vocabulary used by the spelling model.
# 128 + 2: presumably the 128 ASCII code points plus the EOS and SOS markers - confirm against the model.
VOCAB_SIZE = 128 + 2
EOS, SOS, PAD = 128, 129, 0

def preprocess_words(texts, max_len, is_source=True) :
    """Encode each word as a 1-D tensor of character codes, EOS-terminated and PAD-filled.

    Only characters with code points 1..127 are encoded. Anything else is dropped,
    because a raw ``ord`` of 128 or 129 would be indistinguishable from EOS / SOS,
    0 would be indistinguishable from PAD, and larger values fall outside
    VOCAB_SIZE. This is the same range that ``tensor_to_str`` decodes.

    Args:
        texts: iterable of strings (words).
        max_len: number of character slots per word; the caller supplies it
            (``spellcheck`` uses the length of the longest word).
        is_source: when False, SOS is prepended to every encoded word.

    Returns:
        List of 1-D integer tensors, one per word. As long as no word has more
        than ``max_len`` kept characters, every tensor has ``max_len + 1``
        entries (``max_len + 2`` when ``is_source`` is False). Longer words are
        not truncated and produce longer tensors.
    """
    out = []
    for text in texts :
        codes = [ord(c) for c in text if 0 < ord(c) < 128]
        # Pad against the number of characters actually kept, so dropped
        # characters do not shorten the tensor.
        encoded = codes + [EOS] + [PAD] * (max_len - len(codes))
        if not is_source :
            encoded = [SOS] + encoded
        out.append(torch.tensor(encoded))
    return out  #, max_len

def tensor_to_str(t) :
  """Decode a batch of character-code rows into strings.

  Args:
      t: 2-D integer tensor of shape (N, Max_len).

  Returns:
      List of N strings. Within each row only codes 1..127 are turned into
      characters; every other value (0 and anything >= 128) is skipped, and
      decoding does not stop at the first skipped value.
  """
  # One .tolist() call yields plain Python ints. Testing a 0-d tensor with
  # `in range(1, 128)` instead falls back to a linear scan of tensor comparisons
  # for every single character.
  return ["".join(chr(code) for code in row if 1 <= code < 128) for row in t.tolist()]

In [None]:
def spellcheck(words, model, device, model_type="lstm") :
    """Run the spelling-correction model on a batch of words.

    Args:
        words: list of strings (words) to correct.
        model: correction model. It is moved to ``device``; for "lstm" its
            ``predict(src, src_len)`` is decoded, for "rnn" the first element
            of ``model(src, src_len)``.
        device: torch device the model and the input batch are placed on.
        model_type: "lstm" (default) or "rnn".

    Returns:
        List of predicted strings, one per input word. An empty list for empty
        input, and None for an unrecognised ``model_type``.
    """
    # Nothing to correct: return before max() below raises on an empty sequence.
    if len(words) == 0 :
        return []

    #Get model
    model = model.to(device)

    #Preprocess input
    max_len = max(len(w) for w in words)
    src = preprocess_words(words, max_len)

    # NOTE(review): these are the lengths of the encoded tensors returned by
    # preprocess_words (padding included), not the raw word lengths - confirm
    # this is what the model expects.
    src_len = torch.tensor([t.shape[0] for t in src], dtype=torch.int64)
    src = torch.stack(src).to(device)

    #Do the prediction & print
    # Inference only, so no autograd graph needs to be built.
    with torch.no_grad() :
        if model_type=="lstm" :
            prediction = tensor_to_str(model.predict(src, src_len))
        elif model_type=="rnn" :
            prediction = tensor_to_str(model(src, src_len)[0])
        else :
            prediction = None
    print(prediction)
    return prediction

def spell_correction(text, tokenizer, vocab, model) :
    """Replace the out-of-vocabulary words of ``text`` with model predictions.

    Args:
        text: input sentence.
        tokenizer: object exposing ``tokenize(text) -> list of str``.
        vocab: container of known lower-case words. It is queried with ``in``
            once per token, so pass a set (or another re-usable container),
            not a one-shot iterator.
        model: spelling-correction model forwarded to ``spellcheck``.

    Returns:
        ``text`` with every out-of-vocabulary, non-punctuation token replaced by
        its prediction. Replacement is substring-based (``str.replace``), so all
        occurrences of the token are replaced. ``text`` is returned unchanged
        when nothing is out of vocabulary.

    Uses the notebook-level ``device`` for the model call.
    """
    tokenized_words = tokenizer.tokenize(text)
    # re.escape keeps characters such as "\", "]" and "^" literal inside the class.
    re_punkt = re.compile("[" + re.escape(string.punctuation) + "]+")

    out_of_vocab_words = [
        word for word in tokenized_words
        if (word.lower() not in vocab) and not re_punkt.fullmatch(word)
    ]
    # No candidates: skip the model call, which cannot handle an empty batch.
    if not out_of_vocab_words :
        return text

    #pred_words = [word.lower() for word in out_of_vocab_words]
    pred_words = spellcheck(out_of_vocab_words, model, device)

    new_text = text
    for (word, new_word) in zip(out_of_vocab_words, pred_words) :
        print(word)
        # Accumulate on new_text; replacing on the original `text` each time
        # would keep only the last correction.
        new_text = new_text.replace(word, new_word)

    return new_text

# Hyperparams

1. LSTM w/ Attention <br>
lr = 5e-4 <br>
embedding_dim = 512 <br>
hidden_size = 512 <br>
epochs = 10 <br>
batch = 10 <br>

2. RNN(Baseline) <br>
lr = 5e-4 <br>
embedding_dim = 512 <br>
hidden_size = 512 <br>
epochs = 10 <br>
batch = 10 <br>

In [None]:
# Materialise the WordNet lemma names as a set: wn.words() yields them lazily
# (an iterator in current NLTK releases), so repeated `in` tests on the raw
# return value would consume it and later look-ups would give wrong answers.
# A set also makes each membership test O(1).
wordnetdict = set(wn.words(lang='eng'))
tokenizer = TreebankWordTokenizer()

model_path = "models/"
lstm_name = "spelling_lstm.model"
rnn_name = "spelling_base.model"

# NOTE: torch.load unpickles the file - only load checkpoints from a trusted source.
# Recent PyTorch releases default to weights_only=True, which rejects fully pickled
# model objects; pass weights_only=False if this checkpoint is one.
# map_location lets a checkpoint saved on a GPU be opened on a CPU-only machine.
model = torch.load(model_path + lstm_name, map_location=device)


#Do THIS at the preprocessing-part of training loop of the classification model
for text, label in test_loader :
    text_corrected = [spell_correction(t, tokenizer, wordnetdict, model) for t in text]
    print(text)
    print(text_corrected)