#### This notebook originates from the [work](https://www.kaggle.com/code/nhttinnguynbch/commonlit-ess-lgbm-autocorrect-deberta-v3-tuned) of [@nhttinnguynbch](https://www.kaggle.com/nhttinnguynbch),<br>which in turn originates from the [work](https://www.kaggle.com/code/siddhvr/commonlit-ess-lgbm-autocorrect-deberta-v3-tuned) of [@siddhvr](https://www.kaggle.com/siddhvr).

In this notebook, I run the preprocessing separately from the submission notebook,<br>
so that I can save some time when submitting the actual notebook I am working on.<br>
<br>
Half of this notebook was written by [@siddhvr](https://www.kaggle.com/siddhvr) and the rest by me, based on this [notebook](https://www.kaggle.com/code/jasonheesanglee/spellcheck-tool-comparison) that compares spellchecking tools.<br>
However, I have since found out that this was not the key to achieving a high score in this competition.

In [None]:
# !pip install -q "/kaggle/input/autocorrect/autocorrect-2.6.1.tar"
# !pip install -q "/kaggle/input/pyspellchecker/pyspellchecker-0.7.2-py3-none-any.whl"
# !pip install -q '/kaggle/input/wheel-downloader/wheelhouse/NLP/editdistpy-0.1.3.tar.gz'
# !pip install -q '/kaggle/input/symspell-677/editdistpy-0.1.3-cp310-cp310-linux_x86_64.whl'

In [None]:
!pip install -q autocorrect pyspellchecker symspellpy

In [None]:
from typing import List
import numpy as np
import pandas as pd
import warnings
import logging
import os
import shutil
import json
import transformers
import sentencepiece
import pkg_resources

from transformers import AutoModel, AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
# from transformers import T5ForConditionalGeneration, T5TokenizerFast, T5Config
from datasets import Dataset,load_dataset, load_from_disk
from transformers import TrainingArguments, Trainer
from datasets import load_metric, disable_progress_bar
from sklearn.metrics import mean_squared_error
import torch
from sklearn.model_selection import KFold, GroupKFold, StratifiedKFold
from tqdm import tqdm

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from collections import Counter
import spacy
import re
from autocorrect import Speller
from spellchecker import SpellChecker
from symspellpy import SymSpell, Verbosity
import lightgbm as lgb
import optuna


warnings.simplefilter("ignore")
logging.disable(logging.ERROR)
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
disable_progress_bar()
tqdm.pandas()

In [None]:
def seed_everything(seed: int) -> None:
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and every CUDA
    device), exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic
    behaviour.

    Args:
        seed: Value used to seed all generators.
    """
    import random, os
    import numpy as np
    import torch

    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Also covers multi-GPU sessions; harmless when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different kernels from run to
    # run, which defeats the deterministic flag above, so it must stay off.
    torch.backends.cudnn.benchmark = False
    
seed_everything(seed=42)

In [None]:
# Checkpoint names, addressed by position elsewhere in the notebook
# (the trailing number on each line is the list index).
files = [
    'debertav3base',                      # 0
    'albert-large-v2',                    # 1
    'bert-base-uncased',                  # 2
    'bert-large-uncased',                 # 3
    'distilroberta-base',                 # 4
    'distilbert-base-uncased',            # 5
    'google-electra-base-discriminator',  # 6
    'facebook-bart-base',                 # 7  (not working)
    'facebook-bart-large',                # 8
    'funnel-transformer-small',           # 9
    'funnel-transformer-large',           # 10
    'roberta-base',                       # 11
    'roberta-large',                      # 12
    't5-base',                            # 13 (don't use)
    't5-large',                           # 14 (don't use)
    'xlnet-base-cased',                   # 15
    'xlnet-large-cased',                  # 16
]

In [None]:
# Notebook-wide switches.
IS_DEBUG = True
OPTUNA = False
FOLD = 'G_FOLD'  # 'G_FOLD' or 'S_FOLD'

SEP_TKN = ' #### '  # ' [SEP]' or ' #### '

# No [CLS] prefix is used together with the ' #### ' separator.
CLS_TKN = '' if SEP_TKN == ' #### ' else ' [CLS] '
    
class CFG:
    """Training hyper-parameters; most values shrink when ``IS_DEBUG`` is set."""
    model_name=files[0]
    learning_rate=0.000016   #0.000015
    weight_decay=0.007        #0.02
    hidden_dropout_prob=0.007
    # Correctly spelled name, matching the Hugging Face config key
    # ``attention_probs_dropout_prob``.
    attention_probs_dropout_prob=0.007
    # Backward-compatible alias for the original, misspelled attribute name.
    attention_probs_dropout_mprob=attention_probs_dropout_prob
    num_train_epochs=1 if IS_DEBUG else 4
    n_splits= 2 if IS_DEBUG else 4
    batch_size= 2 if IS_DEBUG else 8
    random_seed=42
    save_steps=1000 if IS_DEBUG else 100
    max_length= 10 if IS_DEBUG else 512

In [None]:
# Names of the available misspelling-handling strategies.
method = ['py_and_sym', 'pyspell_only', 'symspell_only']

# Candidate SymSpell frequency dictionaries.
freq_dict_list = [
    "/content/symspell-677/symspell_freq_dict.txt",
    "/content/symspell-677/frequency_dictionary_en_82_765.txt",
    "/content/symspell-677/frequency_bigramdictionary_en_243_342.txt",
]

yes, no = True, False

manage_misspelled_words = yes
misspelled_word_method = method[0]
# Only the file name of the chosen dictionary is kept here.
freq_dict = freq_dict_list[1].rsplit('/', 1)[-1]
freq_dict


In [None]:
# Translate the selected strategy into detector/corrector switches.
if manage_misspelled_words == yes:
    if misspelled_word_method == 'py_and_sym':
        pyspell_detector, symspell_corrector = yes, yes
        misspell_counter = 0
    elif misspelled_word_method == 'pyspell_only':
        pyspell_detector, symspell_corrector = yes, no
        misspell_counter = 1
    elif misspelled_word_method == 'symspell_only':
        pyspell_detector, symspell_corrector = no, yes
        # This strategy switches to the first dictionary in the list (full path).
        freq_dict = freq_dict_list[0]
        misspell_counter = 2

In [None]:
# Root folder of the competition CSV files (the trailing slash is part of the value).
DATA_DIR = "/kaggle/input/commonlit-evaluate-student-summaries/"

prompts_train = pd.read_csv(f"{DATA_DIR}prompts_train.csv")
prompts_test = pd.read_csv(f"{DATA_DIR}prompts_test.csv")
summaries_train = pd.read_csv(f"{DATA_DIR}summaries_train.csv")
summaries_test = pd.read_csv(f"{DATA_DIR}summaries_test.csv")
sample_submission = pd.read_csv(f"{DATA_DIR}sample_submission.csv")

In [None]:
class Preprocessor:
    """Derives hand-crafted text features from prompt and summary frames.

    The constructor loads a tokenizer for ``model_name``, the NLTK English
    stop-word list, a spaCy pipeline, an ``autocorrect`` speller and a
    ``pyspellchecker`` checker. :meth:`run` uses them to add length, overlap,
    quote and spelling columns and returns the merged frame.
    """

    def __init__(self,
                model_name: str,
                ) -> None:
        """Load the tokenizer and the NLP helpers.

        Args:
            model_name: One of the names in the module-level ``files`` list.
        """
        if model_name == files[0]:
            self.tokenizer = AutoTokenizer.from_pretrained(f'/kaggle/input/{model_name}')
        elif model_name in (files[13], files[14]):
            # `x == (a or b)` only ever compared against `a`; test membership instead.
            # Imported locally because the file-level T5 import is commented out.
            from transformers import T5TokenizerFast
            self.tokenizer = T5TokenizerFast.from_pretrained(f'/kaggle/input/transformers/{model_name}')
        else:
            self.tokenizer = AutoTokenizer.from_pretrained(f'/kaggle/input/transformers/{model_name}')

        self.twd = TreebankWordDetokenizer()
        self.STOP_WORDS = set(stopwords.words('english'))

        self.spacy_ner_model = spacy.load('en_core_web_sm',)
        self.speller = Speller(lang='en')
        self.spellchecker = SpellChecker()

        # Entity labels reported by ner_overlap_count(mode="test"); may be assigned
        # by the caller. None means "use whatever labels were found".
        self.ner_keys = None

    def word_overlap_count(self, row):
        """Count distinct non-stop-word tokens shared by prompt and summary.

        Args:
            row: Mapping with ``prompt_tokens`` and ``summary_tokens`` lists.

        Returns:
            Size of the intersection of the two token sets after stop words
            have been removed (no removal when the stop-word set is empty).
        """
        prompt_words = row['prompt_tokens']
        summary_words = row['summary_tokens']
        if self.STOP_WORDS:
            # The previous filter kept *only* the stop words, so the feature
            # measured stop-word overlap instead of content overlap.
            prompt_words = [word for word in prompt_words if word not in self.STOP_WORDS]
            summary_words = [word for word in summary_words if word not in self.STOP_WORDS]
        return len(set(prompt_words) & set(summary_words))

    def ngrams(self, token, n):
        """Return the space-joined n-grams of the token sequence ``token``."""
        # zip over n progressively shifted views of the sequence yields every
        # window of n consecutive tokens.
        windows = zip(*[token[i:] for i in range(n)])
        return [" ".join(window) for window in windows]

    def ngram_co_occurrence(self, row, n: int) -> int:
        """Count distinct n-grams that occur in both the prompt and the summary.

        Args:
            row: Mapping with ``prompt_tokens`` and ``summary_tokens`` lists.
            n: Size of the n-grams.
        """
        original_ngrams = set(self.ngrams(row['prompt_tokens'], n))
        summary_ngrams = set(self.ngrams(row['summary_tokens'], n))
        return len(original_ngrams & summary_ngrams)

    def ner_overlap_count(self, row, mode: str):
        """Count named entities shared by prompt and summary, grouped by label.

        Args:
            row: Mapping with ``prompt_text`` and ``text`` strings.
            mode: ``"train"`` returns the raw ``{label: count}`` dict; ``"test"``
                re-keys it on ``self.ner_keys`` (missing labels map to None).

        Returns:
            A dict as described above, or None for any other ``mode``.

        Raises:
            Exception: If the loaded NER model is neither spaCy nor stanza.
        """
        model = self.spacy_ner_model

        def clean_ners(ner_list):
            # Lower-case the entity text so matching is case-insensitive.
            return {(ner[0].lower(), ner[1]) for ner in ner_list}

        prompt = model(row['prompt_text'])
        summary = model(row['text'])

        if "spacy" in str(model):
            prompt_ner = {(token.text, token.label_) for token in prompt.ents}
            summary_ner = {(token.text, token.label_) for token in summary.ents}
        elif "stanza" in str(model):
            prompt_ner = {(token.text, token.type) for token in prompt.ents}
            summary_ner = {(token.text, token.type) for token in summary.ents}
        else:
            raise Exception("Model not supported")

        intersecting_ners = clean_ners(prompt_ner) & clean_ners(summary_ner)
        ner_dict = dict(Counter(ner[1] for ner in intersecting_ners))

        if mode == "train":
            return ner_dict
        elif mode == "test":
            # ner_keys was never initialised before, which made this branch raise
            # AttributeError; fall back to the labels that were actually found.
            ner_keys = getattr(self, 'ner_keys', None)
            if ner_keys is None:
                ner_keys = list(ner_dict)
            return {key: ner_dict.get(key) for key in ner_keys}

    def quotes_count(self, row):
        """Count double-quoted spans of the summary that appear verbatim in the prompt.

        Args:
            row: Mapping with ``text`` (summary) and ``prompt_text`` strings.
        """
        prompt_text = row['prompt_text']
        quotes_from_summary = re.findall(r'"([^"]*)"', row['text'])
        return sum(1 for quote in quotes_from_summary if quote in prompt_text)

    def spelling(self, text):
        """Return how many distinct whitespace-separated words the checker does not know."""
        return len(self.spellchecker.unknown(text.split()))

    def add_spelling_dictionary(self, tokens: List[str]) -> None:
        """Register ``tokens`` as known words in pyspellchecker and autocorrect."""
        self.spellchecker.word_frequency.load_words(tokens)
        self.speller.nlp_data.update({token: 1000 for token in tokens})

    def run(self,
            prompts: pd.DataFrame,
            summaries: pd.DataFrame,
            mode: str
        ) -> pd.DataFrame:
        """Compute all features and return the merged summary/prompt frame.

        Note that the length, token, ``fixed_summary_text`` and
        ``splling_err_num`` columns are written onto the frames passed in.

        Args:
            prompts: Frame with ``prompt_id`` and ``prompt_text`` columns.
            summaries: Frame with ``prompt_id`` and ``text`` columns.
            mode: Not used by this method.

        Returns:
            ``summaries`` left-joined with ``prompts`` on ``prompt_id``, plus the
            overlap and quote features, without the token-list columns.
        """
        # Tokenize each text once and derive the length from the token list.
        prompt_tokens = prompts["prompt_text"].apply(word_tokenize)
        prompts["prompt_length"] = prompt_tokens.apply(len)
        prompts["prompt_tokens"] = prompt_tokens

        summary_tokens = summaries["text"].apply(word_tokenize)
        summaries["summary_length"] = summary_tokens.apply(len)
        summaries["summary_tokens"] = summary_tokens

        # Words used by the prompts must not be flagged or "corrected" as typos.
        for tokens in prompts["prompt_tokens"]:
            self.add_spelling_dictionary(tokens)

        # fix misspelling
        summaries["fixed_summary_text"] = summaries["text"].progress_apply(
            lambda x: self.speller(x)
        )

        # count misspelling
        summaries["splling_err_num"] = summaries["text"].progress_apply(self.spelling)

        # merge prompts and summaries
        input_df = summaries.merge(prompts, how="left", on="prompt_id")

        # after merge preprocess
        # input_df['length_ratio'] = input_df['summary_length'] / input_df['prompt_length']

        input_df['word_overlap_count'] = input_df.progress_apply(self.word_overlap_count, axis=1)
        input_df['bigram_overlap_count'] = input_df.progress_apply(
            self.ngram_co_occurrence, args=(2,), axis=1
        )
        # NOTE: the denominator is 0 for one-token summaries, which yields inf/NaN.
        input_df['bigram_overlap_ratio'] = input_df['bigram_overlap_count'] / (input_df['summary_length'] - 1)

        input_df['trigram_overlap_count'] = input_df.progress_apply(
            self.ngram_co_occurrence, args=(3,), axis=1
        )
        # NOTE: the denominator is 0 for two-token summaries, which yields inf/NaN.
        input_df['trigram_overlap_ratio'] = input_df['trigram_overlap_count'] / (input_df['summary_length'] - 2)

        input_df['quotes_count'] = input_df.progress_apply(self.quotes_count, axis=1)

        return input_df.drop(columns=["summary_tokens", "prompt_tokens"])
    
# Shared preprocessor instance, built for the checkpoint selected in CFG.
preprocessor = Preprocessor(model_name=CFG.model_name)

In [None]:
# Build the feature frames. run() also writes its intermediate columns onto the
# prompts/summaries frames that are passed in.
train = preprocessor.run(prompts_train, summaries_train, mode="train")
test = preprocessor.run(prompts_test, summaries_test, mode="test")

# Preview the first rows of the training features.
train.head()

In [None]:
def pyspellchecker_detector(sentence):
    """Return the purely alphabetic words of ``sentence`` unknown to pyspellchecker.

    Punctuation is stripped first, then the text is split on any whitespace.

    Args:
        sentence: Text to inspect.

    Returns:
        List of unknown words as reported by ``SpellChecker.unknown`` (a set,
        so the order is arbitrary and each word appears once).
    """
    # The checker is never modified here, so one instance is created lazily and
    # reused instead of rebuilding it on every call.
    spell = getattr(pyspellchecker_detector, "_spell", None)
    if spell is None:
        spell = pyspellchecker_detector._spell = SpellChecker()

    sentence = re.sub(r'[^\w\s]', '', sentence)
    # split() (not split(' ')) so that words separated by newlines or tabs are
    # checked individually instead of being glued into one non-alphabetic token.
    tokens = sentence.split()
    return [token for token in spell.unknown(tokens) if token.isalpha()]

def symspellpy_corrector(mis_tokens, freq_dict_):
    """Map each misspelled token to SymSpell's compound-lookup suggestion.

    Args:
        mis_tokens: Iterable of misspelled tokens; duplicates are allowed.
        freq_dict_: File name of a frequency dictionary, resolved relative to
            the installed ``symspellpy`` package.

    Returns:
        Dict ``{token: corrected_token}``. If the dictionary cannot be decoded
        the tokens are mapped to themselves.
    """
    try:
        sym_spell = SymSpell(max_dictionary_edit_distance=3, prefix_length=7)
        freq_dict = pkg_resources.resource_filename("symspellpy", freq_dict_)
        sym_spell.load_dictionary(freq_dict, term_index=0, count_index=1)
        corrected_token = {}
        # dict.fromkeys removes duplicates (keeping first-seen order) so every
        # distinct token is looked up only once.
        for token in tqdm(list(dict.fromkeys(mis_tokens))):
            terms = sym_spell.lookup_compound(token,
                                              max_edit_distance=2)
            corrected_token[token] = terms[0].term if terms else token
        return corrected_token

    except UnicodeDecodeError:
        # Callers use the result as a mapping, so return an identity mapping
        # rather than the raw token list.
        return {token: token for token in mis_tokens}

def py_sym_checker(df, column, new_col, freq_dict_):
    """Detect misspellings with pyspellchecker and replace them via SymSpell.

    Args:
        df: Frame holding the text to correct.
        column: Name of the text column.
        new_col: Name of a previously corrected column, used as fallback when
            it exists.
        freq_dict_: Frequency-dictionary name passed to ``symspellpy_corrector``.

    Returns:
        Series aligned with ``df.index``. Every sentence is split on single
        spaces and re-joined with each token prefixed by a space, so results
        start with a space. On ``UnicodeDecodeError`` ``df[new_col]`` is
        returned if present, otherwise the unmodified ``df[column]``.
    """
    try:
        # Iterate over values so the function also works when df does not have
        # a default 0..n-1 index.
        sentences = df[column].tolist()

        mis_tokens = []
        for sentence in tqdm(sentences):
            mis_tokens.extend(pyspellchecker_detector(sentence))

        mis_token_rep = symspellpy_corrector(mis_tokens, freq_dict_)
        if not isinstance(mis_token_rep, dict):
            # Nothing usable came back; leave every token untouched.
            mis_token_rep = {}

        corrected = []
        for sentence in tqdm(sentences):
            corrected.append(
                "".join(" " + mis_token_rep.get(token, token)
                        for token in sentence.split(' '))
            )

        # Reuse df's index so that assigning the result as a column aligns rows.
        return pd.Series(corrected, index=df.index)

    except UnicodeDecodeError:
        # new_col normally does not exist yet; fall back to the original text
        # instead of failing with a KeyError.
        return df[new_col] if new_col in df.columns else df[column]
    
def symspellpy_correction(df, column, new_col, freq_dict=freq_dict):
    """Correct every sentence of ``df[column]`` with SymSpell's compound lookup.

    Args:
        df: Frame holding the text to correct.
        column: Name of the text column.
        new_col: Name of a previously corrected column, used as fallback when
            it exists.
        freq_dict: Path of a frequency dictionary, or the bare name of one
            shipped inside the ``symspellpy`` package.

    Returns:
        Series of corrected sentences aligned with ``df.index``. On
        ``UnicodeDecodeError`` ``df[new_col]`` is returned if present,
        otherwise the unmodified ``df[column]``.

    Raises:
        FileNotFoundError: If the dictionary cannot be located.
    """
    try:
        dictionary_path = freq_dict
        if not os.path.isfile(dictionary_path):
            # A bare file name is looked up inside the installed symspellpy
            # package, the same way symspellpy_corrector resolves it.
            dictionary_path = pkg_resources.resource_filename(
                "symspellpy", os.path.basename(freq_dict)
            )

        # Build the index once; it was previously rebuilt for every row.
        sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=5)
        loaded = sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
        if not loaded:
            # load_dictionary reports a missing file via its return value; without
            # a dictionary every "correction" would be meaningless.
            raise FileNotFoundError(f"SymSpell dictionary not found: {freq_dict}")

        corrected = [
            sym_spell.lookup_compound(sentence, max_edit_distance=2)[0].term
            for sentence in tqdm(df[column].tolist())
        ]

        # Reuse df's index so that assigning the result as a column aligns rows.
        return pd.Series(corrected, index=df.index)

    except UnicodeDecodeError:
        # new_col normally does not exist yet; fall back to the original text
        # instead of failing with a KeyError.
        return df[new_col] if new_col in df.columns else df[column]
    
def pyspell_correction(df, column, new_col):
    """Correct misspelled words of ``df[column]`` with pyspellchecker only.

    Each sentence is tokenized with NLTK, every unknown alphabetic token is
    replaced by the checker's suggestion, and the tokens are detokenized again.
    Token order and repeated tokens are preserved.

    Args:
        df: Frame holding the text to correct.
        column: Name of the text column.
        new_col: Name of a previously corrected column, used as fallback when
            it exists.

    Returns:
        Series of corrected sentences aligned with ``df.index``. On
        ``UnicodeDecodeError`` ``df[new_col]`` is returned if present,
        otherwise the unmodified ``df[column]``.
    """
    try:
        # One checker / detokenizer for the whole frame instead of one per row.
        spell = SpellChecker()
        detokenizer = TreebankWordDetokenizer()

        corrected_sentences = []
        for sentence in tqdm(df[column].tolist()):
            tokens = nltk.word_tokenize(sentence)

            corrections = {}
            for word in spell.unknown(tokens):
                if word.isalpha():
                    # correction() may return None when it has no candidate;
                    # keep the word instead of dropping it from the sentence.
                    corrections[word] = spell.correction(word) or word

            # Tokens are matched exactly against the words reported as unknown,
            # as before; every other token is kept unchanged and in place.
            fixed_tokens = [corrections.get(token, token) for token in tokens]
            corrected_sentences.append(detokenizer.detokenize(fixed_tokens))

        # Reuse df's index so that assigning the result as a column aligns rows.
        return pd.Series(corrected_sentences, index=df.index)
    except UnicodeDecodeError:
        # new_col normally does not exist yet; fall back to the original text
        # instead of failing with a KeyError.
        return df[new_col] if new_col in df.columns else df[column]

In [None]:
if manage_misspelled_words:
    # Variant 0: pyspellchecker detection + SymSpell correction, stored on copies.
    train_0, test_0 = train.copy(), test.copy()

    train_0['symspell_corr'] = py_sym_checker(train, 'text', 'symspell_corr', freq_dict)
    print("py & sym train_processed['text'][0] =\n{}".format(train_0['symspell_corr'][0]))
    print()

    test_0['symspell_corr'] = py_sym_checker(test, 'text', 'symspell_corr', freq_dict)
    print("py & sym test_processed['text'][0] =\n{}".format(test_0['symspell_corr'][0]))
    

In [None]:
if manage_misspelled_words:
    # Variant 1: pyspellchecker for both detection and correction, stored on copies.
    train_1, test_1 = train.copy(), test.copy()

    train_1['symspell_corr'] = pyspell_correction(train, 'text', 'symspell_corr')
    print("py only train_processed['text'][0] =\n{}".format(train_1['symspell_corr'][0]))
    print()

    test_1['symspell_corr'] = pyspell_correction(test, 'text', 'symspell_corr')
    print("py only test_processed['text'][0] =\n{}".format(test_1['symspell_corr'][0]))

In [None]:
if not manage_misspelled_words:
    print('no change')
else:
    # Variant 2: SymSpell compound correction on whole sentences, stored on copies.
    train_2, test_2 = train.copy(), test.copy()

    train_2['symspell_corr'] = symspellpy_correction(train, 'text', 'symspell_corr', freq_dict)
    print("sym only train_processed['text'][0] =\n{}".format(train_2['symspell_corr'][0]))
    print()

    test_2['symspell_corr'] = symspellpy_correction(test, 'text', 'symspell_corr', freq_dict)
    print("sym only test_processed['text'][0] =\n{}".format(test_2['symspell_corr'][0]))

In [None]:
# List the columns of the feature frame.
train.columns

In [None]:
# Preview the frames holding the py_sym_checker output.
display(train_0.head(5))
display(test_0.head(5))

In [None]:
# Preview the frames holding the pyspell_correction output.
display(train_1.head(5))
display(test_1.head(5))

In [None]:
# Preview the frames holding the symspellpy_correction output.
display(train_2.head(5))
display(test_2.head(5))

In [None]:
# Row/column counts of the uncorrected feature frames.
display(train.shape)
display(test.shape)

In [None]:
# Write the uncorrected frames and the three corrected variants as CSV files
# (without the index column), in the order listed.
csv_outputs = {
    'original_train.csv': train,
    'original_test.csv': test,
    'py_sym_train.csv': train_0,
    'py_sym_test.csv': test_0,
    'py_only_train.csv': train_1,
    'py_only_test.csv': test_1,
    'sym_only_train.csv': train_2,
    'sym_only_test.csv': test_2,
}
for csv_name, frame in csv_outputs.items():
    frame.to_csv(csv_name, index=False)