# **Background Problem**

# **Resource**

In [1]:
import re
# Load Dataset: read the entire blog corpus into memory as one string
with open("en_US.blogs.txt", "r", encoding="utf-8") as file:
    text = file.read()

# Split into sentences using regex: break after '.', '!' or '?' when it is
# followed by one or more spaces. Newlines are not treated as boundaries, so a
# single element of `sentences` can span several lines of the file.
sentences = re.split(r'(?<=[.!?]) +', text)

# Show the first element as a quick sanity check
print(sentences[0])

In the years thereafter, most of the Oil fields and platforms were named after pagan “gods”.
We love you Mr.


# **Preprocessing**

In [2]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

def preprocessing(text):
    """Normalize raw text and break it into word tokens.

    The text is lowercased, every character that is neither a word character
    nor whitespace is removed, and what remains is passed to NLTK's
    ``word_tokenize``.

    Args:
        text: Raw input string.

    Returns:
        The tokens produced by ``word_tokenize`` for the cleaned text.
    """
    # Lowercase first, then drop punctuation in a single substitution
    stripped = re.sub(r'[^\w\s]', '', text.lower())
    return word_tokenize(stripped)

# Tokenize the whole corpus; `words` keeps every occurrence, in order
words = preprocessing(text)

# Distinct tokens only
vocabulary = set(words)

print(f"The first ten words in the text are: \n{words[0:10]}")
print(f"There are {len(vocabulary)} unique words in the vocabulary.")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jasonmiracle/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


The first ten words in the text are: 
['in', 'the', 'years', 'thereafter', 'most', 'of', 'the', 'oil', 'fields', 'and']
There are 435895 unique words in the vocabulary.


# **Building Model**

In [3]:
# Create Dictionary of Frequency: map each token to the number of times it occurs
word_count_dict = {}

for word in words:
    if word in word_count_dict:
        word_count_dict[word] += 1
    else:
        # First time this token is seen
        word_count_dict[word] = 1

print(f"There are {len(word_count_dict)} key values pairs")
print(f"The count for the word 'i' is {word_count_dict.get('i',0)}")

There are 435895 key values pairs
The count for the word 'i' is 769059


In [4]:
# Probability Computation Function
def count_probability(word_count_dict):
    """Convert raw word counts into relative frequencies.

    Args:
        word_count_dict: Mapping from word to its number of occurrences.

    Returns:
        A new dict with the same keys, in the same order, where each value is
        the word's count divided by the sum of all counts. An empty mapping
        yields an empty dict.
    """
    # Denominator shared by every word: the total number of counted tokens
    total_words = sum(word_count_dict.values())

    return {word: count / total_words for word, count in word_count_dict.items()}

# Build the probability table from the corpus counts and spot-check one entry
probabilities = count_probability(word_count_dict)
print(f"Length of probs is {len(probabilities)}")
print(f"P('i') is {probabilities['i']:.4f}")

Length of probs is 435895
P('i') is 0.0207


## **Editing String**

In [5]:
# Delete Function
def delete_letter(word, verbose=False):
    """Return every string obtained by removing one character from ``word``.

    Args:
        word: The string to edit.
        verbose: When true, print the generated candidates.

    Returns:
        A list with one candidate per character position, ordered by the
        position of the removed character. Empty for an empty ``word``.
    """
    # Drop the character at index i: everything before it plus everything after it
    deletions = [word[:i] + word[i + 1:] for i in range(len(word))]

    if verbose:
        print(f"Delete suggestion : {deletions}")

    return deletions

# Demo on "down"; verbose=True prints the generated candidates
result = delete_letter("down", verbose=True)

Delete suggestion : ['own', 'dwn', 'don', 'dow']


In [6]:
# Insert Function
def insert_letter(word, verbose=False):
    """Return every string obtained by inserting one lowercase letter into ``word``.

    Args:
        word: The string to edit.
        verbose: When true, print the generated candidates.

    Returns:
        A list of candidates ordered by insertion position (from before the
        first character to after the last), then alphabetically by the
        inserted letter. Duplicates are kept.
    """
    letters = 'abcdefghijklmnopqrstuvwxyz'

    # len(word) + 1 gaps: one before each character plus one at the very end
    insertions = [
        word[:gap] + letter + word[gap:]
        for gap in range(len(word) + 1)
        for letter in letters
    ]

    if verbose: print(f"Insert suggestion : {insertions}")

    return insertions

# Demo on 'down'; verbose=True prints the generated candidates
result = insert_letter('down', verbose=True)

Insert suggestion : ['adown', 'bdown', 'cdown', 'ddown', 'edown', 'fdown', 'gdown', 'hdown', 'idown', 'jdown', 'kdown', 'ldown', 'mdown', 'ndown', 'odown', 'pdown', 'qdown', 'rdown', 'sdown', 'tdown', 'udown', 'vdown', 'wdown', 'xdown', 'ydown', 'zdown', 'daown', 'dbown', 'dcown', 'ddown', 'deown', 'dfown', 'dgown', 'dhown', 'diown', 'djown', 'dkown', 'dlown', 'dmown', 'dnown', 'doown', 'dpown', 'dqown', 'drown', 'dsown', 'dtown', 'duown', 'dvown', 'dwown', 'dxown', 'dyown', 'dzown', 'doawn', 'dobwn', 'docwn', 'dodwn', 'doewn', 'dofwn', 'dogwn', 'dohwn', 'doiwn', 'dojwn', 'dokwn', 'dolwn', 'domwn', 'donwn', 'doown', 'dopwn', 'doqwn', 'dorwn', 'doswn', 'dotwn', 'douwn', 'dovwn', 'dowwn', 'doxwn', 'doywn', 'dozwn', 'dowan', 'dowbn', 'dowcn', 'dowdn', 'dowen', 'dowfn', 'dowgn', 'dowhn', 'dowin', 'dowjn', 'dowkn', 'dowln', 'dowmn', 'downn', 'dowon', 'dowpn', 'dowqn', 'dowrn', 'dowsn', 'dowtn', 'dowun', 'dowvn', 'dowwn', 'dowxn', 'dowyn', 'dowzn', 'downa', 'downb', 'downc', 'downd', 'downe'

In [7]:
# Swap Function
def swap_letter(word, verbose=False):
    """Return every string obtained by swapping two adjacent characters of ``word``.

    Args:
        word: The string to edit.
        verbose: When true, print the generated candidates.

    Returns:
        A list with one candidate per adjacent pair, ordered by the position
        of the pair. Empty when ``word`` has fewer than two characters.
    """
    # For each index i that has a right-hand neighbour, exchange word[i] and word[i + 1]
    swaps = [
        word[:i] + word[i + 1] + word[i] + word[i + 2:]
        for i in range(len(word) - 1)
    ]

    if verbose: print(f"Swap suggestion : {swaps}")

    return swaps

# Demo on "down"; verbose=True prints the generated candidates
result = swap_letter("down", verbose=True)

Swap suggestion : ['odwn', 'dwon', 'donw']


In [8]:
# Replace Function
def replace_letter(word, verbose=False):
    """Return every string obtained by replacing one character of ``word``.

    Each position is substituted with every lowercase ASCII letter that
    differs from the character currently there, so ``word`` itself is never
    among the results.

    Args:
        word: The string to edit.
        verbose: When true, print the generated candidates.

    Returns:
        A list of candidates ordered by position, then alphabetically by the
        replacement letter. Empty for an empty ``word``.
    """
    letters = 'abcdefghijklmnopqrstuvwxyz'

    replacements = [
        word[:i] + letter + word[i + 1:]
        for i in range(len(word))
        for letter in letters
        if letter != word[i]  # skip the no-op substitution
    ]

    # Labelled as a replace operation so the output is distinguishable from insert_letter's
    if verbose: print(f"Replace suggestion : {replacements}")

    return replacements

# Demo on 'down'; verbose=True prints the generated candidates
result = replace_letter('down', verbose=True)

Insert suggestion : ['aown', 'bown', 'cown', 'eown', 'fown', 'gown', 'hown', 'iown', 'jown', 'kown', 'lown', 'mown', 'nown', 'oown', 'pown', 'qown', 'rown', 'sown', 'town', 'uown', 'vown', 'wown', 'xown', 'yown', 'zown', 'dawn', 'dbwn', 'dcwn', 'ddwn', 'dewn', 'dfwn', 'dgwn', 'dhwn', 'diwn', 'djwn', 'dkwn', 'dlwn', 'dmwn', 'dnwn', 'dpwn', 'dqwn', 'drwn', 'dswn', 'dtwn', 'duwn', 'dvwn', 'dwwn', 'dxwn', 'dywn', 'dzwn', 'doan', 'dobn', 'docn', 'dodn', 'doen', 'dofn', 'dogn', 'dohn', 'doin', 'dojn', 'dokn', 'doln', 'domn', 'donn', 'doon', 'dopn', 'doqn', 'dorn', 'dosn', 'dotn', 'doun', 'dovn', 'doxn', 'doyn', 'dozn', 'dowa', 'dowb', 'dowc', 'dowd', 'dowe', 'dowf', 'dowg', 'dowh', 'dowi', 'dowj', 'dowk', 'dowl', 'dowm', 'dowo', 'dowp', 'dowq', 'dowr', 'dows', 'dowt', 'dowu', 'dowv', 'doww', 'dowx', 'dowy', 'dowz']


In [9]:
# Combine all single-edit operations (delete, insert, swap, replace)
def editing_one_letter(word, verbose=False):
    """Collect every string that is one edit away from the lowercased ``word``.

    The candidates are the combined results of ``delete_letter``,
    ``insert_letter``, ``swap_letter`` and ``replace_letter``.

    Args:
        word: The string to edit; it is lowercased before any edit is applied.
        verbose: When true, print the resulting set.

    Returns:
        A set of candidate strings (duplicates across operations collapse).
    """
    lowered = word.lower()

    # Gather the candidates from each edit operation in turn, then de-duplicate
    candidates = []
    for edit in (delete_letter, insert_letter, swap_letter, replace_letter):
        candidates.extend(edit(lowered))
    suggestions = set(candidates)

    if verbose: print(f"Edit results : {suggestions}")
    return suggestions

# Demo on "down"; verbose=True prints the combined candidate set
result = editing_one_letter("down", verbose=True)

Edit results : {'dopn', 'dosn', 'dowx', 'downd', 'wdown', 'domn', 'downw', 'ndown', 'cdown', 'dowl', 'kown', 'dgown', 'dovwn', 'don', 'gown', 'dorn', 'dlown', 'djown', 'uown', 'zdown', 'rdown', 'diown', 'downv', 'dowy', 'doin', 'dcwn', 'dowjn', 'dowfn', 'dvwn', 'doan', 'dnown', 'dowz', 'eown', 'fown', 'downe', 'dowe', 'dofwn', 'dokn', 'hdown', 'doawn', 'mdown', 'downa', 'downc', 'downt', 'donn', 'dotn', 'doyn', 'yown', 'dowr', 'dowen', 'dopwn', 'ldown', 'dmwn', 'downq', 'djwn', 'donwn', 'downr', 'dhwn', 'dowh', 'bdown', 'docwn', 'dowhn', 'doqn', 'dzwn', 'dowt', 'doln', 'kdown', 'qdown', 'doiwn', 'dwon', 'dowi', 'dobn', 'domwn', 'dowkn', 'doswn', 'jown', 'dpown', 'dofn', 'dowsn', 'town', 'downf', 'downy', 'daown', 'lown', 'dxwn', 'dowon', 'dewn', 'ydown', 'dotwn', 'wown', 'dows', 'dqown', 'gdown', 'dowk', 'ddwn', 'dgwn', 'doon', 'dowin', 'jdown', 'dowb', 'dowbn', 'dowrn', 'vown', 'dowj', 'dobwn', 'doown', 'iown', 'doen', 'dowqn', 'ddown', 'nown', 'doun', 'doww', 'doewn', 'dovn', 'dogn',

In [10]:
def autocorrection(word, vocabulary, probabilities, n=3, verbose=False):
    """Suggest up to ``n`` vocabulary words for ``word``, most probable first.

    A word that is already in ``vocabulary`` (as given, or after lowercasing)
    is returned as its own only suggestion. Otherwise every string one edit
    away is generated with ``editing_one_letter`` and those found in
    ``vocabulary`` become the candidates.

    Args:
        word: The word to check.
        vocabulary: Set of known words.
        probabilities: Mapping from word to probability; words missing from
            it are scored 0.
        n: Maximum number of suggestions to return.
        verbose: When true, print the entered word and the candidates.

    Returns:
        A list of ``(word, probability)`` tuples sorted by descending
        probability, with ties broken alphabetically. Empty when no candidate
        is found.
    """
    lowered = word.lower()

    if word in vocabulary:
        suggestions = [word]
    elif lowered in vocabulary:
        # editing_one_letter lowercases its input, so the membership test must be
        # case-insensitive too; otherwise a correctly spelled "New" would be
        # replaced by some other word one edit away from "new".
        suggestions = [lowered]
    else:
        valid_result = editing_one_letter(word).intersection(vocabulary)
        suggestions = valid_result if valid_result else []

    words_probability = {
        candidate: probabilities.get(candidate, 0) for candidate in suggestions
    }

    # Secondary sort on the word itself: set iteration order for strings can
    # differ between interpreter runs, so equal-probability candidates would
    # otherwise be ranked differently after a kernel restart.
    n_best = sorted(
        words_probability.items(), key=lambda item: (-item[1], item[0])
    )[:n]

    if verbose: print("entered word = ", word, "\nsuggestions = ", suggestions)

    return n_best

# Try a misspelling; the returned top-n list is shown as the cell output
autocorrection("nesw", vocabulary, probabilities, verbose=True)

entered word =  nesw 
suggestions =  {'nesn', 'nest', 'nsw', 'nssw', 'nasw', 'news', 'nes', 'nehw', 'new', 'ness'}


[('new', 0.001462007029042426),
 ('news', 0.0002067017207528097),
 ('nest', 1.6601791422088467e-05)]

In [None]:
nltk.download('averaged_perceptron_tagger')
import spacy

# Integrate Pos Tag and Autocorrect
def autocorrect_sentence(sentence, vocabulary, probabilities):
    """Autocorrect a sentence token by token.

    The sentence is tokenized with ``preprocessing`` and each token is
    replaced by the top suggestion from ``autocorrection``; a token with no
    suggestions is kept unchanged.

    Args:
        sentence: Raw sentence text.
        vocabulary: Set of known words, forwarded to ``autocorrection``.
        probabilities: Word-to-probability mapping, forwarded to ``autocorrection``.

    Returns:
        A list of corrected tokens, in sentence order.
    """
    def best_guess(token):
        # The first entry of the ranked list is the top suggestion
        ranked = autocorrection(token, vocabulary, probabilities)
        return ranked[0][0] if ranked else token

    return [best_guess(token) for token in preprocessing(sentence)]

def pos_tag_corrected_sentence(sentence, vocabulary, probabilities):
    """Autocorrect ``sentence``, then tag the corrected text with spaCy.

    Prints the original and corrected sentences, followed by one line per
    token with its text, ``pos_`` and ``tag_``.

    Args:
        sentence: Raw sentence text.
        vocabulary: Set of known words, forwarded to ``autocorrect_sentence``.
        probabilities: Word-to-probability mapping, forwarded to ``autocorrect_sentence``.

    Returns:
        The object produced by running the spaCy pipeline on the corrected text.
    """
    corrected_sentence = autocorrect_sentence(sentence, vocabulary, probabilities)
    print(f"Sentence : {sentence}\nCorrected : {' '.join(corrected_sentence)}\n")

    # The 'en_core_web_sm' pipeline is loaded on every call
    tagger = spacy.load('en_core_web_sm')
    tagged_doc = tagger(" ".join(corrected_sentence))

    for tok in tagged_doc:
        print(tok.text, tok.pos_, tok.tag_)

    return tagged_doc

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jasonmiracle/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [73]:
# Example input containing two misspellings ("itred", "rigth")
sentence = "I am so itred rigth now"
# Prints the corrected sentence and the per-token tags; the tagged result is kept in `tagged`
tagged = pos_tag_corrected_sentence(sentence, vocabulary, probabilities)

Sentence : I am so itred rigth now
Corrected : i am so tired right now
i PRON PRP
am AUX VBP
so ADV RB
tired ADJ JJ
right ADV RB
now ADV RB


# **Evaluation Model**

# **Conclusion**