In [6]:
import re
from collections import Counter
import heapq #to add more than word to the correction candidates

In [7]:
def words(text):
    """Return every run of word characters found in ``text``, lower-cased, in order of appearance."""
    lowered = text.lower()
    return re.findall(r'\w+', lowered)
# Token counts of the typo corpus. The context manager closes the file handle,
# which the bare open(...).read() form left open.
with open('typo-0.2.txt') as typo_file:
    Words=Counter(words(typo_file.read()))


In [8]:
# Path of the vocabulary file that lists the correctly spelled words
text='voc-1bwc.txt'

# Maps each correctly spelled word (key) to its frequency (value)
word_frequency={}

with open(text,'r') as file:
    for line in file:
        parts=line.split()
        # Only lines made of exactly two fields, "<frequency> <word>", are kept
        if len(parts)!=2:
            continue
        frequency=int(parts[0])
        word=parts[1]
        word_frequency[word]=frequency

# Register the token 'orig' with the count it has in the typo corpus (Words)
word_frequency['orig']=Words['orig']


# Frequency lookup in the vocabulary dictionary
def WordFrequency(word_to_find):
    """Return the frequency stored for ``word_to_find`` in ``word_frequency``, or 0 if the word is absent."""
    return word_frequency.get(word_to_find, 0)



In [4]:
!pip install python-Levenshtein


Defaulting to user installation because normal site-packages is not writeable


### List of distances
Run only one of the distance sections (1 to 4), then skip to the *Testing The Distance* section

Levenshtein Distance

In [13]:
import Levenshtein #to calculate levenstein distance

In [14]:
# Levenshtein (edit) distance
def distance_levenstein(a, b):
    """Return ``Levenshtein.distance(a, b)`` for the two strings."""
    edit_distance = Levenshtein.distance(a, b)
    return edit_distance

In [15]:
# Select Levenshtein as the active metric; get_closest_word reads this global at call time
distance_chosen=distance_levenstein

In [16]:
# Get the closest words (corrections) to the word we want to correct, using the selected distance
def get_closest_word(word, words, n):
    """Return the ``n`` candidates closest to ``word``, nearest first.

    Parameters
    ----------
    word : str
        The (possibly misspelled) word to correct.
    words : iterable of str
        Candidate corrections, e.g. the vocabulary keys.
    n : int
        Maximum number of candidates to return.

    Returns
    -------
    list of str
        Up to ``n`` candidates ordered by increasing
        ``distance_chosen(word, candidate)``. Candidates at the same distance
        keep the order in which they appear in ``words``. The list is empty
        when ``n <= 0`` or ``words`` is empty.
    """
    # heapq.nsmallest evaluates the distance once per candidate and keeps only
    # n entries, instead of rescanning the kept list for its maximum on every
    # candidate. It also returns [] for n <= 0 where max([]) used to raise.
    return heapq.nsmallest(n, words, key=lambda candidate: distance_chosen(word, candidate))

Damerau-Levenshtein Distance


In [68]:
import textdistance #to calculate the Demarau-Levenstein distance

In [69]:
def distance_demarau_levenshtein(a, b):
    """Return ``textdistance.damerau_levenshtein(a, b)`` for the two strings."""
    edit_distance = textdistance.damerau_levenshtein(a, b)
    return edit_distance

In [None]:
# Select Damerau-Levenshtein as the active metric; get_closest_word reads this global at call time
distance_chosen=distance_demarau_levenshtein

Jaccard Distance

In [54]:
from gensim.models import Word2Vec

In [56]:
# NOTE(review): this Word2Vec model is built without a corpus and is not referenced
# by any other cell visible in this notebook — confirm it is still needed.
model=Word2Vec( vector_size=100, window=5, min_count=1, workers=4)

In [37]:
from scipy.spatial import distance

In [60]:
def jaccard_distance(a, b):
    """Return the Jaccard distance between the character sets of ``a`` and ``b``.

    The distance is ``1 - |A ∩ B| / |A ∪ B|`` where ``A`` and ``B`` are the sets
    of elements of ``a`` and ``b``: 0 means identical sets, 1 means nothing in
    common.

    Two empty inputs have an empty union; they are treated as identical and
    yield 0.0 instead of raising ZeroDivisionError.
    """
    chars_a = set(a)
    chars_b = set(b)
    union = chars_a | chars_b
    if not union:
        return 0.0
    return 1 - len(chars_a & chars_b) / len(union)

In [62]:
# Select the Jaccard distance as the active metric; get_closest_word reads this global at call time
distance_chosen=jaccard_distance

In [None]:
# get the closest words (correction) to the word we want to correct using heapq
def get_closest_word(word, words, n):
    """Return the ``n`` candidates closest to ``word``, nearest first.

    Parameters
    ----------
    word : str
        The (possibly misspelled) word to correct.
    words : iterable of str
        Candidate corrections, e.g. the vocabulary keys.
    n : int
        Maximum number of candidates to return.

    Returns
    -------
    list of str
        Up to ``n`` candidates ordered by increasing
        ``distance_chosen(word, candidate)``.
    """
    # heapq.nsmallest already returns the candidates sorted by the key. The
    # former extra sort on tup[1] indexed the *second character* of each word
    # (the items are plain strings, not tuples), which scrambled the ranking
    # and raised IndexError for one-letter candidates.
    return heapq.nsmallest(n, words, key=lambda candidate: distance_chosen(word, candidate))

Jaro-Winkler distance

In [22]:
#library to calculate the Jaro-Winkler distance
import jellyfish

In [23]:
#Jaro-Winkler similarity
def Jaro_Winkler(a, b):
    """Return the Jaro-Winkler score of ``a`` and ``b`` as computed by jellyfish.

    jellyfish renamed ``jaro_winkler`` to ``jaro_winkler_similarity``. The new
    name is used when the installed version provides it, otherwise the old one
    is called, so the function works with both.
    """
    similarity = getattr(jellyfish, 'jaro_winkler_similarity', None)
    if similarity is None:
        similarity = jellyfish.jaro_winkler
    return similarity(a, b)

In [74]:
# Select Jaro-Winkler as the active metric. The get_closest_word variant defined
# for this section keeps the candidates with the LARGEST values of this metric.
distance_chosen=Jaro_Winkler

In [78]:
# exception for jaro-winkler: the metric is a similarity, so larger values mean closer words

def get_closest_word(word, words, n):
    """Return the ``n`` candidates most similar to ``word``, best first.

    Unlike the distance-based variant, this one keeps the candidates with the
    highest ``distance_chosen(word, candidate)`` value.

    Parameters
    ----------
    word : str
        The (possibly misspelled) word to correct.
    words : iterable of str
        Candidate corrections, e.g. the vocabulary keys.
    n : int
        Maximum number of candidates to return.

    Returns
    -------
    list of str
        Up to ``n`` candidates ordered by decreasing score. Candidates with the
        same score keep the order in which they appear in ``words``. The list
        is empty when ``n <= 0`` or ``words`` is empty.
    """
    # heapq.nlargest returns the result already ordered best-first (the earlier
    # loop announced a sort but returned the candidates unordered) and returns
    # [] for n <= 0 where min([]) used to raise ValueError.
    return heapq.nlargest(n, words, key=lambda candidate: distance_chosen(word, candidate))

### Testing The Distance 

In [29]:
def replace_typo(match):
    """Build the replacement tag for one ``<typo>`` regex match.

    Group 1 of ``match`` is the original word and group 2 the typo. The typo is
    looked up with ``get_closest_word`` against the keys of ``word_frequency``
    (one suggestion requested) and the suggestions are joined with spaces
    inside a ``<correction>`` tag that keeps the original word.
    """
    orig = match.group(1)
    typo = match.group(2)
    vocabulary = word_frequency.keys()
    suggestions = get_closest_word(typo, vocabulary, 1)
    correction = " ".join(suggestions)
    return f'<correction typo orig="{orig}" >{correction}</correction>'
    

In [30]:
# Name of the selected distance function, added to the output file name for easier identification.
# __name__ is read directly instead of parsing the function's repr, whose layout is not guaranteed.
dist_string= distance_chosen.__name__

In [34]:

# Read the typo corpus; the context manager closes the handle once the text is loaded
with open('typo-0.2.txt','r') as file:
    text=file.read()

# Every <typo orig="...">...</typo> tag is replaced by the tag built in replace_typo
pattern = r'<typo orig="(.*?)">(.*?)</typo>'
modified_text = re.sub(pattern, replace_typo, text)

# Write the modified content to a file named after the selected distance
with open(f'typocorrected{dist_string}.txt', 'w') as file:
    file.write(modified_text)

### Combining approaches

In [1]:
!pip install nltk


Defaulting to user installation because normal site-packages is not writeable


In [2]:
import jellyfish
from nltk.metrics import jaccard_distance
from nltk.tokenize import word_tokenize


In [9]:



def get_closest_word_combined(range, input_word, reference_vocab):
    """Rank ``reference_vocab`` by a weighted blend of three string similarities.

    For every candidate the composite score is
    ``0.4 * levenshtein + 0.3 * jaccard + 0.3 * jaro_winkler``, where the
    Levenshtein term is ``1 - distance / longer length`` and the Jaccard term
    is ``1 - jaccard_distance`` on the character sets.

    Parameters
    ----------
    range : int
        Number of top-ranked candidates to return. The name shadows the
        builtin ``range``; it is kept so existing callers keep working.
    input_word : str
        The (possibly misspelled) word to correct.
    reference_vocab : iterable of str
        Candidate corrections.

    Returns
    -------
    list of str
        The first ``range`` candidates by decreasing composite score;
        candidates with equal scores keep their order in ``reference_vocab``.
    """
    # jellyfish renamed jaro_winkler to jaro_winkler_similarity; use whichever
    # name the installed version provides.
    jaro_winkler = getattr(jellyfish, 'jaro_winkler_similarity', None)
    if jaro_winkler is None:
        jaro_winkler = jellyfish.jaro_winkler

    input_chars = set(input_word)
    scores = []

    for word in reference_vocab:
        longest = max(len(input_word), len(word))
        if longest == 0:
            # Both strings are empty: the two ratio-based metrics would divide
            # by zero, so identical empty strings are scored as a perfect match.
            levenshtein_score = 1.0
            jaccard_score = 1.0
        else:
            levenshtein_score = 1 - jellyfish.levenshtein_distance(input_word, word) / longest
            jaccard_score = 1 - jaccard_distance(input_chars, set(word))
        jaro_winkler_score = jaro_winkler(input_word, word)

        # You can adjust the weights as needed for each metric
        composite_score = 0.4 * levenshtein_score + 0.3 * jaccard_score + 0.3 * jaro_winkler_score
        scores.append((word, composite_score))

    # Sort candidate words by composite score in descending order
    scores.sort(key=lambda x: x[1], reverse=True)
    return [word for word, _ in scores[:range]]
   


In [10]:
def replace_typo_2(match):
    """Build the replacement tag for one ``<typo>`` regex match using the combined metric.

    Group 1 of ``match`` is the original word and group 2 the typo. The two
    best candidates from ``get_closest_word_combined`` over the keys of
    ``word_frequency`` are joined with a space inside a ``<correction>`` tag
    that keeps the original word.
    """
    orig = match.group(1)
    typo = match.group(2)
    vocabulary = word_frequency.keys()
    suggestions = get_closest_word_combined(2, typo, vocabulary)
    correction = " ".join(suggestions)
    return f'<correction typo orig="{orig}" >{correction}</correction>'
    

In [12]:

# Read the typo corpus; the context manager closes the handle once the text is loaded
with open('typo-0.2.txt','r') as file:
    text=file.read()

# Every <typo orig="...">...</typo> tag is replaced by the tag built in replace_typo_2
pattern = r'<typo orig="(.*?)">(.*?)</typo>'
modified_text = re.sub(pattern, replace_typo_2, text)

# Write the modified content to the combined-metric output file
with open('typocombined.txt', 'w') as file:
    file.write(modified_text)

### Finding distance between the suggested words and the original word

In [None]:
def get_closest_score_combined(input_word, word):
    """Return the composite similarity between ``input_word`` and ``word``.

    The score is ``0.4 * levenshtein + 0.3 * jaccard + 0.3 * jaro_winkler``,
    where the Levenshtein term is ``1 - distance / longer length`` and the
    Jaccard term is ``1 - jaccard_distance`` on the character sets. Higher
    means more similar.

    Parameters
    ----------
    input_word : str
    word : str

    Returns
    -------
    float
        The weighted composite score.
    """
    # jellyfish renamed jaro_winkler to jaro_winkler_similarity; use whichever
    # name the installed version provides.
    jaro_winkler = getattr(jellyfish, 'jaro_winkler_similarity', None)
    if jaro_winkler is None:
        jaro_winkler = jellyfish.jaro_winkler

    longest = max(len(input_word), len(word))
    if longest == 0:
        # Both strings are empty: the two ratio-based metrics would divide by
        # zero, so identical empty strings are scored as a perfect match.
        levenshtein_score = 1.0
        jaccard_score = 1.0
    else:
        levenshtein_score = 1 - jellyfish.levenshtein_distance(input_word, word) / longest
        jaccard_score = 1 - jaccard_distance(set(input_word), set(word))
    jaro_winkler_score = jaro_winkler(input_word, word)

    # You can adjust the weights as needed for each metric
    composite_score = 0.4 * levenshtein_score + 0.3 * jaccard_score + 0.3 * jaro_winkler_score
    return composite_score

In [42]:
texts=['typocorrectedjaccard_distance','typocorrectedJaro_Winkler','typocorrected2distance_levenstein']
new_pattern = r'<correction typo orig="(.*?)" >(.*?)</correction>'

# Defined once, outside the loop, instead of being redefined for every file
def find_distance(match):
    """Score one ``<correction>`` match against its original word.

    Group 1 of ``match`` is the original word and group 2 the space-separated
    suggestions. Each suggestion is scored with ``get_closest_score_combined``.
    That score is a similarity (higher means closer), so the best suggestion
    is the one with the highest score. A correction without any suggestion
    scores 0.0.
    """
    orig=match.group(1)
    suggestions=match.group(2).split()
    if not suggestions:
        return 0.0
    return max(get_closest_score_combined(suggestion,orig) for suggestion in suggestions)

for textco in texts:
    # The context manager closes each corrected file after reading it
    with open(f'{textco}.txt','r') as file:
        text=file.read()

    # Score every correction tag of the file. The loop variable is not named
    # `pattern`, so the regex string stored under that global name is preserved.
    total_distances_in_text=[
        find_distance(correction_match)
        for correction_match in re.finditer(new_pattern, text)
    ]

    # A file without any correction tag has no average to report
    if not total_distances_in_text:
        print(f'no correction found in {textco}')
        continue

    #calculating the average score
    average_distance=sum(total_distances_in_text)/len(total_distances_in_text)
    print(f'the average distance of {textco} is {average_distance}')

the average distance of typocorrectedjaccard_distance is 0.42534662612176893
the average distance of typocorrectedJaro_Winkler is 0.5517231534992258
the average distance of typocorrected2distance_levenstein is 0.41523627229753357
