In [1]:
import os, os.path
import nltk
from shutil import copyfile

#SSL certificate verification has failed:
#the certificate is not in the system certificate store.
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
#PlaintextCorpusReader will use the default nltk.tokenize.sent_tokenize() 
#and nltk.tokenize.word_tokenize() to split your texts into sentences and words

from urllib import request

In [2]:
#----------------------------------------------STEP 1----------------------------------------------------------#
#Text number 1661 is "The Adventures of Sherlock Holmes" by Arthur Conan Doyle, and we can access it as follows.
# Plain-text edition of the book, fetched over HTTP.
url = "http://www.gutenberg.org/cache/epub/1661/pg1661.txt"
# The response is used as a context manager so the connection is always
# released, even when reading or decoding raises.
with request.urlopen(url) as response:
    corpus = response.read().decode('utf8')
# Drop every carriage return so that lines are terminated by '\n' only.
corpus = corpus.replace('\r', '')
length_corpus = len(corpus)

In [3]:
# Create the working directory for the corpus; nothing happens when it is
# already there.
corpusdir = 'newcorpus.nosync/'
os.makedirs(corpusdir, exist_ok=True)

# Copy the Makefile and the spell-checker test set next to the corpus,
# because later cells change into this directory and run `make` there.
copyfile("Makefile", os.path.join(corpusdir, "Makefile"))
copyfile("spell_checker_test_set.txt", os.path.join(corpusdir, "spell_checker_test_set.txt"))

'newcorpus.nosync/spell_checker_test_set.txt'

In [4]:
# Write the downloaded corpus into the corpus directory.
# The encoding is stated explicitly so the file does not depend on the
# platform default and matches the 'utf8' decoding used for the download.
filename = 'SherlockHolmes.txt'
with open(corpusdir+filename, 'w', encoding='utf8') as f:
    # print() appends one trailing '\n' after the corpus text.
    print(corpus, file=f)


In [5]:
# Sanity check: the file on disk must contain the downloaded corpus.
# Both texts are split on single spaces and compared up to (not including)
# the last chunk, because the written file ends with one extra '\n'.
# The file is opened in a `with` block so the handle is closed again.
with open(corpusdir+filename, 'r', encoding='utf8') as written:
    assert written.read().split(' ')[:-1] == corpus.split(' ')[:-1]

In [6]:
# Create a corpus reader over the corpus directory. Parameters:
# (1) the directory of the new corpus
# (2) a regular expression selecting the file ids; '.*' matches every file in
#     the directory, so the copied Makefile and test set become file ids too.
# NOTE: in this case the file ids are simply the file names.
# The reader exposes the text as paragraphs, sentences and words using the
# default behaviour of PlaintextCorpusReader.
newcorpus = PlaintextCorpusReader(corpusdir, '.*')
# From here on every relative path used by the notebook (generated .txt/.syms
# files, `make` targets) resolves inside corpusdir.
# NOTE(review): corpusdir is a relative path, so re-running this cell without
# restarting the kernel tries to change into a nested directory.
os.chdir(corpusdir)
#----------------------------------------------END OF STEP 1---------------------------------------------------#

In [7]:
#----------------------------------------------STEP 2----------------------------------------------------------#
#----------------------(a)---------------------#
#Function used as default argument in parser() function if it is not defined
def identity_preprocess(s):
    """Return ``s`` unchanged when it is a string.

    For any other type the fixed message "No string was given" is returned
    instead of raising.
    """
    return s if isinstance(s, str) else "No string was given"

In [8]:
#----------------------(b)---------------------#
#Function to parse the text file given, line by line
def parser(path,preprocess = identity_preprocess):
    """Apply ``preprocess`` to every line of a text and concatenate the results.

    path: despite its name, the text itself; it is split on '\n' here.
    preprocess: callable taking one line and returning an iterable of tokens.

    Returns one flat list holding the tokens of all lines in order. Whatever
    ``preprocess`` returns is iterated element by element, so a returned
    string contributes its individual characters.
    """
    return [token for line in path.split('\n') for token in preprocess(line)]

In [9]:
#----------------------(c)---------------------#
import re
import string
#Tokenization step, a simple version which includes tokens of lowercase words
def tokenize(s):
    """Split a string into lowercase, letters-only word tokens.

    The input is stripped and lowercased, every character that is neither an
    ASCII letter nor whitespace is deleted (so "it's" becomes "its"), and the
    remainder is split on whitespace.

    Returns a list of non-empty tokens; an input without letters gives [].
    """
    # Raw string: '\s' inside a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    letters_only = re.sub(r'[^A-Za-z\n\s]+', '', s.strip().lower())
    # split() without arguments collapses whitespace runs (newlines included)
    # and never yields empty tokens, so no further filtering is needed.
    return letters_only.split()
#----------------------------------------------END OF STEP 2---------------------------------------------------#

In [10]:
#----------------------------------------------STEP 3----------------------------------------------------------#
#Constructing word tokens and alphabet of the new corpus
#----------------------(a)---------------------#
# Select the book by its file id instead of by position in fileids(): the
# reader was created with the pattern '.*', so the position of the book in the
# listing changes as soon as the directory holds other files (for example the
# files generated by a previous run of this notebook).
corpus_preprocessed = newcorpus.raw('SherlockHolmes.txt')
# Lowercase, letters-only word tokens of the whole corpus.
word_tokens = parser(corpus_preprocessed, tokenize)


In [11]:
#----------------------(b)---------------------#
def tokenize_2(s):
    """Split ``s`` on whitespace without changing case or removing characters.

    Returns the list of whitespace-separated chunks. A blank or empty input
    yields [''] (a single empty string), not an empty list.
    """
    words = s.split()
    return words if words else ['']


In [12]:
def parser_2(path, preprocess):
    """Collect the set of distinct characters occurring in a text.

    path: the text itself (split on '\n' here), not a file path.
    preprocess: callable turning one line into an iterable of words.

    Returns a set with every character of every word, plus ' ', which is
    always included.
    """
    alphabet = {' '}
    for line in path.split('\n'):
        for word in preprocess(line):
            alphabet.update(word)
    return alphabet
        
# Sorted list of the distinct characters of the corpus (parser_2 always adds ' ').
alphabet_tokens = sorted(parser_2(corpus_preprocessed,tokenize_2))
#----------------------------------------------END OF STEP 3---------------------------------------------------#

In [13]:
#----------------------------------------------STEP 4----------------------------------------------------------#
# Write the symbol table 'chars.syms': one "<symbol> <index>" pair per line.
# Index 0 is reserved for <epsilon> and index 1 for <space>; every other
# character of the alphabet gets a consecutive index starting at 2.
filename = 'chars.syms'

# The space character is written as '<space>', so it is left out of the
# enumerated symbols. Enumerating the remaining symbols from 2 fixes the
# previous loop, which started at position 2 of alphabet_tokens and therefore
# never wrote the symbol stored at position 1.
symbols = [symbol for symbol in alphabet_tokens if symbol != ' ']

with open(filename, 'w') as syms_file:
    syms_file.write('<epsilon>'+ " " + str(0)+'\n')
    syms_file.write('<space>'+ "   " + str(1)+'\n')
    for index, symbol in enumerate(symbols, start=2):
        syms_file.write(symbol + "         " + str(index)+'\n')
#----------------------------------------------END OF STEP 4---------------------------------------------------#

In [14]:
#----------------------------------------------STEP 10----------------------------------------------------------#
from collections import defaultdict
def create_words_dictionary(word_tokens):
    """Map every distinct token to its relative frequency in ``word_tokens``.

    Each occurrence adds 1/len(word_tokens) to the entry of its token.
    Returns a defaultdict(float); an empty input gives an empty mapping.
    """
    total = len(word_tokens)
    frequencies = defaultdict(float)
    for token in word_tokens:
        frequencies[token] += 1/total
    return frequencies

# Relative frequency of every token of the corpus, keyed by token.
words_dictionary = create_words_dictionary(word_tokens)
#for k, v in words_dictionary.items():
#    print(k, v)


In [15]:
def create_characters_dictionary(alphabet_tokens, corpus_preprocessed):
    """Map every symbol of the alphabet to its relative frequency in the text.

    alphabet_tokens: iterable of symbols to count.
    corpus_preprocessed: the text in which the symbols are counted.

    The denominator is the length of the text with newline characters
    excluded. Returns a plain dict {symbol: count / denominator}.
    """
    newline_count = corpus_preprocessed.count('\n')
    length = len(corpus_preprocessed) - newline_count
    return {symbol: corpus_preprocessed.count(symbol)/length for symbol in alphabet_tokens}

# Relative frequency of every alphabet symbol in the corpus, keyed by symbol.
characters_dictionary = create_characters_dictionary(alphabet_tokens, corpus_preprocessed)
#for k, v in characters_dictionary.items():
#    print(k, v)
#----------------------------------------------END OF STEP 10---------------------------------------------------#

In [16]:
#----------------------------------------------STEP 11----------------------------------------------------------#
#Calculating the costs of transition for each word as cost_w_i = -log10(p(w_i))
#and after that the mean value of these costs
#----------------------(a)---------------------#
import math
import statistics 
# Cost of every word: cost(w) = -log10(p(w)).
# The mapping is built straight from the (word, probability) pairs. The
# previous version zipped set(word_tokens) with the values of
# words_dictionary; the two iterate in unrelated orders, so words were paired
# with the costs of other words.
words_dictionary_costs = {word: -math.log10(probability) for word, probability in words_dictionary.items()}

# Mean cost over all distinct words.
costs = list(words_dictionary_costs.values())
w = statistics.mean(costs)



In [17]:
#HERE WE CREATE THE TRANDUCER I
#for the word_tokens
#----------------------(b)---------------------#
filename = 'orth_I_words.txt'
filename = open(filename,'w')

alphabet="abcdefghijklmnopqrstuvwxyz"

for letter in alphabet:
    filename.write("0 0 "+ letter +" "+ letter +" 0\n")
filename.write("0")

filename.close()
!make -s orth_I_words

In [18]:
#HERE WE CREATE THE TRANDUCER E
filename = 'orth_E_words.txt'
filename = open(filename,'w')
filename.write('0 1 <epsilon> <epsilon> 0'+'\n')
for i in range(len(alphabet)):
    filename.write('0 1 <epsilon> '+alphabet[i]+' '+str(w)+'\n')#insertion
    filename.write('0 1 ' + alphabet[i]+' <epsilon> '+str(w)+'\n')#deletion
    for j in range(len(alphabet)):
        if alphabet[i]!=alphabet[j]:
            filename.write('0 1 ' + alphabet[i]+' '+alphabet[j]+' '+str(w)+'\n')#Replace character by another

filename.write(str(1))
filename.close()
!make -s orth_E_words
!make -s transducer_words
!make -s transducershortest_words
#FINALLY WE CREATE THE TRANDUCER transducer = orth_I | orth_E | orth_I with the Makefile

In [None]:
#----------------------(c)---------------------#
# Cost of every character: cost(c) = -log10(p(c)), then the mean of the costs.
# The mapping is built straight from the (symbol, probability) pairs. The
# previous version zipped set(alphabet_tokens) with the values of
# characters_dictionary; the two iterate in unrelated orders, so symbols were
# paired with the costs of other symbols.
characters_dictionary_costs = {symbol: -math.log10(probability) for symbol, probability in characters_dictionary.items()}
costs = list(characters_dictionary_costs.values())
# Note: this overwrites the word-level mean stored in w by an earlier cell.
w = statistics.mean(costs)
print(w)

3.215014865006331


In [None]:
#HERE WE CREATE THE TRANDUCER I
#for the char_tokens
filename = 'orth_I_chars.txt'
filename = open(filename,'w')

alphabet="abcdefghijklmnopqrstuvwxyz"

for letter in alphabet:
    filename.write("0 0 "+ letter +" "+ letter +" 0\n")
filename.write("0")

filename.close()
!make -s orth_I_chars


In [None]:
#HERE WE CREATE THE TRANDUCER E
filename = 'orth_E_chars.txt'
filename = open(filename,'w')
filename.write('0 1 <epsilon> <epsilon> 0'+'\n')
for i in range(len(alphabet)):
    filename.write('0 1 <epsilon> '+alphabet[i]+' '+str(w)+'\n')#insertion
    filename.write('0 1 ' + alphabet[i]+' <epsilon> '+str(w)+'\n')#deletion
    for j in range(len(alphabet)):
        if alphabet[i]!=alphabet[j]:
            filename.write('0 1 ' + alphabet[i]+' '+alphabet[j]+' '+str(w)+'\n')#Replace character by another

filename.write(str(1))
filename.close()
!make -s orth_E_chars
!make -s transducer_chars
#FINALLY WE CREATE THE TRANDUCER transducer = orth_I | orth_E | orth_I with the Makefile
#----------------------------------------------END OF STEP 11---------------------------------------------------#

In [None]:
#----------------------------------------------STEP 12----------------------------------------------------------#
#HERE WE CREATE THE ACCEPTOR/AUTOMATO used to accept all the words of our words_tokens, of the corpus.
#One state for each letter of every word-> States will be limited later when we will apply the respective
#commands of determinization, minimization, removal of <epsilon> transitions to our orth_acceptor.fst
#----------------------(a)---------------------#
filename = 'orth_acceptor_words.txt'
acceptor=open(filename, 'w')
final_states = []
state_count = 0

acceptor.write('0 0 <epsilon> 0\n')

for word in list(set(word_tokens)):
    chars = list(word)
    if(len(chars) == 1):
        arg = ['0',' ',str(state_count+1),' ',chars[0],' ',chars[0],' ', str(words_dictionary_costs[word]),'\n']
        arg = ''.join(arg)
        acceptor.write(arg)
        state_count += len(chars)
        final_states.append(str(state_count))
    else:
        arg = ['0',' ',str(state_count+1),' ',chars[0],' ',chars[0],' ',str(words_dictionary_costs[word]),'\n']
        arg = ''.join(arg)
        acceptor.write(arg)
        for j in range(1,len(chars)):
            arg = [str(j + state_count),' ',str(j+1 + state_count),' ',chars[j],' ',chars[j],' 0','\n']
            arg = ''.join(arg)
            acceptor.write(arg)
        state_count += len(chars)
        final_states.append(str(state_count))
for i in range(0,len(final_states)):
    arg = [final_states[i],'\n']
    arg = ''.join(arg)
    acceptor.write(arg)
acceptor.close()
!make -s orth_acceptor_words
!make -s orth_acceptor_processed_words


In [None]:
#HERE WE CREATE THE ACCEPTOR/AUTOMATO used to accept all the words of our char_tokens, of the corpus.
#One state for each letter -> States will be limited later when we will apply the respective
#commands of determinization, minimization, removal of <epsilon> transitions to our orth_acceptor.fst
#----------------------(b)---------------------#
filename = 'orth_acceptor_chars.txt'
acceptor=open(filename, 'w')
final_states = []
state_count = 0

acceptor.write('0 0 <epsilon> 0\n')
for word in list(set(word_tokens)):
    chars = list(word)
    if(len(chars) == 1):
        arg = ['0',' ',str(state_count+1),' ',chars[0],' ',chars[0],' ', str(characters_dictionary_costs[chars[0]]),'\n']
        arg = ''.join(arg)
        acceptor.write(arg)
        state_count += len(chars)
        final_states.append(str(state_count))
    else:
        arg = ['0',' ',str(state_count+1),' ',chars[0],' ',chars[0],' ',str(characters_dictionary_costs[chars[0]]),'\n']
        arg = ''.join(arg)
        acceptor.write(arg)
        for j in range(1,len(chars)):
            arg = [str(j + state_count),' ',str(j+1 + state_count),' ',chars[j],' ',chars[j],' ',str(characters_dictionary_costs[chars[j]]),'\n']
            arg = ''.join(arg)
            acceptor.write(arg)
        state_count += len(chars)
        final_states.append(str(state_count))
for i in range(0,len(final_states)):
    arg = [final_states[i],'\n']
    arg = ''.join(arg)
    acceptor.write(arg)
acceptor.close()
!make -s orth_acceptor_chars
!make -s orth_acceptor_processed_chars

#----------------------------------------------END OF STEP 12---------------------------------------------------#

In [None]:
#----------------------------------------------STEP 13----------------------------------------------------------#
#----------------------(a)---------------------#
!make -s orthograph_words

#----------------------(b)---------------------#
!make -s orthograph_chars

#----------------------(c)---------------------#
filename = 'cit.txt'
filename = open(filename, 'w')
word = "cit"
state = 0
for letter in word:
    if letter!='\n':
        filename.write(str(state)+' '+str(state+1)+' '+letter+ '\n')
        state+=1
filename.write(str(state)+'\n')

filename.close()
print("Checking the word <cit> with the orthograph_words")
!make -s check_cit_words
print("Checking the word <cit> with the orthograph_chars")
!make -s check_cit_chars

#----------------------------------------------END OF STEP 13---------------------------------------------------#

Checking the word <cit> with the orthograph_words
p
0
t
i


In [None]:
#----------------------------------------------STEP 14----------------------------------------------------------#
from lib import *
# Parse 'spell_checker_test_set.txt' into two parallel lists. On every line
# the text before the first ':' is the correct word and the text after it is a
# whitespace-separated list of misspellings of that word.
# Every line of the file is read (no random sampling takes place here).
filename = 'spell_checker_test_set.txt'
correct_words = []
wrong_words =[]
# `with` closes the file again; the previous version never closed it.
with open(filename, 'r') as test_set:
    for line in test_set:
        # Skip blank or malformed lines instead of failing with IndexError.
        if ':' not in line:
            continue
        parts = line.split(':')
        correct_words.append(parts[0])
        wrong_words.append(parts[1].split())

acceptor = []

In [None]:
# Build the index -> symbol lookup table from 'chars.syms': the position in
# the list is the numeric index of the symbol stored there.
# NOTE(review): the name `dict` shadows the built-in dict type for the rest of
# the kernel session; it is kept because a later cell reads this name.
with open('chars.syms', 'r') as dictionary:
    lines = dictionary.readlines()

dict = [0] * len(lines)
for entry in lines:
    matching = entry.split()
    dict[int(matching[1])] = matching[0]

In [None]:
#Here in file OurResults, we will save the produced words
filename_words = 'OurResults_words.txt'
filename_chars = 'OurResults_chars.txt'
result_words = open(filename_words, 'w')
result_chars = open(filename_chars, 'w')
for i in range(len(wrong_words)):
    for word in wrong_words[i]:
        #--------------------------------------------------------------------------#
        #We truncate this file in order to make the other acceptors in the same file
        acceptor=open('word_acceptor.txt', 'w')
        state = 0
        for letter in word:
            if letter!='\n':
                acceptor.write(str(state)+' '+str(state+1)+' '+letter +'\n')
                
                state+=1
        acceptor.write(str(state)+'\n')
        acceptor.close()
        #--------------------------------------------------------------------------#
        #We use the fst tool in order to create the acceptor for every word
        #The method of shortest path was used to find the best matches
        !make -s unique_word
        #--------------------------------------------------------------------------#
        #We write the result in a file in order to compare the best words later
        acceptor_shortest_words=open('Acceptor_Shortest_words.txt', 'r')
        lines=acceptor_shortest_words.readlines()
        temp_word=[]

        for j in range(2,len(lines)):
            chars = lines[j].split()
            if(len(chars) > 3):
                temp_word.append(chars[3])
        if(len(lines) > 1):
            chars = lines[0].split()
            if(len(chars) > 3):
                temp_word.append(chars[3])
        #--------------------------------------------------------------------------#
        #Apparently, now in temp_word we have the produced word, which is going to be
        #cheked based on our dictionary created in the previous block.
        for letter in temp_word[1:(len(temp_word)-1)]:
            if int(letter)!=0:
                result_words.write(dict[int(letter)])
        #--------------------------------------------------------------------------#
        #So for each word we save our result bh using this format:
        #|word orthograph| + |wrong_word| + |correct_word|
        result_words.write(' '+word+' '+correct_words[i]+'\n')
        #--------------------------------------------------------------------------#
        #Repeat the procedure for the orthograph_chars
        acceptor_shortest_chars=open('Acceptor_Shortest_chars.txt', 'r')
        lines=acceptor_shortest_chars.readlines()
        temp_word=[]
        for j in range(2,len(lines)):
            chars = lines[j].split()
            if(len(chars) > 3):
                temp_word.append(chars[3])
        if(len(lines) > 1):
            chars = lines[0].split()
            if(len(chars) > 3):
                temp_word.append(chars[3])
                
        for letter in temp_word[1:(len(temp_word)-1)]:
            if int(letter)!=0:
                result_chars.write(dict[int(letter)])
        result_chars.write(' '+word+' '+correct_words[i]+'\n')
        
        

result_words.close()
result_chars.close()

In [None]:
#### Score the word-level spell checker (ORTHOGRAPH_WORDS).
# Every line of 'OurResults_words.txt' holds: <produced word> <wrong word> <correct word>.
# A line with fewer than three fields means that no word was produced.
# NOTE(review): wrong_words is reused as a counter here and overwrites the
# list of misspellings built in an earlier cell.
corrected_words=0
wrong_words=0
no_matching_words=0
# `with` closes the result file; the previous version never closed it.
with open('OurResults_words.txt', 'r') as result:
    words=result.readlines()
for word in words:
    chars = word.split()
    if(len(chars) >2):
        # Counted as corrected only when the produced word equals the correct
        # word and the input word really was misspelled.
        if(chars[0] == chars[2] and chars[1]!=chars[2]):
            corrected_words+=1
        else:
            wrong_words +=1
    else:
        no_matching_words+=1

print('\nCHECKING WITH ORTHOGRAPH_WORDS GAVE THE FOLLOWING RESULTS\n')
print('Corrected Words ' + str(corrected_words))
print('Wrong Words ' + str(wrong_words))
print('There was no matching for '+ str(no_matching_words) + ' words')


In [None]:
#### Score the character-level spell checker (ORTHOGRAPH_CHARS).
# Every line of 'OurResults_chars.txt' holds: <produced word> <wrong word> <correct word>.
# A line with fewer than three fields means that no word was produced.
# NOTE(review): wrong_words is reused as a counter here and overwrites the
# list of misspellings built in an earlier cell.
corrected_words=0
wrong_words=0
no_matching_words=0
# `with` closes the result file; the previous version never closed it.
with open('OurResults_chars.txt', 'r') as result:
    words=result.readlines()
for word in words:
    chars = word.split()
    if(len(chars) >2):
        # Counted as corrected only when the produced word equals the correct
        # word and the input word really was misspelled.
        if(chars[0] == chars[2] and chars[1]!=chars[2]):
            corrected_words+=1
        else:
            wrong_words +=1
    else:
        no_matching_words+=1

print('\nCHECKING WITH ORTHOGRAPH_CHARS GAVE THE FOLLOWING RESULTS\n')
print('Corrected Words ' + str(corrected_words))
print('Wrong Words ' + str(wrong_words))
print('There was no matching for '+ str(no_matching_words) + ' words')
#----------------------------------------------END OF STEP 14---------------------------------------------------#

In [None]:
#----------------------------------------------STEP 15----------------------------------------------------------#
