In [1]:
import os, os.path
import nltk
from shutil import copyfile

#SSL Certificate has fauled
#that not in the system certificate store.
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
#PlaintextCorpusReader will use the default nltk.tokenize.sent_tokenize() 
#and nltk.tokenize.word_tokenize() to split your texts into sentences and words

from urllib import request

In [2]:
#----------------------------------------------STEP 1----------------------------------------------------------#
#Text number 1661 is "The Adventures of Sherlock Holmes" by Arthur Conan Doyle, and we can access it as follows.
url = "http://www.gutenberg.org/cache/epub/1661/pg1661.txt"
response = request.urlopen(url)
corpus = response.read().decode('utf8')
corpus = corpus.replace('\r', '')
length_corpus = len(corpus)
print(length_corpus)
print(corpus[:100])


581864
﻿Project Gutenberg's The Adventures of Sherlock Holmes, by Arthur Conan Doyle

This eBook is for the


In [3]:
# Make new dir for the corpus.
corpusdir = 'newcorpus.nosync/'
if not os.path.isdir(corpusdir):
    os.mkdir(corpusdir)
    
copyfile("Makefile", corpusdir + "Makefile")
copyfile("spell_checker_test_set.txt",corpusdir + "spell_checker_test_set.txt")



'newcorpus.nosync/spell_checker_test_set.txt'

In [4]:
# Output the files into the directory.
filename = 'SherlockHolmes.txt'
with open(corpusdir+filename, 'w') as f:
    print(corpus, file=f) 


In [5]:
#Check that our corpus do exist and the files are correct.
# Key Note:
# 1.We split each file into words and we their equality until the penultimate word, since there is one extra '\n'
#in the created file
assert open(corpusdir+filename,'r').read().split(' ')[:-1] == corpus.split(' ')[:-1]

In [6]:
# Create a new corpus by specifying the parameters
# (1) directory of the new corpus
# (2) the fileids of the corpus
# NOTE: in this case the fileids are simply the filenames.
# Now the text has been parsed into paragraphs, sentences and words by the default actions
# of the PlaintextCorpusReader
newcorpus = PlaintextCorpusReader(corpusdir, '.*')
os.chdir(corpusdir)
#----------------------------------------------END OF STEP 1---------------------------------------------------#

In [7]:
#----------------------------------------------STEP 2----------------------------------------------------------#
#----------------------(a)---------------------#
#Function used as default argument in parser() function if it is not defined
def identity_preprocess(s):
    if(isinstance(s, str)):
        return s
    else: return "No string was given"

In [8]:
#----------------------(b)---------------------#
#Function to parse the text file given, line by line
def parser(path,preprocess = identity_preprocess):
    tokens = []
    for line in path.split('\n'):
        tokens+= preprocess(line)
    return tokens

In [9]:
#----------------------(c)---------------------#
import re
import string
#Tokenization step, a simple version which includes tokens of lowercase words
def tokenize(s):
    s_temp = s.strip().lower()
    s_temp = re.sub('[^A-Za-z\n\s]+', '', s_temp)
    s_temp = s_temp.replace('\n', ' ')
    s_temp = " ".join(s_temp.split())
    s_temp = s_temp.split(' ')
    s_temp[:] = [item for item in s_temp if item != '']
    return s_temp


In [10]:
#----------------------(d)---------------------#
#Comparing results of our built word tokenizer with a sentence that proves its functionality
#with the results given by nltk's word tokenizers
#print(" A RoCk3!45.! Fell frOm334 \n ~. heaven ")
#1.The word_tokenize() function is a wrapper function that calls tokenize() on an
#instance of the TreebankWordTokenizer class. It is a simpler, regular-expression 
#based tokenizer, which splits text on whitespace and punctuation:
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()
print( tokenizer.tokenize(" A RoCk3!45.! Fell frOm334 \n ~. heaven "))

#2.WordPunctTokenizer splits all punctuations into separate tokens:
from nltk.tokenize import WordPunctTokenizer
word_punct_tokenizer = WordPunctTokenizer()
print(word_punct_tokenizer.tokenize(" A RoCk3!45.! Fell frOm334 \n ~. heaven "))

#3.Our created tokenizer
print(tokenize(" A RoCk3!45.! Fell     frOm334 \n ~. heaven "))
#----------------------------------------------END OF STEP 2---------------------------------------------------#

['A', 'RoCk3', '!', '45.', '!', 'Fell', 'frOm334', '~.', 'heaven']
['A', 'RoCk3', '!', '45', '.!', 'Fell', 'frOm334', '~.', 'heaven']
['a', 'rock', 'fell', 'from', 'heaven']


In [11]:
#----------------------------------------------STEP 3----------------------------------------------------------#
#Constructing word tokens and alphabet of the new corpus
#----------------------(a)---------------------#
corpus_preprocessed = newcorpus.raw(newcorpus.fileids()[1])
word_tokens = parser(corpus_preprocessed, tokenize)

print(word_tokens[:20])


['project', 'gutenbergs', 'the', 'adventures', 'of', 'sherlock', 'holmes', 'by', 'arthur', 'conan', 'doyle', 'this', 'ebook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere']


In [12]:
#----------------------(b)---------------------#
def tokenize_2(s):
    s_temp = s.strip()
    s_temp = " ".join(s_temp.split())
    s_temp = s_temp.split(' ')
    return s_temp


In [13]:
def parser_2(path, preprocess):
    alphabet = []
    for line in path.split('\n'):
        line = preprocess(line)
        for word in line:
            alphabet+= list(word)
            
    alphabet.append(' ')
    return set(alphabet)
        
alphabet_tokens = sorted(parser_2(corpus_preprocessed,tokenize_2))
print(alphabet_tokens)

#----------------------------------------------END OF STEP 3---------------------------------------------------#

[' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'à', 'â', 'è', 'é']


In [14]:
#----------------------------------------------STEP 4----------------------------------------------------------#
filename = 'chars.syms'
filename =  open(filename, 'w')
result = []

filename.write('<epsilon>'+ " " + str(0)+'\n')
filename.write('<space>'+ "   " + str(1)+'\n')
for symbol in range(2,len(alphabet_tokens)):
    line = alphabet_tokens[symbol] + "         " + str(symbol)+'\n'
    filename.write(line)

filename.close()
#----------------------------------------------END OF STEP 4---------------------------------------------------#

In [15]:
#----------------------------------------------STEP 5----------------------------------------------------------#
#HERE WE CREATE THE TRANDUCER I
#----------------------(a)---------------------#
filename = 'orth_I.txt'
filename = open(filename,'w')

alphabet="abcdefghijklmnopqrstuvwxyz"

for letter in alphabet:
    filename.write("0 0 "+ letter +" "+ letter +" 0\n")
filename.write("0")

filename.close()
!make -s orth_I

In [16]:
#HERE WE CREATE THE TRANDUCER E
filename = 'orth_E.txt'
filename = open(filename,'w')
alphabet="abcdefghijklmnopqrstuvwxyz"
filename.write('0 1 <epsilon> <epsilon> 0'+'\n')
for i in range(len(alphabet)):
    filename.write('0 1 <epsilon> '+alphabet[i]+' '+str(1)+'\n')#insertion
    filename.write('0 1 '+alphabet[i]+' <epsilon> '+str(1)+'\n')#deletion
    for j in range(len(alphabet)):
        if alphabet[i]!=alphabet[j]:
            filename.write('0 1 '+alphabet[i]+' '+alphabet[j]+' '+str(1)+'\n')#Replace character by another

filename.write(str(1))
filename.close()
!make -s orth_E
!make -s transducer
#FINALLY WE CREATE THE TRANDUCER transducer = orth_I | orth_E | orth_I with the Makefile
#----------------------------------------------END OF STEP 5---------------------------------------------------#

In [17]:
#----------------------------------------------STEP 6----------------------------------------------------------#
#HERE WE CREATE THE ACCEPTOR/AUTOMATO used to accept all the words of our words_tokens, of the corpus.
#One state for each letter of every word-> States will be limited later when we will apply the respective
#commands of determinization, minimization, removal of <epsilon> transitions to our orth_acceptor.fst
#----------------------(a)---------------------#
filename = 'orth_acceptor.txt'
acceptor=open(filename, 'w')
final_states = []
state_count = 0

acceptor.write('0 0 <epsilon> 0\n')
for word in word_tokens:
    chars = list(word)
    if(len(chars) == 1):
        arg = ['0',' ',str(state_count+1),' ',chars[0],' ',chars[0],' 0','\n']
        arg = ''.join(arg)
        acceptor.write(arg)
        state_count += len(chars)
        final_states.append(str(state_count))
    else:
        arg = ['0',' ',str(state_count+1),' ',chars[0],' ',chars[0],' 0','\n']
        arg = ''.join(arg)
        acceptor.write(arg)
        for j in range(1,len(chars)):
            arg = [str(j + state_count),' ',str(j+1 + state_count),' ',chars[j],' ',chars[j],' 0','\n']
            arg = ''.join(arg)
            acceptor.write(arg)
        state_count += len(chars)
        final_states.append(str(state_count))
for i in range(0,len(final_states)):
    arg = [final_states[i],'\n']
    arg = ''.join(arg)
    acceptor.write(arg)
acceptor.close()
!make -s orth_acceptor
!make -s orth_acceptor_processed

#----------------------------------------------END OF STEP 6---------------------------------------------------#

In [18]:
#----------------------------------------------STEP 7----------------------------------------------------------#
#----------------------(a)---------------------#
!make -s orthograph
#----------------------(b)---------------------#
filename = 'cit.txt'
filename = open(filename, 'w')
word = "cit"
state = 0
for letter in word:
    if letter!='\n':
        filename.write(str(state)+' '+str(state+1)+' '+letter+ '\n')
        state+=1
filename.write(str(state)+'\n')

filename.close()
!make -s check_cit

#----------------------------------------------END OF STEP 7---------------------------------------------------#

3	2	c	w	1
0
1	0	t	t
2	1	i	i


In [19]:
#----------------------------------------------STEP 8----------------------------------------------------------#
#----------------------(a)---------------------#
from lib import *
filename = 'spell_checker_test_set.txt'
#We take 'spell_checker_test_set.txt', and we split to create 2 lists, the one with the correct words
#and the other with the list of the relevant wrong words. We chose randomly to ckeck 20 lines 
filename = open(filename, 'r')
lines = filename.readlines()[20:40]
correct_words = []
wrong_words =[]
for line in lines:
    correct_words.append(line.split(':')[0])
    wrong_words.append((line.split(':')[1]).split())

acceptor = []
print(correct_words)
print(wrong_words)

['further', 'monitoring', 'biscuits', 'available', 'separate', 'necessary', 'definition', 'receipt', 'remind', 'initials', 'magnificent', 'aunt', 'initial', 'there', 'experiences', 'built', 'totally', 'understand', 'southern', 'definitely']
[['futher'], ['monitering'], ['biscits', 'biscutes', 'biscuts', 'bisquits', 'buiscits', 'buiscuts'], ['avaible'], ['seperate'], ['neccesary', 'necesary', 'neccesary', 'necassary', 'necassery', 'neccasary'], ['defenition'], ['receit', 'receite', 'reciet', 'recipt'], ['remine', 'remined'], ['inetials', 'inistals', 'initails', 'initals', 'intials'], ['magnificnet', 'magificent', 'magnifcent', 'magnifecent', 'magnifiscant', 'magnifisent', 'magnificant'], ['annt', 'anut', 'arnt'], ['intial'], ['ther'], ['experances'], ['biult'], ['totaly'], ['undersand', 'undistand'], ['southen'], ['definately', 'difinately']]


In [20]:
#We should create the dictionary based on the "chars.syms". The position in the dictionary
#represents the index in the symbol
dictionary = 'chars.syms'
dictionary= open(dictionary,'r')
lines=dictionary.readlines()
dict=[0 for i in range(len(lines))]
for line in lines:
    matching = line.split()
    dict[int(matching[1])]=matching[0]
dictionary.close()

print(dict)

['<epsilon>', '<space>', '"', '#', '$', '%', '&', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'à', 'â', 'è', 'é']


In [21]:
#Here in file OurResults, we will save the produced words
filename = 'OurResults.txt'
result = open(filename, 'w')
for i in range(len(wrong_words)):
    for word in wrong_words[i]:
        #--------------------------------------------------------------------------#
        #We truncate this file in order to make the other acceptors in the same file
        acceptor=open('word_acceptor.txt', 'w')
        state = 0
        for letter in word:
            if letter!='\n':
                acceptor.write(str(state)+' '+str(state+1)+' '+letter +'\n')
                
                state+=1
        acceptor.write(str(state)+'\n')
        acceptor.close()
            #--------------------------------------------------------------------------#
        #We use the fst tool in order to create the acceptor for every word
        #The method of shortest path was ussed to find the best matches
        !make -s unique_word
        #--------------------------------------------------------------------------#
        #We write the result in a file in order to compare the best words later
        acceptor_shortest=open('Acceptor_Shortest.txt', 'r')
        lines=acceptor_shortest.readlines()
        temp_word=[]

        for j in range(2,len(lines)):
            chars = lines[j].split()
            if(len(chars) > 3):
                temp_word.append(chars[3])
        if(len(lines) > 1):
            chars = lines[0].split()
            if(len(chars) > 3):
                temp_word.append(chars[3])
        #--------------------------------------------------------------------------#
        print(word,end =' ')
        #Apparently, now in temp_word we have the produced word, which is going to be
        #cheked based on our dictionary created in the previous block.
        for letter in temp_word[1:(len(temp_word)-1)]:
            if int(letter)!=0:
                print(dict[int(letter)],end ='')
                result.write(dict[int(letter)])
        print(' ',end = '')
    
        #--------------------------------------------------------------------------#
        #So for each word we save our result bh using this format:
        #|word orthograph| + |wrong_word| + |correct_word|
        print(correct_words[i])
        result.write(' '+word+' '+correct_words[i]+'\n')

result.close()

futher faurtthehrer further
monitering  monitoring
biscits  biscuits
biscutes  biscuits
biscuts  biscuits
bisquits  biscuits
buiscits  biscuits
buiscuts  biscuits
avaible  available
seperate separate separate
neccesary  necessary
necesary necessssaary necessary
neccesary  necessary
necassary necessary necessary
necassery  necessary
neccasary  necessary
defenition  definition
receit receniptt receipt
receite receive receipt
reciet  receipt
recipt receipt receipt
remine  remind
remined remained remind
inetials initials initials
inistals  initials
initails  initials
initals initials initials
intials initials initials
magnificnet  magnificent
magificent magnificent magnificent
magnifcent magnificent magnificent
magnifecent magnificent magnificent
magnifiscant  magnificent
magnifisent magnificent magnificent
magnificant magnificent magnificent
annt aunt aunt
anut 0nut aunt
arnt auranttt aunt
intial  initial
ther t0thto00he0himynrheeerrr there
experances  experiences
biult  built
totaly tota

In [22]:
#### HERE WE GONNA CHECK THE CORRECTNESS OF OUR ORTHOGRAPH
corrected_words=0
wrong_words=0
no_matching_words=0
result=open('OurResults.txt', 'r')
words=result.readlines()
for word in words:
    chars = word.split()
    if(len(chars) >2):
        if(chars[0] == chars[2] and chars[1]!=chars[2]):
            corrected_words+=1
        else:
            wrong_words +=1
    else:
        no_matching_words+=1

print('\nOur Orthograph gave the following results\n')
print('Corrected Words ' + str(corrected_words))
print('Wrong Words ' + str(wrong_words))
print('There was no matching for '+ str(no_matching_words) + ' words')
#----------------------------------------------END OF STEP 8---------------------------------------------------#


Our Orthograph gave the following results

Corrected Words 15
Wrong Words 9
There was no matching for 24 words


In [27]:
#----------------------------------------------STEP 9----------------------------------------------------------#
#----------------------(a)---------------------#
# Initialize word2vec. Context is taken as the 2 previous and 2 next words
def parser3(path,preprocess = identity_preprocess):
    tokens = []
    for line in path.split('\n'):
        s_temp = preprocess(line)
        if(s_temp ==[]):continue
        tokens.append(s_temp)
    return tokens

sent_tokens = parser3(corpus_preprocessed, tokenize)
print(sent_tokens[:30])


[['project', 'gutenbergs', 'the', 'adventures', 'of', 'sherlock', 'holmes', 'by', 'arthur', 'conan', 'doyle'], ['this', 'ebook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'at', 'no', 'cost', 'and', 'with'], ['almost', 'no', 'restrictions', 'whatsoever', 'you', 'may', 'copy', 'it', 'give', 'it', 'away', 'or'], ['reuse', 'it', 'under', 'the', 'terms', 'of', 'the', 'project', 'gutenberg', 'license', 'included'], ['with', 'this', 'ebook', 'or', 'online', 'at', 'wwwgutenbergnet'], ['title', 'the', 'adventures', 'of', 'sherlock', 'holmes'], ['author', 'arthur', 'conan', 'doyle'], ['posting', 'date', 'april', 'ebook'], ['first', 'posted', 'november'], ['language', 'english'], ['start', 'of', 'this', 'project', 'gutenberg', 'ebook', 'the', 'adventures', 'of', 'sherlock', 'holmes'], ['produced', 'by', 'an', 'anonymous', 'project', 'gutenberg', 'volunteer', 'and', 'jose', 'menendez'], ['the', 'adventures', 'of', 'sherlock', 'holmes'], ['by'], ['sir', 'arthur', 'conan', 'doyle'], ['i

In [28]:
#----------------------(b)---------------------#
from gensim import models
import numpy as np
import random

model = models.Word2Vec( sent_tokens,window=5, size=100, min_count = 2,workers=4)
model.train(sent_tokens, total_examples=len(sent_tokens), epochs=1000)

# get ordered vocabulary list
voc = model.wv.index2word

# get vector size
dim = model.vector_size

words_to_check = random.sample(voc, 10)
for word in words_to_check:
    sim = model.wv.most_similar(word,topn=2)
    print(word,sim)
    

leg [('whoever', 0.40393221378326416), ('fathom', 0.3576420545578003)]
presently [('whistle', 0.33856892585754395), ('country', 0.3132840394973755)]
murderous [('englishman', 0.5531414747238159), ('expression', 0.3790321946144104)]
mud [('flash', 0.4029349386692047), ('throws', 0.390768826007843)]
stepping [('trick', 0.46358785033226013), ('stared', 0.4237854778766632)]
lens [('pillow', 0.4028383791446686), ('creases', 0.3827398717403412)]
but [('and', 0.5507899522781372), ('that', 0.5200991034507751)]
fantastic [('misgivings', 0.3805195987224579), ('bride', 0.3748277425765991)]
staring [('gazing', 0.4954022765159607), ('drives', 0.40069448947906494)]
answering [('jest', 0.42209839820861816), ('tone', 0.35786348581314087)]
