In [9]:
import re
import string
from collections import Counter
import numpy as np

**Read Dataset**

In [10]:
def read_corpus(filename, encoding="utf-8"):
  """Read a text file and return its word tokens, lowercased.

  Each line is lowercased and tokenized with the regex ``\\w+``, so
  punctuation and whitespace act as separators and are dropped.

  Args:
    filename: Path of the text file to read.
    encoding: Text encoding used to decode the file. Defaults to UTF-8 so
      the result does not depend on the platform's locale encoding.

  Returns:
    A list of tokens in file order, duplicates included.
  """
  words = []
  with open(filename, "r", encoding=encoding) as file:
    # Iterate the file lazily instead of loading every line up front.
    for line in file:
      words.extend(re.findall(r'\w+', line.lower()))

  return words

In [11]:
# Tokenize the corpus file once; `words` keeps every token, duplicates included.
words = read_corpus("./sastrawi.txt")
print(f"Jumlah kata di dalam corpus: {len(words)}")

Jumlah kata di dalam corpus: 30820


In [12]:
# Distinct tokens; passed later as the vocabulary for membership checks.
vocabs = set(words)
print(f"Jumlah kosakata unik: {len(vocabs)}")

Jumlah kosakata unik: 30173


In [13]:
# Number of occurrences of each token in the corpus.
word_counts = Counter(words)
# Spot-check the count of a single word.
print(word_counts["kuliah"])

1


**Count the word's probability**


**P(word) = C(word) / V**


P(word): probability of the word

C(word): number of times the word appears in the corpus

V: total number of words (tokens) in the corpus


In [14]:
# Total number of tokens in the corpus (the denominator V), stored as a float.
total_word_count = float(sum(word_counts.values()))
# Relative frequency of every distinct word: C(word) / V.
word_probas = {
  word: count / total_word_count
  for word, count in word_counts.items()
}

In [15]:
# Relative frequency of "kuliah": its count divided by the total token count.
print(word_probas["kuliah"])

3.244646333549643e-05


Split a word into two components

In [16]:
def split(word):
  """Return every (prefix, suffix) pair that cuts `word` in two.

  The cut position runs from 0 to len(word) inclusive, so both the pair
  with an empty prefix and the pair with an empty suffix are included.
  """
  pairs = []
  for cut in range(len(word) + 1):
    pairs.append((word[:cut], word[cut:]))
  return pairs

In [17]:
# Demo: all (prefix, suffix) cuts of "apapun", including the two empty-sided ones.
print(split("apapun"))

[('', 'apapun'), ('a', 'papun'), ('ap', 'apun'), ('apa', 'pun'), ('apap', 'un'), ('apapu', 'n'), ('apapun', '')]


1. **Insert** a letter

In [18]:
def insert(word):
  """Return all strings formed by inserting one lowercase ASCII letter.

  A letter is inserted at every cut position produced by split(), so the
  result has 26 * (len(word) + 1) entries and may contain duplicates.
  """
  candidates = []
  for head, tail in split(word):
    for letter in string.ascii_lowercase:
      candidates.append(head + letter + tail)
  return candidates

In [19]:
# Demo: every string obtained by inserting one lowercase letter into "abar".
print(insert("abar"))

['aabar', 'babar', 'cabar', 'dabar', 'eabar', 'fabar', 'gabar', 'habar', 'iabar', 'jabar', 'kabar', 'labar', 'mabar', 'nabar', 'oabar', 'pabar', 'qabar', 'rabar', 'sabar', 'tabar', 'uabar', 'vabar', 'wabar', 'xabar', 'yabar', 'zabar', 'aabar', 'abbar', 'acbar', 'adbar', 'aebar', 'afbar', 'agbar', 'ahbar', 'aibar', 'ajbar', 'akbar', 'albar', 'ambar', 'anbar', 'aobar', 'apbar', 'aqbar', 'arbar', 'asbar', 'atbar', 'aubar', 'avbar', 'awbar', 'axbar', 'aybar', 'azbar', 'abaar', 'abbar', 'abcar', 'abdar', 'abear', 'abfar', 'abgar', 'abhar', 'abiar', 'abjar', 'abkar', 'ablar', 'abmar', 'abnar', 'aboar', 'abpar', 'abqar', 'abrar', 'absar', 'abtar', 'abuar', 'abvar', 'abwar', 'abxar', 'abyar', 'abzar', 'abaar', 'ababr', 'abacr', 'abadr', 'abaer', 'abafr', 'abagr', 'abahr', 'abair', 'abajr', 'abakr', 'abalr', 'abamr', 'abanr', 'abaor', 'abapr', 'abaqr', 'abarr', 'abasr', 'abatr', 'abaur', 'abavr', 'abawr', 'abaxr', 'abayr', 'abazr', 'abara', 'abarb', 'abarc', 'abard', 'abare', 'abarf', 'abarg', 

2. **Delete** a letter

In [20]:
def delete(word):
  """Return all strings formed by removing exactly one character of `word`.

  One entry is produced per character position, in order; duplicates are
  kept (e.g. when two adjacent characters are equal).
  """
  shortened = []
  for head, tail in split(word):
    if not tail:
      # Nothing left to drop once the suffix is empty.
      continue
    shortened.append(head + tail[1:])
  return shortened

In [21]:
# Demo: dropping one letter from "wwalaupun"; removing either leading "w" yields "walaupun".
print(delete("wwalaupun"))

['walaupun', 'walaupun', 'wwlaupun', 'wwaaupun', 'wwalupun', 'wwalapun', 'wwalauun', 'wwalaupn', 'wwalaupu']


3. **Swap** two adjacent letters

In [22]:
def swap(word):
  """Return all strings formed by transposing two adjacent characters.

  One entry is produced per adjacent pair, in left-to-right order.
  """
  transposed = []
  for head, tail in split(word):
    if len(tail) < 2:
      # A transposition needs at least two characters in the suffix.
      continue
    transposed.append(head + tail[1] + tail[0] + tail[2:])
  return transposed

In [23]:
# Demo: adjacent transpositions of "yagn"; the last one is "yang".
print(swap("yagn"))

['aygn', 'ygan', 'yang']


In [24]:
# The alphabet that insert() and replace() draw candidate letters from.
string.ascii_lowercase

'abcdefghijklmnopqrstuvwxyz'

4. **Replace** a letter with another

In [25]:
def replace(word):
  """Return all strings formed by substituting one character of `word`.

  Every position is replaced by each lowercase ASCII letter in turn, so the
  result has 26 * len(word) entries. The original letter is not skipped,
  which means `word` itself can appear in the result.
  """
  alphabet = string.ascii_lowercase
  substituted = []
  for head, tail in split(word):
    if tail:
      rest = tail[1:]
      substituted.extend(head + letter + rest for letter in alphabet)
  return substituted

In [26]:
# Demo: single-letter substitutions of "vulan"; "bulan" is among them.
print(replace("vulan"))

['aulan', 'bulan', 'culan', 'dulan', 'eulan', 'fulan', 'gulan', 'hulan', 'iulan', 'julan', 'kulan', 'lulan', 'mulan', 'nulan', 'oulan', 'pulan', 'qulan', 'rulan', 'sulan', 'tulan', 'uulan', 'vulan', 'wulan', 'xulan', 'yulan', 'zulan', 'valan', 'vblan', 'vclan', 'vdlan', 'velan', 'vflan', 'vglan', 'vhlan', 'vilan', 'vjlan', 'vklan', 'vllan', 'vmlan', 'vnlan', 'volan', 'vplan', 'vqlan', 'vrlan', 'vslan', 'vtlan', 'vulan', 'vvlan', 'vwlan', 'vxlan', 'vylan', 'vzlan', 'vuaan', 'vuban', 'vucan', 'vudan', 'vuean', 'vufan', 'vugan', 'vuhan', 'vuian', 'vujan', 'vukan', 'vulan', 'vuman', 'vunan', 'vuoan', 'vupan', 'vuqan', 'vuran', 'vusan', 'vutan', 'vuuan', 'vuvan', 'vuwan', 'vuxan', 'vuyan', 'vuzan', 'vulan', 'vulbn', 'vulcn', 'vuldn', 'vulen', 'vulfn', 'vulgn', 'vulhn', 'vulin', 'vuljn', 'vulkn', 'vulln', 'vulmn', 'vulnn', 'vulon', 'vulpn', 'vulqn', 'vulrn', 'vulsn', 'vultn', 'vulun', 'vulvn', 'vulwn', 'vulxn', 'vulyn', 'vulzn', 'vulaa', 'vulab', 'vulac', 'vulad', 'vulae', 'vulaf', 'vulag', 

**I. Level One Edit**

In [27]:
def edit1(word):
  """Return the set of strings one edit away from `word`.

  Combines the results of delete(), swap(), replace() and insert() and
  removes duplicates by collecting them in a set.
  """
  candidates = set()
  for operation in (delete, swap, replace, insert):
    candidates.update(operation(word))
  return candidates

In [28]:
# Demo: the set of all strings one edit (delete/swap/replace/insert) from "abgaimana".
print(edit1("abgaimana"))

{'abgaimanp', 'ahgaimana', 'abgazimana', 'jbgaimana', 'abgaimanta', 'abgaimanai', 'abglaimana', 'abgaimanac', 'tabgaimana', 'abgkaimana', 'abgaicana', 'argaimana', 'abgakmana', 'sbgaimana', 'abgaipmana', 'abgaimanal', 'abgaimarna', 'agaimana', 'aogaimana', 'abgapmana', 'albgaimana', 'abgimana', 'abgaimayna', 'abgasimana', 'nabgaimana', 'acbgaimana', 'aobgaimana', 'abgaimara', 'absgaimana', 'abgaimaona', 'abegaimana', 'abgavimana', 'abggimana', 'abgaimaya', 'abgaimna', 'abgaimanz', 'dabgaimana', 'abgajimana', 'abgaiymana', 'abgsimana', 'abgaimanfa', 'abgaieana', 'abgaimaba', 'abgaamana', 'abgaimfana', 'abgaimada', 'rabgaimana', 'abgaimapa', 'abgaimanva', 'abgaimanxa', 'abgabimana', 'augaimana', 'qbgaimana', 'abgaumana', 'mbgaimana', 'abgaimanar', 'abgaiwmana', 'abgalimana', 'abgaimanwa', 'abgaimank', 'abgaixmana', 'aebgaimana', 'abgaimgana', 'abgaiemana', 'abgpimana', 'abgauimana', 'iabgaimana', 'abgafimana', 'abgaimxana', 'abgaimqna', 'abghimana', 'abgaimanq', 'abgaimant', 'ablgaimana'

**II. Level Two Edit**

Apply the level-one edit twice.



In [29]:
def edit2(word):
  """Return the set of strings reachable by applying edit1() twice.

  edit1() is applied to `word`, then again to each of its results; all
  second-level results are merged into one set.
  """
  twice_edited = set()
  for once_edited in edit1(word):
    twice_edited.update(edit1(once_edited))
  return twice_edited

In [30]:
# Demo: the set of all strings produced by applying edit1() twice to "arunika".
print(edit2("arunika"))

{'arunsija', 'arunykka', 'aruniuau', 'carumnika', 'alunikha', 'runivka', 'abrunikaa', 'xaruniuka', 'arunickai', 'aruznihka', 'daruniga', 'arunitak', 'arurnikah', 'aruncimka', 'awrunikac', 'arusniha', 'arusndika', 'arunjkqa', 'aruhikl', 'parugika', 'arcunikad', 'aruniat', 'aruniodka', 'arsnizka', 'arutiga', 'arvnikca', 'xruniyka', 'arvnikha', 'arunhiva', 'avunima', 'aruniaq', 'arunisyka', 'arunhmka', 'hrnnika', 'aurunjika', 'aruronika', 'atjrunika', 'mrueika', 'krunkia', 'arhunikra', 'arunmikpa', 'arunkirka', 'arunikkan', 'aruniikpa', 'aruuita', 'farnnika', 'arunipaw', 'aruniuar', 'darunikh', 'abuniks', 'iiunika', 'parunikas', 'cruniva', 'arunvieka', 'araunija', 'aruivka', 'ayunikh', 'fruniko', 'arunipav', 'zarunmka', 'cdarunika', 'acunikar', 'arudniku', 'arunikawv', 'arhnikia', 'arvniika', 'ahrunija', 'adcunika', 'arunsiua', 'azuvika', 'arunikeam', 'arunfiki', 'crunikas', 'akunikav', 'arpunka', 'aofrunika', 'aruniujka', 'arvunikoa', 'arunivf', 'arunizkaj', 'areunfika', 'hnarunika', 'ia

**Spelling Check Function**

IF the word exists in the vocabulary, THEN the word is already correctly spelled.

ELSE, get suggestions from edit1, edit2, or the word itself. Then filter the suggestions by the vocabulary and show the word with the highest probability.



In [31]:
def correct_spelling(word, vocabulary, word_probabilities):
  """Suggest in-vocabulary corrections for `word`.

  If `word` is already in `vocabulary`, a confirmation message is printed
  and None is returned. Otherwise candidates one edit away are tried first;
  only if none of them is in the vocabulary are candidates two edits away
  considered.

  Args:
    word: The word to check.
    vocabulary: Container of known words (supports `in`).
    word_probabilities: Mapping from each vocabulary word to its probability.

  Returns:
    None if `word` is in `vocabulary`; otherwise a list of
    (candidate, probability) tuples, empty when no candidate within two
    edits is in the vocabulary.
  """
  if word in vocabulary:
    print(f"'{word}' sudah dieja dengan benar")
    return

  # Filter by the vocabulary *before* falling back to the next edit level.
  # edit1() always returns a non-empty set, so chaining the raw sets with
  # `or` would never reach edit2().
  best_guesses = [w for w in edit1(word) if w in vocabulary]
  if not best_guesses:
    best_guesses = [w for w in edit2(word) if w in vocabulary]
  return [(w, word_probabilities[w]) for w in best_guesses]

In [32]:
def spellCheck(word):
  """Print a spelling suggestion for `word` using the notebook's corpus.

  Uses the module-level `vocabs` and `word_probas` built from the corpus.
  Prints the candidate list and the most probable candidate, or a notice
  when the word is unknown and no candidate was found. When the word is
  already in the vocabulary, correct_spelling() prints the confirmation
  and nothing more is printed here.

  Args:
    word: The word to check.
  """
  corrections = correct_spelling(word, vocabs, word_probas)

  if corrections:
    print(f"saran kosakata: {corrections}")
    # max() returns the first of equally probable candidates, which is the
    # same tie-breaking as taking the argmax over the probabilities.
    correct = max(corrections, key=lambda candidate: candidate[1])[0]
    print(f"kosakata dengan probabilitas terbesar: {correct}")
    print('')
    print(f"'{correct}' disarankan untuk '{word}'")
  elif corrections is not None:
    # Unknown word with no candidate: say so instead of printing nothing.
    print(f"tidak ada saran kosakata untuk '{word}'")

Let's try! 😸

In [44]:
# Demo: a misspelling with one letter missing.
spellCheck("masalh")

saran kosakata: [('masalah', 3.244646333549643e-05), ('masala', 3.244646333549643e-05)]
kosakata dengan probabilitas terbesar: masalah

'masalah' disarankan untuk 'masalh'


In [35]:
# Demo: a misspelling with one wrong letter.
spellCheck("atmosfir")

saran kosakata: [('atmosfer', 3.244646333549643e-05)]
kosakata dengan probabilitas terbesar: atmosfer

'atmosfer' disarankan untuk 'atmosfir'


In [37]:
# Demo: a word that is already in the vocabulary.
spellCheck("antre")

'antre' sudah dieja dengan benar
