<a href="https://colab.research.google.com/github/increpare/tatoeba_toki_pona_spellcheck/blob/main/tatoeba_toki_pona_spellcheck.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [33]:
# by jan Inkepa - https://github.com/increpare/tatoeba_toki_pona_spellcheck/blob/main/tatoeba_toki_pona_spellcheck.ipynb

import bz2
import os
import shutil
import urllib.request

# Where the Tatoeba export comes from and where it is stored locally.
# DATA_DIR matches the absolute location the loading cell reads the TSV from
# (presumably Colab's default working directory). The archive is downloaded to
# the same directory it is unpacked in, so the cell no longer depends on the
# current working directory happening to be DATA_DIR.
SENTENCES_URL = 'https://downloads.tatoeba.org/exports/per_language/toki/toki_sentences_detailed.tsv.bz2'
DATA_DIR = '/content'
ARCHIVE_PATH = os.path.join(DATA_DIR, 'toki_sentences_detailed.tsv.bz2')
TSV_PATH = os.path.join(DATA_DIR, 'toki_sentences_detailed.tsv')

print("downloading toki pona sentences from tatoeba")
urllib.request.urlretrieve(SENTENCES_URL, ARCHIVE_PATH)

print("decompressing data")
# Decompress with the standard library instead of shelling out to bunzip2.
# Opening TSV_PATH in 'wb' mode truncates any stale copy left by an earlier
# run, which is what the previous `rm -rf` step was for.
with bz2.open(ARCHIVE_PATH, 'rb') as compressed, open(TSV_PATH, 'wb') as extracted:
  shutil.copyfileobj(compressed, extracted)
# bunzip2 deletes the archive after unpacking; keep that behaviour.
os.remove(ARCHIVE_PATH)
print("done")


downloading toki pona sentences from tatoeba
decompressing data
done


In [35]:
import csv

import pandas as pd

# Load the decompressed Tatoeba export.
# quoting=csv.QUOTE_NONE: the export is plain tab-separated text with no CSV
# quoting, so a '"' inside a sentence must be kept as an ordinary character.
# With pandas' default quote handling, a sentence starting with '"' has its
# quotes stripped, and an unbalanced quote swallows the following rows.
# NOTE(review): the report cells use the DataFrame index as the Tatoeba
# sentence id, so the file presumably has one more column than `names` and
# pandas takes the leading column as the index - confirm against the export.
df = pd.read_csv(
  "/content/toki_sentences_detailed.tsv",
  names=["k","v","user","added","modified"],
  delimiter="\t",
  quoting=csv.QUOTE_NONE,
)
# Accepted Toki Pona vocabulary: validate() reports any token that does not
# start with an uppercase letter and is missing from this list.
words = ["a","akesi","ala","ali","alasa","ale","anpa","ante","anu","awen","e","en","esun","ijo","ike","ilo","insa","jaki","jan","jelo","jo","kala","kalama","kama","kasi","ken","kepeken","kili","kin","kipisi","kiwen","ko","kon","kule","kulupu","kute","la","lape","laso","lawa","leko","len","lete","li","lili","linja","lipu","loje","lon","luka","lukin","lupa","ma","mama","mani","meli","mi","mije","moku","moli","monsi","monsuta","mu","mun","musi","mute","namako","nanpa","nasa","nasin","nena","ni","nimi","noka","o","oko","olin","ona","open","pakala","pali","palisa","pan","pana","pi","pilin","pimeja","pini","pipi","poka","poki","pona","pu","sama","seli","selo","seme","sewi","sijelo","sike","sin","sina","sinpin","sitelen","sona","soweli","suli","suno","supa","suwi","tan","taso","tawa","telo","tenpo","toki","tomo","tonsi","tu","unpa","uta","utala","walo","wan","waso","wawa","weka","wile"]

In [46]:
import re

def validate(w):
  """Return True when the token `w` is accepted as correctly spelled.

  A token whose first character is uppercase is always accepted (presumably
  because it is a proper name and cannot be checked against the dictionary).
  Any other token is accepted only if it appears in the module-level `words`
  list.

  `w` must be a non-empty string; its first character is inspected directly.
  """
  return w[0].isupper() or w in words

errors={}

#checks for "mi/sina" li sentences, and roughly for sentences with an "e" but without a "li"/"o"/"mi"/"sina"
def validate_sentence(s,tokens,index,user):
  """Run the sentence-level grammar heuristics and record every hit.

  Each finding is appended to the module-level `errors` dict as
  `[s, index, message]` under the key `user`. An (initially empty) list is
  created for `user` on first sight, so callers can append to
  `errors[user]` after this function has run.

  Parameters:
    s      -- the raw sentence text.
    tokens -- the word tokens of `s`, in order (may be empty).
    index  -- identifier stored with each finding (used to build the
              sentence link in the report).
    user   -- key under which findings are grouped.

  Returns None.
  """
  user_errors = errors.setdefault(user, [])

  # Sentence starts with "mi li" / "sina li". The length guard matters:
  # one-word sentences ("mi.") and punctuation-only sentences have fewer than
  # two tokens and used to raise IndexError here.
  if len(tokens)>=2 and tokens[0] in ("mi","sina") and tokens[1]=="li":
    # Backslash escaped explicitly; the emitted text is unchanged.
    user_errors.append([s,index,"mi\\sina li"])
  # Direct-object marker "e" with nothing that could introduce a predicate.
  if "li" not in tokens and "e" in tokens and "o" not in tokens:
    if "mi" not in tokens and "sina" not in tokens:
      user_errors.append([s,index,"'e' without 'li'/'mi'/'sina'/'o'"])
  # Same "mi li"/"sina li" pattern, but after a "la" (optionally "la,").
  if re.search(r"\bla,? (mi|sina) li\b",s):
    user_errors.append([s,index,"mi\\sina li"])
  # An empty sentence has no last character to inspect.
  if s and s[-1].isalnum():
    user_errors.append([s,index,"Sentence doesn't end with punctuation"])

# freq:     unknown word -> number of sentences it was reported in
# links:    unknown word -> list of DataFrame index values of those sentences
# id_table: DataFrame index value -> full row, for looking the sentence up later
freq={}
links={}
id_table={}

# Scan every sentence: run the sentence-level checks, then spell-check its
# tokens. `re` is imported once at the top of this cell, not per iteration.
for index, row in df.iterrows():
  id_table[index]=row
  user = row['user']
  sentence=row['v']
  if not isinstance(sentence, str):
    # pandas turns empty / NA-like text fields into float NaN, which
    # re.findall cannot scan; there is nothing to check in such a row.
    continue
  tokens = re.findall(r'\w+',sentence)
  # Also creates errors[user], which the spelling report below appends to.
  validate_sentence(sentence,tokens,index,user)
  for w in tokens:
    if not validate(w):
      freq[w]=freq.get(w,0)+1
      links.setdefault(w,[]).append(index)
      errors[user].append([sentence,index,"likely misspelled word: "+w])
      # Only the first unknown word of a sentence is reported.
      break


# Print one report section per user, skipping users with no findings.
# Each finding is stored as [sentence text, sentence index, message].
for u, errorlist in errors.items():
  if not errorlist:
    continue
  print("\n")
  print("==============")
  print("\n")
  print("Likely Error report for user "+u)
  print("\n")
  for text, sentence_id, reason in errorlist:
    print(""+text)
    print("\t"+str(reason))
    print("\t\thttp://tatoeba.org/eng/sentences/show/"+str(sentence_id))






Likely Error report for user kroko


sina wile ala wile wan e nanpa ni
	Sentence doesn't end with punctuation
		http://tatoeba.org/eng/sentences/show/3005286
mi pona e tomo pi pata mi.
	likely misspelled word: pata
		http://tatoeba.org/eng/sentences/show/3005615
pata mi li jo e soweli.
	likely misspelled word: pata
		http://tatoeba.org/eng/sentences/show/3938433
mi kama jo e kili Apo teka tu lon esun.
	likely misspelled word: teka
		http://tatoeba.org/eng/sentences/show/4600833




Likely Error report for user bojnin


jan lili mute li musi lon ma kasi pi ma tomo
	Sentence doesn't end with punctuation
		http://tatoeba.org/eng/sentences/show/642739
tenpo suno kama la seli li pona la mi tawa tan insa
	Sentence doesn't end with punctuation
		http://tatoeba.org/eng/sentences/show/667636
pilin pi kasi kule Lila li lon ma kasi pi poka tomo mute
	Sentence doesn't end with punctuation
		http://tatoeba.org/eng/sentences/show/740208




Likely Error report for user negativeclock


jan mute l

In [47]:
# (count, word) pairs, highest count first; ties fall back to the word itself
# because tuples compare element by element.
d_view = sorted(((count, word) for word, count in freq.items()), reverse=True)

# For each unknown word, list every sentence it was reported in with a link.
for count, word in d_view:
    print("%s: %d" % (word,count))
    for sentence_id in links[word]:
        print("\t"+id_table[sentence_id]['v'])
        print("\t\thttps://tatoeba.org/eng/sentences/show/"+str(sentence_id))

sitlen: 5
	jan ni li weka ike e sitlen lape pi mi mute la jan ni li lawa e moli tawa mi mute.
		https://tatoeba.org/eng/sentences/show/6391233
	jan Mawi li jan pali pi sitlen tawa li jo e ken.
		https://tatoeba.org/eng/sentences/show/6452080
	mi mute li pali e anu tawa ni: kama weka lon lape la ona li ken pali e sitlen lape ona tawa lon.
		https://tatoeba.org/eng/sentences/show/6482929
	tenpo sinpin la sina kepeken e ilo sitlen tawa la o pali e pali pi kama sona.
		https://tatoeba.org/eng/sentences/show/6534496
	sitelen pi nimi ma li selo tan palisa pi kasi suli la jan li ken lili lukin e sitlen ni.
		https://tatoeba.org/eng/sentences/show/6544005
sielo: 5
	sijelo pi jan Ton li suli mute la sielo pi jan Mawi li suli lili.
		https://tatoeba.org/eng/sentences/show/6843418
	moku en len sielo en tomo li suli mute tawa jan ali.
		https://tatoeba.org/eng/sentences/show/7270281
	jan Ton taso li jan pi pona sielo lon ma tomo.
		https://tatoeba.org/eng/sentences/show/8013499
	tenpo suno pini ni