<a href="https://colab.research.google.com/github/increpare/tatoeba_toki_pona_spellcheck/blob/main/tatoeba_toki_pona_spellcheck.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [73]:
!pip install emoji

# by jan Inkepa - https://github.com/increpare/tatoeba_toki_pona_spellcheck/blob/main/tatoeba_toki_pona_spellcheck.ipynb

import os
import urllib.request
from emoji import UNICODE_EMOJI

print("downloading toki pona sentences from tatoeba")
urllib.request.urlretrieve('https://downloads.tatoeba.org/exports/per_language/toki/toki_sentences_detailed.tsv.bz2', 'toki_sentences_detailed.tsv.bz2')

!rm -rf /content/toki_sentences_detailed.tsv

print("decompressing data")
!bunzip2 /content/toki_sentences_detailed.tsv.bz2
print("done")


downloading toki pona sentences from tatoeba
decompressing data
done


In [74]:
import csv

import pandas as pd

# Load the decompressed export.
# - quoting=csv.QUOTE_NONE: the file is plain tab-separated text, so a double
#   quote inside a sentence is ordinary text and must not start a quoted field
#   (with the default quoting a sentence beginning with '"' can swallow the
#   following columns/rows).
# - keep_default_na=False: keep cells such as "NA" or "null" as text instead of
#   converting them to NaN, which the string handling in later cells cannot process.
# NOTE(review): only five column names are given; the later report cells use the
# row index as the Tatoeba sentence id, which relies on pandas taking the file's
# leading column as the index when the file has one more column than `names` —
# confirm the export keeps that layout.
df = pd.read_csv(
    "/content/toki_sentences_detailed.tsv",
    names=["k","v","user","added","modified"],
    delimiter="\t",
    quoting=csv.QUOTE_NONE,
    keep_default_na=False,
)
# Accepted vocabulary: lower-case tokens outside this list (and not numeric) are
# reported as likely misspellings by the checking cell.
words = ["a","akesi","ala","ali","alasa","ale","anpa","ante","anu","awen","e","en","esun","ijo","ike","ilo","insa","jaki","jan","jelo","jo","kala","kalama","kama","kasi","ken","kepeken","kili","kin","kipisi","kiwen","ko","kon","kule","kulupu","kute","la","lape","laso","lawa","leko","len","lete","li","lili","linja","lipu","loje","lon","luka","lukin","lupa","ma","mama","mani","meli","mi","mije","moku","moli","monsi","monsuta","mu","mun","musi","mute","namako","nanpa","nasa","nasin","nena","ni","nimi","noka","o","oko","olin","ona","open","pakala","pali","palisa","pan","pana","pi","pilin","pimeja","pini","pipi","poka","poki","pona","pu","sama","seli","selo","seme","sewi","sijelo","sike","sin","sina","sinpin","sitelen","sona","soweli","suli","suno","supa","suwi","tan","taso","tawa","telo","tenpo","toki","tomo","tonsi","tu","unpa","uta","utala","walo","wan","waso","wawa","weka","wile"]

In [75]:
import re

def has_emoji(s):
    """Return True if any key of UNICODE_EMOJI occurs as a substring of s, else False."""
    # Stops at the first matching entry, so the remaining entries are not scanned.
    return any(symbol in s for symbol in UNICODE_EMOJI)

def validate(w):
  """Return True if token w should not be reported as a misspelling.

  A token is accepted when its first character is upper-case (no vocabulary
  lookup is done for such tokens), when it is listed in `words`, or when it
  consists only of numeric characters.
  """
  return w[0].isupper() or w in words or w.isnumeric()

# user -> list of findings; each finding is [sentence, index, message]
errors={}

#checks for "mi/sina" li sentences, and roughly for sentences with an "e" but without a "li"/"o"/"mi"/"sina"
def validate_sentence(s,tokens,index,user):
  """Run the heuristic sentence checks and record findings in errors[user].

  Parameters:
    s      -- the sentence text.
    tokens -- the word tokens of s, in order.
    index  -- identifier stored with every finding for this sentence.
    user   -- key under which findings are collected; an (initially empty)
              entry is created even when nothing is found.

  Each finding is appended as [s, index, message]. Returns None.
  """
  findings = errors.setdefault(user, [])

  # Length guard: sentences with fewer than two word tokens must not raise IndexError.
  if len(tokens) >= 2 and tokens[0] in ("mi", "sina") and tokens[1] == "li":
    findings.append([s,index,"mi\\sina li"])

  # An "e" with none of "li"/"o"/"mi"/"sina" anywhere in the sentence.
  if "e" in tokens and not any(marker in tokens for marker in ("li", "o", "mi", "sina")):
    findings.append([s,index,"'e' without 'li'/'mi'/'sina'/'o'"])

  # Same "mi/sina li" pattern, but after "la" (optionally "la,") or a full stop inside the sentence.
  if re.search(r"\b(la,?|\.) (mi|sina) li\b",s):
    findings.append([s,index,"mi\\sina li"])

  # s[-1:] is "" for an empty sentence, and "".isalnum() is False, so empty input is skipped safely.
  if s[-1:].isalnum():
    findings.append([s,index,"Sentence doesn't end with punctuation"])

  if has_emoji(s):
    findings.append([s,index,"Contains emoji"])

import re

freq={}      # flagged word -> number of times it was flagged
links={}     # flagged word -> row indices of the sentences it was flagged in
id_table={}  # row index -> the full DataFrame row

# Walk every sentence: run the sentence-level checks, then flag unknown words.
for index, row in df.iterrows():
  id_table[index]=row
  user, sentence = row['user'], row['v']
  tokens = re.findall(r'\w+',sentence)
  validate_sentence(sentence,tokens,index,user)
  for token in tokens:
    # Accepted words are skipped, as are words the sentence itself wraps in double quotes.
    if validate(token) or ('"'+token+'"') in sentence:
      continue
    freq[token]=freq.get(token,0)+1
    links.setdefault(token,[]).append(index)
    errors[user].append([sentence,index,"likely misspelled word: "+token])


# Print one report section per user that has at least one finding.
for reported_user, findings in errors.items():
  if not findings:
    continue
  # Section header; the "\n" prints add extra blank lines around the separator.
  print("\n")
  print("==============")
  print(" \n")
  print("Likely Error report for user "+reported_user)
  print("\n")
  # Each finding is [sentence, index, message]; lines end in two spaces.
  for text, sentence_id, message in findings:
    print(text+"  ")
    print("- "+str(message)+"  ")
    print("- http://tatoeba.org/eng/sentences/show/"+str(sentence_id)+"  ")
    print("")


[1;30;43mDie letzten 5000 Zeilen der Streamingausgabe wurden abgeschnitten.[0m
mi pani e ilo open lon supa.  
- likely misspelled word: pani  
- http://tatoeba.org/eng/sentences/show/9199213  

jan Ton li kama sona e toki Kanse lon tenpo mun du wan.  
- likely misspelled word: du  
- http://tatoeba.org/eng/sentences/show/9310171  



 

Likely Error report for user soweli_Elepanto


sina tawa lili taso la sina li kama moli.  
- mi\sina li  
- http://tatoeba.org/eng/sentences/show/1722700  

jan Losi li musi uta ala  
- Sentence doesn't end with punctuation  
- http://tatoeba.org/eng/sentences/show/1725107  

moku en len sielo en tomo li suli mute tawa jan ali.  
- likely misspelled word: sielo  
- http://tatoeba.org/eng/sentences/show/7270281  

jan Ton taso li jan pi pona sielo lon ma tomo.  
- likely misspelled word: sielo  
- http://tatoeba.org/eng/sentences/show/8013499  

jan Ton li lon ni la sina li toki e seme tawa ona?  
- mi\sina li  
- http://tatoeba.org/eng/sentences/show/

In [76]:
# (count, word) pairs, highest count first; equal counts fall back to the word, descending.
d_view = sorted(((count, word) for word, count in freq.items()), reverse=True)

# For every flagged word, list each sentence it appeared in together with its link.
for count, word in d_view:
    print("%s: %d" % (word, count))
    for sentence_id in links[word]:
        print("\t"+id_table[sentence_id]['v'])
        print("\t\thttps://tatoeba.org/eng/sentences/show/"+str(sentence_id))

ton: 5
	tenpo pini lila la, mi toki e ona tawa jan ton.
		https://tatoeba.org/eng/sentences/show/7866735
	jan ton li tawa musi ala.
		https://tatoeba.org/eng/sentences/show/9093391
	jan ton li jo e ilo tawa pona.
		https://tatoeba.org/eng/sentences/show/9109313
	jan ton li lawa e tomo tawa suli.
		https://tatoeba.org/eng/sentences/show/9146248
	jan ton li jo e ilo pi pona uta lon tomo pali ona. kepeken ona la ona li pona e uta ona lon pini moku.
		https://tatoeba.org/eng/sentences/show/9195993
sitlen: 5
	jan ni li weka ike e sitlen lape pi mi mute la jan ni li lawa e moli tawa mi mute.
		https://tatoeba.org/eng/sentences/show/6391233
	jan Mawi li jan pali pi sitlen tawa li jo e ken.
		https://tatoeba.org/eng/sentences/show/6452080
	mi mute li pali e anu tawa ni: kama weka lon lape la ona li ken pali e sitlen lape ona tawa lon.
		https://tatoeba.org/eng/sentences/show/6482929
	tenpo sinpin la sina kepeken e ilo sitlen tawa la o pali e pali pi kama sona.
		https://tatoeba.org/eng/sentenc