In [1]:
# py 3.5 & estnltk 1.4
from estnltk import synthesize
import pandas

In [2]:
# define cases to be generated
# Each entry is a (code, label) pair. The code is appended to 'sg ' / 'pl ' to
# build the form string handed to synthesize() in synthesize_all(); the label
# (apparently the Estonian name of the case) is not used by the code visible here.
cases = [
    ('n', 'nimetav'),
    ('g', 'omastav'),
    ('p', 'osastav'),
    ('ill', 'sisseütlev'),
    ('in', 'seesütlev'),
    ('el', 'seestütlev'),
    ('all', 'alaleütlev'),
    ('ad', 'alalütlev'),
    ('abl', 'alaltütlev'),
    ('tr', 'saav'),
    ('ter', 'rajav'),
    ('es', 'olev'),
    ('ab', 'ilmaütlev'),
    ('kom', 'kaasaütlev')]

In [3]:
# define verbforms to be generated
# Each string is passed unchanged as the second argument of synthesize() for
# every verb in verb_list. Many codes are repeated (e.g. 'ks', 'neg o', 'vat');
# the repeats only cause repeated synthesize() calls, because the resulting
# verb forms are de-duplicated before they are used.
# NOTE(review): the repeats presumably come from a paradigm table with one row
# per person/number -- confirm before pruning the list.
verb_forms = ['b', 'd', 'da', 'des', 'ge', 'gem', 'gu', 'gu', 'ks', 'ks', 'ks', 'ks', 'ks', 'ks', 'ksid', 'ksid', 'ksime', 'ksin', 'ksite', 'ma', 'maks', 'mas', 'mast', 'mata', 'me', 'n', 'neg', 'neg ge', 'neg gem', 'neg gu', 'neg gu', 'neg gu', 'neg ks', 'neg ks', 'neg ks', 'neg ks', 'neg ks', 'neg ks', 'neg me', 'neg nud', 'neg nud', 'neg nud', 'neg nud', 'neg nud', 'neg nud', 'neg nuks', 'neg nuks', 'neg nuks', 'neg nuks', 'neg nuks', 'neg nuks', 'neg o', 'neg o', 'neg o', 'neg o', 'neg o', 'neg o', 'neg o', 'neg vat', 'neg vat', 'neg tud', 'neg vat', 'neg vat', 'neg vat', 'neg vat', 'nud', 'nuks', 'nuks', 'nuks', 'nuks', 'nuks', 'nuks', 'nuksid', 'nuksid', 'nuksime', 'nuksin', 'nuksite', 'nuvat', 'nuvat', 'nuvat', 'nuvat', 'nuvat', 'nuvat', 'o', 's', 'sid', 'sid', 'sime', 'sin', 'site', 'ta', 'tagu', 'taks', 'takse', 'tama', 'tav', 'tavat', 'te', 'ti', 'tud', 'tuks', 'tuvat', 'v', 'vad', 'vat', 'vat', 'vat', 'vat', 'vat', 'vat']

In [4]:
# function that generates noun forms and returns the table
def synthesize_all(word):
    """Return a table of the singular and plural forms of ``word`` per case.

    For every (code, label) pair in the module-level ``cases`` list,
    ``synthesize`` is called with the form strings ``'sg <code>'`` and
    ``'pl <code>'``; alternative forms returned for one request are joined
    with ``', '`` into a single cell.

    Returns a pandas DataFrame with the columns ``case``, ``singular`` and
    ``plural`` (in that order) and one row per entry of ``cases``.
    """
    table_rows = [
        (
            code,
            ', '.join(synthesize(word, 'sg ' + code)),
            ', '.join(synthesize(word, 'pl ' + code)),
        )
        for code, _label in cases  # the label is not needed for synthesis
    ]
    return pandas.DataFrame(table_rows, columns=['case', 'singular', 'plural'])

In [5]:
# use the list of nouns to synthesize forms
def synthesize_list(wordList):
    """Synthesize the case tables of several words and stack them vertically.

    Parameters
    ----------
    wordList : iterable of str
        Words to run through ``synthesize_all``.

    Returns
    -------
    pandas.DataFrame
        The per-word tables concatenated along the row axis, with the columns
        ``case``, ``singular`` and ``plural``. Each word's rows keep their own
        index, so index values repeat across words. An empty ``wordList``
        gives an empty frame with the same three columns.
    """
    all_forms = [synthesize_all(w) for w in wordList]
    if not all_forms:
        # pandas.concat raises ValueError when given no objects, which would
        # abort the notebook on an empty input file; return an empty table.
        return pandas.DataFrame(columns=['case', 'singular', 'plural'])
    return pandas.concat(all_forms, axis=0)

In [16]:
# use these verbs to synthesize forms
# Every verb listed here is combined with every code in verb_forms when the
# verb forms are generated.
# NOTE(review): the lemma stopword list is built from to_be_inflected_verbs.csv
# instead of this literal -- confirm that the two stay in sync.
verb_list = ['hakkama', 'minema', 'olema', 'pidama', 'saama', 'tegema', 'tulema', 'võima']

In [41]:
# read in a list of nouns to be synthesized from the file
# One entry per line, with surrounding whitespace stripped. The with-block
# closes the file handle as soon as the lines are read, instead of leaving it
# open for the garbage collector.
# NOTE(review): the file is read with the platform default encoding -- confirm
# this matches how the CSV was saved.
with open("to_be_inflected_subst.csv", 'r') as subst_file:
    substList = [line.strip() for line in subst_file]

In [42]:
# generate noun forms
# Builds one table (columns: case, singular, plural) covering every noun in
# substList.
# NOTE(review): the variable name is a misspelling of "substantives"; it is
# kept because later cells refer to it by this name.
synthesized_substatnives = synthesize_list(substList)

In [43]:
# preview the first rows of the generated noun-form table
synthesized_substatnives.head()

Unnamed: 0,case,singular,plural
0,n,eelmine,eelmised
1,g,eelmise,eelmiste
2,p,eelmist,"eelmisi, eelmiseid"
3,ill,eelmisesse,"eelmisisse, eelmistesse, eelmiseisse"
4,in,eelmises,"eelmisis, eelmistes, eelmiseis"


In [44]:
# generate verb forms
# One entry per (form code, verb) pair, ordered by form code first; alternative
# forms returned for a single request are joined with ',' into one string.
stopverbs = [
    ','.join(synthesize(verb, form_code))
    for form_code in verb_forms
    for verb in verb_list
]

In [45]:
# list of singular noun forms
# Takes the 'singular' column in row order; a cell may still hold several
# comma-separated alternatives.
singulars = synthesized_substatnives['singular'].tolist()

In [46]:
# list of plural noun forms
# Takes the 'plural' column in row order; a cell may still hold several
# comma-separated alternatives. The values go into `plurals` itself: appending
# them to `singulars` left `plurals` empty and grew `singulars` again on every
# re-run of the cell.
plurals = synthesized_substatnives['plural'].tolist()

In [47]:
# start the stopword list with the individual singular forms
stopwords = []
for entry in singulars:
    # a cell with several alternatives is split on ',' and each part trimmed;
    # a cell without a comma is taken over exactly as it is
    forms = [part.strip() for part in entry.split(',')] if "," in entry else [entry]
    stopwords += forms

In [48]:
# add the individual plural forms to the stopword list
for entry in plurals:
    # a cell with several alternatives is split on ',' and each part trimmed;
    # a cell without a comma is taken over exactly as it is
    forms = [part.strip() for part in entry.split(',')] if "," in entry else [entry]
    stopwords += forms

In [49]:
# Flatten the comma-joined synthesis results into individual, unique verb forms.
unique_verb_forms = set()
for joined_forms in stopverbs:
    # split(',') on a string without a comma yields that string itself, so the
    # single-form and multi-form cases need no separate branches
    unique_verb_forms.update(part.strip() for part in joined_forms.split(','))
# A (verb, form) pair for which nothing was synthesized contributes an empty
# string; drop it so no blank entry ends up in the verb list or its output file.
unique_verb_forms.discard('')
# Sorting makes the order identical on every run; the iteration order of a set
# of strings can change between interpreter sessions.
verbs = sorted(unique_verb_forms)

In [50]:
# save the verb forms, one per line (no newline after the last entry)
with open('generated_verb_forms.csv','w') as verb_file:
    verb_text = '\n'.join(verbs)
    verb_file.write(verb_text)

In [51]:
# noun forms to be removed as generated incorrectly
# Every string listed here is filtered out of the noun-derived stopwords before
# the verb forms and the extra forms are merged in. A few entries appear twice
# (e.g. 'kee', 'mittekee', 'mittekeed'); the repeats do not change the result
# of the membership test.
to_be_removed = ['ea', 'ead', 'easse', 'eale', 'ealt', 'eaks', 'eani', 'eana', 'eata', 'eaga', 'igade', 'igasid',
                 'igadesse', 'igades', 'igadest', 'igadele', 'igadel', 'igadelt', 'igadeks', 'igadeni', 'igadena',
                 'igadeta', 'igadega', 'kee', 'kee', 'keesse', 'kees', 'keest', 'keele', 'keel', 'keelt', 'keeks',
                 'keeni', 'keena', 'keeta', 'keega', 'keed', 'keede', 'keesid', 'keedesse', 'keedes', 'keedest',
                 'keedele', 'keedel', 'keedelt', 'keedeks', 'keedeni', 'keedena', 'keedeta', 'keedega', 'kõigud',
                 'kõigu', 'kõikude', 'kõiku', 'kõikisid', 'kõikusid', 'kõigusse', 'kõikudesse', 'kõigus', 'kõikudes',
                 'kõigust', 'kõikudest', 'kõigule', 'kõikudele', 'kõigul', 'kõikudel', 'kõigult', 'kõikudelt', 'kõiguks',
                 'kõikudeks', 'kõiguni', 'kõikudeni', 'kõiguna', 'kõikudena', 'kõiguta', 'kõikudeta', 'kõiguga', 'kõikudega',
                 'mittekee', 'mittekeed', 'mittekee', 'mittekeede', 'mittekeed', 'mittekeesid', 'mittekeesse', 'mittekeedesse',
                 'mittekees', 'mittekeedes', 'mittekeest', 'mittekeedest', 'mittekeele', 'mittekeedele', 'mittekeel',
                 'mittekeedel', 'mittekeelt', 'mittekeedelt', 'mittekeeks', 'mittekeedeks', 'mittekeeni', 'mittekeedeni',
                 'mittekeena', 'mittekeedena', 'mittekeeta', 'mittekeedeta', 'mittekeega', 'mittekeedega']

In [53]:
# additional forms to be added to the stopwords because they are not generated
# Each entry is appended to the stopword list unless it is already present.
additional = ['ärge', 'ärgem', 'ärgu', 'ärme', 'ära',
              'mittekeegi', 'mittekellegi', 'mittekellessegi', 'mittekelleski',
              'mittekellestki', 'mittekellelegi', 'mittekellelgi', 'mittekelleltki', 'mittekellekski', 'mittekellenigi',
              'mittekellenagi', 'mittekelletagi', 'mittekellegagi']

In [54]:
# drop the noun forms listed in to_be_removed
# The set gives constant-time membership tests instead of scanning the whole
# list once per stopword; the surviving entries and their order are unchanged.
removed_forms = set(to_be_removed)
stopwords = [x for x in stopwords if x not in removed_forms]

In [55]:
# append every verb form that is not yet among the stopwords
# `already_listed` mirrors the contents of `stopwords` so each check is a set
# lookup instead of a scan of the growing list; the appended items and their
# order are the same as with a direct `v not in stopwords` test.
already_listed = set(stopwords)
for v in verbs:
    if v not in already_listed:
        already_listed.add(v)
        stopwords.append(v)

In [56]:
# append every hand-listed extra form that is not yet among the stopwords
# `already_listed` mirrors the contents of `stopwords` so each check is a set
# lookup instead of a scan of the growing list; the appended items and their
# order are the same as with a direct `a not in stopwords` test.
already_listed = set(stopwords)
for a in additional:
    if a not in already_listed:
        already_listed.add(a)
        stopwords.append(a)

In [57]:
def file_to_list(file):
    """Read a text file and return its lines as a list of stripped strings.

    Parameters
    ----------
    file : str
        Path of the file to read; it is opened in text mode with the platform
        default encoding.

    Returns
    -------
    list of str
        One entry per line, with leading and trailing whitespace (including
        the line break) removed. Blank lines are kept as empty strings.
    """
    # The context manager closes the handle as soon as the lines are read;
    # opening the file inside the comprehension left it open until the
    # garbage collector happened to reclaim it.
    with open(file, 'r') as handle:
        return [line.strip() for line in handle]
        

In [58]:
# put the entries of the four word-list files, in this file order, in front of
# the generated forms
stopwords = [
    entry
    for source in ("adps.csv", "advs-etc.csv", "konj.csv", "interj.csv")
    for entry in file_to_list(source)
] + stopwords

In [59]:
# remove duplicates; sorting keeps the list (and the file written from it) in
# the same order on every run, whereas the iteration order of a set of strings
# can change between interpreter sessions
stopwords = sorted(set(stopwords))

In [60]:
# discard empty entries (e.g. from blank lines in the input files)
stopwords = [entry for entry in stopwords if not entry == '']

In [61]:
# lemma-level stopword list: the raw lines of all six input files, in this
# file order, without de-duplication or removal of empty entries
lemma_stopwords = [
    lemma
    for source in ("adps.csv", "advs-etc.csv", "konj.csv", "interj.csv",
                   "to_be_inflected_subst.csv", "to_be_inflected_verbs.csv")
    for lemma in file_to_list(source)
]

In [62]:
# number of entries in the lemma list (duplicates and blank lines included)
len(lemma_stopwords)

1605

In [63]:
# save the inflected-form stopwords, one per line (no newline after the last entry)
with open('estonian-stopwords.txt','w') as stopword_file:
    stopword_text = '\n'.join(stopwords)
    stopword_file.write(stopword_text)

In [64]:
# save the lemma stopwords, one per line (no newline after the last entry)
with open('estonian-stopwords-lemmas.txt','w') as lemma_file:
    lemma_text = '\n'.join(lemma_stopwords)
    lemma_file.write(lemma_text)