<a href="https://colab.research.google.com/github/iued-uni-heidelberg/corpustools/blob/main/S01LemmatizationEnHyV01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Morphological analysis for English and Armenian

We will create a workflow for analysing English and Armenian texts

For English we will use the TreeTagger 

For Armenian we will use the git repository with Armenian morphological analyser: 
https://github.com/timarkh/uniparser-grammar-eastern-armenian

In [4]:
# importing python libraries
import os, re, sys

## English

In [None]:
# installing TreeTagger

In [None]:
%%bash
mkdir treetagger
cd treetagger
# Download the tagger package for your system (PC-Linux, Mac OS-X, ARM64, ARMHF, ARM-Android, PPC64le-Linux).
wget https://cis.lmu.de/~schmid/tools/TreeTagger/data/tree-tagger-linux-3.2.4.tar.gz
tar -xzvf tree-tagger-linux-3.2.4.tar.gz
# Download the tagging scripts into the same directory.
wget https://cis.lmu.de/~schmid/tools/TreeTagger/data/tagger-scripts.tar.gz
gunzip tagger-scripts.tar.gz
# Download the installation script install-tagger.sh.
wget https://cis.lmu.de/~schmid/tools/TreeTagger/data/install-tagger.sh
# Download the parameter files for the languages you want to process.
# list of all files (parameter files) https://cis.lmu.de/~schmid/tools/TreeTagger/#parfiles
wget https://cis.lmu.de/~schmid/tools/TreeTagger/data/english.par.gz
sh install-tagger.sh
cd ..
sudo pip install treetaggerwrapper


In [None]:
%%bash
wget https://heibox.uni-heidelberg.de/f/95a3875771c040db959a/?dl=1
mv index.html?dl=1 humanrights02.txt

wget https://heibox.uni-heidelberg.de/f/cdf240db84ca4718b718/?dl=1
mv index.html?dl=1 en1984.txt

In [None]:
!head --lines=20 humanrights02.txt
!wc humanrights02.txt

In [None]:
!./treetagger/cmd/tree-tagger-english en1984.txt >en1984_vert.txt

In [23]:
!head --lines=20 en1984_vert.txt

head: cannot open 'en1984_vert.txt' for reading: No such file or directory


In [None]:
!./treetagger/cmd/tree-tagger-english humanrights02.txt >humanrights02_vert.txt

In [None]:
!head --lines=20 humanrights02_vert.txt

## Armenian

In [7]:
# installing Armenian morphological analyser
!git clone https://github.com/timarkh/uniparser-grammar-eastern-armenian

Cloning into 'uniparser-grammar-eastern-armenian'...
remote: Enumerating objects: 181, done.[K
remote: Counting objects: 100% (40/40), done.[K
remote: Compressing objects: 100% (26/26), done.[K
remote: Total 181 (delta 12), reused 40 (delta 12), pack-reused 141[K
Receiving objects: 100% (181/181), 52.66 MiB | 22.98 MiB/s, done.
Resolving deltas: 100% (78/78), done.


In [8]:
# Python classes
!pip3 install uniparser-eastern-armenian

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting uniparser-eastern-armenian
  Downloading uniparser_eastern_armenian-2.1.2-py3-none-any.whl (1.4 MB)
[K     |████████████████████████████████| 1.4 MB 25.5 MB/s 
Collecting uniparser-morph>=2.2.0
  Downloading uniparser_morph-2.6.4-py3-none-any.whl (57 kB)
[K     |████████████████████████████████| 57 kB 6.9 MB/s 
Installing collected packages: uniparser-morph, uniparser-eastern-armenian
Successfully installed uniparser-eastern-armenian-2.1.2 uniparser-morph-2.6.4


In [None]:
# disambiguation
!sudo apt-get install cg3

In [10]:
from uniparser_eastern_armenian import EasternArmenianAnalyzer
a = EasternArmenianAnalyzer()
analyses = a.analyze_words('Ձևաբանություն')

In [11]:
for ana in analyses:
    print(ana.wf, ana.lemma, ana.gramm, ana.gloss, ana.stem, ana.subwords, ana.wfGlossed, ana.otherData)

Ձևաբանություն ձեւաբանություն N,inanim,sg,nom,nonposs morphology ձևաբանություն. [] ձևաբանություն [('trans_en', 'morphology')]


In [None]:
# nonexisting word
analyses2 = a.analyze_words('Ձևաբայու')

In [None]:
for ana2 in analyses2:
    if ana2.lemma:
      print(ana2.wf, ana2.lemma, ana2.gramm, ana2.gloss, ana2.stem, ana2.subwords, ana2.wfGlossed, ana2.otherData)
    else:
      print(ana2.wf, ana2.wf, "N", "x", ana2.stem, ana2.subwords, ana2.wfGlossed, ana2.otherData)

In [None]:
# which fields we have in analysis:

In [None]:
dir(ana)

In [None]:
analyses = a.analyze_words([['և'], ['Ես', 'սիրում', 'եմ', 'քեզ', ':']],
                           format='xml')

In [None]:
for ana in analyses:
    print(str(ana))

['<w><ana lex="եւ" gr="CONJ" parts="և" gloss="and" trans_en="and, too, either"></ana>և</w>']
['<w><ana lex="ես" gr="PRON,S,hum,sg,nom" parts="ես" gloss="me" trans_en="I"></ana><ana lex="է" gr="V,intr,prs,sg,2" parts="ե-ս" gloss="be-PRS.2SG" trans_en="be"></ana>Ես</w>', '<w><ana lex="սիրել" gr="V,tr,cvb,ipfv" parts="սիր-ում" gloss="love-CVB.IPFV" trans_en="love, have a passion/an affection for, like"></ana>սիրում</w>', '<w><ana lex="է" gr="V,intr,prs,sg,1" parts="ե-մ" gloss="be-PRS.1SG" trans_en="be"></ana>եմ</w>', '<w><ana lex="դու" gr="PRON,S,hum,sg,dat" parts="քեզ" gloss="thou" trans_en="you, thou"></ana>քեզ</w>', '<w><ana lex="" gr="" parts="" gloss=""></ana>:</w>']


In [None]:
analyses = a.analyze_words(['Ձևաբանություն', [['և'], ['Ես', 'սիրում', 'եմ', 'քեզ', ':']]],
                           format='json')

In [None]:
for ana in analyses:
    print(str(ana))

[{'wf': 'Ձևաբանություն', 'lemma': 'ձեւաբանություն', 'gramm': ['N', 'inanim', 'sg', 'nom', 'nonposs'], 'wfGlossed': 'ձևաբանություն', 'gloss': 'morphology', 'trans_en': 'morphology'}]
[[[{'wf': 'և', 'lemma': 'եւ', 'gramm': ['CONJ'], 'wfGlossed': 'և', 'gloss': 'and', 'trans_en': 'and, too, either'}]], [[{'wf': 'Ես', 'lemma': 'ես', 'gramm': ['PRON', 'S', 'hum', 'sg', 'nom'], 'wfGlossed': 'ես', 'gloss': 'me', 'trans_en': 'I'}, {'wf': 'Ես', 'lemma': 'է', 'gramm': ['V', 'intr', 'prs', 'sg', '2'], 'wfGlossed': 'ե-ս', 'gloss': 'be-PRS.2SG', 'trans_en': 'be'}], [{'wf': 'սիրում', 'lemma': 'սիրել', 'gramm': ['V', 'tr', 'cvb', 'ipfv'], 'wfGlossed': 'սիր-ում', 'gloss': 'love-CVB.IPFV', 'trans_en': 'love, have a passion/an affection for, like'}], [{'wf': 'եմ', 'lemma': 'է', 'gramm': ['V', 'intr', 'prs', 'sg', '1'], 'wfGlossed': 'ե-մ', 'gloss': 'be-PRS.1SG', 'trans_en': 'be'}], [{'wf': 'քեզ', 'lemma': 'դու', 'gramm': ['PRON', 'S', 'hum', 'sg', 'dat'], 'wfGlossed': 'քեզ', 'gloss': 'thou', 'trans_en': '

In [None]:
# analysis with disambiguation
analyses = a.analyze_words(['Ես', 'սիրում', 'եմ', 'քեզ'], disambiguate=True)

In [None]:
for ana in analyses:
    if len(ana) > 1: tab = "  "
    else: tab = ""
    for wfo in ana:
        print(tab, wfo.wf, wfo.lemma, wfo.gramm, wfo.gloss)

   Ես է V,intr,prs,sg,2 be-PRS.2SG
   Ես ես PRON,S,hum,sg,nom me
 սիրում սիրել V,tr,cvb,ipfv love-CVB.IPFV
 եմ է V,intr,prs,sg,1 be-PRS.1SG
 քեզ դու PRON,S,hum,sg,dat thou


In [None]:
print(type(wfo))

In [None]:
dir(wfo)

Str = "Սառը, վճիտ ապրիլյան օր էր, ու ժամացույցը խփում էր տասներեքը։ Չար քամուց թաքնվելու համար կզակը սեղմելով կրծքին՝ Ուինսթոն Սմիթն արագ ներս խցկվեց «Հաղթանակ» բնակելի տան ապակե շքադռնից՝ իր ետևից ներս թողնելով հատիկավոր փոշու մի ամբողջ փոթորիկ։"

StrDe = ' „Es war ein kalter, trostloser Apriltag, und die Uhr schlug dreizehn. Das Kinn an die Brust gedrückt, um sich vor dem bitteren Wind zu schützen, eilte Winston Smith durch die gläserne Veranda des Wohnhauses Victory und hinterließ einen körnigen Sturm Staub." '

StrEn = ' "It was a cold, dreary April day, and the clock struck thirteen. Tucking his chin to his chest to shield himself from the bitter wind, Winston Smith hurried through the glass porch of the Victory apartment building, leaving behind him a storm of granular dust." '

In [14]:
Str = "Սառը, վճիտ ապրիլյան օր էր, ու ժամացույցը խփում էր տասներեքը։ Չար քամուց թաքնվելու համար կզակը սեղմելով կրծքին՝ Ուինսթոն Սմիթն արագ ներս խցկվեց «Հաղթանակ» բնակելի տան ապակե շքադռնից՝ իր ետևից ներս թողնելով հատիկավոր փոշու մի ամբողջ փոթորիկ։"

In [15]:
Lst = re.split('[ ,\.:;\!\(\)\"\[\]՞՝«»\-\—՝։\։]+', Str)

In [16]:
print(Lst)

['Սառը', 'վճիտ', 'ապրիլյան', 'օր', 'էր', 'ու', 'ժամացույցը', 'խփում', 'էր', 'տասներեքը', 'Չար', 'քամուց', 'թաքնվելու', 'համար', 'կզակը', 'սեղմելով', 'կրծքին', 'Ուինսթոն', 'Սմիթն', 'արագ', 'ներս', 'խցկվեց', 'Հաղթանակ', 'բնակելի', 'տան', 'ապակե', 'շքադռնից', 'իր', 'ետևից', 'ներս', 'թողնելով', 'հատիկավոր', 'փոշու', 'մի', 'ամբողջ', 'փոթորիկ', '']


In [17]:
Lst = re.split('([ ,\.:;\!\(\)\"\[\]՞՝«»\-\—՝։\։]+)', Str)

In [18]:
print(Lst)

['Սառը', ', ', 'վճիտ', ' ', 'ապրիլյան', ' ', 'օր', ' ', 'էր', ', ', 'ու', ' ', 'ժամացույցը', ' ', 'խփում', ' ', 'էր', ' ', 'տասներեքը', '։ ', 'Չար', ' ', 'քամուց', ' ', 'թաքնվելու', ' ', 'համար', ' ', 'կզակը', ' ', 'սեղմելով', ' ', 'կրծքին', '՝ ', 'Ուինսթոն', ' ', 'Սմիթն', ' ', 'արագ', ' ', 'ներս', ' ', 'խցկվեց', ' «', 'Հաղթանակ', '» ', 'բնակելի', ' ', 'տան', ' ', 'ապակե', ' ', 'շքադռնից', '՝ ', 'իր', ' ', 'ետևից', ' ', 'ներս', ' ', 'թողնելով', ' ', 'հատիկավոր', ' ', 'փոշու', ' ', 'մի', ' ', 'ամբողջ', ' ', 'փոթորիկ', '։', '']


In [19]:
LstTok = []
for el in Lst:
    el = el.strip()
    if el != '': LstTok.append(el)


In [20]:
print(LstTok)

['Սառը', ',', 'վճիտ', 'ապրիլյան', 'օր', 'էր', ',', 'ու', 'ժամացույցը', 'խփում', 'էր', 'տասներեքը', '։', 'Չար', 'քամուց', 'թաքնվելու', 'համար', 'կզակը', 'սեղմելով', 'կրծքին', '՝', 'Ուինսթոն', 'Սմիթն', 'արագ', 'ներս', 'խցկվեց', '«', 'Հաղթանակ', '»', 'բնակելի', 'տան', 'ապակե', 'շքադռնից', '՝', 'իր', 'ետևից', 'ներս', 'թողնելով', 'հատիկավոր', 'փոշու', 'մի', 'ամբողջ', 'փոթորիկ', '։']


Does disambiguation work? Checking...

In [21]:
analysesD = a.analyze_words(LstTok, disambiguate=True)
analysesN = a.analyze_words(LstTok, disambiguate=False)

Showing ambiguous words (with tilde)

In [None]:
FD = open('FD.txt', 'w')
for ana in analysesD:
    if len(ana) > 1: tab = "~"
    else: tab = "!"
    for wfo in ana:
        if wfo.gramm == '': wfo.gramm = 'N'
        if wfo.lemma == '': wfo.lemma = wfo.wf
        if wfo.gloss == '': wfo.gloss = '[unknown]'
        SWfo = f'{tab}\t{wfo.wf}\t{wfo.gramm}\t{wfo.lemma}\t{wfo.gloss}'
        print(SWfo)
        FD.write(SWfo + '\n')

In [None]:
FNoD = open('FNoD.txt', 'w')
for ana in analysesN:
    if len(ana) > 1: tab = "~"
    else: tab = "!"
    for wfo in ana:
        if wfo.gramm == '': wfo.gramm = 'N'
        if wfo.lemma == '': wfo.lemma = wfo.wf
        if wfo.gloss == '': wfo.gloss = '[unknown]'
        SWfo = f'{tab}\t{wfo.wf}\t{wfo.gramm}\t{wfo.lemma}\t{wfo.gloss}'
        print(SWfo)
        FNoD.write(SWfo + '\n')

In [None]:
!diff FD.txt FNoD.txt

... did disambiguation work?

Translation-based disambiguation
Downloading word vector model

In [12]:
from gensim.models import Word2Vec # The word2vec model class
import gensim.downloader as api # Allows us to download some free training data

In [None]:
# Vahram's model for English
!wget https://heibox.uni-heidelberg.de/f/c2ba64e4ad844f3a99d4/?dl=1
!cp index.html?dl=1 WIKI_EN.model

In [14]:
model_WIKI_EN = Word2Vec.load("/content/WIKI_EN.model")
word_vectors_WIKI_EN = model_WIKI_EN.wv

In [15]:
distance = model_WIKI_EN.similarity('obama', 'barak')
distance2 = model_WIKI_EN.similarity('obama', 'zone')

  """Entry point for launching an IPython kernel.
  


In [None]:
print('distance = %.4f' % distance)
print('distance2 = %.4f' % distance2)

Implementing disambiguation
4-word window

In [None]:
# preparing data structures
LLContext = [] # empty list of contexts, indices are the same as with the Text list
LLText = [] # test to disambiguate, ambiguous interpretations are double entries
for ana in analysesN:
    # creating context window from glosses
    # if len(ana) > 1: tab = "~"
    # else: tab = "!"
    LwfoContext = []
    LwfoText = []
    for wfo in ana:
        if wfo.gramm == '': wfo.gramm = 'N'
        if wfo.lemma == '': wfo.lemma = wfo.wf
        if wfo.gloss == '': wfo.gloss = '[unknown]'

        SWfo = f'{wfo.wf}\t{wfo.gramm}\t{wfo.lemma}\t{wfo.gloss}'
        # print(SWfo)
        # FNoD.write(SWfo + '\n')

        # find the first part of the gloss, which may be in the word vectors model
        REPart = re.match('([A-Za-z]+)', wfo.gloss)
        if REPart: 
            SGlossMin = REPart.group(1)
            SGlossMin = SGlossMin.lower()
            # print(SGlossMin)
        else:
            SGlossMin = '[NONE]'
        LwfoContext.append(SGlossMin)
        LwfoText.append(SWfo)
    LLContext.append(LwfoContext)
    LLText.append(LwfoText)

for el in LLContext: print(el)
for el in LLText: print(el)

print(len(LLContext))
print(len(LLText))

print('done!...\n')

In [None]:
for i in range(len(LLText)):
    if len(LLText[i]) > 1: 
        print(LLText[i])
        print(LLContext[i])

In [None]:
# disambiguation algorithm
FDisambiguate = open('FDisambiguate.txt', 'w')
for i in range(len(LLText)):
    if len(LLText[i]) > 1: 
        # print(LLText[i])
        # print(LLContext[i])
        # collect context window +- 3 words
        iwStart = i-4
        if iwStart <0: iwStart=0
        iwEnd = i+4
        if iwEnd > len(LLText): iwEnd = len(LLText)
        # iwLen = iwEnd - iwStart
        winContext = LLContext[iwStart:iwEnd]
        print(winContext)
        LScores = []
        LScCand = []

        for candidate in LLContext[i]:
            ScoreCand = 0
            for LCtx in winContext:
                for Ctx in LCtx:
                    try: distance = model_WIKI_EN.similarity(candidate, Ctx)
                    except: distance = 0
                    ScoreCand += distance
            LScores.append((candidate,ScoreCand))
            LScCand.append(ScoreCand)
        LScores.sort(key=lambda a: a[1], reverse=True)
        print(LScores)

        max_value = max(LScCand)
        #  Return the max value of the list
        max_index = LScCand.index(max_value)
        FDisambiguate.write(LLText[i][max_index] + '\n')
        for el in LLText[i]:
            FDisambiguate.write('\t~\t' + el + '\n')
        for el in LScores:
            FDisambiguate.write('\t~sc:\t' + str(el) + '\n')
    else:
        FDisambiguate.write(LLText[i][0] + '\n')

    
FDisambiguate.flush()
        


In [5]:
def tokenizeHy(Str2tokenise, rePattern = '([ ,\.:;\!\(\)\"\[\]՞՝«»\-\—՝։\։]+)'):
    LTokens = []
    Lst = re.split(rePattern, Str2tokenise)
    # LstTok = []
    for el in Lst:
        el = el.strip()
        if el != '': LTokens.append(el)
    return LTokens

In [6]:
Str2tokenise = "Սառը, վճիտ ապրիլյան օր էր, ու ժամացույցը խփում էր տասներեքը։ Չար քամուց թաքնվելու համար կզակը սեղմելով կրծքին՝ Ուինսթոն Սմիթն արագ ներս խցկվեց «Հաղթանակ» բնակելի տան ապակե շքադռնից՝ իր ետևից ներս թողնելով հատիկավոր փոշու մի ամբողջ փոթորիկ։"
LTok = tokenizeHy(Str2tokenise) 
print(LTok)

['Սառը', ',', 'վճիտ', 'ապրիլյան', 'օր', 'էր', ',', 'ու', 'ժամացույցը', 'խփում', 'էր', 'տասներեքը', '։', 'Չար', 'քամուց', 'թաքնվելու', 'համար', 'կզակը', 'սեղմելով', 'կրծքին', '՝', 'Ուինսթոն', 'Սմիթն', 'արագ', 'ներս', 'խցկվեց', '«', 'Հաղթանակ', '»', 'բնակելի', 'տան', 'ապակե', 'շքադռնից', '՝', 'իր', 'ետևից', 'ներս', 'թողնելով', 'հատիկավոր', 'փոշու', 'մի', 'ամբողջ', 'փոթորիկ', '։']


In [None]:
# this should be run for disambiguate() function to work, run if not used before

from uniparser_eastern_armenian import EasternArmenianAnalyzer
a = EasternArmenianAnalyzer()

In [26]:
# merging together different lines of code
def disambiguate(LTok2disambiguate, Window = 4, show_variants = True):
    SDisambig = ''

    analysesN = a.analyze_words(LTok2disambiguate, disambiguate=False)

    # preparing data structures
    LLContext = [] # empty list of contexts, indices are the same as with the Text list
    LLText = [] # test to disambiguate, ambiguous interpretations are double entries
    for ana in analysesN:
        # creating context window from glosses
        # if len(ana) > 1: tab = "~"
        # else: tab = "!"
        LwfoContext = []
        LwfoText = []
        for wfo in ana: # preserve lemmas / word forms which are not found in dictionary
            if wfo.gramm == '': wfo.gramm = 'N'
            if wfo.lemma == '': wfo.lemma = wfo.wf
            if wfo.gloss == '': wfo.gloss = '[unknown]'

            SWfo = f'{wfo.wf}\t{wfo.gramm}\t{wfo.lemma}\t{wfo.gloss}'
            # print(SWfo)
            # FNoD.write(SWfo + '\n')

            # find the first part of the gloss, which may be in the word vectors model
            REPart = re.match('([A-Za-z]+)', wfo.gloss)
            if REPart: 
                SGlossMin = REPart.group(1)
                SGlossMin = SGlossMin.lower()
                # print(SGlossMin)
            else:
                SGlossMin = '[NONE]'
            LwfoContext.append(SGlossMin)
            LwfoText.append(SWfo)
        LLContext.append(LwfoContext)
        LLText.append(LwfoText)

    # for el in LLContext: print(el)
    # for el in LLText: print(el)

    # print(len(LLContext))
    # print(len(LLText))

    print('line done!...\n')


    for i in range(len(LLText)):
        if len(LLText[i]) > 1: 
            # print(LLText[i])
            # print(LLContext[i])
            # collect context window +- 3 words
            iwStart = i-Window
            if iwStart <0: iwStart=0
            iwEnd = i+Window
            if iwEnd > len(LLText): iwEnd = len(LLText)
            # iwLen = iwEnd - iwStart
            winContext = LLContext[iwStart:iwEnd]
            # print(winContext)
            LScores = []
            LScCand = []

            for candidate in LLContext[i]:
                ScoreCand = 0
                for LCtx in winContext:
                    for Ctx in LCtx:
                        try: distance = model_WIKI_EN.similarity(candidate, Ctx)
                        except: distance = 0
                        ScoreCand += distance
                LScores.append((candidate,ScoreCand))
                LScCand.append(ScoreCand)
            LScores.sort(key=lambda a: a[1], reverse=True)
            # print(LScores)

            max_value = max(LScCand)
            #  Return the max value of the list
            max_index = LScCand.index(max_value)
            StoWrite = LLText[i][max_index] + '\n'
            SDisambig += StoWrite

            if show_variants == True:
                for el in LLText[i]:
                    StoWrite = '\t~\t' + el + '\n'
                    SDisambig += StoWrite
                for el in LScores:
                    StoWrite = '\t~sc:\t' + str(el) + '\n'
                    SDisambig += StoWrite
        else:
            StoWrite = LLText[i][0] + '\n'
            SDisambig += StoWrite

    return SDisambig

In [None]:
!wget https://heibox.uni-heidelberg.de/f/e0bfae444a5a4c76957b/?dl=1
!mv index.html?dl=1 hy1984.txt
FInText = open('hy1984.txt','r')
FOutText = open('hy1984_vert2.txt','w')

In [27]:
for SLine in FInText:
    SLine = SLine.strip()
    LTok = tokenizeHy(SLine)
    SDisambig = disambiguate(LTok)
    FOutText.write('<p>\n')
    FOutText.write(SDisambig)
    FOutText.write('</p>\n')

FOutText.flush()

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...





line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!...

line done!

In [None]:
# downloading and analysing texts

In [None]:
!wget https://heibox.uni-heidelberg.de/f/e0bfae444a5a4c76957b/?dl=1
!mv index.html?dl=1 hy1984.txt

In [None]:
FInText = open('hy1984.txt','r')
FOutText = open('hy1984_vert.txt','w')

In [None]:
for SLine in FInText:
    SLine = SLine.strip()
    ListOfWords = re.split('[ ,\.:;\!\(\)\"\[\]՞՝«»\-\—՝։\։]+', SLine) # tokenize: split on white spaces and punctuation
    # if len(ListOfWords) > 0: FOutText.write(str(ListOfWords) + '\n')
    analyses = a.analyze_words(ListOfWords, disambiguate=False)
    FOutText.write('<p>\n')
    for ana in analyses:
        # for wfo in ana:
        # how to type all variants + disambiguate ?
        for wfo in ana:
          # wfo = ana[0]
          FOutText.write(wfo.wf + '\t' + wfo.gramm + '\t' + wfo.lemma + '\t' + wfo.gloss + '\n')
          #    FOutText.write(wfo.wf + '\t' + wfo.gramm + '\t' + wfo.lemma + '\t' + wfo.gloss + '\n')
    FOutText.write('</p>\n')
FOutText.flush()