<a href="https://colab.research.google.com/github/iued-uni-heidelberg/corpustools/blob/main/S101lemHYv202509RB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Processing pipeline for DAAD project

## Stage 1: Lemmatization (Armenian)
Example: Universal Declaration of Human Rights (UDHR)

In [None]:
# Downloading UDHR: fetch the zipped plain-text assembly from unicode.org into the current directory
!wget https://unicode.org/udhr/assemblies/udhr_txt.zip

In [None]:
%%bash
# Unpack the downloaded UDHR archive into ./udhr.
# The cell is safe to re-run: an existing directory is reused (-p) and files
# extracted earlier are overwritten (-o) instead of triggering a prompt.
mkdir -p udhr
# after a first run the archive already lives in udhr/, so move it only if it is still here
if [ -f udhr_txt.zip ]; then mv udhr_txt.zip udhr/; fi
# stop instead of unzipping into the wrong directory
cd udhr/ || exit 1
unzip -o udhr_txt.zip

In [None]:
%%bash
# Remove the lines of the Armenian (hye) file that are not part of the translation:
# lines number $a through $b (both inclusive) are dropped, all other lines are copied.
a=9
b=21
awk -v first=$a -v last=$b 'NR < first || NR > last' udhr/udhr_hye.txt > udhr/udhr_hye_v03.txt
# optional step (disabled): put paragraph tags around each line
# awk '{print "<p>\n"$0 ; print "</p>"}' udhr/udhr_hye2.txt >udhr/udhr_hye_v03.txt

In [None]:
# importing python libraries
import os, re, sys

In [None]:
# installing Armenian morphological analyser
!git clone https://github.com/timarkh/uniparser-grammar-eastern-armenian
# Python classes
!pip3 install uniparser-eastern-armenian
# disambiguation
!sudo apt-get install cg3

In [None]:
from uniparser_eastern_armenian import EasternArmenianAnalyzer
# analyzer instance; kept in the global `a`, which later cells and functions reuse
a = EasternArmenianAnalyzer()
# analyse a single word form and print the attributes of every analysis returned for it
analyses = a.analyze_words('Ձևաբանություն')
for ana in analyses:
    print(ana.wf, ana.lemma, ana.gramm, ana.gloss, ana.stem, ana.subwords, ana.wfGlossed, ana.otherData)

In [None]:
# trying out:
# nonexisting word
analyses2 = a.analyze_words('Ձևաբայու')
for ana2 in analyses2:
    # without a lemma, fall back to the word form as lemma, "N" as tag and "x" as gloss
    if ana2.lemma:
        head = (ana2.lemma, ana2.gramm, ana2.gloss)
    else:
        head = (ana2.wf, "N", "x")
    print(ana2.wf, *head, ana2.stem, ana2.subwords, ana2.wfGlossed, ana2.otherData)

Ձևաբայու Ձևաբայու N x  []  []


In [None]:
# nested token lists as input, analyses requested in the 'xml' output format
analyses = a.analyze_words(
    [['և'], ['Ես', 'սիրում', 'եմ', 'քեզ', ':']],
    format='xml',
)
for ana in analyses:
    print(ana)

In [None]:
# a single word mixed with nested token lists, analyses requested in the 'json' output format
analyses = a.analyze_words(
    ['Ձևաբանություն', [['և'], ['Ես', 'սիրում', 'եմ', 'քեզ', ':']]],
    format='json',
)
for ana in analyses:
    print(ana)

In [None]:
# analysis with disambiguation
analyses = a.analyze_words(['Ես', 'սիրում', 'եմ', 'քեզ'], disambiguate=True)
for ana in analyses:
    if len(ana) > 1: tab = "  "
    else: tab = ""
    for wfo in ana:
        print(tab, wfo.wf, wfo.lemma, wfo.gramm, wfo.gloss)

Str = "Սառը, վճիտ ապրիլյան օր էր, ու ժամացույցը խփում էր տասներեքը։ Չար քամուց թաքնվելու համար կզակը սեղմելով կրծքին՝ Ուինսթոն Սմիթն արագ ներս խցկվեց «Հաղթանակ» բնակելի տան ապակե շքադռնից՝ իր ետևից ներս թողնելով հատիկավոր փոշու մի ամբողջ փոթորիկ։"

StrDe = ' „Es war ein kalter, trostloser Apriltag, und die Uhr schlug dreizehn. Das Kinn an die Brust gedrückt, um sich vor dem bitteren Wind zu schützen, eilte Winston Smith durch die gläserne Veranda des Wohnhauses Victory und hinterließ einen körnigen Sturm Staub." '

StrEn = ' "It was a cold, dreary April day, and the clock struck thirteen. Tucking his chin to his chest to shield himself from the bitter wind, Winston Smith hurried through the glass porch of the Victory apartment building, leaving behind him a storm of granular dust." '

In [None]:
# Armenian sample text used to try out the tokenisation below (German and English renderings are given in the text above)
Str = "Սառը, վճիտ ապրիլյան օր էր, ու ժամացույցը խփում էր տասներեքը։ Չար քամուց թաքնվելու համար կզակը սեղմելով կրծքին՝ Ուինսթոն Սմիթն արագ ներս խցկվեց «Հաղթանակ» բնակելի տան ապակե շքադռնից՝ իր ետևից ներս թողնելով հատիկավոր փոշու մի ամբողջ փոթորիկ։"

In [None]:
# Split the sample text on runs of spaces / punctuation. The capturing group keeps the
# delimiters in the result, so punctuation survives as tokens of its own.
# Raw string: the pattern is unchanged, but '\.', '\!', '\(' ... are no longer invalid
# string escapes (a SyntaxWarning from Python 3.12 on).
Lst = re.split(r'([ ,\.:;\!\(\)"\[\]՞՝«»\-\—՝։\։]+)', Str)
# drop surrounding blanks and discard pieces that were whitespace only
LstTok = [el for el in (piece.strip() for piece in Lst) if el != '']


In [None]:
# show the token list built in the tokenisation cell
print(LstTok)
# does disambiguation work? not yet...

In [None]:
from gensim.models import Word2Vec # The word2vec model class
import gensim.downloader as api # Allows us to download some free training data

In [None]:
# Vahram's model for English
!wget https://heibox.uni-heidelberg.de/f/c2ba64e4ad844f3a99d4/?dl=1
!cp index.html?dl=1 WIKI_EN.model

In [None]:
# load the downloaded Word2Vec model (absolute path: assumes the notebook runs in /content, as on Colab)
model_WIKI_EN = Word2Vec.load("/content/WIKI_EN.model")
# the model's word vectors (its `.wv` attribute)
word_vectors_WIKI_EN = model_WIKI_EN.wv

In [None]:
# similarity of two word pairs; asked from the word vectors (`.wv`), because gensim 4
# removed the similarity() shortcut from the Word2Vec model object itself
distance = model_WIKI_EN.wv.similarity('obama', 'barak')
distance2 = model_WIKI_EN.wv.similarity('obama', 'zone')


In [None]:
# report both similarity values with four decimals
print(f'distance = {distance:.4f}')
print(f'distance2 = {distance2:.4f}')

In [None]:
def tokenizeHy(Str2tokenise, rePattern = r'([ ,\.:;\!\(\)"\[\]՞՝«»\-\—՝։\։]+)'):
    """Split a string into word and punctuation tokens.

    The string is split on every match of ``rePattern``. Because the default
    pattern is one capturing group, the matched delimiter runs are kept in the
    result; each piece is stripped of surrounding whitespace and pieces that
    end up empty are discarded.

    Note: a delimiter run is kept as ONE token, so a run with inner blanks
    (e.g. ``'), ('``) yields a single token that still contains the blank.

    Args:
        Str2tokenise: text to tokenise.
        rePattern: regular expression used for splitting. The default is the
            same pattern as before, now written as a raw string so that it no
            longer contains invalid string escapes (SyntaxWarning on 3.12+).

    Returns:
        list of str: non-empty tokens in their original order.
    """
    LTokens = []
    for el in re.split(rePattern, Str2tokenise):
        el = el.strip()
        if el != '':
            LTokens.append(el)
    return LTokens

In [None]:
# merging together different lines of code
def disambiguate(LTok2disambiguate, Window = 4, show_variants = False, analyzer = None, vectors = None):
    r"""Analyse a token list and keep one analysis per token, in vertical format.

    Every token is sent through the morphological analyzer. A token with a
    single analysis is copied as it is. For a token with several analyses each
    candidate is scored: the first alphabetic chunk of its gloss (lower-cased)
    is compared, by word-vector similarity, with the gloss chunks of all
    analyses of the neighbouring tokens, and the similarities are summed. The
    candidate with the highest sum is kept; on a tie the first one wins.

    Changes against the earlier version:
      * similarity is taken from the ``.wv`` word vectors; the shortcut on the
        Word2Vec model no longer exists in gensim 4, and the former bare
        ``except`` turned that error into a silent score of 0 for every pair;
      * only ``KeyError`` (word not in the vocabulary) counts as "no similarity";
      * the window is symmetric: ``Window`` tokens on each side (the old slice
        stopped one token short on the right);
      * the ambiguous token's own candidates are not part of its context;
      * the analysis objects returned by the analyzer are not modified.

    Args:
        LTok2disambiguate: list of token strings (e.g. one tokenised line).
        Window: number of neighbouring tokens used on each side.
        show_variants: if true, all candidate analyses ('\t~\t' lines) and the
            sorted (gloss, score) pairs ('\t~sc:\t' lines) follow the selected
            analysis.
        analyzer: object offering ``analyze_words(tokens, disambiguate=False)``;
            defaults to the notebook-level analyzer ``a``.
        vectors: object offering ``similarity(w1, w2)``, directly or through a
            ``.wv`` attribute; defaults to the notebook-level ``model_WIKI_EN``.

    Returns:
        str: one line per token, ``word form<TAB>tags<TAB>lemma<TAB>gloss``,
        each ending in a newline; an empty string for an empty token list.
    """
    # the notebook globals are looked up only when nothing was passed in
    if analyzer is None:
        analyzer = a
    if vectors is None:
        vectors = model_WIKI_EN
    # accept a full model (vectors live in `.wv`) as well as bare word vectors
    vectors = getattr(vectors, 'wv', vectors)

    if not LTok2disambiguate:
        # nothing to analyse (e.g. an empty input line)
        print('line done!...\n')
        return ''

    analysesN = analyzer.analyze_words(LTok2disambiguate, disambiguate=False)

    # Parallel lists, one entry per token:
    LLContext = [] # gloss keywords of every analysis (used for scoring)
    LLText = [] # output line of every analysis
    for ana in analysesN:
        LwfoContext = []
        LwfoText = []
        for wfo in ana:
            # keep word forms the dictionary does not know: default tag, word form as lemma
            SGramm = wfo.gramm or 'N'
            SLemma = wfo.lemma or wfo.wf
            SGloss = wfo.gloss or '[unknown]'
            LwfoText.append(f'{wfo.wf}\t{SGramm}\t{SLemma}\t{SGloss}')

            # first alphabetic part of the gloss, which may be in the word vectors model
            REPart = re.match('([A-Za-z]+)', SGloss)
            LwfoContext.append(REPart.group(1).lower() if REPart else '[NONE]')
        LLContext.append(LwfoContext)
        LLText.append(LwfoText)

    print('line done!...\n')

    LOut = []
    for i in range(len(LLText)):
        if len(LLText[i]) <= 1:
            # unambiguous token: copied as it is
            LOut.append(LLText[i][0] + '\n')
            continue

        # context window: Window tokens to the left and to the right of token i
        iwStart = max(0, i - Window)
        iwEnd = min(len(LLContext), i + Window + 1)

        LScCand = []
        for candidate in LLContext[i]:
            ScoreCand = 0
            for j in range(iwStart, iwEnd):
                if j == i:
                    continue # the candidates themselves are not context
                for Ctx in LLContext[j]:
                    try:
                        ScoreCand += vectors.similarity(candidate, Ctx)
                    except KeyError:
                        # one of the two words is not in the model: contributes nothing
                        pass
            LScCand.append(ScoreCand)

        # index of the best score; list.index returns the first one on ties
        max_index = LScCand.index(max(LScCand))
        LOut.append(LLText[i][max_index] + '\n')

        if show_variants:
            for el in LLText[i]:
                LOut.append('\t~\t' + el + '\n')
            LScores = sorted(zip(LLContext[i], LScCand), key=lambda pair: pair[1], reverse=True)
            for el in LScores:
                LOut.append('\t~sc:\t' + str(el) + '\n')

    return ''.join(LOut)

In [None]:
!mkdir udhrTT

In [None]:

# input: cleaned Armenian UDHR text; output: vertical file with one analysed token per line
# explicit UTF-8, so reading and writing the Armenian text does not depend on the platform default
FInText = open('/content/udhr/udhr_hye_v03.txt', 'r', encoding='utf-8')
FOutText = open('/content/udhrTT/udhr_hye_vert.txt', 'w', encoding='utf-8')

In [None]:
# Tokenise and analyse the input line by line. The analysed tokens of each line are
# written to the output, followed by one empty line that marks the line boundary.
# (Paragraph tags '<p>' / '</p>' around each line are a disabled option.)
try:
    for SLine in FInText:
        SLine = SLine.strip()
        LTok = tokenizeHy(SLine)
        SDisambig = disambiguate(LTok)
        FOutText.write(SDisambig)
        FOutText.write('\n')
finally:
    # close instead of only flushing: releases both handles and makes sure the
    # complete output is on disk before the next cell reads it
    FInText.close()
    FOutText.close()

In [None]:
# Build the lemmatised text from the vertical file: for every line with exactly four tab-separated
# fields print field 3 (the lemma column) followed by a space; any other line (such as the empty
# line written after each input line) produces a line break.
!awk -F '\t' '(NF==4){printf "%s ", $3}(NF!=4){printf "\n"}' < /content/udhrTT/udhr_hye_vert.txt >/content/udhrTT/udhr_hye_lem.txt