<a href="https://colab.research.google.com/github/iued-uni-heidelberg/corpustools/blob/main/S101lemHYv202509RB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Processing pipeline for DAAD project

## Stage 1: Lemmatization (Armenian)
Example: the Universal Declaration of Human Rights (UDHR)

Link about the pdf2txt conversion: https://chatgpt.com/share/68bbeed8-0d08-800e-8418-b0816c6393d4



In [None]:
# Downloading the UDHR
# Alternative source (disabled): plain-text archive from unicode.org
# !wget https://unicode.org/udhr/assemblies/udhr_txt.zip
# Current source: the Armenian PDF from the UN (OHCHR) website; it is converted from pdf to txt afterwards
# Link about the pdf2txt conversion: https://chatgpt.com/share/68bbeed8-0d08-800e-8418-b0816c6393d4

!wget https://www.ohchr.org/sites/default/files/UDHR/Documents/UDHR_Translations/arm.pdf


In [None]:
!pip install pypdf
from pypdf import PdfReader

In [6]:
def pyPDF2TXT(SFInput, SFOutput):
    """Extract the text of every page of a PDF and save it as a UTF-8 text file.

    SFInput  -- path of the PDF file to read
    SFOutput -- path of the text file to (over)write

    The text of each page is followed by a newline. Nothing is returned; a
    confirmation message is printed once the file has been written.
    """
    pdf = PdfReader(SFInput)
    # one entry per page, each terminated by a newline
    page_texts = [page.extract_text() + "\n" for page in pdf.pages]

    with open(SFOutput, "w", encoding="utf-8") as out_file:
        out_file.write("".join(page_texts))

    print("PDF successfully converted to TXT!")

# Convert the downloaded arm.pdf into arm.txt (the file the lemmatization loop reads later on)
pyPDF2TXT("arm.pdf", "arm.txt")

In [None]:
%%bash
# Unpack the unicode.org UDHR plain-text archive into ./udhr.
# Needs udhr_txt.zip, i.e. the (currently commented-out) wget of
# https://unicode.org/udhr/assemblies/udhr_txt.zip has to be run first.
set -e                          # stop at the first failing command
mkdir -p udhr                   # -p: no error when the directory already exists (re-run)
if [ -f udhr_txt.zip ]; then    # on a re-run the archive has already been moved
    mv udhr_txt.zip udhr/
fi
cd udhr/
unzip -o udhr_txt.zip           # -o: overwrite without asking when re-run

In [None]:
%%bash
# Remove the lines that are not part of the translation from the Armenian (hy) file:
# every line numbered from $first_drop to $last_drop (both inclusive) is dropped,
# all other lines are copied unchanged.
first_drop=9
last_drop=21
awk -v lo="$first_drop" -v hi="$last_drop" 'NR < lo || NR > hi' < udhr/udhr_hye.txt >udhr/udhr_hye_v03.txt
# optional (disabled): put paragraph tags around each line
# awk '{print "<p>\n"$0 ; print "</p>"}' udhr/udhr_hye2.txt >udhr/udhr_hye_v03.txt

In [8]:
# importing python libraries
import os, re, sys

In [9]:
# installing Armenian morphological analyser
!git clone https://github.com/timarkh/uniparser-grammar-eastern-armenian
# Python classes
!pip3 install uniparser-eastern-armenian
# disambiguation
!sudo apt-get install cg3

Cloning into 'uniparser-grammar-eastern-armenian'...
remote: Enumerating objects: 181, done.[K
remote: Counting objects: 100% (40/40), done.[K
remote: Compressing objects: 100% (26/26), done.[K
remote: Total 181 (delta 12), reused 40 (delta 12), pack-reused 141 (from 1)[K
Receiving objects: 100% (181/181), 52.66 MiB | 14.39 MiB/s, done.
Resolving deltas: 100% (78/78), done.
Collecting uniparser-eastern-armenian
  Downloading uniparser_eastern_armenian-2.1.2-py3-none-any.whl.metadata (5.2 kB)
Collecting uniparser-morph>=2.2.0 (from uniparser-eastern-armenian)
  Downloading uniparser_morph-2.9.4-py3-none-any.whl.metadata (5.8 kB)
Collecting textdistance>=4.0.0 (from uniparser-morph>=2.2.0->uniparser-eastern-armenian)
  Downloading textdistance-4.6.3-py3-none-any.whl.metadata (18 kB)
Downloading uniparser_eastern_armenian-2.1.2-py3-none-any.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m36.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloadin

In [10]:
from uniparser_eastern_armenian import EasternArmenianAnalyzer
# Analyser instance shared by the cells below (the name `a` is used by later cells)
a = EasternArmenianAnalyzer()
# Analyse a single word and print the attributes of every analysis returned for it
analyses = a.analyze_words('Ձևաբանություն')
for ana in analyses:
    ana_fields = (ana.wf, ana.lemma, ana.gramm, ana.gloss,
                  ana.stem, ana.subwords, ana.wfGlossed, ana.otherData)
    print(*ana_fields)

Ձևաբանություն ձեւաբանություն N,inanim,sg,nom,nonposs morphology ձևաբանություն. [] ձևաբանություն [('trans_en', 'morphology')]


In [11]:
# trying out:
# nonexisting word
analyses2 = a.analyze_words('Ձևաբայու')
for ana2 in analyses2:
    # An empty lemma means no analysis was found: show the word form as lemma,
    # "N" as grammatical tag and "x" as gloss instead.
    found = bool(ana2.lemma)
    lemma_shown = ana2.lemma if found else ana2.wf
    gramm_shown = ana2.gramm if found else "N"
    gloss_shown = ana2.gloss if found else "x"
    print(ana2.wf, lemma_shown, gramm_shown, gloss_shown,
          ana2.stem, ana2.subwords, ana2.wfGlossed, ana2.otherData)

Ձևաբայու Ձևաբայու N x  []  []


In [12]:
# Analyse two pre-tokenised sentences and print the result for each sentence in XML format
sentences_xml = [['և'], ['Ես', 'սիրում', 'եմ', 'քեզ', ':']]
analyses = a.analyze_words(sentences_xml, format='xml')
for ana in analyses:
    print(ana)

['<w><ana lex="եւ" gr="CONJ" parts="և" gloss="and" trans_en="and, too, either"></ana>և</w>']
['<w><ana lex="ես" gr="PRON,S,hum,sg,nom" parts="ես" gloss="me" trans_en="I"></ana><ana lex="է" gr="V,intr,prs,sg,2" parts="ե-ս" gloss="be-PRS.2SG" trans_en="be"></ana>Ես</w>', '<w><ana lex="սիրել" gr="V,tr,cvb,ipfv" parts="սիր-ում" gloss="love-CVB.IPFV" trans_en="love, have a passion/an affection for, like"></ana>սիրում</w>', '<w><ana lex="է" gr="V,intr,prs,sg,1" parts="ե-մ" gloss="be-PRS.1SG" trans_en="be"></ana>եմ</w>', '<w><ana lex="դու" gr="PRON,S,hum,sg,dat" parts="քեզ" gloss="thou" trans_en="you, thou"></ana>քեզ</w>', '<w><ana lex="" gr="" parts="" gloss=""></ana>:</w>']


In [13]:
# Analyse a mixed input (a single word plus a nested list of sentences) in JSON format
mixed_input = ['Ձևաբանություն', [['և'], ['Ես', 'սիրում', 'եմ', 'քեզ', ':']]]
analyses = a.analyze_words(mixed_input, format='json')
for ana in analyses:
    print(ana)

[{'wf': 'Ձևաբանություն', 'lemma': 'ձեւաբանություն', 'gramm': ['N', 'inanim', 'sg', 'nom', 'nonposs'], 'wfGlossed': 'ձևաբանություն', 'gloss': 'morphology', 'trans_en': 'morphology'}]
[[[{'wf': 'և', 'lemma': 'եւ', 'gramm': ['CONJ'], 'wfGlossed': 'և', 'gloss': 'and', 'trans_en': 'and, too, either'}]], [[{'wf': 'Ես', 'lemma': 'ես', 'gramm': ['PRON', 'S', 'hum', 'sg', 'nom'], 'wfGlossed': 'ես', 'gloss': 'me', 'trans_en': 'I'}, {'wf': 'Ես', 'lemma': 'է', 'gramm': ['V', 'intr', 'prs', 'sg', '2'], 'wfGlossed': 'ե-ս', 'gloss': 'be-PRS.2SG', 'trans_en': 'be'}], [{'wf': 'սիրում', 'lemma': 'սիրել', 'gramm': ['V', 'tr', 'cvb', 'ipfv'], 'wfGlossed': 'սիր-ում', 'gloss': 'love-CVB.IPFV', 'trans_en': 'love, have a passion/an affection for, like'}], [{'wf': 'եմ', 'lemma': 'է', 'gramm': ['V', 'intr', 'prs', 'sg', '1'], 'wfGlossed': 'ե-մ', 'gloss': 'be-PRS.1SG', 'trans_en': 'be'}], [{'wf': 'քեզ', 'lemma': 'դու', 'gramm': ['PRON', 'S', 'hum', 'sg', 'dat'], 'wfGlossed': 'քեզ', 'gloss': 'thou', 'trans_en': '

In [14]:
# analysis with disambiguation
analyses = a.analyze_words(['Ես', 'սիրում', 'եմ', 'քեզ'], disambiguate=True)
for ana in analyses:
    # tokens that still have more than one analysis are printed with an extra indent
    tab = "  " if len(ana) > 1 else ""
    for wfo in ana:
        print(tab, wfo.wf, wfo.lemma, wfo.gramm, wfo.gloss)

   Ես ես PRON,S,hum,sg,nom me
   Ես է V,intr,prs,sg,2 be-PRS.2SG
 սիրում սիրել V,tr,cvb,ipfv love-CVB.IPFV
 եմ է V,intr,prs,sg,1 be-PRS.1SG
 քեզ դու PRON,S,hum,sg,dat thou


Str = "Սառը, վճիտ ապրիլյան օր էր, ու ժամացույցը խփում էր տասներեքը։ Չար քամուց թաքնվելու համար կզակը սեղմելով կրծքին՝ Ուինսթոն Սմիթն արագ ներս խցկվեց «Հաղթանակ» բնակելի տան ապակե շքադռնից՝ իր ետևից ներս թողնելով հատիկավոր փոշու մի ամբողջ փոթորիկ։"

StrDe = ' „Es war ein kalter, trostloser Apriltag, und die Uhr schlug dreizehn. Das Kinn an die Brust gedrückt, um sich vor dem bitteren Wind zu schützen, eilte Winston Smith durch die gläserne Veranda des Wohnhauses Victory und hinterließ einen körnigen Sturm Staub." '

StrEn = ' "It was a cold, dreary April day, and the clock struck thirteen. Tucking his chin to his chest to shield himself from the bitter wind, Winston Smith hurried through the glass porch of the Victory apartment building, leaving behind him a storm of granular dust." '

In [18]:
# Armenian sample text that the following cell splits into tokens
Str = "Սառը, վճիտ. ապրիլյան օր էր, ու ժամացույցը խփում էր տասներեքը։ Չար քամուց թաքնվելու համար կզակը սեղմելով կրծքին՝ Ուինսթոն Սմիթն արագ ներս խցկվեց «Հաղթանակ» բնակելի տան ապակե շքադռնից՝ իր ետևից ներս թողնելով հատիկավոր փոշու մի ամբողջ փոթորիկ։"

In [24]:
# Split the sample text on runs of blanks/punctuation; the capture group keeps the
# separators in the result so that punctuation marks become tokens as well.
Lst = re.split(r'([ ,\.:;\!\(\)\"\[\]՞՝«»\-\—՝։\։]+)', Str)
# Break every piece on whitespace: this drops empty pieces and guarantees that no
# token contains a blank (a separator run such as ', «' yields ',' and '«').
LstTok = []
for el in Lst:
    LstTok.extend(el.split())


In [25]:
# Show the tokens produced by the split above
print(LstTok)
# does disambiguation work? not yet...

['Սառը', ',', 'վճիտ', '.', 'ապրիլյան', 'օր', 'էր', ',', 'ու', 'ժամացույցը', 'խփում', 'էր', 'տասներեքը', '։', 'Չար', 'քամուց', 'թաքնվելու', 'համար', 'կզակը', 'սեղմելով', 'կրծքին', '՝', 'Ուինսթոն', 'Սմիթն', 'արագ', 'ներս', 'խցկվեց', '«', 'Հաղթանակ', '»', 'բնակելի', 'տան', 'ապակե', 'շքադռնից', '՝', 'իր', 'ետևից', 'ներս', 'թողնելով', 'հատիկավոր', 'փոշու', 'մի', 'ամբողջ', 'փոթորիկ', '։']


In [None]:
from gensim.models import Word2Vec # The word2vec model class
import gensim.downloader as api # Allows us to download some free training data

In [None]:
# Vahram's model for English
!wget https://heibox.uni-heidelberg.de/f/c2ba64e4ad844f3a99d4/?dl=1
!cp index.html?dl=1 WIKI_EN.model

In [None]:
# Load the word2vec model from the working directory, i.e. from where the download
# cell stored it (no hard-coded Colab-only /content/ path)
model_WIKI_EN = Word2Vec.load("WIKI_EN.model")
# the word vectors (KeyedVectors) of the model
word_vectors_WIKI_EN = model_WIKI_EN.wv

In [None]:
# Sanity check of the model: similarity scores for two word pairs.
# similarity() is called on the KeyedVectors (model.wv) because gensim 4 no longer
# provides it on the Word2Vec object itself; model.wv works with gensim 3 and 4.
distance = model_WIKI_EN.wv.similarity('obama', 'barak')
distance2 = model_WIKI_EN.wv.similarity('obama', 'zone')


In [None]:
# Print both similarity values with four decimals
for label, value in (('distance', distance), ('distance2', distance2)):
    print(f'{label} = {value:.4f}')

In [26]:
def tokenizeHy(Str2tokenise, rePattern = r'([ ,\.:;\!\(\)\"\[\]՞՝«»\-\—՝։\։]+)'):
    """Split a string into word and punctuation tokens.

    Str2tokenise -- the text to tokenise
    rePattern    -- regular expression matching separator runs; its capture group
                    makes re.split keep the separators, so punctuation is returned
                    as tokens too

    Returns a list of non-empty tokens, none of which contains whitespace: every
    piece produced by the split is additionally broken on whitespace. A separator
    run such as ', «' therefore yields ',' and '«', and tabs or non-breaking spaces
    (not covered by the default pattern) cannot end up inside a token, where they
    would corrupt tab-separated output. Punctuation marks that directly follow each
    other without a blank still form one token.
    """
    LTokens = []
    for el in re.split(rePattern, Str2tokenise):
        # str.split() without arguments returns [] for empty/blank pieces
        LTokens.extend(el.split())
    return LTokens

In [27]:
# merging together different lines of code
def disambiguate(LTok2disambiguate, Window = 4, show_variants = False,
                 analyzer = None, model = None):
    """Analyse a list of tokens and return one tab-separated line per token.

    Every output line has the form ``wordform<TAB>gramm<TAB>lemma<TAB>gloss``.
    Fields the analyser leaves empty are filled in: grammar tag 'N', the word form
    as lemma and '[unknown]' as gloss. For a token with several analyses, the
    leading run of Latin letters of each gloss (lower-cased) is compared, by
    word-vector similarity, with the glosses of all tokens in the context window;
    the analysis with the highest summed similarity is kept (the first one on ties).

    LTok2disambiguate -- list of tokens of one line/sentence
    Window            -- number of tokens taken into account on each side
    show_variants     -- if true, the chosen line is followed by all candidate
                         lines (prefix '\\t~\\t') and their scores (prefix '\\t~sc:\\t')
    analyzer          -- object with analyze_words(tokens, disambiguate=False);
                         defaults to the notebook-level analyser `a`
    model             -- gensim Word2Vec model or KeyedVectors; defaults to the
                         notebook-level `model_WIKI_EN` if that exists

    Returns the concatenated output lines as one string ('' for an empty list).
    If no model is available a warning is issued and the first analysis of every
    ambiguous token is kept.
    """
    import warnings  # local import: only needed for the missing-model notice

    if analyzer is None:
        analyzer = a  # notebook-level EasternArmenianAnalyzer instance
    if model is None:
        # the model cell may not have been run; look the name up without raising
        model = globals().get('model_WIKI_EN')
    # gensim 4 offers similarity() only on the KeyedVectors (model.wv); accept
    # both a full Word2Vec model and bare KeyedVectors
    vectors = getattr(model, 'wv', model)
    if vectors is None:
        warnings.warn('disambiguate: no word-vector model available; '
                      'the first analysis of every ambiguous token is kept')

    if LTok2disambiguate:
        analysesN = analyzer.analyze_words(LTok2disambiguate, disambiguate=False)
    else:
        analysesN = []

    # preparing data structures
    LLContext = [] # per token: reduced glosses of all its analyses (context words)
    LLText = [] # per token: output lines of all its analyses, same indices as LLContext
    for ana in analysesN:
        LwfoContext = []
        LwfoText = []
        for wfo in ana: # preserve lemmas / word forms which are not found in dictionary
            gramm = wfo.gramm or 'N'
            lemma = wfo.lemma or wfo.wf
            gloss = wfo.gloss or '[unknown]'

            LwfoText.append(f'{wfo.wf}\t{gramm}\t{lemma}\t{gloss}')

            # find the first part of the gloss, which may be in the word vectors model
            REPart = re.match('([A-Za-z]+)', gloss)
            LwfoContext.append(REPart.group(1).lower() if REPart else '[NONE]')
        LLContext.append(LwfoContext)
        LLText.append(LwfoText)

    print('line done!...\n')

    LOut = [] # output lines, joined once at the end
    for i in range(len(LLText)):
        if len(LLText[i]) > 1:
            # context window of +-Window tokens; the slice end is exclusive, hence +1.
            # The window includes position i itself, as in the original scoring.
            iwStart = max(i - Window, 0)
            iwEnd = i + Window + 1
            winContext = LLContext[iwStart:iwEnd]

            LScores = []
            for candidate in LLContext[i]:
                ScoreCand = 0
                for LCtx in winContext:
                    for Ctx in LCtx:
                        ScoreCand += _gloss_similarity(vectors, candidate, Ctx)
                LScores.append((candidate, ScoreCand))

            # index of the first candidate with the highest score
            max_index = max(range(len(LScores)), key=lambda k: LScores[k][1])
            LOut.append(LLText[i][max_index] + '\n')

            if show_variants:
                for el in LLText[i]:
                    LOut.append('\t~\t' + el + '\n')
                # scores in descending order (stable for equal scores)
                for el in sorted(LScores, key=lambda sc: sc[1], reverse=True):
                    LOut.append('\t~sc:\t' + str(el) + '\n')
        else:
            LOut.append(LLText[i][0] + '\n')

    return ''.join(LOut)


def _gloss_similarity(vectors, word1, word2):
    """Return the similarity of two words, or 0 if it cannot be computed.

    0 is returned when no vectors are available or when one of the words is not
    in the vocabulary (similarity() raising KeyError); other errors propagate.
    """
    if vectors is None:
        return 0
    try:
        return vectors.similarity(word1, word2)
    except KeyError:
        return 0

In [None]:
!mkdir udhrTT

In [28]:

# Earlier input/output pair (unicode.org text version), disabled:
# FInText = open('/content/udhr/udhr_hye_v03.txt','r')
# FOutText = open('/content/udhrTT/udhr_hye_vert.txt','w')


# arm.txt is written as UTF-8 by pyPDF2TXT, so it is read (and the result written)
# with an explicit UTF-8 encoding instead of the platform default
FInText = open('arm.txt','r', encoding='utf-8')
FOutText = open('arm.vert','w', encoding='utf-8')

In [None]:
# Tokenise and disambiguate the input line by line and write the result in vertical
# format: one token per line, followed by an empty line after every input line.
# The with-statement closes (and thereby flushes) both files, also when an error occurs.
with FInText, FOutText:
    for SLine in FInText:
        SLine = SLine.strip()
        LTok = tokenizeHy(SLine)
        SDisambig = disambiguate(LTok)
        # FOutText.write('<p>\n')
        FOutText.write(SDisambig)
        # FOutText.write('</p>\n')
        FOutText.write('\n')

In [30]:
# Earlier variant for the unicode.org text version (disabled):
# !awk -F '\t' '(NF==4){printf "%s ", $3}(NF!=4){printf "\n"}' < /content/udhrTT/udhr_hye_vert.txt >/content/udhrTT/udhr_hye_lem.txt
# For every line of arm.vert with exactly 4 tab-separated fields print the 3rd field
# (the lemma column of the lines written by disambiguate) followed by a space; any other
# line (e.g. the empty line written after each input line) ends the current output line.
!awk -F '\t' '(NF==4){printf "%s ", $3}(NF!=4){printf "\n"}' < arm.vert >arm_lem.txt