<a href="https://colab.research.google.com/github/iued-uni-heidelberg/corpustools/blob/main/S101lemHYstanzaWithDemoV202509.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Armenian lemmatization with Stanza

## downloading evaluation sets
- 420 words: a test set of about 420 words of Armenian text
- Armenian "Brown-type" corpus b

In [None]:
# download some sample Armenian sentences, or upload your own, change the
!wget https://heibox.uni-heidelberg.de/f/ce6096da570f47b99500/?dl=1
!mv index.html?dl=1 evaluation-set-v01.txt
!wget https://heibox.uni-heidelberg.de/f/a847a12bffd4491f9070/?dl=1
!mv index.html?dl=1 TED2020-dehy-hy-aa.txt

## Installing stanza

### Explanation:

It is important to downgrade to Torch 2.5, because Stanza doesn't work with later versions. See the ChatGPT explanation at: https://chatgpt.com/share/68bad9f0-fc38-800e-a17d-898e6fef0e35




In [None]:
# it is important to downgrade to Torch 2.5, because Stanza doesn't work with later versions. See the ChatGPT explanation at https://chatgpt.com/share/68bad9f0-fc38-800e-a17d-898e6fef0e35
!pip install spacy-stanza
!pip install torch==2.5.1

In [None]:
# Load the NLP libraries and report which Torch build is active
# (the install cell above pins torch==2.5.1).
import torch
import stanza
import spacy_stanza

print("Torch version:", torch.__version__)

### testing English stanza (optional)

In [None]:
# optional
# Fetch the English Stanza model (if necessary) and wrap it as a spaCy pipeline.
stanza.download("en")
nlp = spacy_stanza.load_pipeline("en")

# Analyse two sample sentences.
doc = nlp("Barack Obama was born in Hawaii. He was elected president in 2008.")

# One line per token: text, lemma, POS, dependency label, entity type.
for token in doc:
    fields = (token.text, token.lemma_, token.pos_, token.dep_, token.ent_type_)
    print(*fields)

# Named entities found in the document.
print(doc.ents)

### Downloading and testing Armenian stanza

In [None]:
# Download the Stanza model for Armenian (language code "hy").
stanza.download("hy")
# Load Armenian pipeline (make sure stanza models are downloaded first)
# nlp_hy is used by the cells below and is the default model of parseFile.
nlp_hy = spacy_stanza.load_pipeline("hy")

In [None]:
# Example Armenian text: run it through the Armenian pipeline and print
# the attributes of every token.
text = "Ես գնում եմ դպրոց։"
doc = nlp_hy(text)

for token in doc:
    print(f"\n=== Token: {token.text} ===")

    # Fixed: this banner is labelled "Index", so it shows token.i; token.idx
    # is printed as "Char offset" in the position info at the end of the loop.
    print(f"\n=== Index: {token.i} ===")

    # Core text info
    print("Text:        ", token.text)
    print("Lemma:       ", token.lemma_)
    print("POS (UPOS):  ", token.pos_)
    print("Tag (XPOS):  ", token.tag_)
    print("Morph:       ", token.morph)
    print("Dep:         ", token.dep_)
    print("Head:        ", token.head.text)

    # Entity info
    print("Ent type:    ", token.ent_type_)
    print("Ent IOB:     ", token.ent_iob_)

    # Document & sentence info
    print("Is sent start:", token.is_sent_start)
    print("Sentence:    ", token.sent.text)

    # Orthographic info
    print("Lower:       ", token.lower_)
    print("Shape:       ", token.shape_)
    # repr() makes an empty or blank trailing-whitespace string visible
    print("Whitespace:  ", repr(token.whitespace_))

    # Booleans
    print("is_alpha:    ", token.is_alpha)
    print("is_digit:    ", token.is_digit)
    print("is_punct:    ", token.is_punct)
    print("is_space:    ", token.is_space)
    print("is_stop:     ", token.is_stop)

    # Position info
    print("Index:       ", token.i)
    print("Char offset: ", token.idx)


In [None]:
### optional
# Analyse a longer Armenian sample; the resulting `doc` is printed by the next cell.
sample_text = (
    "ՄԱՐԴՈՒ ԻՐԱՎՈՒՆՔՆԵՐԻ ՀԱՄԸՆԴՀԱՆՈՒՐ ՀՌՉԱԿԱԳԻՐ. ՆԵՐԱԾԱԿԱՆ. Քանզի մարդկային ընտանիքի բոլոր անդամներին ներհատուկ արժանապատվությունըև հավասար ու անօտարելի իրավունքները աշխարհի ազատության, արդարության ու խաղաղության հիմքն են."
)
doc = nlp_hy(sample_text)

In [None]:
### optional
# One line per token of the current `doc`: text, lemma, POS, dependency label, entity type.
for token in doc:
    fields = (token.text, token.lemma_, token.pos_, token.dep_, token.ent_type_)
    print(*fields)


### full analysis of the file (optional)
- includes dependency parsing

In [None]:
### optional
# Analyse the TED2020 sample line by line and write one tab-separated row per
# token: text, lemma, POS, dependency label, lemma of the first ancestor, and
# the list of all ancestors.
# Both files are opened as UTF-8 explicitly so the Armenian output does not
# depend on the platform's default encoding.
with open('/content/TED2020-dehy-hy-aa.txt', 'r', encoding='utf-8') as infile, \
        open('/content/TED2020-dehy-hy-aa-ANALYSIS-full-v01.txt', 'w', encoding='utf-8') as outfile:
    # Header row: a plain (non-f) string, written literally to name the columns.
    outfile.write("{token.text}\t{token.lemma_}\t{token.pos_}\t{token.dep_}\t{parentLem}\t{LAncestors}\n")
    for line in infile:
        line = line.strip()
        doc = nlp_hy(line)
        for token in doc:
            LAncestors = list(token.ancestors)
            SLAncestors = str(LAncestors)
            # A token without ancestors (e.g. the sentence root) has no parent
            # lemma; checked explicitly instead of via a bare `except:`.
            parentLem = LAncestors[0].lemma_ if LAncestors else "NONE"
            outfile.write(f"{token.text}\t{token.lemma_}\t{token.pos_}\t{token.dep_}\t{parentLem}\t{SLAncestors}\n")


### function for lemmatization

In [None]:
def parseFile(iFileName, oFileName, nlp_model = nlp_hy):
    """Lemmatize a text file and write one token per row.

    Each line of the input file is stripped and passed to ``nlp_model``.
    For every resulting token a row ``text<TAB>pos<TAB>lemma`` is written
    to the output file, after a literal header row naming the columns.
    The running line count is printed every 10 input lines as progress.

    Args:
        iFileName: path of the UTF-8 text file to read.
        oFileName: path of the output file (overwritten; written as UTF-8).
        nlp_model: callable pipeline returning an iterable of tokens with
            ``text``, ``pos_`` and ``lemma_`` attributes; defaults to the
            Armenian pipeline ``nlp_hy``.

    Returns:
        None.
    """
    # The output encoding is set explicitly so that writing Armenian text does
    # not depend on the platform's default encoding.
    with open(iFileName, 'r', encoding='utf-8') as infile, \
            open(oFileName, 'w', encoding='utf-8') as outfile:
        # Header row: a plain (non-f) string, written literally to name the columns.
        outfile.write("{token.text}\t{token.pos_}\t{token.lemma_}\n")
        for c, line in enumerate(infile, start=1):
            if c % 10 == 0:
                print(str(c))
            doc = nlp_model(line.strip())
            # The ancestor/parent-lemma lookup was removed: its results were
            # never written to the output.
            for token in doc:
                outfile.write(f"{token.text}\t{token.pos_}\t{token.lemma_}\n")

    return


### command to lemmatize the file

In [None]:
# The download cell saves the corpus as TED2020-dehy-hy-aa.txt, so the input
# path needs the ".txt" extension (without it open() raises FileNotFoundError).
parseFile('/content/TED2020-dehy-hy-aa.txt', '/content/TED2020-dehy-hy-aa--lemmatization-v01.txt', nlp_hy)

In [None]:
# Lemmatize the evaluation set with the Armenian pipeline.
parseFile(
    iFileName='evaluation-set-v01.txt',
    oFileName='evaluation-set-v01.vert',
    nlp_model=nlp_hy,
)