<a href="https://colab.research.google.com/github/iued-uni-heidelberg/DAAD-Training-2021/blob/main/compLingProject101MorphologicalAnalysisV01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Morphological analysis for English and Armenian

We will create a workflow for analysing English and Armenian texts.

For English we will use the TreeTagger.

For Armenian we will use the Armenian morphological analyser from the following git repository:
https://github.com/timarkh/uniparser-grammar-eastern-armenian

In [1]:
# importing python libraries
import os, re, sys

## English

In [None]:
# installing TreeTagger

In [None]:
%%bash
# Install TreeTagger (Linux build) with the English parameter file into ./treetagger,
# then install the Python wrapper package.

# -p: do not fail when the directory already exists, so the cell can be re-run.
mkdir -p treetagger
# Abort if the directory cannot be entered; otherwise everything below would be
# downloaded into the wrong place and the final `cd ..` would leave the working directory.
cd treetagger || exit 1
# Download the tagger package for your system (PC-Linux, Mac OS-X, ARM64, ARMHF, ARM-Android, PPC64le-Linux).
wget https://cis.lmu.de/~schmid/tools/TreeTagger/data/tree-tagger-linux-3.2.4.tar.gz
tar -xzvf tree-tagger-linux-3.2.4.tar.gz
# Download the tagging scripts into the same directory.
wget https://cis.lmu.de/~schmid/tools/TreeTagger/data/tagger-scripts.tar.gz
# -f: overwrite tagger-scripts.tar left over from a previous run instead of failing.
# NOTE(review): confirm that install-tagger.sh accepts the already-gunzipped
# tagger-scripts.tar rather than expecting the .tar.gz archive.
gunzip -f tagger-scripts.tar.gz
# Download the installation script install-tagger.sh.
wget https://cis.lmu.de/~schmid/tools/TreeTagger/data/install-tagger.sh
# Download the parameter files for the languages you want to process.
# list of all files (parameter files) https://cis.lmu.de/~schmid/tools/TreeTagger/#parfiles
wget https://cis.lmu.de/~schmid/tools/TreeTagger/data/english.par.gz
sh install-tagger.sh
cd ..
# No sudo: it is not needed to install a Python package and is unavailable in many
# notebook environments.
pip install treetaggerwrapper


In [None]:
%%bash
# Download the sample texts used in this notebook.
# -O writes each download straight to its final file name instead of relying on
# wget's default name (index.html?dl=1) followed by an `mv` whose unquoted '?' is a
# shell glob character. The URLs are quoted for the same reason.
wget -O humanrights_hy.txt "https://heibox.uni-heidelberg.de/f/c888756380ba4f42b210/?dl=1"

wget -O humanrights02.txt "https://heibox.uni-heidelberg.de/f/95a3875771c040db959a/?dl=1"

wget -O en1984.txt "https://heibox.uni-heidelberg.de/f/cdf240db84ca4718b718/?dl=1"

In [None]:
# Sanity check of the download: show the first 20 lines of humanrights_hy.txt,
# then its line / word / byte counts.
!head -n 20 humanrights_hy.txt
!wc humanrights_hy.txt

In [None]:
# Sanity check of the download: show the first 20 lines of humanrights02.txt,
# then its line / word / byte counts.
!head -n 20 humanrights02.txt
!wc humanrights02.txt

In [None]:
# Run the TreeTagger English tagging script on en1984.txt; its standard output
# is redirected into en1984_vert.txt.
!./treetagger/cmd/tree-tagger-english en1984.txt >en1984_vert.txt

In [None]:
# Inspect the first 20 lines of the tagger output written to en1984_vert.txt.
!head -n 20 en1984_vert.txt

In [None]:
# Run the TreeTagger English tagging script on humanrights02.txt; its standard output
# is redirected into humanrights02_vert.txt.
!./treetagger/cmd/tree-tagger-english humanrights02.txt >humanrights02_vert.txt

In [None]:
# Inspect the first 20 lines of the tagger output written to humanrights02_vert.txt.
!head -n 20 humanrights02_vert.txt

## Armenian

In [2]:
# Installing the Armenian morphological analyser: clone its grammar repository
# into ./uniparser-grammar-eastern-armenian.
# NOTE: git refuses to clone into an existing non-empty directory, so re-running this
# cell prints an error if the repository has already been cloned.
!git clone https://github.com/timarkh/uniparser-grammar-eastern-armenian

Cloning into 'uniparser-grammar-eastern-armenian'...
remote: Enumerating objects: 170, done.[K
remote: Counting objects: 100% (170/170), done.[K
remote: Compressing objects: 100% (95/95), done.[K
remote: Total 170 (delta 77), reused 165 (delta 72), pack-reused 0[K
Receiving objects: 100% (170/170), 33.36 MiB | 16.08 MiB/s, done.
Resolving deltas: 100% (77/77), done.


In [3]:
# Install the Python package that provides the analyser classes.
# %pip (rather than !pip3) installs into the environment of the running kernel.
# The version is pinned to the one this notebook's recorded output was produced with.
%pip install uniparser-eastern-armenian==2.1.1

Collecting uniparser-eastern-armenian
  Downloading uniparser_eastern_armenian-2.1.1-py3-none-any.whl (1.4 MB)
[K     |████████████████████████████████| 1.4 MB 5.3 MB/s 
Collecting uniparser-morph>=2.2.0
  Downloading uniparser_morph-2.4.2-py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 3.0 MB/s 
Installing collected packages: uniparser-morph, uniparser-eastern-armenian
Successfully installed uniparser-eastern-armenian-2.1.1 uniparser-morph-2.4.2


In [None]:
# Install the cg3 package, which this notebook uses for disambiguation.
# -y answers apt's confirmation prompt, which cannot be answered interactively
# from a notebook cell (the install would otherwise abort).
!sudo apt-get install -y cg3

In [5]:
# Load the Eastern Armenian analyser class from the uniparser_eastern_armenian package.
from uniparser_eastern_armenian import EasternArmenianAnalyzer
# `a` is the analyser instance reused by all later analysis cells.
a = EasternArmenianAnalyzer()
# Analyse a single word form passed as a plain string; the result is iterated in the next cell.
analyses = a.analyze_words('Ձևաբանություն')

In [6]:
# Print one line per analysis: word form, lemma, grammatical tags, gloss, stem,
# subwords, glossed word form and the additional data attached to the analysis.
# The loop variable keeps the name `ana` because a later cell inspects it.
for ana in analyses:
    fields = (
        ana.wf, ana.lemma, ana.gramm, ana.gloss,
        ana.stem, ana.subwords, ana.wfGlossed, ana.otherData,
    )
    print(*fields)

Ձևաբանություն ձեւաբանություն N,inanim,sg,nom,nonposs morphology ձևաբանություն. [] ձևաբանություն [('trans_en', 'morphology')]


In [None]:
# List the attributes and methods of `ana`, the loop variable left over from the
# preceding print loop (exploratory introspection).
dir(ana)

In [None]:
# Analyse a nested list (two lists of tokens) in one call and request the
# result in XML format.
sentences = [['և'], ['Ես', 'սիրում', 'եմ', 'քեզ', ':']]
analyses = a.analyze_words(sentences, format='xml')

In [None]:
# Print each element of the XML-formatted result (print() applies str() itself).
for ana in analyses:
    print(ana)

In [None]:
# Analyse a mixed input (a single word followed by a nested list of token lists)
# and request the result in JSON format.
mixed_input = ['Ձևաբանություն', [['և'], ['Ես', 'սիրում', 'եմ', 'քեզ', ':']]]
analyses = a.analyze_words(mixed_input, format='json')

In [None]:
# Print each element of the JSON-formatted result (print() applies str() itself).
for ana in analyses:
    print(ana)

In [7]:
# Analyse a tokenised sentence with disambiguation switched on.
# NOTE(review): presumably relies on the cg3 tool installed in an earlier cell - confirm.
words = ['Ես', 'սիրում', 'եմ', 'քեզ']
analyses = a.analyze_words(words, disambiguate=True)

In [8]:
# `analyses` holds one list of word-form objects per input word; print every
# remaining variant as: word form, lemma, grammatical tags, gloss.
# `ana` and `wfo` keep their names because later cells inspect `wfo`.
for ana in analyses:
    for wfo in ana:
        columns = (wfo.wf, wfo.lemma, wfo.gramm, wfo.gloss)
        print(*columns)

Ես ես PRON,S,hum,sg,nom me
Ես է V,intr,prs,sg,2 be-PRS.2SG
սիրում սիրել V,tr,cvb,ipfv love-CVB.IPFV
եմ է V,intr,prs,sg,1 be-PRS.1SG
քեզ դու PRON,S,hum,sg,dat thou


In [None]:
# Show the class of `wfo`, the word-form object left over from the preceding loop.
print(type(wfo))

In [None]:
# List the attributes and methods of the word-form object `wfo` (exploratory introspection).
dir(wfo)

In [None]:
# downloading and analysing texts

In [None]:
# Download the Armenian text and save it directly as hy1984.txt.
# -O avoids relying on wget's default file name (index.html?dl=1) and a follow-up
# `mv` with an unquoted '?' glob character; the URL is quoted for the same reason.
!wget -O hy1984.txt "https://heibox.uni-heidelberg.de/f/e0bfae444a5a4c76957b/?dl=1"

In [None]:
# Input text and output ("vertical" format) file for hy1984.txt.
# NOTE: the next cell rebinds FInText / FOutText to the humanrights_hy files. When both
# cells are run in order, only those files reach the analysis loop and the handles
# opened here are leaked - run just ONE of the two cells before the loop.
# The encoding is explicit so that reading/writing Armenian text does not depend on
# the platform's default encoding (assumes the text file is UTF-8 - confirm).
FInText = open('hy1984.txt', 'r', encoding='utf-8')
FOutText = open('hy1984_vert.txt', 'w', encoding='utf-8')

In [None]:
# Input text and output ("vertical" format) file for humanrights_hy.txt.
# NOTE: this rebinds the same FInText / FOutText names as the preceding hy1984 cell;
# run just ONE of the two cells before the analysis loop.
# The encoding is explicit so that reading/writing Armenian text does not depend on
# the platform's default encoding (assumes the text file is UTF-8 - confirm).
FInText = open('humanrights_hy.txt', 'r', encoding='utf-8')
FOutText = open('humanrights_hy_vert.txt', 'w', encoding='utf-8')

In [None]:
# Tokenise every line of the input text, analyse the tokens with the Armenian
# analyser `a` and write the result in one-token-per-line ("vertical") format:
#     word form <TAB> grammatical tags <TAB> lemma <TAB> gloss
# Each non-empty input line becomes one <p> ... </p> block.
# Expects FInText / FOutText to have been opened by one of the cells above;
# both files are closed when the loop finishes (or fails).

# Token separators: the space character and the listed punctuation marks.
# Raw string, so the backslash escapes reach the regex engine unchanged instead of
# being invalid Python string escapes (which trigger a warning on import/compile).
TOKEN_SEPARATORS = re.compile(r'[ ,\.:;\!\(\)\"\[\]՞՝«»\-\—՝։\։]+')

try:
    for SLine in FInText:
        SLine = SLine.strip()
        # split() yields empty strings when a line starts or ends with a separator
        # (and for blank lines); drop them so no empty tokens are analysed or written.
        ListOfWords = [SWord for SWord in TOKEN_SEPARATORS.split(SLine) if SWord]
        if not ListOfWords:
            continue  # blank or punctuation-only line: nothing to analyse
        analyses = a.analyze_words(ListOfWords, disambiguate=False)
        FOutText.write('<p>\n')
        for ana in analyses:
            # Only the first analysis of each word is written.
            # TODO: how to output all variants + disambiguate?
            wfo = ana[0]
            FOutText.write('\t'.join((wfo.wf, wfo.gramm, wfo.lemma, wfo.gloss)) + '\n')
        FOutText.write('</p>\n')
finally:
    # Closing flushes the output and releases both file handles.
    FInText.close()
    FOutText.close()