<a href="https://colab.research.google.com/github/iued-uni-heidelberg/corpustools/blob/main/S101lemKA_corrections_v01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Processing pipeline for DAAD project
## Stage 1: Lemmatization (Georgian) with corrections (manual annotation)

# Georgian 'random' corpus 
- (collected in SketchEngine using non-topical high-frequency keywords from the top 1000 of the Wikipedia list)
- we first try the corpus for which corrections have been made for the 3k most frequent Georgian word forms

In [None]:
!wget https://heibox.uni-heidelberg.de/f/362c24e311104c9aa35b/?dl=1
!mv index.html?dl=1 georgianrandom02.txt

In [None]:
%%bash
# installing TreeTagger (en, de, ka)
mkdir treetagger
cd treetagger
# Download the tagger package for your system (PC-Linux, Mac OS-X, ARM64, ARMHF, ARM-Android, PPC64le-Linux).
wget https://cis.lmu.de/~schmid/tools/TreeTagger/data/tree-tagger-linux-3.2.4.tar.gz
tar -xzvf tree-tagger-linux-3.2.4.tar.gz
# Download the tagging scripts into the same directory.
wget https://cis.lmu.de/~schmid/tools/TreeTagger/data/tagger-scripts.tar.gz
gunzip tagger-scripts.tar.gz
# Download the installation script install-tagger.sh.
wget https://cis.lmu.de/~schmid/tools/TreeTagger/data/install-tagger.sh
# Download the parameter files for the languages you want to process.
# list of all files (parameter files) https://cis.lmu.de/~schmid/tools/TreeTagger/#parfiles
wget https://cis.lmu.de/~schmid/tools/TreeTagger/data/english.par.gz
sh install-tagger.sh
cd ..
sudo pip install treetaggerwrapper
# changing options: no-unknown, sgml, lemma
mv /content/treetagger/cmd/tree-tagger-english /content/tree-tagger-english0
awk '{ if (NR == 9) print "OPTIONS=\"-token -lemma -sgml -no-unknown\""; else print $0}' /content/tree-tagger-english0 > /content/treetagger/cmd/tree-tagger-english
chmod a+x ./treetagger/cmd/tree-tagger-english
# downloading German and Georgian 
wget https://heibox.uni-heidelberg.de/f/ec8226edebb64a359407/?dl=1
mv index.html?dl=1 /content/treetagger/lib/german-utf8.par
wget https://heibox.uni-heidelberg.de/f/9183090d2bdb41e09055/?dl=1
mv index.html?dl=1 /content/treetagger/lib/georgian.par
wget https://heibox.uni-heidelberg.de/f/9cafab0509d64ed1ac4b/?dl=1
mv index.html?dl=1 /content/treetagger/cmd/tree-tagger-georgian2
cp /content/treetagger/cmd/tree-tagger-georgian2 /content/treetagger/cmd/tree-tagger-georgian
# German2 = -no-unknown 
# note: tree-tagger-german will not work, as parameter files have not been downloaded, only use tree-tagger-german2 with utf8 encoding
wget https://heibox.uni-heidelberg.de/f/acb9b8a2fa4f40e08f8a/?dl=1
mv index.html?dl=1 /content/treetagger/cmd/tree-tagger-german2
chmod a+x /content/treetagger/cmd/tree-tagger-georgian2
chmod a+x /content/treetagger/cmd/tree-tagger-german2

wget https://heibox.uni-heidelberg.de/f/a6f7f36f175942ccad0a/?dl=1
mv index.html?dl=1 /content/treetagger/cmd/tree-tagger-georgian
chmod a+x /content/treetagger/cmd/tree-tagger-georgian


In [None]:
%%bash
# Downloading a table with corrected forms
# cp /content/treetagger/cmd/tree-tagger-georgian2 /content/treetagger/cmd/tree-tagger-georgian
wget https://heibox.uni-heidelberg.de/f/e9010b0f3e7649ef9552/?dl=1
mv index.html?dl=1 georgianrandom--unknown-frq-all.tsv

In [4]:
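# analysing the Georgian file (the active command below runs tree-tagger-georgian; the German command is kept only as a commented-out example)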
# !./treetagger/cmd/tree-tagger-german2 /content/udhr/udhr_kat_v02.txt >/content/udhrTT/udhr_kat_vert.txt
!./treetagger/cmd/tree-tagger-georgian georgianrandom02.txt >georgianrandom02.vert

	reading parameters ...
	tagging ...
11301000	 finished.


In [None]:
!head -n 50 georgianrandom02.vert


In [6]:
!wc georgianrandom02.vert

 11301824  33112747 394150707 georgianrandom02.vert


In [7]:
## how to print counter in AWK:
## https://stackoverflow.com/questions/67901330/awk-how-to-print-the-last-value-of-a-counter

# !awk -F '\t' '(NF==3){printf "%s ", $3; if(FNR % 10000 == 0){printf "\n"}}' < /content/udhrTT/udhr_deu_1996_vert.txt >/content/udhrTT/udhr_deu_1996_lem.txt

# if we need <p> tags for paragraphs:
# !awk -F '\t' '(NF==3){printf "%s ", $3}(NF!=3){printf "\n%s\n", $0}' < /content/udhrTT/udhr_deu_1996_vert.txt >/content/udhrTT/udhr_deu_1996_lem.txt
# if we do not need paragraph tags
# !awk -F '\t' '(NF==3){printf "%s ", $3}(NF!=3){printf "\n"}' < /content/udhrTT/udhr_kat_vert.txt >/content/udhrTT/udhr_kat_lem.txt
!awk -F '\t' '($3=="<unknown>"){j++; printf "%s\n", $0} END{print j+0  > "/dev/stderr" }' <georgianrandom02.vert >georgianrandom02-unknown.txt

2123093


In [8]:
cCoverage = 2123093 / 11301824
print(cCoverage)

0.18785401365301743


## Improving coverage with annotated word forms
- annotations are taken from `georgianrandom--unknown-frq-all.tsv`


In [9]:
# reading a corrected file into a dictionary
# printing dictionaries both for lemmas and PoS to verify

import os, sys, re

# Columns read from the tab-separated annotation table (0-based):
#   1 = word form, 2 = PoS, 3 = lemma, 4 = corrected PoS.
# Column 0 is not used. Missing trailing columns are treated as empty strings.
DCorrectionsPoS = {}
DCorrectionsLem = {}
with open("georgianrandom--unknown-frq-all.tsv", encoding='utf-8') as f:
    next(f, None)  # we skip the first line
    for sline in f:
        LLine = sline.strip().split('\t')
        # pad short rows so that every column index used below exists
        LLine += [''] * (5 - len(LLine))
        SWord, SPoS, SLemma, SPoSCorrected = LLine[1:5]

        # a PoS entry is stored only when an explicit correction was annotated
        if SPoSCorrected != '' and SWord != '':
            SPoS = SPoSCorrected
            DCorrectionsPoS[SWord] = SPoS

        if SWord != '' and SLemma != '':
            DCorrectionsLem[SWord] = SLemma

print(len(DCorrectionsPoS))
print(len(DCorrectionsLem))

# dump both dictionaries for manual verification; `with` closes the files
with open('dictLemmasCorrected.txt', 'w', encoding='utf-8') as FDictCorrected:
    for key, value in sorted(DCorrectionsLem.items()):
        FDictCorrected.write(f'{key}\t{value}\n')

with open('dictPoSCorrected.txt', 'w', encoding='utf-8') as FDictCorrectedPoS:
    for key, value in sorted(DCorrectionsPoS.items()):
        FDictCorrectedPoS.write(f'{key}\t{value}\n')


416
2382


In [46]:
# applying corrections to vert file with unknown

'''
FOutVertCorrected = open('georgianrandom02corr.vert', 'w')
with open('georgianrandom02.vert') as FInVertUnknown:
    for SLine in FInVertUnknown:
        SLine = SLine.rstrip()
        LLine = SLine.split('\t')
        if len(LLine) != 3:
            FOutVertCorrected.write(f'{SLine}\n')
        else:
            SWord = LLine[0]
            SPoS = LLine[1]
            SLem = LLine[2]

            if SLem == '<unknown>':
                try: SLem = DCorrectionsLem[SWord]
                except: pass

                try: SPoS = DCorrectionsPoS[SWord]
                except: pass
            FOutVertCorrected.write(f'{SWord}\t{SPoS}\t{SLem}\n')           
FOutVertCorrected.flush()
'''
def applyCorrections(SFInVertUnknown, SFOutVertCorrected, DCorrectionsLem, DCorrectionsPoS):
    """Copy a tagged vertical file, filling in '<unknown>' lemmas from correction tables.

    Lines that do not split into exactly three tab-separated fields are copied
    unchanged (apart from stripping trailing whitespace). For three-field lines
    (word, PoS, lemma) whose lemma is '<unknown>', the lemma is replaced by
    DCorrectionsLem[word] and the PoS by DCorrectionsPoS[word] whenever the
    word is present in the respective dictionary.

    Args:
        SFInVertUnknown: path of the input vertical file.
        SFOutVertCorrected: path of the output file (overwritten).
        DCorrectionsLem: dict mapping word form -> corrected lemma.
        DCorrectionsPoS: dict mapping word form -> corrected PoS tag.

    Returns:
        A pair (stats, DReplacements). stats is the tuple
        (lines read, unknown lemmas found, lemmas corrected, PoS tags corrected);
        DReplacements maps 'word<TAB>lemma' to the number of times that
        lemma replacement was applied.
    """
    DReplacements = {}
    counter = 0
    unknownFound = 0
    unknownCorrected = 0
    unknownCorrectedPoS = 0
    # both files are closed on exit, also when an error interrupts the loop
    with open(SFInVertUnknown, encoding='utf-8') as FInVertUnknown, \
            open(SFOutVertCorrected, 'w', encoding='utf-8') as FOutVertCorrected:
        for SLine in FInVertUnknown:
            counter += 1
            if counter % 1000000 == 0:
                # progress report; guard against a chunk without any unknown lemma
                percentCorrected = unknownCorrected / unknownFound * 100 if unknownFound else 0.0
                sys.stdout.write(f'{counter}, unknownFound={unknownFound}, unknownCorrected={unknownCorrected}({percentCorrected}%), unknownTypesCorrected={len(DReplacements)}, unknownCorrectedPoS={unknownCorrectedPoS}\n')
            SLine = SLine.rstrip()
            LLine = SLine.split('\t')
            if len(LLine) != 3:
                # markup or malformed line: pass through
                FOutVertCorrected.write(f'{SLine}\n')
                continue

            SWord, SPoS, SLem = LLine
            if SLem == '<unknown>':
                unknownFound += 1
                if SWord in DCorrectionsLem:
                    SLem = DCorrectionsLem[SWord]
                    unknownCorrected += 1
                    SReplacement = f'{SWord}\t{SLem}'
                    DReplacements[SReplacement] = DReplacements.get(SReplacement, 0) + 1

                if SWord in DCorrectionsPoS:
                    SPoS = DCorrectionsPoS[SWord]
                    unknownCorrectedPoS += 1
            FOutVertCorrected.write(f'{SWord}\t{SPoS}\t{SLem}\n')

    return (counter, unknownFound, unknownCorrected, unknownCorrectedPoS), DReplacements



def reportStatistics(TupleIn1, DReplacements=None):
    """Print token-level statistics of a correction run to stdout.

    Args:
        TupleIn1: the statistics tuple
            (counter, unknownFound, unknownCorrected, unknownCorrectedPoS),
            i.e. the first element of the pair returned by applyCorrections.
        DReplacements: optional dict of applied replacements; its length is
            reported as the number of corrected types. When omitted, the
            notebook-level variable ``DReplacements`` is used if it exists
            (the behaviour earlier calls relied on), otherwise 0 types are
            reported.

    All ratios are reported as 0.0 when their denominator is 0, so an empty
    corpus or a corpus without unknown lemmas does not raise.
    """
    counter, unknownFound, unknownCorrected, unknownCorrectedPoS = TupleIn1
    if DReplacements is None:
        # backward compatibility with calls that relied on the global variable
        DReplacements = globals().get('DReplacements', {})
    ITypesCorrected = len(DReplacements)

    def _ratio(part, whole):
        # safe division: 0.0 instead of ZeroDivisionError
        return part / whole if whole else 0.0

    UnknownBeforeUpdate = _ratio(unknownFound, counter)
    UnknowAfterUpdate = _ratio(unknownFound - unknownCorrected, counter)
    UnknownChange = UnknownBeforeUpdate - UnknowAfterUpdate

    sys.stdout.write(f'\nAll words:{counter}, Unknown:{unknownFound}, UnknownCorrected:{unknownCorrected}({_ratio(unknownCorrected, unknownFound)*100}%), UnknownTypesCorrected:{ITypesCorrected}, UnknownPoSCorrected:{unknownCorrectedPoS}\n')
    sys.stdout.write(f'\nUnknown before update:{unknownFound}({UnknownBeforeUpdate * 100})%; Unknown after update:{unknownFound - unknownCorrected}({UnknowAfterUpdate * 100})%; Change:{unknownCorrected}({UnknownChange * 100})%\n')


In [47]:
def printFrqDict(DFrq, SFOut):
    """Write a frequency dictionary to a file, highest value first.

    Each output line is 'rank<TAB>key<TAB>value', with rank starting at 1.
    Entries with equal values keep their dictionary order.

    Args:
        DFrq: dict mapping key -> frequency (any mutually comparable values).
        SFOut: path of the output file (overwritten).
    """
    LRanked = sorted(DFrq.items(), key=lambda item: item[1], reverse=True)
    # `with` closes the file, so the table is complete on disk when we return
    with open(SFOut, 'w', encoding='utf-8') as FOut:
        for count, (key, val) in enumerate(LRanked, start=1):
            FOut.write(f'{count}\t{key}\t{val}\n')
    

In [48]:
TupleIn, DReplacements = applyCorrections('georgianrandom02.vert', 'georgianrandom02corr.vert', DCorrectionsLem, DCorrectionsPoS)
# counter, unknownFound, unknownCorrected, unknownCorrectedPoS = TupleIn
reportStatistics(TupleIn)
printFrqDict(DReplacements, 'georgianrandom02replacements.txt')

1000000, unknownFound=157803, unknownCorrected=23055(14.609988403262294%), unknownTypesCorrected=2175, unknownCorrectedPoS=4590
2000000, unknownFound=327636, unknownCorrected=46754(14.270104628307024%), unknownTypesCorrected=2303, unknownCorrectedPoS=9058
3000000, unknownFound=529485, unknownCorrected=74758(14.119002426886503%), unknownTypesCorrected=2335, unknownCorrectedPoS=13488
4000000, unknownFound=712430, unknownCorrected=104038(14.603259267577165%), unknownTypesCorrected=2352, unknownCorrectedPoS=20214
5000000, unknownFound=904472, unknownCorrected=129723(14.342400870342034%), unknownTypesCorrected=2362, unknownCorrectedPoS=24568
6000000, unknownFound=1119863, unknownCorrected=157396(14.054933505259125%), unknownTypesCorrected=2370, unknownCorrectedPoS=30731
7000000, unknownFound=1289215, unknownCorrected=183651(14.245180206559807%), unknownTypesCorrected=2372, unknownCorrectedPoS=35679
8000000, unknownFound=1447605, unknownCorrected=206991(14.298859150113463%), unknownTypesCorr

In [49]:
!wc georgianrandom02corr.vert

 11301824  33112747 398210244 georgianrandom02corr.vert


In [50]:
!awk -F '\t' '($3=="<unknown>"){j++; printf "%s\n", $0} END{print j+0  > "/dev/stderr" }' <georgianrandom02corr.vert >georgianrandom02corr-unknown.txt

1826952


In [51]:
cCoverage2 = 1826952 / 11301824
print(cCoverage2)

0.16165107508310164


## Running tagger / checking improvement on Specialised corpora
- downloading the corpora
- tagging
- checking baseline coverage
- checking improved coverage
- intersection of unknown words
- preparing datasets (1) intersections; (2) specialised for each corpus

In [None]:
!wget https://heibox.uni-heidelberg.de/f/dc5bcb4413aa42668130/?dl=1
!mv index.html?dl=1 specialisedCorpus.zip

In [None]:
!unzip specialisedCorpus.zip

In [17]:
!wc specialised-corpora/*

   70620   596366 11660439 specialised-corpora/cFiktion.txt
   81314   529838 11302534 specialised-corpora/cNaturwissenschaft.txt
   63223   973857 21459641 specialised-corpora/cRechtswissenschaft.txt
  215157  2100061 44422614 total


In [18]:
# FName = 'ocrOutKA.txt'
# FNameOut = 'ocrLinesKA.txt'

def removeNewLines(FName, FNameOut):
    """Join hard-wrapped lines of a text file into running text.

    Every input line is stripped of surrounding whitespace, then:
      * an empty line is written as a blank-line break ('\\n\\n');
      * a line ending in '-' is written without that hyphen and without a
        following space, so it is glued to the next line;
      * any other line is written followed by a single space.

    Args:
        FName: path of the input text file.
        FNameOut: path of the output text file (overwritten).
    """
    # both handles are closed on exit, so the output is complete on disk
    with open(FName, 'r', encoding='utf-8') as FIn, \
            open(FNameOut, 'w', encoding='utf-8') as FOut:
        for SLine in FIn:
            SLine = SLine.strip()
            if SLine == '':
                FOut.write('\n\n')
            elif SLine.endswith('-'):
                FOut.write(SLine[:-1])
            else:
                FOut.write(SLine + ' ')
    return


In [19]:
!mkdir specialised-corpora-s01-lines/

In [20]:
removeNewLines('specialised-corpora/cFiktion.txt', 'specialised-corpora-s01-lines/cFiktion.txt')
removeNewLines('specialised-corpora/cNaturwissenschaft.txt', 'specialised-corpora-s01-lines/cNaturwissenschaft.txt')
removeNewLines('specialised-corpora/cRechtswissenschaft.txt', 'specialised-corpora-s01-lines/cRechtswissenschaft.txt')

In [None]:
!head -n 10 specialised-corpora/cFiktion.txt

In [None]:
!head -n 10 specialised-corpora-s01-lines/cFiktion.txt

In [None]:
!head -n 50 specialised-corpora-s01-lines/cRechtswissenschaft.txt

In [21]:
!cat specialised-corpora-s01-lines/* >specialised-corpora-s01-all.txt

### preparing texts for tagging: removing new lines, etc.

In [22]:
!mkdir specialised-corpora-s02-vert/

In [23]:
!./treetagger/cmd/tree-tagger-georgian specialised-corpora-s01-lines/cFiktion.txt >specialised-corpora-s02-vert/cFiktion.vert

	reading parameters ...
	tagging ...
747000	 finished.


In [24]:
!./treetagger/cmd/tree-tagger-georgian specialised-corpora-s01-lines/cNaturwissenschaft.txt >specialised-corpora-s02-vert/cNaturwissenschaft.vert

	reading parameters ...
	tagging ...
638000	 finished.


In [25]:
!./treetagger/cmd/tree-tagger-georgian specialised-corpora-s01-lines/cRechtswissenschaft.txt >specialised-corpora-s02-vert/cRechtswissenschaft.vert

	reading parameters ...
	tagging ...
1181000	 finished.


In [26]:
!./treetagger/cmd/tree-tagger-georgian specialised-corpora-s01-all.txt >specialised-corpora-s01-all.vert

	reading parameters ...
	tagging ...
2567000	 finished.


### checking coverage on joint specialised corpus
- coverage of raw corpus
- coverage of corrections

In [27]:
!awk -F '\t' '($3=="<unknown>"){j++; printf "%s\n", $0} END{print j+0  > "/dev/stderr" }' <specialised-corpora-s01-all.vert >specialised-corpora-s01-all-unknown.vert

514895


In [28]:
!wc specialised-corpora-s01-all.vert
!wc specialised-corpora-s01-all-unknown.vert

 2567303  7701931 91404493 specialised-corpora-s01-all.vert
  514895  1544685 19910737 specialised-corpora-s01-all-unknown.vert


In [29]:
cCoverage3 = 514895 / 2567303
print(cCoverage3)

0.20055871862417488


In [56]:
TupleStat, DReplacements = applyCorrections('specialised-corpora-s01-all.vert', 'specialised-corpora-s01-all-corr.vert', DCorrectionsLem, DCorrectionsPoS)
reportStatistics(TupleStat)



1000000, unknownFound=181642, unknownCorrected=15100(8.313055350634766%), unknownTypesCorrected=1606, unknownCorrectedPoS=3608
2000000, unknownFound=397008, unknownCorrected=40194(10.124229234675372%), unknownTypesCorrected=1993, unknownCorrectedPoS=8837

All words:2567303, Unknown:514895, UnknownCorrected:56079(10.891346779440468%), UnknownTypesCorrected:2014, UnknownPoSCorrected:11936

Unknown before update:514895(20.055871862417487)%; Unknown after update:458816(17.871517308241373)%; Change:56079(2.1843545541761156)%


In [57]:
printFrqDict(DReplacements, 'specialised-corpora-s01-all-replacements.txt')

In [31]:
!awk -F '\t' '($3=="<unknown>"){j++; printf "%s\n", $0} END{print j+0  > "/dev/stderr" }' <specialised-corpora-s01-all-corr.vert >specialised-corpora-s01-all-corr-unknown.vert

458816


In [None]:
!wc specialised-corpora-s01-all-corr.vert
!wc specialised-corpora-s01-all-corr-unknown.vert

 2567303  7701931 92197268 specialised-corpora-s01-all-corr.vert
  458816  1376448 17633815 specialised-corpora-s01-all-corr-unknown.vert


In [None]:
cCoverage4 = 458816 / 2567303
print(cCoverage4)

0.17871517308241372


In [None]:
# corrections to individual corpora
!mkdir specialised-corpora-s03-cVert

In [None]:
# applying corrections and reporting statistics for each individual corpora
# Fiktion
# applyCorrections returns (statistics tuple, replacements dict): unpack both,
# otherwise reportStatistics fails when unpacking its four counters
TupleStatF, DReplacements = applyCorrections('/content/specialised-corpora-s02-vert/cFiktion.vert', '/content/specialised-corpora-s03-cVert/cFiktion.vert', DCorrectionsLem, DCorrectionsPoS)
reportStatistics(TupleStatF)



All words:747302, Unknown:134416, UnknownCorrected:9060(6.740269015593382%), UnknownPoSCorrected:2739

Unknown before update:17.98683798517868%; Unknown after update:16.774476717578704%; Change:1.212361267599979%


In [None]:
# Natur...
# applyCorrections returns (statistics tuple, replacements dict): unpack both,
# otherwise reportStatistics fails when unpacking its four counters
TupleStatN, DReplacements = applyCorrections('/content/specialised-corpora-s02-vert/cNaturwissenschaft.vert', '/content/specialised-corpora-s03-cVert/cNaturwissenschaft.vert', DCorrectionsLem, DCorrectionsPoS)
reportStatistics(TupleStatN)



All words:638588, Unknown:128802, UnknownCorrected:14649(11.373270601388178%), UnknownPoSCorrected:2586

Unknown before update:20.169812148051637%; Unknown after update:17.87584483266206%; Change:2.293967315389578%


In [None]:
# Recht
# applyCorrections returns (statistics tuple, replacements dict): unpack both,
# otherwise reportStatistics fails when unpacking its four counters
TupleStatR, DReplacements = applyCorrections('/content/specialised-corpora-s02-vert/cRechtswissenschaft.vert', '/content/specialised-corpora-s03-cVert/cRechtswissenschaft.vert', DCorrectionsLem, DCorrectionsPoS)
reportStatistics(TupleStatR)

1000000, unknownFound=210522, unknownCorrected=27088(12.867063774807383%), unknownCorrectedPoS=5747

All words:1181412, Unknown:251676, UnknownCorrected:32370(12.861774662661517%), UnknownPoSCorrected:6611

Unknown before update:21.302983209921685%; Unknown after update:18.563041513036943%; Change:2.7399416968847428%


In [None]:
!mkdir specialised-corpora-s04-cUnKVert

In [None]:
!awk -F '\t' '($3=="<unknown>"){j++; printf "%s\n", $0} END{print j+0  > "/dev/stderr" }' </content/specialised-corpora-s03-cVert/cFiktion.vert >/content/specialised-corpora-s04-cUnKVert/cFiktion.vert
!awk -F '\t' '($3=="<unknown>"){j++; printf "%s\n", $0} END{print j+0  > "/dev/stderr" }' </content/specialised-corpora-s03-cVert/cNaturwissenschaft.vert >/content/specialised-corpora-s04-cUnKVert/cNaturwissenschaft.vert
!awk -F '\t' '($3=="<unknown>"){j++; printf "%s\n", $0} END{print j+0  > "/dev/stderr" }' </content/specialised-corpora-s03-cVert/cRechtswissenschaft.vert >/content/specialised-corpora-s04-cUnKVert/cRechtswissenschaft.vert

125356
114153
219306


In [None]:
!head -n 40 /content/specialised-corpora-s04-cUnKVert/cFiktion.vert

In [None]:
def corp2frqDict(SFIn):
    """Count how often each distinct line occurs in a text file.

    Lines are compared after stripping trailing whitespace; the whole line
    (not a single column) is the dictionary key.

    Args:
        SFIn: path of the input file.

    Returns:
        dict mapping line text -> number of occurrences.
    """
    DFrq = {}
    with open(SFIn, encoding='utf-8') as FIn:
        for SLine in FIn:
            SLine = SLine.rstrip()
            # explicit default instead of a bare except, which would also
            # swallow KeyboardInterrupt and reset the count
            DFrq[SLine] = DFrq.get(SLine, 0) + 1
    return DFrq


In [None]:
DFrqAllCorpora = corp2frqDict('specialised-corpora-s01-all-corr-unknown.vert')
DFrqFiktion = corp2frqDict('/content/specialised-corpora-s04-cUnKVert/cFiktion.vert')
DFrqNaturwissenschaft = corp2frqDict('/content/specialised-corpora-s04-cUnKVert/cNaturwissenschaft.vert')
DFrqRechtswissenschaft = corp2frqDict('/content/specialised-corpora-s04-cUnKVert/cRechtswissenschaft.vert')
print(len(DFrqAllCorpora))
print(len(DFrqFiktion))
print(len(DFrqNaturwissenschaft))
print(len(DFrqRechtswissenschaft))

142418
57882
53659
45696


In [None]:
def selectDTop(DInput, ITop):
    """Return a new dict holding the ITop highest-valued entries of DInput.

    Entries are inserted in descending order of value.
    """
    DTop = {}
    LRanked = sorted(DInput.items(), key=lambda item: item[1], reverse=True)
    for rank, (key, val) in enumerate(LRanked, start=1):
        if rank > ITop:
            break
        DTop[key] = val
    return DTop


def intersect2dicts(D1, D2, Top1, Top2):
    """Intersect the top-Top1 entries of D1 with the top-Top2 entries of D2.

    Returns a dict with the keys present in both selections; each value is
    the mean of the two input values. Keys follow the descending value order
    of the D1 selection.
    """
    Dc1 = selectDTop(D1, Top1)
    Dc2 = selectDTop(D2, Top2)
    LRanked1 = sorted(Dc1.items(), key=lambda item: item[1], reverse=True)
    return {
        key: (val + Dc2[key]) / 2
        for key, val in LRanked1
        if key in Dc2
    }


def union2dicts(D1, D2, Top1, Top2):
    """Unite the top-Top1 entries of D1 with the top-Top2 entries of D2.

    A key found in both selections gets the mean of its two values; a key
    found in only one keeps its value. Keys of the D1 selection come first,
    followed by the keys found only in the D2 selection.
    """
    Dc1 = selectDTop(D1, Top1)
    Dc2 = selectDTop(D2, Top2)

    def _byValue(item):
        return item[1]

    DUnion = {}
    # two symmetric passes: each selection is merged against the other one
    for DOwn, DOther in ((Dc1, Dc2), (Dc2, Dc1)):
        for key, val in sorted(DOwn.items(), key=_byValue, reverse=True):
            DUnion[key] = (val + DOther[key]) / 2 if key in DOther else val
    return DUnion

In [None]:
DIntersection12 = intersect2dicts(DFrqFiktion, DFrqNaturwissenschaft, 500000, 500000)
DIntersection123 = intersect2dicts(DIntersection12, DFrqRechtswissenschaft, 500000, 500000)
print('DIntersection12', len(DIntersection12))
print('DIntersection123', len(DIntersection123))
print('')

DIntersection1 = intersect2dicts(DFrqFiktion, DFrqNaturwissenschaft, 500000, 500000)
DIntersection2 = intersect2dicts(DFrqNaturwissenschaft, DFrqRechtswissenschaft, 500000, 500000)
DIntersection3 = intersect2dicts(DFrqFiktion, DFrqRechtswissenschaft, 500000, 500000)

print('DIntersection1', len(DIntersection1))
print('DIntersection2', len(DIntersection2))
print('DIntersection3', len(DIntersection3))

DUnion12 = union2dicts(DIntersection1, DIntersection2, 500000, 500000)
DUnion123 = union2dicts(DUnion12, DIntersection3, 500000, 500000)
print('DUnion12', len(DUnion12))
print('DUnion123', len(DUnion123))





DIntersection12 7412
DIntersection123 1688

DIntersection1 7412
DIntersection2 5572
DIntersection3 3523
DUnion12 11296
DUnion123 13131


In [None]:
def printDictWExternalFrq(DFrq, DExternalFrq, DFrqF, DFrqN, DFrqR, SFOut):
    """Write DFrq with per-corpus frequencies and select entries for sorting.

    For every key of DFrq (highest DFrq value first) one line
    'rank<TAB>key<TAB>valF<TAB>valN<TAB>valR<TAB>val<TAB>valAll' is written
    to SFOut, where valF/valN/valR/valAll are looked up in DFrqF, DFrqN,
    DFrqR and DExternalFrq (0 when the key is absent) and val is the DFrq value.

    A key is additionally selected when the middle one of its three
    per-corpus frequencies is greater than 1, i.e. when at least two of the
    three frequencies exceed 1. The frequency sums of the selected keys are
    printed to stdout.

    Args:
        DFrq: dict key -> value that defines the keys and their order.
        DExternalFrq: dict key -> frequency reported as valAll.
        DFrqF, DFrqN, DFrqR: per-corpus frequency dicts.
        SFOut: path of the output file (overwritten).

    Returns:
        dict mapping 'key<TAB>valF<TAB>valN<TAB>valR' -> valAll for the
        selected keys.
    """
    D4Sorting = {}
    SumF = 0
    SumN = 0
    SumR = 0
    SumAll = 0
    LRanked = sorted(DFrq.items(), key=lambda item: item[1], reverse=True)
    # `with` closes the file, so the table is complete on disk when we return
    with open(SFOut, 'w', encoding='utf-8') as FOut:
        for count, (key, val) in enumerate(LRanked, start=1):
            valAll = DExternalFrq.get(key, 0)
            valF = DFrqF.get(key, 0)
            valN = DFrqN.get(key, 0)
            valR = DFrqR.get(key, 0)

            # second-lowest of the three per-corpus frequencies
            FrqLow2 = sorted([valF, valN, valR])[1]
            if FrqLow2 > 1:
                D4Sorting[f'{key}\t{valF}\t{valN}\t{valR}'] = valAll
                SumF += valF
                SumN += valN
                SumR += valR
                SumAll += valAll

            FOut.write(f'{count}\t{key}\t{valF}\t{valN}\t{valR}\t{val}\t{valAll}\n')
    print(f'Fiktion:{SumF}\tNatur:{SumN}\tRecht{SumR}\tAll{SumAll}\n')
    return D4Sorting

In [None]:
D4Sorting = printDictWExternalFrq(DUnion123, DFrqAllCorpora, DFrqFiktion, DFrqNaturwissenschaft, DFrqRechtswissenschaft, '/content/specialised-corpora-s05-union123UnkonwAllFrq.txt')
print(len(D4Sorting))
printFrqDict(D4Sorting, '/content/specialised-corpora-s06-union123Unkonw1.txt')

Fiktion:21081	Natur:22473	Recht47302	All90856

4151


In [None]:
# printDictWExternalFrq(DUnion123, DFrqAllCorpora, '/content/specialised-corpora-s05-union123UnkonwAllFrq.txt')
printFrqDict(DUnion123, '/content/specialised-corpora-s04-union123Unkonw.txt')

In [None]:
# !mkdir specialised-corpora-s05splitUnion123

In [None]:
def splitFileNParts(FN2Split, N):
    """Distribute the lines of a file over N part files in round-robin fashion.

    Part k (k = 1..N) is written to '<FN2Split>_<k>.tsv' and receives every
    line whose 1-based line number ``count`` satisfies
    ``(count + k - 1) % N == 0``. For N = 3 this means part 1 gets lines
    3, 6, ..., part 2 gets lines 2, 5, ... and part 3 gets lines 1, 4, ...

    Args:
        FN2Split: path of the file to split; also the prefix of the part files.
        N: number of parts.
    """
    for el in range(N):
        SFNOut = FN2Split + '_' + str(el + 1) + '.tsv'
        # the input is re-read once per part; both handles are closed by `with`
        with open(FN2Split, 'r', encoding='utf-8') as FIn, \
                open(SFNOut, 'w', encoding='utf-8') as FNOut:
            for count, SLine in enumerate(FIn, start=1):
                if (count + el) % N == 0:
                    FNOut.write(SLine)








In [None]:
splitFileNParts('/content/specialised-corpora-s06-union123Unkonw1.txt', 3)

## end: Specialised corpora

## Preparing datasets for further improvement
- what is still `unknown`


In [None]:
!head -n 50 georgianrandom02-unknown.txt


In [None]:
!tail -n 20 georgianrandom02-unknown.txt

In [None]:
# D: frequency of every distinct line of the unknown-token file
D = {}
with open("georgianrandom02-unknown.txt", 'r', encoding='utf-8') as f:
    for line in f:
        line = line.rstrip()
        D[line] = D.get(line, 0) + 1

# corpus size and number of unknown tokens, copied from the `wc` / awk outputs above
ICorpusLen = 11301824
IUnknown = 2123093
FCoverage = IUnknown/ICorpusLen
print('percent Unknown TT: ', FCoverage)
IRestUnknown = IUnknown
c = 0
# `with` closes the file, so the table is completely written before the
# following cells read it (the handle was previously never flushed or closed)
with open("georgianrandom02-unknown-frq.txt", 'w', encoding='utf-8') as fo:
    for key, val in sorted(D.items(), key=lambda item: item[1], reverse=True):
        c+=1
        # share of the corpus that stays unknown once this and all more frequent items are covered
        IRestUnknown = IRestUnknown - val
        FRestCoverage = IRestUnknown/ICorpusLen
        fo.write(str(c) + '\t' + key + '\t' + str(val) + '\t' + str(FRestCoverage) + '\n')
    

percent Unknown TT:  0.18785401365301743


In [None]:
!head -n 50 georgianrandom02-unknown-frq.txt

## Selecting 3 sets of 1k words for correction (random corpus)



In [None]:
import sys, os, re

c = 0
ICorpusLen = 11301824
IUnknown = 2123093
IRestUnknown = IUnknown
# `with` closes all three files, so they are completely written when the cell
# finishes (the handles were previously never flushed or closed)
with open('georgianrandom02-unknown-frq-p1of3.txt', 'w', encoding='utf-8') as FOut1, \
        open('georgianrandom02-unknown-frq-p2of3.txt', 'w', encoding='utf-8') as FOut2, \
        open('georgianrandom02-unknown-frq-p3of3.txt', 'w', encoding='utf-8') as FOut3:
    LFOut = (FOut1, FOut2, FOut3)
    for key, val in sorted(D.items(), key=lambda item: item[1], reverse=True):
        c+=1
        IRestUnknown = IRestUnknown - val
        FRestCoverage = IRestUnknown/ICorpusLen
        # round-robin by rank: 1 -> part 1, 2 -> part 2, 3 -> part 3, 4 -> part 1, ...
        LFOut[(c - 1) % 3].write(str(c) + '\t' + key + '\t' + str(val) + '\t' + str(FRestCoverage) + '\n')

## GIP DAAD corpus
### checking coverage

In [None]:
!wget https://heibox.uni-heidelberg.de/f/1981a35452db40c583c0/?dl=1
!mv index.html?dl=1 select-kat.tgz

In [None]:
!tar xvzf select-kat.tgz

In [None]:
import os

# Concatenate every file below ./select-kat into one corpus file, wrapping each
# file in <doc filename="..."> ... </doc>; the directory tree is printed as progress.
with open('georgian-corp-gip-v01.txt', 'w', encoding='utf-8') as FOut:
    for root, dirs, files in os.walk("./select-kat"):
        path = root.split(os.sep)
        print((len(path) - 1) * '---', os.path.basename(root))
        for file in files:
            print(len(path) * '---', file)
            FOut.write('<doc filename="' + file + '">\n')
            SFilePath = os.path.join(root,file)
            # close each input file right after reading instead of leaking one handle per file
            with open(SFilePath, 'r', encoding='utf-8') as FIn:
                SIn = FIn.read()
            FOut.write(SIn)
            FOut.write('</doc>\n\n')



In [None]:
!wc georgian-corp-gip-v01.txt

   44448   892632 19802190 georgian-corp-gip-v01.txt


In [None]:
!./treetagger/cmd/tree-tagger-georgian georgian-corp-gip-v01.txt >georgian-corp-gip-v01.vert

	reading parameters ...
	tagging ...
1086000	 finished.


In [None]:
!wc georgian-corp-gip-v01.vert

 1086727  3260172 40704147 georgian-corp-gip-v01.vert


In [None]:
!awk -F '\t' '($3=="<unknown>"){j++; printf "%s\n", $0} END{print j+0  > "/dev/stderr" }' <georgian-corp-gip-v01.vert >georgian-corp-gip-v01-unknown.txt

222297


In [None]:
ICorpusLen = 1086727
IUnknown = 222297
FCoverage = IUnknown/ICorpusLen
print('percent Unknown TT: ', FCoverage)

percent Unknown TT:  0.2045564341366323


In [None]:
# E: frequency of every distinct line of the GIP corpus unknown-token file
E = {}
with open("georgian-corp-gip-v01-unknown.txt", 'r', encoding='utf-8') as f:
    for line in f:
        line = line.rstrip()
        # explicit default instead of a bare except around the increment
        E[line] = E.get(line, 0) + 1

In [None]:
## checking coverage on georgianrandom02 corpus
# Walks the unknown items of the random corpus (D, most frequent first) and
# records whether each one also occurs among the GIP corpus unknowns (E).
# IUnknown and ICorpusLen are the values set in the GIP coverage cell above.
IRestUnknown = IUnknown
c = 0
CFound = 0
# `with` closes the file, so the table is completely written when the cell
# finishes (the handle was previously never flushed or closed)
with open("georgian-corp-gip-v01-unknown-from-georgianrandom02-frq.txt", 'w', encoding='utf-8') as fo:
    for key, val in sorted(D.items(), key=lambda item: item[1], reverse=True):
        c+=1
        BFound = key in E
        if BFound:
            val2 = E[key]
            CFound += 1
        else:
            val2 = 0

        IRestUnknown = IRestUnknown - val2
        FRestCoverage = IRestUnknown/ICorpusLen
        fo.write(str(c) + '\t' + key + '\t' + str(val) + '\t' + str(BFound) + '\t' + str(CFound) + '\t' + str(FRestCoverage) + '\n')



# Optional dataset (not used)
## Example: Universal Declaration of Human Rights (UDHR)

In [None]:
# Downloading UDHR
!wget https://unicode.org/udhr/assemblies/udhr_txt.zip


In [None]:
%%bash
mkdir udhr
mv udhr_txt.zip udhr
cd udhr/
unzip udhr_txt.zip

In [None]:
%%bash
# put paragraph tags
awk '{print "<p>\n"$0 ; print "</p>"}' /content/udhr/udhr_kat.txt >/content/udhr/udhr_kat_v02.txt