<a href="https://colab.research.google.com/github/iued-uni-heidelberg/DAAD-Training-2021/blob/main/Terminologieextraktion9EvaluationKeyWordsV02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Stage 0: Some useful read/write and convert functions

In [3]:
# import useful libraries, files

import re, os, sys

In [4]:
# file for recording results of different configurations
# run this only once
!rm AllTermExtractionResultsV01.txt
!rm AllTermExtractionResultsV02.txt

FOutResults1 = open('AllTermExtractionResultsV01.txt', 'a')
FOutResults2 = open('AllTermExtractionResultsV02.txt', 'a')
FOutResults1.write('Run\tW1A\tW1B\tW1C\tW1D\tW1P\tW1R\tW2A\tW2B\tW2C\tW2D\tW2P\tW2R\tW3A\tW3B\tW3C\tW3D\tW3P\tW3R\tW4A\tW4B\tW4C\tW4D\tW4P\tW4R\n')
FOutResults2.write('Run\tW1P\tW1R\tW2P\tW2R\tW3P\tW3R\tW4P\tW4R\n') # only precision and recall figures
FOutResults1.flush()
FOutResults2.flush()

rm: cannot remove 'AllTermExtractionResultsV01.txt': No such file or directory
rm: cannot remove 'AllTermExtractionResultsV02.txt': No such file or directory


In [5]:
## to modify if necessary; however, we try to keep the code standard and parametrize as much as possible

# useful functions
# a useful function for recording / visualising current stage of dictionaries
def printDictionary(DictionaryFrq, FOut, K = 1, Rev = True): # printing a dictionary: by values or alphabetically
    for Word, Frq in sorted( DictionaryFrq.items() , key=lambda x: x[K], reverse=Rev):
        FOut.write(Word + '\t' + str(Frq) + '\n')
    FOut.flush()
    return

# another useful function to just read and return a 2-field dictionary, eg., frequency or keyness
def readDictionary(FIN, SkipComments = True, Caps=False):
    DScoresLarge = {} # keywords - scores
    for Line in FIN:
        if SkipComments and re.match('#', Line): 
            continue
        Line = Line.strip()
        if Caps: 
            Line = Line.upper() # convert to upper case
        LFieldsKW = re.split('\t', Line)
        SWord = LFieldsKW[0]
        AKScore = float(LFieldsKW[1])
        DScoresLarge[SWord] = AKScore   
    return DScoresLarge

# another possibly useful function: convert dictionary values to ranks (frequency, keyness weights, etc.)
# for understanding how far down the list the item has been found...
# currently not used ... 
def rankDict(DIN):
    '''
    reading a frequency dictionary from a file
    '''
    DTermRanks = {}
    i = 0
    IRank = 0
    IPrevFrq = 0
    SumRanks = 0
    for SKey, Frq in DIN.items():
        # if re.match('#', SKey): continue # skipping comments
        i+=1
        if IPrevFrq != Frq: IRank = i # rank is the number of the highest ranking element of the same frequency group
        IPrevFrq = Frq
        
        DTermRanks[SKey] = IRank
        SumRanks += IRank

    AAveRank = SumRanks / i
    print(f'MaxRank = {IRank}\nAve Rank = {AAveRank}\n')
    return DTermRanks, AAveRank


# Main evaluation function
# One-directional comparision of dictionaries
# one-directional comparison of two dictionaries; arguments: DGoldStandard (smaller) DTest (larger), file: GS items found in DTest; GS items missing from DText...
# usually testing: smaller vs. bigger dictionaries
def countIntersectDictionaries(DGS, DTest, FOutputPrecFOUND, FOutputPrecMISSING, SortBy = 0, Rev = False):
    '''
    general function: intersect dictionaries, return new intersection dictionaries, record "in" and "out" expressions
    
    3b: intersecting All possible MWEs in GS list with the "Extracted" list
    DA (smaller and going over each element) with D1W / DMWE lists 
    '''

    print('Total len of Gold Standard: ' + str(len(DGS.items())))
    IFound = 0
    IMissing = 0
    SumFoundRanks = 0
    DFound = {} # intersection dictionary

    for Word, Frq in sorted(DGS.items(),  key=lambda x: x[SortBy], reverse=Rev):
        if Word in DTest:
            IFound += 1
            try: # normally will not fire: if this word already exists with some rank, calculate the average of a new and old rank
                r0 = DFound[Word]
                r1 = DTest[Word]
                r = (r0+r1)/2
                DFound[Word] = r
                print('r?')
            except: # normal route: find the rank of the word in the dictionary
                DFound[Word] = DTest[Word]

            SumFoundRanks += DTest[Word] # add rank, to calculate average
            try: FOutputPrecFOUND.write(Word + '\t' + str(Frq) + '\t' + str(DFound[Word]) + '\n') # record/calculate average rank, etc.
            except: 
                FOutputPrecFOUND.write(Word + '\t' + str(Frq) + '\t' + 'KEY ERROR' + '\n')
                print(Word + '\t' + str(Frq) + '\t' + 'KEY ERROR' + '\n')
        else:
            IMissing += 1
            FOutputPrecMISSING.write(Word + '\t' + str(Frq) + '\n') # record/calculate average rank, etc.

    
    # print(f'Found: {IFound}')
    # print(f'Missing: {IMissing}')
    try: ACoverage = IFound / len(DGS.items())
    except: ACoverage = 0
    # print(f'Found2LenGS: {ACoverage}')
    try: AAverageFoundRanks = SumFoundRanks / IFound
    except: AAverageFoundRanks = 0
    # print(f'Ave Found Ranks: {AAverageFoundRanks} \n')

    print(f'Found: {IFound} ; Missing: {IMissing} ; AveRank: {AAverageFoundRanks} ; ACoverage: {ACoverage} ')
    FOutputPrecFOUND.flush()
    FOutputPrecMISSING.flush()

    return ACoverage, AAverageFoundRanks, DFound



# extracting annotated terms from the gold standard in xml format
def vertCollectAnnotation(FInVert, SXmlTag):
    L3AnnotatedSegs = []
    L2Seg = [] # a list of the current segment -- eash string is added 
    BInTerm = False # boolean flag: inside / outside term
    RTagOpen = re.compile('<' + SXmlTag + '>')
    RTagClose = re.compile('</' + SXmlTag + '>')
    for SLine in FInVert:
        SLine = SLine.strip()
        if re.match(RTagOpen, SLine):
            BInTerm = True
        elif re.match(RTagClose, SLine):
            BInTerm = False
            L3AnnotatedSegs.append(L2Seg)
            L2Seg = []
        else:
            if BInTerm == True:
                LFields = re.split('\t', SLine)
                L2Seg.append(LFields)

    return L3AnnotatedSegs



def convertBrecket2Xml(FInAnnot, FOutAnnot, RInOpen, RInClose, SOutOpen, SOutClose):
    RCOpen = re.compile(RInOpen)
    RCClose = re.compile(RInClose)
    for SLine in FInAnnot:
        SLine.strip()
        SLine = re.sub(RCOpen, SOutOpen, SLine)
        SLine = re.sub(RCClose, SOutClose, SLine)

        FOutAnnot.write(SLine + '\n')
    FOutAnnot.flush()
    return
  

In [None]:
# read / generate all the necessary texts
# first define functions, then download files and read them into dictionaries...
## Annotated Gold Standard
# Stage 1: Preparing Gold standard: Reading / extracting information from gold standard: creating a list of annotated terms
# set 1 (same text annotated by two annotators)

!wget https://heibox.uni-heidelberg.de/f/ae1110c4f9ad42b9a3d5/?dl=1
!mv index.html?dl=1 BGH1_s00Astghik.txt
!wget https://heibox.uni-heidelberg.de/f/398e7a10fa3241519f26/?dl=1
!mv index.html?dl=1 BGH1_s00Maia.txt

# set 2 (same text annotated by two annotators)
!wget https://heibox.uni-heidelberg.de/f/0c787f26123f49178639/?dl=1
!mv index.html?dl=1 BGH2_s00Hayk.txt
!wget https://heibox.uni-heidelberg.de/f/356205b502fb4d759ad5/?dl=1
!mv index.html?dl=1 BGH2_s00Nino.txt

# set 3 (same text annotated by two annotators)
!wget https://heibox.uni-heidelberg.de/f/ed0c7af9a9d04967b449/?dl=1
!mv index.html?dl=1 BGH3_s00Tamar.txt
# !wget 
# !mv index.html?dl=1

# one more will be added: Frau Khachatryan
!cat BGH1_s00Astghik.txt BGH1_s00Maia.txt BGH2_s00Hayk.txt BGH2_s00Nino.txt BGH3_s00Tamar.txt >BGH0_s00GoldStandard.txt

FInBGH0_s00GoldStandard = open('BGH0_s00GoldStandard.txt', 'r')
FOutBGH0_s00GoldStandard = open('BGH0_s00GoldStandard_xml.txt', 'w')

convertBrecket2Xml(FInBGH0_s00GoldStandard, FOutBGH0_s00GoldStandard, '<<+', '>>+', '<TERM>', '</TERM>')
# this result is pos-tagged and uploaded in the next step
# command:
# tree-tagger-de.sh /Users/bogdan/Seafile/research/corpus/DAAD-corpus/daad-experiments/BGH0_s00GoldStandard_xml.txt >/Users/bogdan/Seafile/research/corpus/DAAD-corpus/daad-experiments/BGH0_s00GoldStandard_LEM.txt

# further versions of the Gold Standard: 
# Annotated and PoS-tagged Gold Standard -- for extraction of the correct evaluation set

# printing words of different length
FOutput = open('BGH0_s01GoldS_Terms.txt', 'w')
FOutputDict1w = open('BGH0_s01GoldS_D1w.txt', 'w') # 1-word terms
FOutputDict2w = open('BGH0_s01GoldS_D2w.txt', 'w') # 2-word terminological expressions
FOutputDict3w = open('BGH0_s01GoldS_D3w.txt', 'w') # 3-word terminological expressions
FOutputDict4w = open('BGH0_s01GoldS_DMWE.txt', 'w') # more than 3 words


In [None]:
# !wget https://heibox.uni-heidelberg.de/f/4e719e0466a143c0b1b5/?dl=1
!wget https://heibox.uni-heidelberg.de/f/d8f1bb53632d40538e0d/?dl=1
!mv index.html?dl=1 BGH0_s00GS_LEM.txt

In [8]:
# reading datasets
# list of gold-standard annotated terms, with lemmatization and pos fileds
FInBGH0_s00GS_LEM = open('BGH0_s00GS_LEM.txt', 'r')
# FOutBGH0_s00GS_Terms = open('BGH0_s00GS_Terms.txt', 'w')
L3AnnotatedSegs = vertCollectAnnotation(FInBGH0_s00GS_LEM, 'TERM')

# testing the file read
# for LSegment in L3AnnotatedSegs: FOutBGH0_s00GS_Terms.write(str(LSegment) + '\n')
# FOutBGH0_s00GS_Terms.flush()

In [11]:
L3AnnotatedSegs[10]

[['Oberstaatsanwalt', 'NN', 'Oberstaatsanwalt'],
 ['beim', 'APPRART', 'bei'],
 ['Bundesgerichtshof', 'NN', 'Bundesgerichtshof']]

In [14]:
def createDictOfPatterns(L3AnnotatedSegs, IFieldN, Normalize = False):
    '''
    take the list of annotated terms and return a dictionary of MWEs
    '''
    DPatternsFrq = {} # returned dictionary of PoS patterns, etc.
    for L2TermFlds in L3AnnotatedSegs:
        LFlds = [] # here we will collect the values of the selected field
        for LWordFlds in L2TermFlds:
            if Normalize:
                try: PoS = LWordFlds[IFieldN]
                except: 
                    print('PoS not found')
                    PoS = ''
                if re.match('N', PoS): LWordFlds[IFieldN] = 'N'
                if re.match('ADJ', PoS): LWordFlds[IFieldN] = 'ADJ'
                if re.match('V', PoS): LWordFlds[IFieldN] = 'V'
            try: LFlds.append(LWordFlds[IFieldN])
            except: print('index Error')
        if len(LFlds) > 0: SFlds = ' '.join(LFlds)
        try: DPatternsFrq[SFlds] += 1
        except: DPatternsFrq[SFlds] = 1

    return DPatternsFrq

DPatternsFrq = createDictOfPatterns(L3AnnotatedSegs, 1, Normalize = True)
FOutTermPOS = open('BGH0_s00GoldStandard_pos.txt', 'w')

printDictionary(DPatternsFrq, FOutTermPOS)

In [25]:
# L3AnnotatedSegs[10]
# FInput = open('BGH0_s00GoldStandard.txt', 'r')
# for statistical purposes - separately single and multiword terms

def comparePattern(L2TermFlds, LPattern, IFldN):
    '''
    compares if a pattern is found in the term field
    '''
    for k in range(len(LPattern)):
        if re.match(LPattern[k], L2TermFlds[k][IFldN]): continue
        else: return False
    return True


def selectTerms(L3AnnotatedSegs, L2Patterns = None, LNoEdge = None, LNoStart = None, L2NoEnd = None,  SplitLen = False, IFldNumber = 0):
    '''
    function: 1. selects terms which match specified POS pattern; 2. divides them into dictionaries according to length
    the function can also visualise terms with a specific pos pattern, specified in L2Patterns, e.g., L2Patterns = [['N', '\$']]

    '''
    DGS = {}
    DGS1w = {} # dictionary of single words
    DGS2w = {} # dictionary of 2-word expressions
    DGS3w = {} # dictionary of 3-word expressions
    DGS4w = {} # dictionary of other mwes
    IGS = 0
    IGS1w = 0 # number of annotated tokens of single words
    IGS2w = 0
    IGS3w = 0
    IGS4w = 0 # number of annotated tokens of multiwords

    if L2Patterns: # positive filter
        for L2AnnotatedSeg in L3AnnotatedSegs: # for each multiword term, where words are represented as fields
            ILenTerm = len(L2AnnotatedSeg)
            for LPattern in L2Patterns:
                if len(LPattern) == ILenTerm and comparePattern(L2AnnotatedSeg, LPattern, 1):
                    LTerm = []
                    for LTerm2Fields in L2AnnotatedSeg:
                        LTerm.append(LTerm2Fields[IFldNumber])
                    STerm = ' '.join(LTerm)
                    try: DGS[STerm] += 1
                    except: DGS[STerm] = 1

    else: # negative filter checking
        for L2AnnotatedSeg in L3AnnotatedSegs:
            if LNoEdge: # PoS which cannot apper at the edge
                for SPoS in LNoEdge:
                    if re.match(SPoS, L2AnnotatedSeg[-1][1]) or re.match(SPoS, L2AnnotatedSeg[0][1]): 
                        continue
            if LNoStart:
                for SPoS in LNoStart:
                    if re.match(re.match(SPoS, L2AnnotatedSeg[0][1])): 
                        continue
            if L2NoEnd:
                for SPoS in L2NoEnd:
                    if re.match(re.match(SPoS, L2AnnotatedSeg[-1][1])): 
                        continue
            LTerm = []
            for LTerm2Fields in L2AnnotatedSeg:
                LTerm.append(LTerm2Fields[IFldNumber])
            STerm = ' '.join(LTerm)
            try: DGS[STerm] += 1
            except: DGS[STerm] = 1
                         

    return DGS

# DTermConfFrq = selectTerms(L3AnnotatedSegs, L2Patterns = [['N', '\$']], SplitLen = False, IFldNumber = 0)
# FOutExamples = open('BGH0_s00GoldStandard_examples_ADJ_N.txt', 'w')

DTermConfFrq = selectTerms(L3AnnotatedSegs, L2Patterns = None, LNoEdge = None, LNoStart = None, L2NoEnd = ['ADJ'], SplitLen = False, IFldNumber = 0)
FOutExamples = open('BGH0_s00GoldStandard_examples.txt', 'w')

printDictionary(DTermConfFrq, FOutExamples)


UnboundLocalError: ignored