<a href="https://colab.research.google.com/github/iued-uni-heidelberg/DAAD-Training-2021/blob/main/Terminologieextraktion6EvaluationKeyWordsV01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Terminology extraction with keywords, association measures, etc.

Here we enrich the extracted MWEs with keyness information (and other parameters) and test Precision / Recall.

## Preparing gold standard annotation dictionaries

In [38]:
# Stage 0: Some useful read/write and convert functions

In [None]:
# import useful libraries
import re, os, sys

In [None]:
# a useful function for recording / visualising current stage of dictionaries
def printDictionary(DictionaryFrq, FOut, K = 1, Rev = True): # printing a dictionary: by values or alphabetically
    '''
    Write each (key, value) pair of DictionaryFrq to FOut as one "key<TAB>value" line.

    K selects the field used for sorting (0 = key, 1 = value);
    Rev = True sorts in descending order of that field.
    '''
    LSortedItems = sorted(DictionaryFrq.items(), key=lambda Item: Item[K], reverse=Rev)
    for Entry, Value in LSortedItems:
        FOut.write('\t'.join((Entry, str(Value))) + '\n')

In [36]:
# another useful function to just read and return a 2-field dictionary, eg., frequency or keyness
def readDictionary(FIN, SkipComments = True, Caps=False):
    '''
    Read a two-field, tab-separated dictionary (e.g. frequency or keyness list).

    FIN: iterable of text lines of the form "word<TAB>score"; further fields are ignored.
    SkipComments: if True, lines starting with '#' are skipped.
    Caps: if True, the word is converted to upper case before it is stored.

    Returns a dict word -> float(score); if a word occurs several times the last score wins.
    Blank lines are skipped. A non-blank line without a numeric second field raises
    IndexError / ValueError.
    '''
    DScoresLarge = {} # keywords - scores
    for Line in FIN:
        Line = Line.strip()
        if not Line: continue # blank line, e.g. at the end of the file
        if SkipComments and Line.startswith('#'): continue
        LFieldsKW = Line.split('\t')
        SWord = LFieldsKW[0]
        # str.upper() returns a new string, so the result has to be assigned;
        # only the word is upper-cased, the score field is left untouched
        if Caps: SWord = SWord.upper()
        DScoresLarge[SWord] = float(LFieldsKW[1])
    return DScoresLarge

In [37]:
# another possibly useful function: convert dictionary values to ranks (frequency, keyness weights, etc.)
# for understanding how far down the list the item has been found...
def rankDict(DIN):
    '''
    Convert the values of a dictionary (frequencies, keyness weights, ...) to ranks.

    Items are ranked in the iteration order of DIN, i.e. DIN is expected to be ordered
    by value already. An item with the same value as its predecessor shares the
    predecessor's rank, which is the position of the first item of that group (1, 1, 3, ...).

    Returns a tuple (DTermRanks, AAveRank): the key -> rank dictionary and the average
    rank (0 for an empty dictionary). MaxRank and the average rank are also printed.
    '''
    DTermRanks = {}
    IRank = 0
    SumRanks = 0
    # sentinel that differs from every real value, so the first item always starts
    # rank 1 (a numeric start value of 0 would give rank 0 to a first item with value 0)
    PrevFrq = object()
    for i, (SKey, Frq) in enumerate(DIN.items(), start=1):
        if PrevFrq != Frq: IRank = i # rank is the number of the highest ranking element of the same frequency group
        PrevFrq = Frq

        DTermRanks[SKey] = IRank
        SumRanks += IRank

    # guard against an empty input dictionary (no division by zero)
    AAveRank = SumRanks / len(DTermRanks) if DTermRanks else 0
    print(f'MaxRank = {IRank}\nAve Rank = {AAveRank}\n')
    return DTermRanks, AAveRank

In [35]:
# Main evaluation function
# One-directional comparison of two dictionaries.
# Arguments: DGS (gold standard, usually the smaller dictionary), DTest (usually the larger dictionary),
# a file for the GS items found in DTest and a file for the GS items missing from DTest.
def countIntersectDictionaries(DGS, DTest, FOutputPrecFOUND, FOutputPrecMISSING, SortBy = 0, Rev = False):
    '''
    One-directional intersection of two dictionaries.

    Goes over every key of DGS (sorted by key if SortBy = 0, by value if SortBy = 1;
    Rev = True reverses the order) and looks it up in DTest.

    DGS: dictionary whose keys are checked (key -> frequency).
    DTest: dictionary in which the keys are looked up (key -> numeric value, e.g. rank).
    FOutputPrecFOUND: writable file; receives "key<TAB>DGS value<TAB>DTest value" for keys found.
    FOutputPrecMISSING: writable file; receives "key<TAB>DGS value" for keys not found.

    Returns (ACoverage, AAverageFoundRanks, DFound):
        ACoverage: number of found keys / len(DGS) (0 if DGS is empty),
        AAverageFoundRanks: mean DTest value of the found keys (0 if nothing was found),
        DFound: intersection dictionary key -> DTest value.
    The counts and both ratios are also printed.
    '''
    ITotal = len(DGS)
    print('Total len of Gold Standard: ' + str(ITotal))
    IFound = 0
    IMissing = 0
    SumFoundRanks = 0
    DFound = {} # intersection dictionary

    for Word, Frq in sorted(DGS.items(),  key=lambda x: x[SortBy], reverse=Rev):
        if Word in DTest:
            IFound += 1
            # keys of DGS are unique, so each word is stored exactly once
            Rank = DTest[Word]
            DFound[Word] = Rank
            SumFoundRanks += Rank # add rank, to calculate average
            # the value comes from the dictionary passed in as DTest, not from a global
            FOutputPrecFOUND.write(Word + '\t' + str(Frq) + '\t' + str(Rank) + '\n')
        else:
            IMissing += 1
            FOutputPrecMISSING.write(Word + '\t' + str(Frq) + '\n')

    print(f'Found: {IFound}')
    print(f'Missing: {IMissing}')
    ACoverage = IFound / ITotal if ITotal else 0
    print(f'Found2LenGS: {ACoverage}')
    AAverageFoundRanks = SumFoundRanks / IFound if IFound else 0
    print(f'Ave Found Ranks: {AAverageFoundRanks} \n')

    return ACoverage, AAverageFoundRanks, DFound


In [None]:
# Stage 1: Preparing Gold standard: Reading / extracting information from gold standard: creating a list of annotated terms
# set 1 (same text annotated by two annotators)
!wget https://heibox.uni-heidelberg.de/f/ae1110c4f9ad42b9a3d5/?dl=1
!mv index.html?dl=1 BGH1_s00Astghik.txt
!wget https://heibox.uni-heidelberg.de/f/398e7a10fa3241519f26/?dl=1
!mv index.html?dl=1 BGH1_s00Maia.txt

# set 2 (same text annotated by two annotators)
!wget https://heibox.uni-heidelberg.de/f/0c787f26123f49178639/?dl=1
!mv index.html?dl=1 BGH2_s00Hayk.txt
!wget https://heibox.uni-heidelberg.de/f/356205b502fb4d759ad5/?dl=1
!mv index.html?dl=1 BGH2_s00Nino.txt

# set 3 (same text annotated by two annotators)
!wget https://heibox.uni-heidelberg.de/f/ed0c7af9a9d04967b449/?dl=1
!mv index.html?dl=1 BGH3_s00Tamar.txt
# !wget 
# !mv index.html?dl=1 

In [None]:
# Concatenate the five downloaded annotation files into one gold-standard file.
# one more will be added: Frau Khachatryan
!cat BGH1_s00Astghik.txt BGH1_s00Maia.txt BGH2_s00Hayk.txt BGH2_s00Nino.txt BGH3_s00Tamar.txt >BGH0_s00GoldStandard.txt

In [None]:
# Explicit UTF-8, so that reading / writing does not depend on the platform default
# encoding (the terms contain non-ASCII characters such as the quotation marks „ and “).
# NOTE(review): assumes the downloaded annotation files are UTF-8 encoded - confirm.
FInput = open('BGH0_s00GoldStandard.txt', 'r', encoding='utf-8')
FOutput = open('BGH0_s01GoldS_Terms.txt', 'w', encoding='utf-8')
# for statistical purposes - separately single and multiword terms
FOutputDict1w = open('BGH0_s01GoldS_D1w.txt', 'w', encoding='utf-8') # 1-word terms
FOutputDict2w = open('BGH0_s01GoldS_D2w.txt', 'w', encoding='utf-8') # 2-word terminological expressions
FOutputDict3w = open('BGH0_s01GoldS_D3w.txt', 'w', encoding='utf-8') # 3-word terminological expressions
FOutputDictMWE = open('BGH0_s01GoldS_DMWE.txt', 'w', encoding='utf-8') # more than 3 words

In [None]:
# creating gold-standard dictionaries for evaluation tasks:
# terms are annotated in the text as <<term>>; every annotated term is normalised
# (surrounding punctuation removed, upper case, single spaces between words) and counted
# in one of four dictionaries according to its length in words.
# import re, os, sys
LGSTerms = [] # gold standard terms
DGS1w = {} # dictionary of single words
DGS2w = {} # dictionary of 2-word expressions
DGS3w = {} # dictionary of 3-word expressions
DGSMWE = {} # dictionary of other mwes
IGS1w = 0 # number of annotated tokens of single words
IGS2w = 0
IGS3w = 0
IGSMWE = 0 # number of annotated tokens of multiwords
for SLine in FInput:
    LGSTerms.extend(re.findall('<<([^><]+)>>', SLine))

for GSTerm in LGSTerms:
    # remove whitespace and quotes / brackets / full stops around the term in one step:
    # stripping them one after the other leaves e.g. the blank of '(term )' behind,
    # which would then be counted as an additional (empty) word
    GSTerm = re.sub(r'^[\s„“"().]+|[\s„“"().]+$', '', GSTerm)

    # everything is converted to upper case for quick dictionary lookup
    GSTerm = GSTerm.upper()

    # split on any run of whitespace and re-join with single blanks
    LTermWords = GSTerm.split()
    if not LTermWords: continue # nothing left after cleaning: not a term
    GSTerm = ' '.join(LTermWords)

    if len(LTermWords) > 3:
        IGSMWE += 1
        DGSMWE[GSTerm] = DGSMWE.get(GSTerm, 0) + 1
    elif len(LTermWords) > 2:
        IGS3w += 1
        DGS3w[GSTerm] = DGS3w.get(GSTerm, 0) + 1
    elif len(LTermWords) > 1:
        IGS2w += 1
        DGS2w[GSTerm] = DGS2w.get(GSTerm, 0) + 1
    else:
        IGS1w += 1
        DGS1w[GSTerm] = DGS1w.get(GSTerm, 0) + 1

    FOutput.write(GSTerm + '\n')

# each dictionary file: a header with the token count, then the entries sorted by
# descending frequency; close() also flushes the file
for FOutDict, ITokens, DTerms in (
        (FOutputDictMWE, IGSMWE, DGSMWE),
        (FOutputDict3w, IGS3w, DGS3w),
        (FOutputDict2w, IGS2w, DGS2w),
        (FOutputDict1w, IGS1w, DGS1w)):
    FOutDict.write('# Number of tokens: ' + str(ITokens) + '\n')
    printDictionary(DTerms, FOutDict)
    FOutDict.close()

FOutput.close()

FInput.close()

## Preparing the 'keyness' dictionary

In [None]:
# Stage 2: preparing keyness dictionary
!wget https://heibox.uni-heidelberg.de/f/aa4560e627bd4b1d8055/?dl=1
!mv index.html?dl=1 TK_KW_Verif_V02.csv

!wget https://heibox.uni-heidelberg.de/f/a83ba95576a244a59966/?dl=1
!mv index.html?dl=1 KW_BGH_10000.tsv

In [None]:
# Preparing a dictionary of keyness weights, checking the 'approval' status
# Explicit UTF-8, so that reading / writing does not depend on the platform default encoding.
# NOTE(review): assumes the downloaded keyword files are UTF-8 encoded - confirm.
FInputKW = open('TK_KW_Verif_V02.csv', 'r', encoding='utf-8')
FInputKWLarge = open('KW_BGH_10000.tsv', 'r', encoding='utf-8') # for experiments with Precision / Recall
FOutputKW = open('TK_KW_Verif_V02.txt', 'w', encoding='utf-8')
# FOutputGSKWS1w = open('BGH1_s01GoldSKW_D1w.txt', 'w')
# FOutputGSKWS2w = open('BGH1_s01GoldSKW_D2w.txt', 'w')
# FOutputGSKWS3w = open('BGH1_s01GoldSKW_D3w.txt', 'w')
# FOutputGSKWSMWE = open('BGH1_s01GoldSKW_MWE.txt', 'w')

In [None]:
# Reading the verified keyword list: tab-separated lines in which field 1 is the word,
# field 2 its keyness score and field 3 its status (read as a number).
DScoresKW = {} # keywords - scores
DScoresNK = {} # non-keywords
DStatKW = {} # status: key/non-key-word
for Line in FInputKW:
    # remove only the line end: stripping the whole line could drop an empty first
    # field and shift the field numbers
    Line = Line.rstrip('\r\n')
    if not Line.strip(): continue # blank line, e.g. at the end of the file
    LFieldsKW = Line.split('\t')
    SWord = LFieldsKW[1]
    AKScore = float(LFieldsKW[2])
    AKStat = float(LFieldsKW[3])
    DStatKW[SWord] = AKStat
    if AKStat > 0: # change value to 0.5 if we need to restrict to 'sure' terms only (value 1)
        DScoresKW[SWord] = AKScore
    else:
        DScoresNK[SWord] = AKScore

FInputKW.close() # the file has been read completely

In [None]:
# reading the large keyword dictionary (word -> score); Caps=True asks readDictionary
# for upper-case keys
DScoresKWLarge = readDictionary(FInputKWLarge, Caps=True)
FInputKWLarge.close() # the file has been read completely

In [None]:
# Upper-case copy of DScoresKW: same scores, but every key converted to upper case,
# so that it can be looked up with the (also upper-cased) term candidates.
# Alternatives not considered at this stage: additional lower-case or sentence-case
# keys, or case-insensitive regular expressions per keyword.
DScoresREKWquick = {kw.upper(): val for kw, val in DScoresKW.items()}

In [None]:
# write the upper-cased keyness dictionary, sorted by descending score
printDictionary(DScoresREKWquick, FOutputKW)
FOutputKW.close() # closing flushes the buffer, so the file is complete on disk

## Preparing a dictionary of automatically extracted terms using PoS configurations vs. key-word based enhancements and re-orderings of this list

In [None]:
# Stage 3: Reading a file with extracted terms; capitalizing everything...
# Reading test data - Possible Terms (extracted automatically): reading the text files of single and multiword terms, recording ranks
# single words candidates
#
# Warning: these files are 8 and 70 MB respectively (relatively large to view on-line)
!wget https://heibox.uni-heidelberg.de/f/a9171080790f4932b7b1/?dl=1
!mv index.html?dl=1 BGH0_s02term1w.txt

# multiword candidates
!wget https://heibox.uni-heidelberg.de/f/2488701205e34e4683b1/?dl=1
!mv index.html?dl=1 BGH0_s02termMWE.txt

In [None]:
# Explicit UTF-8, so that reading does not depend on the platform default encoding.
# NOTE(review): assumes the downloaded term lists are UTF-8 encoded - confirm.
FAutoTerms1w = open('BGH0_s02term1w.txt', 'r', encoding='utf-8')
FAutoTermsMWE = open('BGH0_s02termMWE.txt', 'r', encoding='utf-8')

In [None]:
# ... here we will add functions for reading this dictionary (e.g., as ranked list, etc.)
# term -> score dictionaries of the automatically extracted candidates
DAutoTerms1w = readDictionary(FAutoTerms1w, Caps=True)
DAutoTermsMWE = readDictionary(FAutoTermsMWE, Caps=True)
# both files have been read completely
FAutoTerms1w.close()
FAutoTermsMWE.close()

In [None]:
# convert values to ranks
# rankDict returns a tuple (rank dictionary, average rank): unpack it, so that
# DAutoTermsR1w / DAutoTermsRMWE are the term -> rank dictionaries that are later
# passed to countIntersectDictionaries
DAutoTermsR1w, AAveRank1w = rankDict(DAutoTerms1w)
DAutoTermsRMWE, AAveRankMWE = rankDict(DAutoTermsMWE)

In [None]:
# Stage 3.1 combine extracted words with keyness (e.g., filter by keyness, etc.)
# here we will add / combine information about keyness...
# to be implemented...

We will check:

- how terminology extraction works for Precision and Recall (intersecting the Gold Standard and extracted terms); 

- how high is the rank of the terms in the extracted list, etc...

In [None]:
# Stage04: preparing data for calculating precision and recall on the space of all possible MWEs, 1, 2, 3 words; (overlapping)
# keeping only 1 version of the text (2 annotators annotated the same text twice to measure interannotator agreement):
# one annotation file per set is concatenated into the single-version gold standard
!cat BGH1_s00Astghik.txt BGH2_s00Hayk.txt BGH3_s00Tamar.txt >BGH0_s03GoldStandard1Version.txt


The function `countIntersectDictionaries` (defined in Stage 0) will be used for measuring P and R:
it performs a one-way comparison of dictionaries.

In [None]:
# tokenizing gold standard
#
# The idea is to tokenise the gold standard (from Stage 0), and to generate all possible MWEs for each string / paragraph
#     then we can test what is the coverage (non-overlapping) or precision (overlapping)
#     or: we create a dictionary of potential single and MWE strings and check what has been identified ?
#     or: comparing with 'oracle': known annotations are run as a point of comparison on the space; and we establish relations, i.e., the amount of over-generation
#
#     tasks:
#         4a: create the "all possible strings" space from gold standard text
#         4b: intersect 4a results with corpus list of extracted MWEs >> generate "extracted from gold standard" dictionary
#         4c: intersect human annotation in gold standard with 3a >> generate "correct in gold standard" dictionary
#         4d: intersect 4b and 4c, >> correctly extracted
#         4e: calculate 4d/4b = precision
#             calculate 4d/4c = recall
#
# 3a: processing gold standard: tokenizing
import re, os, sys
LLParTokens = [] # List of paragraphs, each represented as a list of tokens
# 'with' closes the file even if an error occurs while reading; explicit UTF-8, so that
# reading does not depend on the platform default encoding
# NOTE(review): assumes the gold-standard file is UTF-8 encoded - confirm.
with open('BGH0_s03GoldStandard1Version.txt', 'r', encoding='utf-8') as FInputGS1V:
    for SLine in FInputGS1V:
        SLine = SLine.strip()

        # remove annotation (the << >> markers) and collapse runs of blanks
        SLine = re.sub('[<>]+', ' ', SLine)
        SLine = re.sub(' +', ' ', SLine)

        SLine = SLine.upper() # capitalize all words

        # tokens are either runs of word characters / apostrophes or single punctuation
        # marks; every match is at least one character long, so no empty tokens occur
        LLine = re.findall(r"[\w']+|[.,!?;()\-„“\"]", SLine)
        LLParTokens.append(LLine)


In [None]:
# show the token list of one paragraph (index 9) as a quick check of the tokenisation
print(LLParTokens[9])

In [None]:
# Stage 4A
# generating candidate MWEs for cheking if / when they have been identified as terms
# algorithm from Terminologieextraktion3 notebook
# 4a: creating space of all possible overlapping MWEs in gold standard

def tokens2candNGrams(LWords, N): # working with specific N-gram size, to keep number of candidates under control
    '''
    Return all contiguous token sequences of LWords with a length of 1 to N tokens.

    The result is a list of token lists: first all unigrams from left to right, then all
    bigrams, and so on up to length N (intended to be called once per paragraph).
    '''
    ITokens = len(LWords)
    return [
        LWords[IStart : IStart + ISize]
        for ISize in range(1, N + 1)               # candidate length in tokens
        for IStart in range(ITokens - ISize + 1)   # every start position that still fits
    ]


# LLCandidates = tokens2candNGrams(['this', 'is', 'a', 'test', 'of', 'the', 'function'], 4)
# for L in LLCandidates:
#    print(str(L))

#    ''' # full version; now abandoned...
#    for klen in range(len(LWords)): # lengths of candidate lists
#        klength = klen+1 # true length: for 0 it is le = 1
#        # print(f'klen:{klength};')
#        for i in range(len(LWords) - klen): # positions where candidates start
#            # print(f'i:{i};')
#            LCandidate = LWords[i:i+klength]
#            SCandidate = ' '.join(LCandidate)
#            LLCandidates.append(LCandidate)
#            LSCandidates.append(SCandidate)
#        
#    return LLCandidates, LSCandidates
#    '''

In [None]:
# Frequency dictionaries of all candidate n-grams of the gold standard text (to be tested),
# one dictionary per candidate length
DA_1W = {} # 1-word candidates
DA_2W = {} # 2-word candidates
DA_3W = {} # 3-word candidates
DA_MWE = {} # candidates of more than 3 words

# target dictionary by candidate length; index 0 maps to the 1-word dictionary as well,
# lengths above 3 go to DA_MWE
LDictBySize = [DA_1W, DA_1W, DA_2W, DA_3W]

for LTokens in LLParTokens: # for each paragraph
    LLCandidates = tokens2candNGrams(LTokens, 4)
    for LCandidate in LLCandidates:
        SCandidate = ' '.join(LCandidate)
        ISize = len(LCandidate)
        DTarget = LDictBySize[ISize] if ISize < 4 else DA_MWE
        DTarget[SCandidate] = DTarget.get(SCandidate, 0) + 1


In [None]:
# output files for the candidate dictionaries of Stage 4A; explicit UTF-8, so that
# writing non-ASCII candidates does not depend on the platform default encoding
FOutputA1w = open('BGH0_s04A_1w_res.txt', 'w', encoding='utf-8')
FOutputA2w = open('BGH0_s04A_2w_res.txt', 'w', encoding='utf-8')
FOutputA3w = open('BGH0_s04A_3w_res.txt', 'w', encoding='utf-8')
FOutputAMWE = open('BGH0_s04A_MWE_res.txt', 'w', encoding='utf-8')

In [None]:
# write every candidate dictionary (sorted by descending frequency) and close its file:
# closing flushes the buffer, so the files are complete on disk
for DCandidates, FOutCandidates in (
        (DA_1W, FOutputA1w),
        (DA_2W, FOutputA2w),
        (DA_3W, FOutputA3w),
        (DA_MWE, FOutputAMWE)):
    printDictionary(DCandidates, FOutCandidates, Rev = True)
    FOutCandidates.close()

In [None]:
# Stage 4B 
# creating output files for B: per candidate length one file for the candidates found
# in the extracted list (resY) and one for the candidates missing from it (resN);
# explicit UTF-8, so that writing does not depend on the platform default encoding
FOutputFOUND1wBinA = open('BGH0_s04BinA1w_resY.txt', 'w', encoding='utf-8')
FOutputMISSING1wBinA = open('BGH0_s04BinA1w_resN.txt', 'w', encoding='utf-8')

FOutputFOUND2wBinA = open('BGH0_s04BinA2w_resY.txt', 'w', encoding='utf-8')
FOutputMISSING2wBinA = open('BGH0_s04BinA2w_resN.txt', 'w', encoding='utf-8')

FOutputFOUND3wBinA = open('BGH0_s04BinA3w_resY.txt', 'w', encoding='utf-8')
FOutputMISSING3wBinA = open('BGH0_s04BinA3w_resN.txt', 'w', encoding='utf-8')

FOutputFOUNDMWEsBinA = open('BGH0_s04BinAMWE_resY.txt', 'w', encoding='utf-8')
FOutputMISSINGMWEsBinA = open('BGH0_s04BinAMWE_resN.txt', 'w', encoding='utf-8')

In [None]:
# Stage 4B preparing B-set for calculating performance
# Each candidate dictionary of the gold standard text is intersected with the extracted
# term list; every run writes to the FOUND / MISSING files of its own candidate length
# (the 2-word and 3-word runs must not write into the 1-word files).
AFound1wBinA, AAverageFoundRanks1wBinA, DB_1W = countIntersectDictionaries(DA_1W, DAutoTermsR1w, FOutputFOUND1wBinA, FOutputMISSING1wBinA, SortBy = 0, Rev = False)
AFound2wBinA, AAverageFoundRanks2wBinA, DB_2W = countIntersectDictionaries(DA_2W, DAutoTermsRMWE, FOutputFOUND2wBinA, FOutputMISSING2wBinA, SortBy = 0, Rev = False)
AFound3wBinA, AAverageFoundRanks3wBinA, DB_3W = countIntersectDictionaries(DA_3W, DAutoTermsRMWE, FOutputFOUND3wBinA, FOutputMISSING3wBinA, SortBy = 0, Rev = False)
AFoundMWEBinA, AAverageFoundRanksMWEBinA, DB_MWE = countIntersectDictionaries(DA_MWE, DAutoTermsRMWE, FOutputFOUNDMWEsBinA, FOutputMISSINGMWEsBinA, SortBy = 0, Rev = False)

# closing flushes the buffers, so all result files are complete on disk
for FResult in (FOutputFOUND1wBinA, FOutputMISSING1wBinA,
                FOutputFOUND2wBinA, FOutputMISSING2wBinA,
                FOutputFOUND3wBinA, FOutputMISSING3wBinA,
                FOutputFOUNDMWEsBinA, FOutputMISSINGMWEsBinA):
    FResult.close()
