<a href="https://colab.research.google.com/github/iued-uni-heidelberg/DAAD-Training-2021/blob/main/Terminologieextraktion7EvaluationKeyWordsV01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Terminology extraction with keywords, association measures, etc.

Here we enrich the extracted MWEs with keyness information (and other parameters) and test Precision / Recall.

## Preparing gold standard annotation dictionaries

In [1]:
# Stage 0: Some useful read/write and convert functions

In [2]:
# import useful libraries
import re, os, sys

In [50]:
# file for recording results of different configurations
!rm AllTermExtractionResultsV01.txt
!rm AllTermExtractionResultsV02.txt

In [51]:
# Result logs shared by all runs: V01 gets the full table (columns A-D plus P and R
# for each n-gram length W1..W4), V02 only the precision / recall columns.
SHeaderAll = 'Run\tW1A\tW1B\tW1C\tW1D\tW1P\tW1R\tW2A\tW2B\tW2C\tW2D\tW2P\tW2R\tW3A\tW3B\tW3C\tW3D\tW3P\tW3R\tW4A\tW4B\tW4C\tW4D\tW4P\tW4R\n'
SHeaderPR = 'Run\tW1P\tW1R\tW2P\tW2R\tW3P\tW3R\tW4P\tW4R\n' # only precision and recall figures

# Append mode: the handles stay open so that later cells can add one row per run.
FOutResults1 = open('AllTermExtractionResultsV01.txt', 'a')
FOutResults1.write(SHeaderAll)
FOutResults1.flush()

FOutResults2 = open('AllTermExtractionResultsV02.txt', 'a')
FOutResults2.write(SHeaderPR)
FOutResults2.flush()

In [5]:
# a useful function for recording / visualising current stage of dictionaries
def printDictionary(DictionaryFrq, FOut, K = 1, Rev = True): # printing a dictionary: by values or alphabetically
    '''
    Write a dictionary to FOut as tab-separated "key<TAB>value" lines.

    DictionaryFrq -- mapping to dump
    FOut          -- writable text stream (open file, io.StringIO, ...)
    K             -- position in the (key, value) pair used for sorting:
                     0 = by key, 1 = by value (default)
    Rev           -- passed to sorted(reverse=...); True (default) = descending

    Returns None. The stream is flushed but left open, so the caller still
    decides when to close it.
    '''
    for Word, Frq in sorted(DictionaryFrq.items(), key=lambda x: x[K], reverse=Rev):
        # f-string instead of '+' so that non-string keys do not raise TypeError
        FOut.write(f'{Word}\t{Frq}\n')

    # Without a flush the tail of the output stays in the write buffer until the
    # handle is closed; callers that never close would leave a truncated file.
    Flush = getattr(FOut, 'flush', None)
    if callable(Flush):
        Flush()
    return

In [6]:
# another useful function to just read and return a 2-field dictionary, eg., frequency or keyness
def readDictionary(FIN, SkipComments = True, Caps=False):
    '''
    Read a two-column, tab-separated dictionary (e.g. frequency or keyness list).

    FIN          -- iterable of text lines (typically an open file)
    SkipComments -- if True, lines starting with '#' are ignored
    Caps         -- if True, the whole line is upper-cased before parsing,
                    so the returned keys are upper case

    Returns a dict {first field: float(second field)}; further fields are ignored
    and a repeated key keeps the value of its last occurrence.
    Blank lines are skipped.

    Raises IndexError for a non-blank line without a tab and ValueError when the
    second field is not a number.
    '''
    DScoresLarge = {} # keywords - scores
    for Line in FIN:
        if SkipComments and Line.startswith('#'):
            continue
        Line = Line.strip()
        if not Line:
            # blank line (e.g. trailing newline at the end of the file): nothing to parse
            continue
        if Caps:
            Line = Line.upper() # convert to upper case
        LFieldsKW = Line.split('\t')
        SWord = LFieldsKW[0]
        AKScore = float(LFieldsKW[1])
        DScoresLarge[SWord] = AKScore
    return DScoresLarge

In [7]:
# another possibly useful function: convert dictionary values to ranks (frequency, keyness weights, etc.)
# for understanding how far down the list the item has been found...
def rankDict(DIN):
    '''
    Convert the values of a dictionary (frequencies, keyness weights, ...) to ranks.

    The items are ranked in the iteration order of DIN -- nothing is sorted here,
    so DIN is expected to be ordered already (e.g. read from a file sorted by
    descending frequency). Consecutive items with the same value share one rank:
    the 1-based position of the first item of that run.

    Returns (DTermRanks, AAveRank): the {key: rank} dictionary and the mean of
    all assigned ranks (0.0 for an empty dictionary). Also prints the last
    assigned rank and the average rank.
    '''
    DTermRanks = {}
    IRank = 0
    PrevFrq = None
    SumRanks = 0
    for i, (SKey, Frq) in enumerate(DIN.items(), start=1):
        # rank is the number of the highest ranking element of the same frequency group;
        # the explicit i == 1 test makes the first item rank 1 whatever its value is
        # (comparing against a 0 sentinel left a leading value of 0 at rank 0)
        if i == 1 or Frq != PrevFrq:
            IRank = i
        PrevFrq = Frq

        DTermRanks[SKey] = IRank
        SumRanks += IRank

    # an empty dictionary has no ranks to average
    AAveRank = SumRanks / len(DTermRanks) if DTermRanks else 0.0
    print(f'MaxRank = {IRank}\nAve Rank = {AAveRank}\n')
    return DTermRanks, AAveRank

In [8]:
# Main evaluation function
# One-directional comparison of two dictionaries.
# Arguments: DGS (the dictionary whose items are looked up; usually the smaller one), DTest (the dictionary they are looked up in; usually the larger one),
# a file for the DGS items found in DTest and a file for the DGS items missing from DTest.
# usually testing: smaller vs. bigger dictionaries
def countIntersectDictionaries(DGS, DTest, FOutputPrecFOUND, FOutputPrecMISSING, SortBy = 0, Rev = False):
    '''
    One-directional intersection: look up every key of DGS in DTest.

    DGS                -- dictionary whose keys are looked up
    DTest              -- dictionary the keys are looked up in; its values must be
                          numeric (they are summed for the average)
    FOutputPrecFOUND   -- text stream; receives "key<TAB>DGS value<TAB>DTest value"
                          for every key of DGS that is present in DTest
    FOutputPrecMISSING -- text stream; receives "key<TAB>DGS value" for every key
                          of DGS that is absent from DTest
    SortBy, Rev        -- order in which DGS is traversed and reported:
                          SortBy 0 = by key, 1 = by value; Rev = descending

    Returns (ACoverage, AAverageFoundRanks, DFound):
        ACoverage          -- found keys / len(DGS)  (0 when DGS is empty)
        AAverageFoundRanks -- mean DTest value of the found keys (0 when none found)
        DFound             -- {key: DTest[key]} for all found keys

    Prints the counts and both ratios; both streams are flushed, not closed.
    '''

    IGSSize = len(DGS)
    print('Total len of Gold Standard: ' + str(IGSSize))
    IFound = 0
    IMissing = 0
    SumFoundRanks = 0
    DFound = {} # intersection dictionary

    for Word, Frq in sorted(DGS.items(), key=lambda x: x[SortBy], reverse=Rev):
        if Word in DTest:
            IFound += 1
            # dictionary keys are unique, so each word is met exactly once:
            # no averaging with an earlier entry is ever needed
            AValue = DTest[Word]
            DFound[Word] = AValue
            SumFoundRanks += AValue # add rank, to calculate average
            FOutputPrecFOUND.write(Word + '\t' + str(Frq) + '\t' + str(AValue) + '\n')
        else:
            IMissing += 1
            FOutputPrecMISSING.write(Word + '\t' + str(Frq) + '\n')

    print(f'Found: {IFound}')
    print(f'Missing: {IMissing}')
    # explicit zero checks instead of a bare except around the divisions
    ACoverage = IFound / IGSSize if IGSSize else 0
    print(f'Found2LenGS: {ACoverage}')
    AAverageFoundRanks = SumFoundRanks / IFound if IFound else 0
    print(f'Ave Found Ranks: {AAverageFoundRanks} \n')

    FOutputPrecFOUND.flush()
    FOutputPrecMISSING.flush()

    return ACoverage, AAverageFoundRanks, DFound


In [None]:
# Stage 1: Preparing Gold standard: Reading / extracting information from gold standard: creating a list of annotated terms
# set 1 (same text annotated by two annotators)
!wget https://heibox.uni-heidelberg.de/f/ae1110c4f9ad42b9a3d5/?dl=1
!mv index.html?dl=1 BGH1_s00Astghik.txt
!wget https://heibox.uni-heidelberg.de/f/398e7a10fa3241519f26/?dl=1
!mv index.html?dl=1 BGH1_s00Maia.txt

# set 2 (same text annotated by two annotators)
!wget https://heibox.uni-heidelberg.de/f/0c787f26123f49178639/?dl=1
!mv index.html?dl=1 BGH2_s00Hayk.txt
!wget https://heibox.uni-heidelberg.de/f/356205b502fb4d759ad5/?dl=1
!mv index.html?dl=1 BGH2_s00Nino.txt

# set 3 (same text annotated by two annotators)
!wget https://heibox.uni-heidelberg.de/f/ed0c7af9a9d04967b449/?dl=1
!mv index.html?dl=1 BGH3_s00Tamar.txt
# !wget 
# !mv index.html?dl=1 

In [10]:
# Concatenate the five annotated files (both annotators of sets 1 and 2, one of set 3)
# into a single gold-standard file.
# one more will be added: Frau Khachatryan
!cat BGH1_s00Astghik.txt BGH1_s00Maia.txt BGH2_s00Hayk.txt BGH2_s00Nino.txt BGH3_s00Tamar.txt >BGH0_s00GoldStandard.txt

In [11]:
# Open the concatenated gold standard for reading and one output file per term list.
# All of these handles are used and closed in the cell that builds the gold-standard dictionaries.
FInput = open('BGH0_s00GoldStandard.txt', 'r')
FOutput = open('BGH0_s01GoldS_Terms.txt', 'w') # every annotated term, one per line
# for statistical purposes - separately single and multiword terms
FOutputDict1w = open('BGH0_s01GoldS_D1w.txt', 'w') # 1-word terms
FOutputDict2w = open('BGH0_s01GoldS_D2w.txt', 'w') # 2-word terminological expressions
FOutputDict3w = open('BGH0_s01GoldS_D3w.txt', 'w') # 3-word terminological expressions
FOutputDictMWE = open('BGH0_s01GoldS_DMWE.txt', 'w') # more than 3 words

In [12]:
# creating gold-standard dictionaries for evaluation tasks:
# every span marked as <<...>> in the annotated text is one term occurrence (token);
# the dictionaries count how often each normalised term was annotated
LGSTerms = [] # gold standard terms
DGS1w = {} # dictionary of single words
DGS2w = {} # dictionary of 2-word expressions
DGS3w = {} # dictionary of 3-word expressions
DGSMWE = {} # dictionary of other mwes (more than 3 words)
IGS1w = 0 # number of annotated tokens of single words
IGS2w = 0
IGS3w = 0
IGSMWE = 0 # number of annotated tokens of multiwords
for SLine in FInput:
    LGSTerms.extend(re.findall('<<([^><]+)>>', SLine))

for GSTerm in LGSTerms:
    # trim whitespace, then surrounding quotes / brackets / full stops, then whitespace
    # again: stripping the punctuation can uncover blanks (e.g. '„ Revision “'), which
    # would otherwise be counted as additional "words" and spoil the dictionary key
    GSTerm = GSTerm.strip().strip('„“"().').strip()

    # everything is converted to upper case for quick dictionary lookup
    GSTerm = GSTerm.upper()

    GSTerm = re.sub(' +', ' ', GSTerm)
    if not GSTerm:
        # the annotation contained nothing but blanks / punctuation: not a term
        continue

    IWords = len(GSTerm.split(' '))
    if IWords > 3:
        IGSMWE += 1
        DGSMWE[GSTerm] = DGSMWE.get(GSTerm, 0) + 1
    elif IWords > 2:
        IGS3w += 1
        DGS3w[GSTerm] = DGS3w.get(GSTerm, 0) + 1
    elif IWords > 1:
        IGS2w += 1
        DGS2w[GSTerm] = DGS2w.get(GSTerm, 0) + 1
    else:
        IGS1w += 1
        DGS1w[GSTerm] = DGS1w.get(GSTerm, 0) + 1

    FOutput.write(GSTerm + '\n')

# one file per dictionary: a '#' header line with the token count, then the
# terms by descending frequency; closing the file also flushes it
for FOutDict, ITokens, DTerms in (
        (FOutputDictMWE, IGSMWE, DGSMWE),
        (FOutputDict3w, IGS3w, DGS3w),
        (FOutputDict2w, IGS2w, DGS2w),
        (FOutputDict1w, IGS1w, DGS1w)):
    FOutDict.write('# Number of tokens: ' + str(ITokens) + '\n')
    printDictionary(DTerms, FOutDict)
    FOutDict.close()

FOutput.close()

FInput.close()

## Preparing the 'keyness' dictionary

In [13]:
# Stage 2: preparing keyness dictionary
!wget https://heibox.uni-heidelberg.de/f/aa4560e627bd4b1d8055/?dl=1
!mv index.html?dl=1 TK_KW_Verif_V02.csv

!wget https://heibox.uni-heidelberg.de/f/a83ba95576a244a59966/?dl=1
!mv index.html?dl=1 KW_BGH_10000.tsv

--2021-10-28 15:44:44--  https://heibox.uni-heidelberg.de/f/aa4560e627bd4b1d8055/?dl=1
Resolving heibox.uni-heidelberg.de (heibox.uni-heidelberg.de)... 129.206.7.113
Connecting to heibox.uni-heidelberg.de (heibox.uni-heidelberg.de)|129.206.7.113|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://heibox.uni-heidelberg.de/seafhttp/files/0c87334f-cb22-4d2a-bca7-eae34c1689d4/TK_KW_Verif_V02.csv [following]
--2021-10-28 15:44:45--  https://heibox.uni-heidelberg.de/seafhttp/files/0c87334f-cb22-4d2a-bca7-eae34c1689d4/TK_KW_Verif_V02.csv
Reusing existing connection to heibox.uni-heidelberg.de:443.
HTTP request sent, awaiting response... 200 OK
Length: 38812 (38K) [application/octet-stream]
Saving to: ‘index.html?dl=1’


2021-10-28 15:44:46 (95.1 KB/s) - ‘index.html?dl=1’ saved [38812/38812]

--2021-10-28 15:44:46--  https://heibox.uni-heidelberg.de/f/a83ba95576a244a59966/?dl=1
Resolving heibox.uni-heidelberg.de (heibox.uni-heidelberg.de)... 129.206.7.113
Con

In [14]:
# Preparing a dictionary of keyness weights, checking the 'approval' status
FInputKW = open('TK_KW_Verif_V02.csv', 'r') # keyword list with scores and a status column
FInputKWLarge = open('KW_BGH_10000.tsv', 'r') # for experiments with Precision / Recall
FOutputKW = open('TK_KW_Verif_V02.txt', 'w') # receives the upper-cased keyword / score list
# output files reserved for gold-standard keyword lists (currently not used):
# FOutputGSKWS1w = open('BGH1_s01GoldSKW_D1w.txt', 'w')
# FOutputGSKWS2w = open('BGH1_s01GoldSKW_D2w.txt', 'w')
# FOutputGSKWS3w = open('BGH1_s01GoldSKW_D3w.txt', 'w')
# FOutputGSKWSMWE = open('BGH1_s01GoldSKW_MWE.txt', 'w')

In [15]:
# Split the verified keyword list by status. The fields used here are read by
# position: field 1 = word, field 2 = keyness score, field 3 = status (0-based).
DScoresKW = {} # keywords - scores
DScoresNK = {} # non-keywords
DStatKW = {} # status: key/non-key-word
for Line in FInputKW:
    # remove only the line terminator: a full strip() would also eat a leading tab,
    # i.e. an empty first field, and shift all field positions
    Line = Line.rstrip('\r\n')
    if not Line.strip():
        continue # blank line (e.g. at the end of the file): nothing to parse
    LFieldsKW = Line.split('\t')
    SWord = LFieldsKW[1]
    AKScore = float(LFieldsKW[2])
    AKStat = float(LFieldsKW[3])
    DStatKW[SWord] = AKStat
    if AKStat > 0: # change value to 0.5 if we need to restrict to 'sure' terms only (value 1)
        DScoresKW[SWord] = AKScore
    else:
        DScoresNK[SWord] = AKScore

In [16]:
# reading the large keyword dictionary; Caps=True upper-cases every line before
# parsing, so the keys are upper case like the term candidates
DScoresKWLarge = readDictionary(FInputKWLarge, Caps=True)

In [17]:
# Keyness values keyed by the UPPER-CASE form of each keyword, so that they can be
# looked up directly with the (also upper-cased) term candidates.
# Same content as DScoresKW; if two keywords differ only in case, the later one wins.
# Alternatives not considered at this stage: lower-case keys, sentence-case keys,
# or a case-insensitive regular expression per keyword.
DScoresREKWquick = {
    SKeyword.upper(): AScore
    for SKeyword, AScore in DScoresKW.items()
}

In [18]:
# record the upper-cased keyword list; printDictionary defaults: sorted by value, descending
printDictionary(DScoresREKWquick, FOutputKW)

## Preparing a dictionary of automatically extracted terms using PoS configurations vs. key-word based enhancements and re-orderings of this list

In [19]:
# Stage 3: Reading a file with extracted terms; capitalizing everything...
# Reading test data - Possible Terms (extracted automatically): reading the text files of single and multiword terms, recording ranks
# single words candidates
#
# Warning: these files are 8 and 70 MB respectively (relatively large to view on-line)
!wget https://heibox.uni-heidelberg.de/f/a9171080790f4932b7b1/?dl=1
!mv index.html?dl=1 BGH0_s02term1w.txt

# multiword candidates
!wget https://heibox.uni-heidelberg.de/f/2488701205e34e4683b1/?dl=1
!mv index.html?dl=1 BGH0_s02termMWE.txt

--2021-10-28 15:45:55--  https://heibox.uni-heidelberg.de/f/a9171080790f4932b7b1/?dl=1
Resolving heibox.uni-heidelberg.de (heibox.uni-heidelberg.de)... 129.206.7.113
Connecting to heibox.uni-heidelberg.de (heibox.uni-heidelberg.de)|129.206.7.113|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://heibox.uni-heidelberg.de/seafhttp/files/e549407e-ac75-4d1d-af32-82ae003f4c38/BGH_term1w.txt [following]
--2021-10-28 15:45:56--  https://heibox.uni-heidelberg.de/seafhttp/files/e549407e-ac75-4d1d-af32-82ae003f4c38/BGH_term1w.txt
Reusing existing connection to heibox.uni-heidelberg.de:443.
HTTP request sent, awaiting response... 200 OK
Length: 8954371 (8.5M) [text/plain]
Saving to: ‘index.html?dl=1’


2021-10-28 15:45:57 (5.30 MB/s) - ‘index.html?dl=1’ saved [8954371/8954371]

--2021-10-28 15:45:57--  https://heibox.uni-heidelberg.de/f/2488701205e34e4683b1/?dl=1
Resolving heibox.uni-heidelberg.de (heibox.uni-heidelberg.de)... 129.206.7.113
Connecting to heibox

In [20]:
# automatically extracted term candidates (input) ...
FAutoTerms1w = open('BGH0_s02term1w.txt', 'r')
FAutoTermsMWE = open('BGH0_s02termMWE.txt', 'r')

# ... and files for writing the candidate dictionaries back out after reading
FoutAutoTerms1w = open('BGH0_s02term1w_out.txt', 'w')
FoutAutoTermsMWE = open('BGH0_s02termMWE_out.txt', 'w')

In [21]:
# ... here we add functions for reading this dictionary (e.g., as ranked list, etc.)
# Caps=True: candidates are upper-cased so they can be matched against the upper-cased gold standard
DAutoTerms1w = readDictionary(FAutoTerms1w, Caps=True)
DAutoTermsMWE = readDictionary(FAutoTermsMWE, Caps=True)


In [22]:
# optional: convert values to ranks
# DAutoTermsR1w = rankDict(DAutoTerms1w)
# DAutoTermsRMWE = rankDict(DAutoTermsMWE)

In [23]:
# write the candidate dictionaries back out, sorted by value in descending order (printDictionary defaults)
printDictionary(DAutoTerms1w,FoutAutoTerms1w)
printDictionary(DAutoTermsMWE,FoutAutoTermsMWE)

In [24]:
# Stage 3.1 combine extracted words with keyness (e.g., filter by keyness, etc.)
# here we will add / combine information about keyness...
# to be implemented...
# only allow those terms into the Auto dictionary, which have weights; replace frq by keyness weights (or sum)
# function to be used on all dictionaries:

def filterDictByKWDict(DAuto, DKeyness, Threshold = 0):
    '''
    Keep only the candidate terms that contain a sufficiently "key" word.

    DAuto     -- dictionary of candidate terms (words separated by single blanks);
                 only its keys are used
    DKeyness  -- dictionary {word: keyness score}
    Threshold -- a term is kept only if its score is strictly greater than this

    The score of a term is the maximum keyness of its words, but never below 0
    (words without a keyness entry do not contribute).
    Returns a new dictionary {term: score}; the original values of DAuto are
    replaced by the keyness score.
    '''
    DAutoFiltered = {}
    for SAutoTerm in DAuto:
        LWordScores = [
            DKeyness[SWord]
            for SWord in SAutoTerm.split(' ')
            if SWord in DKeyness
        ]
        # the leading 0 is the floor used when no word (or only negative scores) is found
        ABestKeyness = max([0] + LWordScores)
        if ABestKeyness > Threshold:
            DAutoFiltered[SAutoTerm] = ABestKeyness

    return DAutoFiltered

In [25]:
# keep only candidates containing at least one keyword with a positive score (default Threshold = 0);
# the values of the new dictionaries are keyness scores, no longer the original values
DAutoTerms01KW_H1w = filterDictByKWDict(DAutoTerms1w, DScoresREKWquick) # autoterms filtered by human annotated items
DAutoTerms01KW_HMWE = filterDictByKWDict(DAutoTermsMWE, DScoresREKWquick)


In [26]:
# output files for the keyword-filtered candidate lists
FOutAutoTermsFiltered1w = open('BGH0_s03FilteredTerms1w.txt', 'w')
FOutAutoTermsFilteredMWE = open('BGH0_s03FilteredTermsMWE.txt', 'w')

In [27]:
# record the filtered candidates, highest keyness first (printDictionary defaults)
printDictionary(DAutoTerms01KW_H1w, FOutAutoTermsFiltered1w)
printDictionary(DAutoTerms01KW_HMWE, FOutAutoTermsFilteredMWE)


In [28]:
# print statistics: one size per line, in this order --
# 1-word candidates, multiword candidates, keywords, filtered 1-word, filtered multiword
for DCounted in (
        DAutoTerms1w,
        DAutoTermsMWE,
        DScoresREKWquick,
        DAutoTerms01KW_H1w,
        DAutoTerms01KW_HMWE):
    print(len(DCounted))

466669
2434043
809
673
454461


In [42]:
# now we update the dictionaries -- a different configuration (do not run in first experiment...)
# DAutoTerms1w -- was used for creating DAutoTerms01KW_H1w
# DAutoTermsMWE -- was used for creating DAutoTerms01KW_HMWE
#
# After this cell DAutoTerms1w / DAutoTermsMWE refer to the keyword-filtered
# dictionaries, and the unfiltered ones stay reachable as *_copy (same objects,
# not duplicates). The identity checks make the cell safe to re-run: a second run
# no longer overwrites the *_copy backups with the filtered dictionaries.
if DAutoTerms1w is not DAutoTerms01KW_H1w:
    DAutoTerms1w_copy = DAutoTerms1w
    DAutoTerms1w = DAutoTerms01KW_H1w
if DAutoTermsMWE is not DAutoTerms01KW_HMWE:
    DAutoTermsMWE_copy = DAutoTermsMWE
    DAutoTermsMWE = DAutoTerms01KW_HMWE


We will check:

- how terminology extraction works for Precision and Recall (intersecting the Gold Standard and extracted terms); 

- how high is the rank of the terms in the extracted list, etc...

In [29]:
# Stage04: preparing data for calculating precision and recall on the space of all possible MWEs, 1, 2, 3 words; (overlapping)
# keeping only 1 version of each text: one annotator's file per set is concatenated here
# (sets 1 and 2 were annotated by two annotators each, to measure interannotator agreement)
!cat BGH1_s00Astghik.txt BGH2_s00Hayk.txt BGH3_s00Tamar.txt >BGH0_s03GoldStandard1Version.txt


This function will be used for measuring P and R:
one-way comparison of dictionaries

In [30]:
# Stage 4 (preparation): tokenizing the gold standard
#
# The idea is to tokenise the gold standard (from Stage 0), and to generate all possible
# MWEs for each string / paragraph;
#     then we can test what is the coverage (non-overlapping) or precision (overlapping)
#     or: we create a dictionary of potential single and MWE strings and check what has been identified ?
#     or: comparing with 'oracle': known annotations are run as a point of comparison on the space;
#         and we establish relations, i.e., the amount of over-generation
#
#     tasks:
#         4a: create the "all possible strings" space from gold standard text
#         4b: intersect 4a results with corpus list of extracted MWEs >> generate "extracted from gold standard" dictionary
#         4c: intersect human annotation in gold standard with 4a >> generate "correct in gold standard" dictionary
#         4d: intersect 4b and 4c, >> correctly extracted
#         4e: calculate 4d/4b = precision
#             calculate 4d/4c = recall

# repeated from the import cell, so that this cell can also be run on its own
import re, os, sys

LLParTokens = [] # List of paragraphs, each represented as a list of tokens
# `with` closes the file even if processing a line fails
with open('BGH0_s03GoldStandard1Version.txt', 'r') as FInputGS1V:
    for SLine in FInputGS1V:
        SLine = SLine.strip()

        # remove annotation: the << >> brackets become blanks, runs of blanks are collapsed
        SLine = re.sub('[<>]+', ' ', SLine)
        SLine = re.sub(' +', ' ', SLine)

        SLine = SLine.upper() # capitalize all words

        # tokens are runs of word characters / apostrophes, or one of the listed
        # punctuation marks as a token of its own; any other character is dropped.
        # findall() never yields empty strings here, so no further filtering is needed.
        LLine = re.findall(r"[\w']+|[.,!?;()\-„“\"]", SLine)

        # an empty line contributes an empty token list, keeping paragraph numbering intact
        LLParTokens.append(LLine)


In [31]:
# sanity check: show the token list of the tenth paragraph
print(LLParTokens[9])

['UND', 'DIE', 'RICHTER', 'AM', 'BUNDESGERICHTSHOF']


In [32]:
# Stage 4A
# generating candidate MWEs for cheking if / when they have been identified as terms
# algorithm from Terminologieextraktion3 notebook
# 4a: creating space of all possible overlapping MWEs in gold standard

def tokens2candNGrams(LWords, N): # working with specific N-gram size, to keep number of candidates under control
    '''
    Return every contiguous n-gram of LWords for n = 1 .. N.

    LWords -- list of tokens (one paragraph)
    N      -- maximum n-gram length

    The result is a list of token lists: first all unigrams in text order, then
    all bigrams, and so on. Lengths greater than len(LWords) contribute nothing.
    '''
    ITokens = len(LWords)
    return [
        LWords[IStart : IStart + ILength]
        for ILength in range(1, N + 1)              # n-gram sizes, shortest first
        for IStart in range(ITokens - ILength + 1)  # last start leaves room for ILength tokens
    ]


# LLCandidates = tokens2candNGrams(['this', 'is', 'a', 'test', 'of', 'the', 'function'], 4)
# for L in LLCandidates:
#    print(str(L))

#    ''' # full version; now abandoned...
#    for klen in range(len(LWords)): # lengths of candidate lists
#        klength = klen+1 # true length: for 0 it is le = 1
#        # print(f'klen:{klength};')
#        for i in range(len(LWords) - klen): # positions where candidates start
#            # print(f'i:{i};')
#            LCandidate = LWords[i:i+klength]
#            SCandidate = ' '.join(LCandidate)
#            LLCandidates.append(LCandidate)
#            LSCandidates.append(SCandidate)
#        
#    return LLCandidates, LSCandidates
#    '''

In [33]:
# Set A: every n-gram (n = 1..4) of the gold-standard text with its frequency,
# in one dictionary per n-gram length.
DA_1W = {} # dictionary of 1-word candidates from the gold standard text (to be tested)
DA_2W = {} # dictionary of 2-word candidates from the gold standard text (to be tested)
DA_3W = {} # dictionary of 3-word candidates from the gold standard text (to be tested)
DA_MWE = {} # dictionary of MWE candidates from the gold standard text (to be tested)

for LTokens in LLParTokens: # for each paragraph
    for LCandidate in tokens2candNGrams(LTokens, 4):
        SCandidate = ' '.join(LCandidate)
        ILength = len(LCandidate)
        if ILength > 3:
            DTarget = DA_MWE
        elif ILength > 2:
            DTarget = DA_3W
        elif ILength > 1:
            DTarget = DA_2W
        else:
            DTarget = DA_1W
        # plain counter update; a bare "except:" around "+= 1" would also hide real errors
        DTarget[SCandidate] = DTarget.get(SCandidate, 0) + 1


In [34]:
# output files for the A sets (all n-grams of the gold-standard text), one per n-gram length
FOutputA1w = open('BGH0_s04A_1w_res.txt', 'w')
FOutputA2w = open('BGH0_s04A_2w_res.txt', 'w')
FOutputA3w = open('BGH0_s04A_3w_res.txt', 'w')
FOutputAMWE = open('BGH0_s04A_MWE_res.txt', 'w')

In [35]:
# record the A sets, most frequent n-grams first
printDictionary(DA_1W, FOutputA1w, Rev = True)
printDictionary(DA_2W, FOutputA2w, Rev = True)
printDictionary(DA_3W, FOutputA3w, Rev = True)
printDictionary(DA_MWE, FOutputAMWE, Rev = True)

In [36]:
# sizes of the A sets: number of distinct 1-, 2-, 3- and 4-word n-grams in the gold-standard text
W1A, W2A, W3A, W4A = (len(DSet) for DSet in (DA_1W, DA_2W, DA_3W, DA_MWE))
# FOutResults1.write('NoKW\t') # which run
# FOutResults1.write(f'NoKW\tW1A\tW1B\tW1C\tW1D\tW1P\tW1R\tW2A\tW2B\tW2C\tW2D\tW2P\tW2R\tW3A\tW3B\tW3C\tW3D\tW3P\tW3R\tW4A\tW4B\tW4C\tW4D\tW4P\tW4R\n')
# FOutResults2.write('Run\tW1P\tW1R\tW2P\tW2R\tW3P\tW3R\tW4P\tW4R\n') # only precision and recall figures


In [37]:
# Stage 4B 
# creating output files for B: per n-gram length, one file for the n-grams of A that were
# found in the extracted list (resY) and one for those that were not (resN)
FOutputFOUND1wBinA = open('BGH0_s04BinA1w_resY.txt', 'w')
FOutputMISSING1wBinA = open('BGH0_s04BinA1w_resN.txt', 'w')

FOutputFOUND2wBinA = open('BGH0_s04BinA2w_resY.txt', 'w')
FOutputMISSING2wBinA = open('BGH0_s04BinA2w_resN.txt', 'w')

FOutputFOUND3wBinA = open('BGH0_s04BinA3w_resY.txt', 'w')
FOutputMISSING3wBinA = open('BGH0_s04BinA3w_resN.txt', 'w')

FOutputFOUNDMWEsBinA = open('BGH0_s04BinAMWE_resY.txt', 'w')
FOutputMISSINGMWEsBinA = open('BGH0_s04BinAMWE_resN.txt', 'w')

In [38]:
# Stage 4B preparing B-set for calculating performance
# B = n-grams of the gold-standard text (A) that also occur in the automatically
# extracted lists; 1-word n-grams are looked up in the single-word list, all longer
# ones in the multiword list.
# The first return value is a coverage ratio (found / size of A), not a set size, so it
# is stored as AFound1wBinA like its 2w / 3w / MWE siblings; the name W1B is reserved
# for the size of the B set, len(DB_1W).
AFound1wBinA, AAverageFoundRanks1wBinA, DB_1W = countIntersectDictionaries(DA_1W, DAutoTerms1w, FOutputFOUND1wBinA, FOutputMISSING1wBinA, SortBy = 0, Rev = False)
AFound2wBinA, AAverageFoundRanks2wBinA, DB_2W = countIntersectDictionaries(DA_2W, DAutoTermsMWE, FOutputFOUND2wBinA, FOutputMISSING2wBinA, SortBy = 0, Rev = False)
AFound3wBinA, AAverageFoundRanks3wBinA, DB_3W = countIntersectDictionaries(DA_3W, DAutoTermsMWE, FOutputFOUND3wBinA, FOutputMISSING3wBinA, SortBy = 0, Rev = False)
AFoundMWEBinA, AAverageFoundRanksMWEBinA, DB_MWE = countIntersectDictionaries(DA_MWE, DAutoTermsMWE, FOutputFOUNDMWEsBinA, FOutputMISSINGMWEsBinA, SortBy = 0, Rev = False)




Total len of Gold Standard: 16555
Found: 9454
Missing: 7101
Found2LenGS: 0.5710661431591664
Ave Found Ranks: 2455.1286228051617 

Total len of Gold Standard: 82886
Found: 7796
Missing: 75090
Found2LenGS: 0.09405689742537944
Ave Found Ranks: 160.4489481785531 

Total len of Gold Standard: 138283
Found: 2052
Missing: 136231
Found2LenGS: 0.01483913423920511
Ave Found Ranks: 32.143762183235864 

Total len of Gold Standard: 162112
Found: 415
Missing: 161697
Found2LenGS: 0.0025599585471772603
Ave Found Ranks: 26.636144578313253 



In [39]:
# FOutResults1.write(f'NoKW\tW1A\tW1B\tW1C\tW1D\tW1P\tW1R\tW2A\tW2B\tW2C\tW2D\tW2P\tW2R\tW3A\tW3B\tW3C\tW3D\tW3P\tW3R\tW4A\tW4B\tW4C\tW4D\tW4P\tW4R\n')
# sizes of the B sets: n-grams of the gold-standard text that also occur in the extracted lists
W1B, W2B, W3B, W4B = (len(DSet) for DSet in (DB_1W, DB_2W, DB_3W, DB_MWE))


In [40]:
# Stage 4C: creating output files for C: per n-gram length, one file for the n-grams of A
# that are annotated terms (resY) and one for those that are not (resN)
# (every annotated term should be found among the n-grams of A; this is just a check of the general approach)
FOutputFOUND1wCinA = open('BGH0_s04CinA1w_resY.txt', 'w')
FOutputMISSING1wCinA = open('BGH0_s04CinA1w_resN.txt', 'w')

FOutputFOUND2wCinA = open('BGH0_s04CinA2w_resY.txt', 'w')
FOutputMISSING2wCinA = open('BGH0_s04CinA2w_resN.txt', 'w')

FOutputFOUND3wCinA = open('BGH0_s04CinA3w_resY.txt', 'w')
FOutputMISSING3wCinA = open('BGH0_s04CinA3w_resN.txt', 'w')

FOutputFOUNDMWEsCinA = open('BGH0_s04CinAMWE_resY.txt', 'w')
FOutputMISSINGMWEsCinA = open('BGH0_s04CinAMWE_resN.txt', 'w')

In [41]:
# intersecting set A (all possible N-grams in gold standard) with the annotation from the gold standard
# C = n-grams of the gold-standard text that the annotators marked as terms.
# Each n-gram length keeps its own average variable (the 2w and 3w results used
# to overwrite AAverageFoundRanks1wCinA).
AFound1wCinA, AAverageFoundRanks1wCinA, DC_1W = countIntersectDictionaries(DA_1W, DGS1w, FOutputFOUND1wCinA, FOutputMISSING1wCinA, SortBy = 0, Rev = False)
AFound2wCinA, AAverageFoundRanks2wCinA, DC_2W = countIntersectDictionaries(DA_2W, DGS2w, FOutputFOUND2wCinA, FOutputMISSING2wCinA, SortBy = 0, Rev = False)
AFound3wCinA, AAverageFoundRanks3wCinA, DC_3W = countIntersectDictionaries(DA_3W, DGS3w, FOutputFOUND3wCinA, FOutputMISSING3wCinA, SortBy = 0, Rev = False)
AFoundMWECinA, AAverageFoundRanksMWECinA, DC_MWE = countIntersectDictionaries(DA_MWE, DGSMWE, FOutputFOUNDMWEsCinA, FOutputMISSINGMWEsCinA, SortBy = 0, Rev = False)


Total len of Gold Standard: 16555
Found: 2959
Missing: 13596
Found2LenGS: 0.1787375415282392
Ave Found Ranks: 5.422440013518081 

Total len of Gold Standard: 82886
Found: 266
Missing: 82620
Found2LenGS: 0.003209227131240499
Ave Found Ranks: 1.4548872180451127 

Total len of Gold Standard: 138283
Found: 133
Missing: 138150
Found2LenGS: 0.000961795737726257
Ave Found Ranks: 2.691729323308271 

Total len of Gold Standard: 162112
Found: 16
Missing: 162096
Found2LenGS: 9.869719699960521e-05
Ave Found Ranks: 1.125 



In [42]:
# FOutResults1.write(f'NoKW\tW1A\tW1B\tW1C\tW1D\tW1P\tW1R\tW2A\tW2B\tW2C\tW2D\tW2P\tW2R\tW3A\tW3B\tW3C\tW3D\tW3P\tW3R\tW4A\tW4B\tW4C\tW4D\tW4P\tW4R\n')
# sizes of the C sets: n-grams of the gold-standard text that were annotated as terms
W1C, W2C, W3C, W4C = (len(DSet) for DSet in (DC_1W, DC_2W, DC_3W, DC_MWE))

In [43]:
# Stage 5: precision and recall calculations
# Output files for the two comparisons of B (extracted) and C (annotated);
# resY = items found in the other set, resN = items missing from it.

# precision: the items of B are looked up in C
FOutputFOUND1wDinB_precision = open('BGH0_s05DinB1w_resY_precision.txt', 'w')
FOutputMISSING1wDinB_precision = open('BGH0_s05DinB1w_resN_precision.txt', 'w')

FOutputFOUND2wDinB_precision = open('BGH0_s05DinB2w_resY_precision.txt', 'w')
FOutputMISSING2wDinB_precision = open('BGH0_s05DinB2w_resN_precision.txt', 'w')

FOutputFOUND3wDinB_precision = open('BGH0_s05DinB3w_resY_precision.txt', 'w')
FOutputMISSING3wDinB_precision = open('BGH0_s05DinB3w_resN_precision.txt', 'w')

FOutputFOUNDMWEsDinB_precision = open('BGH0_s05DinBMWE_resY_precision.txt', 'w')
FOutputMISSINGMWEsDinB_precision = open('BGH0_s05DinBMWE_resN_precision.txt', 'w')


# recall: the items of C are looked up in B
FOutputFOUND1wDinC_recall = open('BGH0_s05DinC1w_resY_recall.txt', 'w')
FOutputMISSING1wDinC_recall = open('BGH0_s05DinC1w_resN_recall.txt', 'w')

FOutputFOUND2wDinC_recall = open('BGH0_s05DinC2w_resY_recall.txt', 'w')
FOutputMISSING2wDinC_recall = open('BGH0_s05DinC2w_resN_recall.txt', 'w')

FOutputFOUND3wDinC_recall = open('BGH0_s05DinC3w_resY_recall.txt', 'w')
FOutputMISSING3wDinC_recall = open('BGH0_s05DinC3w_resN_recall.txt', 'w')

FOutputFOUNDMWEsDinC_recall = open('BGH0_s05DinCMWE_resY_recall.txt', 'w')
FOutputMISSINGMWEsDinC_recall = open('BGH0_s05DinCMWE_resN_recall.txt', 'w')

In [44]:
# Precision side: every extracted n-gram (B) is looked up among the annotated terms (C);
# the first value returned is found / size of B.
AFound1wDinB, AAverageFoundRanks1wDinB, DD_1Wp = countIntersectDictionaries(
    DB_1W, DC_1W,
    FOutputPrecFOUND = FOutputFOUND1wDinB_precision,
    FOutputPrecMISSING = FOutputMISSING1wDinB_precision,
    SortBy = 0, Rev = False,
)
AFound2wDinB, AAverageFoundRanks2wDinB, DD_2Wp = countIntersectDictionaries(
    DB_2W, DC_2W,
    FOutputPrecFOUND = FOutputFOUND2wDinB_precision,
    FOutputPrecMISSING = FOutputMISSING2wDinB_precision,
    SortBy = 0, Rev = False,
)
AFound3wDinB, AAverageFoundRanks3wDinB, DD_3Wp = countIntersectDictionaries(
    DB_3W, DC_3W,
    FOutputPrecFOUND = FOutputFOUND3wDinB_precision,
    FOutputPrecMISSING = FOutputMISSING3wDinB_precision,
    SortBy = 0, Rev = False,
)
AFoundMWEDinB, AAverageFoundRanksMWEDinB, DD_MWEp = countIntersectDictionaries(
    DB_MWE, DC_MWE,
    FOutputPrecFOUND = FOutputFOUNDMWEsDinB_precision,
    FOutputPrecMISSING = FOutputMISSINGMWEsDinB_precision,
    SortBy = 0, Rev = False,
)


Total len of Gold Standard: 9454
Found: 2398
Missing: 7056
Found2LenGS: 0.25364924899513436
Ave Found Ranks: 6.245204336947456 

Total len of Gold Standard: 7796
Found: 211
Missing: 7585
Found2LenGS: 0.02706516162134428
Ave Found Ranks: 1.5023696682464456 

Total len of Gold Standard: 2052
Found: 8
Missing: 2044
Found2LenGS: 0.003898635477582846
Ave Found Ranks: 1.125 

Total len of Gold Standard: 415
Found: 0
Missing: 415
Found2LenGS: 0.0
Ave Found Ranks: 0 



In [45]:
# Recall side: every annotated term (C) is looked up among the extracted n-grams (B);
# the first value returned is found / size of C.
AFound1wDinC, AAverageFoundRanks1wDinC, DD_1Wr = countIntersectDictionaries(
    DC_1W, DB_1W,
    FOutputPrecFOUND = FOutputFOUND1wDinC_recall,
    FOutputPrecMISSING = FOutputMISSING1wDinC_recall,
    SortBy = 0, Rev = False,
)
AFound2wDinC, AAverageFoundRanks2wDinC, DD_2Wr = countIntersectDictionaries(
    DC_2W, DB_2W,
    FOutputPrecFOUND = FOutputFOUND2wDinC_recall,
    FOutputPrecMISSING = FOutputMISSING2wDinC_recall,
    SortBy = 0, Rev = False,
)
AFound3wDinC, AAverageFoundRanks3wDinC, DD_3Wr = countIntersectDictionaries(
    DC_3W, DB_3W,
    FOutputPrecFOUND = FOutputFOUND3wDinC_recall,
    FOutputPrecMISSING = FOutputMISSING3wDinC_recall,
    SortBy = 0, Rev = False,
)
AFoundMWEDinC, AAverageFoundRanksMWEDinC, DD_MWEr = countIntersectDictionaries(
    DC_MWE, DB_MWE,
    FOutputPrecFOUND = FOutputFOUNDMWEsDinC_recall,
    FOutputPrecMISSING = FOutputMISSINGMWEsDinC_recall,
    SortBy = 0, Rev = False,
)


Total len of Gold Standard: 2959
Found: 2398
Missing: 561
Found2LenGS: 0.8104089219330854
Ave Found Ranks: 5579.691409507924 

Total len of Gold Standard: 266
Found: 211
Missing: 55
Found2LenGS: 0.793233082706767
Ave Found Ranks: 921.3270142180095 

Total len of Gold Standard: 133
Found: 8
Missing: 125
Found2LenGS: 0.06015037593984962
Ave Found Ranks: 782.5 

Total len of Gold Standard: 16
Found: 0
Missing: 16
Found2LenGS: 0.0
Ave Found Ranks: 0 



In [46]:
# FOutResults1.write(f'NoKW\tW1A\tW1B\tW1C\tW1D\tW1P\tW1R\tW2A\tW2B\tW2C\tW2D\tW2P\tW2R\tW3A\tW3B\tW3C\tW3D\tW3P\tW3R\tW4A\tW4B\tW4C\tW4D\tW4P\tW4R\n')
# sizes of the D sets: annotated terms that were also extracted (taken from the recall-side intersection)
W1D, W2D, W3D, W4D = (len(DSet) for DSet in (DD_1Wr, DD_2Wr, DD_3Wr, DD_MWEr))

In [48]:
# Precision = |D| / |B| (correctly extracted / extracted),
# Recall    = |D| / |C| (correctly extracted / annotated), per n-gram length.
def _safeRatio(INumerator, IDenominator):
    '''Return INumerator / IDenominator, or 0.0 when the denominator is 0.'''
    # an empty B or C set (possible for long n-grams or strict filters) must not
    # abort the run with ZeroDivisionError; 0.0 keeps the results table numeric
    return INumerator / IDenominator if IDenominator else 0.0

W1P = _safeRatio(W1D, W1B)
W1R = _safeRatio(W1D, W1C)

W2P = _safeRatio(W2D, W2B)
W2R = _safeRatio(W2D, W2C)

W3P = _safeRatio(W3D, W3B)
W3R = _safeRatio(W3D, W3C)

W4P = _safeRatio(W4D, W4B)
W4R = _safeRatio(W4D, W4C)


In [52]:
# Append one row for this configuration to both result logs.
Run = 'NoKW' # NoKW = default, only frequency dictionary

# full row: set sizes A-D, precision and recall for each n-gram length
SRowAll = f'{Run}\t{W1A}\t{W1B}\t{W1C}\t{W1D}\t{W1P}\t{W1R}\t{W2A}\t{W2B}\t{W2C}\t{W2D}\t{W2P}\t{W2R}\t{W3A}\t{W3B}\t{W3C}\t{W3D}\t{W3P}\t{W3R}\t{W4A}\t{W4B}\t{W4C}\t{W4D}\t{W4P}\t{W4R}\n'
# short row: only precision and recall figures
SRowPR = f'{Run}\t{W1P}\t{W1R}\t{W2P}\t{W2R}\t{W3P}\t{W3R}\t{W4P}\t{W4R}\n'

for FOutResults, SRow in ((FOutResults1, SRowAll), (FOutResults2, SRowPR)):
    FOutResults.write(SRow)
    FOutResults.flush() # the logs stay open for further runs, so flush after each row