<a href="https://colab.research.google.com/github/iued-uni-heidelberg/DAAD-Training-2021/blob/main/Terminologieextraktion9EvaluationKeyWordsV04.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Workflow
1. legal corpus > extraction of terms
- sequences;
- sub-sequences and patterns in sequences
- filtering for what has been extracted >> the same way as from annotation
- selecting the lemma field

2. Evaluation set (automatically annotating gold standard) (extracting candidates and automatically applying extraction based on Stage 1)

3. gold standard terms (manual annotation);
- annotation
- extracting annotation:
- selecting lemma field;
- filtering by specific patterns


4. p/r measure -- baseline (max recall)

5. filtering: 
- keyness;
- association measures


## Part 0: global declarations

Libraries, functions


In [5]:
import os, re, sys

In [18]:
## to modify if necessary; however, we try to keep the code standard and parametrize as much as possible

# useful functions
# a useful function for recording / visualising current stage of dictionaries
def printDictionary(DictionaryFrq, FOut, K = 1, Rev = True): # printing a dictionary: by values or alphabetically
    '''
    writes a dictionary to FOut, one tab-separated line "key<TAB>value" per entry

    K   = 1: sort the entries by value (default);
        = 0: sort the entries by key
    Rev = True: descending order (default); False: ascending order
    the output stream is flushed, but not closed
    '''
    def sortField(TEntry):
        return TEntry[K]

    LEntries = list(DictionaryFrq.items())
    LEntries.sort(key=sortField, reverse=Rev)
    for SKey, Value in LEntries:
        FOut.write(''.join((SKey, '\t', str(Value), '\n')))
    FOut.flush()
    return

# another useful function to just read and return a 2-field dictionary, eg., frequency or keyness
def readDictionary(FIN, SkipComments = True, Caps=False):
    '''
    reads a 2-field tab-separated file (e.g., term<TAB>frequency or term<TAB>keyness) and returns
    a dictionary {term: score}, where the score is converted to float

    FIN          -- an open file (or any iterable of lines)
    SkipComments -- if True, lines starting with '#' are ignored
    Caps         -- if True, the whole line is converted to upper case before splitting
    Blank lines are ignored (e.g., an empty line at the end of the file).
    Raises IndexError for a non-blank line without a second field and ValueError for a non-numeric score.
    '''
    DScoresLarge = {} # keywords - scores
    for Line in FIN:
        if SkipComments and Line.startswith('#'):
            continue
        Line = Line.strip()
        if not Line: # blank line: nothing to read, and it would otherwise fail on the missing score field
            continue
        if Caps:
            Line = Line.upper() # convert to upper case
        LFieldsKW = Line.split('\t')
        DScoresLarge[LFieldsKW[0]] = float(LFieldsKW[1])
    return DScoresLarge

# another possibly useful function: convert dictionary values to ranks (frequency, keyness weights, etc.)
# for understanding how far down the list the item has been found...
# currently not used ... 
def rankDict(DIN):
    '''
    converts the values of a dictionary (frequencies, keyness scores, etc.) to ranks

    The entries are ranked in the iteration order of DIN (the dictionary is NOT sorted here), so the
    caller should pass a dictionary that is already ordered, e.g., by decreasing frequency.
    Consecutive entries with the same value share one rank: the position (counting from 1) of the
    first entry of that group.

    Returns a tuple (DTermRanks, AAveRank): the dictionary {key: rank} and the average rank;
    for an empty dictionary the average rank is 0.
    '''
    DTermRanks = {}
    IRank = 0
    SumRanks = 0
    PrevFrq = None
    for i, (SKey, Frq) in enumerate(DIN.items(), start=1):
        # the first entry always opens a new group; a numeric sentinel for the "previous value"
        # would wrongly merge it with a real value equal to that sentinel
        if i == 1 or Frq != PrevFrq:
            IRank = i # rank is the number of the highest ranking element of the same frequency group
        PrevFrq = Frq

        DTermRanks[SKey] = IRank
        SumRanks += IRank

    ICount = len(DTermRanks)
    AAveRank = SumRanks / ICount if ICount else 0 # no division by zero on an empty dictionary
    print(f'MaxRank = {IRank}\nAve Rank = {AAveRank}\n')
    return DTermRanks, AAveRank


# Main evaluation function
# One-directional comparison of dictionaries
# one-directional comparison of two dictionaries; arguments: DGoldStandard (smaller) DTest (larger), file: GS items found in DTest; file: GS items missing from DTest...
# usually testing: smaller vs. bigger dictionaries
def countIntersectDictionaries(DGS, DTest, FOutputPrecFOUND, FOutputPrecMISSING, SortBy = 0, Rev = False):
    '''
    general function: one-directional intersection of two dictionaries; records "in" and "out" expressions

    Goes over each entry of DGS (gold standard, usually the smaller dictionary) and checks whether
    its key is also in DTest (usually the larger dictionary of extracted candidates).

    DGS                -- dictionary {term: frequency} whose entries are looked up
    DTest              -- dictionary {term: score / rank} in which the terms are looked up
    FOutputPrecFOUND   -- output stream, gets a line "term<TAB>DGS value<TAB>DTest value" for each found term
    FOutputPrecMISSING -- output stream, gets a line "term<TAB>DGS value" for each missing term
    SortBy, Rev        -- the order in which DGS is processed: 0 = by term, 1 = by value; Rev = True for descending

    Returns a tuple (ACoverage, AAverageFoundRanks, DFound):
    ACoverage          -- found terms divided by the size of DGS (0 if DGS is empty)
    AAverageFoundRanks -- average of the DTest values of the found terms (0 if nothing was found)
    DFound             -- the intersection dictionary {term: DTest value}
    Both output streams are flushed, but not closed.
    '''
    IGSLen = len(DGS)
    print('Total len of Gold Standard: ' + str(IGSLen))
    IMissing = 0
    SumFoundRanks = 0
    DFound = {} # intersection dictionary

    for Word, Frq in sorted(DGS.items(),  key=lambda x: x[SortBy], reverse=Rev):
        if Word in DTest:
            # keys of DGS are unique, so each term is seen exactly once: no merging of ranks is needed
            Score = DTest[Word]
            DFound[Word] = Score
            SumFoundRanks += Score # add rank, to calculate average
            FOutputPrecFOUND.write(Word + '\t' + str(Frq) + '\t' + str(Score) + '\n')
        else:
            IMissing += 1
            FOutputPrecMISSING.write(Word + '\t' + str(Frq) + '\n')

    IFound = len(DFound)
    # explicit guards instead of catching the division error
    ACoverage = IFound / IGSLen if IGSLen else 0
    AAverageFoundRanks = SumFoundRanks / IFound if IFound else 0

    print(f'Found: {IFound} ; Missing: {IMissing} ; AveRank: {AAverageFoundRanks} ; ACoverage: {ACoverage} ')
    FOutputPrecFOUND.flush()
    FOutputPrecMISSING.flush()

    return ACoverage, AAverageFoundRanks, DFound



# extracting annotated terms from the gold standard in xml format
def vertCollectAnnotation(FInVert, SXmlTag, Caps = False):
    '''
    collects the annotated segments from a file in the vertical format (one token per line, tab-separated fields)

    FInVert -- an open file (or any iterable of lines); annotated terms are enclosed between the lines
               <SXmlTag> and </SXmlTag>
    SXmlTag -- the name of the xml tag; it is inserted into a regular expression as it is
    Caps    -- if True, each line is converted to upper case; the tags are then matched case-insensitively
               (otherwise the upper-cased line would never match a tag name given in lower case)

    Returns a 3D list: a list of segments, where each segment is a list of tokens, and each token is
    the list of its fields. Tokens outside the tags are ignored; a segment which is still open at the
    end of the input is not returned.
    '''
    L3AnnotatedSegs = []
    L2Seg = [] # a list of the current segment -- each token (list of fields) is added
    BInTerm = False # boolean flag: inside / outside term
    IFlags = re.IGNORECASE if Caps else 0
    RTagOpen = re.compile('<' + SXmlTag + '>', IFlags)
    RTagClose = re.compile('</' + SXmlTag + '>', IFlags)
    for SLine in FInVert:
        SLine = SLine.strip()
        if Caps:
            SLine = SLine.upper()
        if RTagOpen.match(SLine):
            BInTerm = True
        elif RTagClose.match(SLine):
            BInTerm = False
            L3AnnotatedSegs.append(L2Seg)
            L2Seg = []
        elif BInTerm:
            L2Seg.append(SLine.split('\t'))

    return L3AnnotatedSegs



# converting in-text annotation (e.g., in the bracket form) into proper XML format
def convertBrecket2Xml(FInAnnot, FOutAnnot, RInOpen, RInClose, SOutOpen, SOutClose):
    '''
    converts in-text annotation (e.g., in the bracket form) into the XML form, line by line

    FInAnnot  -- an open input file (or any iterable of lines)
    FOutAnnot -- output stream; it is flushed, but not closed
    RInOpen, RInClose   -- regular expressions (as strings) for the opening and the closing mark in the input
    SOutOpen, SOutClose -- replacements for the opening and the closing mark
                           (used as re.sub replacement templates, so backslash escapes are interpreted)
    '''
    RCOpen = re.compile(RInOpen)
    RCClose = re.compile(RInClose)
    for SLine in FInAnnot:
        # remove the line terminator only: exactly one '\n' is added back below
        # (without this each output line would end in two newlines)
        SLine = SLine.rstrip('\r\n')
        SLine = RCOpen.sub(SOutOpen, SLine)
        SLine = RCClose.sub(SOutClose, SLine)

        FOutAnnot.write(SLine + '\n')
    FOutAnnot.flush()
    return

# a service function for creating a dictionary of field values
# is used for creating a dictionary of PoS patterns
# can destructively change the list, if Normalize = 2 (changing PoS codes as specified inside the function)
def createDictOfPatterns(L3AnnotatedSegs, IFieldN, Normalize = 0):
    '''
    take the list of annotated terms and return a frequency dictionary of the values of one field,
    e.g., a dictionary of PoS patterns {'ADJA NN': 12, ...}

    L3AnnotatedSegs -- list of terms; each term is a list of words; each word is a list of fields
    IFieldN         -- the number of the field to collect (e.g., the PoS field)
    Normalize = 0 : do not normalize;
              = 1: normalize, but do not change the original list
              = 2 (or more): normalize and change the original list
    Normalization reduces the codes starting with 'N', 'ADJ' and 'V' to 'N', 'ADJ' and 'V'.

    Words which do not have the field IFieldN are reported and left out of the pattern;
    terms for which no field value could be collected (e.g., empty terms) are not counted.
    '''
    DPatternsFrq = {} # returned dictionary of PoS patterns, etc.
    for L2TermFlds in L3AnnotatedSegs:
        LFlds = [] # here we will collect the values of the selected field
        for LWordFlds in L2TermFlds:
            if Normalize >= 2:
                LWordFlds0 = LWordFlds # just use a reference to the same list
            else:
                LWordFlds0 = list(LWordFlds) # a copy, not to modify the original

            if Normalize > 0:
                try:
                    PoS = LWordFlds0[IFieldN]
                except IndexError:
                    print('PoS not found')
                    PoS = ''
                for SPrefix in ('N', 'ADJ', 'V'):
                    if re.match(SPrefix, PoS):
                        LWordFlds0[IFieldN] = SPrefix
                        break
            try:
                LFlds.append(LWordFlds0[IFieldN])
            except IndexError:
                print('index Error')
        if not LFlds:
            # nothing collected: do not count (otherwise the pattern of the previous term would be counted again)
            continue
        SFlds = ' '.join(LFlds)
        DPatternsFrq[SFlds] = DPatternsFrq.get(SFlds, 0) + 1

    return DPatternsFrq






# change case for elements of the list
def changeCaseL3(L3Segs, LFlds2Caps = (0, 2), Mode='upper', StripB = True):
    '''
    changes the case of the selected fields for each word in the list of terms; the list is changed in place

    L3Segs     -- list of terms; each term is a list of words; each word is a list of fields
    LFlds2Caps -- the numbers of the fields which need to change case (by default: fields 0 and 2)
    Mode = upper -- to all caps;
        = lower -- to lowercase;
        = capitalize -- to sentence case;
        (any other value leaves the case as it is)
    StripB     -- if True, the characters '<' and '>' are removed from both ends of the field
    Fields which a word does not have (e.g., a line with fewer fields) are skipped.
    '''
    for L2Seg in L3Segs: # for each multiword term in the list of terms
        for LWordFlds in L2Seg: # for each word in the list of words in the multiword term
            for IFld in LFlds2Caps: # for each field that needs to change case
                try:
                    SWord = LWordFlds[IFld]
                except IndexError: # this word has no such field: nothing to change
                    continue
                if StripB:
                    SWord = SWord.strip('<>')
                if Mode == 'upper':
                    SWord = SWord.upper()
                elif Mode == 'lower':
                    SWord = SWord.lower()
                elif Mode == 'capitalize':
                    SWord = SWord.capitalize()
                LWordFlds[IFld] = SWord
    return



def readDictKWAnnotations(FInputKW, AKStatThreshold = 1, Caps=True):
    '''
    reading a keyword file, returning a dictionary of keywords / not keywords

    Each line is tab-separated; field 1 is the word, field 2 its score and field 3 its annotated status
    (field 0 is not used). Words whose status is >= AKStatThreshold go into the dictionary of keywords,
    all the others into the dictionary of non-keywords:
    AKStatThreshold = 1 (only sure keywords)
                    = 0.5 (also unsure keywords)
                    = 0 (all annotated keywords)
    Caps -- if True, the word is converted to upper case
    Blank lines are ignored.

    Returns a tuple (DScoresKW, DScoresNK): {keyword: score}, {non-keyword: score}
    '''
    DScoresKW = {} # keywords - scores
    DScoresNK = {} # non-keywords
    for Line in FInputKW:
        if not Line.strip(): # blank line, e.g., at the end of the file
            continue
        # the line itself is not stripped: leading empty fields must keep their positions;
        # float() ignores the line break at the end of the last field
        LFieldsKW = Line.split('\t')
        SWord = LFieldsKW[1]
        if Caps:
            SWord = SWord.upper()
        AKScore = float(LFieldsKW[2])
        AKStat = float(LFieldsKW[3])
        if AKStat >= AKStatThreshold:
            DScoresKW[SWord] = AKScore
        else:
            DScoresNK[SWord] = AKScore
    return DScoresKW, DScoresNK


In [None]:
# main functions -- will be used also in extraction process
# L3AnnotatedSegs[10]
# FInput = open('BGH0_s00GoldStandard.txt', 'r')
# for statistical purposes - separately single and multiword terms

# function(s) for selecting patterns in a list of candidates; 
# these can be either positive patterns, or, if positive are not specified, then negative patterns (start, edge or end restrictions on PoS codes)

class ContinueI(Exception):
    '''control-flow signal: meant to be raised in an inner loop and caught in the enclosing loop, to skip the current item'''


# one shared instance of the signal, ready to be raised
continue_i = ContinueI()

def comparePattern(L2TermFlds, LPattern, IFldN):
    '''
    compares if a pattern is found in the term field

    L2TermFlds -- the term: a list of words; each word is a list of fields
    LPattern   -- list of regular expressions, one for each word position
    IFldN      -- the number of the field which is compared (e.g., the PoS field)

    Returns True if for each position k the expression LPattern[k] matches (from the beginning of the
    string, as re.match) the field IFldN of the k-th word of the term. Words after the end of the
    pattern are not checked. Returns False if the term is shorter than the pattern, or if one of the
    compared words does not have the field IFldN.
    '''
    if len(L2TermFlds) < len(LPattern): # not enough words for this pattern
        return False
    for SPattern, LWordFlds in zip(LPattern, L2TermFlds):
        try:
            SValue = LWordFlds[IFldN]
        except IndexError: # the word has no such field, so it cannot match
            return False
        if not re.match(SPattern, SValue):
            return False
    return True


def _violatesEdgeFilters(SStart, SEnd, LNoEdge, LNoStart, L2NoEnd):
    '''
    service function for selectTerms: True, if the PoS code of the first / the last word of a term hits one of the negative filters
    '''
    if LNoEdge and any(re.match(SPoS, SStart) or re.match(SPoS, SEnd) for SPoS in LNoEdge):
        return True # PoS which cannot appear at the edge
    if LNoStart and any(re.match(SPoS, SStart) for SPoS in LNoStart):
        return True
    if L2NoEnd and any(re.match(SPoS, SEnd) for SPoS in L2NoEnd):
        return True
    return False


def selectTerms(L3AnnotatedSegs, L2Patterns = None, LNoEdge = None, LNoStart = None, L2NoEnd = None,  SplitLen = False, IFldNumber = 0):
    r'''
    function: 1. selects terms which match specified POS pattern; 2. divides them into dictionaries according to length
    the function can also visualise terms with a specific pos pattern, specified in L2Patterns, e.g., L2Patterns = [['N', '\$']]

    L3AnnotatedSegs -- list of candidate terms; each term is a list of words; each word is a list of fields;
                       the PoS code is read from field 1
    L2Patterns      -- positive filter: list of PoS patterns (each one is a list of regular expressions, one per word);
                       a term is selected if it has the length of a pattern and matches it (see comparePattern);
                       a term which matches several patterns is selected and counted only once
    LNoEdge, LNoStart, L2NoEnd -- negative filters, used only if L2Patterns is not specified: regular expressions for
                       PoS codes which are not allowed at either edge / at the start / at the end of a term;
                       empty terms, and terms whose first or last word has no PoS field, are skipped
    SplitLen        -- if True, the selected terms are also divided into dictionaries according to their length
                       (the number of space-separated items in the term string)
    IFldNumber      -- the field which represents the term in the dictionaries (e.g., 0 = word, 2 = lemma)

    Returns a tuple (DGS, DGS1w, DGS2w, DGS3w, DGS4w, L3SelectedTerms):
    DGS -- {term string: frequency} for all selected terms;
    DGS1w, DGS2w, DGS3w, DGS4w -- the same for terms of 1, 2, 3, and 4 or more words (empty if SplitLen is False);
    L3SelectedTerms -- the list of the selected terms (with all fields), in the order in which they appear
    Also prints two lines of statistics: token counts and type counts.
    '''
    DGS = {}
    L3SelectedTerms = [] # return not only dictionaries, but also a list of the selected terms, in the order how they appear

    for L2AnnotatedSeg in L3AnnotatedSegs: # for each multiword term, where words are represented as fields
        if L2Patterns: # positive filter
            ILenTerm = len(L2AnnotatedSeg)
            # any() stops at the first matching pattern, so a term is never counted twice
            BSelected = any(len(LPattern) == ILenTerm and comparePattern(L2AnnotatedSeg, LPattern, 1)
                            for LPattern in L2Patterns)
        else: # negative filter checking
            if not L2AnnotatedSeg:
                continue
            BEdgesFound = True
            try:
                SEnd = L2AnnotatedSeg[-1][1]
            except IndexError:
                print('index: L2AnnotatedSeg - end' + str(L2AnnotatedSeg))
                BEdgesFound = False
            try:
                SStart = L2AnnotatedSeg[0][1]
            except IndexError:
                print('index: L2AnnotatedSeg - start' + str(L2AnnotatedSeg))
                BEdgesFound = False
            if not BEdgesFound:
                # no PoS code to check: skip the term, instead of checking the codes left over from the previous term
                continue
            BSelected = not _violatesEdgeFilters(SStart, SEnd, LNoEdge, LNoStart, L2NoEnd)

        if not BSelected:
            continue
        L3SelectedTerms.append(L2AnnotatedSeg)
        STerm = ' '.join(LTerm2Fields[IFldNumber] for LTerm2Fields in L2AnnotatedSeg)
        DGS[STerm] = DGS.get(STerm, 0) + 1

    ISelectedTermsCount = len(L3SelectedTerms)

    DGS1w = {} # dictionary of single words
    DGS2w = {} # dictionary of 2-word expressions
    DGS3w = {} # dictionary of 3-word expressions
    DGS4w = {} # dictionary of other mwes
    LDByLen = [DGS1w, DGS2w, DGS3w, DGS4w]
    LIByLen = [0, 0, 0, 0] # number of annotated tokens: single words, 2-word, 3-word, longer
    if SplitLen:
        for GSTerm, Frq in DGS.items():
            IBucket = min(len(GSTerm.split(' ')), 4) - 1 # everything longer than 3 goes to the last dictionary
            LDByLen[IBucket][GSTerm] = Frq
            LIByLen[IBucket] += Frq
    IGS = sum(LIByLen)

    print(*LIByLen, IGS, ISelectedTermsCount)
    print(len(DGS1w), len(DGS2w), len(DGS3w), len(DGS4w), len(DGS), len(L3SelectedTerms))
    return DGS, DGS1w, DGS2w, DGS3w, DGS4w, L3SelectedTerms



In [None]:
# End: Stage 0: Some useful read/write and convert functions

## Part 1 - Legal corpus: Terminology extraction

This part of the workflow uses large files and may run for up to 20 minutes; it only needs to be run once.

In [None]:
# Stage 1: preparing terminology extraction workflow (~ 1 min, but may run longer)
# 23.10.2021 part
# German legal corpus, lemmatized in a zip archive (archive = 641 MB in zip archive)
!wget https://heibox.uni-heidelberg.de/f/fd96c36723b741d4a972/?dl=1
# renaming file ()
!mv index.html?dl=1 BGH-utf8-lem.zip

--2021-11-05 05:48:52--  https://heibox.uni-heidelberg.de/f/fd96c36723b741d4a972/?dl=1
Resolving heibox.uni-heidelberg.de (heibox.uni-heidelberg.de)... 129.206.7.113
Connecting to heibox.uni-heidelberg.de (heibox.uni-heidelberg.de)|129.206.7.113|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://heibox.uni-heidelberg.de/seafhttp/files/d582f9b9-1698-479f-97ce-59ae7f79d825/output-utf8-lem.zip [following]
--2021-11-05 05:48:52--  https://heibox.uni-heidelberg.de/seafhttp/files/d582f9b9-1698-479f-97ce-59ae7f79d825/output-utf8-lem.zip
Reusing existing connection to heibox.uni-heidelberg.de:443.
HTTP request sent, awaiting response... 200 OK
Length: 673144348 (642M) [application/zip]
Saving to: ‘index.html?dl=1’


2021-11-05 05:49:36 (14.8 MB/s) - ‘index.html?dl=1’ saved [673144348/673144348]



In [None]:
# extraction ~ 1 min
!unzip BGH-utf8-lem.zip
!rm BGH-utf8-lem.zip

Archive:  BGH-utf8-lem.zip
  inflating: output-utf8-lem.txt     


In [None]:
!head --lines=10 output-utf8-lem.txt

<doc id="t1000001">
Nachschlagewerk	NN	Nachschlagewerk
:	$.	:
ja	ADV	ja
BGHSt	VVFIN	BGHSt
:	$.	:
nein	PTKANT	nein
Veröffentlichung	NN	Veröffentlichung
:	$.	:
ja	ADV	ja


In [None]:
!mv output-utf8-lem.txt BGHlem.txt

In [None]:
!head --lines=4000000 BGHlem.txt >BGHlem1k.txt

In [None]:
# OPTIONAL -- just to know how big is our corpus, on a long corpus it can take a lot of time...
# word counts: BGHlem.txt should be ~221 M lines long (vert lemmatized format; so one line is one word); runs ~ 1min
!wc BGHlem1k.txt
!wc BGHlem.txt

 4000000 11990777 66682067 BGHlem1k.txt
 220915764  662340855 3684207041 BGHlem.txt


In [None]:
# OPTIONAL -- just to demonstrate the component of the algorithm of term selection
# debugging and explaining the algorithm for Stage 1:
# taking the maximal possible PoS string; 
# sub-dividing it into other reasonable strings --> term candidates;
# filtering only possible / interesting patterns / terms matching these patterns
# the longest sequence found in the text; all its continuous sub-sequences become term candidates
LWords = ['Revisionsverfahren', 'entstanden', 'notwendig', 'Auslage']
LLCandidates = []
ITotal = len(LWords)
for ISize in range(1, ITotal + 1): # true lengths of the candidates: from 1 up to the whole sequence
    print(f'klen:{ISize};')
    for IStart in range(ITotal - ISize + 1): # positions where candidates of this length can start
        print(f'i:{IStart};')
        LSlice = LWords[IStart:IStart + ISize]
        LLCandidates.append(LSlice)
        print(LSlice)

# the full list of candidates, shortest first
for LCand in LLCandidates:
    print(LCand)

In [None]:
# Terminologieextraktion
# main automated pattern-based extraction
# todo:
# a. change -- lemmas: done
# b. BACK TO NEGATIVE FILTER FOR EFFICIENCY 
# use positive patterns for extraction for candidates (same function as for gold standard)
#     -- for this we need to move to collecting a 2D lists -- features
#     

class clProcCorpus(object):
    '''
    pattern-based extraction of term candidates from a corpus in the vertical format

    The corpus is read line by line; each line is one token with (at least) three tab-separated fields:
    word, PoS code, lemma. The result is the frequency dictionary self.DictFrq {candidate term: frequency},
    which can be sorted later.

    The algorithm:
    Stage 0: collect the longest continuous sequence of tokens with the allowed PoS codes (TCandidatePoS);
             any other line (other PoS codes, xml tags, lines with fewer than 3 fields) ends the sequence;
    Stage 1: generate all continuous sub-sequences of this sequence, e.g., for 'graphical user interface':
             graphical; user; interface; graphical user; user interface; graphical user interface;
    Stage 2: remove the sub-sequences with impossible PoS configurations at the edges (TNoEdgePoS, TNoEndPoS);
    Stage 3: represent each remaining candidate by one field (IFrqField) and count it in self.DictFrq.

    Warning: a sequence of n tokens has n*(n+1)/2 sub-sequences.
    '''

    # CHANGE HERE FOR YOUR LANGUAGE: stage 0 -- prefixes of the PoS codes which may appear inside candidates
    # (German tagset used in this corpus: nouns, adjectives, articles, prepositions)
    TCandidatePoS = ('N', 'ADJ', 'ART', 'APPR')
    # CHANGE HERE FOR YOUR LANGUAGE: stage 2 -- restrictions on the edges of candidates
    TNoEdgePoS = ('ART', 'APPR') # function words cannot be at either edge
    TNoEndPoS = ('ADJ',) # adjectives cannot be at the end
    # the field which represents the term in the frequency dictionary: Word = 0; PoS = 1; Lemma = 2
    IFrqField = 2

    def __init__(self, FileIN):
        '''
        FileIN -- an open corpus file (or any iterable of lines); it is processed immediately
        '''
        self.DictFrq = {}
        self.processCorpus(FileIN)

    def procCorpSelectField4FrqDic(self, IFieldNumber, L2PossibleTerm):
        '''
        this function takes a 2D list (just one possible term at a time, with all fields) and returns a string
        which represents the term -- either as inflected words or lemmas (the selected field, joined by spaces)
        '''
        return ' '.join(LFields[IFieldNumber] for LFields in L2PossibleTerm)

    def _countCandidates(self, LLTerm):
        '''
        stages 1-3 for one sequence of allowed PoS codes: generates the sub-sequences (the shortest first),
        filters them by the PoS codes at their edges and adds the remaining ones to self.DictFrq
        '''
        IMaxMWE = len(LLTerm) # this is the longest MWE we can get from LLTerm: the length of the whole sequence
        for ILength in range(1, IMaxMWE + 1): # true lengths of candidates, including the max length
            for IStart in range(IMaxMWE - ILength + 1): # positions where candidates of this length can start
                SStartPoS = LLTerm[IStart][1]
                SEndPoS = LLTerm[IStart + ILength - 1][1]
                if SEndPoS.startswith(self.TNoEndPoS):
                    continue
                if SStartPoS.startswith(self.TNoEdgePoS) or SEndPoS.startswith(self.TNoEdgePoS):
                    continue
                # all filters passed: the candidate is counted under the selected field
                L2Candidate = LLTerm[IStart:IStart + ILength]
                SPossibleTerm = self.procCorpSelectField4FrqDic(self.IFrqField, L2Candidate)
                self.DictFrq[SPossibleTerm] = self.DictFrq.get(SPossibleTerm, 0) + 1

    def processCorpus(self, FileIN):
        '''
        reads the corpus line by line and fills the frequency dictionary self.DictFrq;
        prints the line number after every 1 million lines, to monitor the progress
        '''
        LLTerm = [] # list of lists: the words (with their fields) of the current sequence of allowed PoS codes
        n = 0 # count lines processed (monitoring progress over the corpus)
        for Line in FileIN:
            n += 1
            if n % 1000000 == 0:
                print(n) # print every 1 millionth line number
            LLine = Line.strip().split('\t')
            # lines without all three fields (e.g., xml tags) have no PoS code and end the current sequence
            PoS = LLine[1] if len(LLine) >= 3 else ''

            if PoS.startswith(self.TCandidatePoS):
                # we keep all the fields, because the sub-sequences are also filtered using PoS codes
                LLTerm.append(LLine)
            else: # end of the 'candidate collection'
                self._countCandidates(LLTerm)
                LLTerm = []

        # the corpus can end inside a sequence of allowed PoS codes: this last sequence is processed as well
        self._countCandidates(LLTerm)
        return



In [None]:
# Running the extraction over the corpus and saving the frequency lists of candidate terms.
# The files are opened explicitly as UTF-8 (the corpus archive is named '...-utf8-lem'), and inside
# 'with' blocks, so that they are closed even if the processing fails.

# TEST:
# with open('BGHlem1k.txt', 'r', encoding='utf-8') as FileIN:
# PRODUCTION:
with open('BGHlem.txt', 'r', encoding='utf-8') as FileIN:
    OCorpus = clProcCorpus(FileIN)
DictionaryFrq = OCorpus.DictFrq

# save the frequency dictionary into files, by decreasing frequencies:
# single words and multiword expressions (terms with a space) go to separate files
# these files don't need to be re-imported in the later stage -- merge this together...
with open('BGH_term1w.txt', 'w', encoding='utf-8') as FileOut1w, \
     open('BGH_termMWE.txt', 'w', encoding='utf-8') as FileOutMWE:
    for Word, Frq in sorted( DictionaryFrq.items() , key=lambda x: x[1], reverse=True):
        if ' ' in Word:
            FileOutMWE.write(Word + '\t' + str(Frq) + '\n')
        else:
            FileOut1w.write(Word + '\t' + str(Frq) + '\n')




In [None]:
!zip BGH_term1w.zip BGH_term1w.txt
!zip BGH_termMWE.zip BGH_termMWE.txt

  adding: BGH_term1w.txt (deflated 63%)
  adding: BGH_termMWE.txt (deflated 84%)


In [None]:
# optional: how large are our output files for MWEs and single words?
!wc BGH_term1w.txt
!wc BGH_termMWE.txt

 441279  882130 8352855 BGH_term1w.txt
 10765229  67981861 520948169 BGH_termMWE.txt


In [None]:
!head --lines=10 BGH_term1w.txt
!head --lines=10 BGH_termMWE.txt

### Stage 1.1: manually: get onto heiBox

The archives are downloaded and saved in the heiBox directory. If the following stages are run independently, these files are downloaded again from heiBox.


### Stage 1.2: uploading the file back to colab

Download is done from HeiBox

In [None]:
# Stage 1.2
# Stage 3: Reading a file with extracted terms; capitalizing everything...
# Reading test data - Possible Terms (extracted automatically): reading the text files of single and multiword terms, recording ranks
# single words candidates
### replaced with new run
### run is producing these files in Stage 1 (is ready, doesn't need to be run again)
#
# Warning: these files are 8 and 70 MB respectively (relatively large to view on-line)
# !wget https://heibox.uni-heidelberg.de/f/a9171080790f4932b7b1/?dl=1
# !mv index.html?dl=1 BGH0_s02term1w.txt

# multiword candidates
# !wget https://heibox.uni-heidelberg.de/f/2488701205e34e4683b1/?dl=1
# !mv index.html?dl=1 BGH0_s02termMWE.txt

# new run: with new pos patterns
# multiword terms (zip archive)
!wget https://heibox.uni-heidelberg.de/f/6e345e17e22d45bc8245/?dl=1
!mv index.html?dl=1 BGH_termMWE02.zip
!unzip BGH_termMWE02.zip
# single words archive
!wget https://heibox.uni-heidelberg.de/f/7a2fed4851a940958c43/?dl=1
!mv index.html?dl=1 BGH_term1w02.zip
!unzip BGH_term1w02.zip


In [2]:
!mv BGH_termMWE.txt BGH0_s02termMWE.txt
!mv BGH_term1w.txt BGH0_s02term1w.txt

In [3]:
!wc BGH0_s02termMWE.txt
!wc BGH0_s02term1w.txt
!head --lines=10 BGH0_s02term1w.txt
!head --lines=10 BGH0_s02termMWE.txt

 10765229  67981861 520948169 BGH0_s02termMWE.txt
 441279  882130 8352855 BGH0_s02term1w.txt
§	1886129
Beklagte	784204
rn	584370
Urteil	535254
Angeklagte	505748
vgl	487398
ZR	486364
Satz	438322
Beschluss	435239
Berufungsgericht	432318
@card@ rn	121146
Bundesgerichtshof Beschluss	94989
§ @card@ rn	85034
@card@ BGB	81532
Zivilsenat die Bundesgerichtshof	78478
§ @card@ BGB	78070
Richter Dr	72440
Revision die Angeklagte	61582
Vorsitzende Richter	60142
VI ZR	56532


In [7]:
# Reading the automatically extracted term lists into dictionaries (term -> score) - exe ~ 41 seconds;
# Keys are upper-cased (Caps=True).
# NOTE(review): readDictionary overwrites an existing key, so entries that differ only in case
# are merged and the last one read wins -- confirm this is intended.
# The files are opened with context managers, so each handle is closed as soon as its dictionary is built.

# FoutAutoTerms1w = open('BGH0_s02term1w_out.txt', 'w')
# FoutAutoTermsMWE = open('BGH0_s02termMWE_out.txt', 'w')
# ... here we add functions for reading this dictionary (e.g., as ranked list, etc.)

with open('BGH0_s02term1w.txt', 'r') as FAutoTerms1w:
    DAutoTerms1w = readDictionary(FAutoTerms1w, Caps=True)
with open('BGH0_s02termMWE.txt', 'r') as FAutoTermsMWE:
    DAutoTermsMWE = readDictionary(FAutoTermsMWE, Caps=True)

In [9]:
# spot-check: print the scores of a few known entries of each dictionary
# only a missing key is reported as 'keys not found'; any other error (e.g., the
# dictionary itself not being loaded yet) is no longer hidden by a bare except
try:
    print(DAutoTerms1w['ANGEKLAGTE'])
    print(DAutoTerms1w['VGL'])
except KeyError:
    print('keys not found')

try:
    print(DAutoTermsMWE['ZIVILSENAT DIE BUNDESGERICHTSHOF'])
    print(DAutoTermsMWE['VORSITZENDE RICHTER'])
except KeyError:
    print('keys not found')

505748.0
16.0
78478.0
60142.0


### End of Part 1: terminology extraction from legal corpus

## Part 2: The test set
(Test set is also manually annotated and used as a gold standard)

### 2.1. Concatenating the annotated test sets (5 out of 6 files)
Then -- this will be used for extracting annotation in further stages

In [None]:
## preparing for TreeTagger processing (do not have to re-run again once completed)
## Annotated Gold Standard
# Stage 1: Preparing Gold standard: Reading / extracting information from gold standard: creating a list of annotated terms
# set 1 (same text annotated by two annotators)

!wget https://heibox.uni-heidelberg.de/f/ae1110c4f9ad42b9a3d5/?dl=1
!mv index.html?dl=1 BGH1_s00Astghik.txt
!wget https://heibox.uni-heidelberg.de/f/398e7a10fa3241519f26/?dl=1
!mv index.html?dl=1 BGH1_s00Maia.txt

# set 2 (same text annotated by two annotators)
!wget https://heibox.uni-heidelberg.de/f/0c787f26123f49178639/?dl=1
!mv index.html?dl=1 BGH2_s00Hayk.txt
!wget https://heibox.uni-heidelberg.de/f/356205b502fb4d759ad5/?dl=1
!mv index.html?dl=1 BGH2_s00Nino.txt

# set 3 (same text annotated by two annotators)
!wget https://heibox.uni-heidelberg.de/f/ed0c7af9a9d04967b449/?dl=1
!mv index.html?dl=1 BGH3_s00Tamar.txt
# !wget 
# !mv index.html?dl=1

# one more will be added: Frau Khachatryan
!cat BGH1_s00Astghik.txt BGH1_s00Maia.txt BGH2_s00Hayk.txt BGH2_s00Nino.txt BGH3_s00Tamar.txt >BGH0_s00GoldStandard.txt

# Convert the concatenated gold standard to the <TERM> ... </TERM> markup.
# convertBrecket2Xml is defined elsewhere in this notebook; presumably it rewrites the
# '<<' / '>>' annotation brackets as the given opening / closing tags -- TODO confirm.
# Both files are handled by a context manager, so the output is flushed and closed
# before the file is passed on to the tagger.
with open('BGH0_s00GoldStandard.txt', 'r') as FInBGH0_s00GoldStandard, \
     open('BGH0_s00GoldStandard_xml.txt', 'w') as FOutBGH0_s00GoldStandard:
    convertBrecket2Xml(FInBGH0_s00GoldStandard, FOutBGH0_s00GoldStandard, '<<+', '>>+', '<TERM>', '</TERM>')
# this result is pos-tagged and uploaded in the next step
# command:
# tree-tagger-de.sh /Users/bogdan/Seafile/research/corpus/DAAD-corpus/daad-experiments/BGH0_s00GoldStandard_xml.txt >/Users/bogdan/Seafile/research/corpus/DAAD-corpus/daad-experiments/BGH0_s00GoldStandard_LEM.txt



### 2.2 Keeping a single copy of the text and removing the human annotations from the test set

(for automated processing)

In [12]:
# Stage04: preparing data for calculating precision and recall on the space of all possible MWEs, 1, 2, 3 words; (overlapping)
# keeping only 1 version of the text (2 annotators annotated the same text twice to measure interannotator agreement)
!cat BGH1_s00Astghik.txt BGH2_s00Hayk.txt BGH3_s00Tamar.txt >BGH0_s03GoldStandard1Version.txt
FInputGS1V = open('BGH0_s03GoldStandard1Version.txt', 'r')
FOutputGS1V = open('BGH0_s04GoldStandard1Version_text.txt', 'w')

# gold standard - 1 version:
# we clean up the document, with only 1 copy of the text; then we run it locally through TreeTagger and then process here

'''
The idea is to tokenise the gold standard (from Stage 0), and to generate all possible MWEs for each string / paragraph
    then we can test what is the coverage (non-overlapping) or precision (overlapping)
    or: we create a dictionary of potential single and MWE strings and check what has been identified ?
    or: comparing with 'oracle': known annotations are run as a point of comparison on the space; and we establish relations, i.e., the amount of over-generation

    tasks: 
        4a: create the "all possible strings" space from gold standard text
        4b: intersect 4a results with corpus list of extracted MWEs >> generate "extracted from gold standard" dictionary
        4c: intersect human annotation in gold standard with 4a >> generate "correct in gold standard" dictionary
        4d: intersect 4b and 4c, >> correctly extracted
        4e: calculate 4d/4b = precision
            calculate 4d/4c = recall

'''
# 3a: processing gold standard: tokenizing

# Remove the annotation brackets from the single-version gold standard, line by line,
# and write the cleaned lines to FOutputGS1V.
LLParTokens = [] # List of paragraphs, each represented as a list of tokens (not filled in this cell)
for SLine in FInputGS1V:
    # print(SLine)
    SLine = SLine.strip() # drop leading / trailing whitespace, including the line break

    # remove annotation:
    # every run of '<' / '>' characters is replaced by one space,
    # then runs of spaces are collapsed into a single space
    SLine = re.sub('[<>]+', ' ', SLine)
    SLine = re.sub(' +', ' ', SLine)

    FOutputGS1V.write(SLine + '\n')

# close both files, so that the cleaned text is completely written before it is processed further
FInputGS1V.close()
FOutputGS1V.flush()
FOutputGS1V.close()

In [None]:
!echo file1 BGH0_s03GoldStandard1Version.txt
!head --lines=10 BGH0_s03GoldStandard1Version.txt
!echo file2 BGH0_s04GoldStandard1Version_text.txt
!head --lines=10 BGH0_s04GoldStandard1Version_text.txt

a) manual stage: the file is downloaded, lemmatized with TreeTagger, and uploaded back onto colab

In [None]:
!wget https://heibox.uni-heidelberg.de/f/d39b640f70504b4fb86a/?dl=1
!mv index.html?dl=1 BGH0_s04GoldStandard1Version_LEM.txt

In [16]:
!head --lines=10 BGH0_s04GoldStandard1Version_LEM.txt
!wc BGH0_s04GoldStandard1Version_LEM.txt
!wc BGH0_s04GoldStandard1Version_text.txt

BUNDESGERICHTSHOF	NN	Bundesgerichtshof
IM	NE	IM
NAMEN	NN	Name
DES	ART	die
VOLKES	NN	Volk
URTEIL	VVIMP	urteilen
1	CARD	1
StR	NN	StR
42/01	CARD	@card@
vom	APPRART	von
 198146  594342 3297455 BGH0_s04GoldStandard1Version_LEM.txt
   3469  169020 1215715 BGH0_s04GoldStandard1Version_text.txt


Here we pass the output file to TreeTagger and download the result; the statistics are:

bash-3.2$ wc BGH0_s04GoldStandard1Version_LEM.txt 

BGH0_s04GoldStandard1Version_text.txt

    198146  594438 3297455 BGH0_s04GoldStandard1Version_LEM.txt
    3469  169054 1215715 BGH0_s04GoldStandard1Version_text.txt
    201615  763492 4335398 total



### 2.3. Automated extraction of MWEs from the test set
(baseline; ~100 recall)


In [25]:
FInputGS1V = open('BGH0_s04GoldStandard1Version_LEM.txt', 'r')

In [26]:
# tokenizing the test set function
# extracting annotated terms from the gold standard in xml format

def collectField(FInputGS1V, Caps = False):
    '''
    Tokenise a text line by line.

    FInputGS1V: iterable of text lines (e.g., an open text file)
    Caps: if True, each line is converted to upper case before it is tokenised
    Returns: a list with one entry per input line; each entry is the list of tokens of that line.
        A token is either a run of word characters / apostrophes,
        or one of the punctuation marks . , ! ? ; ( ) - „ “ "
        An empty input gives an empty list.
    '''
    LLParTokens = [] # List of paragraphs, each represented as a list of tokens
    for SLine in FInputGS1V:
        SLine = SLine.strip()
        if Caps: SLine = SLine.upper() # capitalize all words

        # separate punctuation: words and the listed punctuation marks become separate tokens;
        # the pattern cannot produce empty tokens, so no further filtering is needed
        LLine = re.findall(r"[\w']+|[.,!?;()\-„“\"]", SLine)
        LLParTokens.append(LLine)

    # the result is returned only after all lines have been read
    # (a return inside the loop would stop after the first line)
    return LLParTokens

'''
def vertCollectField(FInVert, IFieldN, Caps = False):
    L2Pars = []
    LWords = []
    for SLine in FInVert:
        if re.match('<', SLine):
            L2Pars.append(LWords)
            LWords = []
            continue
        LFields = re.split('\t', SLine)
        LWords.append(LFields[IFieldN])
    L2Pars.append(LWords)  
    return L2Pars 
'''

In [27]:
LLParTokens = vertCollectField(FInputGS1V, 2, Caps = True)

In [31]:
len(LLParTokens[0])

198146

In [32]:
print(str(LLParTokens[0][9]))

von



In [None]:
FInputGS1V.close()

In [None]:
# Stage 4A
# generating candidate MWEs for checking if / when they have been identified as terms
# algorithm from Terminologieextraktion3 notebook
# 4a: creating space of all possible overlapping MWEs in gold standard

def tokens2candNGrams(LWords, N): # N caps the length of the candidates, to keep their number under control
    '''
    Build all contiguous word sequences of length 1..N from a list of tokens (called per paragraph).
    The result is a list of token lists: first all sequences of length 1, then of length 2, etc.;
    within one length the sequences are ordered by their start position.
    '''
    ITotal = len(LWords)
    return [
        LWords[IStart : IStart + ILen]
        for ILen in range(1, N + 1)              # candidate length
        for IStart in range(ITotal - ILen + 1)   # last start position that still fits the length
    ]





In [None]:
# NOTE(review): this call passes no arguments, while the call earlier in this notebook is
# vertCollectField(FInputGS1V, 2, Caps = True) -- confirm which file and field number are meant here
L2TestSetLemmas = vertCollectField()

### functions

In [None]:
# import useful libraries, files
import re, os, sys

In [None]:
# file for recording results of different configurations
# run this only once
!rm AllTermExtractionResultsV01.txt
!rm AllTermExtractionResultsV02.txt

# both result files are opened in append mode and receive their header row right away
FOutResults1 = open('AllTermExtractionResultsV01.txt', 'a')
FOutResults2 = open('AllTermExtractionResultsV02.txt', 'a')
# V01: columns A, B, C, D, P, R for each of W1..W4
# (P / R = precision / recall; A..D presumably are the counts of tasks 4a..4d -- TODO confirm)
FOutResults1.write('Run\tW1A\tW1B\tW1C\tW1D\tW1P\tW1R\tW2A\tW2B\tW2C\tW2D\tW2P\tW2R\tW3A\tW3B\tW3C\tW3D\tW3P\tW3R\tW4A\tW4B\tW4C\tW4D\tW4P\tW4R\n')
FOutResults2.write('Run\tW1P\tW1R\tW2P\tW2R\tW3P\tW3R\tW4P\tW4R\n') # only precision and recall figures
# the headers are flushed to disk; the files are not closed in this cell
FOutResults1.flush()
FOutResults2.flush()

rm: cannot remove 'AllTermExtractionResultsV01.txt': No such file or directory
rm: cannot remove 'AllTermExtractionResultsV02.txt': No such file or directory


In [None]:
# filter functions -- e.g., by keyness
# here we will add / combine information about keyness...
# to be implemented...
# only allow those terms into the Auto dictionary, which have weights; replace frq by keyness weights (or sum)
# function to be used on all dictionaries:
# we can modify different parameters : both/one product or maximum...

def filterDictByKWDict(DAuto, DKeyness, Threshold = 1, Mode='prod', Req = 2):
    '''
    Keep only the terms whose words carry enough keyness; the stored value becomes the combined keyness.

    DAuto: dictionary term -> frequency; the words of a term are separated by single spaces
    DKeyness: dictionary word -> keyness score
    Threshold: a term is kept only if its combined keyness is strictly greater than this value
    Mode = max: we take maximum value of keyness of the words found (never lower than 0)
         = prod -- we take the product of keyness of the words found
    Req = 1: we require at least one word to be in the keyness dictionary
        = 2...N: we require at least 2, N words to be in the keyness dictionary (if there are as many in the list)
    Returns: a new dictionary term -> combined keyness (DAuto is not changed)
    Raises: ValueError if Mode is neither 'max' nor 'prod'
    '''
    if Mode not in ('max', 'prod'):
        # without this check an unknown Mode fails later with an unbound variable inside the loop
        raise ValueError("Mode must be 'max' or 'prod', got " + repr(Mode))

    DAutoFiltered = {}
    for SAutoTerm in DAuto:
        LAutoTermWs = SAutoTerm.split(' ')
        # keyness scores of those words of the term which are in the keyness dictionary
        LScores = [DKeyness[STerm] for STerm in LAutoTermWs if STerm in DKeyness]

        # a term with fewer than Req words only needs all of its own words to be found
        if len(LScores) < min(Req, len(LAutoTermWs)): continue

        if Mode == 'max':
            AKeynessAll = max([0] + LScores) # 0 is the starting value, so the result is never negative
        else:
            AKeynessAll = 1
            for AScore in LScores:
                AKeynessAll = AKeynessAll * AScore

        if AKeynessAll > Threshold:
            DAutoFiltered[SAutoTerm] = AKeynessAll

    return DAutoFiltered


In [None]:
# downloading and creating necessary files
# term extraction will be integrated here

In [None]:
# downloading PoS-tagged files after TT processing
# !wget https://heibox.uni-heidelberg.de/f/4e719e0466a143c0b1b5/?dl=1
# Vahram's file, manually checked (some brackets remaining...)
!wget https://heibox.uni-heidelberg.de/f/4e719e0466a143c0b1b5/?dl=1
# alternative file created in this workflow
# !wget https://heibox.uni-heidelberg.de/f/d8f1bb53632d40538e0d/?dl=1

!mv index.html?dl=1 BGH0_s00GS_LEM.txt

--2021-11-05 09:10:00--  https://heibox.uni-heidelberg.de/f/4e719e0466a143c0b1b5/?dl=1
Resolving heibox.uni-heidelberg.de (heibox.uni-heidelberg.de)... 129.206.7.113
Connecting to heibox.uni-heidelberg.de (heibox.uni-heidelberg.de)|129.206.7.113|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://heibox.uni-heidelberg.de/seafhttp/files/df5015c9-c1f5-4bbd-a09d-3fbd86626d16/BGH_GS_LEM.txt [following]
--2021-11-05 09:10:01--  https://heibox.uni-heidelberg.de/seafhttp/files/df5015c9-c1f5-4bbd-a09d-3fbd86626d16/BGH_GS_LEM.txt
Reusing existing connection to heibox.uni-heidelberg.de:443.
HTTP request sent, awaiting response... 200 OK
Length: 5936140 (5.7M) [text/plain]
Saving to: ‘index.html?dl=1’


2021-11-05 09:10:02 (5.19 MB/s) - ‘index.html?dl=1’ saved [5936140/5936140]



In [None]:
!head --lines=10 BGH0_s00GS_LEM.txt

<TERM>
BUNDESGERICHTSHOF	NN	Bundesgerichtshof
</TERM>
IM	NE	IM
NAMEN	NN	Name
DES	ART	die
VOLKES	NN	Volk
<TERM>
URTEIL	NN	Urteil
</TERM>


In [None]:
# read / generate all the necessary texts
# first define functions, then download files and read them into dictionaries...

# further versions of the Gold Standard: 
# Annotated and PoS-tagged Gold Standard -- for extraction of the correct evaluation set

# printing words of different length
# FOutput = open('BGH0_s01GoldS_Terms.txt', 'w')
# FOutputDict1w = open('BGH0_s01GoldS_D1w.txt', 'w') # 1-word terms
# FOutputDict2w = open('BGH0_s01GoldS_D2w.txt', 'w') # 2-word terminological expressions
# FOutputDict3w = open('BGH0_s01GoldS_D3w.txt', 'w') # 3-word terminological expressions
# FOutputDict4w = open('BGH0_s01GoldS_D4w.txt', 'w') # more than 3 words


In [None]:
# reading datasets
# list of gold-standard annotated terms, with lemmatization and pos fields
# (the file is opened with a context manager, so it is closed once the annotation has been collected)
with open('BGH0_s00GS_LEM.txt', 'r') as FInBGH0_s00GS_LEM:
    L3AnnotatedSegs = vertCollectAnnotation(FInBGH0_s00GS_LEM, 'TERM', Caps = False)
# FOutBGH0_s00GS_Terms = open('BGH0_s00GS_Terms.txt', 'w')

# testing the file read
# for LSegment in L3AnnotatedSegs: FOutBGH0_s00GS_Terms.write(str(LSegment) + '\n')
# FOutBGH0_s00GS_Terms.flush()

In [None]:
print(len(L3AnnotatedSegs))
L3AnnotatedSegs[10]

16498


[['OBERSTAATSANWALT', 'NN', 'OBERSTAATSANWALT'],
 ['BEIM', 'APPRART', 'BEI'],
 ['BUNDESGERICHTSHOF', 'NN', 'BUNDESGERICHTSHOF']]

In [None]:
changeCaseL3(L3AnnotatedSegs, LFlds2Caps = [0, 2], Mode='upper')


In [None]:
# create a dictionary of PoS patterns in the gold standard annotation
# (field 1 of each annotated token is the PoS tag; fields 0 and 2 hold the word form and the lemma)
DPatternsFrq = createDictOfPatterns(L3AnnotatedSegs, 1, Normalize = 1)
FOutTermPOS = open('BGH0_s00GoldStandard_pos.txt', 'w')

# printDictionary writes one 'key<TAB>value' line per entry, sorted by value in descending order,
# and flushes the file; the file is not closed in this cell
printDictionary(DPatternsFrq, FOutTermPOS)

In [None]:
# remove the gold-standard term dictionaries of a previous run, if there are any
# only an undefined name is expected here, so only NameError is caught;
# note that the deletion stops at the first name which is not defined
try:
    del DTermConfFrq
    del DGS1w
    del DGS2w
    del DGS3w
    del DGS4w
except NameError:
    print('dictionaries not defined yet...')

dictionaries not defined yet...


In [None]:
# selecting terms from the gold standard, which fit the description
# DTermConfFrq = selectTerms(L3AnnotatedSegs, L2Patterns = [['N', '\$']], SplitLen = False, IFldNumber = 0)
# FOutExamples = open('BGH0_s00GoldStandard_examples_ADJ_N.txt', 'w')

# DTermConfFrq, DGS1w, DGS2w, DGS3w, DGS4w = selectTerms(L3AnnotatedSegs, L2Patterns = None, LNoEdge = None, LNoStart = None, L2NoEnd = None, SplitLen = True, IFldNumber = 0)
# DTermConfFrq, DGS1w, DGS2w, DGS3w, DGS4w = selectTerms(L3AnnotatedSegs, L2Patterns = [['N'], ['ADJ', 'N'], ['N', 'N']], LNoEdge = None, LNoStart = None, L2NoEnd = None, SplitLen = True, IFldNumber = 0)
# removing the pattern: ['APPR', 'ART','N'],
# ['APPR', 'ART','N'], 
DTermConfFrq, DGS1w, DGS2w, DGS3w, DGS4w, L3SelectedTerms = selectTerms(L3AnnotatedSegs, L2Patterns = [['N'], ['ADJ'], ['ADJ', 'N'], ['N', 'N'], ['N', 'ART', 'N'], ['N', 'APPR', 'N'], ['ADJ', 'ADJ', 'N'], ['ADJ', 'N', 'N'], ['N', 'N', 'N'], ['N', 'ADJ', 'N'], ['N', 'APPR', 'ART', 'N'], ['N', 'ART', 'ADJ', 'N']], LNoEdge = None, LNoStart = None, L2NoEnd = None, SplitLen = True, IFldNumber = 2)


# output files for the selected gold-standard terms: one file for all terms,
# and one file for each of the dictionaries DGS1w .. DGS4w returned by selectTerms
# (presumably terms of 1, 2, 3 and more than 3 words -- TODO confirm against selectTerms)
FOutExamples = open('BGH0_s00GoldStandard_examples.txt', 'w')
FOutExamples1w = open('BGH0_s00GoldStandard1w_examples.txt', 'w')
FOutExamples2w = open('BGH0_s00GoldStandard2w_examples.txt', 'w')
FOutExamples3w = open('BGH0_s00GoldStandard3w_examples.txt', 'w')
FOutExamples4w = open('BGH0_s00GoldStandard4w_examples.txt', 'w')

# each dictionary is written as 'term<TAB>value' lines, sorted by value in descending order;
# printDictionary flushes the file, the files are not closed in this cell
printDictionary(DTermConfFrq, FOutExamples)
printDictionary(DGS1w, FOutExamples1w)
printDictionary(DGS2w, FOutExamples2w)
printDictionary(DGS3w, FOutExamples3w)
printDictionary(DGS4w, FOutExamples4w)


14698 297 280 9 15284
2285 177 105 9 2576


In [None]:
# print(len(DTermConfFrq))

In [None]:
# Stage 2: preparing keyness dictionary
!wget https://heibox.uni-heidelberg.de/f/aa4560e627bd4b1d8055/?dl=1
!mv index.html?dl=1 TK_KW_Verif_V02.csv

!wget https://heibox.uni-heidelberg.de/f/a83ba95576a244a59966/?dl=1
!mv index.html?dl=1 KW_BGH_10000.tsv


--2021-11-04 14:21:22--  https://heibox.uni-heidelberg.de/f/aa4560e627bd4b1d8055/?dl=1
Resolving heibox.uni-heidelberg.de (heibox.uni-heidelberg.de)... 129.206.7.113
Connecting to heibox.uni-heidelberg.de (heibox.uni-heidelberg.de)|129.206.7.113|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://heibox.uni-heidelberg.de/seafhttp/files/8e20ed8e-f61b-4a0f-9664-1b1c818070a4/TK_KW_Verif_V02.csv [following]
--2021-11-04 14:21:23--  https://heibox.uni-heidelberg.de/seafhttp/files/8e20ed8e-f61b-4a0f-9664-1b1c818070a4/TK_KW_Verif_V02.csv
Reusing existing connection to heibox.uni-heidelberg.de:443.
HTTP request sent, awaiting response... 200 OK
Length: 38812 (38K) [application/octet-stream]
Saving to: ‘index.html?dl=1’


2021-11-04 14:21:23 (145 KB/s) - ‘index.html?dl=1’ saved [38812/38812]

--2021-11-04 14:21:23--  https://heibox.uni-heidelberg.de/f/a83ba95576a244a59966/?dl=1
Resolving heibox.uni-heidelberg.de (heibox.uni-heidelberg.de)... 129.206.7.113
Conn

In [None]:
# Preparing a dictionary of keyness weights, checking the 'approval' status
# the files are only opened here; they are read in the following cells
FInputKW = open('TK_KW_Verif_V02.csv', 'r') # annotated keyword list, passed to readDictKWAnnotations
FInputKWLarge = open('KW_BGH_10000.tsv', 'r') # for experiments with Precision / Recall
FOutputKW = open('TK_KW_Verif_V02.txt', 'w')

In [None]:
# remove the keyness dictionaries of a previous run, if there are any
# only an undefined name is expected here, so only NameError is caught;
# note that the deletion stops at the first name which is not defined
try:
    del DScoresKW
    del DScoresNK
    del DStatKW
    del DScoresKWLarge
    del DScoresREKWquick
except NameError:
    print('dictionaries not defined')

dictionaries not defined


In [None]:
# reading an annotated dictionary: keyword or not: 1, 0.5, 0 (yes, unsure, no)
DScoresKW, DScoresNK = readDictKWAnnotations(FInputKW, AKStatThreshold = 1)

In [None]:
# spot-check single entries of the annotated keyword dictionaries; each lookup is tried separately,
# and only a missing key is reported as 'keys not found' (other errors are not hidden)
# the last four words come from the expressions
#     'JURISTISCHE PERSON'
#     'RECHTLICHER NACHPRÜFUNG'
for DScores, SKey in (
        (DScoresKW, 'ANGEKLAGTE'),
        (DScoresNK, 'VGL'),
        (DScoresNK, 'NACHPRÜFUNG'),
        (DScoresNK, 'RECHTLICH'),
        (DScoresNK, 'JURISTISCH'),
        (DScoresNK, 'PERSON'),
    ):
    try:
        print(DScores[SKey])
    except KeyError:
        print('keys not found')

317698.5625
345.6272888184
3406.630859375
keys not found
keys not found
keys not found


In [None]:
# reading the large keyword dictionary
DScoresKWLarge = readDictionary(FInputKWLarge, Caps=True)

In [None]:
# spot-check two entries of the large keyword dictionary
# only a missing key is reported as 'keys not found'; other errors are not hidden
try:
    print(DScoresKWLarge['ANGEKLAGTE'])
    print(DScoresKWLarge['VGL'])
except KeyError:
    print('keys not found')

317698.56
258061.06


In [None]:
# Stage 3.1 combine extracted words with keyness (e.g., filter by keyness, etc.)


In [None]:
# remove the keyness-filtered term dictionaries of a previous run, if there are any
# only an undefined name is expected here, so only NameError is caught;
# note that the deletion stops at the first name which is not defined
try:
    del DAutoTerms01KW_H1w
    del DAutoTerms01KW_HMWE
except NameError:
    print('dictionaries not defined yet...')

dictionaries not defined yet...


In [None]:
# keep only those automatically extracted terms which have at least one word (Req = 1) in the keyness
# dictionary and a combined keyness above 1; Mode is left at its default value ('prod')
# NOTE(review): the dictionary passed here is the large keyword list DScoresKWLarge, not the manually
# annotated DScoresKW -- confirm that the '_H' in the variable names is still accurate
DAutoTerms01KW_H1w = filterDictByKWDict(DAutoTerms1w, DScoresKWLarge, Threshold = 1, Req = 1) # autoterms filtered by the large keyness dictionary
DAutoTerms01KW_HMWE = filterDictByKWDict(DAutoTermsMWE, DScoresKWLarge, Threshold = 1, Req = 1)


In [None]:
# spot-check entries of the keyness-filtered dictionaries; the printing stops at the first missing key
# only a missing key is reported as 'keys not found'; other errors are not hidden
try:
    print(DAutoTerms01KW_H1w['ANGEKLAGTE'])
    print(DAutoTerms01KW_H1w['VGL'])

    print(DAutoTerms01KW_HMWE['LANDGERICHT AUGSBURG'])
    print(DAutoTerms01KW_HMWE['VORSITZENDER RICHTER'])
    print(DAutoTerms01KW_HMWE['RECHTLICHER NACHPRÜFUNG'])
    print(DAutoTerms01KW_HMWE['JURISTISCHER PERSON'])

    # 'RECHTLICHER NACHPRÜFUNG'
except KeyError:
    print('keys not found')

# JURISTISCHE PERSON >> lemmas or text forms in keywords???

# print statistics
print(len(DAutoTerms1w))
print(len(DAutoTermsMWE))

print(len(DScoresKWLarge))

print(len(DAutoTerms01KW_H1w))
print(len(DAutoTerms01KW_HMWE))

317698.56
258061.06
208549.09
105619.59
24163.04
keys not found
466669
2434043
9979
8247
1117210
