In [1]:
# GENERIC FIRST CELL FOR DEVELOPING A NEW METHODOLOGY

# Define an expressive name for the methodology, that will be used to create the output directory
# The complete path to the output directory will be provided in OUT_DIR
METHODOLOGY_NAME='poslog/'

# Set to number of directories below project root if the notebook is in a subdirectory of the project, so you can use relative paths
SUBDIR_LEVEL = 1
if SUBDIR_LEVEL>0:
    import sys 
    # Relative path from the current working directory up to the project root;
    # putting it on sys.path lets the notebook import modules that live there
    new_path = '../'*SUBDIR_LEVEL
    if new_path not in sys.path:
        sys.path.append(new_path)

import os
OUT_DIR = os.path.relpath(os.path.join(os.getcwd(), '../'*SUBDIR_LEVEL, 'out', METHODOLOGY_NAME))
# exist_ok=True replaces the former exists()/makedirs() pair: the cell stays re-runnable
# and there is no window between the check and the creation in which another process could create the directory
os.makedirs(OUT_DIR, exist_ok=True)
print(f"Current output directory: '{OUT_DIR}'")

%load_ext autoreload
%autoreload 2

Current output directory: '../out/poslog'


In [2]:
# Preconditions: this notebook reads a CSV from OUT_DIR that another notebook has to create first
INPUT_FILE=os.path.join(OUT_DIR, '2_examples_tagged_upos.csv')

# Fail early and name the notebook that produces the missing input
input_available=os.path.exists(INPUT_FILE)
if not input_available:
    precon='2_pos_tagging_create.ipynb'
    raise FileNotFoundError(f"File '{INPUT_FILE}' not found. Run '{precon}' first.")

# Destination of the frame written at the end of this notebook
OUTPUT_FILE=os.path.join(OUT_DIR, '2-2_examples_tagged_upos_majority.csv')
print(f'Output file: {OUTPUT_FILE}')

Output file: ../out/poslog/2-2_examples_tagged_upos_majority.csv


In [3]:
import ast

import pandas as pd

# These columns are parsed from their textual form in the CSV back into Python objects.
# ast.literal_eval only accepts literals (lists, strings, numbers, None, ...), whereas the
# previously used eval would execute any expression that happens to be stored in the file.
LIST_COLUMNS=['nltk', 'stanza', 'spacy', 'hanta', 'treetagger', 'Tokens']
example_df=pd.read_csv(INPUT_FILE, converters={column: ast.literal_eval for column in LIST_COLUMNS})
example_df.head(1)

Unnamed: 0,Dataset,Line,Example,Template,ClusterId,Tokens,nltk,stanza,spacy,hanta,treetagger
0,HDFS,1549,Receiving block blk_5614249702379360530 src: /...,Receiving block <*> src: <*> dest: <*>,0,"[Receiving, block, blk_5614249702379360530, sr...","[VERB, NOUN, NOUN, NOUN, PUNCT, ADJ, NOUN, PUN...","[VERB, NOUN, NOUN, NOUN, PUNCT, NUM, NOUN, PUN...","[VERB, NOUN, NUM, NOUN, PUNCT, PUNCT, INTJ, PU...","[VERB, NOUN, NOUN, NOUN, PUNCT, NUM, NOUN, PUN...","[VERB, NOUN, NOUN, NOUN, PUNCT, NUM, NOUN, PUN..."


In [4]:
# Tagger result columns are recognised by their all-lower-case column names
taggers=[]
for column_name in example_df.columns:
    if column_name.islower():
        taggers.append(column_name)
taggers

['nltk', 'stanza', 'spacy', 'hanta', 'treetagger']

# Compare PoS Tags

Each row of the df holds one column per tagger with that tagger's tags (all tag lists of a row must be of equal length, otherwise the row is sorted out):

```
nltk:   [VERB, NOUN, NOUN]
stanza: [VERB, JJ, ADJ]
hanta:  [VERB, NOUN, VERB]
```

For each row in df calculate the majority, the confidence (of the majority) and the minority.
This is done position by position along the length of the row (i.e. per index of the word in the list).

If there is no majority, then majority=None and confidence=0 (called parity).

Note that majority does not mean 'absolute majority', therefore you have to take confidence into account.

```
majority:   [VERB, NOUN,         None],                                  #list[str]
confidence: [1.,   0.66,         0]                                      #list[float]
minority:   [{},   {stanza: JJ}, {nltk: NOUN, stanza: ADJ, hanta: VERB}] #list[dict[str,str]]
```

Hold as list with same index as df row.

```
[
    { # #0 corresponds to the first row in the df
        majority:   [VERB, NOUN,         None],
        confidence: [1.,   0.66,         0],
        minority:   [{},   {stanza: JJ}, {nltk: NOUN, stanza: ADJ, hanta: VERB}]
    },
    { # #1 corresponds to the second row in the df
        majority:   [...],
        confidence: [...],
        minority:   [...]
    },
    ...
]
```

Define dataclass:

```
@dataclass
class TagComparison:
    majority:   list[str|None]
    confidence: list[float]
    minority:   list[dict[str, str]]
```

In [None]:
from util.pos import TagComparison, make_tag_comparison, print_stats_on_tag_comparison
# One dict per DataFrame row, keyed by tagger column name
tagger_records=example_df[list(taggers)].to_dict(orient='records')
# Turn every row's tagger results into one TagComparison
tag_comparisons:list[TagComparison]=list(map(make_tag_comparison, tagger_records))
print_stats_on_tag_comparison(tag_comparisons)


## Tag Correction

In [None]:
from poslog.words import KnownWordsDetector, WordKind, RegexTokenClassMatcher, TokenClass

# Used below via matcher.token_class(token) to obtain the TokenClass of a token
matcher=RegexTokenClassMatcher()
# Used below via known_words_det.kind_of_known_word(token) to obtain the WordKind of a token
known_words_det=KnownWordsDetector()

from collections import Counter
# Counts how often each correction rule fired; the correction cells below all add to this one counter
correction_stats=Counter()

def _set_tag(tag_comparison:TagComparison, word_i:int, new_tag:str):
    """Force the tag of one word and mark that position as manually overridden.

    Args:
        tag_comparison: comparison of one log line; modified in place.
        word_i: index of the word whose tag is replaced.
        new_tag: tag written into ``majority[word_i]``.
    """
    # 1.1 lies above any confidence checked in this notebook and thereby marks the entry as an override
    overridden_confidence=1.1
    tag_comparison.majority[word_i]=new_tag
    tag_comparison.confidence[word_i]=overridden_confidence
    # An overridden position no longer carries dissenting tagger votes
    tag_comparison.minority[word_i]={}

def override_tagging_errors(tokens:list[str], tag_comparison:TagComparison)->TagComparison:
    """Override majority tags of tokens whose surface form already determines the tag.

    Per token the rules are tried in a fixed order and at most one of them fires:

    1. key-value pair             -> 'X' (applied regardless of confidence)
    2. confidence above 0.8       -> token is left untouched
    3. known word of kind NUMBER  -> 'NUM'
    4. punctuation                -> 'PUNCT'
    5. symbol other than '-'      -> 'SYM'
    6. number, identifier or date/time (by regex) -> 'NUM'
    7. unknown word without a confidence above 0.5 that is a location or variable -> 'PROPN'

    Every override is written through ``_set_tag`` and counted in ``correction_stats``.

    Args:
        tokens: tokens of one log line.
        tag_comparison: comparison belonging to these tokens; modified in place.

    Returns:
        The same ``tag_comparison`` object that was passed in.
    """
    def apply_rule(word_i:int, new_tag:str, rule_name:str)->None:
        # Write the override and record which rule caused it
        _set_tag(tag_comparison, word_i, new_tag)
        correction_stats[rule_name]+=1

    for position,word in enumerate(tokens):
        regex_class=matcher.token_class(word)

        # Checked _before_ the confidence test, since key value pairs are often tagged
        # as NOUN or PROPN with 60% confidence
        if regex_class==TokenClass.KEY_VALUE_PAIR:
            apply_rule(position, 'X', 'key value pair')
            continue

        # Leave the token alone if the confidence is above 80%
        if tag_comparison.confidence[position]>.8:
            continue

        known_kind=known_words_det.kind_of_known_word(word)

        if known_kind==WordKind.NUMBER:
            apply_rule(position, 'NUM', 'number cast')
        elif regex_class==TokenClass.PUNCTUATION:
            apply_rule(position, 'PUNCT', 'punctuation')
        elif regex_class==TokenClass.SYMBOL and word!='-':
            # the dash is skipped for now, since it is often used as a separator
            apply_rule(position, 'SYM', 'symbol')
        elif regex_class in (TokenClass.NUMBER, TokenClass.IDENTIFIER, TokenClass.DATE_TIME):
            apply_rule(position, 'NUM', 'number regex')
        elif (
            # Thesis: all known words cannot be PROPN, so only unknown words are considered ...
            known_kind==WordKind.UNKNOWN
            # ... and only if the taggers did not reach a confidence above 50%
            and not tag_comparison.confidence[position]>.5
            and regex_class in [TokenClass.LOCATION, TokenClass.VARIABLE]
        ):
            apply_rule(position, 'PROPN', 'location or variable')

    return tag_comparison

# Token lists of all rows, index-aligned with tag_comparisons
examples_splitted=example_df['Tokens'].to_list()
# The comparisons are corrected in place; the function hands the same objects back
tag_comparisons=list(map(override_tagging_errors, examples_splitted, tag_comparisons))
print_stats_on_tag_comparison(tag_comparisons)


For whole log lines (5548 in total):
Majority found: 4985 times (89.85%)
- Clear majority: 392 times (7.07%)
- Eighty percent: 1358 times (24.48%)
- Absolute majority: 4819 times (86.86%)
Parity (None in majority): 563 times (10.15%)

For words (64542 in total):
Majority found: 63853 times (98.93%)
- Clear majority: 48453 times (75.07%)
- Eighty percent: 55552 times (86.07%)
- Absolute majority: 63635 times (98.59%)
Parity (None in words): 689 times (1.07%)


### Corrections with knowledge of comparison

In [7]:
from collections import Counter
import re

# The patterns are compiled once here instead of once per token inside the loop.

# VARIABLES
# camelCase words: at least one uppercase letter in the middle, optional leading underscores and trailing digits
_CAMEL_CASE=re.compile(r'_*[a-zA-Z]+[a-z]([A-Z][a-z]+)+\d*')
# snake_case (underscore separated) words
_SNAKE_CASE=re.compile(r'_*[a-zA-Z]+(_[a-zA-Z]+)+\d*')
# kebab-case (dash separated) words
_KEBAB_CASE=re.compile(r'[a-zA-Z]+(-[a-zA-Z]+)+\d*')
#TODO: Too much?
# at least one letter (or underscore, slash, dash) directly next to at least one digit
_WORD_DIGIT_MIX=re.compile(r'(.*[a-zA-Z_\-/]+[0-9]+.*)|(.*[0-9]+[a-zA-Z_\-/]+.*)')
_VARIABLE_PATTERNS=(_CAMEL_CASE, _SNAKE_CASE, _KEBAB_CASE, _WORD_DIGIT_MIX)

# PATHS & URLS
# at least one dot, slash or backslash separated segment, with an optional prefix and colon
_PATH_OR_URL=re.compile(r'\w*:?([\.\/\\]+[\w\-:]+)+')


def correct_individual_cases(tokens:list[str], tag_comparison:TagComparison)->TagComparison:
    """Correct tags with the help of the neighbouring tags and the tagger votes.

    Rules, tried per token in this order (the first one that fires ends the handling of the token):

    * 'to' with a confidence below 0.8 that is followed by a token not tagged VERB -> 'ADP'
    * only for tokens with a confidence above 0.5:
        * a known word tagged PROPN gets the most common minority tag (if there is one)
        * NOUN/ADJ/VERB tokens that look like a variable, path or URL -> 'PROPN'
        * '-' -> 'SYM' next to a SYM or between two NUM, otherwise 'PUNCT'
        * '<' and '>' -> 'PUNCT' if the line contains equally many of both

    Every override is written through ``_set_tag`` and counted in ``correction_stats``.

    Args:
        tokens: tokens of one log line.
        tag_comparison: comparison belonging to these tokens (expected to be
            index-aligned with ``tokens``); modified in place.

    Returns:
        The same ``tag_comparison`` object that was passed in.
    """
    for i,token in enumerate(tokens):

        # 'to' is ADP if it's not followed by a verb (not PART)
        if token.lower()=='to':
            if tag_comparison.confidence[i]>=.8:
                continue
            # a trailing 'to' has no successor to inspect and falls through to the rules below
            if i+1<len(tokens) and tag_comparison.majority[i+1]!='VERB':
                _set_tag(tag_comparison, i, 'ADP')
                correction_stats['to']+=1
                continue

        # All remaining rules require that a majority could be found
        if not tag_comparison.confidence[i]>.5:
            continue

        majority_tag=tag_comparison.majority[i]

        # Only unknown words can be PROPN
        if majority_tag=='PROPN':
            if known_words_det.kind_of_known_word(token)!=WordKind.UNKNOWN:
                # if it's known, it's not PROPN: fall back to the most frequent dissenting tag
                minority_votes=Counter(tag_comparison.minority[i].values())
                if minority_votes:
                    _set_tag(tag_comparison, i, minority_votes.most_common(1)[0][0])
                    correction_stats['known propn to most common']+=1
                    continue

        # if majority says NOUN, ADJ or VERB, but it's a typical variable (camelCase, kebab-case, snake_case), path or URL
        if majority_tag in ['NOUN', 'ADJ', 'VERB']:
            if any(pattern.fullmatch(token) for pattern in _VARIABLE_PATTERNS):
                _set_tag(tag_comparison, i, 'PROPN')
                correction_stats['variable']+=1
                continue

            if _PATH_OR_URL.fullmatch(token):
                _set_tag(tag_comparison, i, 'PROPN')
                correction_stats['path or url']+=1
                continue

        if token=='-':
            # NOTE(review): this rule only runs for dashes with a confidence above 0.5 - confirm that this is intended
            left=tag_comparison.majority[i-1] if i>0 else None
            right=tag_comparison.majority[i+1] if i+1<len(tokens) else None
            if left=='SYM' or right=='SYM' or (left=='NUM' and right=='NUM'):
                _set_tag(tag_comparison, i, 'SYM')
                correction_stats['dash as sym']+=1
            else:
                _set_tag(tag_comparison, i, 'PUNCT')
                correction_stats['dash as punctuation']+=1
            continue

        if token in ['<','>']:
            # balanced angle brackets are treated as punctuation; the tag is left as it is otherwise
            if tokens.count('<')==tokens.count('>'):
                _set_tag(tag_comparison, i, 'PUNCT')
                correction_stats['angle brackets']+=1
                continue

    return tag_comparison

# Token lists of all rows, index-aligned with tag_comparisons
examples_splitted=example_df['Tokens'].to_list()
# The comparisons are corrected in place; the function hands the same objects back
tag_comparisons=list(map(correct_individual_cases, examples_splitted, tag_comparisons))
print_stats_on_tag_comparison(tag_comparisons)



For whole log lines (5548 in total):
Majority found: 4988 times (89.91%)
- Clear majority: 1041 times (18.76%)
- Eighty percent: 2377 times (42.84%)
- Absolute majority: 4822 times (86.91%)
Parity (None in majority): 560 times (10.09%)

For words (64542 in total):
Majority found: 63856 times (98.94%)
- Clear majority: 53418 times (82.76%)
- Eighty percent: 58913 times (91.28%)
- Absolute majority: 63638 times (98.60%)
Parity (None in words): 686 times (1.06%)


In [None]:
def correct_individual_cases_2(tokens:list[str], tag_comparison:TagComparison)->TagComparison:
    """Apply word-specific corrections for a handful of English function words.

    * 'not' -> 'ADV'; 'none', 'this', 'there' -> 'PRON' (only if the confidence is below 0.8)
    * 'got' directly followed by a VERB -> 'AUX'
    * 'about' followed by a PART and then a VERB ("about to <verb>") -> 'ADV'

    Words are compared case-insensitively. Every override is written through
    ``_set_tag`` and counted in ``correction_stats``.

    Args:
        tokens: tokens of one log line.
        tag_comparison: comparison belonging to these tokens; modified in place.

    Returns:
        The same ``tag_comparison`` object that was passed in.
    """
    # Fixed tag per word; the statistics key of these rules is the lower-cased word itself
    fixed_tags={'not': 'ADV', 'none': 'PRON', 'this': 'PRON', 'there': 'PRON'}
    token_count=len(tokens)

    for position,word in enumerate(tokens):
        lowered=word.lower()

        if tag_comparison.confidence[position]<.8 and lowered in fixed_tags:
            _set_tag(tag_comparison, position, fixed_tags[lowered])
            correction_stats[lowered]+=1
            continue

        # Majority tags of the next two tokens (None beyond the end of the line)
        following=tag_comparison.majority[position+1] if position+1<token_count else None
        second_following=tag_comparison.majority[position+2] if position+2<token_count else None

        if lowered=='got' and following=='VERB':
            _set_tag(tag_comparison, position, 'AUX')
            correction_stats['got']+=1
        elif lowered=='about' and following=='PART' and second_following=='VERB':
            # 'to' was corrected already
            _set_tag(tag_comparison, position, 'ADV')
            correction_stats['about to']+=1

    return tag_comparison

# Token lists of all rows, index-aligned with tag_comparisons
examples_splitted=example_df['Tokens'].to_list()
# The comparisons are corrected in place; the function hands the same objects back
tag_comparisons=list(map(correct_individual_cases_2, examples_splitted, tag_comparisons))
print_stats_on_tag_comparison(tag_comparisons)



For whole log lines (5548 in total):
Majority found: 4990 times (89.94%)
- Clear majority: 1068 times (19.25%)
- Eighty percent: 2559 times (46.12%)
- Absolute majority: 4824 times (86.95%)
Parity (None in majority): 558 times (10.06%)

For words (64542 in total):
Majority found: 63860 times (98.94%)
- Clear majority: 53801 times (83.36%)
- Eighty percent: 59295 times (91.87%)
- Absolute majority: 63642 times (98.61%)
Parity (None in words): 682 times (1.06%)


In [9]:
# Display how often each correction rule fired, accumulated over all correction cells above
correction_stats

Counter({'key value pair': 3976,
         'variable': 2916,
         'symbol': 2484,
         'location or variable': 1827,
         'number regex': 1345,
         'known propn to most common': 1092,
         'path or url': 885,
         'angle brackets': 648,
         'number cast': 444,
         'punctuation': 411,
         'not': 353,
         'to': 254,
         'dash as punctuation': 212,
         'dash as sym': 53,
         'this': 17,
         'there': 7,
         'about to': 4,
         'got': 2})

## Build Majority

In [None]:
from poslog.words import KnownWordsDetector, WordKind
kwdet=KnownWordsDetector()

def _accepted_majority_tag(tag_comparison, line_i, word_i):
    """Return the majority tag of one word if it is trusted, otherwise None.

    A tag is trusted if
    * its confidence is at least 0.8, or
    * its confidence is above 0.5 and either the word is unknown to ``kwdet``, or the
      tagger votes consist of exactly the two tags NOUN and PROPN.
    """
    certainty=tag_comparison.confidence[word_i]
    if certainty>=.8:
        return tag_comparison.majority[word_i]
    if not certainty>.5:
        return None

    voted_tag=tag_comparison.majority[word_i]
    if kwdet.kind_of_known_word(examples_splitted[line_i][word_i])==WordKind.UNKNOWN:
        return voted_tag

    # if only NOUN and PROPN, then go for absolute majority
    if voted_tag in ['NOUN', 'PROPN']:
        seen_tags=set(tag_comparison.minority[word_i].values())
        seen_tags.add(voted_tag)
        if seen_tags=={'PROPN', 'NOUN'}:
            return voted_tag
    return None

# One list of accepted tags (or None) per log line, index-aligned with tag_comparisons
majorities=[]
for j,tc in enumerate(tag_comparisons):
    majorities.append([_accepted_majority_tag(tc, j, word_i) for word_i in range(len(tc.majority))])

In [11]:
# Count the tokens that are left without an accepted tag and split the lines into
# fully tagged ones and ones that contain at least one None ("ragged")
nones_sum=0
nones_in_lines=0
token_count=0
full_tagged_indices=[]
ragged_tagged_indices=[]
# enumerate yields the real position of each line. majorities.index(majority) would return
# the position of the first *equal* list instead, which is wrong as soon as two lines share
# the same tag sequence (and costs a linear search per line on top).
for line_index,majority in enumerate(majorities):
    nones=majority.count(None)
    token_count+=len(majority)
    nones_sum+=nones
    if nones>0:
        nones_in_lines+=1
        ragged_tagged_indices.append(line_index)
    else:
        full_tagged_indices.append(line_index)

print(f"None count: {nones_sum} of {token_count} tokens ({nones_sum/token_count:.2%})")
print(f"Lines with None: {nones_in_lines} of {len(majorities)} ({nones_in_lines/len(majorities):.2%})")
print(f"Lines with full tagging: {len(full_tagged_indices)} of {len(majorities)} ({len(full_tagged_indices)/len(majorities):.2%})")


None count: 2511 of 64542 tokens (3.89%)
Lines with None: 1789 of 5548 (32.25%)
Lines with full tagging: 3759 of 5548 (67.75%)


In [12]:
# Attach the results as new columns (both lists are index-aligned with the rows of example_df)
for column_name,column_values in (('TagComparison', tag_comparisons), ('Majority', majorities)):
    example_df[column_name]=column_values
# Persist the extended frame without writing the DataFrame index
example_df.to_csv(OUTPUT_FILE, index=False)
example_df.head()

Unnamed: 0,Dataset,Line,Example,Template,ClusterId,Tokens,nltk,stanza,spacy,hanta,treetagger,TagComparison,Majority
0,HDFS,1549,Receiving block blk_5614249702379360530 src: /...,Receiving block <*> src: <*> dest: <*>,0,"[Receiving, block, blk_5614249702379360530, sr...","[VERB, NOUN, NOUN, NOUN, PUNCT, ADJ, NOUN, PUN...","[VERB, NOUN, NOUN, NOUN, PUNCT, NUM, NOUN, PUN...","[VERB, NOUN, NUM, NOUN, PUNCT, PUNCT, INTJ, PU...","[VERB, NOUN, NOUN, NOUN, PUNCT, NUM, NOUN, PUN...","[VERB, NOUN, NOUN, NOUN, PUNCT, NUM, NOUN, PUN...","TagComparison(majority=['VERB', 'NOUN', 'PROPN...","[VERB, NOUN, PROPN, NOUN, PUNCT, NUM, NOUN, PU..."
1,HDFS,2847,BLOCK* NameSystem.addStoredBlock: blockMap upd...,BLOCK* NameSystem.addStoredBlock: blockMap upd...,1,"[BLOCK, *, NameSystem.addStoredBlock, ., block...","[PROPN, PROPN, PROPN, PUNCT, NOUN, VERB, PUNCT...","[NOUN, PUNCT, PROPN, PUNCT, NOUN, VERB, PUNCT,...","[PROPN, PUNCT, PROPN, PUNCT, AUX, VERB, PUNCT,...","[NOUN, NUM, NOUN, PUNCT, NOUN, VERB, PUNCT, NO...","[NOUN, SYM, PROPN, PUNCT, NOUN, VERB, PUNCT, N...","TagComparison(majority=['NOUN', 'SYM', 'PROPN'...","[NOUN, SYM, PROPN, PUNCT, PROPN, VERB, PUNCT, ..."
2,HDFS,3092,BLOCK* NameSystem.allocateBlock: /user/root/ra...,BLOCK* NameSystem.allocateBlock: <*>,2,"[BLOCK, *, NameSystem.allocateBlock, ., /user/...","[PROPN, PROPN, PROPN, PUNCT, NOUN, PUNCT, NOUN]","[NOUN, PUNCT, PROPN, PUNCT, PROPN, PUNCT, PROPN]","[PROPN, PUNCT, PROPN, PUNCT, INTJ, PUNCT, NUM]","[NOUN, NUM, NOUN, PUNCT, NOUN, PUNCT, NOUN]","[NOUN, SYM, PROPN, PUNCT, NOUN, PUNCT, NOUN]","TagComparison(majority=['NOUN', 'SYM', 'PROPN'...","[NOUN, SYM, PROPN, PUNCT, PROPN, PUNCT, PROPN]"
3,HDFS,4819,Received block blk_-6232712486646639079 of siz...,Received block <*> of size <*> from <*>,3,"[Received, block, blk_-6232712486646639079, of...","[VERB, ADJ, NOUN, ADP, NOUN, NUM, ADP, NOUN]","[VERB, NOUN, PROPN, ADP, NOUN, NUM, ADP, NUM]","[VERB, NOUN, NOUN, ADP, NOUN, NUM, ADP, PUNCT]","[VERB, NOUN, NOUN, ADP, NOUN, NUM, ADP, NUM]","[VERB, NOUN, NOUN, ADP, NOUN, NUM, ADP, NUM]","TagComparison(majority=['VERB', 'NOUN', 'PROPN...","[VERB, NOUN, PROPN, ADP, NOUN, NUM, ADP, NUM]"
4,HDFS,9108,PacketResponder 1 for block blk_-6877771159587...,PacketResponder <*> for block <*> terminating,4,"[PacketResponder, 1, for, block, blk_-68777711...","[NOUN, NUM, ADP, NOUN, NOUN, NOUN]","[PROPN, NUM, ADP, NOUN, PROPN, NOUN]","[NOUN, NUM, ADP, NOUN, NOUN, VERB]","[ADJ, NUM, ADP, NOUN, NOUN, VERB]","[PROPN, NUM, ADP, NOUN, NOUN, VERB]","TagComparison(majority=['PROPN', 'NUM', 'ADP',...","[PROPN, NUM, ADP, NOUN, PROPN, None]"
