In [30]:
# GENERIC FIRST CELL FOR DEVELOPING A NEW METHODOLOGY

# Define an expressive name for the methodology, that will be used to create the output directory
# The complete path to the output directory will be provided in OUT_DIR
METHODOLOGY_NAME='pos_log/'

import os
OUT_DIR = os.path.relpath(os.path.join(os.getcwd(), 'out', METHODOLOGY_NAME))
if not os.path.exists(OUT_DIR):
        os.makedirs(OUT_DIR)
print(f"Current output directory: '{OUT_DIR}'")

%load_ext autoreload
%autoreload 2

Current output directory: 'out/pos_log'
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [31]:
# Precons
INPUT_FILE=os.path.join(OUT_DIR, '..','numb_var','examples_10000_each_seed-42_numb_var.csv')

# if file does not exist, throw an error
if not os.path.exists(INPUT_FILE):
    precon='1_templates_collect_numb_var.ipynb'
    raise FileNotFoundError(f"File '{INPUT_FILE}' not found. Run '{precon}' first.")

OUTPUT_FILE=os.path.join(OUT_DIR, 'examples_tagged_upos.csv')
print(f'Output file: {OUTPUT_FILE}')

RANDOM_SAMPLE_SIZE=100
RANDOM_SAMPLE_SEED=42
OUTPUT_FILE_RANDOM_SAMPLE=os.path.join(OUT_DIR, f'examples_tagged_{RANDOM_SAMPLE_SIZE}_random_sample_seed-{RANDOM_SAMPLE_SEED}_upos.csv')
print(f'Output file: {OUTPUT_FILE_RANDOM_SAMPLE}')

Output file: out/pos_log/examples_tagged_upos.csv
Output file: out/pos_log/examples_tagged_100_random_sample_seed-42_upos.csv


In [32]:
import pandas as pd
# columns: Dataset,Line,Example,Template,ClusterId
example_df=pd.read_csv(INPUT_FILE)
example_df

Unnamed: 0,Dataset,Line,Example,Template,ClusterId
0,HDFS,1549,Receiving block blk_5614249702379360530 src: /...,Receiving block <*> src: <*> dest: <*>,0
1,HDFS,2847,BLOCK* NameSystem.addStoredBlock: blockMap upd...,BLOCK* NameSystem.addStoredBlock: blockMap upd...,1
2,HDFS,3092,BLOCK* NameSystem.allocateBlock: /user/root/ra...,BLOCK* NameSystem.allocateBlock: <*>,2
3,HDFS,4819,Received block blk_-6232712486646639079 of siz...,Received block <*> of size <*> from <*>,3
4,HDFS,9108,PacketResponder 1 for block blk_-6877771159587...,PacketResponder <*> for block <*> terminating,4
...,...,...,...,...,...
5543,Mac,110150,UNINSTALLED:com.skype.skype.ShareExtension com...,UNINSTALLED:com.skype.skype.ShareExtension <*>...,536
5544,Mac,110239,Uncommited CATransaction. Set CA_DEBUG_TRANSAC...,Uncommited CATransaction. Set <*> in environme...,537
5545,Mac,63596,objc[35448]: Class TSUDurationLocaleSpecificSt...,<*> Class TSUDurationLocaleSpecificStorage is ...,538
5546,Mac,64616,[00:29:25.872] HTTPRequest figHttpRequestDidFa...,<*> HTTPRequest figHttpRequestDidFailCallback:...,539


# Tokenizer

In [33]:
from nlp import PrometeusTokenizer
tokenizer=PrometeusTokenizer()

In [34]:
example_df['Tokens']=example_df['Example'].apply(tokenizer.tokenize)
example_df

Unnamed: 0,Dataset,Line,Example,Template,ClusterId,Tokens
0,HDFS,1549,Receiving block blk_5614249702379360530 src: /...,Receiving block <*> src: <*> dest: <*>,0,"[Receiving, block, blk_5614249702379360530, sr..."
1,HDFS,2847,BLOCK* NameSystem.addStoredBlock: blockMap upd...,BLOCK* NameSystem.addStoredBlock: blockMap upd...,1,"[BLOCK, *, NameSystem.addStoredBlock, :, block..."
2,HDFS,3092,BLOCK* NameSystem.allocateBlock: /user/root/ra...,BLOCK* NameSystem.allocateBlock: <*>,2,"[BLOCK, *, NameSystem.allocateBlock, :, /user/..."
3,HDFS,4819,Received block blk_-6232712486646639079 of siz...,Received block <*> of size <*> from <*>,3,"[Received, block, blk_-6232712486646639079, of..."
4,HDFS,9108,PacketResponder 1 for block blk_-6877771159587...,PacketResponder <*> for block <*> terminating,4,"[PacketResponder, 1, for, block, blk_-68777711..."
...,...,...,...,...,...,...
5543,Mac,110150,UNINSTALLED:com.skype.skype.ShareExtension com...,UNINSTALLED:com.skype.skype.ShareExtension <*>...,536,"[UNINSTALLED, :, com.skype.skype.ShareExtensio..."
5544,Mac,110239,Uncommited CATransaction. Set CA_DEBUG_TRANSAC...,Uncommited CATransaction. Set <*> in environme...,537,"[Uncommited, CATransaction, ., Set, CA_DEBUG_T..."
5545,Mac,63596,objc[35448]: Class TSUDurationLocaleSpecificSt...,<*> Class TSUDurationLocaleSpecificStorage is ...,538,"[objc, [, 35448, ], :, Class, TSUDurationLocal..."
5546,Mac,64616,[00:29:25.872] HTTPRequest figHttpRequestDidFa...,<*> HTTPRequest figHttpRequestDidFailCallback:...,539,"[[, 00:29:25.872, ], HTTPRequest, figHttpReque..."


# Unify Punctuation Tokens

In [35]:
def unify_punct_tokens(tokens:list[str])->list[str]:
    for i, token in enumerate(tokens):
        if len(token)==1:
            if token in ['(', '[', '{']:
                tokens[i]='('
            elif token in [')', ']', '}']:
                tokens[i]=')'
            elif token in ['.',',',';',':','!','?']:
                # esp. for PTB to generalize punctuation
                tokens[i]='.'
        # TODO: More of these?
    return tokens

example_df['Tokens']=example_df['Tokens'].apply(unify_punct_tokens)

# PoS-Tagging

In [None]:
# This cell installes all taggers (if not already installed)
# So the first run may take a while.

from nlp.pos import AbstractPosTagger, NLTKPosTagger, SpacyPosTagger, StanzaPosTagger, HanTaPosTagger, TreeTaggerPosTagger
nltk_tagger=NLTKPosTagger()
stanza_tagger=StanzaPosTagger()
spacy_tagger=SpacyPosTagger()
hanta_tagger=HanTaPosTagger()
treetagger_tagger=TreeTaggerPosTagger()

tagger:dict[str,AbstractPosTagger]={}

tagger['nltk']=nltk_tagger.pos_tag
tagger['stanza']=stanza_tagger.pos_tag
tagger['spacy']=spacy_tagger.pos_tag
tagger['hanta']=hanta_tagger.pos_tag
tagger['treetagger']=treetagger_tagger.pos_tag


In [37]:
import logging
# deactivate logging for treetagger (since there was a pipe error otherwise)
logging.getLogger().setLevel(logging.WARNING)

i=50
for tagger_name, tagger_func in tagger.items():
    print(f'{tagger_name} tags line {i}: {tagger_func(example_df["Tokens"][i])}')

nltk tags line 50: ['NOUN', 'PUNCT', 'NOUN', 'VERB', 'VERB', 'PUNCT', 'CCONJ', 'VERB', 'ADP', 'DET', 'NOUN', 'ADP', 'VERB']
stanza tags line 50: ['NOUN', 'PUNCT', 'PROPN', 'AUX', 'VERB', 'PUNCT', 'CCONJ', 'AUX', 'ADP', 'DET', 'NOUN', 'ADP', 'VERB']
spacy tags line 50: ['PROPN', 'PUNCT', 'PROPN', 'AUX', 'VERB', 'PUNCT', 'CCONJ', 'AUX', 'ADP', 'DET', 'NOUN', 'ADP', 'VERB']
hanta tags line 50: ['NOUN', 'PUNCT', 'NOUN', 'AUX', 'VERB', 'PUNCT', 'CCONJ', 'AUX', 'ADP', 'DET', 'NOUN', 'ADP', 'VERB']
treetagger tags line 50: ['NOUN', 'PUNCT', 'NOUN', 'VERB', 'VERB', 'PUNCT', 'CCONJ', 'VERB', 'ADP', 'DET', 'NOUN', 'ADP', 'VERB']


# Tag with all POS-Taggers

In [None]:
# add a new column for each tagger
import time

for name, tagger_func in tagger.items():
    print(f"Tagging with {name}")
    start_time = time.time()

    example_df[name]=example_df['Tokens'].apply(tagger_func)

    print(f"- Time taken: {time.time() - start_time:.2f} seconds")
example_df

In [None]:
# write to file
example_df.to_csv(OUTPUT_FILE, index=False)

### Add a breakpoint here to not always run all taggers for further development

In [38]:
example_df=pd.read_csv(OUTPUT_FILE, converters={'nltk': eval, 'stanza': eval, 'spacy': eval, 'hanta': eval, 'treetagger': eval, 'Tokens': eval})
example_df

Unnamed: 0,Dataset,Line,Example,Template,ClusterId,Tokens,nltk,stanza,spacy,hanta,treetagger,TagComparison,Majority
0,HDFS,1549,Receiving block blk_5614249702379360530 src: /...,Receiving block <*> src: <*> dest: <*>,0,"[Receiving, block, blk_5614249702379360530, sr...","[VERB, NOUN, NOUN, NOUN, PUNCT, ADJ, NOUN, PUN...","[VERB, NOUN, NOUN, NOUN, PUNCT, NUM, NOUN, PUN...","[VERB, NOUN, NUM, NOUN, PUNCT, PUNCT, INTJ, PU...","[VERB, NOUN, NOUN, NOUN, PUNCT, NUM, NOUN, PUN...","[VERB, NOUN, NOUN, NOUN, PUNCT, NUM, NOUN, PUN...","TagComparison(majority=['VERB', 'NOUN', 'NOUN'...","['VERB', 'NOUN', 'NOUN', 'NOUN', 'PUNCT', 'NUM..."
1,HDFS,2847,BLOCK* NameSystem.addStoredBlock: blockMap upd...,BLOCK* NameSystem.addStoredBlock: blockMap upd...,1,"[BLOCK, *, NameSystem.addStoredBlock, ., block...","[PROPN, PROPN, PROPN, PUNCT, NOUN, VERB, PUNCT...","[NOUN, PUNCT, PROPN, PUNCT, NOUN, VERB, PUNCT,...","[PROPN, PUNCT, PROPN, PUNCT, AUX, VERB, PUNCT,...","[NOUN, NUM, NOUN, PUNCT, NOUN, VERB, PUNCT, NO...","[NOUN, SYM, PROPN, PUNCT, NOUN, VERB, PUNCT, N...","TagComparison(majority=['PROPN', 'PUNCT', 'PRO...","['PROPN', 'PUNCT', 'PROPN', 'PUNCT', 'NOUN', '..."
2,HDFS,3092,BLOCK* NameSystem.allocateBlock: /user/root/ra...,BLOCK* NameSystem.allocateBlock: <*>,2,"[BLOCK, *, NameSystem.allocateBlock, ., /user/...","[PROPN, PROPN, PROPN, PUNCT, NOUN, PUNCT, NOUN]","[NOUN, PUNCT, PROPN, PUNCT, PROPN, PUNCT, PROPN]","[PROPN, PUNCT, PROPN, PUNCT, INTJ, PUNCT, NUM]","[NOUN, NUM, NOUN, PUNCT, NOUN, PUNCT, NOUN]","[NOUN, SYM, PROPN, PUNCT, NOUN, PUNCT, NOUN]","TagComparison(majority=['PROPN', 'PUNCT', 'PRO...","['PROPN', 'PUNCT', 'PROPN', 'PUNCT', 'NOUN', '..."
3,HDFS,4819,Received block blk_-6232712486646639079 of siz...,Received block <*> of size <*> from <*>,3,"[Received, block, blk_-6232712486646639079, of...","[VERB, ADJ, NOUN, ADP, NOUN, NUM, ADP, NOUN]","[VERB, NOUN, PROPN, ADP, NOUN, NUM, ADP, NUM]","[VERB, NOUN, NOUN, ADP, NOUN, NUM, ADP, PUNCT]","[VERB, NOUN, NOUN, ADP, NOUN, NUM, ADP, NUM]","[VERB, NOUN, NOUN, ADP, NOUN, NUM, ADP, NUM]","TagComparison(majority=['VERB', 'NOUN', 'PROPN...","['VERB', 'NOUN', 'PROPN', 'ADP', 'NOUN', 'NUM'..."
4,HDFS,9108,PacketResponder 1 for block blk_-6877771159587...,PacketResponder <*> for block <*> terminating,4,"[PacketResponder, 1, for, block, blk_-68777711...","[NOUN, NUM, ADP, NOUN, NOUN, NOUN]","[PROPN, NUM, ADP, NOUN, PROPN, NOUN]","[NOUN, NUM, ADP, NOUN, NOUN, VERB]","[ADJ, NUM, ADP, NOUN, NOUN, VERB]","[PROPN, NUM, ADP, NOUN, NOUN, VERB]","TagComparison(majority=[None, 'NUM', 'ADP', 'N...","[None, 'NUM', 'ADP', 'NOUN', 'PROPN', None]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5543,Mac,110150,UNINSTALLED:com.skype.skype.ShareExtension com...,UNINSTALLED:com.skype.skype.ShareExtension <*>...,536,"[UNINSTALLED, ., com.skype.skype.ShareExtensio...","[PROPN, PUNCT, NOUN, NOUN, PUNCT, NUM, PUNCT, ...","[VERB, PUNCT, PROPN, PROPN, PUNCT, NUM, PUNCT,...","[VERB, PUNCT, PROPN, PROPN, PUNCT, NUM, PUNCT,...","[VERB, PUNCT, NOUN, NOUN, PUNCT, NUM, PUNCT, N...","[ADJ, PUNCT, NOUN, NOUN, PUNCT, NUM, PUNCT, AD...","TagComparison(majority=['VERB', 'PUNCT', 'PROP...","[None, 'PUNCT', 'PROPN', 'PROPN', 'PUNCT', 'NU..."
5544,Mac,110239,Uncommited CATransaction. Set CA_DEBUG_TRANSAC...,Uncommited CATransaction. Set <*> in environme...,537,"[Uncommited, CATransaction, ., Set, CA_DEBUG_T...","[VERB, PROPN, PUNCT, PROPN, PROPN, ADP, NOUN, ...","[ADJ, NOUN, PUNCT, VERB, PROPN, ADP, NOUN, PAR...","[PROPN, PROPN, PUNCT, VERB, PROPN, ADP, NOUN, ...","[ADJ, NOUN, PUNCT, NOUN, NOUN, ADP, NOUN, ADP,...","[PROPN, PROPN, PUNCT, VERB, ADJ, ADP, NOUN, PA...","TagComparison(majority=[None, 'PROPN', 'PUNCT'...","[None, 'PROPN', 'PUNCT', None, 'PROPN', 'ADP',..."
5545,Mac,63596,objc[35448]: Class TSUDurationLocaleSpecificSt...,<*> Class TSUDurationLocaleSpecificStorage is ...,538,"[objc, (, 35448, ), ., Class, TSUDurationLocal...","[NOUN, PUNCT, NUM, PUNCT, PUNCT, NOUN, NOUN, V...","[PROPN, PUNCT, NUM, PUNCT, PUNCT, PROPN, PROPN...","[PROPN, PUNCT, NUM, PUNCT, PUNCT, NOUN, NOUN, ...","[PROPN, PUNCT, NUM, PUNCT, PUNCT, NOUN, NOUN, ...","[PROPN, PUNCT, NUM, PUNCT, PUNCT, PROPN, PROPN...","TagComparison(majority=['PROPN', 'PUNCT', 'NUM...","['PROPN', 'PUNCT', 'NUM', 'PUNCT', 'PUNCT', 'P..."
5546,Mac,64616,[00:29:25.872] HTTPRequest figHttpRequestDidFa...,<*> HTTPRequest figHttpRequestDidFailCallback:...,539,"[(, 00:29:25.872, ), HTTPRequest, figHttpReque...","[PUNCT, NUM, PUNCT, PROPN, NOUN, PUNCT, PROPN,...","[PUNCT, NUM, PUNCT, NOUN, PROPN, PUNCT, NOUN, ...","[PUNCT, PROPN, PUNCT, PROPN, NOUN, PUNCT, NOUN...","[PUNCT, NUM, PUNCT, ADJ, NOUN, PUNCT, NOUN, NO...","[PUNCT, NUM, PUNCT, PROPN, NOUN, PUNCT, NOUN, ...","TagComparison(majority=['PUNCT', 'NUM', 'PUNCT...","['PUNCT', 'NUM', 'PUNCT', 'PROPN', 'PROPN', 'P..."


# Compare PoS Tags

For each row in df one column with the tags (must be of equal length, otherwise sort out):

```
nltk:   [VERB, NOUN, NOUN]
stanza: [VERB, JJ, AJD]
hanta:  [VERB, NOUN, VERB]
```

For each row in df calculate the majority, confidence (of majority) and minority.
Based on the length of the row (idices of the words in the list).

If there is no majority, then majority=None and confidence=0 (called parity).

Note that majority does not mean 'absolute majority', therefore you have to take confidence into account.

```
majority:   [VERB, NOUN,         None],                                  #list[str]
confidence: [1.,   0.66,         0]                                      #list[float]
minority:   [{},   {stanza: JJ}, {nltk: NOUN, stanza: ADJ, hanta: VERB}] #list[dict[str,str]]
```

Hold as list with same index as df row.

```
[
    { # #0 corresponds to the first row in the df
        majority:   [VERB, NOUN,         None],
        confidence: [1.,   0.66,         0],
        minority:   [{},   {stanza: JJ}, {nltk: NOUN, stanza: ADJ, hanta: VERB}]
    },
    { # #1 corresponds to the second row in the df
        majority:   [...],
        confidence: [...],
        minority:   [...]
    },
    ...
]
```

Define dataclass:

```
@dataclass
class TagComparison:
    majority:   list[str|None]
    confidence: list[float]
    minority:   list[dict[str, str]]
```

In [39]:
from nlp.pos import TagComparison, make_tag_comparison, print_stats_on_tag_comparison
tag_comparisons:list[TagComparison]=[]
# tag_comparisons=[make_tag_comparison(tc) for tc in example_df[list(tagger.keys())[:-1]].to_dict(orient='records')]
tag_comparisons=[make_tag_comparison(tc) for tc in example_df[list(tagger.keys())].to_dict(orient='records')]
print_stats_on_tag_comparison(tag_comparisons)


For whole log lines (5548 in total):
Majority found: 3758 times (67.74%)
- Clear majority: 446 times (8.04%)
- Eighty percent: 976 times (17.59%)
- Absolute majority: 2820 times (50.83%)
Parity (None in majority): 1790 times (32.26%)

For words (64542 in total):
Majority found: 61834 times (95.80%)
- Clear majority: 44613 times (69.12%)
- Eighty percent: 50017 times (77.50%)
- Absolute majority: 58335 times (90.38%)
Parity (None in words): 2708 times (4.20%)


In [40]:
from nlp.pos import KnownWordsDetector, WordKind, RegexTokenClassMatcher, TokenClass

known_words_det=KnownWordsDetector()

def override_number_words(tokens:list[str], tag_comparison:TagComparison)->TagComparison:
    for i,token in enumerate(tokens):
        word_kind=known_words_det.kind_of_known_word(token)
        if word_kind==WordKind.NUMBER:
            tag_comparison.majority[i]='NUM'
            tag_comparison.confidence[i]=1.0
            tag_comparison.minority[i]={}
    return tag_comparison

examples_splitted=example_df['Tokens'].to_list()
tag_comparisons=[override_number_words(tokens, tc) for tokens,tc in zip(examples_splitted, tag_comparisons)]
print_stats_on_tag_comparison(tag_comparisons)

For whole log lines (5548 in total):
Majority found: 3760 times (67.77%)
- Clear majority: 469 times (8.45%)
- Eighty percent: 989 times (17.83%)
- Absolute majority: 2821 times (50.85%)
Parity (None in majority): 1788 times (32.23%)

For words (64542 in total):
Majority found: 61838 times (95.81%)
- Clear majority: 45056 times (69.81%)
- Eighty percent: 50134 times (77.68%)
- Absolute majority: 58342 times (90.39%)
Parity (None in words): 2704 times (4.19%)


In [41]:

def override_symbols(tokens:list[str], tag_comparison:TagComparison)->TagComparison:
    for i,token in enumerate(tokens):
        # TODO: Upos only so far
        if token in ['#','$']:
            tag_comparison.majority[i]='SYM'
            tag_comparison.confidence[i]=1.0
            tag_comparison.minority[i]={}
    return tag_comparison

examples_splitted=example_df['Tokens'].to_list()
tag_comparisons=[override_number_words(tokens, tc) for tokens,tc in zip(examples_splitted, tag_comparisons)]
print_stats_on_tag_comparison(tag_comparisons)

For whole log lines (5548 in total):
Majority found: 3760 times (67.77%)
- Clear majority: 469 times (8.45%)
- Eighty percent: 989 times (17.83%)
- Absolute majority: 2821 times (50.85%)
Parity (None in majority): 1788 times (32.23%)

For words (64542 in total):
Majority found: 61838 times (95.81%)
- Clear majority: 45056 times (69.81%)
- Eighty percent: 50134 times (77.68%)
- Absolute majority: 58342 times (90.39%)
Parity (None in words): 2704 times (4.19%)


In [None]:
matcher=RegexTokenClassMatcher()
known_words_det=KnownWordsDetector()


def _set_tag(tag_comparison:TagComparison, word_i:int, new_tag:str):
    tag_comparison.majority[word_i]=new_tag
    tag_comparison.confidence[word_i]=1.0
    tag_comparison.minority[word_i]={}

def override_punctuation_and_stuff(tokens:list[str], tag_comparison:TagComparison)->TagComparison:

    for i,token in enumerate(tokens):

        token_class=matcher.token_class(token)

        if token_class==TokenClass.PUNCTUATION:
            _set_tag(tag_comparison, i, 'PUNCT')
            continue
        if token_class==TokenClass.NUMBER or token_class==TokenClass.IDENTIFIER or token_class==TokenClass.DATE_TIME:
            _set_tag(tag_comparison, i, 'NUM')
            continue

        # TODO
        # Make sure to only mask tokens as PROMETEUS if they are unknown
        word_kind=known_words_det.kind_of_known_word(token)
        if word_kind!=WordKind.UNKNOWN:
            continue

        
        #
        #TODO
        continue
        if tag_comparison.confidence[i]<=.5:

            if token_class==TokenClass.KEY_VALUE_PAIR:
                _set_tag(tag_comparison, i, 'PROMETEUS_KV')
                continue
            if token_class==TokenClass.LOCATION:
                _set_tag(tag_comparison, i, 'PROMETEUS_LOC')
                continue

            if token_class==TokenClass.VARIABLE:
                _set_tag(tag_comparison, i, 'PROMETEUS_VAR')
                continue
            
            if token_class==TokenClass.SPECIAL_CHAR:
                _set_tag(tag_comparison, i, 'PROMETEUS_CHAR')
                continue
            
            continue
            #MISC = 'Misc'
            #UNKNOWN = 'Unknown'

            # TODO
            if token_class!=TokenClass.UNKNOWN:
            #if token_class==TokenClass.UNKNOWN:
                # only set PROMETEUS, if its not replaced already (esp. PUNCT and NUM)
                if len(tag_comparison.minority[i])>0:
                    #print(f"Token '{token}': {tag_comparison.majority[i]}, {tag_comparison.minority[i]}'")

                    #tag_comparison.majority[i]='PROPN'
                    tag_comparison.majority[i]='PROMETEUS_PROPN'
                    tag_comparison.confidence[i]=1.0
                    tag_comparison.minority[i]={}
    return tag_comparison

examples_splitted=example_df['Tokens'].to_list()
tag_comparisons=[override_punctuation_and_stuff(tokens, tc) for tokens,tc in zip(examples_splitted, tag_comparisons)]
print_stats_on_tag_comparison(tag_comparisons)


For whole log lines (5548 in total):
Majority found: 3887 times (70.06%)
- Clear majority: 563 times (10.15%)
- Eighty percent: 1166 times (21.02%)
- Absolute majority: 3123 times (56.29%)
Parity (None in majority): 1661 times (29.94%)

For words (64542 in total):
Majority found: 62164 times (96.32%)
- Clear majority: 48892 times (75.75%)
- Eighty percent: 53140 times (82.33%)
- Absolute majority: 60015 times (92.99%)
Parity (None in words): 2378 times (3.68%)


In [43]:
from nlp.pos import KnownWordsDetector, WordKind
kwdet=KnownWordsDetector()

majorities=[]
for j,tc in enumerate(tag_comparisons):
    m=[]
    for word_i in range(len(tc.majority)):
        m_i=None
        if tc.confidence[word_i]>=.8:
            m_i=tc.majority[word_i]
        elif tc.confidence[word_i]>.5:
            if kwdet.kind_of_known_word(examples_splitted[j][word_i])==WordKind.UNKNOWN:
                m_i=tc.majority[word_i]
        m.append(m_i)
    majorities.append(m)        

In [44]:
example_df['Majority']=majorities

In [45]:
nones_sum=0
nones_in_lines=0
token_count=0
full_tagged_indices=[]
ragged_tagged_indices=[]
for majority in majorities:
    nones=majority.count(None)
    token_count+=len(majority)
    nones_sum+=nones
    if nones>0:
        nones_in_lines+=1
        ragged_tagged_indices.append(majorities.index(majority))
    else:
        full_tagged_indices.append(majorities.index(majority))

print(f"None count: {nones_sum} of {token_count} tokens ({nones_sum/token_count:.2%})")
print(f"Lines with None: {nones_in_lines} of {len(majorities)} ({nones_in_lines/len(majorities):.2%})")
print(f"Lines with full tagging: {len(full_tagged_indices)} of {len(majorities)} ({len(full_tagged_indices)/len(majorities):.2%})")

print(full_tagged_indices)

None count: 6941 of 64542 tokens (10.75%)
Lines with None: 3369 of 5548 (60.72%)
Lines with full tagging: 2179 of 5548 (39.28%)
[0, 2, 3, 7, 13, 18, 21, 30, 31, 34, 42, 43, 42, 49, 50, 52, 57, 52, 65, 68, 80, 81, 82, 83, 86, 87, 88, 92, 94, 98, 100, 101, 102, 103, 104, 106, 111, 113, 101, 122, 124, 125, 126, 127, 128, 130, 131, 132, 133, 135, 136, 139, 140, 141, 149, 153, 154, 159, 162, 163, 111, 167, 168, 169, 140, 174, 176, 178, 179, 180, 184, 185, 186, 189, 101, 199, 200, 203, 208, 211, 221, 222, 223, 224, 128, 106, 231, 232, 233, 236, 237, 241, 249, 250, 249, 249, 249, 256, 258, 249, 249, 261, 263, 266, 267, 268, 270, 272, 88, 277, 279, 280, 241, 284, 31, 291, 291, 299, 300, 301, 303, 249, 299, 267, 308, 313, 31, 317, 319, 321, 324, 326, 327, 330, 313, 333, 336, 339, 343, 346, 347, 350, 21, 355, 357, 358, 333, 362, 367, 369, 333, 31, 31, 380, 386, 390, 394, 308, 399, 400, 404, 405, 406, 409, 410, 411, 413, 415, 417, 400, 413, 420, 424, 400, 427, 436, 437, 439, 413, 424, 442, 424, 4

In [46]:
example_df['TagComparison']=tag_comparisons
example_df.to_csv(OUTPUT_FILE, index=False)
example_df

Unnamed: 0,Dataset,Line,Example,Template,ClusterId,Tokens,nltk,stanza,spacy,hanta,treetagger,TagComparison,Majority
0,HDFS,1549,Receiving block blk_5614249702379360530 src: /...,Receiving block <*> src: <*> dest: <*>,0,"[Receiving, block, blk_5614249702379360530, sr...","[VERB, NOUN, NOUN, NOUN, PUNCT, ADJ, NOUN, PUN...","[VERB, NOUN, NOUN, NOUN, PUNCT, NUM, NOUN, PUN...","[VERB, NOUN, NUM, NOUN, PUNCT, PUNCT, INTJ, PU...","[VERB, NOUN, NOUN, NOUN, PUNCT, NUM, NOUN, PUN...","[VERB, NOUN, NOUN, NOUN, PUNCT, NUM, NOUN, PUN...","TagComparison(majority=['VERB', 'NOUN', 'NOUN'...","[VERB, NOUN, NOUN, NOUN, PUNCT, NUM, NOUN, PUN..."
1,HDFS,2847,BLOCK* NameSystem.addStoredBlock: blockMap upd...,BLOCK* NameSystem.addStoredBlock: blockMap upd...,1,"[BLOCK, *, NameSystem.addStoredBlock, ., block...","[PROPN, PROPN, PROPN, PUNCT, NOUN, VERB, PUNCT...","[NOUN, PUNCT, PROPN, PUNCT, NOUN, VERB, PUNCT,...","[PROPN, PUNCT, PROPN, PUNCT, AUX, VERB, PUNCT,...","[NOUN, NUM, NOUN, PUNCT, NOUN, VERB, PUNCT, NO...","[NOUN, SYM, PROPN, PUNCT, NOUN, VERB, PUNCT, N...","TagComparison(majority=['PROPN', 'PUNCT', 'PRO...","[PROPN, PUNCT, PROPN, PUNCT, NOUN, VERB, PUNCT..."
2,HDFS,3092,BLOCK* NameSystem.allocateBlock: /user/root/ra...,BLOCK* NameSystem.allocateBlock: <*>,2,"[BLOCK, *, NameSystem.allocateBlock, ., /user/...","[PROPN, PROPN, PROPN, PUNCT, NOUN, PUNCT, NOUN]","[NOUN, PUNCT, PROPN, PUNCT, PROPN, PUNCT, PROPN]","[PROPN, PUNCT, PROPN, PUNCT, INTJ, PUNCT, NUM]","[NOUN, NUM, NOUN, PUNCT, NOUN, PUNCT, NOUN]","[NOUN, SYM, PROPN, PUNCT, NOUN, PUNCT, NOUN]","TagComparison(majority=['PROPN', 'PUNCT', 'PRO...","[PROPN, PUNCT, PROPN, PUNCT, NOUN, PUNCT, NOUN]"
3,HDFS,4819,Received block blk_-6232712486646639079 of siz...,Received block <*> of size <*> from <*>,3,"[Received, block, blk_-6232712486646639079, of...","[VERB, ADJ, NOUN, ADP, NOUN, NUM, ADP, NOUN]","[VERB, NOUN, PROPN, ADP, NOUN, NUM, ADP, NUM]","[VERB, NOUN, NOUN, ADP, NOUN, NUM, ADP, PUNCT]","[VERB, NOUN, NOUN, ADP, NOUN, NUM, ADP, NUM]","[VERB, NOUN, NOUN, ADP, NOUN, NUM, ADP, NUM]","TagComparison(majority=['VERB', 'NOUN', 'PROPN...","[VERB, NOUN, PROPN, ADP, NOUN, NUM, ADP, NUM]"
4,HDFS,9108,PacketResponder 1 for block blk_-6877771159587...,PacketResponder <*> for block <*> terminating,4,"[PacketResponder, 1, for, block, blk_-68777711...","[NOUN, NUM, ADP, NOUN, NOUN, NOUN]","[PROPN, NUM, ADP, NOUN, PROPN, NOUN]","[NOUN, NUM, ADP, NOUN, NOUN, VERB]","[ADJ, NUM, ADP, NOUN, NOUN, VERB]","[PROPN, NUM, ADP, NOUN, NOUN, VERB]","TagComparison(majority=[None, 'NUM', 'ADP', 'N...","[None, NUM, ADP, NOUN, PROPN, None]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5543,Mac,110150,UNINSTALLED:com.skype.skype.ShareExtension com...,UNINSTALLED:com.skype.skype.ShareExtension <*>...,536,"[UNINSTALLED, ., com.skype.skype.ShareExtensio...","[PROPN, PUNCT, NOUN, NOUN, PUNCT, NUM, PUNCT, ...","[VERB, PUNCT, PROPN, PROPN, PUNCT, NUM, PUNCT,...","[VERB, PUNCT, PROPN, PROPN, PUNCT, NUM, PUNCT,...","[VERB, PUNCT, NOUN, NOUN, PUNCT, NUM, PUNCT, N...","[ADJ, PUNCT, NOUN, NOUN, PUNCT, NUM, PUNCT, AD...","TagComparison(majority=['VERB', 'PUNCT', 'PROP...","[None, PUNCT, PROPN, PROPN, PUNCT, NUM, PUNCT,..."
5544,Mac,110239,Uncommited CATransaction. Set CA_DEBUG_TRANSAC...,Uncommited CATransaction. Set <*> in environme...,537,"[Uncommited, CATransaction, ., Set, CA_DEBUG_T...","[VERB, PROPN, PUNCT, PROPN, PROPN, ADP, NOUN, ...","[ADJ, NOUN, PUNCT, VERB, PROPN, ADP, NOUN, PAR...","[PROPN, PROPN, PUNCT, VERB, PROPN, ADP, NOUN, ...","[ADJ, NOUN, PUNCT, NOUN, NOUN, ADP, NOUN, ADP,...","[PROPN, PROPN, PUNCT, VERB, ADJ, ADP, NOUN, PA...","TagComparison(majority=[None, 'PROPN', 'PUNCT'...","[None, PROPN, PUNCT, None, PROPN, ADP, NOUN, P..."
5545,Mac,63596,objc[35448]: Class TSUDurationLocaleSpecificSt...,<*> Class TSUDurationLocaleSpecificStorage is ...,538,"[objc, (, 35448, ), ., Class, TSUDurationLocal...","[NOUN, PUNCT, NUM, PUNCT, PUNCT, NOUN, NOUN, V...","[PROPN, PUNCT, NUM, PUNCT, PUNCT, PROPN, PROPN...","[PROPN, PUNCT, NUM, PUNCT, PUNCT, NOUN, NOUN, ...","[PROPN, PUNCT, NUM, PUNCT, PUNCT, NOUN, NOUN, ...","[PROPN, PUNCT, NUM, PUNCT, PUNCT, PROPN, PROPN...","TagComparison(majority=['PROPN', 'PUNCT', 'NUM...","[PROPN, PUNCT, NUM, PUNCT, PUNCT, PROPN, PROPN..."
5546,Mac,64616,[00:29:25.872] HTTPRequest figHttpRequestDidFa...,<*> HTTPRequest figHttpRequestDidFailCallback:...,539,"[(, 00:29:25.872, ), HTTPRequest, figHttpReque...","[PUNCT, NUM, PUNCT, PROPN, NOUN, PUNCT, PROPN,...","[PUNCT, NUM, PUNCT, NOUN, PROPN, PUNCT, NOUN, ...","[PUNCT, PROPN, PUNCT, PROPN, NOUN, PUNCT, NOUN...","[PUNCT, NUM, PUNCT, ADJ, NOUN, PUNCT, NOUN, NO...","[PUNCT, NUM, PUNCT, PROPN, NOUN, PUNCT, NOUN, ...","TagComparison(majority=['PUNCT', 'NUM', 'PUNCT...","[PUNCT, NUM, PUNCT, PROPN, PROPN, PUNCT, PROPN..."


### Create Random Samples

In [49]:
import random
random.seed(RANDOM_SAMPLE_SEED)
print(full_tagged_indices)
random_sample_indices=random.sample(full_tagged_indices, RANDOM_SAMPLE_SIZE)
print(random_sample_indices)
random_sample_df=example_df.iloc[random_sample_indices]
random_sample_df.to_csv(OUTPUT_FILE_RANDOM_SAMPLE, index=False)
print(f'Saved random samples to: {OUTPUT_FILE_RANDOM_SAMPLE}')
random_sample_df


[0, 2, 3, 7, 13, 18, 21, 30, 31, 34, 42, 43, 42, 49, 50, 52, 57, 52, 65, 68, 80, 81, 82, 83, 86, 87, 88, 92, 94, 98, 100, 101, 102, 103, 104, 106, 111, 113, 101, 122, 124, 125, 126, 127, 128, 130, 131, 132, 133, 135, 136, 139, 140, 141, 149, 153, 154, 159, 162, 163, 111, 167, 168, 169, 140, 174, 176, 178, 179, 180, 184, 185, 186, 189, 101, 199, 200, 203, 208, 211, 221, 222, 223, 224, 128, 106, 231, 232, 233, 236, 237, 241, 249, 250, 249, 249, 249, 256, 258, 249, 249, 261, 263, 266, 267, 268, 270, 272, 88, 277, 279, 280, 241, 284, 31, 291, 291, 299, 300, 301, 303, 249, 299, 267, 308, 313, 31, 317, 319, 321, 324, 326, 327, 330, 313, 333, 336, 339, 343, 346, 347, 350, 21, 355, 357, 358, 333, 362, 367, 369, 333, 31, 31, 380, 386, 390, 394, 308, 399, 400, 404, 405, 406, 409, 410, 411, 413, 415, 417, 400, 413, 420, 424, 400, 427, 436, 437, 439, 413, 424, 442, 424, 413, 413, 447, 447, 449, 450, 452, 413, 456, 458, 464, 466, 464, 469, 464, 481, 489, 491, 493, 496, 499, 502, 505, 507, 514, 515,

Unnamed: 0,Dataset,Line,Example,Template,ClusterId,Tokens,nltk,stanza,spacy,hanta,treetagger,TagComparison,Majority
915,Thunderbird,28439275,[INFO]: Generate SM IN_SERVICE trap for GID=0x...,[INFO]: Generate SM IN_SERVICE trap for <*>,233,"[(, INFO, ), ., Generate, SM, IN_SERVICE, trap...","[PUNCT, PROPN, PUNCT, PUNCT, PROPN, PROPN, PRO...","[PUNCT, NOUN, PUNCT, PUNCT, VERB, PROPN, PROPN...","[PUNCT, PROPN, PUNCT, PUNCT, VERB, PROPN, PROP...","[PUNCT, PROPN, PUNCT, PUNCT, VERB, PROPN, PROP...","[PUNCT, NOUN, PUNCT, PUNCT, VERB, PROPN, PROPN...","TagComparison(majority=['PUNCT', 'PROPN', 'PUN...","[PUNCT, PROPN, PUNCT, PUNCT, VERB, PROPN, PROP..."
263,Spark,7109853,Received new token for : mesos-master-1:35426,Received new token for : <*>,27,"[Received, new, token, for, ., mesos-master-1:...","[VERB, ADJ, NOUN, ADP, PUNCT, NOUN]","[VERB, ADJ, NOUN, ADP, PUNCT, PROPN]","[VERB, ADJ, NOUN, ADP, PUNCT, X]","[VERB, ADJ, NOUN, ADP, PUNCT, NOUN]","[VERB, ADJ, NOUN, ADP, PUNCT, NOUN]","TagComparison(majority=['VERB', 'ADJ', 'NOUN',...","[VERB, ADJ, NOUN, ADP, PUNCT, PROPN]"
2117,Android,851,onStartedWakingUp(),onStartedWakingUp(),5,"[onStartedWakingUp, (, )]","[NOUN, PUNCT, PUNCT]","[PROPN, PUNCT, PUNCT]","[NOUN, PUNCT, PUNCT]","[PROPN, PUNCT, PUNCT]","[NOUN, PUNCT, PUNCT]","TagComparison(majority=['PROPN', 'PUNCT', 'PUN...","[PROPN, PUNCT, PUNCT]"
2585,Android,153035,2016-12-17 20:31:43.073 T:24286 INFO: IQiyi...,<*> INFO: IQiyiLiveDataProvider::StartModule: ...,473,"[2016-12-17, 20:31:43.073, T:24286, INFO, ., I...","[ADJ, NUM, PROPN, PROPN, PUNCT, PROPN, PUNCT, ...","[NUM, NUM, NUM, NOUN, PUNCT, PROPN, PUNCT, PRO...","[NUM, NUM, ADJ, PROPN, PUNCT, INTJ, PUNCT, X, ...","[NUM, NUM, PROPN, PROPN, PUNCT, PROPN, PUNCT, ...","[NUM, NUM, PROPN, NOUN, PUNCT, PROPN, PUNCT, N...","TagComparison(majority=['NUM', 'NUM', 'PROPN',...","[NUM, NUM, PROPN, PROPN, PUNCT, PROPN, PUNCT, ..."
2343,Android,60802,random_mix_pool - hexdump(len=5): [REMOVED],random_mix_pool - <*> [REMOVED],231,"[random_mix_pool, -, hexdump, (, len=5, ), ., ...","[NOUN, PUNCT, NOUN, PUNCT, NOUN, PUNCT, PUNCT,...","[NOUN, PUNCT, NOUN, PUNCT, NOUN, PUNCT, PUNCT,...","[NOUN, PUNCT, NOUN, PUNCT, NOUN, PUNCT, PUNCT,...","[NOUN, ADP, NOUN, PUNCT, NUM, PUNCT, PUNCT, PU...","[NOUN, NOUN, NOUN, PUNCT, NOUN, PUNCT, PUNCT, ...","TagComparison(majority=['NOUN', 'PUNCT', 'NOUN...","[NOUN, PUNCT, NOUN, PUNCT, NOUN, PUNCT, PUNCT,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
417,BGL,383488,data storage interrupt,data storage interrupt,23,"[data, storage, interrupt]","[NOUN, NOUN, NOUN]","[NOUN, NOUN, VERB]","[NOUN, NOUN, VERB]","[NOUN, NOUN, VERB]","[NOUN, NOUN, VERB]","TagComparison(majority=['NOUN', 'NOUN', 'VERB'...","[NOUN, NOUN, VERB]"
4610,OpenSSH,569,Failed password for root from 187.141.143.180 ...,Failed password for root from <*> port <*>,5,"[Failed, password, for, root, from, 187.141.14...","[VERB, NOUN, ADP, NOUN, ADP, NUM, NOUN, NUM, N...","[VERB, NOUN, ADP, NOUN, ADP, NUM, NOUN, NUM, N...","[VERB, NOUN, ADP, NOUN, ADP, NUM, NOUN, NUM, P...","[VERB, NOUN, ADP, NOUN, ADP, NOUN, NOUN, NUM, ...","[VERB, NOUN, ADP, NOUN, ADP, NUM, NOUN, NUM, N...","TagComparison(majority=['VERB', 'NOUN', 'ADP',...","[VERB, NOUN, ADP, NOUN, ADP, NUM, NOUN, NUM, N..."
638,HPC,68897,ServerFileSystem: ServerFileSystem domain stor...,ServerFileSystem: ServerFileSystem domain <*> ...,27,"[ServerFileSystem, ., ServerFileSystem, domain...","[PROPN, PUNCT, PROPN, NOUN, NOUN, VERB, ADJ]","[PROPN, PUNCT, PROPN, NOUN, NOUN, AUX, ADJ]","[ADV, PUNCT, PROPN, NOUN, PROPN, AUX, ADJ]","[INTJ, PUNCT, X, NOUN, NUM, AUX, ADJ]","[PROPN, PUNCT, PROPN, NOUN, NOUN, VERB, ADJ]","TagComparison(majority=['PROPN', 'PUNCT', 'PRO...","[PROPN, PUNCT, PROPN, NOUN, NOUN, AUX, ADJ]"
4146,Android,1369467,#04 pc 000170d8 /system/lib/libc.so (abort+4),<*> pc <*> /system/lib/libc.so <*>,2034,"[#, 04, pc, 000170d8, /system/lib/libc.so, (, ...","[SYM, NUM, NOUN, NUM, NOUN, PUNCT, NOUN, PUNCT]","[SYM, NUM, NOUN, NUM, NOUN, PUNCT, NOUN, PUNCT]","[SYM, NUM, NOUN, PROPN, PUNCT, PUNCT, NOUN, PU...","[NUM, NUM, NUM, NUM, NOUN, PUNCT, NUM, PUNCT]","[SYM, NUM, NOUN, NOUN, NOUN, PUNCT, NOUN, PUNCT]","TagComparison(majority=['PUNCT', 'NUM', 'NOUN'...","[PUNCT, NUM, NOUN, NUM, NOUN, PUNCT, NOUN, PUNCT]"
