In [6]:
# GENERIC FIRST CELL FOR DEVELOPING A NEW METHODOLOGY

# Define an expressive name for the methodology, that will be used to create the output directory
# The complete path to the output directory will be provided in OUT_DIR
METHODOLOGY_NAME='poslog/'

# Set to number of directories below project root if the notebook is in a subdirectory of the project, so you can use relative paths
SUBDIR_LEVEL = 1
if SUBDIR_LEVEL>0:
    import sys 
    new_path = '../'*SUBDIR_LEVEL
    if new_path not in sys.path:
        sys.path.append(new_path)

import os
OUT_DIR = os.path.relpath(os.path.join(os.getcwd(), '../'*SUBDIR_LEVEL, 'out', METHODOLOGY_NAME))
if not os.path.exists(OUT_DIR):
        os.makedirs(OUT_DIR)
print(f"Current output directory: '{OUT_DIR}'")

%load_ext autoreload
%autoreload 2

Current output directory: '../out/poslog'
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
POS_LOG_MODEL='pos_log_upos_crf_10k_model2'

# Precons
INPUT_FILE=os.path.join(OUT_DIR, '4_ground_truth_wip_upos.csv')

# if file does not exist, throw an error
if not os.path.exists(INPUT_FILE):
    precon='2-2_tag_comparison_correction.ipynb'
    raise FileNotFoundError(f"File '{INPUT_FILE}' not found. Run '{precon}' first.")

In [8]:
import pandas as pd
example_df=pd.read_csv(INPUT_FILE)
example_df

Unnamed: 0,Dataset,Line,Example,Template,ClusterId,Tokens,nltk,stanza,spacy,hanta,treetagger,TagComparison,Majority,ManualTagging
0,HDFS,1549,Receiving block blk_5614249702379360530 src: /...,Receiving block <*> src: <*> dest: <*>,0,"['Receiving', 'block', 'blk_561424970237936053...","['VERB', 'NOUN', 'NOUN', 'NOUN', 'PUNCT', 'ADJ...","['VERB', 'NOUN', 'NOUN', 'NOUN', 'PUNCT', 'NUM...","['VERB', 'NOUN', 'NUM', 'NOUN', 'PUNCT', 'PUNC...","['VERB', 'NOUN', 'NOUN', 'NOUN', 'PUNCT', 'NUM...","['VERB', 'NOUN', 'NOUN', 'NOUN', 'PUNCT', 'NUM...","TagComparison(majority=['VERB', 'NOUN', 'PROPN...","['VERB', 'NOUN', 'PROPN', 'NOUN', 'PUNCT', 'NU...","['VERB', 'NOUN', 'PROPN', 'NOUN', 'PUNCT', 'NU..."
1,HDFS,2847,BLOCK* NameSystem.addStoredBlock: blockMap upd...,BLOCK* NameSystem.addStoredBlock: blockMap upd...,1,"['BLOCK', '*', 'NameSystem.addStoredBlock', '....","['PROPN', 'PROPN', 'PROPN', 'PUNCT', 'NOUN', '...","['NOUN', 'PUNCT', 'PROPN', 'PUNCT', 'NOUN', 'V...","['PROPN', 'PUNCT', 'PROPN', 'PUNCT', 'AUX', 'V...","['NOUN', 'NUM', 'NOUN', 'PUNCT', 'NOUN', 'VERB...","['NOUN', 'SYM', 'PROPN', 'PUNCT', 'NOUN', 'VER...","TagComparison(majority=['NOUN', 'SYM', 'PROPN'...","['NOUN', 'SYM', 'PROPN', 'PUNCT', 'PROPN', 'VE...","['NOUN', 'SYM', 'PROPN', 'PUNCT', 'PROPN', 'VE..."
2,HDFS,3092,BLOCK* NameSystem.allocateBlock: /user/root/ra...,BLOCK* NameSystem.allocateBlock: <*>,2,"['BLOCK', '*', 'NameSystem.allocateBlock', '.'...","['PROPN', 'PROPN', 'PROPN', 'PUNCT', 'NOUN', '...","['NOUN', 'PUNCT', 'PROPN', 'PUNCT', 'PROPN', '...","['PROPN', 'PUNCT', 'PROPN', 'PUNCT', 'INTJ', '...","['NOUN', 'NUM', 'NOUN', 'PUNCT', 'NOUN', 'PUNC...","['NOUN', 'SYM', 'PROPN', 'PUNCT', 'NOUN', 'PUN...","TagComparison(majority=['NOUN', 'SYM', 'PROPN'...","['NOUN', 'SYM', 'PROPN', 'PUNCT', 'PROPN', 'PU...","['NOUN', 'SYM', 'PROPN', 'PUNCT', 'PROPN', 'PU..."
3,HDFS,4819,Received block blk_-6232712486646639079 of siz...,Received block <*> of size <*> from <*>,3,"['Received', 'block', 'blk_-623271248664663907...","['VERB', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'NUM', ...","['VERB', 'NOUN', 'PROPN', 'ADP', 'NOUN', 'NUM'...","['VERB', 'NOUN', 'NOUN', 'ADP', 'NOUN', 'NUM',...","['VERB', 'NOUN', 'NOUN', 'ADP', 'NOUN', 'NUM',...","['VERB', 'NOUN', 'NOUN', 'ADP', 'NOUN', 'NUM',...","TagComparison(majority=['VERB', 'NOUN', 'PROPN...","['VERB', 'NOUN', 'PROPN', 'ADP', 'NOUN', 'NUM'...","['VERB', 'NOUN', 'PROPN', 'ADP', 'NOUN', 'NUM'..."
4,HDFS,9108,PacketResponder 1 for block blk_-6877771159587...,PacketResponder <*> for block <*> terminating,4,"['PacketResponder', '1', 'for', 'block', 'blk_...","['NOUN', 'NUM', 'ADP', 'NOUN', 'NOUN', 'NOUN']","['PROPN', 'NUM', 'ADP', 'NOUN', 'PROPN', 'NOUN']","['NOUN', 'NUM', 'ADP', 'NOUN', 'NOUN', 'VERB']","['ADJ', 'NUM', 'ADP', 'NOUN', 'NOUN', 'VERB']","['PROPN', 'NUM', 'ADP', 'NOUN', 'NOUN', 'VERB']","TagComparison(majority=['PROPN', 'NUM', 'ADP',...","['PROPN', 'NUM', 'ADP', 'NOUN', 'PROPN', None]","['PROPN', 'NUM', 'ADP', 'NOUN', 'PROPN', 'VERB']"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5543,Mac,110150,UNINSTALLED:com.skype.skype.ShareExtension com...,UNINSTALLED:com.skype.skype.ShareExtension <*>...,536,"['UNINSTALLED', '.', 'com.skype.skype.ShareExt...","['PROPN', 'PUNCT', 'NOUN', 'NOUN', 'PUNCT', 'N...","['VERB', 'PUNCT', 'PROPN', 'PROPN', 'PUNCT', '...","['VERB', 'PUNCT', 'PROPN', 'PROPN', 'PUNCT', '...","['VERB', 'PUNCT', 'NOUN', 'NOUN', 'PUNCT', 'NU...","['ADJ', 'PUNCT', 'NOUN', 'NOUN', 'PUNCT', 'NUM...","TagComparison(majority=['VERB', 'PUNCT', 'PROP...","[None, 'PUNCT', 'PROPN', 'PROPN', 'PUNCT', 'NU...","[None, 'PUNCT', 'PROPN', 'PROPN', 'PUNCT', 'NU..."
5544,Mac,110239,Uncommited CATransaction. Set CA_DEBUG_TRANSAC...,Uncommited CATransaction. Set <*> in environme...,537,"['Uncommited', 'CATransaction', '.', 'Set', 'C...","['VERB', 'PROPN', 'PUNCT', 'PROPN', 'PROPN', '...","['ADJ', 'NOUN', 'PUNCT', 'VERB', 'PROPN', 'ADP...","['PROPN', 'PROPN', 'PUNCT', 'VERB', 'PROPN', '...","['ADJ', 'NOUN', 'PUNCT', 'NOUN', 'NOUN', 'ADP'...","['PROPN', 'PROPN', 'PUNCT', 'VERB', 'ADJ', 'AD...","TagComparison(majority=['PROPN', 'PROPN', 'PUN...","['PROPN', 'PROPN', 'PUNCT', None, 'X', 'ADP', ...","['PROPN', 'PROPN', 'PUNCT', None, 'X', 'ADP', ..."
5545,Mac,63596,objc[35448]: Class TSUDurationLocaleSpecificSt...,<*> Class TSUDurationLocaleSpecificStorage is ...,538,"['objc', '(', '35448', ')', '.', 'Class', 'TSU...","['NOUN', 'PUNCT', 'NUM', 'PUNCT', 'PUNCT', 'NO...","['PROPN', 'PUNCT', 'NUM', 'PUNCT', 'PUNCT', 'P...","['PROPN', 'PUNCT', 'NUM', 'PUNCT', 'PUNCT', 'N...","['PROPN', 'PUNCT', 'NUM', 'PUNCT', 'PUNCT', 'N...","['PROPN', 'PUNCT', 'NUM', 'PUNCT', 'PUNCT', 'P...","TagComparison(majority=['PROPN', 'PUNCT', 'NUM...","['PROPN', 'PUNCT', 'NUM', 'PUNCT', 'PUNCT', 'N...","['PROPN', 'PUNCT', 'NUM', 'PUNCT', 'PUNCT', 'N..."
5546,Mac,64616,[00:29:25.872] HTTPRequest figHttpRequestDidFa...,<*> HTTPRequest figHttpRequestDidFailCallback:...,539,"['(', '00:29:25.872', ')', 'HTTPRequest', 'fig...","['PUNCT', 'NUM', 'PUNCT', 'PROPN', 'NOUN', 'PU...","['PUNCT', 'NUM', 'PUNCT', 'NOUN', 'PROPN', 'PU...","['PUNCT', 'PROPN', 'PUNCT', 'PROPN', 'NOUN', '...","['PUNCT', 'NUM', 'PUNCT', 'ADJ', 'NOUN', 'PUNC...","['PUNCT', 'NUM', 'PUNCT', 'PROPN', 'NOUN', 'PU...","TagComparison(majority=['PUNCT', 'NUM', 'PUNCT...","['PUNCT', 'NUM', 'PUNCT', 'PROPN', 'PROPN', 'P...","['PUNCT', 'NUM', 'PUNCT', 'PROPN', 'PROPN', 'P..."


In [9]:
majorities=[eval(x) for x in example_df['Majority'].to_list()]
manual_tags=[eval(x) for x in example_df['ManualTagging'].to_list()]
examples_splitted=[eval(x) for x in example_df['Tokens'].to_list()]

In [10]:
majorities=manual_tags

In [11]:
nones_sum=0
nones_in_lines=0
token_count=0
full_tagged_indices=[]
ragged_tagged_indices=[]
for majority in majorities:
    nones=majority.count(None)
    token_count+=len(majority)
    nones_sum+=nones
    if nones>0:
        nones_in_lines+=1
        ragged_tagged_indices.append(majorities.index(majority))
    else:
        full_tagged_indices.append(majorities.index(majority))

print(f"None count: {nones_sum} of {token_count} tokens ({nones_sum/token_count:.2%})")
print(f"Lines with None: {nones_in_lines} of {len(majorities)} ({nones_in_lines/len(majorities):.2%})")
print(f"Lines with full tagging: {len(full_tagged_indices)} of {len(majorities)} ({len(full_tagged_indices)/len(majorities):.2%})")

print(full_tagged_indices)

None count: 2182 of 64542 tokens (3.38%)
Lines with None: 1548 of 5548 (27.90%)
Lines with full tagging: 4000 of 5548 (72.10%)
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 42, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 52, 61, 62, 63, 64, 65, 66, 52, 68, 69, 71, 72, 73, 74, 75, 76, 77, 38, 66, 80, 81, 82, 83, 84, 85, 86, 87, 47, 39, 90, 91, 92, 93, 94, 95, 31, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 101, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 77, 19, 136, 137, 139, 140, 141, 142, 143, 144, 145, 146, 147, 109, 146, 150, 151, 152, 153, 154, 155, 156, 41, 158, 159, 160, 161, 162, 163, 111, 165, 166, 167, 168, 169, 140, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 42, 187, 188, 189, 152, 191, 192, 193, 194, 195, 196, 197, 101, 199,

In [12]:
tagged_sents=[]
for word_i in full_tagged_indices:
    tokens=examples_splitted[word_i]
    tags=majorities[word_i]
    tagged_sents.append(list(zip(tokens,tags)))
print(len(tagged_sents), sum([len(x) for x in tagged_sents]))

4000 40205


# Train/Test Split

80/20 split

In [16]:
TRAIN_SPLIT=0.8

train_sents=[]
test_sents=[]

shuffeled_tagged_sents=tagged_sents.copy()
import random
random.seed(42)
random.shuffle(shuffeled_tagged_sents)

split_index = int(len(shuffeled_tagged_sents) * TRAIN_SPLIT)
train_sents = shuffeled_tagged_sents[:split_index]
test_sents = shuffeled_tagged_sents[split_index:]
print(f"Train size: {len(train_sents)} ({sum([len(x) for x in train_sents])})")
print(f"Test size: {len(test_sents)} ({sum([len(x) for x in test_sents])})")

Train size: 3200 (32267)
Test size: 800 (7938)


## Evaluate

In [17]:
def remove_none_values_from_ys(y, y_pred, skip_x=True):
    yas=[]
    yps=[]
    for i in range(len(y)):
        ya=[]
        yp=[]
        for j in range(len(y[i])):
            if y[i][j] is not None:
                if skip_x and y[i][j]=='X':
                    continue
                ya.append(y[i][j])
                yp.append(y_pred[i][j])
        yas.append(ya)
        yps.append(yp)
    return yas, yps

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

def print_accuracy(y, y_pred, report=False, skip_x=True)->float:
    yas, yps = remove_none_values_from_ys(y, y_pred, skip_x=skip_x)
    y_t=[tag for tags in yas for tag in tags]
    y_p=[tag for tags in yps for tag in tags]

    acc=accuracy_score(y_t, y_p)
    print(acc)
    if report:
        print(classification_report(y_t, y_p))
    return acc



In [20]:
from util.pos import AbstractPosTagger

import time
def tag_and_meas_time(tagger:AbstractPosTagger, X:list[list[str]])->tuple[list[list[str]], float]:
    time_start = time.time()
    y_pred=[tagger.pos_tag(x) for x in X]
    elapsed_time=time.time()-time_start
    return y_pred, elapsed_time

In [19]:
def tag_and_evaluate(tagger:AbstractPosTagger, X:list[list[str]], y:list[list[str]], last_pred:tuple[list[list[str]],float]|None=None, report=False, skip_x=True)->list[list[str]]:
    print(tagger.__class__.__name__)

    if last_pred is None:
        y_pred,elapsed_time=tag_and_meas_time(tagger, X)
    else:
        y_pred,elapsed_time=last_pred

    acc=print_accuracy(y, y_pred, report, skip_x)
    lines=len(X)
    tokens=sum([len(x) for x in X])
    print(f"Lines: {lines}")
    print(f"Tokens: {tokens}")
    print(f"Elapsed time: {elapsed_time}")
    print(f"Time per 1000 lines: {elapsed_time/lines*1000}")
    print(f"Time per 1,000,000 tokens: {elapsed_time/tokens*1000000}\n")
    return (y_pred, elapsed_time, acc)


In [None]:
from poslog import PosLogCRF
pos_log=PosLogCRF(POS_LOG_MODEL)
pos_log.train_from_tagged_sents(train_sents)

In [22]:
X_test=[[token for token, tag in sent] for sent in test_sents]
y_test=[[tag for token, tag in sent] for sent in test_sents]
y_pred, elapsed_time, acc=tag_and_evaluate(pos_log, X_test, y_test, report=True, skip_x=False)

PosLogCRF
0.9827412446460065
              precision    recall  f1-score   support

         ADJ       0.99      0.90      0.94       265
         ADP       0.99      0.98      0.99       301
         ADV       0.98      0.97      0.98        61
         AUX       0.99      0.99      0.99       113
       CCONJ       1.00      1.00      1.00        19
         DET       0.98      1.00      0.99        58
        INTJ       0.00      0.00      0.00         1
        NOUN       0.96      0.98      0.97      1477
         NUM       1.00      0.99      0.99       900
        PART       0.92      0.92      0.92        38
        PRON       1.00      0.92      0.96        12
       PROPN       0.98      0.98      0.98      1212
       PUNCT       1.00      1.00      1.00      2204
       SCONJ       0.93      1.00      0.96        13
         SYM       0.97      0.97      0.97       174
        VERB       0.96      0.95      0.95       514
           X       1.00      1.00      1.00       57

## Other taggers

In [None]:
from poslog import AbstractPosTagger
from util.pos import NLTKPosTagger, StanzaPosTagger, SpacyPosTagger, HanTaPosTagger, TreeTaggerPosTagger
nltk_tagger=NLTKPosTagger()
stanza_tagger=StanzaPosTagger()
spacy_tagger=SpacyPosTagger()
hanta_tagger=HanTaPosTagger()
treetagger_tagger=TreeTaggerPosTagger()

In [None]:
# Disabling logging for TreeTagger
# Maybe you have to restart the kernel
import logging
level_before=logging.getLogger().level
logging.getLogger().setLevel(logging.WARNING)

y_pred_treetagger=tag_and_evaluate(treetagger_tagger, X_test, y_test)

logging.getLogger().setLevel(level_before)

TreeTaggerPosTagger
0.7958435207823961
Lines: 800
Tokens: 7938
Elapsed time: 0.9398319721221924
Time per 1000 lines: 1.1747899651527405
Time per 1,000,000 tokens: 118.39656993224898



In [None]:
y_pred_nltk=tag_and_evaluate(nltk_tagger, X_test, y_test)
#y_pred_stanza=tag_and_evaluate(stanza_tagger, X_test, y_test)
y_pred_spacy=tag_and_evaluate(spacy_tagger, X_test, y_test)
y_pred_hanta=tag_and_evaluate(hanta_tagger, X_test, y_test)


NLTKPosTagger
0.7727519695734855
Lines: 800
Tokens: 7938
Elapsed time: 0.3036789894104004
Time per 1000 lines: 0.3795987367630005
Time per 1,000,000 tokens: 38.256360469942095

SpacyPosTagger
0.8088834555827221
Lines: 800
Tokens: 7938
Elapsed time: 3.2392539978027344
Time per 1000 lines: 4.049067497253418
Time per 1,000,000 tokens: 408.06928669724545

HanTaPosTagger
0.7874218962238522
Lines: 800
Tokens: 7938
Elapsed time: 3.994476079940796
Time per 1000 lines: 4.993095099925995
Time per 1,000,000 tokens: 503.2093827085911



In [None]:
y_pred_stanza=tag_and_evaluate(stanza_tagger, X_test, y_test)

StanzaPosTagger
0.9024721543058951
Lines: 800
Tokens: 7938
Elapsed time: 46.08687400817871
Time per 1000 lines: 57.60859251022339
Time per 1,000,000 tokens: 5805.854624361137



## Repeat tagging for better time evaluation

In [None]:
X_test = X_test * 10
y_test = y_test * 10

In [None]:
y_pred, elapsed_time, acc=tag_and_evaluate(pos_log, X_test, y_test, report=True, skip_x=False)

PosLogCRF
0.9827412446460065
              precision    recall  f1-score   support

         ADJ       0.99      0.90      0.94      2650
         ADP       0.99      0.98      0.99      3010
         ADV       0.98      0.97      0.98       610
         AUX       0.99      0.99      0.99      1130
       CCONJ       1.00      1.00      1.00       190
         DET       0.98      1.00      0.99       580
        INTJ       0.00      0.00      0.00        10
        NOUN       0.96      0.98      0.97     14770
         NUM       1.00      0.99      0.99      9000
        PART       0.92      0.92      0.92       380
        PRON       1.00      0.92      0.96       120
       PROPN       0.98      0.98      0.98     12120
       PUNCT       1.00      1.00      1.00     22040
       SCONJ       0.93      1.00      0.96       130
         SYM       0.97      0.97      0.97      1740
        VERB       0.96      0.95      0.95      5140
           X       1.00      1.00      1.00      576

In [None]:
# Disabling logging for TreeTagger
# Maybe you have to restart the kernel
import logging
level_before=logging.getLogger().level
logging.getLogger().setLevel(logging.WARNING)

y_pred_treetagger=tag_and_evaluate(treetagger_tagger, X_test, y_test)

logging.getLogger().setLevel(level_before)

TreeTaggerPosTagger
0.7958435207823961
Lines: 8000
Tokens: 79380
Elapsed time: 3.3434009552001953
Time per 1000 lines: 0.4179251194000244
Time per 1,000,000 tokens: 42.11893367599137



In [None]:
y_pred_nltk=tag_and_evaluate(nltk_tagger, X_test, y_test)
#y_pred_stanza=tag_and_evaluate(stanza_tagger, X_test, y_test)
y_pred_spacy=tag_and_evaluate(spacy_tagger, X_test, y_test)
y_pred_hanta=tag_and_evaluate(hanta_tagger, X_test, y_test)


NLTKPosTagger
0.7727519695734855
Lines: 8000
Tokens: 79380
Elapsed time: 2.1758291721343994
Time per 1000 lines: 0.2719786465167999
Time per 1,000,000 tokens: 27.410294433539928

SpacyPosTagger
0.8088834555827221
Lines: 8000
Tokens: 79380
Elapsed time: 30.621372938156128
Time per 1000 lines: 3.827671617269516
Time per 1,000,000 tokens: 385.7567767467388

HanTaPosTagger
0.7874218962238522
Lines: 8000
Tokens: 79380
Elapsed time: 39.11181902885437
Time per 1000 lines: 4.888977378606796
Time per 1,000,000 tokens: 492.71628910121404



In [None]:
y_pred_stanza=tag_and_evaluate(stanza_tagger, X_test, y_test)

StanzaPosTagger
0.9024721543058951
Lines: 8000
Tokens: 79380
Elapsed time: 449.5919780731201
Time per 1000 lines: 56.198997259140015
Time per 1,000,000 tokens: 5663.794130424794

