In [4]:
# GENERIC FIRST CELL FOR DEVELOPING A NEW METHODOLOGY

# Define an expressive name for the methodology, that will be used to create the output directory
# The complete path to the output directory will be provided in OUT_DIR
METHODOLOGY_NAME='poslog/'

# Set to number of directories below project root if the notebook is in a subdirectory of the project, so you can use relative paths
SUBDIR_LEVEL = 1
if SUBDIR_LEVEL>0:
    import sys 
    new_path = '../'*SUBDIR_LEVEL
    if new_path not in sys.path:
        sys.path.append(new_path)

import os
OUT_DIR = os.path.relpath(os.path.join(os.getcwd(), '../'*SUBDIR_LEVEL, 'out', METHODOLOGY_NAME))
if not os.path.exists(OUT_DIR):
        os.makedirs(OUT_DIR)
print(f"Current output directory: '{OUT_DIR}'")

%load_ext autoreload
%autoreload 2

Current output directory: '../out/poslog'
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
POS_LOG_MODEL='pos_log_upos_crf_10k_model'

# Precons
INPUT_FILE=os.path.join(OUT_DIR, '4_ground_truth_wip_upos.csv')

# if file does not exist, throw an error
if not os.path.exists(INPUT_FILE):
    precon='2-2_tag_comparison_correction.ipynb'
    raise FileNotFoundError(f"File '{INPUT_FILE}' not found. Run '{precon}' first.")

In [3]:
import pandas as pd
example_df=pd.read_csv(INPUT_FILE)
example_df

Unnamed: 0,Dataset,Line,Example,Template,ClusterId,Tokens,nltk,stanza,spacy,hanta,treetagger,TagComparison,Majority,ManualTagging
0,HDFS,1549,Receiving block blk_5614249702379360530 src: /...,Receiving block <*> src: <*> dest: <*>,0,"['Receiving', 'block', 'blk_561424970237936053...","['VERB', 'NOUN', 'NOUN', 'NOUN', 'PUNCT', 'ADJ...","['VERB', 'NOUN', 'NOUN', 'NOUN', 'PUNCT', 'NUM...","['VERB', 'NOUN', 'NUM', 'NOUN', 'PUNCT', 'PUNC...","['VERB', 'NOUN', 'NOUN', 'NOUN', 'PUNCT', 'NUM...","['VERB', 'NOUN', 'NOUN', 'NOUN', 'PUNCT', 'NUM...","TagComparison(majority=['VERB', 'NOUN', 'PROPN...","['VERB', 'NOUN', 'PROPN', 'NOUN', 'PUNCT', 'NU...","['VERB', 'NOUN', 'PROPN', 'NOUN', 'PUNCT', 'NU..."
1,HDFS,2847,BLOCK* NameSystem.addStoredBlock: blockMap upd...,BLOCK* NameSystem.addStoredBlock: blockMap upd...,1,"['BLOCK', '*', 'NameSystem.addStoredBlock', '....","['PROPN', 'PROPN', 'PROPN', 'PUNCT', 'NOUN', '...","['NOUN', 'PUNCT', 'PROPN', 'PUNCT', 'NOUN', 'V...","['PROPN', 'PUNCT', 'PROPN', 'PUNCT', 'AUX', 'V...","['NOUN', 'NUM', 'NOUN', 'PUNCT', 'NOUN', 'VERB...","['NOUN', 'SYM', 'PROPN', 'PUNCT', 'NOUN', 'VER...","TagComparison(majority=['NOUN', 'SYM', 'PROPN'...","['NOUN', 'SYM', 'PROPN', 'PUNCT', 'PROPN', 'VE...","['NOUN', 'SYM', 'PROPN', 'PUNCT', 'PROPN', 'VE..."
2,HDFS,3092,BLOCK* NameSystem.allocateBlock: /user/root/ra...,BLOCK* NameSystem.allocateBlock: <*>,2,"['BLOCK', '*', 'NameSystem.allocateBlock', '.'...","['PROPN', 'PROPN', 'PROPN', 'PUNCT', 'NOUN', '...","['NOUN', 'PUNCT', 'PROPN', 'PUNCT', 'PROPN', '...","['PROPN', 'PUNCT', 'PROPN', 'PUNCT', 'INTJ', '...","['NOUN', 'NUM', 'NOUN', 'PUNCT', 'NOUN', 'PUNC...","['NOUN', 'SYM', 'PROPN', 'PUNCT', 'NOUN', 'PUN...","TagComparison(majority=['NOUN', 'SYM', 'PROPN'...","['NOUN', 'SYM', 'PROPN', 'PUNCT', 'PROPN', 'PU...","['NOUN', 'SYM', 'PROPN', 'PUNCT', 'PROPN', 'PU..."
3,HDFS,4819,Received block blk_-6232712486646639079 of siz...,Received block <*> of size <*> from <*>,3,"['Received', 'block', 'blk_-623271248664663907...","['VERB', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'NUM', ...","['VERB', 'NOUN', 'PROPN', 'ADP', 'NOUN', 'NUM'...","['VERB', 'NOUN', 'NOUN', 'ADP', 'NOUN', 'NUM',...","['VERB', 'NOUN', 'NOUN', 'ADP', 'NOUN', 'NUM',...","['VERB', 'NOUN', 'NOUN', 'ADP', 'NOUN', 'NUM',...","TagComparison(majority=['VERB', 'NOUN', 'PROPN...","['VERB', 'NOUN', 'PROPN', 'ADP', 'NOUN', 'NUM'...","['VERB', 'NOUN', 'PROPN', 'ADP', 'NOUN', 'NUM'..."
4,HDFS,9108,PacketResponder 1 for block blk_-6877771159587...,PacketResponder <*> for block <*> terminating,4,"['PacketResponder', '1', 'for', 'block', 'blk_...","['NOUN', 'NUM', 'ADP', 'NOUN', 'NOUN', 'NOUN']","['PROPN', 'NUM', 'ADP', 'NOUN', 'PROPN', 'NOUN']","['NOUN', 'NUM', 'ADP', 'NOUN', 'NOUN', 'VERB']","['ADJ', 'NUM', 'ADP', 'NOUN', 'NOUN', 'VERB']","['PROPN', 'NUM', 'ADP', 'NOUN', 'NOUN', 'VERB']","TagComparison(majority=['PROPN', 'NUM', 'ADP',...","['PROPN', 'NUM', 'ADP', 'NOUN', 'PROPN', None]","['PROPN', 'NUM', 'ADP', 'NOUN', 'PROPN', 'VERB']"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5543,Mac,110150,UNINSTALLED:com.skype.skype.ShareExtension com...,UNINSTALLED:com.skype.skype.ShareExtension <*>...,536,"['UNINSTALLED', '.', 'com.skype.skype.ShareExt...","['PROPN', 'PUNCT', 'NOUN', 'NOUN', 'PUNCT', 'N...","['VERB', 'PUNCT', 'PROPN', 'PROPN', 'PUNCT', '...","['VERB', 'PUNCT', 'PROPN', 'PROPN', 'PUNCT', '...","['VERB', 'PUNCT', 'NOUN', 'NOUN', 'PUNCT', 'NU...","['ADJ', 'PUNCT', 'NOUN', 'NOUN', 'PUNCT', 'NUM...","TagComparison(majority=['VERB', 'PUNCT', 'PROP...","[None, 'PUNCT', 'PROPN', 'PROPN', 'PUNCT', 'NU...","[None, 'PUNCT', 'PROPN', 'PROPN', 'PUNCT', 'NU..."
5544,Mac,110239,Uncommited CATransaction. Set CA_DEBUG_TRANSAC...,Uncommited CATransaction. Set <*> in environme...,537,"['Uncommited', 'CATransaction', '.', 'Set', 'C...","['VERB', 'PROPN', 'PUNCT', 'PROPN', 'PROPN', '...","['ADJ', 'NOUN', 'PUNCT', 'VERB', 'PROPN', 'ADP...","['PROPN', 'PROPN', 'PUNCT', 'VERB', 'PROPN', '...","['ADJ', 'NOUN', 'PUNCT', 'NOUN', 'NOUN', 'ADP'...","['PROPN', 'PROPN', 'PUNCT', 'VERB', 'ADJ', 'AD...","TagComparison(majority=['PROPN', 'PROPN', 'PUN...","['PROPN', 'PROPN', 'PUNCT', None, 'X', 'ADP', ...","['PROPN', 'PROPN', 'PUNCT', None, 'X', 'ADP', ..."
5545,Mac,63596,objc[35448]: Class TSUDurationLocaleSpecificSt...,<*> Class TSUDurationLocaleSpecificStorage is ...,538,"['objc', '(', '35448', ')', '.', 'Class', 'TSU...","['NOUN', 'PUNCT', 'NUM', 'PUNCT', 'PUNCT', 'NO...","['PROPN', 'PUNCT', 'NUM', 'PUNCT', 'PUNCT', 'P...","['PROPN', 'PUNCT', 'NUM', 'PUNCT', 'PUNCT', 'N...","['PROPN', 'PUNCT', 'NUM', 'PUNCT', 'PUNCT', 'N...","['PROPN', 'PUNCT', 'NUM', 'PUNCT', 'PUNCT', 'P...","TagComparison(majority=['PROPN', 'PUNCT', 'NUM...","['PROPN', 'PUNCT', 'NUM', 'PUNCT', 'PUNCT', 'N...","['PROPN', 'PUNCT', 'NUM', 'PUNCT', 'PUNCT', 'N..."
5546,Mac,64616,[00:29:25.872] HTTPRequest figHttpRequestDidFa...,<*> HTTPRequest figHttpRequestDidFailCallback:...,539,"['(', '00:29:25.872', ')', 'HTTPRequest', 'fig...","['PUNCT', 'NUM', 'PUNCT', 'PROPN', 'NOUN', 'PU...","['PUNCT', 'NUM', 'PUNCT', 'NOUN', 'PROPN', 'PU...","['PUNCT', 'PROPN', 'PUNCT', 'PROPN', 'NOUN', '...","['PUNCT', 'NUM', 'PUNCT', 'ADJ', 'NOUN', 'PUNC...","['PUNCT', 'NUM', 'PUNCT', 'PROPN', 'NOUN', 'PU...","TagComparison(majority=['PUNCT', 'NUM', 'PUNCT...","['PUNCT', 'NUM', 'PUNCT', 'PROPN', 'PROPN', 'P...","['PUNCT', 'NUM', 'PUNCT', 'PROPN', 'PROPN', 'P..."


In [4]:
majorities=[eval(x) for x in example_df['Majority'].to_list()]
manual_tags=[eval(x) for x in example_df['ManualTagging'].to_list()]
examples_splitted=[eval(x) for x in example_df['Tokens'].to_list()]

In [5]:
majorities=manual_tags

In [6]:
nones_sum=0
nones_in_lines=0
token_count=0
full_tagged_indices=[]
ragged_tagged_indices=[]
for majority in majorities:
    nones=majority.count(None)
    token_count+=len(majority)
    nones_sum+=nones
    if nones>0:
        nones_in_lines+=1
        ragged_tagged_indices.append(majorities.index(majority))
    else:
        full_tagged_indices.append(majorities.index(majority))

print(f"None count: {nones_sum} of {token_count} tokens ({nones_sum/token_count:.2%})")
print(f"Lines with None: {nones_in_lines} of {len(majorities)} ({nones_in_lines/len(majorities):.2%})")
print(f"Lines with full tagging: {len(full_tagged_indices)} of {len(majorities)} ({len(full_tagged_indices)/len(majorities):.2%})")

print(full_tagged_indices)

None count: 2182 of 64542 tokens (3.38%)
Lines with None: 1548 of 5548 (27.90%)
Lines with full tagging: 4000 of 5548 (72.10%)
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 42, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 52, 61, 62, 63, 64, 65, 66, 52, 68, 69, 71, 72, 73, 74, 75, 76, 77, 38, 66, 80, 81, 82, 83, 84, 85, 86, 87, 47, 39, 90, 91, 92, 93, 94, 95, 31, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 101, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 77, 19, 136, 137, 139, 140, 141, 142, 143, 144, 145, 146, 147, 109, 146, 150, 151, 152, 153, 154, 155, 156, 41, 158, 159, 160, 161, 162, 163, 111, 165, 166, 167, 168, 169, 140, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 42, 187, 188, 189, 152, 191, 192, 193, 194, 195, 196, 197, 101, 199,

In [7]:
tagged_sents=[]
for word_i in full_tagged_indices:
    tokens=examples_splitted[word_i]
    tags=majorities[word_i]
    tagged_sents.append(list(zip(tokens,tags)))
print(len(tagged_sents), sum([len(x) for x in tagged_sents]))

4000 40205


In [None]:
from util import Tokenizer
tokenizer=Tokenizer()
s="Tag this sentence."
tokens=tokenizer.tokenize(s)
print(tokens)
from poslog import PosLogCRF
pos_log=PosLogCRF('pos_log_upos_crf_10k_model_pre_manual')
pos=pos_log.predict(tokens)
print(pos)

['Tag', 'this', 'sentence', '.']
['VERB' 'DET' 'NOUN' 'PUNCT']


# Train/Test Split

80/20 split

In [9]:
TRAIN_SPLIT=0.8

train_sents=[]
test_sents=[]

shuffeled_tagged_sents=tagged_sents.copy()
import random
random.seed(42)
random.shuffle(shuffeled_tagged_sents)

split_index = int(len(shuffeled_tagged_sents) * TRAIN_SPLIT)
train_sents = shuffeled_tagged_sents[:split_index]
test_sents = shuffeled_tagged_sents[split_index:]
print(f"Train size: {len(train_sents)} ({sum([len(x) for x in train_sents])})")
print(f"Test size: {len(test_sents)} ({sum([len(x) for x in test_sents])})")

Train size: 3200 (32267)
Test size: 800 (7938)


## Evaluate

In [10]:
def remove_none_values_from_ys(y, y_pred, skip_x=True):
    yas=[]
    yps=[]
    for i in range(len(y)):
        ya=[]
        yp=[]
        for j in range(len(y[i])):
            if y[i][j] is not None:
                if skip_x and y[i][j]=='X':
                    continue
                ya.append(y[i][j])
                yp.append(y_pred[i][j])
        yas.append(ya)
        yps.append(yp)
    return yas, yps

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

def print_accuracy(y, y_pred, report=False, skip_x=True)->float:
    yas, yps = remove_none_values_from_ys(y, y_pred, skip_x=skip_x)
    y_t=[tag for tags in yas for tag in tags]
    y_p=[tag for tags in yps for tag in tags]

    acc=accuracy_score(y_t, y_p)
    print(acc)
    if report:
        print(classification_report(y_t, y_p))
    return acc



In [None]:
from util.pos import AbstractPosTagger

import time
def tag_and_meas_time(tagger:AbstractPosTagger, X:list[list[str]])->tuple[list[list[str]], float]:
    time_start = time.time()
    y_pred=[tagger.pos_tag(x) for x in X]
    elapsed_time=time.time()-time_start
    return y_pred, elapsed_time

  from .autonotebook import tqdm as notebook_tqdm
  punct2find_re = re.compile("([^ ])([[" + ALONEMARKS + "])",
  DnsHostMatch_re = re.compile("(" + DnsHost_expression + ")",
  UrlMatch_re = re.compile(UrlMatch_expression, re.VERBOSE | re.IGNORECASE)
  EmailMatch_re = re.compile(EmailMatch_expression, re.VERBOSE | re.IGNORECASE)


In [12]:
def tag_and_evaluate(tagger:AbstractPosTagger, X:list[list[str]], y:list[list[str]], last_pred:tuple[list[list[str]],float]|None=None, report=False, skip_x=True)->list[list[str]]:
    print(tagger.__class__.__name__)

    if last_pred is None:
        y_pred,elapsed_time=tag_and_meas_time(tagger, X)
    else:
        y_pred,elapsed_time=last_pred

    acc=print_accuracy(y, y_pred, report, skip_x)
    lines=len(X)
    tokens=sum([len(x) for x in X])
    print(f"Lines: {lines}")
    print(f"Tokens: {tokens}")
    print(f"Elapsed time: {elapsed_time}")
    print(f"Time per 1000 lines: {elapsed_time/lines*1000}")
    print(f"Time per 1,000,000 tokens: {elapsed_time/tokens*1000000}\n")
    return (y_pred, elapsed_time, acc)


In [13]:
from poslog import PosLogCRF
pos_log=PosLogCRF(POS_LOG_MODEL)
pos_log.train_from_tagged_sents(train_sents)

INFO:poslog.PosLogCRF:Saved model to '/Users/surfbook/Documents/HsH_WiMi/poslog_git/pipeline/../poslog/models/pos_log_upos_crf_10k_model.pkl'


In [14]:
X_test=[[token for token, tag in sent] for sent in test_sents]
y_test=[[tag for token, tag in sent] for sent in test_sents]
y_pred, elapsed_time, acc=tag_and_evaluate(pos_log, X_test, y_test, report=True, skip_x=False)

PosLogCRF
0.9827412446460065
              precision    recall  f1-score   support

         ADJ       0.99      0.90      0.94       265
         ADP       0.99      0.98      0.99       301
         ADV       0.98      0.97      0.98        61
         AUX       0.99      0.99      0.99       113
       CCONJ       1.00      1.00      1.00        19
         DET       0.98      1.00      0.99        58
        INTJ       0.00      0.00      0.00         1
        NOUN       0.96      0.98      0.97      1477
         NUM       1.00      0.99      0.99       900
        PART       0.92      0.92      0.92        38
        PRON       1.00      0.92      0.96        12
       PROPN       0.98      0.98      0.98      1212
       PUNCT       1.00      1.00      1.00      2204
       SCONJ       0.93      1.00      0.96        13
         SYM       0.97      0.97      0.97       174
        VERB       0.96      0.95      0.95       514
           X       1.00      1.00      1.00       57

## Other taggers

In [None]:
from poslog import AbstractPosTagger
from util.pos import NLTKPosTagger, StanzaPosTagger, SpacyPosTagger, HanTaPosTagger, TreeTaggerPosTagger
nltk_tagger=NLTKPosTagger()
stanza_tagger=StanzaPosTagger()
spacy_tagger=SpacyPosTagger()
hanta_tagger=HanTaPosTagger()
treetagger_tagger=TreeTaggerPosTagger()

INFO:nlp.pos.PosTagger:Initializing NLTKPosTagger
INFO:nlp.pos.PosTagger:NLTKPosTagger initialized
INFO:nlp.pos.PosTagger:Initializing StanzaPosTagger
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json: 426kB [00:00, 108MB/s]                     
INFO:stanza:Downloaded file to /Users/surfbook/stanza_resources/resources.json
INFO:stanza:Loading these models for language: en (English):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| pos       | combined_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!
INFO:nlp.pos.PosTagger:StanzaPosTagger initialized
INFO:nlp.pos.PosTagger:Initializing SpacyPosTagger
INFO:nlp.

In [16]:
# Disabling logging for TreeTagger
# Maybe you have to restart the kernel
import logging
level_before=logging.getLogger().level
logging.getLogger().setLevel(logging.WARNING)

y_pred_treetagger=tag_and_evaluate(treetagger_tagger, X_test, y_test)

logging.getLogger().setLevel(level_before)

TreeTaggerPosTagger
0.7958435207823961
Lines: 800
Tokens: 7938
Elapsed time: 0.9398319721221924
Time per 1000 lines: 1.1747899651527405
Time per 1,000,000 tokens: 118.39656993224898



In [17]:
y_pred_nltk=tag_and_evaluate(nltk_tagger, X_test, y_test)
#y_pred_stanza=tag_and_evaluate(stanza_tagger, X_test, y_test)
y_pred_spacy=tag_and_evaluate(spacy_tagger, X_test, y_test)
y_pred_hanta=tag_and_evaluate(hanta_tagger, X_test, y_test)


NLTKPosTagger
0.7727519695734855
Lines: 800
Tokens: 7938
Elapsed time: 0.3036789894104004
Time per 1000 lines: 0.3795987367630005
Time per 1,000,000 tokens: 38.256360469942095

SpacyPosTagger
0.8088834555827221
Lines: 800
Tokens: 7938
Elapsed time: 3.2392539978027344
Time per 1000 lines: 4.049067497253418
Time per 1,000,000 tokens: 408.06928669724545

HanTaPosTagger
0.7874218962238522
Lines: 800
Tokens: 7938
Elapsed time: 3.994476079940796
Time per 1000 lines: 4.993095099925995
Time per 1,000,000 tokens: 503.2093827085911



In [18]:
y_pred_stanza=tag_and_evaluate(stanza_tagger, X_test, y_test)

StanzaPosTagger
0.9024721543058951
Lines: 800
Tokens: 7938
Elapsed time: 46.08687400817871
Time per 1000 lines: 57.60859251022339
Time per 1,000,000 tokens: 5805.854624361137



## Repeat tagging for better time evaluation

In [19]:
X_test = X_test * 10
y_test = y_test * 10

In [20]:
y_pred, elapsed_time, acc=tag_and_evaluate(pos_log, X_test, y_test, report=True, skip_x=False)

PosLogCRF
0.9827412446460065
              precision    recall  f1-score   support

         ADJ       0.99      0.90      0.94      2650
         ADP       0.99      0.98      0.99      3010
         ADV       0.98      0.97      0.98       610
         AUX       0.99      0.99      0.99      1130
       CCONJ       1.00      1.00      1.00       190
         DET       0.98      1.00      0.99       580
        INTJ       0.00      0.00      0.00        10
        NOUN       0.96      0.98      0.97     14770
         NUM       1.00      0.99      0.99      9000
        PART       0.92      0.92      0.92       380
        PRON       1.00      0.92      0.96       120
       PROPN       0.98      0.98      0.98     12120
       PUNCT       1.00      1.00      1.00     22040
       SCONJ       0.93      1.00      0.96       130
         SYM       0.97      0.97      0.97      1740
        VERB       0.96      0.95      0.95      5140
           X       1.00      1.00      1.00      576

In [21]:
# Disabling logging for TreeTagger
# Maybe you have to restart the kernel
import logging
level_before=logging.getLogger().level
logging.getLogger().setLevel(logging.WARNING)

y_pred_treetagger=tag_and_evaluate(treetagger_tagger, X_test, y_test)

logging.getLogger().setLevel(level_before)

TreeTaggerPosTagger
0.7958435207823961
Lines: 8000
Tokens: 79380
Elapsed time: 3.3434009552001953
Time per 1000 lines: 0.4179251194000244
Time per 1,000,000 tokens: 42.11893367599137



In [22]:
y_pred_nltk=tag_and_evaluate(nltk_tagger, X_test, y_test)
#y_pred_stanza=tag_and_evaluate(stanza_tagger, X_test, y_test)
y_pred_spacy=tag_and_evaluate(spacy_tagger, X_test, y_test)
y_pred_hanta=tag_and_evaluate(hanta_tagger, X_test, y_test)


NLTKPosTagger
0.7727519695734855
Lines: 8000
Tokens: 79380
Elapsed time: 2.1758291721343994
Time per 1000 lines: 0.2719786465167999
Time per 1,000,000 tokens: 27.410294433539928

SpacyPosTagger
0.8088834555827221
Lines: 8000
Tokens: 79380
Elapsed time: 30.621372938156128
Time per 1000 lines: 3.827671617269516
Time per 1,000,000 tokens: 385.7567767467388

HanTaPosTagger
0.7874218962238522
Lines: 8000
Tokens: 79380
Elapsed time: 39.11181902885437
Time per 1000 lines: 4.888977378606796
Time per 1,000,000 tokens: 492.71628910121404



In [23]:
y_pred_stanza=tag_and_evaluate(stanza_tagger, X_test, y_test)

StanzaPosTagger
0.9024721543058951
Lines: 8000
Tokens: 79380
Elapsed time: 449.5919780731201
Time per 1000 lines: 56.198997259140015
Time per 1,000,000 tokens: 5663.794130424794



# Now tag with all taggers not fully corrected lines and evaluate while skipping None Values in Tags
# This should give an Comparison of PosLog and Taggers

In [24]:
# these are all lines and their counted majority containing Nones
majorities

[['VERB', 'NOUN', 'PROPN', 'NOUN', 'PUNCT', 'NUM', 'NOUN', 'PUNCT', 'NUM'],
 ['NOUN',
  'SYM',
  'PROPN',
  'PUNCT',
  'PROPN',
  'VERB',
  'PUNCT',
  'NUM',
  'AUX',
  'VERB',
  'ADP',
  'PROPN',
  'NOUN',
  'NUM'],
 ['NOUN', 'SYM', 'PROPN', 'PUNCT', 'PROPN', 'PUNCT', 'PROPN'],
 ['VERB', 'NOUN', 'PROPN', 'ADP', 'NOUN', 'NUM', 'ADP', 'NUM'],
 ['PROPN', 'NUM', 'ADP', 'NOUN', 'PROPN', 'VERB'],
 ['NUM', 'PUNCT', 'VERB', 'NOUN', 'PROPN', 'ADP', 'PROPN'],
 ['NUM', 'VERB', 'NOUN', 'PART', 'VERB', 'NOUN', 'PROPN', 'ADP', 'NUM'],
 ['NOUN', 'VERB', 'ADP', 'PROPN'],
 ['NOUN',
  'SYM',
  'PROPN',
  'PUNCT',
  'PROPN',
  'AUX',
  'VERB',
  'ADP',
  'PROPN',
  'ADP',
  'NUM'],
 ['VERB', 'NOUN', 'PROPN', 'NOUN', 'PROPN'],
 ['NUM',
  'PUNCT',
  'VERB',
  'NOUN',
  'SCONJ',
  'VERB',
  'PROPN',
  'ADP',
  'NUM',
  'PUNCT'],
 ['NUM', 'VERB', 'NOUN', 'PROPN', 'ADP', 'PROPN'],
 ['PROPN',
  'PROPN',
  'VERB',
  'NOUN',
  'PROPN',
  'PUNCT',
  'AUX',
  'ADV',
  'VERB',
  'ADP',
  'NOUN'],
 ['NOUN',
  'SYM'

In [25]:
# line indices that was not trained on
len(ragged_tagged_indices)

1548

In [26]:
ragged_lines = [examples_splitted[i] for i in ragged_tagged_indices]
ragged_majorities = [majorities[i] for i in ragged_tagged_indices]
X=ragged_lines
y=ragged_majorities
#ragged_zipped = [list(zip(line, majority)) for line, majority in zip(ragged_lines, ragged_majorities)]
#ragged_zipped

In [27]:
def remove_none_values_from_ys(y, y_pred, skip_x=True):
    yas=[]
    yps=[]
    for i in range(len(y)):
        ya=[]
        yp=[]
        for j in range(len(y[i])):
            #if y[i][j] is not None and not y[i][j].startswith('PROMETEUS'):
            if y[i][j] is not None:# and y[i][j]!='X':
                if skip_x and y[i][j]=='X':
                    continue
                ya.append(y[i][j])
                yp.append(y_pred[i][j])
        yas.append(ya)
        yps.append(yp)
    return yas, yps

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

def print_accuracy(y, y_pred, report=False, skip_x=True)->float:
    yas, yps = remove_none_values_from_ys(y, y_pred, skip_x=skip_x)
    y_t=[tag for tags in yas for tag in tags]
    y_p=[tag for tags in yps for tag in tags]

    acc=accuracy_score(y_t, y_p)
    print(acc)
    if report:
        print(classification_report(y_t, y_p))
    return acc



In [None]:
from util.pos import AbstractPosTagger

import time
def tag_and_meas_time(tagger:AbstractPosTagger, X:list[list[str]])->tuple[list[list[str]], float]:
    time_start = time.time()
    y_pred=[tagger.pos_tag(x) for x in X]
    elapsed_time=time.time()-time_start
    return y_pred, elapsed_time

In [29]:
def tag_and_evaluate(tagger:AbstractPosTagger, X:list[list[str]], y:list[list[str]], last_pred:tuple[list[list[str]],float]|None=None, report=False, skip_x=True)->list[list[str]]:
    print(tagger.__class__.__name__)

    if last_pred is None:
        y_pred,elapsed_time=tag_and_meas_time(tagger, X)
    else:
        y_pred,elapsed_time=last_pred

    acc=print_accuracy(y, y_pred, report, skip_x)
    lines=len(X)
    tokens=sum([len(x) for x in X])
    print(f"Lines: {lines}")
    print(f"Tokens: {tokens}")
    print(f"Elapsed time: {elapsed_time}")
    print(f"Time per 1000 lines: {elapsed_time/lines*1000}")
    print(f"Time per 1000.000 tokens: {elapsed_time/tokens*1000000}\n")
    return (y_pred, elapsed_time, acc)


### Train First CRF

In [None]:
from util.pos import KnownWordsDetector, RegexTokenClassMatcher, WordKind, TokenClass
import re
import string
kwdet=KnownWordsDetector()
rgtcm=RegexTokenClassMatcher()

feature:str=None
def make_features(words:list[str])->list[dict[str,str]]:
    features_list=[]
    for i, word in enumerate(words):

        features = {}

        features['word']=word

        #features['kind_of_known_word'] = kind_of_known_word(word) or 'unknown'
        kind_of_known_word = kwdet.kind_of_known_word(word)
        #features['kind_of_known_word'] = self.kwdet.kind_of_known_word(word).value
        features['is_stopword'] = 1 if kind_of_known_word == WordKind.STOP_WORD else 0
        features['is_wordnet'] = 1 if kind_of_known_word == WordKind.WORD_NET else 0
        features['is_wordnet'] = 1 if kind_of_known_word == WordKind.WORD_NET or kind_of_known_word == WordKind.WORDS_DICTIONARY else 0
        #features['is_words_dictionary'] = 1 if kind_of_known_word == WordKind.WORDS_DICTIONARY else 0
        features['is_domain_word'] = 1 if kind_of_known_word == WordKind.DOMAIN_WORD else 0
        features['is_number'] = 1 if kind_of_known_word == WordKind.NUMBER else 0
        features['is_unknown'] = 1 if kind_of_known_word == WordKind.UNKNOWN else 0



        #features['mask_type'] = get_mask_for_token(word) or 'unknown'
        #features['mask_type']=masker._determine_token_type(word).value
        token_class = rgtcm.token_class(word)
        features['word_class'] = token_class.value
        # features['tc_number'] = 1 if token_class == TokenClass.NUMBER else 0
        # features['tc_identifier'] = 1 if token_class == TokenClass.IDENTIFIER else 0
        # features['tc_key_value_pair'] = 1 if token_class == TokenClass.KEY_VALUE_PAIR else 0
        # features['tc_date_time'] = 1 if token_class == TokenClass.DATE_TIME else 0
        # features['tc_location'] = 1 if token_class == TokenClass.LOCATION else 0
        # features['tc_variable'] = 1 if token_class == TokenClass.VARIABLE else 0
        # features['tc_symbol'] = 1 if token_class == TokenClass.SYMBOL else 0
        # features['tc_punctuation'] = 1 if token_class == TokenClass.PUNCTUATION else 0
        # features['tc_misc'] = 1 if token_class == TokenClass.MISC else 0
        # features['tc_unknown'] = 1 if token_class == TokenClass.UNKNOWN else 0


        #features['upcase'] = 1 if word[0].isupper() else 0

        features['has_upper'] = 1 if re.search(r'[A-Z]',word) else 0

        # camel_case=re.compile(r'_*[a-zA-Z]+[a-z]([A-Z][a-z]+)+\d*')
        # snake_case=re.compile(r'_*[a-zA-Z]+(_[a-zA-Z]+)+\d*')
        # kebap_case=re.compile(r'[a-zA-Z]+(-[a-zA-Z]+)+\d*')
        # word_digit_mix=re.compile(r'(.*[a-zA-Z_\-/]+[0-9]+.*)|(.*[0-9]+[a-zA-Z_\-/]+.*)')
        # features['is-var']=1 if camel_case.fullmatch(word) or snake_case.fullmatch(word) or kebap_case.fullmatch(word) or word_digit_mix.fullmatch(word) else 0

        path_regex=re.compile(r'\w*:?([\.\/\\]+[\w\-:]+)+')
        features['is-path']=1 if path_regex.fullmatch(word) else 0

        # For better distinguish between 'to' as ADP and PART
        features['is_to']=1 if word.lower() == 'to' else 0

        #features['has_equal_sign'] = 1 if re.search(r'=',word) else 0


        #features['number'] = 1 if is_number(word) else 0
        #features['number'] = 1 if word.isdigit() else 0
        #features['number'] = 1 if word.isdigit() or features['mask_type']=='Number' else 0

        features['contains_number'] = 1 if re.search(r'[0-9]',word) else 0
        
        #features['is_punct'] = 1 if re.fullmatch(r'['+string.punctuation+']',word) else 0

        features['contains_punct']=1 if re.search(r'['+string.punctuation+']',word) else 0

        punct_chars=re.escape(r""".,;:!?()[]{}_…“”‘’"'/\|·«»`~¿¡•""")
        #features['is_punct'] = 1 if re.fullmatch(r'['+punct_chars+']',word) else 0
        sym_chars=re.escape(r"""+-=*^%$&§¤#@<>©®™°±×÷√∞∑∏∫∆µπΩ≠≈∈∩∪⊂⊃∅∇⊕⊗⇒⇔""")
        #features['is_sym'] = 1 if re.fullmatch(r'['+sym_chars+']',word) else 0
        #features['is_dash']=1 if re.fullmatch(r'[-]',word) else 0

        #features['key_value'] = 1 if re.search(r'[=:]',word) else 0

        # ideas from: https://www.geeksforgeeks.org/conditional-random-fields-crfs-for-pos-tagging-in-nlp/

        features['is_first'] = i == 0
        features['is_last'] = i == len(words) - 1

        features['all_caps'] = 1 if word.upper() == word else 0
        features['all_lower'] = 1 if word.lower() == word else 0

        #features['prev_word']= '' if i == 0 else tagged_sent[i-1][0]

        # Next word to better distinguish between 'to' as ADP and PART
        features['next_word']= '' if i == len(words)-1 else words[i+1]
        
        features['prev_char']= '' if i == 0 else words[i-1][-1]
        features['next_char']= '' if i == len(words)-1 else words[i+1][0]

        features['prefix-1'] = word[0]
        features['prefix-2'] = word[:2]
        # features['prefix-3'] = word[:3]

        features['suffix-1'] = word[-1]
        features['suffix-2'] = word[-2:]
        features['suffix-3'] = word[-3:]
        #features['suffix-4'] = word[-4:]
        
        
        word_lower = word.lower()
        # features['word'] = word
        features['word.lower'] = word_lower
        # features['word.isupper'] = str(word.isupper())
        # features['word.istitle'] = str(word.istitle())
        # features['word.isdigit'] = str(word.isdigit())

        # Prefixes and suffixes — useful for ADJ
        features['suffix3'] = word_lower[-3:]
        features['suffix2'] = word_lower[-2:]
        features['prefix2'] = word_lower[:2]
        features['prefix3'] = word_lower[:3]

        # Punctuation — useful to catch INTJ
        features['is_punct'] = str(word in "!?.;,")

        # Position-aware features
        if i > 0:
            features['prev_word'] = words[i - 1].lower()
            features['prev_is_upper'] = str(words[i - 1].isupper())
        else: 
            features['prev_word'] = ''
            features['prev_is_upper'] = False

        # else:
        #     features['BOS'] = 'True'  # Beginning of sentence

        if i < len(words) - 1:
            # features['next_word'] = words[i + 1].lower()
            features['next_is_upper'] = str(words[i + 1].isupper())
        else:
            features['next_is_upper'] = False

        #     features['EOS'] = 'True'  # End of sentence

        # Shape-based features
        features['word_shape'] = get_shape(word)

        # features['has_hyphen'] = str('-' in word)
        # features['has_digit'] = str(any(char.isdigit() for char in word))
        # # features['has_alpha'] = str(any(char.isalpha() for char in word))
        # features['is_short'] = str(len(word) <= 3)  # Often INTJ/PART
        # features['is_long'] = str(len(word) >= 10)  # Often ADJ

        # particles = {'not', 'off', 'up', 'down'}
        # interjections = {'oh', 'ah', 'wow', 'hey', 'oops', 'ouch', 'ok', 'bye', 'yes'}
        adjective_suffixes = ('ous', 'ful', 'ive', 'able', 'al', 'ic', 'less', 'ish')

        # features['is_particle'] = str(word_lower in particles)
        # features['is_interjection'] = str(word_lower in interjections)
        features['adj_suffix_match'] = str(any(word_lower.endswith(suf) for suf in adjective_suffixes))

        noun_suffixes = ('tion', 'ment', 'ness', 'ity', 'ship', 'age', 'ism', 'ence', 'ance', 'hood', 'dom')
        features['noun_suffix'] = str(any(word_lower.endswith(suf) for suf in noun_suffixes))

        # e.g., "the big ___" → likely NOUN
        determiners = {'the', 'a', 'an', 'this', 'that', 'these', 'those'}
        features['prev_is_determiner'] = str(i > 0 and words[i-1].lower() in determiners)

        # if i > 1:
        #     features['prev2_word'] = words[i-2].lower()
        # if i < len(words) - 2:
        #     features['next2_word'] = words[i+2].lower()
        
        #features['shape_affix'] = get_shape_affix(word)
        
        #features['word.no_digits'] = re.sub(r'\d', '0', word.lower())
        #features['digit_count'] = str(sum(c.isdigit() for c in word))
        #features['starts_with_digit'] = str(word[0].isdigit())
        #features['digit_pattern'] = re.sub(r'\d+', '0', word.lower())  # collapses "2023rd" to "0rd"
        #features['word.nodigits'] = re.sub(r'\d+', '', word.lower())  # "R2D2" → "rd"

        if feature is not None:
            features={feature: features[feature]} 
            
        features_list.append(features)
    
    return features_list           

def get_shape(word: str) -> str:
    shape = ''
    for char in word:
        if char.isupper():
            shape += 'X'
        elif char.islower():
            shape += 'x'
        elif char.isdigit():
            shape += 'd'
        else:
            #shape += '_'
            shape += char
    return shape

ImportError: cannot import name 'KnownWordsDetector' from 'nlp.pos' (/Users/surfbook/Documents/HsH_WiMi/poslog_git/pipeline/../nlp/pos/__init__.py)

In [None]:
from itertools import chain, combinations
from util.pos import PosLogCRF

res_d={}
feature=None
x=make_features(['Hello', 'world', '123', '!', 'test'])
ls=list(x[0].keys())
i=0
for l in ls:
    global feature
    feature=l
    print(f"{i}/{len(ls)}: Feature: {feature}")
    i+=1
    crf_run=PosLogCRF(model_path=POS_LOG_MODEL, make_features=make_features)
    crf_run.train_from_tagged_sents(tagged_sents)
    y_pred, elapsed_time, acc = tag_and_evaluate(crf_run, X, y, report=True)
    res_d[l] = acc    
res_d



In [None]:
res_d

In [None]:
import pandas as pd
res_df2=pd.DataFrame.from_dict(res_d, orient='index', columns=['Accuracy'])
res_df2

In [None]:
res_df['only']=res_df2['Accuracy']

In [None]:
res_df

In [None]:
from util.pos import PosLogCRF
crf=PosLogCRF(model_path=POS_LOG_MODEL)#, make_features=make_features)
crf.train_from_tagged_sents(tagged_sents)


In [None]:
x=tag_and_evaluate(crf, X, y, report=True, skip_x=False)
# 0.9514655233007662
#is_var: 0.9512898010824489
#is_path: 0.952379278836016
#is_to: 0.952765867716314
#suffix-3: 0.9544879454558234
# 0.9559991565333521
#gpt: 0.9582484009278133
#wordkind: 0.9592675897940536
#0.9595136008996977
#0.9598299008926688


In [None]:
# # Filter rows where 'Majority' contains 'INTJ'
# intj_examples = example_df[example_df['Majority'].apply(lambda x: 'PART' in x)]
# intj_examples

In [None]:
# poslog=PosLogCRF(model_path=POS_LOG_MODEL)
# x=tag_and_evaluate(poslog, X, y, report=True)
# #x=tag_and_evaluate(crf, X, y, report=True)

# Other Taggers

In [None]:
from util.pos import AbstractPosTagger, NLTKPosTagger, StanzaPosTagger, SpacyPosTagger, HanTaPosTagger, TreeTaggerPosTagger
nltk_tagger=NLTKPosTagger()
stanza_tagger=StanzaPosTagger()
spacy_tagger=SpacyPosTagger()
hanta_tagger=HanTaPosTagger()
treetagger_tagger=TreeTaggerPosTagger()

In [None]:
# Disabling logging for TreeTagger
# Maybe you have to restart the kernel
import logging
level_before=logging.getLogger().level
logging.getLogger().setLevel(logging.WARNING)

y_pred_treetagger=tag_and_evaluate(treetagger_tagger, X, y)

logging.getLogger().setLevel(level_before)

In [None]:
y_pred_nltk=tag_and_evaluate(nltk_tagger, X, y)
#y_pred_stanza=tag_and_evaluate(stanza_tagger, X, y)
y_pred_spacy=tag_and_evaluate(spacy_tagger, X, y)
y_pred_hanta=tag_and_evaluate(hanta_tagger, X, y)


Output:
```
NLTKPosTagger
0.7657043964945872
Lines: 2124
Tokens: 31553
Elapsed time: 0.8707699775695801
Time per 1000 lines: 0.40996703275403956
Time per 1000.000 tokens: 27.59705820586252

StanzaPosTagger
0.8959790853523824
Lines: 2124
Tokens: 31553
Elapsed time: 161.13187313079834
Time per 1000 lines: 75.8624638092271
Time per 1000.000 tokens: 5106.705325350945

SpacyPosTagger
0.7986228735547537
Lines: 2124
Tokens: 31553
Elapsed time: 10.024458169937134
Time per 1000 lines: 4.7196130743583495
Time per 1000.000 tokens: 317.7022207060227

HanTaPosTagger
0.7788865159437367
Lines: 2124
Tokens: 31553
Elapsed time: 11.70390510559082
Time per 1000 lines: 5.510313138225433
Time per 1000.000 tokens: 370.9284412129059

TreeTaggerPosTagger
0.8014213123204948
Lines: 2124
Tokens: 31553
Elapsed time: 1.6282150745391846
Time per 1000 lines: 0.7665796019487686
Time per 1000.000 tokens: 51.602544117490716

```

In [None]:
# poslog=PosLogCRF(model_path=POS_LOG_MODEL)
# x=tag_and_evaluate(poslog, X, y, report=True)
# #x=tag_and_evaluate(crf, X, y, report=True)

In [None]:
t = example_df['Tokens'].tolist()[9]
print(t)


for name, tagger_func in tagger.items():
    if name=='pos_log':
        tagger_func=poslog.pos_tag
    #pos_tags = poslog.pos_tag(tokenizer.tokenize(s))
    #t=tokenizer.tokenize(s)
    pos_tags = tagger_func(t)
    #pos_tags = [tag for token, tag in pos_tags]
    print(name, list(zip(t, pos_tags)))


In [None]:
print_accuracy(y, y_pred)

In [None]:
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.corpus import words


In [None]:
words_dictionary: dict[str, None] = dict.fromkeys(
    [w.lower() for w in words.words()], None)
stopwords_dict: dict[str, None] = dict.fromkeys(
    stopwords.words('english'), None)
#domain_words: dict[str, None] = self._read_domain_words()
print(f'len(words_dictionary): {len(words_dictionary)}')
print(f'len(stopwords_dict): {len(stopwords_dict)}')

In [None]:
wordnet

# Evaluate (old from somewhere)

In [None]:
majorities=[eval(x) for x in examples_df['Majority'].to_list()]
corrected=[x for x in examples_df['ManualTagging'].to_list()]
tokens=[x for x in examples_df['Tokens'].to_list()]

In [None]:
# make sure there are no Nones in the corrected list
num_lines=len(corrected)
for i in range(num_lines):
    c=corrected[i]
    m=majorities[i]
    num_tokens=len(c)
    for j in range(num_tokens):
        if c[j] is None:
            print(f"WARNING: Index {i} has None in corrected list. Replacing with majority tag.")
        if m[j] is None:
            print(f"WARNING: Index {i} has None in manually tagged list. Replacing with majority tag.")


In [None]:
from collections import Counter

num_lines=len(corrected)

mistakes_sum=0
mistakes_lines=0
mistakes_counter=Counter()
mistake_indexes=[]
for i in range(num_lines):
    cs=corrected[i]
    ms=majorities[i]
    #print(f"Line {i}: {cs} vs {ms}")
    
    num_tokens=len(cs)
    mistake_line=False
    for j in range(num_tokens):
        c=cs[j]
        m=ms[j]
        if c!=m:
            mistakes_sum+=1
            mistakes_counter[(m,c)]+=1
            mistake_line=True
            print(f"Line {i}: {m} -> {c} ({tokens[i][j]})")
    if mistake_line:
        mistakes_lines+=1
        mistake_indexes.append(i)

print(f'Number of Lines: {num_lines}')
print(f'Number of Tokens: {sum(len(m) for m in majorities)}')
print(f"Number of lines with mistakes: {mistakes_lines}")
print(f"Number of mistakes: {mistakes_sum}")
print(f'Mistake indexes: {mistake_indexes}')
mistakes_counter

In [None]:
taggers=[c for c in examples_df.columns if c.islower()]
taggers

In [None]:
# truth is: corrected

evaluate_taggers={}

for tagger in taggers:
    tagger_tags=examples_df[tagger].to_list()#[eval(x) for x in examples_df[tagger].to_list()]
    mistake_in_lines=0
    mistakes=0
    mistakes_counter=Counter()
    xs=0
    for i in range(len(corrected)):
        c=corrected[i]
        t=tagger_tags[i]
        wrong_line=False
        for j in range(len(c)):
            if c[j]!=t[j]:
                mistakes+=1
                #mistakes_counter[(t[j],c[j])]+=1
                wrong_line=True
            if c[j]=='X':
                xs+=1
        if wrong_line:
            mistake_in_lines+=1
    evaluate_taggers[tagger]={
        'mistakes': mistakes, 
        'mistake_in_lines': mistake_in_lines, 
        'mistakes_counter': mistakes_counter,
        'percentage_tokens': mistakes/sum(len(m) for m in majorities)*100,
        'xs': xs,
        'percentage_tokens_without_x': (mistakes-xs)/sum(len(m) for m in majorities)*100,
    }
evaluate_taggers

                
                
    