In [1]:
# GENERIC FIRST CELL FOR DEVELOPING A NEW METHODOLOGY
#
# Prepares two things for the rest of the notebook:
#   * appends the project root (SUBDIR_LEVEL directories up) to sys.path,
#   * defines OUT_DIR and makes sure that directory exists.

import os
import sys

# Expressive name of the methodology; it is used as the last component of OUT_DIR.
METHODOLOGY_NAME='poslog/'

# Number of directories this notebook sits below the project root (0 = in the root),
# so that relative paths can be used.
SUBDIR_LEVEL = 1
if SUBDIR_LEVEL>0:
    # Relative entry, i.e. it is interpreted against the current working directory.
    new_path = '../'*SUBDIR_LEVEL
    if new_path not in sys.path:
        sys.path.append(new_path)

# <cwd>/<SUBDIR_LEVEL levels up>/out/<METHODOLOGY_NAME>, expressed relative to cwd.
OUT_DIR = os.path.relpath(os.path.join(os.getcwd(), '../'*SUBDIR_LEVEL, 'out', METHODOLOGY_NAME))
# exist_ok=True creates the directory only when needed, without a separate
# (racy) existence check beforehand.
os.makedirs(OUT_DIR, exist_ok=True)
print(f"Current output directory: '{OUT_DIR}'")

%load_ext autoreload
%autoreload 2

Current output directory: '../out/poslog'


In [2]:
# Preconditions: the input CSV must already exist inside OUT_DIR.
INPUT_FILE = os.path.join(OUT_DIR, '1_examples_10000_each_seed-42_numb_var.csv')

# Fail fast, pointing the user at the notebook that has to be run first.
if not os.path.exists(INPUT_FILE):
    precon = '1_templates_collect_numb_var.ipynb'
    message = f"File '{INPUT_FILE}' not found. Run '{precon}' first."
    raise FileNotFoundError(message)

# Destination of the tagged examples written by the last cell of this notebook.
OUTPUT_FILE = os.path.join(OUT_DIR, '2_examples_tagged_upos.csv')
print(f'Output file: {OUTPUT_FILE}')

Output file: ../out/poslog/2_examples_tagged_upos.csv


In [3]:
import pandas as pd

# Expected columns: Dataset, Line, Example, Template, ClusterId
example_df = pd.read_csv(INPUT_FILE)

# The input file was saved together with its row index, which read_csv surfaces
# as an auto-named 'Unnamed: 0' column. Drop such columns so the frame matches
# the schema above and the stray index is not carried into later outputs.
# With no such column present this is a no-op.
unnamed_cols = [col for col in example_df.columns if str(col).startswith('Unnamed:')]
example_df = example_df.drop(columns=unnamed_cols)
example_df

Unnamed: 0,Dataset,Line,Example,Template,ClusterId
0,HDFS,1549,Receiving block blk_5614249702379360530 src: /...,Receiving block <*> src: <*> dest: <*>,0
1,HDFS,2847,BLOCK* NameSystem.addStoredBlock: blockMap upd...,BLOCK* NameSystem.addStoredBlock: blockMap upd...,1
2,HDFS,3092,BLOCK* NameSystem.allocateBlock: /user/root/ra...,BLOCK* NameSystem.allocateBlock: <*>,2
3,HDFS,4819,Received block blk_-6232712486646639079 of siz...,Received block <*> of size <*> from <*>,3
4,HDFS,9108,PacketResponder 1 for block blk_-6877771159587...,PacketResponder <*> for block <*> terminating,4
...,...,...,...,...,...
5543,Mac,110150,UNINSTALLED:com.skype.skype.ShareExtension com...,UNINSTALLED:com.skype.skype.ShareExtension <*>...,536
5544,Mac,110239,Uncommited CATransaction. Set CA_DEBUG_TRANSAC...,Uncommited CATransaction. Set <*> in environme...,537
5545,Mac,63596,objc[35448]: Class TSUDurationLocaleSpecificSt...,<*> Class TSUDurationLocaleSpecificStorage is ...,538
5546,Mac,64616,[00:29:25.872] HTTPRequest figHttpRequestDidFa...,<*> HTTPRequest figHttpRequestDidFailCallback:...,539


# Tokenizer

In [None]:
from poslog import PosLogTokenizer

# Single tokenizer instance; its `tokenize` method is applied to the examples below.
tokenizer = PosLogTokenizer()

In [None]:
# Tokenize every example and keep the result in a new 'Tokens' column.
token_lists = example_df['Example'].apply(tokenizer.tokenize)
example_df['Tokens'] = token_lists
example_df

# Unify Punctuation Tokens

In [None]:
# Single-character tokens and the representative each one is replaced with.
_PUNCT_REPLACEMENTS = {
    '(': '(', '[': '(', '{': '(',
    ')': ')', ']': ')', '}': ')',
    # esp. for PTB to generalize punctuation
    '.': '.', ',': '.', ';': '.', ':': '.', '!': '.', '?': '.',
    # TODO: More of these?
}


def unify_punct_tokens(tokens:list[str])->list[str]:
    """Collapse bracket and punctuation tokens onto one representative each.

    Opening brackets ``( [ {`` become ``(``, closing brackets ``) ] }`` become
    ``)`` and the marks ``. , ; : ! ?`` become ``.``. Every other token,
    including multi-character ones such as ``...``, is passed through as is.

    Args:
        tokens: Tokens of one log line.

    Returns:
        A new list of the same length. ``tokens`` itself is not modified, so
        the caller's data (e.g. lists stored in a DataFrame column) stays intact.
    """
    return [_PUNCT_REPLACEMENTS.get(token, token) for token in tokens]

# Replace the token lists with their punctuation-unified counterparts.
unified_tokens = example_df['Tokens'].apply(unify_punct_tokens)
example_df['Tokens'] = unified_tokens

# PoS-Tagging

In [None]:
# This cell installs all taggers (if not already installed),
# so the first run may take a while.

from collections.abc import Callable

from poslog import AbstractPosTagger
from util.pos import NLTKPosTagger, SpacyPosTagger, StanzaPosTagger, HanTaPosTagger, TreeTaggerPosTagger

nltk_tagger=NLTKPosTagger()
stanza_tagger=StanzaPosTagger()
spacy_tagger=SpacyPosTagger()
hanta_tagger=HanTaPosTagger()
treetagger_tagger=TreeTaggerPosTagger()

# Maps a short tagger name to that tagger's bound `pos_tag` method. The values
# are the methods, not the tagger objects, hence Callable rather than
# AbstractPosTagger. The keys double as column names when the taggers are applied.
tagger:dict[str,Callable]={
    'nltk': nltk_tagger.pos_tag,
    'stanza': stanza_tagger.pos_tag,
    'spacy': spacy_tagger.pos_tag,
    'hanta': hanta_tagger.pos_tag,
    'treetagger': treetagger_tagger.pos_tag,
}


In [None]:
import logging

# Raise the root logger's threshold to WARNING (this affects all loggers that
# inherit from it): with more verbose logging treetagger hit a pipe error.
root_logger = logging.getLogger()
root_logger.setLevel(logging.WARNING)

# Quick check: let every registered tagger tag the same example row.
i=50
for tagger_name, tagger_func in tagger.items():
    sample_tags = tagger_func(example_df["Tokens"][i])
    print(f'{tagger_name} tags line {i}: {sample_tags}')

# Tag with All PoS-Taggers

In [None]:
# Add one new column per tagger, named after the tagger, holding the result of
# applying that tagger to every row's tokens.
import time

for name, tagger_func in tagger.items():
    print(f"Tagging with {name}")
    # perf_counter is monotonic, so the measured duration cannot be distorted by
    # system clock adjustments the way time.time() differences can.
    start_time = time.perf_counter()

    example_df[name]=example_df['Tokens'].apply(tagger_func)

    print(f"- Time taken: {time.perf_counter() - start_time:.2f} seconds")
example_df

In [None]:
# Persist the tagged examples; index=False keeps the row index out of the CSV.
example_df.to_csv(path_or_buf=OUTPUT_FILE, index=False)