In [1]:
# GENERIC FIRST CELL FOR DEVELOPING A NEW METHODOLOGY

# Define an expressive name for the methodology, that will be used to create the output directory
# The complete path to the output directory will be provided in OUT_DIR
METHODOLOGY_NAME='pos_log/'

import os
OUT_DIR = os.path.relpath(os.path.join(os.getcwd(), 'out', METHODOLOGY_NAME))
if not os.path.exists(OUT_DIR):
        os.makedirs(OUT_DIR)
print(f"Current output directory: '{OUT_DIR}'")

%load_ext autoreload
%autoreload 2

Current output directory: 'out/pos_log'


In [2]:
# Precons

# INPUT_FILE=os.path.join(OUT_DIR,'..','pos_log',f'examples_tagged_100_random_sample_seed-42_upos.csv')
# INPUT_FILE=os.path.join(OUT_DIR,'..','pos_log',f'_random_sample_wip_upos_2025-04-22_old_.csv')
# INPUT_FILE=os.path.join(OUT_DIR,'..','pos_log',f'_examples_tagged_upos_.csv')
INPUT_FILE=os.path.join(OUT_DIR,'..','pos_log',f'examples_tagged_upos_majority.csv')

# if file does not exist, throw an error
if not os.path.exists(INPUT_FILE):
    precon='2-2_tag_comparison_correction.ipynb'
    raise FileNotFoundError(f"File '{INPUT_FILE}' not found. Run '{precon}' first.")

#OUTPUT_FILE=os.path.join(OUT_DIR,'..','pos_log','random_sample_wip_upos.csv')
OUTPUT_FILE=os.path.join(OUT_DIR,'..','pos_log','random_sample_wip_upos_2025-04-24_kd.csv')
print(f'Output file: {OUTPUT_FILE}')

RANDOM_SAMPLE_SIZE=100
RANDOM_SAMPLE_SEED=42


Output file: out/pos_log/../pos_log/random_sample_wip_upos_2025-04-24_kd.csv


In [3]:
import pandas as pd
tagged_examples=pd.read_csv(INPUT_FILE)

In [17]:
from nlp.pos import TagComparison

def read_lists_from_df(tagged_examples):
    log_lines=tagged_examples['Example'].to_list()
    log_lines_splitted=[eval(x) for x in tagged_examples['Tokens'].to_list()]
    tag_comparisons:list[TagComparison]=[eval(x) for x in tagged_examples['TagComparison'].to_list()]
    majorities:list[list[str]]=[eval(x) for x in tagged_examples['Majority'].to_list()]
    return log_lines, log_lines_splitted, tag_comparisons, majorities

log_lines,log_lines_splitted,tag_comparisons,majorities=read_lists_from_df(tagged_examples)


## Create Random Samples

In [5]:
nones_sum=0
nones_in_lines=0
token_count=0
full_tagged_indices=[]
ragged_tagged_indices=[]
for majority in majorities:
    nones=majority.count(None)
    token_count+=len(majority)
    nones_sum+=nones
    if nones>0:
        nones_in_lines+=1
        ragged_tagged_indices.append(majorities.index(majority))
    else:
        full_tagged_indices.append(majorities.index(majority))

print(f"None count: {nones_sum} of {token_count} tokens ({nones_sum/token_count:.2%})")
print(f"Lines with None: {nones_in_lines} of {len(majorities)} ({nones_in_lines/len(majorities):.2%})")
print(f"Lines with full tagging: {len(full_tagged_indices)} of {len(majorities)} ({len(full_tagged_indices)/len(majorities):.2%})")

print(full_tagged_indices)

None count: 3099 of 64542 tokens (4.80%)
Lines with None: 2124 of 5548 (38.28%)
Lines with full tagging: 3424 of 5548 (61.72%)
[0, 2, 3, 7, 8, 13, 14, 17, 18, 19, 20, 21, 24, 27, 29, 30, 31, 34, 37, 38, 42, 43, 42, 47, 49, 50, 51, 52, 54, 56, 57, 58, 52, 62, 65, 52, 68, 71, 72, 74, 38, 80, 81, 82, 83, 84, 86, 87, 47, 89, 92, 93, 94, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 111, 113, 116, 101, 122, 124, 125, 126, 127, 128, 130, 131, 132, 133, 19, 136, 139, 140, 141, 142, 144, 146, 109, 146, 153, 154, 155, 156, 159, 160, 162, 163, 111, 167, 168, 169, 140, 171, 174, 176, 177, 178, 179, 180, 181, 183, 184, 185, 42, 188, 189, 101, 199, 200, 202, 203, 206, 208, 211, 214, 221, 222, 223, 224, 128, 142, 106, 231, 232, 233, 236, 237, 240, 241, 244, 241, 241, 250, 241, 241, 241, 256, 258, 241, 241, 261, 263, 265, 266, 267, 21, 269, 270, 272, 47, 277, 279, 280, 281, 241, 284, 31, 291, 291, 298, 299, 300, 301, 303, 241, 299, 267, 308, 313, 31, 317, 318, 319, 320, 321, 324, 325,

In [6]:
import random
random.seed(RANDOM_SAMPLE_SEED)
print(f'Full tagged indices: {full_tagged_indices}')
random_sample_indices=random.sample(full_tagged_indices, RANDOM_SAMPLE_SIZE)
print(f'Random sample indices: {random_sample_indices}')

Full tagged indices: [0, 2, 3, 7, 8, 13, 14, 17, 18, 19, 20, 21, 24, 27, 29, 30, 31, 34, 37, 38, 42, 43, 42, 47, 49, 50, 51, 52, 54, 56, 57, 58, 52, 62, 65, 52, 68, 71, 72, 74, 38, 80, 81, 82, 83, 84, 86, 87, 47, 89, 92, 93, 94, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 111, 113, 116, 101, 122, 124, 125, 126, 127, 128, 130, 131, 132, 133, 19, 136, 139, 140, 141, 142, 144, 146, 109, 146, 153, 154, 155, 156, 159, 160, 162, 163, 111, 167, 168, 169, 140, 171, 174, 176, 177, 178, 179, 180, 181, 183, 184, 185, 42, 188, 189, 101, 199, 200, 202, 203, 206, 208, 211, 214, 221, 222, 223, 224, 128, 142, 106, 231, 232, 233, 236, 237, 240, 241, 244, 241, 241, 250, 241, 241, 241, 256, 258, 241, 241, 261, 263, 265, 266, 267, 21, 269, 270, 272, 47, 277, 279, 280, 281, 241, 284, 31, 291, 291, 298, 299, 300, 301, 303, 241, 299, 267, 308, 313, 31, 317, 318, 319, 320, 321, 324, 325, 326, 327, 330, 313, 313, 313, 334, 336, 337, 339, 342, 343, 346, 347, 350, 21, 355, 357, 358, 313, 362, 3

In [7]:
#####
# Due to changes in preprocessing, the length of full_tagged_indices has changed, so the random sample indices also changed.
# But we manually tagged the following indices by hand (originally they were random), so we proceed with them.

hacky_sample_indices=[915, 263, 2117, 2585, 2343, 1424, 1033, 893, 4624, 324, 299, 308, 2274, 2462, 5129, 88, 1688, 4610, 2299, 4613, 2973, 88, 1659, 4608, 3396, 2195, 1627, 2230, 3041, 1032, 386, 4098, 980, 3815, 3643, 303, 439, 4608, 308, 4078, 804, 3133, 280, 417, 417, 450, 2406, 2538, 811, 2468, 993, 4608, 3892, 981, 3960, 3772, 2170, 2848, 739, 703, 2584, 1690, 4608, 4093, 2312, 2293, 2645, 568, 2205, 326, 3333, 2498, 2855, 655, 2185, 3326, 2198, 5054, 2185, 4624, 1466, 2543, 2604, 2800, 4606, 4304, 3855, 2290, 1408, 5186, 4947, 913, 464, 1102, 791, 417, 4610, 638, 4146, 774]
random_sample_indices=hacky_sample_indices

print(f'New "random" sample indices: {random_sample_indices}')

New "random" sample indices: [915, 263, 2117, 2585, 2343, 1424, 1033, 893, 4624, 324, 299, 308, 2274, 2462, 5129, 88, 1688, 4610, 2299, 4613, 2973, 88, 1659, 4608, 3396, 2195, 1627, 2230, 3041, 1032, 386, 4098, 980, 3815, 3643, 303, 439, 4608, 308, 4078, 804, 3133, 280, 417, 417, 450, 2406, 2538, 811, 2468, 993, 4608, 3892, 981, 3960, 3772, 2170, 2848, 739, 703, 2584, 1690, 4608, 4093, 2312, 2293, 2645, 568, 2205, 326, 3333, 2498, 2855, 655, 2185, 3326, 2198, 5054, 2185, 4624, 1466, 2543, 2604, 2800, 4606, 4304, 3855, 2290, 1408, 5186, 4947, 913, 464, 1102, 791, 417, 4610, 638, 4146, 774]


In [8]:
random_sample_df=tagged_examples.iloc[random_sample_indices]
random_sample_df


Unnamed: 0,Dataset,Line,Example,Template,ClusterId,Tokens,nltk,stanza,spacy,hanta,treetagger,TagComparison,Majority
915,Thunderbird,28439275,[INFO]: Generate SM IN_SERVICE trap for GID=0x...,[INFO]: Generate SM IN_SERVICE trap for <*>,233,"['(', 'INFO', ')', '.', 'Generate', 'SM', 'IN_...","['PUNCT', 'PROPN', 'PUNCT', 'PUNCT', 'PROPN', ...","['PUNCT', 'NOUN', 'PUNCT', 'PUNCT', 'VERB', 'P...","['PUNCT', 'PROPN', 'PUNCT', 'PUNCT', 'VERB', '...","['PUNCT', 'PROPN', 'PUNCT', 'PUNCT', 'VERB', '...","['PUNCT', 'NOUN', 'PUNCT', 'PUNCT', 'VERB', 'P...","TagComparison(majority=['PUNCT', 'NOUN', 'PUNC...","['PUNCT', 'NOUN', 'PUNCT', 'PUNCT', 'VERB', 'P..."
263,Spark,7109853,Received new token for : mesos-master-1:35426,Received new token for : <*>,27,"['Received', 'new', 'token', 'for', '.', 'meso...","['VERB', 'ADJ', 'NOUN', 'ADP', 'PUNCT', 'NOUN']","['VERB', 'ADJ', 'NOUN', 'ADP', 'PUNCT', 'PROPN']","['VERB', 'ADJ', 'NOUN', 'ADP', 'PUNCT', 'X']","['VERB', 'ADJ', 'NOUN', 'ADP', 'PUNCT', 'NOUN']","['VERB', 'ADJ', 'NOUN', 'ADP', 'PUNCT', 'NOUN']","TagComparison(majority=['VERB', 'ADJ', 'NOUN',...","['VERB', 'ADJ', 'NOUN', 'ADP', 'PUNCT', 'PROPN']"
2117,Android,851,onStartedWakingUp(),onStartedWakingUp(),5,"['onStartedWakingUp', '(', ')']","['NOUN', 'PUNCT', 'PUNCT']","['PROPN', 'PUNCT', 'PUNCT']","['NOUN', 'PUNCT', 'PUNCT']","['PROPN', 'PUNCT', 'PUNCT']","['NOUN', 'PUNCT', 'PUNCT']","TagComparison(majority=['PROPN', 'PUNCT', 'PUN...","['PROPN', 'PUNCT', 'PUNCT']"
2585,Android,153035,2016-12-17 20:31:43.073 T:24286 INFO: IQiyi...,<*> INFO: IQiyiLiveDataProvider::StartModule: ...,473,"['2016-12-17', '20:31:43.073', 'T:24286', 'INF...","['ADJ', 'NUM', 'PROPN', 'PROPN', 'PUNCT', 'PRO...","['NUM', 'NUM', 'NUM', 'NOUN', 'PUNCT', 'PROPN'...","['NUM', 'NUM', 'ADJ', 'PROPN', 'PUNCT', 'INTJ'...","['NUM', 'NUM', 'PROPN', 'PROPN', 'PUNCT', 'PRO...","['NUM', 'NUM', 'PROPN', 'NOUN', 'PUNCT', 'PROP...","TagComparison(majority=['NUM', 'NUM', 'X', 'NO...","['NUM', 'NUM', 'X', 'NOUN', 'PUNCT', 'PROPN', ..."
2343,Android,60802,random_mix_pool - hexdump(len=5): [REMOVED],random_mix_pool - <*> [REMOVED],231,"['random_mix_pool', '-', 'hexdump', '(', 'len=...","['NOUN', 'PUNCT', 'NOUN', 'PUNCT', 'NOUN', 'PU...","['NOUN', 'PUNCT', 'NOUN', 'PUNCT', 'NOUN', 'PU...","['NOUN', 'PUNCT', 'NOUN', 'PUNCT', 'NOUN', 'PU...","['NOUN', 'ADP', 'NOUN', 'PUNCT', 'NUM', 'PUNCT...","['NOUN', 'NOUN', 'NOUN', 'PUNCT', 'NOUN', 'PUN...","TagComparison(majority=['PROPN', 'PUNCT', 'NOU...","['PROPN', 'PUNCT', 'NOUN', 'PUNCT', 'X', 'PUNC..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
417,BGL,383488,data storage interrupt,data storage interrupt,23,"['data', 'storage', 'interrupt']","['NOUN', 'NOUN', 'NOUN']","['NOUN', 'NOUN', 'VERB']","['NOUN', 'NOUN', 'VERB']","['NOUN', 'NOUN', 'VERB']","['NOUN', 'NOUN', 'VERB']","TagComparison(majority=['NOUN', 'NOUN', 'VERB'...","['NOUN', 'NOUN', 'VERB']"
4610,OpenSSH,569,Failed password for root from 187.141.143.180 ...,Failed password for root from <*> port <*>,5,"['Failed', 'password', 'for', 'root', 'from', ...","['VERB', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NUM', ...","['VERB', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NUM', ...","['VERB', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NUM', ...","['VERB', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN',...","['VERB', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NUM', ...","TagComparison(majority=['VERB', 'NOUN', 'ADP',...","['VERB', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NUM', ..."
638,HPC,68897,ServerFileSystem: ServerFileSystem domain stor...,ServerFileSystem: ServerFileSystem domain <*> ...,27,"['ServerFileSystem', '.', 'ServerFileSystem', ...","['PROPN', 'PUNCT', 'PROPN', 'NOUN', 'NOUN', 'V...","['PROPN', 'PUNCT', 'PROPN', 'NOUN', 'NOUN', 'A...","['ADV', 'PUNCT', 'PROPN', 'NOUN', 'PROPN', 'AU...","['INTJ', 'PUNCT', 'X', 'NOUN', 'NUM', 'AUX', '...","['PROPN', 'PUNCT', 'PROPN', 'NOUN', 'NOUN', 'V...","TagComparison(majority=['PROPN', 'PUNCT', 'PRO...","['PROPN', 'PUNCT', 'PROPN', 'NOUN', 'PROPN', '..."
4146,Android,1369467,#04 pc 000170d8 /system/lib/libc.so (abort+4),<*> pc <*> /system/lib/libc.so <*>,2034,"['#', '04', 'pc', '000170d8', '/system/lib/lib...","['SYM', 'NUM', 'NOUN', 'NUM', 'NOUN', 'PUNCT',...","['SYM', 'NUM', 'NOUN', 'NUM', 'NOUN', 'PUNCT',...","['SYM', 'NUM', 'NOUN', 'PROPN', 'PUNCT', 'PUNC...","['NUM', 'NUM', 'NUM', 'NUM', 'NOUN', 'PUNCT', ...","['SYM', 'NUM', 'NOUN', 'NOUN', 'NOUN', 'PUNCT'...","TagComparison(majority=['SYM', 'NUM', 'NOUN', ...","['SYM', 'NUM', 'NOUN', 'NUM', 'PROPN', 'PUNCT'..."


## Load if exists

In [9]:
# # TODO
# if os.path.exists(OUTPUT_FILE):
#     os.remove(OUTPUT_FILE)
#     print(f"File '{OUTPUT_FILE}' has been removed.")
# else:
#     print(f"File '{OUTPUT_FILE}' does not exist.")

import pandas as pd
random_sample_df:pd.DataFrame

if os.path.exists(OUTPUT_FILE):
    print(f"File '{OUTPUT_FILE}' already exists. Go on editing or remove it.")
    random_sample_df=pd.read_csv(OUTPUT_FILE)
else: 
    # columns: Dataset,Line,Example,Template,ClusterId,Tokens,[tagger],TagComparison,Majority
    #tagged_examples=pd.read_csv(INPUT_FILE)
    pass

random_sample_df


File 'out/pos_log/../pos_log/random_sample_wip_upos_2025-04-24_kd.csv' already exists. Go on editing or remove it.


Unnamed: 0,Dataset,Line,Example,Template,ClusterId,Tokens,nltk,stanza,spacy,hanta,treetagger,TagComparison,Majority,ManualTagging
0,Thunderbird,28439275,[INFO]: Generate SM IN_SERVICE trap for GID=0x...,[INFO]: Generate SM IN_SERVICE trap for <*>,233,"['(', 'INFO', ')', '.', 'Generate', 'SM', 'IN_...","['PUNCT', 'PROPN', 'PUNCT', 'PUNCT', 'PROPN', ...","['PUNCT', 'NOUN', 'PUNCT', 'PUNCT', 'VERB', 'P...","['PUNCT', 'PROPN', 'PUNCT', 'PUNCT', 'VERB', '...","['PUNCT', 'PROPN', 'PUNCT', 'PUNCT', 'VERB', '...","['PUNCT', 'NOUN', 'PUNCT', 'PUNCT', 'VERB', 'P...","TagComparison(majority=['PUNCT', 'NOUN', 'PUNC...","['PUNCT', 'NOUN', 'PUNCT', 'PUNCT', 'VERB', 'P...","['PUNCT', 'NOUN', 'PUNCT', 'PUNCT', 'VERB', 'P..."
1,Spark,7109853,Received new token for : mesos-master-1:35426,Received new token for : <*>,27,"['Received', 'new', 'token', 'for', '.', 'meso...","['VERB', 'ADJ', 'NOUN', 'ADP', 'PUNCT', 'NOUN']","['VERB', 'ADJ', 'NOUN', 'ADP', 'PUNCT', 'PROPN']","['VERB', 'ADJ', 'NOUN', 'ADP', 'PUNCT', 'X']","['VERB', 'ADJ', 'NOUN', 'ADP', 'PUNCT', 'NOUN']","['VERB', 'ADJ', 'NOUN', 'ADP', 'PUNCT', 'NOUN']","TagComparison(majority=['VERB', 'ADJ', 'NOUN',...","['VERB', 'ADJ', 'NOUN', 'ADP', 'PUNCT', 'PROPN']","['VERB', 'ADJ', 'NOUN', 'ADP', 'PUNCT', 'PROPN']"
2,Android,851,onStartedWakingUp(),onStartedWakingUp(),5,"['onStartedWakingUp', '(', ')']","['NOUN', 'PUNCT', 'PUNCT']","['PROPN', 'PUNCT', 'PUNCT']","['NOUN', 'PUNCT', 'PUNCT']","['PROPN', 'PUNCT', 'PUNCT']","['NOUN', 'PUNCT', 'PUNCT']","TagComparison(majority=['PROPN', 'PUNCT', 'PUN...","['PROPN', 'PUNCT', 'PUNCT']","['PROPN', 'PUNCT', 'PUNCT']"
3,Android,153035,2016-12-17 20:31:43.073 T:24286 INFO: IQiyi...,<*> INFO: IQiyiLiveDataProvider::StartModule: ...,473,"['2016-12-17', '20:31:43.073', 'T:24286', 'INF...","['ADJ', 'NUM', 'PROPN', 'PROPN', 'PUNCT', 'PRO...","['NUM', 'NUM', 'NUM', 'NOUN', 'PUNCT', 'PROPN'...","['NUM', 'NUM', 'ADJ', 'PROPN', 'PUNCT', 'INTJ'...","['NUM', 'NUM', 'PROPN', 'PROPN', 'PUNCT', 'PRO...","['NUM', 'NUM', 'PROPN', 'NOUN', 'PUNCT', 'PROP...","TagComparison(majority=['NUM', 'NUM', 'X', 'NO...","['NUM', 'NUM', 'X', 'NOUN', 'PUNCT', 'PROPN', ...","['NUM', 'NUM', 'X', 'NOUN', 'PUNCT', 'PROPN', ..."
4,Android,60802,random_mix_pool - hexdump(len=5): [REMOVED],random_mix_pool - <*> [REMOVED],231,"['random_mix_pool', '-', 'hexdump', '(', 'len=...","['NOUN', 'PUNCT', 'NOUN', 'PUNCT', 'NOUN', 'PU...","['NOUN', 'PUNCT', 'NOUN', 'PUNCT', 'NOUN', 'PU...","['NOUN', 'PUNCT', 'NOUN', 'PUNCT', 'NOUN', 'PU...","['NOUN', 'ADP', 'NOUN', 'PUNCT', 'NUM', 'PUNCT...","['NOUN', 'NOUN', 'NOUN', 'PUNCT', 'NOUN', 'PUN...","TagComparison(majority=['PROPN', 'PUNCT', 'NOU...","['PROPN', 'PUNCT', 'NOUN', 'PUNCT', 'X', 'PUNC...","['PROPN', 'PUNCT', 'NOUN', 'PUNCT', 'X', 'PUNC..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,BGL,383488,data storage interrupt,data storage interrupt,23,"['data', 'storage', 'interrupt']","['NOUN', 'NOUN', 'NOUN']","['NOUN', 'NOUN', 'VERB']","['NOUN', 'NOUN', 'VERB']","['NOUN', 'NOUN', 'VERB']","['NOUN', 'NOUN', 'VERB']","TagComparison(majority=['NOUN', 'NOUN', 'VERB'...","['NOUN', 'NOUN', 'VERB']","['NOUN', 'NOUN', 'NOUN']"
96,OpenSSH,569,Failed password for root from 187.141.143.180 ...,Failed password for root from <*> port <*>,5,"['Failed', 'password', 'for', 'root', 'from', ...","['VERB', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NUM', ...","['VERB', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NUM', ...","['VERB', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NUM', ...","['VERB', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN',...","['VERB', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NUM', ...","TagComparison(majority=['VERB', 'NOUN', 'ADP',...","['VERB', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NUM', ...","['VERB', 'NOUN', 'ADP', 'PROPN', 'ADP', 'NUM',..."
97,HPC,68897,ServerFileSystem: ServerFileSystem domain stor...,ServerFileSystem: ServerFileSystem domain <*> ...,27,"['ServerFileSystem', '.', 'ServerFileSystem', ...","['PROPN', 'PUNCT', 'PROPN', 'NOUN', 'NOUN', 'V...","['PROPN', 'PUNCT', 'PROPN', 'NOUN', 'NOUN', 'A...","['ADV', 'PUNCT', 'PROPN', 'NOUN', 'PROPN', 'AU...","['INTJ', 'PUNCT', 'X', 'NOUN', 'NUM', 'AUX', '...","['PROPN', 'PUNCT', 'PROPN', 'NOUN', 'NOUN', 'V...","TagComparison(majority=['PROPN', 'PUNCT', 'PRO...","['PROPN', 'PUNCT', 'PROPN', 'NOUN', 'PROPN', '...","['PROPN', 'PUNCT', 'PROPN', 'NOUN', 'PROPN', '..."
98,Android,1369467,#04 pc 000170d8 /system/lib/libc.so (abort+4),<*> pc <*> /system/lib/libc.so <*>,2034,"['#', '04', 'pc', '000170d8', '/system/lib/lib...","['SYM', 'NUM', 'NOUN', 'NUM', 'NOUN', 'PUNCT',...","['SYM', 'NUM', 'NOUN', 'NUM', 'NOUN', 'PUNCT',...","['SYM', 'NUM', 'NOUN', 'PROPN', 'PUNCT', 'PUNC...","['NUM', 'NUM', 'NUM', 'NUM', 'NOUN', 'PUNCT', ...","['SYM', 'NUM', 'NOUN', 'NOUN', 'NOUN', 'PUNCT'...","TagComparison(majority=['SYM', 'NUM', 'NOUN', ...","['SYM', 'NUM', 'NOUN', 'NUM', 'PROPN', 'PUNCT'...","['SYM', 'NUM', 'NOUN', 'NUM', 'PROPN', 'PUNCT'..."


In [10]:
# rebuild lists if df changed
log_lines,log_lines_splitted,tag_comparisons,majorities=read_lists_from_df(random_sample_df)

# "GUI"

In [12]:
corrected=[]
if 'ManualTagging' in random_sample_df.columns:
    corrected=[eval(x) for x in random_sample_df['ManualTagging'].to_list()]
else:
    #corrected=[m.copy() for m in majorities]
    corrected=[[None]*len(m) for m in majorities]
print(corrected)


[['PUNCT', 'NOUN', 'PUNCT', 'PUNCT', 'VERB', 'PROPN', 'PROPN', 'NOUN', 'ADP', 'X'], ['VERB', 'ADJ', 'NOUN', 'ADP', 'PUNCT', 'PROPN'], ['PROPN', 'PUNCT', 'PUNCT'], ['NUM', 'NUM', 'X', 'NOUN', 'PUNCT', 'PROPN', 'PUNCT', 'PROPN', 'PUNCT', 'NOUN', 'PUNCT'], ['PROPN', 'PUNCT', 'NOUN', 'PUNCT', 'X', 'PUNCT', 'PUNCT', 'PUNCT', 'VERB', 'PUNCT'], ['PROPN', 'PUNCT', 'NUM', 'PUNCT', 'PUNCT', 'PUNCT', 'PROPN', 'PUNCT', 'PROPN', 'X', 'X'], ['ADJ', 'NOUN', 'PUNCT', 'NUM', 'PUNCT', 'NUM', 'PROPN', 'PUNCT'], ['PROPN', 'PUNCT', 'NUM', 'PUNCT', 'PUNCT', 'PROPN', 'PUNCT', 'NUM', 'PUNCT', 'PUNCT', 'PROPN', 'PUNCT', 'PROPN', 'VERB', 'PUNCT', 'PROPN'], ['PROPN', 'PUNCT', 'ADJ', 'NOUN', 'PROPN', 'PUNCT', 'NOUN', 'PUNCT'], ['VERB', 'NOUN', 'ADP', 'NOUN', 'NOUN'], ['PROPN', 'VERB'], ['VERB', 'NOUN'], ['PUNCT', 'PROPN', 'PUNCT', 'PUNCT', 'NUM', 'PUNCT', 'PUNCT', 'PROPN', 'PUNCT', 'X'], ['PROPN', 'NOUN', 'NOUN', 'PUNCT', 'X', 'PUNCT', 'NUM', 'PUNCT', 'X', 'PUNCT', 'AUX', 'NOUN', 'PUNCT', 'PUNCT', 'ADP', 'PROPN',

In [13]:
from gui_and_helper import GuiLogs

gui_logs:list[GuiLogs]=[]
for i, (log_line, log_line_splitted, tag_comparison, majorities_per_line, manual_tags) in enumerate(zip(log_lines, log_lines_splitted, tag_comparisons, majorities, corrected)):
    #in random sample, we want all examples
    gui_logs.append(GuiLogs(i, log_line, log_line_splitted, tag_comparison, majorities_per_line, manual_tags))

gui_logs[:10]

[GuiLogs(idx=0, log_line='[INFO]: Generate SM IN_SERVICE trap for GID=0xfe800000000000000005ad0000040e39', log_line_splitted=['(', 'INFO', ')', '.', 'Generate', 'SM', 'IN_SERVICE', 'trap', 'for', 'GID=0xfe800000000000000005ad0000040e39'], tag_comparison=TagComparison(majority=['PUNCT', 'NOUN', 'PUNCT', 'PUNCT', 'VERB', 'PROPN', 'PROPN', 'NOUN', 'ADP', 'X'], confidence=[1.0, 1.1, 1.0, 1.0, 0.8, 1.0, 1.0, 1.0, 1.0, 1.1], minority=[{}, {}, {}, {}, {'nltk': 'PROPN'}, {}, {}, {}, {}, {}]), majorities_per_line=['PUNCT', 'NOUN', 'PUNCT', 'PUNCT', 'VERB', 'PROPN', 'PROPN', 'NOUN', 'ADP', 'X'], manual_tags=['PUNCT', 'NOUN', 'PUNCT', 'PUNCT', 'VERB', 'PROPN', 'PROPN', 'NOUN', 'ADP', 'X']),
 GuiLogs(idx=1, log_line='Received new token for : mesos-master-1:35426', log_line_splitted=['Received', 'new', 'token', 'for', '.', 'mesos-master-1:35426'], tag_comparison=TagComparison(majority=['VERB', 'ADJ', 'NOUN', 'ADP', 'PUNCT', 'PROPN'], confidence=[1.0, 1.0, 1.0, 1.0, 1.0, 1.1], minority=[{}, {}, {}, 

In [14]:
from gui_and_helper import initialize_gui
initialize_gui(tagged_examples_df=random_sample_df,
               manually_tagged_ls=corrected,
               majorities_ls=majorities,
               gui_logs_ls=gui_logs,
               output_file_str=OUTPUT_FILE,
               find_none_only=False)

Possible tags: ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X']


In [15]:
%gui tk 
from gui_and_helper import run_gui

run_gui()

2025-04-28 14:41:27.255 Python[14361:19673589] +[IMKClient subclass]: chose IMKClient_Modern
2025-04-28 14:41:27.255 Python[14361:19673589] +[IMKInputSession subclass]: chose IMKInputSession_Modern


In [16]:
# Save if not done yet
#random_sample_df.to_csv(OUTPUT_FILE, index=False)
print(f'Saved random samples to: {OUTPUT_FILE}')


Saved random samples to: out/pos_log/../pos_log/random_sample_wip_upos_2025-04-24_kd.csv
