In [1]:
from bokeh.io import output_notebook, show, export_png
from bokeh.plotting import figure

# Make bokeh render figures inline in the notebook's output cells.
output_notebook()

from pathlib import Path
from jsonlines import jsonlines
from pprint import pprint
from random import randint

In [2]:
# Paths to the metrics logs (JSON Lines, read with `jsonlines` in the cells
# below) of the two runs that are compared in this notebook.
# NOTE(review): judging by the file names, the random-query log comes from a
# 50-iteration / 50-instance run while the least-confidence log comes from a
# 10-iteration / 10-instance run — confirm these two runs are meant to be
# compared against each other.
RANDOM_QUERY_LOG = "logs/50iters_50instances/random_full_50it_50n.metrics.jsonl"
LEAST_CONFIDENCE_LOG = "logs/scripts/lc_active_learned_kpwr-full_10iters_10instances.metrics.jsonl"
# Directory the exported PNG figures are written to; created here if missing.
FIGURES_DIR = Path("figures")
FIGURES_DIR.mkdir(exist_ok=True)

# In the visible cells these two values are only interpolated into the exported
# figure file names; keep them in sync with the logs selected above.
ITERATIONS = 10
N_INSTANCES = 10

In [3]:
# Read every record of the random-query metrics log into memory.
with jsonlines.open(RANDOM_QUERY_LOG) as reader_rand:
    random_log_data = list(reader_rand)

# Pretty-print one arbitrarily chosen record as a quick look at the schema.
# The index is not seeded, so the record shown differs between runs.
rand_idx = randint(0, len(random_log_data) - 1)
pprint(random_log_data[rand_idx])

{'_date': '16-11-2022 20:46:14',
 '_iteration': 33,
 '_iteration_time': 9.998528003692627,
 '_labels_count': {'nam_liv_person': 1390,
                   'nam_loc_gpe_city': 638,
                   'nam_loc_gpe_country': 514},
 '_sc_loss': 3428.4337763786316,
 '_spans_count': 2542,
 'spans_sc_f': 0.4095477386934674,
 'spans_sc_p': 0.7581395348837209,
 'spans_sc_per_type': {'nam_liv_person': {'f': 0.32458233890214794,
                                          'p': 0.6623376623376623,
                                          'r': 0.21496311907270813},
                       'nam_loc_gpe_city': {'f': 0.387434554973822,
                                            'p': 0.8161764705882353,
                                            'r': 0.2540045766590389},
                       'nam_loc_gpe_country': {'f': 0.6236559139784946,
                                               'p': 0.8656716417910447,
                                               'r': 0.48739495798319327}},
 'spans_sc_r': 0.2

In [4]:
# Read every record of the least-confidence metrics log into memory.
with jsonlines.open(LEAST_CONFIDENCE_LOG) as reader_lc:
    least_confidence_log_data = list(reader_lc)

# Pretty-print one arbitrarily chosen record as a quick look at the schema.
# The index is not seeded, so the record shown differs between runs.
rand_idx = randint(0, len(least_confidence_log_data) - 1)
pprint(least_confidence_log_data[rand_idx])

{'_date': '18-11-2022 22:17:45',
 '_iteration': 5,
 '_iteration_time': 154.9589409828186,
 '_labels_count': {'_all': 75,
                   'nam_liv_person': 43,
                   'nam_loc_gpe_city': 21,
                   'nam_loc_gpe_country': 11},
 '_sc_loss': 414.64614534146676,
 '_spans_count': 75,
 'spans_sc_f': 0.0,
 'spans_sc_p': 0.0,
 'spans_sc_per_type': {'nam_liv_person': {'f': 0.0, 'p': 0.0, 'r': 0.0},
                       'nam_loc_gpe_city': {'f': 0.0, 'p': 0.0, 'r': 0.0},
                       'nam_loc_gpe_country': {'f': 0.0, 'p': 0.0, 'r': 0.0}},
 'spans_sc_r': 0.0,
 'speed': 6229.802910533261,
 'token_acc': 1.0,
 'token_f': 1.0,
 'token_p': 1.0,
 'token_r': 1.0}


In [5]:
# The two runs are compared record-by-record, so only as many records as the
# shorter log contains are used; trailing records of the longer log are ignored.
n_compared = min(len(random_log_data), len(least_confidence_log_data))
random_head = random_log_data[:n_compared]
lc_head = least_confidence_log_data[:n_compared]

# Per-record series for the random query strategy.
random_iterations = [record["_iteration"] for record in random_head]
random_fscores = [record["spans_sc_f"] for record in random_head]
random_losses = [record["_sc_loss"] for record in random_head]

# Per-record series for the least-confidence query strategy.
lc_iterations = [record["_iteration"] for record in lc_head]
lc_fscores = [record["spans_sc_f"] for record in lc_head]
lc_losses = [record["_sc_loss"] for record in lc_head]

# Sanity check: every plotted series has the same number of points.
assert len(random_fscores) == len(random_iterations) == len(lc_iterations) == len(lc_fscores)

## SpanCategorizer F-score over iterations

In [6]:
# F-score of both query strategies, one point per collected log record.
p = figure(
    title="SpanCategorizer performance over iterations",
    x_axis_label="Iteration",
    y_axis_label="F-score",
    width=700,
    height=400,
)
p.title.align = "center"

p.line(
    random_iterations,
    random_fscores,
    legend_label="random",
    color="#F6C85F",
    line_width=2,
)
p.line(
    lc_iterations,
    lc_fscores,
    legend_label="least-confidence",
    color="#0B84A5",
    line_width=2,
)

# Dashed horizontal reference line at the best F-score of the random strategy.
max_rand_fscore = max(random_fscores)
p.line(
    random_iterations,
    [max_rand_fscore for _ in random_iterations],
    color="black",
    line_dash="dashed",
    line_alpha=0.5,
)

# Place the legend to the right of the plot area.
p.add_layout(p.legend[0], "right")

# The toolbar is hidden before exporting; the setting also applies to the
# inline figure shown afterwards.
p.toolbar_location = None
out_path = FIGURES_DIR / f"fscore_on_iters_rand_vs_lc_{ITERATIONS}iters_{N_INSTANCES}instances.png"
export_png(p, filename=out_path)

show(p)

## SpanCategorizer training loss over iterations

In [7]:
# Training loss of both query strategies, one point per collected log record.
p = figure(
    title="SpanCategorizer training loss over number of iterations",
    x_axis_label="Iteration",
    y_axis_label="Loss",
    width=700,
    height=400,
)
p.title.align = "center"

p.line(
    random_iterations,
    random_losses,
    legend_label="random",
    color="#F6C85F",
    line_width=2,
)
p.line(
    lc_iterations,
    lc_losses,
    legend_label="least-confidence",
    color="#0B84A5",
    line_width=2,
)

# Dashed horizontal reference line at the lowest loss of the random strategy.
min_rand_loss = min(random_losses)
p.line(
    random_iterations,
    [min_rand_loss for _ in random_iterations],
    color="black",
    line_dash="dashed",
    line_alpha=0.5,
)

# Place the legend to the right of the plot area.
p.add_layout(p.legend[0], "right")

# The toolbar is hidden before exporting; the setting also applies to the
# inline figure shown afterwards.
p.toolbar_location = None
out_path = FIGURES_DIR / f"loss_on_iters_rand_vs_lc_{ITERATIONS}iters_{N_INSTANCES}instances.png"
export_png(p, filename=out_path)

show(p)