In [1]:
from bokeh.io import output_notebook, show, export_png
from bokeh.plotting import figure
from bokeh.models import NumeralTickFormatter

output_notebook()

from pathlib import Path
from jsonlines import jsonlines
from pprint import pprint
from random import randint

In [2]:
# Experiment size. These two values are embedded in the metrics-log file names
# below and in the exported figure names, so they are defined first and the
# paths are derived from them instead of repeating the numbers by hand.
ITERATIONS = 50
N_INSTANCES = 50

# Suffix shared by both metrics logs of one experiment configuration.
_RUN_TAG = f"kpwr-full_{ITERATIONS}iters_{N_INSTANCES}instances"

# Per-iteration metrics (JSON Lines) for the two query strategies being compared.
RANDOM_QUERY_LOG = f"logs/scripts/random_learned_{_RUN_TAG}.metrics.jsonl"
LEAST_CONFIDENCE_LOG = f"logs/scripts/lc_active_learned_{_RUN_TAG}.metrics.jsonl"

# Output directory for exported PNG figures; created on first run.
FIGURES_DIR = Path("figures")
FIGURES_DIR.mkdir(exist_ok=True)

In [3]:
# Load every record of the random-query metrics log into a list.
with jsonlines.open(RANDOM_QUERY_LOG) as reader_rand:
    random_log_data = list(reader_rand)

# Spot-check one arbitrarily chosen record. No seed is set in the visible
# cells, so the record shown may differ between runs.
rand_idx = randint(0, len(random_log_data) - 1)
pprint(random_log_data[rand_idx])

{'_date': '19-11-2022 13:46:14',
 '_iteration': 49,
 '_iteration_time': 11.571990966796875,
 '_labels_count': {'_all': 3769,
                   'nam_liv_person': 2106,
                   'nam_loc_gpe_city': 948,
                   'nam_loc_gpe_country': 715},
 '_sc_loss': 81124.82250976562,
 '_spans_count': 3769,
 'spans_sc_f': 0.001027749229188078,
 'spans_sc_p': 0.0049261083743842365,
 'spans_sc_per_type': {'nam_liv_person': {'f': 0.0, 'p': 0.0, 'r': 0.0},
                       'nam_loc_gpe_city': {'f': 0.0, 'p': 0.0, 'r': 0.0},
                       'nam_loc_gpe_country': {'f': 0.004576659038901602,
                                               'p': 0.0125,
                                               'r': 0.0028011204481792717}},
 'spans_sc_r': 0.0005737234652897303,
 'speed': 10728.932664579688,
 'token_acc': 1.0,
 'token_f': 1.0,
 'token_p': 1.0,
 'token_r': 1.0}


In [4]:
# Load every record of the least-confidence metrics log into a list.
with jsonlines.open(LEAST_CONFIDENCE_LOG) as reader_lc:
    least_confidence_log_data = list(reader_lc)

# Spot-check one arbitrarily chosen record. No seed is set in the visible
# cells, so the record shown may differ between runs.
rand_idx = randint(0, len(least_confidence_log_data) - 1)
pprint(least_confidence_log_data[rand_idx])

{'_date': '19-11-2022 08:47:57',
 '_iteration': 39,
 '_iteration_time': 416.32866621017456,
 '_labels_count': {'_all': 3907,
                   'nam_liv_person': 1914,
                   'nam_loc_gpe_city': 608,
                   'nam_loc_gpe_country': 1385},
 '_sc_loss': 81112.162109375,
 '_spans_count': 3907,
 'spans_sc_f': 0.03076227097941642,
 'spans_sc_p': 0.017708333333333333,
 'spans_sc_per_type': {'nam_liv_person': {'f': 0.04174198849630238,
                                          'p': 0.024727414330218068,
                                          'r': 0.13382507903055849},
                       'nam_loc_gpe_city': {'f': 0.029919447640966625,
                                            'p': 0.017110891740704178,
                                            'r': 0.11899313501144165},
                       'nam_loc_gpe_country': {'f': 0.013506212857914639,
                                               'p': 0.007473841554559043,
                                              

In [5]:
# The two logs are plotted side by side, so they must contain the same number
# of records.
assert len(random_log_data) == len(least_confidence_log_data)

# Column-wise extraction of the plotted metrics for the random-query run.
random_iterations = [record["_iteration"] for record in random_log_data]
random_entities_num = [record["_spans_count"] for record in random_log_data]
random_fscores = [record["spans_sc_f"] for record in random_log_data]
random_losses = [record["_sc_loss"] for record in random_log_data]

# Same four columns for the least-confidence run.
lc_iterations = [record["_iteration"] for record in least_confidence_log_data]
lc_entities_num = [record["_spans_count"] for record in least_confidence_log_data]
lc_fscores = [record["spans_sc_f"] for record in least_confidence_log_data]
lc_losses = [record["_sc_loss"] for record in least_confidence_log_data]

# Disabled stand-in data for the least-confidence series (kept from the original):
# from math import log
# lc_iterations = random_iterations.copy()
# lc_fscores = [log(it, 300)+0.01 for it in random_iterations]
# lc_losses = [loss-300 for loss in random_losses]

# Sanity checks: x and y series of each plotted line must have equal length.
assert len(random_iterations) == len(lc_iterations)
assert len(random_iterations) == len(random_fscores)
assert len(lc_iterations) == len(lc_fscores)

## SpanCategorizer F-score over iterations

In [6]:
# F-score per iteration for the random and least-confidence runs.
p = figure(
    title="SpanCategorizer performance over iterations",
    x_axis_label="Iteration",
    y_axis_label="F-score",
    width=700,
    height=400,
)
p.title.align = "center"
p.yaxis[0].formatter = NumeralTickFormatter(format="0.000")

# One line per query strategy, drawn in the same order as before (random first).
for xs, ys, label, colour in (
    (random_iterations, random_fscores, "random", "#F6C85F"),
    (lc_iterations, lc_fscores, "least-confidence", "#0B84A5"),
):
    p.line(xs, ys, legend_label=label, color=colour, line_width=2)

# Dashed horizontal reference at the highest F-score of the random run.
max_rand_fscore = max(random_fscores)
reference_ys = [max_rand_fscore for _ in random_iterations]
p.line(random_iterations, reference_ys, color="black", line_dash="dashed", line_alpha=0.5)

# Re-attach the legend to the right-hand side of the figure.
p.add_layout(p.legend[0], "right")

show(p)

# The toolbar is switched off after show(p), right before the PNG export.
out_path = FIGURES_DIR / f"fscore_on_iters_rand_vs_lc_{ITERATIONS}it_{N_INSTANCES}n.png"
p.toolbar_location = None
export_png(p, filename=out_path)

'/home/jjamnicki/Documents/INZYNIERKA/figures/fscore_on_iters_rand_vs_lc_50it_50n.png'

## SpanCategorizer training loss over iterations

In [10]:
# Training loss per iteration for the random and least-confidence runs.
p = figure(
    title="SpanCategorizer training loss over number of iterations",
    x_axis_label="Iteration",
    y_axis_label="Loss",
    width=700,
    height=400,
)
p.title.align = "center"
p.yaxis[0].formatter = NumeralTickFormatter(format="0")

# One line per query strategy, drawn in the same order as before (random first).
for xs, ys, label, colour in (
    (random_iterations, random_losses, "random", "#F6C85F"),
    (lc_iterations, lc_losses, "least-confidence", "#0B84A5"),
):
    p.line(xs, ys, legend_label=label, color=colour, line_width=2)

# Dashed horizontal reference at the lowest loss of the random run.
min_rand_loss = min(random_losses)
reference_ys = [min_rand_loss for _ in random_iterations]
p.line(random_iterations, reference_ys, color="black", line_dash="dashed", line_alpha=0.5)

# Re-attach the legend to the right-hand side of the figure.
p.add_layout(p.legend[0], "right")

show(p)

# The toolbar is switched off after show(p), right before the PNG export.
out_path = FIGURES_DIR / f"loss_on_iters_rand_vs_lc_{ITERATIONS}it_{N_INSTANCES}n.png"
p.toolbar_location = None
export_png(p, filename=out_path)

'/home/jjamnicki/Documents/INZYNIERKA/figures/loss_on_iters_rand_vs_lc_50it_50n.png'

## SpanCategorizer F-score over number of entities

In [8]:
# F-score against the logged span count for the random and least-confidence runs.
p = figure(
    title="SpanCategorizer performance over number of entities",
    x_axis_label="Entities",
    y_axis_label="F-score",
    width=700,
    height=400,
)
p.title.align = "center"
p.yaxis[0].formatter = NumeralTickFormatter(format="0.000")

# One line per query strategy, drawn in the same order as before (random first).
for xs, ys, label, colour in (
    (random_entities_num, random_fscores, "random", "#F6C85F"),
    (lc_entities_num, lc_fscores, "least-confidence", "#0B84A5"),
):
    p.line(xs, ys, legend_label=label, color=colour, line_width=2)

# Dashed horizontal reference at the highest F-score of the random run.
max_rand_fscore = max(random_fscores)
reference_ys = [max_rand_fscore for _ in random_entities_num]
p.line(random_entities_num, reference_ys, color="black", line_dash="dashed", line_alpha=0.5)

# Re-attach the legend to the right-hand side of the figure.
p.add_layout(p.legend[0], "right")

show(p)

# The toolbar is switched off after show(p), right before the PNG export.
out_path = FIGURES_DIR / f"fscore_on_ents_rand_vs_lc_{ITERATIONS}it_{N_INSTANCES}n.png"
p.toolbar_location = None
export_png(p, filename=out_path)

'/home/jjamnicki/Documents/INZYNIERKA/figures/fscore_on_ents_rand_vs_lc_50it_50n.png'

## SpanCategorizer training loss over number of entities

In [11]:
# Training loss against the logged span count for the random and
# least-confidence runs.
p = figure(title="SpanCategorizer training loss over number of entities",
           x_axis_label="Entities",
           y_axis_label="Loss",
           width=700,
           height=400)
p.title.align = "center"
p.yaxis[0].formatter = NumeralTickFormatter(format="0")

p.line(random_entities_num, random_losses, legend_label="random", color="#F6C85F", line_width=2)
p.line(lc_entities_num, lc_losses, legend_label="least-confidence", color="#0B84A5", line_width=2)

# Dashed horizontal reference at the lowest loss of the random run.
# Fix: this line previously used max(random_fscores), an F-score value drawn
# on a loss axis; the loss-over-iterations plot uses min(random_losses), and
# this chart now does the same.
min_rand_loss = min(random_losses)
p.line(random_entities_num, [min_rand_loss]*len(random_entities_num), color="black", line_dash="dashed", line_alpha=0.5)

# Re-attach the legend to the right-hand side of the figure.
p.add_layout(p.legend[0], "right")

show(p)

# The toolbar is switched off after show(p), right before the PNG export.
out_path = FIGURES_DIR / f"loss_on_ents_rand_vs_lc_{ITERATIONS}it_{N_INSTANCES}n.png"
p.toolbar_location = None
export_png(p, filename=out_path)

'/home/jjamnicki/Documents/INZYNIERKA/figures/loss_on_ents_rand_vs_lc_50it_50n.png'