In [1]:
from bokeh.io import output_notebook, show, export_png
from bokeh.plotting import figure
from bokeh.models import NumeralTickFormatter

# Have bokeh render figures passed to show() inline in the notebook output.
output_notebook()

from pathlib import Path
from jsonlines import jsonlines
from pprint import pprint
from random import randint

In [2]:
# --- Inputs: metric logs in JSON Lines format, one file per run ---
RANDOM_QUERY_LOG = "logs/scripts/random_13i_200n_active_learned_kpwr-full__1669005233f0976288.metrics.jsonl"
LEAST_CONFIDENCE_LOG = "logs/scripts/least_conf_13i_200n_active_learned_kpwr-full__1669005151f7778394.metrics.jsonl"

# --- Run settings; in the cells below they only feed the exported file names ---
ITERATIONS = 13
N_INSTANCES = 200

# Prefix shared by every exported figure file name.
VISUALS_NAME = "final_v0"

# --- Outputs: directory that receives the exported PNGs (created on demand) ---
FIGURES_DIR = Path("figures")
FIGURES_DIR.mkdir(exist_ok=True)

In [3]:
# Read every record of the random-query log into memory.
with jsonlines.open(RANDOM_QUERY_LOG) as reader_rand:
    random_log_data = list(reader_rand)

# Sanity check: pretty-print one arbitrarily chosen record to inspect its fields.
rand_idx = randint(0, len(random_log_data) - 1)
pprint(random_log_data[rand_idx], compact=True)

{'_date': '21-11-2022 05:34:20',
 '_iteration': 4,
 '_iteration_time': 10.98777985572815,
 '_labels_count': {'_all': 1272,
                   'nam_liv_person': 657,
                   'nam_loc_gpe_city': 329,
                   'nam_loc_gpe_country': 286},
 '_sc_loss': 1653.646856725216,
 '_spans_count': 1272,
 'spans_sc_f': 0.38778107764106917,
 'spans_sc_p': 0.744299674267101,
 'spans_sc_per_type': {'nam_liv_person': {'f': 0.47174076865109266,
                                          'p': 0.828042328042328,
                                          'r': 0.32982086406743943},
                       'nam_loc_gpe_city': {'f': 0.4279346210995542,
                                            'p': 0.6101694915254238,
                                            'r': 0.3295194508009153},
                       'nam_loc_gpe_country': {'f': 0.0, 'p': 0.0, 'r': 0.0}},
 'spans_sc_r': 0.2621916236374068,
 'speed': 8436.974063345006,
 'token_acc': 1.0,
 'token_f': 1.0,
 'token_p': 1.0,
 'token_r':

In [4]:
# Read every record of the least-confidence log into memory.
with jsonlines.open(LEAST_CONFIDENCE_LOG) as reader_lc:
    least_confidence_log_data = list(reader_lc)

# Sanity check: pretty-print one arbitrarily chosen record to inspect its fields.
rand_idx = randint(0, len(least_confidence_log_data) - 1)
pprint(least_confidence_log_data[rand_idx], compact=True)

{'_date': '21-11-2022 05:33:42',
 '_iteration': 7,
 '_iteration_time': 21.49504065513611,
 '_labels_count': {'_all': 2191,
                   'nam_liv_person': 1213,
                   'nam_loc_gpe_city': 530,
                   'nam_loc_gpe_country': 448},
 '_sc_loss': 2402.7629323005676,
 '_spans_count': 2191,
 'spans_sc_f': 0.5034709535988308,
 'spans_sc_p': 0.693158953722334,
 'spans_sc_per_type': {'nam_liv_person': {'f': 0.6206475259621258,
                                          'p': 0.7383720930232558,
                                          'r': 0.5353003161222339},
                       'nam_loc_gpe_city': {'f': 0.48313090418353577,
                                            'p': 0.5888157894736842,
                                            'r': 0.4096109839816934},
                       'nam_loc_gpe_country': {'f': 0.011142061281337047,
                                               'p': 1.0,
                                               'r': 0.0056022408963585435}}

In [5]:
# Both logs must hold the same number of records so the two runs can be
# plotted side by side.
assert len(random_log_data) == len(least_confidence_log_data)

# One list per plotted metric, in log order, for the random-query run.
random_iterations = [entry["_iteration"] for entry in random_log_data]
random_entities_num = [entry["_spans_count"] for entry in random_log_data]
random_fscores = [entry["spans_sc_f"] for entry in random_log_data]
random_losses = [entry["_sc_loss"] for entry in random_log_data]

# The same metrics for the least-confidence run.
lc_iterations = [entry["_iteration"] for entry in least_confidence_log_data]
lc_entities_num = [entry["_spans_count"] for entry in least_confidence_log_data]
lc_fscores = [entry["spans_sc_f"] for entry in least_confidence_log_data]
lc_losses = [entry["_sc_loss"] for entry in least_confidence_log_data]

# Guard against mismatched series before anything is plotted.
assert len(random_iterations) == len(lc_iterations)
assert len(random_iterations) == len(random_fscores)
assert len(lc_iterations) == len(lc_fscores)

## SpanCategorizer F-score over iterations

In [6]:
# F-score of the random and least-confidence runs, plotted per iteration.
fscore_iters_options = dict(
    title="SpanCategorizer performance over iterations",
    x_axis_label="Iteration",
    y_axis_label="F-score",
    width=700,
    height=400,
)
p = figure(**fscore_iters_options)
p.title.align = "center"
p.yaxis[0].formatter = NumeralTickFormatter(format="0.000")

# One curve per run.
p.line(random_iterations, random_fscores,
       legend_label="random", color="#F6C85F", line_width=2)
p.line(lc_iterations, lc_fscores,
       legend_label="least-confidence", color="#0B84A5", line_width=2)

# Dashed horizontal reference at the best F-score reached by the random run.
max_rand_fscore = max(random_fscores)
p.line(random_iterations, [max_rand_fscore for _ in random_iterations],
       color="black", line_dash="dashed", line_alpha=0.5)

# Move the legend outside the plot area, to its right.
p.add_layout(p.legend[0], "right")

show(p)

# Drop the toolbar only after the inline display, then write the PNG; the
# path returned by export_png is the cell's output.
png_name = f"{VISUALS_NAME}_fscore_on_iters_rand_vs_lc_{ITERATIONS}it_{N_INSTANCES}n.png"
out_path = FIGURES_DIR / png_name
p.toolbar_location = None
export_png(p, filename=out_path)

'/home/jjamnicki/Documents/INZYNIERKA/figures/final_v0_fscore_on_iters_rand_vs_lc_13it_200n.png'

## SpanCategorizer training loss over iterations

In [7]:
# Loss of the random and least-confidence runs, plotted per iteration.
loss_iters_options = dict(
    title="SpanCategorizer training loss over number of iterations",
    x_axis_label="Iteration",
    y_axis_label="Loss",
    width=700,
    height=400,
)
p = figure(**loss_iters_options)
p.title.align = "center"
p.yaxis[0].formatter = NumeralTickFormatter(format="0")

# One curve per run.
p.line(random_iterations, random_losses,
       legend_label="random", color="#F6C85F", line_width=2)
p.line(lc_iterations, lc_losses,
       legend_label="least-confidence", color="#0B84A5", line_width=2)

# Dashed horizontal reference at the lowest loss reached by the random run.
min_rand_loss = min(random_losses)
p.line(random_iterations, [min_rand_loss for _ in random_iterations],
       color="black", line_dash="dashed", line_alpha=0.5)

# Move the legend outside the plot area, to its right.
p.add_layout(p.legend[0], "right")

show(p)

# Drop the toolbar only after the inline display, then write the PNG; the
# path returned by export_png is the cell's output.
png_name = f"{VISUALS_NAME}_loss_on_iters_rand_vs_lc_{ITERATIONS}it_{N_INSTANCES}n.png"
out_path = FIGURES_DIR / png_name
p.toolbar_location = None
export_png(p, filename=out_path)

'/home/jjamnicki/Documents/INZYNIERKA/figures/final_v0_loss_on_iters_rand_vs_lc_13it_200n.png'

## SpanCategorizer F-score over number of entities

In [8]:
# F-score of the random and least-confidence runs, plotted against the
# span count logged with each record.
fscore_ents_options = dict(
    title="SpanCategorizer performance over number of entities",
    x_axis_label="Entities",
    y_axis_label="F-score",
    width=700,
    height=400,
)
p = figure(**fscore_ents_options)
p.title.align = "center"
p.yaxis[0].formatter = NumeralTickFormatter(format="0.000")

# One curve per run.
p.line(random_entities_num, random_fscores,
       legend_label="random", color="#F6C85F", line_width=2)
p.line(lc_entities_num, lc_fscores,
       legend_label="least-confidence", color="#0B84A5", line_width=2)

# Dashed horizontal reference at the best F-score reached by the random run;
# it spans only the random run's x values.
max_rand_fscore = max(random_fscores)
p.line(random_entities_num, [max_rand_fscore for _ in random_entities_num],
       color="black", line_dash="dashed", line_alpha=0.5)

# Move the legend outside the plot area, to its right.
p.add_layout(p.legend[0], "right")

show(p)

# Drop the toolbar only after the inline display, then write the PNG; the
# path returned by export_png is the cell's output.
png_name = f"{VISUALS_NAME}_fscore_on_ents_rand_vs_lc_{ITERATIONS}it_{N_INSTANCES}n.png"
out_path = FIGURES_DIR / png_name
p.toolbar_location = None
export_png(p, filename=out_path)

'/home/jjamnicki/Documents/INZYNIERKA/figures/final_v0_fscore_on_ents_rand_vs_lc_13it_200n.png'

## SpanCategorizer training loss over number of entities

In [9]:
# Loss of the random and least-confidence runs, plotted against the span
# count logged with each record.
p = figure(title="SpanCategorizer training loss over number of entities",
           x_axis_label="Entities",
           y_axis_label="Loss",
           width=700,
           height=400)
p.title.align = "center"
p.yaxis[0].formatter = NumeralTickFormatter(format="0")

# One curve per run.
p.line(random_entities_num, random_losses, legend_label="random", color="#F6C85F", line_width=2)
p.line(lc_entities_num, lc_losses, legend_label="least-confidence", color="#0B84A5", line_width=2)

# Dashed horizontal reference at the lowest loss reached by the random run.
# The y axis here is loss, so the reference must be a loss value (as in the
# loss-over-iterations figure); an F-score would sit near zero on this scale
# and distort the y range.
min_rand_loss = min(random_losses)
p.line(random_entities_num, [min_rand_loss]*len(random_entities_num), color="black", line_dash="dashed", line_alpha=0.5)

# Move the legend outside the plot area, to its right.
p.add_layout(p.legend[0], "right")

show(p)

# Drop the toolbar only after the inline display, then write the PNG; the
# path returned by export_png is the cell's output.
out_path = FIGURES_DIR / f"{VISUALS_NAME}_loss_on_ents_rand_vs_lc_{ITERATIONS}it_{N_INSTANCES}n.png"
p.toolbar_location = None
export_png(p, filename=out_path)

'/home/jjamnicki/Documents/INZYNIERKA/figures/final_v0_loss_on_ents_rand_vs_lc_13it_200n.png'