In [1]:
from bokeh.io import output_notebook, show, export_png
from bokeh.plotting import figure
from bokeh.models import NumeralTickFormatter

output_notebook()

from pathlib import Path
from jsonlines import jsonlines
from pprint import pprint
from random import randint

In [2]:
# --- Run parameters -------------------------------------------------------
# ITERATIONS, N_INSTANCES and VISUALS_NAME are interpolated into the file
# names of the figures exported further down.
ITERATIONS = 50
N_INSTANCES = 50
VISUALS_NAME = "final_v0"

# --- Inputs: one JSON-lines metrics log per query strategy -----------------
RANDOM_QUERY_LOG = "logs_archive/rand_50i_50n_active_learned_kpwr-full.metrics.jsonl"
LEAST_CONFIDENCE_LOG = "logs_archive/least_conf_50i_50n_active_learned_kpwr-full.metrics.jsonl"

# --- Outputs: exported figures go here; created on first run ---------------
FIGURES_DIR = Path("figures")
FIGURES_DIR.mkdir(exist_ok=True)

In [3]:
# Read every record of the random-query metrics log into memory.
with jsonlines.open(RANDOM_QUERY_LOG) as reader_rand:
    random_log_data = list(reader_rand)

# Pretty-print one arbitrarily chosen record to inspect the fields the log provides.
# The index is drawn without a fixed seed, so the displayed record differs between runs.
rand_idx = randint(0, len(random_log_data) - 1)
sampled_random_record = random_log_data[rand_idx]
pprint(sampled_random_record)

{'_date': '20-11-2022 18:45:45',
 '_iteration': 41,
 '_iteration_time': 16.310407876968384,
 '_labels_count': {'_all': 3164,
                   'nam_liv_person': 1758,
                   'nam_loc_gpe_city': 788,
                   'nam_loc_gpe_country': 618},
 '_sc_loss': 3152.5611263513565,
 '_spans_count': 3164,
 'spans_sc_f': 0.5361101780315753,
 'spans_sc_p': 0.646677471636953,
 'spans_sc_per_type': {'nam_liv_person': {'f': 0.6435492095869454,
                                          'p': 0.6235177865612648,
                                          'r': 0.6649104320337197},
                       'nam_loc_gpe_city': {'f': 0.5068285280728375,
                                            'p': 0.7522522522522522,
                                            'r': 0.38215102974828374},
                       'nam_loc_gpe_country': {'f': 0.0, 'p': 0.0, 'r': 0.0}},
 'spans_sc_r': 0.4578313253012048,
 'speed': 11663.30718506828,
 'token_acc': 1.0,
 'token_f': 1.0,
 'token_p': 1.0,
 'token_

In [4]:
# Read every record of the least-confidence metrics log into memory.
with jsonlines.open(LEAST_CONFIDENCE_LOG) as reader_lc:
    least_confidence_log_data = list(reader_lc)

# Pretty-print one arbitrarily chosen record to inspect the fields the log provides.
# The index is drawn without a fixed seed, so the displayed record differs between runs.
rand_idx = randint(0, len(least_confidence_log_data) - 1)
sampled_lc_record = least_confidence_log_data[rand_idx]
pprint(sampled_lc_record)

{'_date': '20-11-2022 23:25:41',
 '_iteration': 38,
 '_iteration_time': 19.882591724395752,
 '_labels_count': {'_all': 2887,
                   'nam_liv_person': 1601,
                   'nam_loc_gpe_city': 707,
                   'nam_loc_gpe_country': 579},
 '_sc_loss': 2653.372898340225,
 '_spans_count': 2887,
 'spans_sc_f': 0.5494584837545127,
 'spans_sc_p': 0.7409931840311588,
 'spans_sc_per_type': {'nam_liv_person': {'f': 0.6343768458357946,
                                          'p': 0.7217741935483871,
                                          'r': 0.565858798735511},
                       'nam_loc_gpe_city': {'f': 0.5260029717682021,
                                            'p': 0.75,
                                            'r': 0.40503432494279173},
                       'nam_loc_gpe_country': {'f': 0.23267326732673269,
                                               'p': 1.0,
                                               'r': 0.13165266106442577}},
 'spans_sc_r':

In [5]:
# Both logs must contain the same number of records so the per-strategy series
# built below line up one-to-one.
assert len(random_log_data) == len(least_confidence_log_data)


def _column(records, key):
    """Return the value stored under `key` in every record, in log order."""
    return [record[key] for record in records]


# Series for the random query strategy.
random_iterations = _column(random_log_data, "_iteration")
random_entities_num = _column(random_log_data, "_spans_count")
random_fscores = _column(random_log_data, "spans_sc_f")
random_losses = _column(random_log_data, "_sc_loss")

# Series for the least-confidence query strategy.
lc_iterations = _column(least_confidence_log_data, "_iteration")
lc_entities_num = _column(least_confidence_log_data, "_spans_count")
lc_fscores = _column(least_confidence_log_data, "spans_sc_f")
lc_losses = _column(least_confidence_log_data, "_sc_loss")

# Sanity checks: x and y series of every curve plotted later have equal length.
assert len(random_iterations) == len(lc_iterations)
assert len(random_iterations) == len(random_fscores)
assert len(lc_iterations) == len(lc_fscores)

## SpanCategorizer F-score over iterations

In [6]:
# Figure: `spans_sc_f` of both query strategies plotted against the logged iteration.
p = figure(
    title="SpanCategorizer performance over iterations",
    x_axis_label="Iteration",
    y_axis_label="F-score",
    width=700,
    height=400,
)
p.title.align = "center"
p.yaxis[0].formatter = NumeralTickFormatter(format="0.000")

# One curve per query strategy: (x values, y values, legend label, colour).
for xs, ys, label, colour in (
    (random_iterations, random_fscores, "random", "#F6C85F"),
    (lc_iterations, lc_fscores, "least-confidence", "#0B84A5"),
):
    p.line(xs, ys, legend_label=label, color=colour, line_width=2)

# Dashed horizontal reference at the best F-score reached by the random strategy.
max_rand_fscore = max(random_fscores)
reference_level = [max_rand_fscore for _ in random_iterations]
p.line(random_iterations, reference_level, color="black", line_dash="dashed", line_alpha=0.5)

# Place the legend outside the plotting area, on the right.
p.add_layout(p.legend[0], "right")

show(p)

# Export a PNG copy without the toolbar; the call is the cell's last expression,
# so its return value (the written file's path) is shown as the cell output.
p.toolbar_location = None
out_path = FIGURES_DIR / f"{VISUALS_NAME}_fscore_on_iters_rand_vs_lc_{ITERATIONS}it_{N_INSTANCES}n.png"
export_png(p, filename=out_path)

'/home/jjamnicki/Documents/INZYNIERKA/figures/final_v0_fscore_on_iters_rand_vs_lc_50it_50n.png'

## SpanCategorizer training loss over iterations

In [7]:
# Figure: `_sc_loss` of both query strategies plotted against the logged iteration.
p = figure(
    title="SpanCategorizer training loss over number of iterations",
    x_axis_label="Iteration",
    y_axis_label="Loss",
    width=700,
    height=400,
)
p.title.align = "center"
p.yaxis[0].formatter = NumeralTickFormatter(format="0")

# One curve per query strategy: (x values, y values, legend label, colour).
for xs, ys, label, colour in (
    (random_iterations, random_losses, "random", "#F6C85F"),
    (lc_iterations, lc_losses, "least-confidence", "#0B84A5"),
):
    p.line(xs, ys, legend_label=label, color=colour, line_width=2)

# Dashed horizontal reference at the lowest loss reached by the random strategy.
min_rand_loss = min(random_losses)
reference_level = [min_rand_loss for _ in random_iterations]
p.line(random_iterations, reference_level, color="black", line_dash="dashed", line_alpha=0.5)

# Place the legend outside the plotting area, on the right.
p.add_layout(p.legend[0], "right")

show(p)

# Export a PNG copy without the toolbar; the call is the cell's last expression,
# so its return value (the written file's path) is shown as the cell output.
p.toolbar_location = None
out_path = FIGURES_DIR / f"{VISUALS_NAME}_loss_on_iters_rand_vs_lc_{ITERATIONS}it_{N_INSTANCES}n.png"
export_png(p, filename=out_path)

'/home/jjamnicki/Documents/INZYNIERKA/figures/final_v0_loss_on_iters_rand_vs_lc_50it_50n.png'

## SpanCategorizer F-score over number of entities

In [8]:
# Figure: `spans_sc_f` of both query strategies plotted against `_spans_count`.
p = figure(
    title="SpanCategorizer performance over number of entities",
    x_axis_label="Entities",
    y_axis_label="F-score",
    width=700,
    height=400,
)
p.title.align = "center"
p.yaxis[0].formatter = NumeralTickFormatter(format="0.000")

# One curve per query strategy: (x values, y values, legend label, colour).
for xs, ys, label, colour in (
    (random_entities_num, random_fscores, "random", "#F6C85F"),
    (lc_entities_num, lc_fscores, "least-confidence", "#0B84A5"),
):
    p.line(xs, ys, legend_label=label, color=colour, line_width=2)

# Dashed horizontal reference at the best F-score reached by the random strategy,
# drawn across the random strategy's x range.
max_rand_fscore = max(random_fscores)
reference_level = [max_rand_fscore for _ in random_entities_num]
p.line(random_entities_num, reference_level, color="black", line_dash="dashed", line_alpha=0.5)

# Place the legend outside the plotting area, on the right.
p.add_layout(p.legend[0], "right")

show(p)

# Export a PNG copy without the toolbar; the call is the cell's last expression,
# so its return value (the written file's path) is shown as the cell output.
p.toolbar_location = None
out_path = FIGURES_DIR / f"{VISUALS_NAME}_fscore_on_ents_rand_vs_lc_{ITERATIONS}it_{N_INSTANCES}n.png"
export_png(p, filename=out_path)

'/home/jjamnicki/Documents/INZYNIERKA/figures/final_v0_fscore_on_ents_rand_vs_lc_50it_50n.png'

## SpanCategorizer training loss over number of entities

In [9]:
# Figure: `_sc_loss` of both query strategies plotted against `_spans_count`.
p = figure(title="SpanCategorizer training loss over number of entities",
           x_axis_label="Entities",
           y_axis_label="Loss",
           width=700,
           height=400)
p.title.align = "center"
p.yaxis[0].formatter = NumeralTickFormatter(format="0")

# One curve per query strategy.
p.line(random_entities_num, random_losses, legend_label="random", color="#F6C85F", line_width=2)
p.line(lc_entities_num, lc_losses, legend_label="least-confidence", color="#0B84A5", line_width=2)

# Dashed horizontal reference at the lowest loss reached by the random strategy.
# The y-axis of this figure is loss, so the reference has to be a loss value; the
# maximum F-score used here previously does not belong on this axis. This mirrors
# the reference line of the loss-over-iterations figure.
min_rand_loss = min(random_losses)
p.line(random_entities_num, [min_rand_loss]*len(random_entities_num), color="black", line_dash="dashed", line_alpha=0.5)

# Place the legend outside the plotting area, on the right.
p.add_layout(p.legend[0], "right")

show(p)

# Export a PNG copy without the toolbar; the call is the cell's last expression,
# so its return value (the written file's path) is shown as the cell output.
out_path = FIGURES_DIR / f"{VISUALS_NAME}_loss_on_ents_rand_vs_lc_{ITERATIONS}it_{N_INSTANCES}n.png"
p.toolbar_location = None
export_png(p, filename=out_path)

'/home/jjamnicki/Documents/INZYNIERKA/figures/final_v0_loss_on_ents_rand_vs_lc_50it_50n.png'