In [1]:
from bokeh.io import output_notebook, show, export_png
from bokeh.plotting import figure
from bokeh.models import NumeralTickFormatter

output_notebook(hide_banner=True)

from pathlib import Path
from jsonlines import jsonlines
from pprint import pprint
from random import randint

In [2]:
RANDOM_QUERY_LOG = "logs_archive/rand_50i_50n_active_learned_kpwr-full.metrics.jsonl"
LEAST_CONFIDENCE_LOG = "logs_archive/least_conf_50i_50n_active_learned_kpwr-full.metrics.jsonl"
FIGURES_DIR = Path("figures")
FIGURES_DIR.mkdir(exist_ok=True)

ITERATIONS = 50
N_INSTANCES = 50

VISUALS_NAME = "final_v0"

In [3]:
with jsonlines.open(RANDOM_QUERY_LOG) as reader_rand:
    random_log_data = [j for j in reader_rand]
rand_idx = randint(0, len(random_log_data) - 1)
pprint(random_log_data[rand_idx], compact=True)

{'_date': '20-11-2022 18:44:43',
 '_iteration': 37,
 '_iteration_time': 15.336821794509888,
 '_labels_count': {'_all': 2864,
                   'nam_liv_person': 1585,
                   'nam_loc_gpe_city': 717,
                   'nam_loc_gpe_country': 562},
 '_sc_loss': 2591.369528710842,
 '_spans_count': 2864,
 'spans_sc_f': 0.5871104815864023,
 'spans_sc_p': 0.7668825161887142,
 'spans_sc_per_type': {'nam_liv_person': {'f': 0.6532305868405454,
                                          'p': 0.7466124661246613,
                                          'r': 0.5806111696522656},
                       'nam_loc_gpe_city': {'f': 0.47384615384615386,
                                            'p': 0.7230046948356808,
                                            'r': 0.3524027459954233},
                       'nam_loc_gpe_country': {'f': 0.5092402464065708,
                                               'p': 0.9538461538461539,
                                               'r': 0.347338

In [4]:
with jsonlines.open(LEAST_CONFIDENCE_LOG) as reader_lc:
    least_confidence_log_data = [j for j in reader_lc]
rand_idx = randint(0, len(least_confidence_log_data) - 1)
pprint(least_confidence_log_data[rand_idx], compact=True)

{'_date': '20-11-2022 23:22:12',
 '_iteration': 27,
 '_iteration_time': 20.572356462478638,
 '_labels_count': {'_all': 2083,
                   'nam_liv_person': 1173,
                   'nam_loc_gpe_city': 489,
                   'nam_loc_gpe_country': 421},
 '_sc_loss': 2270.155914545059,
 '_spans_count': 2083,
 'spans_sc_f': 0.4120065789473685,
 'spans_sc_p': 0.7271407837445574,
 'spans_sc_per_type': {'nam_liv_person': {'f': 0.5742946708463949,
                                          'p': 0.7089783281733746,
                                          'r': 0.48261327713382507},
                       'nam_loc_gpe_city': {'f': 0.0, 'p': 0.0, 'r': 0.0},
                       'nam_loc_gpe_country': {'f': 0.215,
                                               'p': 1.0,
                                               'r': 0.12044817927170869}},
 'spans_sc_r': 0.2874354561101549,
 'speed': 8267.51479715888,
 'token_acc': 1.0,
 'token_f': 1.0,
 'token_p': 1.0,
 'token_r': 1.0}


In [5]:
random_iterations, random_entities_num, random_fscores, random_losses = [], [], [], []
lc_iterations, lc_entities_num, lc_fscores, lc_losses = [], [], [], []
assert len(random_log_data) == len(least_confidence_log_data)

for rand_log_line, lc_log_line in zip(random_log_data, least_confidence_log_data):
    random_iterations.append(rand_log_line["_iteration"])
    random_entities_num.append(rand_log_line["_spans_count"])
    random_fscores.append(rand_log_line["spans_sc_f"])
    random_losses.append(rand_log_line["_sc_loss"])

    lc_iterations.append(lc_log_line["_iteration"])
    lc_entities_num.append(lc_log_line["_spans_count"])
    lc_fscores.append(lc_log_line["spans_sc_f"])
    lc_losses.append(lc_log_line["_sc_loss"])

assert len(random_iterations) == len(lc_iterations)
assert len(random_iterations) == len(random_fscores)
assert len(lc_iterations) == len(lc_fscores)

## SpanCategorizer F-score over iterations

In [6]:
p = figure(title="SpanCategorizer performance over iterations",
           x_axis_label="Iteration",
           y_axis_label="F-score",
           width=700,
           height=400)
p.title.align = "center"
p.yaxis[0].formatter = NumeralTickFormatter(format="0.000")

p.line(random_iterations, random_fscores, legend_label="random", color="#F6C85F", line_width=2)
p.line(lc_iterations, lc_fscores, legend_label="least-confidence", color="#0B84A5", line_width=2)

max_rand_fscore = max(random_fscores)
p.line(random_iterations, [max_rand_fscore]*len(random_iterations), color="black", line_dash="dashed", line_alpha=0.5)

p.add_layout(p.legend[0], "right")

show(p)

In [7]:
out_path = FIGURES_DIR / f"{VISUALS_NAME}_fscore_on_iters_rand_vs_lc_{ITERATIONS}it_{N_INSTANCES}n.png"
p.toolbar_location = None
export_png(p, filename=out_path)

'/home/jjamnicki/Documents/bachelor_thesis_project/figures/final_v0_fscore_on_iters_rand_vs_lc_50it_50n.png'

## SpanCategorizer training loss over iterations

In [8]:
p = figure(title="SpanCategorizer training loss over number of iterations",
           x_axis_label="Iteration",
           y_axis_label="Loss",
           width=700,
           height=400)
p.title.align = "center"
p.yaxis[0].formatter = NumeralTickFormatter(format="0")

p.line(random_iterations, random_losses, legend_label="random", color="#F6C85F", line_width=2)
p.line(lc_iterations, lc_losses, legend_label="least-confidence", color="#0B84A5", line_width=2)

min_rand_loss = min(random_losses)
p.line(random_iterations, [min_rand_loss]*len(random_iterations), color="black", line_dash="dashed", line_alpha=0.5)

p.add_layout(p.legend[0], "right")

show(p)

In [9]:
out_path = FIGURES_DIR / f"{VISUALS_NAME}_loss_on_iters_rand_vs_lc_{ITERATIONS}it_{N_INSTANCES}n.png"
p.toolbar_location = None
export_png(p, filename=out_path)

'/home/jjamnicki/Documents/bachelor_thesis_project/figures/final_v0_loss_on_iters_rand_vs_lc_50it_50n.png'

## SpanCategorizer F-score over number of entities

In [10]:
p = figure(title="SpanCategorizer performance over number of entities",
           x_axis_label="Entities",
           y_axis_label="F-score",
           width=700,
           height=400)
p.title.align = "center"
p.yaxis[0].formatter = NumeralTickFormatter(format="0.000")

p.line(random_entities_num, random_fscores, legend_label="random", color="#F6C85F", line_width=2)
p.line(lc_entities_num, lc_fscores, legend_label="least-confidence", color="#0B84A5", line_width=2)

max_rand_fscore = max(random_fscores)
p.line(random_entities_num, [max_rand_fscore]*len(random_entities_num), color="black", line_dash="dashed", line_alpha=0.5)

p.add_layout(p.legend[0], "right")

show(p)

In [11]:
out_path = FIGURES_DIR / f"{VISUALS_NAME}_fscore_on_ents_rand_vs_lc_{ITERATIONS}it_{N_INSTANCES}n.png"
p.toolbar_location = None
export_png(p)

'/tmp/tmptsvhs5w9.png'

## SpanCategorizer training loss over number of entities

In [12]:
p = figure(title="SpanCategorizer training loss over number of entities",
           x_axis_label="Entities",
           y_axis_label="Loss",
           width=700,
           height=400)
p.title.align = "center"
p.yaxis[0].formatter = NumeralTickFormatter(format="0")

p.line(random_entities_num, random_losses, legend_label="random", color="#F6C85F", line_width=2)
p.line(lc_entities_num, lc_losses, legend_label="least-confidence", color="#0B84A5", line_width=2)

max_rand_fscore = max(random_fscores)
p.line(random_entities_num, [max_rand_fscore]*len(random_entities_num), color="black", line_dash="dashed", line_alpha=0.5)

p.add_layout(p.legend[0], "right")

show(p)

In [13]:
out_path = FIGURES_DIR / f"{VISUALS_NAME}_loss_on_ents_rand_vs_lc_{ITERATIONS}it_{N_INSTANCES}n.png"
p.toolbar_location = None
export_png(p, filename=out_path)

'/home/jjamnicki/Documents/bachelor_thesis_project/figures/final_v0_loss_on_ents_rand_vs_lc_50it_50n.png'