In [1]:
%load_ext autoreload
%autoreload

In [2]:
import spacy
from spacy.cli.train import train
from spacy.cli.evaluate import evaluate
from spacy.tokens import DocBin

import argilla as rg

from pprint import pprint
from pathlib import Path
from jsonlines import jsonlines
import random
from time import time as etime


In [4]:
# Experiment name; used as the prefix of the model directory and metrics file.
NAME = "random_sm10"

# Start timestamp with "." replaced by "f", making output names unique per run.
_start_etime_str = str(etime()).replace(".", "f")

# Input data and pipeline config.
DATA_DIR = Path("data")
TRAIN_DB = DATA_DIR / "inzynierka-kpwr-train-3.spacy"
TEST_DB = DATA_DIR / "inzynierka-kpwr-test-3.spacy"
TEMP_DB = DATA_DIR / "temp-train.spacy"
CONFIG_DIR = Path("config") / "spacy"
CONFIG = CONFIG_DIR / "config_sm.cfg"

# Output locations. Both directories are created up front so that neither the
# model nor the metrics file can fail on a missing parent directory.
LOGS_DIR = Path("logs")
LOGS_DIR.mkdir(exist_ok=True)
MODELS_DIR = Path("models")
MODELS_DIR.mkdir(exist_ok=True)
MODEL_OUT = MODELS_DIR / f"{NAME}__{_start_etime_str}.spacy"
MODEL_LAST = MODEL_OUT / "model-last"
METRICS_OUT = LOGS_DIR / f"{NAME}__{_start_etime_str}.metrics.jsonl"

SEED = 42
SPANS_KEY = "sc"
N_INSTANCES = 10

random.seed(SEED)
# Refuse to reuse an output directory left over from another run.
assert not MODEL_OUT.exists(), f"model output already exists: {MODEL_OUT}"

In [5]:
def query_random(records, exclude, n_instances):
    """Random query strategy: lazily draw records that were not queried yet.

    Indices are drawn with ``random.randint`` and redrawn until one outside
    ``exclude`` comes up, so the result depends on the state of the global
    ``random`` module.

    Args:
        records: Sized, indexable collection to draw from.
        exclude: Mutable set of indices that must not be drawn. Every yielded
            index is added to it in place.
        n_instances: Maximum number of records to yield. The request is capped
            at the number of indices still available, so the generator always
            terminates (it yields nothing for empty or exhausted ``records``).

    Yields:
        ``(idx, records[idx])`` tuples with pairwise distinct ``idx``.
    """
    n_records = len(records)
    # Only in-range entries of `exclude` reduce what can still be drawn.
    n_excluded = sum(1 for i in exclude if 0 <= i < n_records)
    # Without this cap the redraw loop below would spin forever once fewer
    # than `n_instances` free indices are left.
    n_target = min(n_instances, n_records - n_excluded)

    max_idx = n_records - 1
    n_queried = 0
    while n_queried < n_target:
        idx = random.randint(0, max_idx)
        if idx not in exclude:
            exclude.add(idx)
            n_queried += 1
            yield idx, records[idx]

def log_results(results, out):
    """Append one results record to a JSON Lines log file.

    The file is opened in append mode so that repeated calls with the same
    ``out`` accumulate one line per call instead of overwriting each other.
    The parent directory of ``out`` is created if it does not exist yet.

    Args:
        results: Object written as a single line via the jsonlines writer.
        out: Path of the ``.jsonl`` file to append to.
    """
    Path(out).parent.mkdir(parents=True, exist_ok=True)
    with jsonlines.open(out, mode="a") as writer:
        writer.write(results)

def _docs_train(docbin_path, lang="pl"):
    """Load all docs stored in a DocBin file on disk.

    The docs are rebuilt against the vocabulary of a freshly created blank
    pipeline for ``lang`` and returned as a list.
    """
    blank_vocab = spacy.blank(lang).vocab
    doc_bin = DocBin().from_disk(docbin_path)
    return list(doc_bin.get_docs(blank_vocab))

In [6]:
docs_train = _docs_train(TRAIN_DB)
docs_train_len = len(docs_train)

iteration = 1
max_iters = 10
spans_queried = 0        # running total of spans in all docs queried so far
spans_num_history = []   # value of `spans_queried` after each iteration
db = DocBin()            # cumulative training set, grows every iteration
queried = set()          # indices of docs already moved into `db`

# Stop after `max_iters` rounds or once every training doc has been queried.
while iteration <= max_iters and len(queried) < docs_train_len:
    # Never ask for more docs than are still unqueried, otherwise the random
    # sampler could not satisfy the request.
    batch_size = min(N_INSTANCES, docs_train_len - len(queried))
    for q_idx, q_doc in query_random(docs_train, queried, batch_size):
        queried.add(q_idx)  # idempotent: query_random records the index too
        db.add(q_doc)
        spans_queried += len(q_doc.spans[SPANS_KEY])
    spans_num_history.append(spans_queried)

    db.to_disk(TEMP_DB)

    # Train on everything queried so far, using the test set as dev data.
    train(CONFIG,
          output_path=MODEL_OUT,
          overrides={
              "training.seed": SEED,
              "paths.train": str(TEMP_DB),
              "paths.dev": str(TEST_DB)
          })

    eval_metrics = evaluate(MODEL_LAST, TEST_DB)

    results = {"_iteration": iteration, "_spans_num": spans_queried}
    results.update(eval_metrics)

    log_results(results,
                out=METRICS_OUT)

    # Advance the round counter; without this `max_iters` is never reached
    # and every logged record carries the same `_iteration`.
    iteration += 1

No more data to query
✔ Created output directory:
models/random_sm_batch_10__1668441930f2908602.spacy
ℹ Saving to output directory:
models/random_sm_batch_10__1668441930f2908602.spacy
ℹ Using CPU

✔ Initialized pipeline

ℹ Pipeline: ['tok2vec', 'spancat']
ℹ Initial learn rate: 0.001
E    #       LOSS TOK2VEC  LOSS SPANCAT  SPANS_SC_F  SPANS_SC_P  SPANS_SC_R  SCORE 
---  ------  ------------  ------------  ----------  ----------  ----------  ------
  0       0         10.74        256.19        1.82        0.94       32.12    0.02


KeyboardInterrupt: 