In [1]:
import argilla as rg
from datasets import load_dataset

In [2]:
# Seed passed to `Dataset.shuffle` so the sampled subsets are reproducible.
SEED = 42
# Number of examples selected from the filtered train / test splits.
TRAIN_NUM_SAMPLES = 400
TEST_NUM_SAMPLES = 100

In [3]:
# Load the full test and train splits of the "clarin-pl/kpwr-ner" dataset.
test_ds_full = load_dataset("clarin-pl/kpwr-ner", split="test")
train_ds_full = load_dataset("clarin-pl/kpwr-ner", split="train")



In [4]:
def _contains_any_label(example, labels):
    return any(map(lambda label: label in labels, example["ner"]))

# Tag index -> BIO label for the entity types kept in the Argilla datasets.
# Any tag index missing from this mapping is later replaced by "O".
include_labels = {
    20: "B-nam_liv_person", 98: "I-nam_liv_person",
    27: "B-nam_loc_gpe_city", 105: "I-nam_loc_gpe_city",
    29: "B-nam_loc_gpe_country", 107: "I-nam_loc_gpe_country"
}

# Entity types that have a B- tag and an I- tag, respectively (prefix stripped).
_begin_entities = {
    label[2:] for label in include_labels.values() if label.startswith("B-")
}
_inside_entities = {
    label[2:] for label in include_labels.values() if label.startswith("I-")
}

# A plain evenness check would accept e.g. two B- tags without their I- tags,
# so verify explicitly that every entity type comes as a B-/I- pair.
assert _begin_entities == _inside_entities, (
    "every entity type in include_labels needs both a B- and an I- tag"
)
assert len(include_labels) % 2 == 0
assert len(include_labels) == 2 * len(_begin_entities), (
    "include_labels must hold exactly one B- and one I- tag per entity type"
)

# Number of distinct entity types (one B-/I- pair each).
labels_num = len(_begin_entities)

In [5]:
# Dataset names used when logging to Argilla; the suffix is the number of
# entity types kept (`labels_num`).
TRAIN_DS_NAME = f"inzynierka-kpwr-train-{labels_num}"
TEST_DS_NAME = f"inzynierka-kpwr-test-{labels_num}"

In [6]:
def _has_included_label(record):
    """Return True when the record contains at least one tag from `include_labels`."""
    return _contains_any_label(record, include_labels.keys())


# Keep only the examples that mention at least one of the selected entity types.
train_dataset = train_ds_full.filter(_has_included_label)
test_dataset = test_ds_full.filter(_has_included_label)



In [7]:
# Number of examples left in each split after filtering.
len(train_dataset), len(test_dataset)

(3456, 1136)

In [8]:
# Show every field of the first filtered training example, one field per
# paragraph, to inspect the record structure.
record_sample = train_dataset[0]
for key, item in record_sample.items():
    print(f"{key}: {item}\n")

tokens: ['Roboty', 'mają', 'kilkanaście', 'lat', 'i', 'pochodzą', 'z', 'USA', ',', 'Wysokie', 'napięcie', 'jest', 'dużo', 'młodsze', ',', 'powstało', 'w', 'Niemczech', '.']

lemmas: ['robota', 'maić', 'kilkanaście', 'rok', 'i', 'pochodzić', 'z', 'USA', ',', 'wysoki', 'napięcie', 'być', 'dużo', 'młody', ',', 'powstać', 'w', 'Niemcy', '.']

orth: ['subst:pl:nom:f', 'fin:pl:ter:imperf', 'num:pl:acc:m3:rec', 'subst:pl:gen:m3', 'conj', 'fin:pl:ter:imperf', 'prep:gen:nwok', 'subst:pl:gen:n', 'interp', 'adj:sg:nom:n:pos', 'subst:sg:nom:n', 'fin:sg:ter:imperf', 'num:pl:nom:n:rec', 'adj:sg:nom:n:com', 'interp', 'praet:sg:n:perf', 'prep:loc:nwok', 'subst:pl:loc:n', 'interp']

ner: [73, 160, 160, 160, 160, 160, 160, 29, 160, 73, 151, 160, 160, 160, 160, 160, 160, 29, 160]



In [9]:
def map_idx_to_label(example, labels_dict):
    """Translate an example's tag indices into BIO label strings.

    Args:
        example: Mapping with a "ner" entry holding the per-token tag indices.
        labels_dict: Dict from tag index to BIO label for the tags to keep.

    Returns:
        A list with one label per token; indices absent from `labels_dict`
        become "O".
    """
    outside_tag = "O"
    return [labels_dict.get(tag_idx, outside_tag) for tag_idx in example["ner"]]

def datasets_to_rg(dataset, labels_dict=None):
    """Convert an iterable of examples into an Argilla token-classification dataset.

    Args:
        dataset: Iterable of examples, each exposing "tokens" and "ner" entries.
        labels_dict: Optional dict from tag index to BIO label used to build the
            per-token tags. Defaults to the notebook-level `include_labels`.

    Returns:
        An `rg.DatasetForTokenClassification` with one
        `rg.TokenClassificationRecord` per example.
    """
    if labels_dict is None:
        # Resolved at call time so existing one-argument calls keep using the
        # notebook-level mapping.
        labels_dict = include_labels

    rg_records = [
        rg.TokenClassificationRecord(
            tokens=example["tokens"],
            tags=map_idx_to_label(example, labels_dict)
        )
        for example in dataset
    ]
    return rg.DatasetForTokenClassification(rg_records)

In [10]:
# Shuffle each filtered split with the fixed seed and keep the first
# TRAIN_NUM_SAMPLES / TEST_NUM_SAMPLES examples.
train_dataset_sample = train_dataset.shuffle(seed=SEED).select(range(TRAIN_NUM_SAMPLES))
test_dataset_sample = test_dataset.shuffle(seed=SEED).select(range(TEST_NUM_SAMPLES))



In [11]:
# Convert the sampled examples to Argilla. Despite the `rg_records_*` names,
# `datasets_to_rg` returns an `rg.DatasetForTokenClassification`, not a list.
rg_records_train = datasets_to_rg(train_dataset_sample)
rg_records_test = datasets_to_rg(test_dataset_sample)

In [12]:
# `datasets_to_rg` already returns an `rg.DatasetForTokenClassification`, so the
# converted objects are reused directly instead of being wrapped a second time.
rg_dataset_train = rg_records_train
rg_dataset_test = rg_records_test

In [13]:
# Log both datasets to Argilla under the names defined above.
rg.log(rg_dataset_train, name=TRAIN_DS_NAME)
rg.log(rg_dataset_test, name=TEST_DS_NAME)

  0%|          | 0/400 [00:00<?, ?it/s]

400 records logged to http://localhost:6900/datasets/argilla/inzynierka-kpwr-train-3


  0%|          | 0/100 [00:00<?, ?it/s]

100 records logged to http://localhost:6900/datasets/argilla/inzynierka-kpwr-test-3


BulkResponse(dataset='inzynierka-kpwr-test-3', processed=100, failed=0)

## dobry przykład do displacy ner

In [14]:
# Print the text and annotation of every training record whose text contains
# "Hadze".
for record in rg_dataset_train:
    if "Hadze" in record.text:
        print(record.text)
        print(record.annotation)

Prezydent Sudanu Omar al - Baszir już zapowiedział , że jego kraj nie będzie współpracował z trybunałem w Hadze .
[('nam_loc_gpe_country', 10, 16), ('nam_liv_person', 17, 33), ('nam_loc_gpe_city', 106, 111)]


In [15]:
# !jupyter nbconvert --to pdf kpwr-argilla-log.ipynb --output ./misc/kpwr-argilla-log.pdf