In [8]:
import os
import spacy
from spacy.tokens import Doc
from datasets import load_dataset, ClassLabel, Sequence
from consts import DATA_PATH

In [9]:
# config
# Name of the spaCy pipeline passed to spacy.load() further down.
MODEL_NAME = "en_core_web_trf"

# Output directory for the augmented dataset (written with save_to_disk).
SAVE_PATH = os.path.join(DATA_PATH, "deberta_fewnerd_aug")

# Directory for local spaCy libraries.
# NOTE(review): not referenced by any cell visible here - confirm it is still needed.
LOCAL_LIB_PATH = os.path.join(DATA_PATH, "spacy_libs")

In [10]:
# gpu
# Ask spaCy to run on the GPU when one is usable; this must happen before the
# pipeline is loaded. The boolean result is not checked, so spaCy stays on the
# CPU without any message if no GPU can be activated.
spacy.prefer_gpu()

# Load the pipeline selected in the config cell.
nlp = spacy.load(name=MODEL_NAME)

# Entity types of the target schema, in label-id order. The last two (NOUN,
# PRON) are the augmentation classes assigned from part-of-speech tags.
_TARGET_ENTITY_TYPES = (
    "ART",
    "BUILDING",
    "EVENT",
    "LOC",
    "ORG",
    "OTHER",
    "PER",
    "PROD",
    "NOUN",
    "PRON",
)

# "O" comes first (id 0), followed by a B-/I- pair for every entity type.
final_label_names = ["O"] + [
    f"{prefix}-{entity_type}"
    for entity_type in _TARGET_ENTITY_TYPES
    for prefix in ("B", "I")
]

# Label string -> integer id, following the list order above.
target_label2id = dict(zip(final_label_names, range(len(final_label_names))))
id_of_O = target_label2id["O"]

print(f"Target Schema: {len(final_label_names)} labels.")

Target Schema: 21 labels.


In [11]:
def doc_generator(tokens_list):
    """Lazily wrap pre-tokenized sentences in spaCy ``Doc`` objects.

    Args:
        tokens_list: Iterable of sentences, each an iterable of token strings.

    Yields:
        One ``Doc`` per sentence, built on ``nlp.vocab`` with the given words so
        the original tokenization is kept instead of re-tokenizing the text.
    """
    # The generator expression is only consumed through `yield from`, so
    # nothing is built until the caller starts iterating.
    sentence_docs = (Doc(nlp.vocab, words=words) for words in tokens_list)
    yield from sentence_docs

In [12]:
# Coarse type names that the source dataset spells out in full but the target
# schema abbreviates. Keys are upper-cased source names.
_COARSE_TYPE_ALIASES = {
    "LOCATION": "LOC",
    "ORGANIZATION": "ORG",
    "PERSON": "PER",
    "PRODUCT": "PROD",
}


def _source_labels_to_bio(raw_labels):
    """Translate one sentence of source label strings into target-style BIO labels.

    Labels that already carry a ``B-``/``I-`` prefix keep it. Bare type names
    (IO tagging, e.g. ``"person"``) get ``B-`` on the first token of a run and
    ``I-`` while the same type continues. Type names are upper-cased and passed
    through ``_COARSE_TYPE_ALIASES``.

    Note: with bare type names, two directly adjacent entities of the same type
    cannot be told apart and come out as one span.
    """
    bio_labels = []
    previous_type = None
    for raw_label in raw_labels:
        label = raw_label.upper()
        prefix = None
        if label[:2] in ("B-", "I-"):
            prefix, label = label[0], label[2:]
        entity_type = _COARSE_TYPE_ALIASES.get(label, label)

        if entity_type == "O":
            bio_labels.append("O")
            previous_type = None
            continue

        if prefix is None:
            prefix = "I" if entity_type == previous_type else "B"
        bio_labels.append(f"{prefix}-{entity_type}")
        previous_type = entity_type
    return bio_labels


def augment_batch(batch):
    """Re-label a batch in the target schema and tag plain nouns and pronouns.

    The source ids in ``batch["ner_tags"]`` are decoded with
    ``original_int2str``, converted to BIO labels and looked up in
    ``target_label2id``. Previously the decoded label was only upper-cased, so a
    bare type name such as ``"person"`` never matched a key like ``"B-PER"`` and
    every such entity was silently turned into ``"O"``.

    Tokens that end up as ``"O"`` are then augmented from spaCy's coarse
    part-of-speech tag: ``NOUN`` -> ``B-NOUN``, ``PRON`` -> ``B-PRON``.

    Args:
        batch: Batched mapping with ``"tokens"`` (list of token lists) and
            ``"ner_tags"`` (list of source label-id lists of matching length).

    Returns:
        ``{"ner_tags_master": [...]}`` with one list of target ids per sentence.
    """
    # NOTE(review): every augmented token gets a B- tag, so I-NOUN / I-PRON are
    # never produced even for consecutive nouns - confirm this is intended.
    pos_to_id = {
        "NOUN": target_label2id["B-NOUN"],
        "PRON": target_label2id["B-PRON"],
    }

    # Decode each distinct source id once per batch instead of once per token.
    id_to_raw_label = {}

    def decode(src_id):
        if src_id not in id_to_raw_label:
            id_to_raw_label[src_id] = original_int2str(src_id)
        return id_to_raw_label[src_id]

    processed_docs = nlp.pipe(doc_generator(batch["tokens"]), batch_size=32)

    new_batch_tags = []
    for doc, original_ids in zip(processed_docs, batch["ner_tags"]):
        bio_labels = _source_labels_to_bio([decode(src_id) for src_id in original_ids])

        row_tags = []
        for token, bio_label in zip(doc, bio_labels):
            # Labels unknown to the target schema still fall back to "O".
            final_id = target_label2id.get(bio_label, id_of_O)

            # only augment if O
            if final_id == id_of_O:
                final_id = pos_to_id.get(token.pos_, id_of_O)

            row_tags.append(final_id)
        new_batch_tags.append(row_tags)

    return {"ner_tags_master": new_batch_tags}

print("Loading Few-NERD (Supervised)...")
# Dataset repo: 'dfki-nlp/few-nerd', configuration 'supervised'.
# Only the 'train' split is loaded here (roughly 130k sentences); downloads are
# cached under DATA_PATH/raw_cache.
raw_cache_dir = os.path.join(DATA_PATH, "raw_cache")
dataset = load_dataset(
    "dfki-nlp/few-nerd",
    "supervised",
    split="train",
    cache_dir=raw_cache_dir,
)


Loading Few-NERD (Supervised)...


Generating train split: 100%|██████████| 131767/131767 [00:00<00:00, 1167733.72 examples/s]
Generating validation split: 100%|██████████| 18824/18824 [00:00<00:00, 1234845.92 examples/s]
Generating test split: 100%|██████████| 37648/37648 [00:00<00:00, 1223375.22 examples/s]


In [13]:
# Setup Decoder
# "ner_tags" is a per-sentence sequence; its inner `.feature` describes a single
# label and exposes int2str to turn a source label id back into its name.
ner_tags_column = dataset.features["ner_tags"]
original_features = ner_tags_column.feature
original_int2str = original_features.int2str

In [14]:
# Process
# Run augment_batch over the dataset in batches of 100 sentences; it adds the
# "ner_tags_master" column.
augmented_dataset = dataset.map(augment_batch, batch_size=100, batched=True)

# Clean
# Keep only the tokens and the new tags, then give the new tags the usual
# "ner_tags" column name.
print("Finalizing...")
augmented_dataset = (
    augmented_dataset
    .select_columns(["tokens", "ner_tags_master"])
    .rename_column("ner_tags_master", "ner_tags")
)

  dlpack_tensor = xp_tensor.toDlpack()  # type: ignore
Map: 100%|██████████| 131767/131767 [10:48<00:00, 203.30 examples/s]

Finalizing...





In [15]:
# Save
print(f"Saving to {SAVE_PATH}...")

# Re-type "ner_tags" as a sequence of class labels over the target schema, so
# the saved dataset carries the label names alongside the integer ids.
target_tag_feature = Sequence(ClassLabel(names=final_label_names))
new_features = augmented_dataset.features.copy()
new_features["ner_tags"] = target_tag_feature
augmented_dataset = augmented_dataset.cast(new_features)

augmented_dataset.save_to_disk(SAVE_PATH)
print("Success! DeBERTa-ready Few-NERD dataset created.")

Saving to /home/dan/Work/utcn/an4/sem1/pso/proj/knowledge-graph-extraction/train/data/deberta_fewnerd_aug...


Casting the dataset: 100%|██████████| 131767/131767 [00:00<00:00, 2738828.07 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 131767/131767 [00:00<00:00, 3061142.19 examples/s]

Success! DeBERTa-ready Few-NERD dataset created.



