In [5]:
import sys
import os
import spacy
from datasets import load_dataset, ClassLabel, Sequence
from consts import DATA_PATH, MODEL_PATH


In [6]:
# --- CONFIGURATION ---
MODEL_NAME = "en_core_web_lg"
LOCAL_LIB_PATH = os.path.join(MODEL_PATH, "spacy_libs")
PROCESSED_DATA_PATH = os.path.join(DATA_PATH, "wikiann+spacy_pos")

In [3]:
# 1. Load Model (CPU Mode)
if LOCAL_LIB_PATH not in sys.path:
    sys.path.append(LOCAL_LIB_PATH)
model_module = importlib.import_module(MODEL_NAME)
nlp = model_module.load(disable=["parser", "ner", "lemmatizer"])

Loading en_core_web_lg on CPU...
Error: Could not load en_core_web_lg. Make sure it is installed in /home/dan/Work/utcn/an4/sem1/pso/proj/knowledge-graph-extraction/train/data/spacy_libs


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
# 2. Load Dataset
print("Loading dataset...")
dataset = load_dataset("conll2003", cache_dir=os.path.join(DATA_PATH, "raw_cache"))

# Setup Labels
original_features = dataset["train"].features["ner_tags"].feature
original_names = original_features.names
new_tags = ["B-NOUN", "I-NOUN", "B-PRON", "I-PRON"]
final_names = original_names + new_tags
label2id = {label: i for i, label in enumerate(final_names)}


In [None]:
# 3. Augmentation Function (Multi-Core CPU)
def augment_batch(batch):
    new_batch_ner_tags = []

    # n_process=-1 uses all available CPU cores
    # batch_size=1000 is efficient for CPU
    docs = list(nlp.pipe(batch["tokens"], batch_size=1000, n_process=1))
    # Note: kept n_process=1 for safety in notebooks/scripts.
    # If running as a pure .py script, you can set n_process=-1 for max speed.

    for i, doc in enumerate(docs):
        original_tags = batch["ner_tags"][i]
        augmented_tags = []

        for token, original_id in zip(doc, original_tags):
            original_label = original_names[original_id]

            if original_label != "O":
                augmented_tags.append(label2id[original_label])
            else:
                if token.pos_ == "NOUN":
                    augmented_tags.append(label2id["B-NOUN"])
                elif token.pos_ == "PRON":
                    augmented_tags.append(label2id["B-PRON"])
                else:
                    augmented_tags.append(label2id["O"])

        new_batch_ner_tags.append(augmented_tags)
    return {"ner_tags": new_batch_ner_tags}


In [None]:
# 4. Run
print("Augmenting (This should take < 1 minute)...")
# We lower batch size slightly for the map function to keep memory low
augmented_dataset = dataset.map(augment_batch, batched=True, batch_size=1000)


In [None]:
# 5. Save
print("Saving...")
new_features = augmented_dataset["train"].features.copy()
new_features["ner_tags"] = Sequence(ClassLabel(names=final_names))
augmented_dataset = augmented_dataset.cast(new_features)
augmented_dataset.save_to_disk(PROCESSED_DATA_PATH)
print("Done.")