In [1]:
import os
import spacy
from datasets import load_dataset, ClassLabel, Sequence

from consts import DATA_PATH

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 1. Setup Paths
# Name of the spaCy pipeline package that is loaded for POS tagging.
SPACY_MODEL_NAME = "en_core_web_lg"

# Both directories are placed under the project-wide DATA_PATH root:
# one is passed as the download cache dir, the other is the save target.
RAW_CACHE_PATH = os.path.join(DATA_PATH, "conll2003_raw_cache")
PROCESSED_DATA_PATH = os.path.join(DATA_PATH, "conll2003_augmented_pos")

In [4]:
# 2. Load spaCy for POS tagging
# The parser, NER and lemmatizer components are switched off; the augmentation
# step only reads `token.pos_` from the processed docs.
try:
    nlp = spacy.load(SPACY_MODEL_NAME, disable=["parser", "ner", "lemmatizer"])
except OSError as err:
    # spaCy reports a missing/invalid model as OSError (e.g. E050). Re-raise the
    # same exception type with the command that usually resolves it, keeping
    # the original error chained for context.
    raise OSError(
        f"Could not load spaCy model '{SPACY_MODEL_NAME}': {err}\n"
        f"If the model is not installed, run: "
        f"python -m spacy download {SPACY_MODEL_NAME}"
    ) from err

# 3. Load and Cache the Raw CoNLL-2003 Dataset
# `cache_dir` keeps the downloaded files under RAW_CACHE_PATH.
print(f"Downloading/Loading CoNLL-2003 to {RAW_CACHE_PATH}...")
dataset = load_dataset("conll2003", cache_dir=RAW_CACHE_PATH)

OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

In [None]:
# 4. Prepare the New Label Set
# Pull the label names out of the per-token feature of the "ner_tags" column
# of the train split.
original_features = dataset["train"].features["ner_tags"].feature
original_label_names = original_features.names

# Define new tags to add; they are appended after the original names so the
# existing ids keep their meaning.
new_tags = ["B-NOUN", "I-NOUN", "B-PRON", "I-PRON"]
final_label_names = [*original_label_names, *new_tags]

# Create mappings (id -> label first, then its inverse)
id2label = dict(enumerate(final_label_names))
label2id = {label: idx for idx, label in id2label.items()}

print(f"Original tags: {original_label_names}")
print(f"New tags added: {new_tags}")

# 5. Define the Augmentation Function
def augment_batch(batch):
    """Re-tag 'O' tokens of a batch as noun/pronoun using spaCy POS tags.

    Tokens whose original label is not "O" keep that label. Tokens labelled
    "O" become "B-NOUN" when spaCy tags them NOUN, "B-PRON" when tagged PRON,
    and stay "O" otherwise.

    Args:
        batch: Batched mapping with a "tokens" column (one list of token
            strings per example) and an "ner_tags" column (one list of label
            ids per example, indexing into ``original_label_names``).

    Returns:
        dict: ``{"ner_tags": [...]}`` holding one list of label ids per
        example, expressed through ``label2id``.

    Raises:
        ValueError: If an example has a different number of tokens and tags,
            which would otherwise silently produce a truncated tag list.
    """
    # Build docs directly from the pre-tokenized words so the spaCy tokens
    # line up one-to-one with the CoNLL tokens.
    docs = [spacy.tokens.Doc(nlp.vocab, words=tokens) for tokens in batch["tokens"]]

    # Consume the docs yielded by the pipeline instead of relying on
    # nlp.pipe mutating its inputs in place.
    # NOTE(review): passing Doc objects to nlp.pipe needs a spaCy release with
    # Doc-input support (believed to be v3.2+) - confirm installed version.
    docs = list(nlp.pipe(docs))

    # Loop-invariant lookups, resolved once per batch.
    outside_id = label2id["O"]
    pos_to_id = {"NOUN": label2id["B-NOUN"], "PRON": label2id["B-PRON"]}

    # NOTE(review): "I-NOUN" / "I-PRON" exist in the label set but are never
    # emitted here; every matching token gets a "B-" tag. Confirm whether
    # consecutive nouns are meant to be chained.
    new_batch_ner_tags = []
    for doc, original_tags in zip(docs, batch["ner_tags"]):
        if len(doc) != len(original_tags):
            raise ValueError(
                f"Token/tag length mismatch: {len(doc)} tokens vs "
                f"{len(original_tags)} ner_tags"
            )

        augmented_tags = []
        for token, original_id in zip(doc, original_tags):
            original_label = original_label_names[original_id]

            if original_label != "O":
                # RULE: Keep existing NER tags
                augmented_tags.append(label2id[original_label])
            else:
                # RULE: If 'O', check for Noun/Pronoun via spaCy
                augmented_tags.append(pos_to_id.get(token.pos_, outside_id))

        new_batch_ner_tags.append(augmented_tags)

    return {"ner_tags": new_batch_ner_tags}


In [None]:
# 6. Apply Augmentation
# augment_batch emits ids for the added tags, which lie beyond the original
# label list. Declare the extended label set as the output schema so the
# mapped "ner_tags" column is written with the full list of names.
# NOTE(review): some `datasets` versions appear to reject ClassLabel ids
# outside the configured range while writing map output - confirm against the
# installed version. Also assumes every split shares the train split's schema.
augmented_features = dataset["train"].features.copy()
augmented_features["ner_tags"] = Sequence(ClassLabel(names=final_label_names))

print("Augmenting dataset (this might take a minute)...")
augmented_dataset = dataset.map(
    augment_batch,
    batched=True,
    features=augmented_features,
)


In [None]:
# 7. Cast Features to the New Label Set
# Swap the "ner_tags" feature for one built from the extended label list, so
# ids of the added tags (e.g. "B-NOUN") resolve to their names.
new_features = augmented_dataset["train"].features.copy()
new_features["ner_tags"] = Sequence(
    feature=ClassLabel(names=final_label_names),
)

print("Casting features to new label set...")
augmented_dataset = augmented_dataset.cast(features=new_features)


In [None]:
# 8. Save the Final Processed Dataset
# Write the augmented splits to PROCESSED_DATA_PATH.
print(f"Saving augmented dataset to {PROCESSED_DATA_PATH}...")
augmented_dataset.save_to_disk(dataset_dict_path=PROCESSED_DATA_PATH)

print("Done! Dataset is ready for training.")