# Named Entity datasets

This notebook contains code to download and preprocess several well-known datasets from platforms such as `Hugging Face` and `Kaggle`. These datasets are often used to train, fine-tune, and evaluate NER models, ranging from general-purpose to domain-specific applications.

## Setting up

Install the required packages and import the libraries used in this notebook.

In [2]:
#!pip install -U pip

In [3]:
#!pip install -U transformers datasets pandas scikit-learn

In [8]:
import os
import sys

import pandas as pd
from datasets import load_dataset

## Hugging Face datasets

Find your dataset on the [Hugging Face Hub](https://huggingface.co/datasets).

### CoNLL-2003 dataset

In [5]:
# Fetch the CoNLL-2003 dataset from the Hugging Face Hub.
# The bare expression on the last line makes the notebook display the result.
dataset = load_dataset(path="conll2003")
dataset

README.md:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

conll2003.py:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [10]:
# Tag names indexed by the integer class ids stored in the `ner_tags` column
ner_tag_feature = dataset['train'].features['ner_tags']
label_list = ner_tag_feature.feature.names

def extract_entities(labels, tokens, label_names=None):
    """Collect person, location and organization mentions from one tagged sentence.

    Tags are read as a BIO scheme: ``B-<TYPE>`` opens an entity, ``I-<TYPE>``
    extends the open entity of the same type, and any other tag closes it.
    An ``I-<TYPE>`` tag that has no open entity of the same type to extend
    starts a new entity instead of being dropped together with its token.

    Args:
        labels: Integer tag ids, one per token, used as indexes into
            ``label_names``.
        tokens: The sentence tokens, aligned with ``labels``.
        label_names: Sequence mapping a tag id to its string tag. When omitted,
            the notebook-level ``label_list`` is used.

    Returns:
        A 3-tuple ``(persons, locations, organizations)``. Each element is the
        ";"-joined list of entities of that type in order of appearance (the
        tokens of one entity are joined by a single space), or ``""`` when the
        sentence has none. Entities of any other type are discarded.
    """
    if label_names is None:
        label_names = label_list

    found = {"PER": [], "LOC": [], "ORG": []}
    open_type = None
    open_tokens = []

    def close_entity():
        # Store the entity being built, if any; untracked types are ignored.
        if open_tokens and open_type in found:
            found[open_type].append(" ".join(open_tokens))

    for label, token in zip(labels, tokens):
        tag = label_names[label]
        prefix, entity_type = tag[:2], tag[2:]

        if prefix == "I-" and entity_type == open_type:
            # Continuation of the entity that is currently open
            open_tokens.append(token)
            continue

        # Every other tag ends whatever entity was open
        close_entity()
        if prefix in ("B-", "I-"):
            # "B-" always opens an entity; so does an "I-" with nothing to extend
            open_type, open_tokens = entity_type, [token]
        else:
            open_type, open_tokens = None, []

    # The sentence may end while an entity is still open
    close_entity()

    return ";".join(found["PER"]), ";".join(found["LOC"]), ";".join(found["ORG"])

# Flatten a dataset split into a table of sentences and their entities
def process_dataset(dataset_split):
    """Build a DataFrame with one row per example of ``dataset_split``.

    Each example must provide ``'tokens'`` and ``'ner_tags'``. The resulting
    columns are:

    * ``id`` - position of the example within the split (starting at 0)
    * ``text`` - the tokens joined by single spaces
    * ``persons`` / ``locations`` / ``organizations`` - the strings returned
      by ``extract_entities`` for the example
    """
    def to_record(position, example):
        # One output row for one example
        words = example['tokens']
        persons, locations, organizations = extract_entities(example['ner_tags'], words)
        return {
            'id': position,
            'text': " ".join(words),
            'persons': persons,
            'locations': locations,
            'organizations': organizations,
        }

    records = [to_record(position, example) for position, example in enumerate(dataset_split)]
    return pd.DataFrame(records)

# Transform every split into the flat DataFrame layout.
# For a quick trial run, pass a subset of a split instead of the whole split,
# using the dataset's .select() method with a range of row numbers.
train_df = process_dataset(dataset['train'])
validation_df = process_dataset(dataset['validation'])
test_df = process_dataset(dataset['test'])

# Report the size and columns of each resulting frame
for df in (train_df, validation_df, test_df):
    print(df.shape, df.columns)



(14041, 5) Index(['id', 'text', 'persons', 'locations', 'organizations'], dtype='object')
(3250, 5) Index(['id', 'text', 'persons', 'locations', 'organizations'], dtype='object')
(3453, 5) Index(['id', 'text', 'persons', 'locations', 'organizations'], dtype='object')


In [11]:
# Save to CSV
def save_df(df, name, out_dir="../data/external/hf"):
    """Write ``df`` to ``<out_dir>/conll2003_transformed.<name>.csv``.

    The DataFrame index is not written. After saving, the frame's shape and
    the file name are printed.

    Args:
        df: DataFrame to save.
        name: Label inserted into the file name, e.g. ``"train"``.
        out_dir: Directory that receives the file; it is created, including
            missing parents, when it does not exist yet.
    """
    # to_csv does not create missing directories, so make sure the target exists
    os.makedirs(out_dir, exist_ok=True)
    file_name = f"{out_dir}/conll2003_transformed.{name}.csv"
    df.to_csv(file_name, index=False)
    print(f"Saved {df.shape} {file_name}")

# Write each split to its own CSV file
for df, name in (
    (train_df, "train"),
    (validation_df, "validation"),
    (test_df, "test"),
):
    save_df(df, name)

Saved (14041, 5) ../data/external/hf/conll2003_transformed.train.csv
Saved (3250, 5) ../data/external/hf/conll2003_transformed.validation.csv
Saved (3453, 5) ../data/external/hf/conll2003_transformed.test.csv


In [12]:
# Also save all splits combined into one file: the model is pretrained, so the
# data is used only for evaluation.
# NOTE: the `id` column restarts at 0 in every split, so it is not unique in
# the combined frame.
splits = [train_df, validation_df, test_df]

# ignore_index gives the combined frame a fresh 0..n-1 index instead of
# repeating the row labels of each split.
all_df = pd.concat(splits, ignore_index=True)
assert len(all_df) == sum(len(split_df) for split_df in splits)
save_df(all_df, "all")

Saved (20744, 5) ../data/external/hf/conll2003_transformed.all.csv


## Label my personal dataset TODO

TODO: This section is unfinished and should be moved into a separate notebook.

In [None]:
# Add the parent directory (and the application directory inside it) to
# sys.path because we use code from the application.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
for extra_path in (parent_dir, os.path.join(parent_dir, 'ne_extractor_app')):
    # Each entry is checked on its own: re-running the cell adds no duplicates,
    # and the application path is still added when the parent is already listed.
    if extra_path not in sys.path:
        sys.path.append(extra_path)
        print(f"added {extra_path}")
sys.path

In [None]:
from pathlib import Path

from ne_extractor_app.app.ne_extractor import NEExtractor

import typer
from ne_extractor_app.app.evaluation import evaluate
from ne_extractor_app.app.models.ensemble import EnsembleNERModel
from ne_extractor_app.app.name_normalizer import Normalizer
from pydantic import BaseModel

# Assemble the extraction pipeline: an EnsembleNERModel and a Normalizer are
# created with their default arguments and handed to NEExtractor.
model = EnsembleNERModel()
normalizer = Normalizer()
extractor = NEExtractor(ne_model=model, normalizer=normalizer)
