In [None]:
Named Entity Recognition (NER) is a fundamental task in Natural Language Processing (NLP) that involves identifying and classifying entities in text into predefined categories such as names of people, organizations, locations, dates, etc. Hugging Face provides pre-trained models that can be used for NER tasks, and these models can also be fine-tuned on specific datasets if needed. Here's an end-to-end guide to building an NER system using Hugging Face.

1. Environment Setup
Install Necessary Libraries: Start by installing the required libraries.
```bash
pip install transformers datasets torch
```
The evaluation and deployment steps below additionally need `numpy`, `seqeval` and `flask`.
2. Data Collection & Preprocessing
Collect Data: If you're working on a domain-specific task, you might need a custom dataset. For general-purpose NER, you can use datasets like CoNLL-2003.

```python
from datasets import load_dataset

dataset = load_dataset("conll2003")
```
Examine the Data: Take a look at the data structure to understand the format.

```python
print(dataset["train"][0])
```
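Preprocess Data: Tokenize the input text and align the labels with the tokens. The tokenizer may split one word into several sub-word tokens, so only the first token of each word keeps the word's tag; special tokens and the remaining sub-word pieces are labeled -100 so that they are ignored when the loss is computed.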

python
Copy code
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        padding=True,
        truncation=True,
        is_split_into_words=True,
    )

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)
3. Model Selection
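Choose a Pre-Trained Model: You can start from a pre-trained encoder such as BERT, RoBERTa, or DistilBERT and add a token-classification head on top, as done below with `bert-base-cased` (`num_labels=9` matches the nine NER tags of CoNLL-2003). Checkpoints that are already fine-tuned on NER datasets are also available on the Hugging Face Hub if you prefer to skip fine-tuning.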
python
Copy code
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained("bert-base-cased", num_labels=9)
4. Fine-Tuning the Model
Training Arguments: Define the training arguments for fine-tuning the model.

python
Copy code
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)
Trainer Setup: Set up the Hugging Face Trainer for model training.

python
Copy code
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
)

trainer.train()
5. Inference (NER)
Running NER on New Text: Once trained, you can use the model to recognize named entities in new text.
python
Copy code
from transformers import pipeline

nlp_ner = pipeline("ner", model=model, tokenizer=tokenizer)

text = "Hugging Face Inc. is a company based in New York."
ner_results = nlp_ner(text)
for entity in ner_results:
    print(f"Entity: {entity['word']}, Label: {entity['entity']}")
6. Model Evaluation
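Evaluate the Model: Use metrics like precision, recall, and F1-score to evaluate the NER model. The snippet below assumes that NumPy has been imported (`import numpy as np`), that `label_list` holds the tag names in class-id order (for example `dataset["train"].features["ner_tags"].feature.names`), and that the `seqeval` package is installed.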
python
Copy code
from datasets import load_metric

metric = load_metric("seqeval")

labels = tokenized_datasets["validation"]["labels"]
predictions = trainer.predict(tokenized_datasets["validation"]).predictions
predictions = np.argmax(predictions, axis=2)

true_labels = [
    [label_list[label] for label in label_seq if label != -100]
    for label_seq in labels
]
true_predictions = [
    [label_list[pred] for pred, label in zip(pred_seq, label_seq) if label != -100]
    for pred_seq, label_seq in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
print(results)
7. Deploying the NER System
Deploy as a Web Service: Use Flask to create a simple API that allows users to perform NER on text inputs.
python
Copy code
from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route("/ner", methods=["POST"])
def ner():
    text = request.json["text"]
    ner_results = nlp_ner(text)
    return jsonify(ner_results)

if __name__ == "__main__":
    app.run(debug=True)
8. Monitoring and Maintenance
Monitor Performance: Track the model's performance in production, and fine-tune the model periodically as more data becomes available.
Update the Model: Regularly retrain or fine-tune the model with new data to improve accuracy.
9. Documentation and Sharing
Document the System: Provide clear documentation for the entire pipeline, from data preprocessing to deployment.
Share the Model: Optionally, share your fine-tuned model on the Hugging Face Model Hub.
This guide provides a complete overview of building a Named Entity Recognition system using Hugging Face's tools. It covers everything from data preparation and model fine-tuning to deployment and evaluation, giving you a solid foundation for developing your own NER applications.

In [None]:
from datasets import load_dataset

# Load the CoNLL-2003 corpus; later cells index it by split name ("train", "validation").
dataset = load_dataset(path="conll2003")


In [None]:
# Peek at a single training record to see which fields each example carries.
first_train_example = dataset["train"][0]
print(first_train_example)


In [None]:
from transformers import AutoTokenizer

# Tokenizer for the same "bert-base-cased" checkpoint that the model cell loads.
tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-cased",
)

def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and align word-level NER tags with the tokens.

    Args:
        examples: A batch with a ``"tokens"`` entry (one list of words per
            sentence) and a ``"ner_tags"`` entry (one list of integer tags per
            sentence, parallel to the words).

    Returns:
        The tokenizer output for the batch with an added ``"labels"`` entry:
        one list per sentence, with one label per token position. The first
        token of every word carries that word's tag; every other position
        (tokens that map to no word, and continuation pieces of a word) is
        set to -100.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        padding=True,
        truncation=True,
        is_split_into_words=True,
    )

    labels = []
    for i, word_tags in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            # Only the first token of a word is labeled. -100 is the default
            # ignore_index of PyTorch's cross-entropy loss, so the remaining
            # positions do not contribute to training.
            starts_new_word = word_idx is not None and word_idx != previous_word_idx
            label_ids.append(word_tags[word_idx] if starts_new_word else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Run the alignment function over the dataset in batches; the result is indexed
# by split name in the training and evaluation cells.
tokenized_datasets = dataset.map(
    function=tokenize_and_align_labels,
    batched=True,
)


In [None]:
from transformers import AutoModelForTokenClassification

# Tag names in class-id order, read from the dataset instead of hard-coding the
# number of classes.
label_list = dataset["train"].features["ner_tags"].feature.names

# Passing id2label / label2id stores the tag names in the model config, so the
# inference pipeline reports the real tag names rather than generic "LABEL_n"
# placeholders.
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(label_list),
    id2label=dict(enumerate(label_list)),
    label2id={name: idx for idx, name in enumerate(label_list)},
)


In [None]:
from transformers import TrainingArguments, Trainer

# Hyper-parameters for the fine-tuning run; training outputs are written under ./results.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    evaluation_strategy="epoch",  # evaluate on the validation split once per epoch
)


In [None]:
from transformers import DataCollatorForTokenClassification

# Token classification needs a collator that pads the "labels" column together
# with the inputs. The default padding collator only pads the tokenizer
# outputs, so a batch that mixes sequences of different lengths cannot be
# turned into tensors. This collator pads labels with -100, which the loss
# ignores.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# NOTE(review): newer transformers releases deprecate the `tokenizer=` argument
# of Trainer in favour of `processing_class=` - confirm against the installed
# version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()


In [None]:
from transformers import pipeline

# Wrap the model and tokenizer from the cells above in a "ner" pipeline.
nlp_ner = pipeline(
    task="ner",
    model=model,
    tokenizer=tokenizer,
)

# Try the pipeline on one example sentence.
text = "Hugging Face Inc. is a company based in New York."
ner_results = nlp_ner(text)

# Print the "word" and "entity" fields of every result, one per line.
for entity in ner_results:
    print(f"Entity: {entity['word']}, Label: {entity['entity']}")


In [None]:
import numpy as np
from datasets import load_metric

# NOTE(review): `load_metric` is deprecated in recent `datasets` releases in
# favour of the separate `evaluate` package (`evaluate.load("seqeval")`) -
# confirm against the installed version. The metric also needs the `seqeval`
# package to be installed.
metric = load_metric("seqeval")

# Tag names in class-id order; used to turn integer ids back into tag strings.
label_list = dataset["train"].features["ner_tags"].feature.names

labels = tokenized_datasets["validation"]["labels"]
predictions = trainer.predict(tokenized_datasets["validation"]).predictions
# Pick the highest-scoring class for every token position.
predictions = np.argmax(predictions, axis=2)

# Drop every position labeled -100 (ignored during training) from both the
# references and the predictions so the two stay aligned token by token.
true_labels = [
    [label_list[label] for label in label_seq if label != -100]
    for label_seq in labels
]
true_predictions = [
    [label_list[pred] for pred, label in zip(pred_seq, label_seq) if label != -100]
    for pred_seq, label_seq in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
print(results)


In [None]:
from flask import Flask, request, jsonify

app = Flask(__name__)


def _to_builtin(value):
    """Turn NumPy-style scalars into plain Python values; pass anything else through."""
    return value.item() if hasattr(value, "item") else value


@app.route("/ner", methods=["POST"])
def ner():
    """Run the NER pipeline on the ``text`` field of a JSON POST body.

    Returns:
        A JSON list with one object per pipeline result, or a 400 response
        with an ``error`` message when the body is not a JSON object holding
        a non-empty ``text`` string.
    """
    payload = request.get_json(silent=True)
    text = payload.get("text") if isinstance(payload, dict) else None
    if not isinstance(text, str) or not text.strip():
        return jsonify({"error": 'Request body must be JSON with a non-empty "text" string.'}), 400

    ner_results = nlp_ner(text)
    # Pipeline results carry NumPy scalars (e.g. the score), which the JSON
    # encoder cannot serialize; convert them to built-in types first.
    serializable_results = [
        {key: _to_builtin(value) for key, value in entity.items()}
        for entity in ner_results
    ]
    return jsonify(serializable_results)


if __name__ == "__main__":
    # Debug mode is off: the Werkzeug debugger allows arbitrary code execution
    # and its auto-reloader does not work inside a notebook kernel.
    app.run(debug=False)
