# Name Entity Recognition (NER)


In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from spacy.displacy import render
from typing import List

In [27]:
model_id = "elastic/distilbert-base-cased-finetuned-conll03-english"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForTokenClassification.from_pretrained(model_id)

print("Model loaded!")

tokenizer_config.json:   0%|          | 0.00/257 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/954 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

Model loaded!


In [28]:
pipe = pipeline(
    model=model_id, tokenizer=tokenizer, task="ner", aggregation_strategy="simple"
)

In [30]:
text = "Mary from the HR department said that The Ritz London was a great hotel option to stay in London, specially during December."
results = pipe(text)
results

[{'entity_group': 'PER',
  'score': 0.9933983,
  'word': 'Mary',
  'start': 0,
  'end': 4},
 {'entity_group': 'ORG',
  'score': 0.99773186,
  'word': 'H',
  'start': 14,
  'end': 15},
 {'entity_group': 'ORG',
  'score': 0.99675083,
  'word': '##R',
  'start': 15,
  'end': 16},
 {'entity_group': 'LOC',
  'score': 0.5997929,
  'word': 'The R',
  'start': 38,
  'end': 43},
 {'entity_group': 'LOC',
  'score': 0.75765514,
  'word': '##itz London',
  'start': 43,
  'end': 53},
 {'entity_group': 'LOC',
  'score': 0.99939835,
  'word': 'London',
  'start': 90,
  'end': 96}]

In [10]:
entities = []
for model_output in results:
    entry = {}
    entry["start"] = model_output["start"]
    entry["end"] = model_output["end"]
    entry["label"] = model_output["entity_group"]
    entities.append(entry)

render_data = [{"text": text, "ents": entities, "title": None}]

In [11]:
render(render_data, style="ent", manual=True, jupyter=True)

In [13]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("dslim/bert-large-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-large-NER")

Some weights of the model checkpoint at dslim/bert-large-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'entity': 'B-PER', 'score': 0.9971501, 'index': 4, 'word': 'Wolfgang', 'start': 11, 'end': 19}, {'entity': 'B-LOC', 'score': 0.9986046, 'index': 9, 'word': 'Berlin', 'start': 34, 'end': 40}]


In [21]:
nlp = pipeline("ner", model=model, tokenizer=tokenizer)
text = "My name is Wolfgang and I live in Berlin since today."

results = nlp(text)
print(results)

[{'entity': 'B-PER', 'score': 0.9972506, 'index': 4, 'word': 'Wolfgang', 'start': 11, 'end': 19}, {'entity': 'B-LOC', 'score': 0.9987753, 'index': 9, 'word': 'Berlin', 'start': 34, 'end': 40}]


In [22]:
entities = []
for model_output in results:
    entry = {}
    entry["start"] = model_output["start"]
    entry["end"] = model_output["end"]
    entry["label"] = model_output["entity"]
    entities.append(entry)

render_data = [{"text": text, "ents": entities, "title": None}]

In [23]:
render_data = [{"text": text, "ents": entities, "title": None}]
render(render_data, style="ent", manual=True, jupyter=True)

In [24]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("dslim/distilbert-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/distilbert-NER")

nlp = pipeline("ner", model=model, tokenizer=tokenizer)
example = "My name is Wolfgang and I live in Berlin"

ner_results = nlp(example)
print(ner_results)

tokenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/669k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/926 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

[{'entity': 'B-PER', 'score': 0.99110633, 'index': 4, 'word': 'Wolfgang', 'start': 11, 'end': 19}, {'entity': 'B-LOC', 'score': 0.9967968, 'index': 9, 'word': 'Berlin', 'start': 34, 'end': 40}]
