# Named Entity Recognition (NER)


In [21]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from spacy.displacy import render
from typing import List

In [22]:
# Sample sentence that the NER pipelines below are run on.
text = "Mary from the HR department said that The Ritz London was a great hotel option to stay in London, specially during December."

## DistilBERT


In [23]:
model_id = "elastic/distilbert-base-cased-finetuned-conll03-english"

# Load the tokenizer and weights once, then hand the loaded objects to the
# pipeline. Passing the id string as `model=` would make `pipeline` load the
# checkpoint a second time and leave the `model` loaded here unused.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForTokenClassification.from_pretrained(model_id)
print("Model loaded!")

pipe = pipeline(
    task="ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple"
)
results = pipe(text)

# Turn each aggregated prediction into the {"start", "end", "label"} span dict
# that displaCy consumes when called with manual=True.
entities = [
    {
        "start": prediction["start"],
        "end": prediction["end"],
        "label": prediction["entity_group"],
    }
    for prediction in results
]

render_data = [{"text": text, "ents": entities, "title": None}]
render(render_data, style="ent", manual=True, jupyter=True)

Device set to use cpu


Model loaded!


## bert-large-NER


In [24]:
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-large-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-large-NER")

# Use the same aggregation as the DistilBERT pipeline so the two renders are
# comparable: without it every word piece is returned as its own prediction
# with a raw B-/I- tag, and displaCy would draw each piece as a separate entity.
nlp = pipeline(
    "ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple"
)

results = nlp(text)

# Aggregated predictions expose the merged label under "entity_group".
# Build the {"start", "end", "label"} span dicts displaCy's manual mode expects.
entities = [
    {
        "start": prediction["start"],
        "end": prediction["end"],
        "label": prediction["entity_group"],
    }
    for prediction in results
]

render_data = [{"text": text, "ents": entities, "title": None}]
render(render_data, style="ent", manual=True, jupyter=True)

Some weights of the model checkpoint at dslim/bert-large-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


In [25]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Token-level NER with the dslim/bert-base-NER checkpoint.
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

nlp = pipeline(task="ner", model=model, tokenizer=tokenizer)

# No aggregation strategy is passed, so `results` keeps one prediction per
# word piece, each carrying its raw tag under the "entity" key.
results = nlp(text)

# One displaCy span per prediction: character offsets plus the raw tag as label.
entities = [
    dict(start=token["start"], end=token["end"], label=token["entity"])
    for token in results
]

render_data = [{"text": text, "ents": entities, "title": None}]
render(render_data, style="ent", manual=True, jupyter=True)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


In [26]:
# Display the predictions returned by the most recent pipeline call.
results

[{'entity': 'B-PER',
  'score': 0.9988023,
  'index': 1,
  'word': 'Mary',
  'start': 0,
  'end': 4},
 {'entity': 'B-ORG',
  'score': 0.98128945,
  'index': 4,
  'word': 'H',
  'start': 14,
  'end': 15},
 {'entity': 'I-ORG',
  'score': 0.8249913,
  'index': 5,
  'word': '##R',
  'start': 15,
  'end': 16},
 {'entity': 'B-LOC',
  'score': 0.83290493,
  'index': 9,
  'word': 'The',
  'start': 38,
  'end': 41},
 {'entity': 'I-LOC',
  'score': 0.6241567,
  'index': 10,
  'word': 'R',
  'start': 42,
  'end': 43},
 {'entity': 'I-LOC',
  'score': 0.829588,
  'index': 11,
  'word': '##itz',
  'start': 43,
  'end': 46},
 {'entity': 'I-LOC',
  'score': 0.9031068,
  'index': 12,
  'word': 'London',
  'start': 47,
  'end': 53},
 {'entity': 'B-LOC',
  'score': 0.99963605,
  'index': 21,
  'word': 'London',
  'start': 90,
  'end': 96}]