# BERT Baseline model for NER

Loading the BERT-based NER model

In [3]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Loading the tokenizer

In [1]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER-uncased")

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Creating an instance of `pipeline`

In [4]:
from transformers import pipeline

nlp = pipeline("ner", model=model, tokenizer=tokenizer)

Testing the NER-tagger model on a test sentence

In [5]:
text = "Apple Inc. plans to open a new store in San Francisco by January 2024. Tim Cook, the CEO, announced the news yesterday."

ner_results = nlp(text)
print(ner_results)

[{'entity': 'B-ORG', 'score': 0.99644405, 'index': 10, 'word': 'in', 'start': 37, 'end': 39}, {'entity': 'I-ORG', 'score': 0.72377217, 'index': 11, 'word': 'san', 'start': 40, 'end': 43}, {'entity': 'I-ORG', 'score': 0.98159146, 'index': 12, 'word': 'francisco', 'start': 44, 'end': 53}]


Loading the CoNLL2003 dataset

In [7]:
from datasets import load_dataset

conll = load_dataset("conll2003")

Downloading data: 100%|██████████| 1.23M/1.23M [00:00<00:00, 2.37MB/s]
Downloading data: 100%|██████████| 312k/312k [00:00<00:00, 779kB/s]
Downloading data: 100%|██████████| 283k/283k [00:00<00:00, 450kB/s]
Generating train split: 100%|██████████| 14041/14041 [00:00<00:00, 41074.68 examples/s]
Generating validation split: 100%|██████████| 3250/3250 [00:00<00:00, 20708.52 examples/s]
Generating test split: 100%|██████████| 3453/3453 [00:00<00:00, 24140.55 examples/s]


Set the example

In [18]:
conll['test']

Dataset({
    features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
    num_rows: 3453
})

In [12]:
example = conll['test'][12]

Getting a list of tag names

In [11]:
tag_names = conll["test"].features[f"ner_tags"].feature.names
print(tag_names)

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']


In [23]:
ner_results = nlp(example['tokens'])
print(ner_results)
predictions = []
for result in ner_results:
    if len(result) == 0:
        predictions.append('O')
    else:
        predictions.append(result[0]['entity'])

true_tags = [tag_names[i] for i in example['ner_tags']]

[[], [{'entity': 'B-PER', 'score': 0.9877796, 'index': 1, 'word': 'hassan', 'start': 0, 'end': 6}], [], [], [], [], [{'entity': 'B-LOC', 'score': 0.4582857, 'index': 1, 'word': 'a', 'start': 0, 'end': 1}], [], [{'entity': 'B-LOC', 'score': 0.98864543, 'index': 1, 'word': 'ball', 'start': 0, 'end': 4}], [], [], [], [{'entity': 'B-LOC', 'score': 0.9998441, 'index': 1, 'word': 'in', 'start': 0, 'end': 2}], [], [], [{'entity': 'B-LOC', 'score': 0.5084014, 'index': 1, 'word': 'minute', 'start': 0, 'end': 6}], [], [], [], [], [{'entity': 'B-ORG', 'score': 0.99871063, 'index': 1, 'word': 'divert', 'start': 0, 'end': 6}], [], [], [], [], [], [], [{'entity': 'B-MISC', 'score': 0.99965584, 'index': 1, 'word': 'bit', 'start': 0, 'end': 3}], [], [], []]


In [27]:
conll['test'][12]['ner_tags']

[0,
 1,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0]

In [22]:
from tqdm import tqdm

true_tags_list = []
predicted_tags_list = []

for atest in tqdm(test, desc=str(len(test))):
 
  # add true labels to references
  true_tags_list.append([tag_names[id] for id in atest['ner_tags']])

  # recognize named entity in a test tokens
  test_ner_results = nlp(atest['tokens'])

  predicted_tags = []
  # extract the predicted tags
  for result in test_ner_results:
    if len(result) == 0:
       predicted_tags.append('O')
    else:
       predicted_tags.append(result[0]['entity'])
        
  predicted_tags_list.append(predicted_tags)

NameError: name 'test' is not defined

## Training BERT from scratch on the DANSK dataset

In [4]:
# imports
from transformers import AdamW, AutoModel, AutoTokenizer
from torch.utils.data import TensorDataset, DataLoader, random_split 
from tqdm import tqdm 
import torch
import pyarrow.parquet as pa

In [6]:
# loading the data
train_data = pa.read_table('data/train-00000-of-00001.parquet') 
dev_data = pa.read_table('data/dev-00000-of-00001.parquet') 
test_data = pa.read_table('data/test-00000-of-00001.parquet') 

In [7]:
train_df = train_data.to_pandas() 
train_df.head()

Unnamed: 0,text,ents,sents,tokens,spans,dagw_source,dagw_domain,dagw_source_full
0,Danmark skal bygges af maskinernes forsigtige ...,"[{'start': 0, 'end': 7, 'label': 'GPE'}]","[{'start': 0, 'end': 60}]","[{'id': 0, 'start': 0, 'end': 7}, {'id': 1, 's...",{'incorrect_spans': []},danavis,News,Danish daily newspapers
1,Hvil i Cap Dag - 2011 år Cap D'Agde Privat bil...,"[{'start': 17, 'end': 21, 'label': 'DATE'}, {'...","[{'start': 0, 'end': 79}]","[{'id': 0, 'start': 0, 'end': 4}, {'id': 1, 's...",{'incorrect_spans': []},cc,Web,Common Crawl
2,Måtte det nye år gøre dem mere fortrolige med ...,"[{'start': 6, 'end': 16, 'label': 'DATE'}, {'s...","[{'start': 0, 'end': 171}]","[{'id': 0, 'start': 0, 'end': 5}, {'id': 1, 's...",{'incorrect_spans': []},naat,Conversation,NAAT
3,Vi har også 360 graders Private Banking rådgiv...,"[{'start': 12, 'end': 23, 'label': 'QUANTITY'}]","[{'start': 0, 'end': 117}]","[{'id': 0, 'start': 0, 'end': 2}, {'id': 1, 's...",{'incorrect_spans': []},cc,Web,Common Crawl
4,| Nyhedsmails,[],"[{'start': 0, 'end': 13}]","[{'id': 0, 'start': 0, 'end': 1}, {'id': 1, 's...",{'incorrect_spans': []},cc,Web,Common Crawl


In [8]:
train_labels = set()
for ents in train_df['ents']:
    for ent in ents:
        train_labels.add(ent['label'])
print(train_labels)
print(len(train_labels))

{'PRODUCT', 'PERSON', 'LANGUAGE', 'PERCENT', 'FACILITY', 'NORP', 'LAW', 'MONEY', 'LOCATION', 'GPE', 'EVENT', 'TIME', 'ORGANIZATION', 'CARDINAL', 'DATE', 'ORDINAL', 'QUANTITY', 'WORK OF ART'}
18


In [9]:
test_df = test_data.to_pandas() 
test_df.head()

Unnamed: 0,text,ents,sents,tokens,spans,dagw_source,dagw_domain,dagw_source_full
0,Henrik Dahl: Feminister er rene og skære nasse...,"[{'start': 0, 'end': 11, 'label': 'PERSON'}, {...","[{'start': 0, 'end': 53}]","[{'id': 0, 'start': 0, 'end': 6}, {'id': 1, 's...",{'incorrect_spans': []},cc,Web,Common Crawl
1,27 meters frit fald fra Operaens tag –,"[{'start': 0, 'end': 9, 'label': 'QUANTITY'}, ...","[{'start': 0, 'end': 38}]","[{'id': 0, 'start': 0, 'end': 2}, {'id': 1, 's...",{'incorrect_spans': []},cc,Web,Common Crawl
2,det må\n,[],"[{'start': 0, 'end': 7}]","[{'id': 0, 'start': 0, 'end': 3}, {'id': 1, 's...",{'incorrect_spans': []},retspraksis,Legal,retspraksis (Danish legal information)
3,Taler 9: jeg er mest på 1,"[{'start': 6, 'end': 7, 'label': 'CARDINAL'}, ...","[{'start': 0, 'end': 25}]","[{'id': 0, 'start': 0, 'end': 5}, {'id': 1, 's...",{'incorrect_spans': []},spont,Conversation,Spontaneous speech
4,25/9:9:00 - 16:30,"[{'start': 12, 'end': 17, 'label': 'TIME'}]","[{'start': 0, 'end': 17}]","[{'id': 0, 'start': 0, 'end': 9}, {'id': 1, 's...",{'incorrect_spans': []},cc,Web,Common Crawl


In [10]:
test_labels = set()
for ents in test_df['ents']:
    for ent in ents:
        test_labels.add(ent['label'])
print(test_labels)
print(len(test_labels))

{'PRODUCT', 'PERSON', 'LANGUAGE', 'PERCENT', 'FACILITY', 'NORP', 'LOCATION', 'MONEY', 'LAW', 'GPE', 'EVENT', 'TIME', 'ORGANIZATION', 'CARDINAL', 'DATE', 'ORDINAL', 'QUANTITY', 'WORK OF ART'}
18


In [93]:
# Assuming a predefined set of entity types
entity_types = list(train_labels)
entity_types.append('O')

# Set num_labels
num_labels = len(entity_types)

# Load pre-trained BERT model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('vesteinn/DanskBERT') 
model = AutoModel.from_pretrained('vesteinn/DanskBERT', num_labels=num_labels)

# Define batch_size
batch_size = 32  

# Define learning rate
learning_rate = 5e-5 

Some weights of XLMRobertaModel were not initialized from the model checkpoint at vesteinn/DanskBERT and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Formatting the data as a list of dictionaries.

In [94]:
text_ents_df = train_df[['text', 'ents']]
text_ents_df.iloc[0]['text']
n_train = len(text_ents_df)
train_data_formatted = []

for i in range(n_train):
    sample = text_ents_df.iloc[i]
    sample_dict = {'text': '', 'labels': {'entities': []}}
    sample_dict['text'] += sample['text']

    for ent in sample['ents']:
        start = ent['start']
        end = ent['end']
        label = ent['label']
        sample_dict['labels']['entities'].append((start, end, label))
        
    train_data_formatted.append(sample_dict)
print(train_data_formatted)

[{'text': 'Danmark skal bygges af maskinernes forsigtige refleksioner .', 'labels': {'entities': [(0, 7, 'GPE')]}}, {'text': "Hvil i Cap Dag - 2011 år Cap D'Agde Privat billede af nudister på ferie Familie", 'labels': {'entities': [(17, 21, 'DATE'), (25, 35, 'GPE')]}}, {'text': 'Måtte det nye år gøre dem mere fortrolige med livet i Danmark og hjælpe dem, de ældre såvel som den opvoksende generation, til at finde sig til rette i det danske samfund.', 'labels': {'entities': [(6, 16, 'DATE'), (54, 61, 'GPE'), (156, 162, 'NORP')]}}, {'text': 'Vi har også 360 graders Private Banking rådgivning til formuende kunder med specialister indenfor skat og investering', 'labels': {'entities': [(12, 23, 'QUANTITY')]}}, {'text': '| Nyhedsmails', 'labels': {'entities': []}}, {'text': 'jeg synes det er hammer atrengt af vores lærer at skille 3 bedstevenner ad på den måde!', 'labels': {'entities': [(57, 58, 'CARDINAL')]}}, {'text': 'En Hellig Paaskemorgen,\n', 'labels': {'entities': [(10, 22, 'TIME')]}},

In [95]:
text = train_data_formatted[1]['text']
print(text)
ents = train_data_formatted[1]['labels']
print(ents)
tok_sent = tokenizer(text, return_tensors='pt')
with torch.no_grad():
    outputs = model(**tok_sent)

# Get the embeddings from the model outputs
embeddings = outputs.last_hidden_state
print(embeddings.shape)
tokens = tok_sent.tokens()
print(tokens)
print(len(tokens))
print(text[25:35])
tok_sent.input_ids

Hvil i Cap Dag - 2011 år Cap D'Agde Privat billede af nudister på ferie Familie
{'entities': [(17, 21, 'DATE'), (25, 35, 'GPE')]}
torch.Size([1, 22, 768])
['<s>', '▁Hvil', '▁i', '▁Cap', '▁Dag', '▁-', '▁2011', '▁år', '▁Cap', '▁D', "'", 'Ag', 'de', '▁Privat', '▁billede', '▁af', '▁nud', 'ister', '▁på', '▁ferie', '▁Familie', '</s>']
22
Cap D'Agde


tensor([[    0,  2574,    19,  8989,  3209,    57,  4683,   269,  8989,    49,
         49652, 33706,    35,  9237,  1788,    59, 42614,   757,    65,  3890,
         12814,     2]])

In [106]:
def tokenization_output(text):
    # tokenize text and get ids of tokens
    tok_sent = tokenizer(text, return_tensors='pt')
    token_ids = tok_sent.input_ids

    # list of tokens
    tokens = tok_sent.tokens()

    # boolean list of tokens included
    bool_list = []
    special_tokens = {'<s>', '</s>'}
    for token in tokens:
        first_char = token[0]
        if token in special_tokens or first_char != '▁':
            bool_list.append(False)
        else:
            bool_list.append(True)
    
    return token_ids, bool_list

In [107]:
tokenization_output('hej med dig')

(tensor([[   0, 3745,   69,  214,    2]]), [False, True, True, True, False])

In [84]:
def tokenize_and_format_data(dataset, tokenizer):
    tokenized_data = []
    for sample in dataset:
        text = sample["text"]
        entities = sample["labels"]["entities"]

        # Tokenize the input text using the BERT tokenizer
        tokens =  tokenizer.tokenize(tokenizer.decode(tokenizer.encode(text)))

        # Initialize labels for each token as 'O' (Outside)
        labels = ['O'] * len(tokens)
        
        # Update labels for entity spans
        for start, end, entity_type in entities:

            # Tokenize the prefix to get the correct offset
            prefix_tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(text[:start])))
            start_token = len(prefix_tokens)

            # Tokenize the entity to get its length
            entity_tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(text[start:end])))
            end_token = start_token + len(entity_tokens) - 1
            labels[start_token] = f"B-{entity_type}"
    
            for i in range(start_token + 1, end_token +1):
                labels[i] = f"I-{entity_type}"

        # Convert tokens and labels to input IDs and label IDs
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        label_ids = [entity_types.index(label) for label in labels]

        # Pad input_ids and label_ids to the maximum sequence length
        padding_length = tokenizer.model_max_length - len(input_ids)
        input_ids += [tokenizer.pad_token_id] * padding_length
        label_ids += [entity_types.index('O')] * padding_length
        tokenized_data.append({'input_ids': input_ids, 'labels': label_ids})

    # Convert tokenized data to PyTorch dataset
    dataset = TensorDataset(torch.tensor([item['input_ids'] for item in tokenized_data]), 
                            torch.tensor([item['labels'] for item in tokenized_data]))
        
    return dataset

In [87]:
from transformers import BertTokenizer, BertForTokenClassification, AdamW
train_dataset_sample = [{"text": "John works at Google in New York.", "labels": {"entities": [(0, 4, "PERSON"), (17, 22, "ORG"), (26, 34, "GPE")]}},
                        {"text": "Apple Inc. is a technology company.", "labels": {"entities": [(0, 10, "ORG")]}}]
entity_types = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [88]:
# Prepare data for fine-tuning
train_data = tokenize_and_format_data(train_dataset_sample, tokenizer) 
train_dataloader = DataLoader(train_data, batch_size=batch_size)

# Fine-tune the model
optimizer = AdamW(model.parameters(), lr=learning_rate) 
num_epochs = 15  

for epoch in range(num_epochs):
    model.train()
    for batch in tqdm(train_dataloader, desc="Training"):
        inputs, labels = batch
        # Unpack the tuple
        outputs = model(inputs, labels=labels)
        loss =  outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

# Save the fine-tuned model for later use model.save_pretrained('fine_tuned_ner_model')

IndexError: list assignment index out of range

In [17]:
tokens = [['Mexico', 'ligger', 'i', 'Spanien'], ['Se', 'Venedig', 'og', 'dø', 'af', 'Lone', 'Kellerman']]
tags = [['B-GPE', 'O', 'O', 'B-GPE'], ['O', 'B-GPE', 'O', 'O', 'O', 'B-PER', 'I-PER']]
for i, pred in enumerate(zip(tokens,tags)):
    print(pred)
    # print(tag)
    print(i)
for token, tag in tokens, tags:
    print(token)
    print(tag)

(['Mexico', 'ligger', 'i', 'Spanien'], ['B-GPE', 'O', 'O', 'B-GPE'])
0
(['Se', 'Venedig', 'og', 'dø', 'af', 'Lone', 'Kellerman'], ['O', 'B-GPE', 'O', 'O', 'O', 'B-PER', 'I-PER'])
1
['Mexico', 'ligger', 'i', 'Spanien']
['Se', 'Venedig', 'og', 'dø', 'af', 'Lone', 'Kellerman']
['B-GPE', 'O', 'O', 'B-GPE']
['O', 'B-GPE', 'O', 'O', 'O', 'B-PER', 'I-PER']


In [36]:
# function to output file as iob2 file
def make_output_file(tokens, tags, output_file_name):
    preds = zip(tokens, tags)
    with open(output_file_name, 'w', encoding='UTF-8') as f:
        for token_list, tag_list in preds:
            for i, line in enumerate(zip(token_list, tag_list)):
                token, tag = line
                f.write(str(i+1) + "\t" + token + "\t" + tag + "\n")
            f.write("\n")