# Fine-tuning BERT for the NER problem

In [1]:
! pip install datasets transformers seqeval

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/542.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━[0m [32m358.4/542.0 kB[0m [31m10.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.8 

In [2]:
# Upgrade accelerate and transformers to their latest releases.
# transformers is already installed by the first cell, so the second line only upgrades it.
# NOTE(review): versions are unpinned; the last recorded run resolved accelerate 0.29.3.
!pip install accelerate -U
!pip install -U transformers

Collecting accelerate
  Downloading accelerate-0.29.3-py3-none-any.whl (297 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/297.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m297.0/297.6 kB[0m [31m9.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.

## Load data

In [3]:
from zipfile import ZipFile
import pandas as pd

In [4]:
# Unpack only the three dataset splits from the archive into ../data
split_files = ['dev.jsonl', 'test.jsonl', 'train.jsonl']
with ZipFile("../data/public_dat.zip", "r") as archive:
    archive.extractall(path='../data', members=split_files)

In [5]:
# Load the training split; every line of the JSONL file is one record.
# NOTE(review): the archive is extracted to '../data' in the previous cell but read
# from './data' here — confirm both paths are valid for the working directory.
train = pd.read_json('./data/train.jsonl', lines=True)

train.head()

Unnamed: 0,ners,sentences,id
0,"[[0, 5, CITY], [16, 23, PERSON], [34, 41, PERS...",Бостон взорвали Тамерлан и Джохар Царнаевы из ...,0
1,"[[21, 28, PROFESSION], [53, 67, ORGANIZATION],...",Умер избитый до комы гитарист и сооснователь г...,1
2,"[[0, 4, PERSON], [37, 42, COUNTRY], [47, 76, O...",Путин подписал распоряжение о выходе России из...,2
3,"[[0, 11, PERSON], [36, 47, PROFESSION], [49, 6...",Бенедикт XVI носил кардиостимулятор\nПапа Римс...,3
4,"[[0, 4, PERSON], [17, 29, ORGANIZATION], [48, ...",Обама назначит в Верховный суд латиноамериканк...,4


## Preprocessing data

In [6]:
import re


def _word_spans(text):
    """Return [start, inclusive end, word] for every run of word characters in text."""
    return [[hit.start(), hit.end() - 1, hit.group()] for hit in re.finditer(r'\w+', text)]


# Character span (start offset and inclusive end offset) of every word in the sentence
train['sentences_ner'] = train['sentences'].apply(_word_spans)
# Tokenization: the matched words themselves, in sentence order
train['tokens'] = train['sentences_ner'].apply(lambda spans: [span[2] for span in spans])

train.head()

Unnamed: 0,ners,sentences,id,sentences_ner,tokens
0,"[[0, 5, CITY], [16, 23, PERSON], [34, 41, PERS...",Бостон взорвали Тамерлан и Джохар Царнаевы из ...,0,"[[0, 5, Бостон], [7, 14, взорвали], [16, 23, Т...","[Бостон, взорвали, Тамерлан, и, Джохар, Царнае..."
1,"[[21, 28, PROFESSION], [53, 67, ORGANIZATION],...",Умер избитый до комы гитарист и сооснователь г...,1,"[[0, 3, Умер], [5, 11, избитый], [13, 14, до],...","[Умер, избитый, до, комы, гитарист, и, сооснов..."
2,"[[0, 4, PERSON], [37, 42, COUNTRY], [47, 76, O...",Путин подписал распоряжение о выходе России из...,2,"[[0, 4, Путин], [6, 13, подписал], [15, 26, ра...","[Путин, подписал, распоряжение, о, выходе, Рос..."
3,"[[0, 11, PERSON], [36, 47, PROFESSION], [49, 6...",Бенедикт XVI носил кардиостимулятор\nПапа Римс...,3,"[[0, 7, Бенедикт], [9, 11, XVI], [13, 17, носи...","[Бенедикт, XVI, носил, кардиостимулятор, Папа,..."
4,"[[0, 4, PERSON], [17, 29, ORGANIZATION], [48, ...",Обама назначит в Верховный суд латиноамериканк...,4,"[[0, 4, Обама], [6, 13, назначит], [15, 15, в]...","[Обама, назначит, в, Верховный, суд, латиноаме..."


A function that projects the character-level entity markup onto individual words. IOB2 notation is used so that several entities of the same type in a row remain distinguishable.

In [7]:
# Using IOB2 format
def extract_labels(item, ners):
    """Project character-level entity spans onto words as IOB2 tags.

    Args:
        item: list of words, each ``[start, end, text]`` with an inclusive
            ``end`` offset.
        ners: list of entities, each ``[start, end, type]`` with an inclusive
            ``end`` offset.

    Returns:
        A list with one tag per word: ``'B-<type>'`` for a word starting at an
        entity start, ``'I-<type>'`` for a word strictly after an entity start
        and not past its end (only while ``flag`` is set), otherwise ``'O'``.
        When several entities claim the same word, the tag of the entity with
        the largest ``end - start`` wins; ties go to the entity listed first.
    """
    sent_labels = []
    # True once some entity has been opened by a word that does not also close it.
    # NOTE(review): the flag is shared by all entities rather than tracked per
    # entity, and is only cleared when a word ends exactly on an entity end —
    # confirm this is intended for entities whose end is not a word boundary.
    flag = False
    for word in item:
        labels = []  # candidate [entity span length, tag] pairs for this word
        for ner in ners:
            span = ner[1] - ner[0]
            if flag:
                # Word closes the entity: it ends on the entity end and starts after its start
                if word[1] == ner[1] and word[0] > ner[0]:
                    flag = False
                    labels.append([span, 'I-' + ner[2]])
                # Word lies strictly between the first and the last word of the entity
                if word[1] < ner[1] and word[0] > ner[0]:
                    labels.append([span, 'I-' + ner[2]])
            # Word starts exactly where the entity starts
            if word[0] == ner[0]:
                # The entity continues past this word, so following words may be inside it
                if word[1] != ner[1]:
                    flag = True
                labels.append([span, 'B-' + ner[2]])

        if labels:
            # Keep the tag of the widest enclosing entity. max() returns the first
            # maximal candidate, so words whose candidates all have span length 0
            # still get a real tag instead of an empty string.
            sent_labels.append(max(labels, key=lambda candidate: candidate[0])[1])
        else:
            # A word without a tag
            sent_labels.append('O')
    return sent_labels


In [8]:
# One IOB2 tag sequence per training sentence, in row order
sents_labels = [
    extract_labels(word_spans, entities)
    for word_spans, entities in zip(train['sentences_ner'], train['ners'])
]

len(sents_labels)

519

In [9]:
# Store the word-level tag sequences next to their sentences
train['tags'] = sents_labels

train.head()

Unnamed: 0,ners,sentences,id,sentences_ner,tokens,tags
0,"[[0, 5, CITY], [16, 23, PERSON], [34, 41, PERS...",Бостон взорвали Тамерлан и Джохар Царнаевы из ...,0,"[[0, 5, Бостон], [7, 14, взорвали], [16, 23, Т...","[Бостон, взорвали, Тамерлан, и, Джохар, Царнае...","[B-CITY, B-EVENT, B-FAMILY, I-FAMILY, I-FAMILY..."
1,"[[21, 28, PROFESSION], [53, 67, ORGANIZATION],...",Умер избитый до комы гитарист и сооснователь г...,1,"[[0, 3, Умер], [5, 11, избитый], [13, 14, до],...","[Умер, избитый, до, комы, гитарист, и, сооснов...","[B-EVENT, O, O, B-DISEASE, B-PROFESSION, O, O,..."
2,"[[0, 4, PERSON], [37, 42, COUNTRY], [47, 76, O...",Путин подписал распоряжение о выходе России из...,2,"[[0, 4, Путин], [6, 13, подписал], [15, 26, ра...","[Путин, подписал, распоряжение, о, выходе, Рос...","[B-PERSON, B-EVENT, I-EVENT, O, B-EVENT, I-EVE..."
3,"[[0, 11, PERSON], [36, 47, PROFESSION], [49, 6...",Бенедикт XVI носил кардиостимулятор\nПапа Римс...,3,"[[0, 7, Бенедикт], [9, 11, XVI], [13, 17, носи...","[Бенедикт, XVI, носил, кардиостимулятор, Папа,...","[B-PERSON, I-PERSON, O, O, B-PROFESSION, I-PRO..."
4,"[[0, 4, PERSON], [17, 29, ORGANIZATION], [48, ...",Обама назначит в Верховный суд латиноамериканк...,4,"[[0, 4, Обама], [6, 13, назначит], [15, 15, в]...","[Обама, назначит, в, Верховный, суд, латиноаме...","[B-PERSON, B-EVENT, O, B-ORGANIZATION, I-ORGAN..."


In [10]:
# Keep only the columns needed for training: word tokens and their tags.
# Note: `ner_train` is reassigned later by the train/test split cell.
ner_train = train[['tokens', 'tags']]
ner_train.head()

Unnamed: 0,tokens,tags
0,"[Бостон, взорвали, Тамерлан, и, Джохар, Царнае...","[B-CITY, B-EVENT, B-FAMILY, I-FAMILY, I-FAMILY..."
1,"[Умер, избитый, до, комы, гитарист, и, сооснов...","[B-EVENT, O, O, B-DISEASE, B-PROFESSION, O, O,..."
2,"[Путин, подписал, распоряжение, о, выходе, Рос...","[B-PERSON, B-EVENT, I-EVENT, O, B-EVENT, I-EVE..."
3,"[Бенедикт, XVI, носил, кардиостимулятор, Папа,...","[B-PERSON, I-PERSON, O, O, B-PROFESSION, I-PRO..."
4,"[Обама, назначит, в, Верховный, суд, латиноаме...","[B-PERSON, B-EVENT, O, B-ORGANIZATION, I-ORGAN..."


 BERT for Russian: rubert-tiny2 (https://huggingface.co/cointegrated/rubert-tiny2)

In [11]:
# Hugging Face Hub checkpoint used for both the tokenizer and the model
model_checkpoint = "cointegrated/rubert-tiny2"
# Per-device batch size used for both training and evaluation
batch_size = 16

In [12]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the chosen checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/401 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.74M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [13]:
from datasets import DatasetDict, Dataset

In [14]:
from sklearn.model_selection import train_test_split
# Full tokens/tags frame; the name `ner_data` is later rebound to a DatasetDict
ner_data = train[['tokens', 'tags']]
# Hold out 20% of the sentences for evaluation; fixed random_state keeps the split repeatable
ner_train, ner_test = train_test_split(ner_data, test_size=0.2, random_state=1)

In [15]:
# Collect every distinct tag that occurs in the data, sorted, with 'O' moved to index 0
unique_tags = set()
for sentence_tags in ner_data['tags']:
    unique_tags.update(sentence_tags)
label_list = sorted(unique_tags - {'O'})
if 'O' in unique_tags:
    label_list = ['O'] + label_list
len(label_list), label_list[:5]

(59, ['O', 'B-AGE', 'B-AWARD', 'B-CITY', 'B-COUNTRY'])

In [16]:
# Load to Dataset: wrap the pandas train/test frames in a DatasetDict.
# The pandas index is carried over as the '__index_level_0__' column (see the
# printed features); the next cell removes it.
# Note: this rebinds `ner_data`, which previously held a DataFrame.
ner_data = DatasetDict({
    'train': Dataset.from_pandas(ner_train[['tokens', 'tags']]),
    'test': Dataset.from_pandas(ner_test[['tokens', 'tags']])
})
ner_data

DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags', '__index_level_0__'],
        num_rows: 415
    })
    test: Dataset({
        features: ['tokens', 'tags', '__index_level_0__'],
        num_rows: 104
    })
})

In [17]:
# Drop the leftover pandas index column so only 'tokens' and 'tags' remain
ner_data = ner_data.remove_columns(["__index_level_0__"])

ner_data

DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 415
    })
    test: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 104
    })
})

Function that tokenizes the words and aligns the labels with the resulting sub-word tokens

In [18]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and attach one label id per produced token.

    Args:
        examples: batch with a "tokens" entry (word lists) and a 'tags' entry
            (tag lists, one tag per word).

    Returns:
        The tokenizer output with an added "labels" entry. Tokens that map to
        no word (word id None) get -100; every other token, including each
        continuation piece of a split word, gets the id of its word's tag in
        `label_list`.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def _to_id(tag):
        # String tags become their position in label_list; -100 markers pass through
        return label_list.index(tag) if isinstance(tag, str) else tag

    aligned = []
    for row, word_tags in enumerate(examples['tags']):
        # word_ids maps each produced token back to the index of its source word
        raw = [
            -100 if word_idx is None else word_tags[word_idx]
            for word_idx in encoded.word_ids(batch_index=row)
        ]
        aligned.append([_to_id(tag) for tag in raw])

    encoded["labels"] = aligned
    return encoded

In [19]:
# Apply tokenization and label alignment to both splits, one batch of examples at a time
tokenized_datasets = ner_data.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/415 [00:00<?, ? examples/s]

Map:   0%|          | 0/104 [00:00<?, ? examples/s]

In [20]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 415
    })
    test: Dataset({
        features: ['tokens', 'tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 104
    })
})

## Model

In [21]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

# Token-classification model on top of the checkpoint, with one output class per tag
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))
# Store the id <-> tag mappings in the config so they travel with the model
model.config.id2label = dict(enumerate(label_list))
model.config.label2id = {v: k for k, v in model.config.id2label.items()}

config.json:   0%|          | 0.00/693 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/118M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
from transformers import TrainingArguments

# Configuration of the first training run.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
args = TrainingArguments(
    "ner",  # output directory
    evaluation_strategy = "epoch",  # evaluate after every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=100,
    weight_decay=0.01,
    save_strategy='no',  # no intermediate checkpoints are written
    report_to='none',  # no external experiment logger
)

In [23]:
from transformers import DataCollatorForTokenClassification

# Collator built from the tokenizer; passed to the Trainer to assemble batches
data_collator = DataCollatorForTokenClassification(tokenizer)

In [24]:
from datasets import load_metric

# seqeval is loaded from a remote builder script. Opting in explicitly avoids the
# warning/prompt and is what `datasets` announces as mandatory from its next major release.
metric = load_metric("seqeval", trust_remote_code=True)

  metric = load_metric("seqeval")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

In [25]:
import numpy as np

def compute_metrics(p):
    """Score one evaluation pass with seqeval.

    Args:
        p: pair ``(predictions, labels)``; predictions are reduced with argmax
            over axis 2, labels hold tag ids with -100 at positions to ignore.

    Returns:
        Dict with the overall "precision", "recall", "f1" and "accuracy".
    """
    scores_per_tag, gold_ids = p
    predicted_ids = np.argmax(scores_per_tag, axis=2)

    pred_tags, gold_tags = [], []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions that carry a real label
        kept = [(pred, gold) for pred, gold in zip(pred_row, gold_row) if gold != -100]
        pred_tags.append([label_list[pred] for pred, _ in kept])
        gold_tags.append([label_list[gold] for _, gold in kept])

    scores = metric.compute(predictions=pred_tags, references=gold_tags, zero_division=0)
    return dict(
        precision=scores["overall_precision"],
        recall=scores["overall_recall"],
        f1=scores["overall_f1"],
        accuracy=scores["overall_accuracy"],
    )

In [26]:
# Trainer for the first run: trains on the 'train' split, evaluates on the 'test' split
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [27]:
# Run the first training pass (100 epochs per `args`)
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,2.989009,0.0,0.0,0.0,0.544049
2,No log,2.299865,0.0,0.0,0.0,0.544049
3,No log,1.950454,0.14918,0.013998,0.025594,0.566967
4,No log,1.725463,0.150794,0.05553,0.081169,0.628108
5,No log,1.585077,0.148498,0.079834,0.103842,0.637585
6,No log,1.48973,0.181141,0.101061,0.129739,0.651025
7,No log,1.416975,0.216963,0.135364,0.166714,0.665281
8,No log,1.360157,0.245538,0.165052,0.197406,0.678644
9,No log,1.313528,0.265184,0.198123,0.2268,0.690336
10,No log,1.272818,0.294188,0.228119,0.256975,0.70195


TrainOutput(global_step=2600, training_loss=0.7166972600496733, metrics={'train_runtime': 536.8024, 'train_samples_per_second': 77.31, 'train_steps_per_second': 4.843, 'total_flos': 304518798454962.0, 'train_loss': 0.7166972600496733, 'epoch': 100.0})

In [35]:
# Configuration of the second run: same settings as `args` except for 20 epochs
args1 = TrainingArguments(
    "ner",  # output directory
    evaluation_strategy = "epoch",  # evaluate after every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=20,
    weight_decay=0.01,
    save_strategy='no',  # no intermediate checkpoints are written
    report_to='none',  # no external experiment logger
)

In [36]:
# Second Trainer. It receives the same `model` object as the first Trainer, so
# training continues from whatever weights that object holds at this point.
trainer1 = Trainer(
    model,
    args1,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [37]:
# Run the second training pass (20 epochs per `args1`)
trainer1.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.790424,0.500426,0.542686,0.5207,0.799953
2,No log,0.788612,0.507962,0.544685,0.525683,0.801818
3,No log,0.791119,0.508036,0.549454,0.527934,0.801934
4,No log,0.792349,0.508499,0.547608,0.527329,0.80174
5,No log,0.790953,0.513576,0.54407,0.528384,0.803333
6,No log,0.791076,0.510957,0.545147,0.527499,0.802595
7,No log,0.792644,0.510084,0.548531,0.52861,0.802401
8,No log,0.790778,0.514689,0.549762,0.531647,0.804032
9,No log,0.7957,0.512348,0.552069,0.531467,0.802478
10,No log,0.792792,0.51529,0.554684,0.534262,0.804382


TrainOutput(global_step=520, training_loss=0.30732995179983286, metrics={'train_runtime': 111.5267, 'train_samples_per_second': 74.422, 'train_steps_per_second': 4.663, 'total_flos': 60800712002706.0, 'train_loss': 0.30732995179983286, 'epoch': 20.0})

## Save model

In [44]:
# Save model and tokenizer side by side; despite the '.bin' suffix, 'ner_bert.bin'
# is used as a directory (see the returned file paths).
model.save_pretrained('ner_bert.bin')
tokenizer.save_pretrained('ner_bert.bin')

('ner_bert.bin/tokenizer_config.json',
 'ner_bert.bin/special_tokens_map.json',
 'ner_bert.bin/vocab.txt',
 'ner_bert.bin/added_tokens.json',
 'ner_bert.bin/tokenizer.json')

## Prediction for test data

In [38]:
from transformers import pipeline

In [39]:
# Build the NER pipeline on whichever device the fine-tuned model already lives on,
# instead of hard-coding GPU 0 (which fails on a CPU-only runtime).
# aggregation_strategy='average' merges token predictions into entity groups.
pipe = pipeline(model=model, tokenizer=tokenizer, task='ner', aggregation_strategy='average', device=model.device)

In [40]:
# Load the unlabeled test split; note the text column is spelled 'senences' in this file.
# NOTE(review): read from './data' although the archive was extracted to '../data' — confirm the path.
test = pd.read_json('./data/test.jsonl', lines=True)

test.head()

Unnamed: 0,senences,id
0,Владелец «Бирмингема» получил шесть лет тюрьмы...,584
1,Акция протеста на Майдане Независимости объявл...,585
2,Фольксваген может перейти под контроль Порше \...,586
3,В Москве покажут фильмы Чарли Чаплина с живой ...,587
4,Чулпан Хаматова сыграет главную роль в фильме ...,588


In [41]:
# Preview of the test sentences reduced to space-joined word tokens.
# The result is only displayed, not assigned; the prediction loop below runs on the raw sentences.
test['senences'].apply(lambda x: ' '.join([ele.group() for ele in re.finditer(r'\w+', x)]))

0     Владелец Бирмингема получил шесть лет тюрьмы м...
1     Акция протеста на Майдане Независимости объявл...
2     Фольксваген может перейти под контроль Порше З...
3     В Москве покажут фильмы Чарли Чаплина с живой ...
4     Чулпан Хаматова сыграет главную роль в фильме ...
                            ...                        
60    ОБСЕ назвала референдум о статусе Крыма незако...
61    Египетского студента могут выслать из страны з...
62    Геннадий Онищенко отправлен в отставку Геннади...
63    Племянник Алишера Усманова разбился в ДТП Вид ...
64    Владимир Булавин назначен на новую должность г...
Name: senences, Length: 65, dtype: object

In [42]:
# One record per test sentence: its id plus the predicted [start, inclusive end, type] spans
out = []
for sample_id, text in zip(test['id'], test['senences']):
    spans = [
        [int(entity['start']), int(entity['end']) - 1, entity['entity_group']]
        for entity in pipe(text)
    ]
    out.append({"id": int(sample_id), "ners": spans})

len(out)

65

## Write to JSONL

In [43]:
import json

with open('test.jsonl', 'w') as f:
    for item in out:
        json.dump(item, f, default=str)
        f.write('\n')

!zip text test.jsonl

  adding: test.jsonl (deflated 76%)
