# Библиотеки

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification,
    BertForTokenClassification,
    DataCollatorForLanguageModeling,
    AutoModelForMaskedLM
)
from datasets import Dataset
import evaluate
from corus import load_ne5
import os
import zipfile
import torch
import pandas as pd
import math
from corus import load_lenta
import seqeval

# Обучите NER-модель

## Загрузка и подготовка данных

In [4]:
!wget http://www.labinform.ru/pub/named_entities/collection5.zip
!unzip -oq collection5.zip -d Collection5
!rm collection5.zip

--2025-04-16 16:32:36--  http://www.labinform.ru/pub/named_entities/collection5.zip
Resolving www.labinform.ru (www.labinform.ru)... 95.181.230.181
Connecting to www.labinform.ru (www.labinform.ru)|95.181.230.181|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1899530 (1.8M) [application/zip]
Saving to: ‘collection5.zip’


2025-04-16 16:32:37 (9.83 MB/s) - ‘collection5.zip’ saved [1899530/1899530]



In [8]:
path = "Collection5/Collection5"
records = list(load_ne5(path))

In [9]:
print("Содержимое папки Collection5:", os.listdir("Collection5/Collection5"))

Содержимое папки Collection5: ['238.ann', '539.txt', '538.ann', '088.txt', '524.ann', '1016.ann', '15_01_13b.ann', '1145.ann', '384.ann', '04_02_13a_abdulatipov.ann', '1100.txt', '092.ann', '25_12_12e.ann', '140.ann', '284.ann', '30_11_12i.ann', '327.ann', '544.ann', '293.ann', '056.ann', '126.txt', '21_11_12j.txt', '334.ann', '121.txt', '220.ann', '1183.ann', '28_11_12f.txt', '1037.ann', '516.txt', '298.txt', '090.ann', '1148.txt', '264.ann', '347.txt', '09_01_13c.txt', 'last_39.txt', '368.ann', '138.ann', '268.ann', '548.ann', '185.txt', 'blokhin.ann', '04_12_12d.ann', 'last_02.txt', '388.txt', '166.txt', '112.ann', '1168.ann', '25_12_12c.txt', '20_11_12d.ann', 'last_40.ann', '09_01_13e.txt', '379.ann', '1146.txt', '397.txt', '395.txt', '437.txt', '512.ann', '1003.ann', '628.txt', '1112.ann', '013.txt', '15_01_13e.ann', '420.txt', '2041.ann', '536.txt', '362.txt', '1166.ann', '100.txt', '085.txt', '020.ann', '064.ann', '347.ann', '454.txt', '429.txt', '1037.txt', '146.txt', 'last_30_

In [10]:
first_record = records[0]
print("\nТекст документа:", first_record.text[:100], "...")
print("\nПримеры spans:", first_record.spans[:2])


Текст документа: Сын бывшего главы ФСБ занял руководящий пост в "Зарубежнефти"

Младший сын бывшего руководителя ФС ...

Примеры spans: [Ne5Span(index='T1', type='ORG', start=18, stop=21, text='ФСБ'), Ne5Span(index='T2', type='ORG', start=48, stop=60, text='Зарубежнефти')]


In [11]:
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")



In [12]:
def convert_to_iob(record):
    text = record.text
    spans = record.spans

    words = text.split()
    labels = ['O'] * len(words)

    current_pos = 0
    for i, word in enumerate(words):
        start = text.find(word, current_pos)
        end = start + len(word)
        current_pos = end

        for span in spans:
            word_in_entity = (
                start >= span.start and
                end <= span.stop
            )

            if word_in_entity:
                if i == 0 or labels[i-1] != f"I-{span.type}":
                    labels[i] = f"B-{span.type}"
                else:
                    labels[i] = f"I-{span.type}"
                break

    return {'tokens': words, 'ner_tags': labels}

In [13]:
data = [convert_to_iob(record) for record in records]

In [14]:
print("Пример разметки:")
for token, tag in zip(data[0]['tokens'][:10], data[0]['ner_tags'][:10]):
    print(f"{token:15} -> {tag}")

Пример разметки:
Сын             -> O
бывшего         -> O
главы           -> O
ФСБ             -> B-ORG
занял           -> O
руководящий     -> O
пост            -> O
в               -> O
"Зарубежнефти"  -> O
Младший         -> O


## Разделение данных на train/test

In [15]:
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42
)

## Токенизация и выравнивание меток

In [16]:
train_dataset = Dataset.from_dict({
    'tokens': [x['tokens'] for x in train_data],
    'ner_tags': [x['ner_tags'] for x in train_data]
})

test_dataset = Dataset.from_dict({
    'tokens': [x['tokens'] for x in test_data],
    'ner_tags': [x['ner_tags'] for x in test_data]
})

In [17]:
all_tags = sorted(list(set(tag for item in data for tag in item['ner_tags'])))
tags = ['O'] + [tag for tag in all_tags if tag != 'O']

In [18]:
from collections import Counter
tag_counts = Counter(tag for item in data for tag in item['ner_tags'])
print(tag_counts)

Counter({'O': 191125, 'B-PER': 13233, 'B-ORG': 8921, 'B-LOC': 3241, 'B-GEOPOLIT': 3070, 'B-MEDIA': 1201})


In [19]:
id2label = {i: tag for i, tag in enumerate(tags)}
label2id = {tag: i for i, tag in enumerate(tags)}

In [151]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        padding='max_length',
        max_length=128,
        is_split_into_words=True
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None

        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label2id[label[word_idx]])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx

        label_ids += [-100] * (128 - len(label_ids))

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [21]:
tokenized_train = train_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_test = test_dataset.map(tokenize_and_align_labels, batched=True)

Map: 100%|██████████| 800/800 [00:00<00:00, 1604.40 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 1727.92 examples/s]


## Настройка модели

In [22]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [18]:
model = AutoModelForTokenClassification.from_pretrained(
    "cointegrated/rubert-tiny2",
    num_labels=len(tags),
    id2label=id2label,
    label2id=label2id
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
model = model.to(device)

## Настройка обучения

In [23]:
seqeval = evaluate.load("seqeval")

In [21]:
training_args = TrainingArguments(
    output_dir="ner_model",
    eval_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    seed=42,
    report_to="none",
    fp16=True,
    gradient_accumulation_steps=2
)

In [24]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [tags[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    true_labels = [
        [tags[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(
        predictions=true_predictions,
        references=true_labels
    )

    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"]
    }

In [23]:
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer,
    padding=True,
    max_length=128,
    label_pad_token_id=-100
)

In [24]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [25]:
sample = tokenized_train[0]
print("Длина входных ID:", len(sample['input_ids']))
print("Длина меток:", len(sample['labels']))
print("Максимальная длина в наборе:", max(len(x['input_ids']) for x in tokenized_train))

Длина входных ID: 128
Длина меток: 128
Максимальная длина в наборе: 128


## Обучение и оценка

In [26]:
print("Оценка до обучения:")
trainer.evaluate()

Оценка до обучения:




{'eval_loss': 1.7781246900558472,
 'eval_model_preparation_time': 0.0016,
 'eval_precision': 0.03732342007434944,
 'eval_recall': 0.15677701436602123,
 'eval_f1': 0.060293057890944024,
 'eval_accuracy': 0.19735509285312325,
 'eval_runtime': 7.0795,
 'eval_samples_per_second': 28.25,
 'eval_steps_per_second': 1.836}

In [27]:
trainer.train()

Epoch,Training Loss,Validation Loss,Model Preparation Time,Precision,Recall,F1,Accuracy
1,No log,0.666608,0.0016,0.0,0.0,0.0,0.819809
2,No log,0.468438,0.0016,0.677976,0.355715,0.466612,0.861958
3,No log,0.384012,0.0016,0.589799,0.422548,0.492358,0.867923
4,No log,0.330818,0.0016,0.637452,0.515615,0.570097,0.882217
5,No log,0.291994,0.0016,0.684383,0.651468,0.66752,0.902307
6,No log,0.264383,0.0016,0.708589,0.721424,0.714949,0.911986
7,No log,0.244883,0.0016,0.726806,0.751093,0.73875,0.917895
8,No log,0.235008,0.0016,0.732978,0.773267,0.752584,0.921384
9,No log,0.228919,0.0016,0.731516,0.787945,0.758683,0.922566
10,No log,0.226126,0.0016,0.737288,0.787945,0.761775,0.923635


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=250, training_loss=0.3942828369140625, metrics={'train_runtime': 59.4986, 'train_samples_per_second': 134.457, 'train_steps_per_second': 4.202, 'total_flos': 14156107776000.0, 'train_loss': 0.3942828369140625, 'epoch': 10.0})

In [28]:
print("\nОценка после обучения:")
trainer.evaluate()


Оценка после обучения:




{'eval_loss': 0.22612637281417847,
 'eval_model_preparation_time': 0.0016,
 'eval_precision': 0.7372881355932204,
 'eval_recall': 0.787945034353529,
 'eval_f1': 0.7617753623188406,
 'eval_accuracy': 0.9236353404614519,
 'eval_runtime': 0.283,
 'eval_samples_per_second': 706.795,
 'eval_steps_per_second': 45.942,
 'epoch': 10.0}

In [29]:
model.save_pretrained("ner_model")
tokenizer.save_pretrained("ner_model")

('ner_model/tokenizer_config.json',
 'ner_model/special_tokens_map.json',
 'ner_model/vocab.txt',
 'ner_model/added_tokens.json',
 'ner_model/tokenizer.json')

# Улучшение качества модели

## Дообучение в MLM-режиме

### Подготовка данных для MLM

In [30]:
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")

In [31]:
def tokenize_mlm(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        padding='max_length',
        max_length=128,
        return_special_tokens_mask=True
    )

In [33]:
# mlm_dataset = Dataset.from_dict({
#     'tokens': [x['tokens'] for x in train_data],
#     'ner_tags': [x['ner_tags'] for x in train_data]
# })

In [34]:
mlm_dataset = Dataset.from_dict({
    'text': [" ".join(x['tokens']) for x in train_data] 
})

In [35]:
tokenized_mlm = mlm_dataset.map(
    tokenize_mlm,
    batched=True,
    remove_columns=['text']
)

Map: 100%|██████████| 800/800 [00:00<00:00, 4351.63 examples/s]


In [36]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=0.15
)

### MLM-дообучение

In [37]:
mlm_model = AutoModelForMaskedLM.from_pretrained("cointegrated/rubert-tiny2")

In [38]:
mlm_dataset = tokenized_mlm.train_test_split(test_size=0.1, seed=42)

In [39]:
mlm_args = TrainingArguments(
    output_dir="mlm_model",
    num_train_epochs=4,
    per_device_train_batch_size=16,
    learning_rate=5e-5,
    warmup_ratio=0.1,
    eval_strategy="epoch",
    save_strategy="epoch",
    seed=42,
    report_to="none",
    fp16=True,
    remove_unused_columns=False
)

In [40]:
mlm_trainer = Trainer(
    model=mlm_model,
    args=mlm_args,
    train_dataset=mlm_dataset["train"],
    eval_dataset=mlm_dataset["test"],
    data_collator=data_collator
)

In [41]:
print("Оценка до обучения:")
mlm_trainer.evaluate()

Оценка до обучения:


{'eval_loss': 3.041212797164917,
 'eval_model_preparation_time': 0.0011,
 'eval_runtime': 0.4508,
 'eval_samples_per_second': 177.482,
 'eval_steps_per_second': 22.185}

In [42]:
mlm_trainer.train()

Epoch,Training Loss,Validation Loss,Model Preparation Time
1,No log,2.931897,0.0011
2,No log,2.876945,0.0011
3,No log,2.87749,0.0011
4,No log,2.851192,0.0011


TrainOutput(global_step=180, training_loss=3.148079766167535, metrics={'train_runtime': 19.3939, 'train_samples_per_second': 148.5, 'train_steps_per_second': 9.281, 'total_flos': 5494838722560.0, 'train_loss': 3.148079766167535, 'epoch': 4.0})

In [43]:
print("\nОценка после обучения:")
mlm_trainer.evaluate()


Оценка после обучения:


{'eval_loss': 2.9015727043151855,
 'eval_model_preparation_time': 0.0011,
 'eval_runtime': 0.2602,
 'eval_samples_per_second': 307.447,
 'eval_steps_per_second': 38.431,
 'epoch': 4.0}

In [44]:
mlm_model.save_pretrained("mlm_rubert_tiny2")

### Дообучение на NER с MLM-инициализации

In [45]:
model_ner_mlm = AutoModelForTokenClassification.from_pretrained(
    "mlm_rubert_tiny2",
    num_labels=len(tags),
    id2label=id2label,
    label2id=label2id
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at mlm_rubert_tiny2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [46]:
ner_data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer,
    padding=True,
    max_length=128,
    label_pad_token_id=-100
)

In [47]:
trainer_ner_mlm = Trainer(
    model=model_ner_mlm,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    data_collator=ner_data_collator,
    compute_metrics=compute_metrics
)

In [48]:
print("Оценка до обучения:")
trainer_ner_mlm.evaluate()

Оценка до обучения:




{'eval_loss': 1.985571026802063,
 'eval_model_preparation_time': 0.0008,
 'eval_precision': 0.02839910996603818,
 'eval_recall': 0.1514678326046221,
 'eval_f1': 0.047830374753451685,
 'eval_accuracy': 0.06117051209904333,
 'eval_runtime': 0.6777,
 'eval_samples_per_second': 295.117,
 'eval_steps_per_second': 19.183}

In [49]:
trainer_ner_mlm.train()

Epoch,Training Loss,Validation Loss,Model Preparation Time,Precision,Recall,F1,Accuracy
1,No log,0.703858,0.0008,0.0,0.0,0.0,0.819809
2,No log,0.49269,0.0008,0.708647,0.235478,0.353493,0.847552
3,No log,0.374012,0.0008,0.694764,0.414428,0.519171,0.87085
4,No log,0.310748,0.0008,0.686461,0.541537,0.605447,0.889195
5,No log,0.269025,0.0008,0.722167,0.678638,0.699726,0.91148
6,No log,0.238387,0.0008,0.752681,0.745159,0.748901,0.925492
7,No log,0.215889,0.0008,0.78445,0.775141,0.779768,0.935509
8,No log,0.201953,0.0008,0.799123,0.796377,0.797748,0.940968
9,No log,0.193629,0.0008,0.814826,0.806683,0.810734,0.945076
10,No log,0.19101,0.0008,0.81735,0.809182,0.813245,0.945751


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=250, training_loss=0.40011834716796874, metrics={'train_runtime': 27.0902, 'train_samples_per_second': 295.309, 'train_steps_per_second': 9.228, 'total_flos': 14156107776000.0, 'train_loss': 0.40011834716796874, 'epoch': 10.0})

In [50]:
print("\nОценка после обучения:")
trainer_ner_mlm.evaluate()


Оценка после обучения:




{'eval_loss': 0.1910102516412735,
 'eval_model_preparation_time': 0.0008,
 'eval_precision': 0.8173501577287067,
 'eval_recall': 0.8091817613991256,
 'eval_f1': 0.8132454488386692,
 'eval_accuracy': 0.9457512661789533,
 'eval_runtime': 0.2537,
 'eval_samples_per_second': 788.384,
 'eval_steps_per_second': 51.245,
 'epoch': 10.0}

In [51]:
model_ner_mlm.save_pretrained("ner_model_mlm")

tokenizer.save_pretrained("ner_model_mlm")

tokenized_mlm.save_to_disk("mlm_dataset")

Saving the dataset (1/1 shards): 100%|██████████| 800/800 [00:00<00:00, 35532.53 examples/s]


##  Генерация синтетических данных

### Загрузка новостного корпуса + Генерация разметки с помощью SOTA модели

deeppavlov/run.sh

```
python3.10 -m venv deeppavlov-venv
deeppavlov-venv/bin/pip install -q deeppavlov corus
file=lenta-ru-news.csv.gz
if [ -e "$file" ]; then
    echo "lenta-ru-news exists"
else 
    wget https://github.com/yutkin/Lenta.Ru-News-Dataset/releases/download/v1.0/lenta-ru-news.csv.gz
fi 
deeppavlov-venv/bin/python3 main.py
```


deeppavlov/main.py

```
from deeppavlov import build_model
from corus import load_lenta
import pickle
import logging
from tqdm import tqdm

logging.basicConfig(level=logging.INFO)

def main():
    logging.info('Loading lenta-ru-news...')
    records = load_lenta('lenta-ru-news.csv.gz')

    logging.info('Building ner_collection3_bert...')
    model = build_model('ner_collection3_bert', download=True, install=True)
    
    texts = [record.title for record in records][:10_000] # берем title так как надо ужаться в max_tokens_len = 512
    tokens, tags = [], []
    batch_size = 100
    for start, end in tqdm(zip(range(0, len(texts), batch_size), range(batch_size, len(texts), batch_size)), total=len(texts)//batch_size, unit='batches', desc='Predicting'): 
        batch = texts[start:end]
        try:
            batch_tokens, batch_tags = model(batch)
            tokens.extend(batch_tokens)
            tags.extend(batch_tags)
        except RuntimeError:
            logging.warning(f'RuntimeError at {start}')
            continue

    
    with open('ner.pkl', 'wb') as f:
        pickle.dump((tokens, tags), f)
    logging.info('Done!')

if __name__ == '__main__':
    main()
```

In [1]:
!cd deeppavlov && bash run.sh

[0mlenta-ru-news exists
INFO:root:Loading lenta-ru-news...
INFO:root:Building ner_collection3_bert...
Collecting protobuf<=3.20
  Downloading protobuf-3.20.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m0m
[?25hInstalling collected packages: protobuf
Successfully installed protobuf-3.20.0
[0mCollecting torch<1.14.0,>=1.6.0
  Downloading torch-1.13.1-cp310-cp310-manylinux1_x86_64.whl (887.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m887.5/887.5 MB[0m [31m547.0 kB/s[0m eta [36m0:00:00[0m00:01[0m00:28[0m
[?25hCollecting nvidia-cudnn-cu11==8.5.0.96
  Downloading nvidia_cudnn_cu11-8.5.0.96-2-py3-none-manylinux1_x86_64.whl (557.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m557.1/557.1 MB[0m [31m827.9 kB/s[0m eta [36m0:00:00[0m00:01[0m00:11[0m
[0mCollecting nvidia-cublas-cu11=

In [131]:
import pickle
from datasets import Dataset

In [132]:
ag_tokens, ag_tags = pickle.load(open('deeppavlov/ner.pkl', 'rb'))

In [136]:
synthetic_data = [
    {'tokens': tokens, 'ner_tags': tags} 
    for tokens, tags in zip(ag_tokens, ag_tags)
]

### Объединение данных

In [137]:
all_tags = set()

for item in train_data:
    all_tags.update(item['ner_tags'])

for item in synthetic_data:
    all_tags.update(item['ner_tags'])

In [138]:
item['ner_tags'] = [tag.split('-')[-1] if tag != 'O' else tag for tag in item['ner_tags']]

In [133]:
ag_tags_cnt = Counter()
for t in ag_tags:
    ag_tags_cnt.update(t)
_total = sum(ag_tags_cnt.values())
for tag, count in ag_tags_cnt.items():
    print(f'{count / _total:.4f} :{tag}')

0.8840 :O
0.0449 :S-LOC
0.0179 :S-ORG
0.0273 :S-PER
0.0027 :B-LOC
0.0028 :E-LOC
0.0033 :B-ORG
0.0005 :I-ORG
0.0033 :E-ORG
0.0064 :B-PER
0.0066 :E-PER
0.0001 :I-LOC
0.0002 :I-PER


In [None]:
deeppavlov2collections_ner_tags = {
    'O': 'O',
    'S-LOC': 'B-LOC', 'E-LOC': 'I-LOC', 'B-LOC': 'B-LOC', 'I-LOC': 'I-LOC',
    'S-PER': 'B-PER', 'E-PER': 'I-PER', 'B-PER': 'B-PER', 'I-PER': 'I-PER',
    'S-ORG': 'B-ORG', 'E-ORG': 'I-ORG', 'B-ORG': 'B-ORG', 'I-ORG': 'I-ORG',
    
    'B-GEOPOLIT': 'B-LOC',  
    'B-MEDIA': 'B-ORG'       
}

In [139]:
tags = ['O', 'B-PER', 'I-PER', 'B-LOC', 'I-LOC', 'B-ORG', 'I-ORG']
id2label = {i: tag for i, tag in enumerate(tags)}
label2id = {tag: i for i, tag in enumerate(tags)}

In [None]:
def convert_tags(tags):
    return [deeppavlov2collections_ner_tags.get(tag, 'O') for tag in tags]

for item in train_data:
    item['ner_tags'] = convert_tags(item['ner_tags'])
    
valid_synthetic = []
for item in synthetic_data:
    if len(item['tokens']) != len(item['ner_tags']):
        continue
    converted_tags = convert_tags(item['ner_tags'])
    if any(tag not in tags for tag in converted_tags):
        continue 
    valid_synthetic.append({'tokens': item['tokens'], 'ner_tags': converted_tags})

In [None]:
def print_tag_distribution(data, name):
    counter = Counter()
    for item in data:
        counter.update(item['ner_tags'])
    print(f"\nРаспределение тегов в {name}:")
    for tag, count in counter.most_common():
        print(f"{tag}: {count}")

print_tag_distribution(train_data, "исходных данных")
print_tag_distribution(valid_synthetic, "синтетических данных")


Распределение тегов в исходных данных:
O: 155099
B-PER: 10562
B-ORG: 8010
B-LOC: 5056

Распределение тегов в синтетических данных:
O: 77769
B-LOC: 4187
B-PER: 2966
B-ORG: 1866
I-PER: 603
I-ORG: 334
I-LOC: 248


In [None]:
combined_data = train_data + valid_synthetic

for item in combined_data:
    for tag in item['ner_tags']:
        assert tag in tags, f"Обнаружен недопустимый тег: {tag}"

combined_dataset = Dataset.from_dict({
    'tokens': [x['tokens'] for x in combined_data],
    'ner_tags': [x['ner_tags'] for x in combined_data]
})

In [152]:
tokenized_combined = combined_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    batch_size=32
)

Map: 100%|██████████| 10700/10700 [00:02<00:00, 4739.63 examples/s]


### Обучение на объединённых данных

In [153]:
model_ner_ontonotes_bert  = AutoModelForTokenClassification.from_pretrained(
    "cointegrated/rubert-tiny2",
    num_labels=len(tags),
    id2label=id2label,
    label2id=label2id
)

Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertForTokenClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are new

In [154]:
training_args = TrainingArguments(
    output_dir="ner_ontonotes_bert",
    evaluation_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    seed=42
)

In [155]:
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer,
    padding=True,
    max_length=128,
    label_pad_token_id=-100 
)

In [157]:
trainer = Trainer(
    model=model_ner_ontonotes_bert,
    args=training_args,
    train_dataset=tokenized_combined,
    eval_dataset=tokenized_test,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)


In [158]:
print("Оценка до обучения:")
trainer.evaluate()

Оценка до обучения:




{'eval_loss': 2.030855178833008,
 'eval_precision': 0.021872803935347858,
 'eval_recall': 0.09406875708349074,
 'eval_f1': 0.03549283728886038,
 'eval_accuracy': 0.0795160382667417,
 'eval_runtime': 0.6321,
 'eval_samples_per_second': 316.428,
 'eval_steps_per_second': 20.568}

In [159]:
trainer.train()



Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.3033,0.964029,0.009907,0.012845,0.011186,0.796961
2,0.11,1.112526,0.007005,0.009067,0.007904,0.801069
3,0.0612,1.21641,0.007476,0.009445,0.008346,0.806021
4,0.046,1.313577,0.006173,0.007556,0.006795,0.810355
5,0.0397,1.361821,0.006554,0.007934,0.007178,0.812155
6,0.0295,1.432135,0.005956,0.007178,0.00651,0.812606
7,0.0249,1.460983,0.005545,0.0068,0.006109,0.811255
8,0.0221,1.495569,0.005544,0.0068,0.006108,0.81148
9,0.0195,1.505103,0.005864,0.007178,0.006455,0.811874
10,0.0182,1.516502,0.005588,0.0068,0.006135,0.812324




TrainOutput(global_step=6690, training_loss=0.060446675618489584, metrics={'train_runtime': 221.4358, 'train_samples_per_second': 483.21, 'train_steps_per_second': 30.212, 'total_flos': 189363662592000.0, 'train_loss': 0.060446675618489584, 'epoch': 10.0})

In [160]:
print("\nОценка после обучения:")
trainer.evaluate()


Оценка после обучения:




{'eval_loss': 0.9640293717384338,
 'eval_precision': 0.009906759906759906,
 'eval_recall': 0.012844729882886286,
 'eval_f1': 0.011186050337226519,
 'eval_accuracy': 0.796961170512099,
 'eval_runtime': 0.3071,
 'eval_samples_per_second': 651.357,
 'eval_steps_per_second': 42.338,
 'epoch': 10.0}

In [161]:
trainer.save_model("ner_ontonotes_bert")
tokenizer.save_pretrained("ner_ontonotes_bert")

('ner_ontonotes_bert/tokenizer_config.json',
 'ner_ontonotes_bert/special_tokens_map.json',
 'ner_ontonotes_bert/vocab.txt',
 'ner_ontonotes_bert/added_tokens.json',
 'ner_ontonotes_bert/tokenizer.json')

# Сравнение результатов различных подходов

## Базовое дообучение на Collection5

In [None]:
print("\nОценка после обучения:")
trainer.evaluate()


Оценка после обучения:




{'eval_loss': 0.22612637281417847,
 'eval_model_preparation_time': 0.0016,
 'eval_precision': 0.7372881355932204,
 'eval_recall': 0.787945034353529,
 'eval_f1': 0.7617753623188406,
 'eval_accuracy': 0.9236353404614519,
 'eval_runtime': 0.283,
 'eval_samples_per_second': 706.795,
 'eval_steps_per_second': 45.942,
 'epoch': 10.0}

## MLM дообучение + NER дообучение

In [None]:
print("\nОценка после обучения:")
trainer_ner_mlm.evaluate()


Оценка после обучения:




{'eval_loss': 0.1910102516412735,
 'eval_model_preparation_time': 0.0008,
 'eval_precision': 0.8173501577287067,
 'eval_recall': 0.8091817613991256,
 'eval_f1': 0.8132454488386692,
 'eval_accuracy': 0.9457512661789533,
 'eval_runtime': 0.2537,
 'eval_samples_per_second': 788.384,
 'eval_steps_per_second': 51.245,
 'epoch': 10.0}

## Синтетические данные + NER дообучение

In [None]:
print("\nОценка после обучения:")
trainer.evaluate()


Оценка после обучения:




{'eval_loss': 0.9640293717384338,
 'eval_precision': 0.009906759906759906,
 'eval_recall': 0.012844729882886286,
 'eval_f1': 0.011186050337226519,
 'eval_accuracy': 0.796961170512099,
 'eval_runtime': 0.3071,
 'eval_samples_per_second': 651.357,
 'eval_steps_per_second': 42.338,
 'epoch': 10.0}

## Вывод

**Базовое дообучение модели на оригинальных данных Collection5** позволило достичь **F1-меры 76.2%**, что подтверждает способность rubert-tiny2 адаптироваться к задаче NER даже на небольшом датасете. При этом **precision (73.7%)** и **recall (78.8%)** находятся в приемлемом балансе, а **accuracy 92.4%** отражает хорошее общее соответствие предсказаний.

Использование предварительного **MLM-дообучения** значительно улучшило качество: **F1 вырос до 81.3%, точность до 81.7%, а accuracy достигла 94.6%**. Это свидетельствует, что адаптация языковой модели к домену через восстановление маскированных токенов помогает модели лучше улавливать контекстные зависимости в текстах, что критически важно для распознавания именованных сущностей. **Уменьшение loss с 0.226 до 0.191** подтверждает более успешную оптимизацию.

Однако **подход с синтетическими данными** показал катастрофически **низкое качество (F1=1.1%, precision=0.99%)**, что требует глубокого анализа. Возможные причины:

- Некорректная конвертация тегов из формата DeepPavlov в целевой BIO-схеме
- Преобладание шумных аннотаций в синтетических данных
- Конфликт между распределениями оригинальных и синтетических примеров
- Неучтённые особенности токенизации при объединении датасетов.

Комбинация MLM-дообучения с целевым NER-тюнингом демонстрирует наилучший результат, в то время как качество синтетических данных оказалось недостаточным для улучшения модели без дополнительной обработки.