In [1]:
import json
import os
import random
import shutil
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from seqeval.metrics import classification_report, f1_score as seqeval_f1
from simpletransformers.ner import NERArgs, NERModel
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm

tqdm.pandas()

In [2]:
DATA_ROOT = Path("data/Detailed-NER-Dataset-RU")
PICKLE_PATH = DATA_ROOT / "dataset" / "detailed-ner_dataset-ru.pickle"
UTILS_PATH = (DATA_ROOT / "utils").resolve()

if str(UTILS_PATH) not in sys.path:
    sys.path.append(str(UTILS_PATH))

from relabeling import biolu2bio

raw_df = pd.read_pickle(PICKLE_PATH)
print(f"Loaded {len(raw_df)} sentences from {PICKLE_PATH}")
print(f"Columns: {raw_df.columns.tolist()}")

raw_df["ner_tags"] = raw_df["ner_tags"].apply(biolu2bio)
unique_tags = sorted({tag for sequence in raw_df["ner_tags"] for tag in sequence})
print(f"({len(unique_tags)}): {unique_tags}")

raw_df.head(30)

Loaded 7532 sentences from data/Detailed-NER-Dataset-RU/dataset/detailed-ner_dataset-ru.pickle
Columns: ['tokens', 'ner_tags']
(19): ['B-CITY', 'B-COUNTRY', 'B-DISTRICT', 'B-FIRST_NAME', 'B-HOUSE', 'B-LAST_NAME', 'B-MIDDLE_NAME', 'B-REGION', 'B-STREET', 'I-CITY', 'I-COUNTRY', 'I-DISTRICT', 'I-FIRST_NAME', 'I-HOUSE', 'I-LAST_NAME', 'I-MIDDLE_NAME', 'I-REGION', 'I-STREET', 'O']


Unnamed: 0,tokens,ner_tags
0,"[dnsmasq, , 3753720, , , , , , , 1, , 0, Mar10...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,"[2022-09-09, 12:37:10]","[O, O]"
2,"[Повар, судовой]","[O, O]"
3,"[профилирование:, SafeData]","[O, O]"
4,"[Кораблестроение,, океанотехника, и, системоте...","[O, O, O, O, O, O, O]"
5,"[Художник, росписи, по, эмали]","[O, O, O, O]"
6,[https://www.youtube.com/watch?v=dQw4w9WgXcQ],[O]
7,"[Apr, 01,, 2016]","[O, O, O]"
8,"[дининфра:, ДинИнфра]","[O, O]"
9,"[CN=Самооценка, 2020,OU=DL,OU=SGROUPS,DC=int,D...","[O, O, O, O, O, O, O, O, O, B-CITY, O, O, O, O..."


In [3]:
required_cols = {"tokens", "ner_tags"}
missing_cols = required_cols - set(raw_df.columns)
if missing_cols:
    raise ValueError(f"{missing_cols}")

lengths_ok = raw_df.apply(lambda row: len(row["tokens"]) == len(row["ner_tags"]), axis=1)
if not lengths_ok.all():
    bad_idx = lengths_ok.index[~lengths_ok].tolist()
    raise ValueError(f"mismatch at rows: {bad_idx[:5]}")

print(raw_df["tokens"].map(len).describe())

def to_simple_format(df: pd.DataFrame) -> pd.DataFrame:
    records = []
    for sent_id, (tokens, labels) in enumerate(zip(df["tokens"], df["ner_tags"])):
        for token, label in zip(tokens, labels):
            records.append({"sentence_id": sent_id, "words": token, "labels": label})
    return pd.DataFrame(records)

simple_df = to_simple_format(raw_df)
display(simple_df.head())
print(f"SimpleTransformers-ready rows: {len(simple_df)}")

count    7532.000000
mean        7.024827
std        11.170213
min         1.000000
25%         2.000000
50%         3.000000
75%         8.000000
max       424.000000
Name: tokens, dtype: float64


Unnamed: 0,sentence_id,words,labels
0,0,dnsmasq,O
1,0,,O
2,0,3753720,O
3,0,,O
4,0,,O


SimpleTransformers-ready rows: 52911


In [4]:
def detokenize(tokens):
    text = " ".join(tokens)
    for punct in [" ,", " .", " :", " ;", " !", " ?"]:
        text = text.replace(punct, punct.strip())
    return text

entity_rows = raw_df[raw_df["ner_tags"].map(lambda tags: any(t != "O" for t in tags))]

In [5]:
MODEL_TYPE = "bert"
MODEL_NAME = "DeepPavlov/rubert-base-cased"
LABELS = sorted(simple_df["labels"].unique())
print(f"Labels ({len(LABELS)}): {LABELS}")

SAVE_DIR = Path("models/rubert-detailed-ner")
WEIGHT_FILES = ["pytorch_model.bin", "model.safetensors"]

def saved_model_exists(path: Path, filenames=WEIGHT_FILES) -> bool:
    return any((path / name).exists() for name in filenames)

model_args = NERArgs()
model_args.labels_list = LABELS
model_args.num_train_epochs = 8
model_args.learning_rate = 3e-5
model_args.train_batch_size = 16
model_args.eval_batch_size = 32
model_args.max_seq_length = 256
model_args.save_steps = 2000
model_args.overwrite_output_dir = True
model_args.output_dir = "outputs/rubert-base-cased"
model_args.best_model_dir = "outputs/rubert-base-cased/best"
model_args.use_multiprocessing = False

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

if saved_model_exists(SAVE_DIR):
    model_source = str(SAVE_DIR)
    MODEL_EXISTS = True
    print(f"model gotted")
else:
    model_source = MODEL_NAME
    MODEL_EXISTS = False
    print("training")
model = NERModel(
    MODEL_TYPE,
    model_source,
    args=model_args,
    labels=LABELS,
    use_cuda=torch.cuda.is_available(),
)
MODEL_WAS_TRAINED = False
print("DONE")

Labels (19): ['B-CITY', 'B-COUNTRY', 'B-DISTRICT', 'B-FIRST_NAME', 'B-HOUSE', 'B-LAST_NAME', 'B-MIDDLE_NAME', 'B-REGION', 'B-STREET', 'I-CITY', 'I-COUNTRY', 'I-DISTRICT', 'I-FIRST_NAME', 'I-HOUSE', 'I-LAST_NAME', 'I-MIDDLE_NAME', 'I-REGION', 'I-STREET', 'O']
Using device: cuda
model gotted
DONE


In [6]:
train_sentences, val_sentences = train_test_split(raw_df, test_size=0.15, shuffle=True, random_state=42)
print(f"Train sentences: {len(train_sentences)}, Val sentences: {len(val_sentences)}")

train_df = to_simple_format(train_sentences.reset_index(drop=True))
val_df = to_simple_format(val_sentences.reset_index(drop=True))
print(f"Train rows: {len(train_df)}, Val rows: {len(val_df)}")

def seqeval_metrics(true_labels, predictions):
    return {"f1": seqeval_f1(true_labels, predictions)}

if MODEL_EXISTS:
    print(f"{SAVE_DIR}. Skipping training.")
else:
    model.train_model(train_df, eval_data=val_df, f1=seqeval_metrics)
    MODEL_WAS_TRAINED = True
    MODEL_EXISTS = True

Train sentences: 6402, Val sentences: 1130
Train rows: 45360, Val rows: 7551
models/rubert-detailed-ner. Skipping training.


In [7]:
eval_result, model_outputs, predictions = model.eval_model(val_df)
print("Validation metrics:", eval_result)

def extract_label(token):
    if isinstance(token, dict):
        for key in ("label", "labels", "tag", "prediction", "entity"):
            if key in token:
                return token[key]
        return token.get("value", "O")
    if isinstance(token, (list, tuple)) and len(token) > 1:
        return token[1]
    return token

pred_label_sequences = [[extract_label(token) for token in sent] for sent in predictions]
true_label_sequences = [group["labels"].tolist() for _, group in val_df.groupby("sentence_id", sort=True)]
print(classification_report(true_label_sequences, pred_label_sequences, zero_division=0))


  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/36 [00:00<?, ?it/s]

  with amp.autocast():


Validation metrics: {'eval_loss': 0.07366522965498411, 'precision': np.float64(0.8768768768768769), 'recall': np.float64(0.8835098335854765), 'f1_score': np.float64(0.880180859080633)}
              precision    recall  f1-score   support

        CITY       0.97      0.95      0.96       110
     COUNTRY       0.93      0.98      0.95        93
    DISTRICT       0.92      0.86      0.89        14
  FIRST_NAME       0.81      0.83      0.82       139
       HOUSE       0.73      0.80      0.76        20
   LAST_NAME       0.84      0.84      0.84       174
 MIDDLE_NAME       0.97      0.89      0.93        36
      REGION       0.86      0.91      0.88        54
      STREET       0.90      0.90      0.90        21

   micro avg       0.88      0.88      0.88       661
   macro avg       0.88      0.88      0.88       661
weighted avg       0.88      0.88      0.88       661



In [8]:
SAVE_DIR = Path("models/rubert-detailed-ner")
if not MODEL_WAS_TRAINED:
    print(f"no train")
    SAVE_DIR.mkdir(parents=True, exist_ok=True)

    model.save_model(str(SAVE_DIR))
    model.tokenizer.save_pretrained(str(SAVE_DIR))

    source_dir = Path(model.args.output_dir)
    artifacts = [
        "pytorch_model.bin",
        "model.safetensors",
        "config.json",
        "tokenizer_config.json",
        "special_tokens_map.json",
        "vocab.txt",
    ]
    copied = []
    for name in artifacts:
        src = source_dir / name
        if src.exists():
            shutil.copy2(src, SAVE_DIR / name)
            copied.append(name)

    if not copied:
        raise FileNotFoundError(f" {source_dir}.")

    model.args.save(str(SAVE_DIR / "model_args.json"))
    LABELS_PATH = SAVE_DIR / "labels.json"
    LABELS_PATH.write_text(json.dumps(LABELS, ensure_ascii=False, indent=2))
    print(f"saved to {SAVE_DIR} (copied: {copied})")

no train
saved to models/rubert-detailed-ner (copied: ['model.safetensors', 'config.json', 'tokenizer_config.json', 'special_tokens_map.json', 'vocab.txt'])


In [9]:
LOAD_DIR = Path("models/rubert-detailed-ner")
weight_exists = any((LOAD_DIR / name).exists() for name in ["pytorch_model.bin", "model.safetensors"])
if not weight_exists:
    raise FileNotFoundError(f"{LOAD_DIR}")

loaded_model = NERModel(
    MODEL_TYPE,
    str(LOAD_DIR),
    args=model_args,
    labels=LABELS,
    use_cuda=torch.cuda.is_available(),
)
print("Model reloaded from disk.")

def normalize_prediction(entry):
    if isinstance(entry, dict):
        token = entry.get("word") or entry.get("token") or entry.get("text") or entry.get("sentence")
        for key in ("label", "tag", "prediction", "entity", "labels"):
            if key in entry:
                return token, entry[key]
        if token is None and len(entry) == 1:
            tok, label = next(iter(entry.items()))
            return tok, label
        return token, "O"
    if isinstance(entry, (list, tuple)) and len(entry) > 1:
        return entry[0], entry[1]
    return entry, "O"

default_sentences = [
"Алексей Петров прилетел в Казань по рабочим делам",
"Компания VK запустила новый сервис в Новосибирске",
"Премьер-министр провёл встречу в Доме правительства",
"Тинькофф объявил о партнёрстве с X5 Group",
"Екатерина созвонилась с Андреем и договорилась о встрече в Екатеринбурге",
]
custom_sentences = default_sentences
predictions, raw_outputs = loaded_model.predict(custom_sentences)
normalized_predictions = [[normalize_prediction(p) for p in sent] for sent in predictions]

import pandas as pd
pred_rows = []
for sentence, preds in zip(custom_sentences, normalized_predictions):
    tokens = [tok for tok, _ in preds]
    labels = [label for _, label in preds]
    print(tokens)
    print(labels)
    print()
    for tok, label in preds:
        pred_rows.append({"sentence": sentence, "token": tok, "pred_label": label})
pred_df = pd.DataFrame(pred_rows)
print(pred_df.head())

Model reloaded from disk.


  0%|          | 0/5 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

['Алексей', 'Петров', 'прилетел', 'в', 'Казань', 'по', 'рабочим', 'делам']
['B-FIRST_NAME', 'B-LAST_NAME', 'O', 'O', 'B-CITY', 'O', 'O', 'O']

['Компания', 'VK', 'запустила', 'новый', 'сервис', 'в', 'Новосибирске']
['O', 'O', 'O', 'O', 'O', 'O', 'B-CITY']

['Премьер-министр', 'провёл', 'встречу', 'в', 'Доме', 'правительства']
['O', 'O', 'O', 'O', 'O', 'O']

['Тинькофф', 'объявил', 'о', 'партнёрстве', 'с', 'X5', 'Group']
['O', 'O', 'O', 'O', 'O', 'O', 'O']

['Екатерина', 'созвонилась', 'с', 'Андреем', 'и', 'договорилась', 'о', 'встрече', 'в', 'Екатеринбурге']
['B-FIRST_NAME', 'O', 'O', 'B-FIRST_NAME', 'O', 'O', 'O', 'O', 'O', 'B-CITY']

                                            sentence     token    pred_label
0  Алексей Петров прилетел в Казань по рабочим делам   Алексей  B-FIRST_NAME
1  Алексей Петров прилетел в Казань по рабочим делам    Петров   B-LAST_NAME
2  Алексей Петров прилетел в Казань по рабочим делам  прилетел             O
3  Алексей Петров прилетел в Казань по рабочим д

  with amp.autocast():
