<a href="https://colab.research.google.com/github/igoryan75/PI/blob/main/%D0%9A%D1%83%D1%80%D1%81%D0%BE%D0%B2%D0%B0%D1%8F.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
 !pip install naeval;
 !pip install nerus;
 !pip install corus;
 !pip install transformers;
 !pip install transformers[torch];
 !pip install seqeval;
 !pip install tokenizers;
 !pip install datasets;
 !pip install torch;
 !pip install accelerate -U;

In [None]:
!wget https://github.com/dialogue-evaluation/factRuEval-2016/archive/master.zip;
!unzip master.zip;
!rm master.zip;

In [None]:
# Show what is currently in the working directory.
!ls

sample_data


In [None]:
!rm -r pdn_model_ariesjr
!rm -r train_result_ariesjr
!rm -r factRuEval-2016-master

Обучение модели


In [None]:

import os
os.environ["WANDB_DISABLED"] = "true"

import torch
import numpy as np
from tqdm import tqdm
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer
)
from seqeval.metrics import classification_report
from seqeval.scheme import IOB2
from corus import load_factru


# Checkpoint name used for the tokenizer here and for the model further down.
model_name = "DeepPavlov/rubert-base-cased"
# Maximum number of tokens kept per document when tokenizing.
max_tok_len = 512

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Read the whole corpus into a list so it can be iterated with a progress bar.
records = list(load_factru("factRuEval-2016-master"))


# Entity types the model is trained to recognise.
types = ["SURNAME", "NAME"]

# Label inventory: "O" first, then a B-/I- pair for every entity type.
label_names = ["O", *(f"{prefix}-{t}" for t in types for prefix in ("B", "I"))]

# Lookup tables in both directions between label ids and label names.
id2label = dict(enumerate(label_names))
label2id = {label: idx for idx, label in id2label.items()}

# --- Tokenization and labelling ---
# For every record: tokenize the text, then give the FIRST word piece of each word
# an IOB2 label derived from the SURNAME / NAME spans of the record. Special tokens
# and the remaining pieces of a word get -100 so the loss ignores them.
result = {'input_ids': [], 'token_type_ids': [], 'attention_mask': [], 'labels': []}

for rec in tqdm(records):
    tokenized_inputs = tokenizer(rec.text, truncation=True, max_length=max_tok_len, padding=True)
    enc = tokenized_inputs.encodings[0]
    offsets = enc.offsets
    word_ids = enc.word_ids

    # Collect the relevant spans once per record (same order as the nested
    # objects -> spans iteration), with the type normalised to upper case.
    target_spans = [
        (span.start, span.stop, span.type.upper())
        for obj in rec.objects
        for span in obj.spans
        if span.type.upper() in types
    ]

    labels = []
    # Span matched by the previous labelled word, or None if that word was "O".
    # The B-/I- decision compares spans rather than raw type strings: the raw
    # span.type is not guaranteed to be upper-case like the tags built below, and
    # comparing the two directly meant "I-" could never be emitted.
    prev_span = None

    for i in range(len(offsets)):
        if word_ids[i] is None:
            # Special token: ignored by the loss and breaks any running entity.
            labels.append(-100)
            prev_span = None
            continue

        # Continuation piece of the same word: ignored, the running entity is kept.
        if i > 0 and word_ids[i-1] == word_ids[i]:
            labels.append(-100)
            continue

        tok_start, tok_end = offsets[i]

        # First span that overlaps the token's character range, if any.
        current_span = None
        for candidate in target_spans:
            span_start, span_stop, _ = candidate
            if span_start <= tok_start < span_stop or span_start < tok_end <= span_stop:
                current_span = candidate
                break

        if current_span is None:
            tag_found = "O"
        else:
            # "I-" only while the same span continues; a new span always opens with "B-".
            prefix = "I-" if current_span == prev_span else "B-"
            tag_found = prefix + current_span[2]

        labels.append(label2id.get(tag_found, 0))
        prev_span = current_span

    result['input_ids'].append(tokenized_inputs['input_ids'])
    result['token_type_ids'].append(tokenized_inputs['token_type_ids'])
    result['attention_mask'].append(tokenized_inputs['attention_mask'])
    result['labels'].append(labels)

# --- Train / validation / test split ---
ds = Dataset.from_dict(result)
gen = np.random.default_rng(42)

# First cut: 80% train, 20% held out. Second cut: the held-out part is halved.
# Both cuts draw from the same generator, in this order.
train_holdout = ds.train_test_split(test_size=0.2, generator=gen)
valid_test = train_holdout["test"].train_test_split(test_size=0.5, generator=gen)

dsd = DatasetDict({
    "train": train_holdout["train"],
    "test": valid_test["test"],
    "valid": valid_test["train"],
})


# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Token-classification model on top of the pretrained checkpoint, configured
# with the label tables defined above.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_names),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)
model = model.to(device)

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

def compute_metrics(p):
    """Compute entity-level precision / recall / F1 for an evaluation pass.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` holds per-token class
            scores (argmax is taken over the last axis); ``labels`` holds the gold
            label ids, with ``-100`` marking positions that must be ignored.

    Returns:
        dict: The "weighted avg" row of seqeval's classification report, without
        its "support" entry.
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=-1)

    y_true, y_pred = [], []
    for label_row, pred_row in zip(labels, predictions):
        # Keep only positions that carry a real gold label.
        kept = [(gold, guess) for gold, guess in zip(label_row, pred_row) if gold != -100]
        y_true.append([id2label[gold] for gold, _ in kept])
        y_pred.append([id2label[guess] for _, guess in kept])

    # seqeval only honours `scheme` when mode="strict" is given; without it the
    # IOB2 argument is silently ignored.
    res = classification_report(
        y_true=y_true,
        y_pred=y_pred,
        mode="strict",
        scheme=IOB2,
        output_dict=True,
    )["weighted avg"]
    res.pop("support", None)
    return res

# Training configuration: evaluate and checkpoint once per epoch, and reload the
# best checkpoint when training finishes.
training_args = TrainingArguments(
    output_dir="./train_result_name_surname",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=8,
    logging_steps=50,
    weight_decay=0.01,
    load_best_model_at_end=True,
    # The string "none" switches off all logging integrations. Passing the Python
    # value None does not: it selects the library default instead.
    report_to="none",
    save_safetensors=False
)

# Train on the "train" split and evaluate on "valid"; "test" stays untouched.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dsd["train"],
    eval_dataset=dsd["valid"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# --- Train the model and save it ---
model_dir = "./pdn_model_name_surname"

trainer.train()
trainer.save_model(model_dir)

print(f"Модель сохранена в {model_dir}")


Запуск распознавания ПДн

In [None]:
!wget https://raw.githubusercontent.com/igoryan75/PI/main/tests/_General.txt -P tests/
!wget https://raw.githubusercontent.com/igoryan75/PI/main/tests/CADnum.txt -P tests/
!wget https://raw.githubusercontent.com/igoryan75/PI/main/tests/CARD.txt -P tests/
!wget https://raw.githubusercontent.com/igoryan75/PI/main/tests/INN.txt -P tests/
!wget https://raw.githubusercontent.com/igoryan75/PI/main/tests/Married.txt -P tests/
!wget https://raw.githubusercontent.com/igoryan75/PI/main/tests/namesurname.txt -P tests/
!wget https://raw.githubusercontent.com/igoryan75/PI/main/tests/PASS.txt -P tests/
!wget https://raw.githubusercontent.com/igoryan75/PI/main/tests/Phone.txt -P tests/
!wget https://raw.githubusercontent.com/igoryan75/PI/main/tests/SNILS.txt -P tests/
!wget https://raw.githubusercontent.com/igoryan75/PI/main/tests/URL.txt -P tests/
!cd tests && ls;

In [None]:
import re
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
import os

# Directory with the sample documents to scan.
tests_dir = "tests"

# Directory the fine-tuned model is loaded from (local files only, see below).
model_dir = "./pdn_model_name_surname"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer of the base checkpoint, capped at 512 tokens per input.
tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")
tokenizer.model_max_length = 512

model = AutoModelForTokenClassification.from_pretrained(model_dir, local_files_only=True)
model = model.to(device)

# Label table stored in the model's config; used to decode predictions.
id2label = model.config.id2label


# ============================================================
#  ВАЛИДАЦИИ
# ============================================================

def check_luhn(number: str) -> bool:
    """Check a 16-digit card number against the Luhn checksum.

    Non-digit characters (spaces, dashes, ...) are ignored.

    Args:
        number: Card number as text.

    Returns:
        True when exactly 16 digits are present and their Luhn sum is divisible
        by 10, otherwise False.
    """
    digits = [int(ch) for ch in number if ch.isdigit()]
    if len(digits) != 16:
        return False

    parity = len(digits) % 2
    total = 0
    for position, value in enumerate(digits):
        if position % 2 != parity:
            # Digits at the other positions are added unchanged.
            total += value
            continue
        # Doubled positions: a two-digit product is reduced by 9.
        doubled = value * 2
        total += doubled - 9 if doubled > 9 else doubled

    return total % 10 == 0


def check_inn(inn: str) -> bool:
    """Check the control digit(s) of a 10- or 12-digit INN.

    Non-digit characters are stripped before checking.

    Args:
        inn: INN as text.

    Returns:
        True when the number has 10 digits and its last digit matches the weighted
        checksum, or has 12 digits and both trailing digits match theirs.
        False for any other length or on a checksum mismatch.
    """
    def control_digit(values, weights):
        # Weighted sum over the paired leading digits, reduced mod 11 and then mod 10.
        return sum(v * w for v, w in zip(values, weights)) % 11 % 10

    nums = [int(ch) for ch in re.sub(r'\D', '', inn)]

    if len(nums) == 10:
        return control_digit(nums, (2, 4, 10, 3, 5, 9, 4, 6, 8)) == nums[9]

    if len(nums) == 12:
        first = control_digit(nums, (7, 2, 4, 10, 3, 5, 9, 4, 6, 8, 0))
        second = control_digit(nums, (3, 7, 2, 4, 10, 3, 5, 9, 4, 6, 8))
        return first == nums[10] and second == nums[11]

    return False


def check_snils(snils: str) -> bool:
    """Check the two-digit control number of an 11-digit SNILS.

    Non-digit characters (dashes, spaces) are stripped before checking.

    Args:
        snils: SNILS as text, e.g. "112-233-445 95".

    Returns:
        True when exactly 11 digits are present and the last two match the
        control number computed from the first nine, otherwise False.
    """
    digits = re.sub(r'\D', '', snils)
    if len(digits) != 11:
        return False

    nums = list(map(int, digits))
    control = nums[-2] * 10 + nums[-1]
    # The first nine digits are weighted 9, 8, ..., 1.
    s = sum((9 - i) * nums[i] for i in range(9))

    # The expected control number is the sum modulo 101, where a remainder of 100
    # maps to 00. This also has to hold for sums above 101 (201, 302, 403):
    # comparing the raw remainder 100 with a two-digit control could never succeed.
    expected = s % 101
    if expected == 100:
        expected = 0
    return expected == control


# ============================================================
#  РЕГУЛЯРКИ ДЛЯ ПДН
# ============================================================

# Pattern sources in the order they are applied; each entry is (label, regex text).
_PATTERN_SOURCES = (
    ("CARD", r"\b(?:\d{4}[ -]?){3}\d{4}\b"),
    ("INN", r"\b\d{10}\b|\b\d{12}\b"),
    ("PASSPORT", r"\b\d{4}\s\d{6}\b"),
    ("SNILS", r"\b\d{3}-\d{3}-\d{3}\s?\d{2}\b"),
    ("URL", r"\b((?:https?://)?(?:www\.)?[a-zA-Z0-9-]+\.[a-zA-Z]{2,}(?:/[^\s]*)?)\b"),
    ("PHONE", r"(?:(?:\+7|8)[\s\-]?\(?\d{3}\)?[\s\-]?\d{3}[\s\-]?\d{2}[\s\-]?\d{2})"),
    ("MARRIAGE_CERT", r"\b\d{2}-\d{2}-\d{6}\b"),
    ("CAD_NUMBER", r"\b\d{2}:\d{2}:\d{6,7}:\d+\b"),
)

# Ordered list of (label, compiled pattern) pairs used to scan the texts.
regex_rules = [(label, re.compile(source)) for label, source in _PATTERN_SOURCES]


# ============================================================
#  РАСПОЗНАВАНИЕ МОДЕЛЬЮ (ФИ + ИМЯ)
# ============================================================

def _merge_word_predictions(tags, offsets, word_ids):
    """Turn per-token tags into a list of ``[type, start, end]`` character spans.

    Only the first word piece of each word decides the word's tag. Special tokens
    (word id ``None``) close any open span, and later pieces of the same word only
    extend the span opened by the first piece. A ``B-`` tag always opens a new
    span; an ``I-`` tag continues the open span when the type matches and opens a
    new one otherwise.

    Args:
        tags: Full tag per token, e.g. "B-NAME", "I-SURNAME" or "O".
        offsets: ``(start, end)`` character offsets per token.
        word_ids: Word index per token, ``None`` for special tokens.

    Returns:
        list: ``[type, start, end]`` entries for SURNAME / NAME, in text order.
    """
    spans = []
    current = None        # span being built: [type, start, end]
    previous_word = None  # word id of the previous non-special token

    for tag_full, (start, end), word_id in zip(tags, offsets, word_ids):
        if word_id is None:
            if current:
                spans.append(current)
                current = None
            previous_word = None
            continue

        if word_id == previous_word:
            # Continuation piece: the tag of the word's first piece applies.
            if current:
                current[2] = end
            continue
        previous_word = word_id

        prefix, _, entity = tag_full.partition("-")
        if entity not in ("SURNAME", "NAME"):
            if current:
                spans.append(current)
                current = None
            continue

        if current and prefix == "I" and current[0] == entity:
            current[2] = end
        else:
            if current:
                spans.append(current)
            current = [entity, start, end]

    if current:
        spans.append(current)
    return spans


def recognize_pdn_model(text):
    """Find SURNAME / NAME mentions in ``text`` with the fine-tuned model.

    The text is tokenized with truncation enabled, so anything beyond the
    tokenizer's length limit is not analysed.

    Args:
        text: Text to scan.

    Returns:
        list: ``[tag, start, end]`` entries, where ``tag`` is "SURNAME" or "NAME"
        and ``start`` / ``end`` are character offsets into ``text``.
    """
    toks = tokenizer(text, return_offsets_mapping=True, truncation=True)
    input_ids = torch.tensor([toks["input_ids"]]).to(device)

    with torch.no_grad():
        logits = model(input_ids).logits

    preds = torch.argmax(logits, dim=-1)[0].cpu().numpy()
    tags = [id2label.get(int(p), "O") for p in preds]

    # Special tokens and non-first word pieces were masked out (-100) when the
    # training labels were built, so their predictions are not used for decisions.
    return _merge_word_predictions(tags, toks["offset_mapping"], toks.word_ids())


# ============================================================
#  (regex + BERT)
# ============================================================

# Fail early when the directory with the sample documents is missing or is a file.
if not os.path.exists(tests_dir):
    raise FileNotFoundError(f"Папка {tests_dir} не найдена!")
if not os.path.isdir(tests_dir):
    raise NotADirectoryError(f"{tests_dir} существует, но это не папка!")

# Checksum validators by rule label; labels without an entry are accepted as found.
validators = {"CARD": check_luhn, "INN": check_inn, "SNILS": check_snils}

for filename in os.listdir(tests_dir):
    if not filename.lower().endswith(".txt"):
        continue

    file_path = os.path.join(tests_dir, filename)
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()

    print(f"\n===== Файл: {filename} =====")
    valid, invalid = [], []

    # Regex candidates, split by whether they pass their checksum (if they have one).
    for label, pattern in regex_rules:
        validator = validators.get(label)
        for m in pattern.finditer(text):
            hit = [label, *m.span()]
            passed = validator is None or validator(m.group())
            (valid if passed else invalid).append(hit)

    # Model findings (SURNAME, NAME) have no checksum and count as valid.
    valid.extend([tag, start, end] for tag, start, end in recognize_pdn_model(text))

    # Report
    print("\n--- ПДН прошедшие валидацию ---")
    for r in valid:
        print(r)

    print("\n--- ПДН не прошедшие валидацию ---")
    for r in invalid:
        print(r)



===== Файл: namesurname.txt =====

--- ПДН прошедшие валидацию ---
['NAME', 18, 22]
['SURNAME', 23, 29]
['NAME', 31, 36]
['NAME', 37, 45]
['SURNAME', 46, 55]
['NAME', 57, 62]
['SURNAME', 63, 71]
['SURNAME', 72, 80]

--- ПДН не прошедшие валидацию ---

===== Файл: INN.txt =====

--- ПДН прошедшие валидацию ---
['INN', 21, 33]
['MARRIAGE_CERT', 64, 76]

--- ПДН не прошедшие валидацию ---
['INN', 48, 58]

===== Файл: CARD.txt =====

--- ПДН прошедшие валидацию ---
['CARD', 36, 55]
['CARD', 71, 87]
['CARD', 106, 125]

--- ПДН не прошедшие валидацию ---
['CARD', 145, 164]

===== Файл: URL.txt =====

--- ПДН прошедшие валидацию ---
['URL', 22, 41]
['URL', 55, 74]
['URL', 86, 116]
['URL', 138, 153]
['URL', 154, 164]

--- ПДН не прошедшие валидацию ---

===== Файл: CADnum.txt =====

--- ПДН прошедшие валидацию ---
['CAD_NUMBER', 39, 56]

--- ПДН не прошедшие валидацию ---

===== Файл: _General.txt =====

--- ПДН прошедшие валидацию ---
['CARD', 155, 174]
['INN', 38, 50]
['PASSPORT', 60, 71]
[