# Лабораторная 3: ELMo + BiLSTM NER (standalone)



## 0. Предварительные требования


## 1. Базовые пути и окружение


In [13]:
from pathlib import Path
import sys

# Locate the lab folder: train_elmo_lstm.py must sit either in the current
# working directory or in its NLP_LAB_3 subfolder.
LAB_DIR = Path.cwd()
if not (LAB_DIR / 'train_elmo_lstm.py').exists():
    candidate = LAB_DIR / 'NLP_LAB_3'
    if not (candidate / 'train_elmo_lstm.py').exists():
        raise RuntimeError('Не найден train_elmo_lstm.py. Запустите блокнот из директории NLP_LAB_3 или корня проекта.')
    # Started one level above the lab folder: switch to it and make it importable.
    LAB_DIR = candidate.resolve()
    sys.path.append(str(LAB_DIR))

# Data and models are siblings of the lab folder; reports live inside it.
DATA_DIR, MODELS_DIR = (
    (LAB_DIR / '..' / folder).resolve() for folder in ('data', 'models')
)
REPORTS_DIR = LAB_DIR / 'reports'
REPORTS_DIR.mkdir(exist_ok=True)

print(f"Рабочая папка: {LAB_DIR}")
print(f"Данные будут в: {DATA_DIR}")
print(f"Модели будут в: {MODELS_DIR}")
print(f"Результаты будем сохранять в: {REPORTS_DIR}")

Рабочая папка: /Users/il_dimas/Documents/Programming_projects/NLP_LABS/NLP_LAB_3
Данные будут в: /Users/il_dimas/Documents/Programming_projects/NLP_LABS/data
Модели будут в: /Users/il_dimas/Documents/Programming_projects/NLP_LABS/models
Результаты будем сохранять в: /Users/il_dimas/Documents/Programming_projects/NLP_LABS/NLP_LAB_3/reports


## 2. Установка зависимостей


In [None]:
# Use the %pip magic so the packages are installed into the environment of the
# running kernel (a bare `python3` on PATH may be a different interpreter, and
# `--user` is rejected inside a virtual environment).
%pip install -r requirements.txt

## 3. Загрузка датасета и предобученной ELMo-модели


In [34]:
import subprocess
import urllib.request
import zipfile

import shutil

# Clone the dataset repository once; later runs reuse the existing checkout.
DATA_REPO = DATA_DIR / 'Detailed-NER-Dataset-RU'
if not DATA_REPO.exists():
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    subprocess.run(['git', 'clone', 'https://github.com/AlexKly/Detailed-NER-Dataset-RU.git', str(DATA_REPO)], check=True)

MODEL_DIR = MODELS_DIR / 'ruwikiruscorpora_tokens_elmo_1024_2019'
MODEL_ZIP = MODELS_DIR / 'ruwikiruscorpora_tokens_elmo_1024_2019.zip'
MODEL_URL = 'https://vectors.nlpl.eu/repository/20/195.zip'

if not MODEL_DIR.exists():
    MODELS_DIR.mkdir(parents=True, exist_ok=True)
    # Extract into a scratch directory and rename it only after a complete
    # extraction, so an interrupted download/unzip never leaves a half-filled
    # MODEL_DIR that would make later runs skip the download.
    partial_dir = MODELS_DIR / (MODEL_DIR.name + '.partial')
    shutil.rmtree(partial_dir, ignore_errors=True)  # leftovers of a previous failed run
    try:
        urllib.request.urlretrieve(MODEL_URL, MODEL_ZIP)
        with zipfile.ZipFile(MODEL_ZIP, 'r') as zf:
            zf.extractall(partial_dir)
        partial_dir.rename(MODEL_DIR)
    finally:
        # Always drop the archive and any scratch data, on success and on failure.
        MODEL_ZIP.unlink(missing_ok=True)
        shutil.rmtree(partial_dir, ignore_errors=True)

print('DATA AND MODEL LOADED!')

DATA AND MODEL LOADED!


## 4. Импорт библиотек и конфигурация устройства

In [None]:
import json
import random
import time
from dataclasses import dataclass, asdict
from typing import Dict, Iterable, List, Sequence, Tuple

import numpy as np
import pandas as pd
import torch
from allennlp.modules.elmo import Elmo, batch_to_ids
from razdel import tokenize as razdel_tokenize
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
from torch.utils.data import DataLoader, Dataset
from tqdm.auto import tqdm

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
DEVICE = torch.device(_device_name)
print(DEVICE)

  from .autonotebook import tqdm as notebook_tqdm


An error occurred: module 'importlib.metadata' has no attribute 'packages_distributions'




Используемое устройство: cpu


## 5. Перевод тегов BIOLU → BIO

In [None]:
def biolu2bio(tags: Sequence[str]) -> List[str]:
    """Convert a BIOLU-tagged sequence to the BIO scheme.

    ``U-X`` becomes ``B-X`` and ``L-X`` becomes ``I-X``; every other tag
    (``B-X``, ``I-X``, ``O``, or anything without a ``-`` separator) is
    returned unchanged.

    Args:
        tags: Tags of one sentence in BIOLU notation.

    Returns:
        A new list with the same length as ``tags`` in BIO notation.
    """
    converted = []
    for tag in tags:
        # Split on the FIRST hyphen only, so labels that themselves contain
        # a hyphen (e.g. "U-GEO-LOC") keep their full name.
        prefix, separator, label = tag.partition('-')
        if separator and prefix == 'U':
            converted.append(f'B-{label}')
        elif separator and prefix == 'L':
            converted.append(f'I-{label}')
        else:
            # Includes malformed bare "U"/"L" tokens: leave them as-is so a
            # later format check can report them instead of inventing a label.
            converted.append(tag)
    return converted

# Quick sanity check of the conversion on a hand-written BIOLU sequence.
example = ['U-CITY', 'B-LAST_NAME', 'L-LAST_NAME', 'O']
example_bio = biolu2bio(example)
print(example, '->', example_bio)

Пример конверсии: ['U-CITY', 'B-LAST_NAME', 'L-LAST_NAME', 'O'] -> ['B-CITY', 'B-LAST_NAME', 'I-LAST_NAME', 'O']


## 6. Загрузка датасета и проверка BIO-тегов

In [None]:
import pandas as pd
from collections import Counter

# The pickled DataFrame shipped inside the cloned dataset repository.
pickle_path = DATA_REPO / 'dataset' / 'detailed-ner_dataset-ru.pickle'
if not pickle_path.exists():
    raise FileNotFoundError('Не найден detailed-ner_dataset-ru.pickle.')

# NOTE(review): unpickling can execute arbitrary code. This file comes from a
# third-party repository, so load it only if that source is trusted.
df = pd.read_pickle(pickle_path)
tokens = df['tokens'].tolist()
raw_tags = df['ner_tags'].tolist()

# Normalise every sentence's tags from BIOLU to BIO.
bio_tags = list(map(biolu2bio, raw_tags))

# Collect every tag that is neither 'O' nor prefixed with 'B-'/'I-'.
invalid = [
    tag
    for seq in bio_tags
    for tag in seq
    if not (tag == 'O' or tag.startswith(('B-', 'I-')))
]

if not invalid:
    print('BIO - good')
else:
    print(Counter(invalid))

print('предложения', len(tokens))

Все теги соответствуют BIO формату.
Количество предложений: 7532


## 7. Статистика датасета

In [35]:
from collections import Counter

# Frequency of every entity tag ('O' excluded), accumulated sentence by sentence.
tag_counter = Counter()
for sentence_tags in bio_tags:
    tag_counter.update(tag for tag in sentence_tags if tag != 'O')
print(tag_counter.most_common(10))

[('B-LAST_NAME', 1084), ('B-FIRST_NAME', 918), ('B-COUNTRY', 804), ('B-CITY', 677), ('B-REGION', 381), ('B-MIDDLE_NAME', 311), ('I-HOUSE', 218), ('B-STREET', 135), ('B-HOUSE', 120), ('B-DISTRICT', 110)]


## 8. Определение словаря тегов и разбиение

In [36]:
import numpy as np

# Split sentence indices: 20% go to test, then 10% of the remainder to validation.
indices = np.arange(len(tokens))
train_val_idx, test_idx = train_test_split(indices, test_size=0.2, random_state=42, shuffle=True)
train_idx, val_idx = train_test_split(train_val_idx, test_size=0.1, random_state=42, shuffle=True)

def select(items, idxs):
    """Return the elements of ``items`` at the positions ``idxs``, in that order."""
    picked = []
    for position in idxs:
        picked.append(items[position])
    return picked

# Materialise tokens and tags for each split from the index arrays.
train_tokens, train_tags = select(tokens, train_idx), select(bio_tags, train_idx)
val_tokens, val_tags = select(tokens, val_idx), select(bio_tags, val_idx)
test_tokens, test_tags = select(tokens, test_idx), select(bio_tags, test_idx)

print(f'Train: {len(train_tokens)}, Val: {len(val_tokens)}, Test: {len(test_tokens)}')

# Tag vocabulary is built from the whole dataset, in sorted order, so ids
# are stable between runs.
tag_vocabulary = set()
for sentence_tags in bio_tags:
    tag_vocabulary.update(sentence_tags)
unique_tags = sorted(tag_vocabulary)
tag2idx = dict(zip(unique_tags, range(len(unique_tags))))
idx2tag = dict(enumerate(unique_tags))
print('Количество тегов:', len(unique_tags))

Train: 5422, Val: 603, Test: 1507
Количество тегов: 19


## 9. Обёртки для ELMo и Dataset

In [37]:
class ElmoSentenceEmbedder:
    """Inference-only wrapper around an AllenNLP ``Elmo`` module.

    The module is moved to ``device`` and switched to eval mode once, at
    construction time; ``embed`` then encodes one tokenized sentence per call.
    """

    def __init__(self, options_path: Path, weight_path: Path, device: torch.device):
        """Load ELMo from ``options_path``/``weight_path`` onto ``device``."""
        self.device = device
        elmo = Elmo(
            options_file=str(options_path),
            weight_file=str(weight_path),
            num_output_representations=1,
            dropout=0.0,
        )
        self.elmo = elmo.to(self.device)
        self.elmo.eval()

    @torch.no_grad()
    def embed(self, tokens: Sequence[str]) -> torch.Tensor:
        """Embed a single sentence and return the result as a CPU tensor.

        The sentence is wrapped into a batch of size one; the first (only)
        output representation of the first (only) batch item is returned.
        Presumably shaped (num_tokens, embedding_dim) — TODO confirm.
        """
        sentence_batch = [list(tokens)]
        character_ids = batch_to_ids(sentence_batch).to(self.device)
        representations = self.elmo(character_ids)['elmo_representations']
        sentence_embeddings = representations[0][0]
        return sentence_embeddings.cpu()


class ElmoSequenceDataset(Dataset):
    """Dataset of (ELMo embeddings, tag ids, length) triples.

    Embeddings are computed lazily through ``embedder`` on first access and
    kept in an in-memory cache keyed by sentence index.
    """

    def __init__(self, tokens: Sequence[Sequence[str]], tags: Sequence[Sequence[str]], embedder: ElmoSentenceEmbedder, tag2idx: Dict[str, int]):
        """Store the sentences, their tags, the embedder and the tag→id map."""
        self.tokens = tokens
        self.tags = tags
        self.embedder = embedder
        self.tag2idx = tag2idx
        # sentence index -> cached embedding tensor
        self._cache: Dict[int, torch.Tensor] = {}

    def __len__(self):
        """Number of sentences."""
        return len(self.tokens)

    def _embed(self, idx: int) -> torch.Tensor:
        """Return the embedding of sentence ``idx``, computing it on a cache miss."""
        cached = self._cache.get(idx)
        if cached is None:
            fresh = self.embedder.embed(self.tokens[idx])
            cached = fresh.clone().detach()
            self._cache[idx] = cached
        return cached

    def __getitem__(self, idx: int):
        """Return ``(embeddings, tag_ids, length)`` for sentence ``idx``.

        ``length`` is the size of the first dimension of the embeddings.
        """
        embeddings = self._embed(idx)
        label_ids = [self.tag2idx[tag] for tag in self.tags[idx]]
        return embeddings, torch.tensor(label_ids, dtype=torch.long), embeddings.size(0)


def collate_batch(batch: Iterable[Tuple[torch.Tensor, torch.Tensor, int]]):
    """Pad a list of ``(embeddings, tag_ids, length)`` samples into one batch.

    Returns:
        A tuple ``(padded_embeddings, padded_tags, lengths)`` where embeddings
        are zero-padded, tags are padded with ``-100`` and ``lengths`` is a
        ``torch.long`` tensor of the original sequence lengths.
    """
    sample_embeddings, sample_tags, sample_lengths = zip(*batch)
    return (
        pad_sequence(sample_embeddings, batch_first=True),
        pad_sequence(sample_tags, batch_first=True, padding_value=-100),
        torch.tensor(sample_lengths, dtype=torch.long),
    )

## 10. BiLSTM модель и утилиты

In [None]:
class BiLstmTagger(nn.Module):
    """Bidirectional LSTM followed by dropout and a per-token linear classifier.

    Args:
        embedding_dim: Size of each input token vector.
        hidden_size: Hidden size of the LSTM per direction.
        num_labels: Number of output classes per token.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied to the LSTM outputs (and between
            LSTM layers when ``num_layers > 1``).
    """

    def __init__(self, embedding_dim: int, hidden_size: int, num_labels: int, num_layers: int = 1, dropout: float = 0.3):
        super().__init__()
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
            # Inter-layer dropout is only meaningful for stacked LSTMs.
            dropout=dropout if num_layers > 1 else 0.0,
        )
        self.dropout = nn.Dropout(dropout)
        # Forward and backward states are concatenated, hence hidden_size * 2.
        self.classifier = nn.Linear(hidden_size * 2, num_labels)

    def forward(self, x, lengths):
        """Compute per-token logits.

        Args:
            x: Padded batch of token vectors, batch-first.
            lengths: Tensor with the true length of every sequence in ``x``.

        Returns:
            Logits with the same batch and time dimensions as ``x`` and
            ``num_labels`` as the last dimension.
        """
        packed = pack_padded_sequence(x, lengths.cpu(), batch_first=True, enforce_sorted=False)
        outputs, _ = self.lstm(packed)
        # total_length keeps the time dimension equal to the input's, even when
        # the longest sequence is shorter than the padded width; otherwise the
        # logits would not line up with equally padded target tensors.
        outputs, _ = pad_packed_sequence(outputs, batch_first=True, total_length=x.size(1))
        outputs = self.dropout(outputs)
        return self.classifier(outputs)


def set_seed(seed: int = 42):
    """Seed Python's ``random``, NumPy and PyTorch (CPU, plus all GPUs if CUDA is available)."""
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


@dataclass
class Metrics:
    """Loss and entity-level quality scores produced by one evaluation pass."""

    # Token-averaged loss.
    loss: float
    # Scores as computed by the evaluation routine.
    precision: float
    recall: float
    f1: float

    def to_dict(self) -> Dict[str, float]:
        """Return the fields as a plain dictionary (field name -> value)."""
        return asdict(self)


def train_epoch(model, loader, optimizer, criterion):
    """Run one training pass over ``loader`` and return the token-weighted mean loss.

    Each batch loss is weighted by the number of target positions that are
    not ``-100``, so the result is an average over real tokens
    (assumes ``criterion`` returns a per-token mean — TODO confirm).
    """
    model.train()
    loss_sum, token_sum = 0.0, 0
    progress = tqdm(loader, desc='Train', leave=False)
    for batch_embeddings, batch_tags, batch_lengths in progress:
        batch_embeddings = batch_embeddings.to(DEVICE)
        batch_tags = batch_tags.to(DEVICE)

        optimizer.zero_grad(set_to_none=True)
        logits = model(batch_embeddings, batch_lengths)
        num_labels = logits.size(-1)
        loss = criterion(logits.view(-1, num_labels), batch_tags.view(-1))
        loss.backward()
        # Clip the gradient norm before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
        optimizer.step()

        batch_loss = loss.item()
        valid_tokens = (batch_tags != -100).sum().item()
        loss_sum += batch_loss * valid_tokens
        token_sum += valid_tokens
        progress.set_postfix({'loss': f'{batch_loss:.04f}'})
    # max(..., 1) guards against division by zero on an empty loader.
    return loss_sum / max(token_sum, 1)


def evaluate(model, loader, criterion, idx2tag, stage='Eval'):
    """Evaluate ``model`` on ``loader`` without gradient tracking.

    Args:
        model: Tagger called as ``model(embeddings, lengths)``.
        loader: Iterable of ``(embeddings, tags, lengths)`` batches.
        criterion: Loss function applied to flattened logits and tags.
        idx2tag: Mapping from tag id to tag string.
        stage: Label shown on the progress bar.

    Returns:
        ``(Metrics, gold_sequences, pred_sequences)`` where the sequences are
        lists of tag-string lists, truncated to each sentence's true length.
    """
    model.eval()
    loss_sum, token_sum = 0.0, 0
    gold_sequences, pred_sequences = [], []

    with torch.no_grad():
        for batch_embeddings, batch_tags, batch_lengths in tqdm(loader, desc=stage, leave=False):
            batch_embeddings = batch_embeddings.to(DEVICE)
            batch_tags = batch_tags.to(DEVICE)
            logits = model(batch_embeddings, batch_lengths)
            num_labels = logits.size(-1)
            loss = criterion(logits.view(-1, num_labels), batch_tags.view(-1))

            # Weight the batch loss by its count of non-padding target positions.
            token_count = (batch_tags != -100).sum().item()
            loss_sum += loss.item() * token_count
            token_sum += token_count

            pred_rows = logits.argmax(dim=-1).cpu().numpy()
            gold_rows = batch_tags.cpu().numpy()
            for row, length in enumerate(batch_lengths.numpy()):
                # Cut every sentence at its true length to drop padded positions.
                pred_sequences.append([idx2tag[tag_id] for tag_id in pred_rows[row][:length]])
                gold_sequences.append([idx2tag[tag_id] for tag_id in gold_rows[row][:length]])

    scores = Metrics(
        loss_sum / max(token_sum, 1),
        precision_score(gold_sequences, pred_sequences, zero_division=0),
        recall_score(gold_sequences, pred_sequences, zero_division=0),
        f1_score(gold_sequences, pred_sequences, zero_division=0),
    )
    return scores, gold_sequences, pred_sequences

## 11. Гиперпараметры и DataLoader

In [38]:
# Hyperparameters for the BiLSTM tagger and its training run.
# NOTE(review): re-running this cell builds a brand-new BiLstmTagger and
# optimizer, replacing whatever `model` held before (including trained
# weights); run the training cell again afterwards.
config = {
    'epochs': 5,
    'batch_size': 12,
    'hidden_size': 256,
    'num_layers': 1,
    'dropout': 0.3,
    'learning_rate': 1e-3,
    'seed': 42,
}
# Seed the RNGs before the embedder and the tagger are constructed below.
set_seed(config['seed'])
options_path = MODEL_DIR / 'options.json'
weights_path = MODEL_DIR / 'model.hdf5'
if not (options_path.exists() and weights_path.exists()):
    raise FileNotFoundError('Не найдены options.json/model.hdf5 в директории модели.')

# One embedder instance is shared by all three datasets.
elmo_embedder = ElmoSentenceEmbedder(options_path, weights_path, DEVICE)

train_dataset = ElmoSequenceDataset(train_tokens, train_tags, elmo_embedder, tag2idx)
val_dataset = ElmoSequenceDataset(val_tokens, val_tags, elmo_embedder, tag2idx)
test_dataset = ElmoSequenceDataset(test_tokens, test_tags, elmo_embedder, tag2idx)

# Only the training loader is shuffled; validation and test keep dataset order.
train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True, collate_fn=collate_batch)
val_loader = DataLoader(val_dataset, batch_size=config['batch_size'], shuffle=False, collate_fn=collate_batch)
test_loader = DataLoader(test_dataset, batch_size=config['batch_size'], shuffle=False, collate_fn=collate_batch)

model = BiLstmTagger(
    # presumably the output size of this ELMo checkpoint — TODO confirm against options.json
    embedding_dim=1024,
    hidden_size=config['hidden_size'],
    num_labels=len(tag2idx),
    num_layers=config['num_layers'],
    dropout=config['dropout'],
).to(DEVICE)

optimizer = torch.optim.Adam(model.parameters(), lr=config['learning_rate'])
# -100 is the value collate_batch uses to pad tag sequences, so padding is excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=-100)

print('MODEL AND DATA READY!')

MODEL AND DATA READY!


## 12. Обучение

In [None]:
import copy

# Per-epoch summaries, plus a snapshot of the best epoch by validation F1.
history = []
best_state = None
best_val_f1 = -1.0

start_time = time.time()
for epoch in range(1, config['epochs'] + 1):
    print(f'=== Epoch {epoch}/{config["epochs"]} ===')
    train_loss = train_epoch(model, train_loader, optimizer, criterion)
    val_metrics, _, _ = evaluate(model, val_loader, criterion, idx2tag, stage='Validation')

    epoch_summary = {
        'epoch': epoch,
        'train_loss': train_loss,
        'val_loss': val_metrics.loss,
        'val_precision': val_metrics.precision,
        'val_recall': val_metrics.recall,
        'val_f1': val_metrics.f1,
    }
    history.append(epoch_summary)
    print(f"Train loss: {train_loss:.4f}")
    print(f"Validation -> loss: {val_metrics.loss:.4f}, precision: {val_metrics.precision:.4f}, recall: {val_metrics.recall:.4f}, f1: {val_metrics.f1:.4f}")

    if val_metrics.f1 > best_val_f1:
        best_val_f1 = val_metrics.f1
        # state_dict() hands back references to the live parameter/optimizer
        # tensors, which later epochs keep updating in place. Deep-copy them
        # so the snapshot really stays at this epoch's weights.
        best_state = {
            'model': copy.deepcopy(model.state_dict()),
            'optimizer': copy.deepcopy(optimizer.state_dict()),
            'epoch': epoch,
            'val_metrics': val_metrics.to_dict(),
        }

# Restore the weights of the best validation epoch before testing/inference.
if best_state:
    model.load_state_dict(best_state['model'])
    print(f"({best_state['epoch']}).")

print(f"Обучение заняло {(time.time() - start_time)/60:.2f} мин.")

=== Epoch 1/5 ===


                                                                     

Train loss: 0.2156
Validation -> loss: 0.1636, precision: 0.7731, recall: 0.6411, f1: 0.7009
✓ Лучшее качество обновлено.
=== Epoch 2/5 ===


                                                                     

Train loss: 0.0852
Validation -> loss: 0.1182, precision: 0.7705, recall: 0.7896, f1: 0.7800
✓ Лучшее качество обновлено.
=== Epoch 3/5 ===


                                                                     

Train loss: 0.0584
Validation -> loss: 0.0974, precision: 0.8317, recall: 0.8193, f1: 0.8254
✓ Лучшее качество обновлено.
=== Epoch 4/5 ===


                                                                     

Train loss: 0.0430
Validation -> loss: 0.0923, precision: 0.8038, recall: 0.8317, f1: 0.8175
=== Epoch 5/5 ===


                                                                     

Train loss: 0.0280
Validation -> loss: 0.0844, precision: 0.8601, recall: 0.8366, f1: 0.8482
✓ Лучшее качество обновлено.
Загружены веса лучшей эпохи (5).
Обучение заняло 5.33 мин.


## 13. Тестирование и отчёт

In [29]:
# Final evaluation on the held-out test split.
test_metrics, gold_sequences, pred_sequences = evaluate(model, test_loader, criterion, idx2tag, stage='Test')
test_summary = (
    f"Test -> loss: {test_metrics.loss:.4f}, precision: {test_metrics.precision:.4f}, "
    f"recall: {test_metrics.recall:.4f}, f1: {test_metrics.f1:.4f}"
)
print(test_summary)

# Per-entity-type breakdown; the text is kept for the metrics file written later.
report_text = classification_report(gold_sequences, pred_sequences, zero_division=0)
print('Seqeval report:', report_text, sep='\n')

                                                       

Test -> loss: 0.0797, precision: 0.8571, recall: 0.8230, f1: 0.8397
Seqeval report:
              precision    recall  f1-score   support

        CITY       0.95      0.90      0.92       148
     COUNTRY       0.88      0.86      0.87       132
    DISTRICT       0.92      0.71      0.80        17
  FIRST_NAME       0.81      0.82      0.82       194
       HOUSE       0.63      0.71      0.67        24
   LAST_NAME       0.83      0.80      0.81       235
 MIDDLE_NAME       0.86      0.80      0.83        60
      REGION       0.93      0.78      0.85        69
      STREET       0.86      0.76      0.81        25

   micro avg       0.86      0.82      0.84       904
   macro avg       0.85      0.79      0.82       904
weighted avg       0.86      0.82      0.84       904





## 14. Сохранение результатов

In [40]:
# Validation metrics of the best epoch, or None when no snapshot was taken.
best_val_metrics = None
if best_state:
    best_val_metrics = best_state['val_metrics']

# Everything needed to reproduce and compare this run, in one JSON document.
metrics_payload = {
    'config': config,
    'history': history,
    'best_val': best_val_metrics,
    'test': test_metrics.to_dict(),
    'classification_report': report_text,
}

metrics_path = REPORTS_DIR / 'metrics.json'
metrics_json = json.dumps(metrics_payload, indent=2, ensure_ascii=False)
metrics_path.write_text(metrics_json, encoding='utf-8')

2385

## 15. Инференс на новых текстах

In [42]:
def tokenize_text(text: str) -> List[str]:
    """Split ``text`` into token strings using the razdel tokenizer."""
    pieces = []
    for piece in razdel_tokenize(text):
        pieces.append(piece.text)
    return pieces


def extract_entities(tokens: Sequence[str], tags: Sequence[str]):
    """Group BIO-tagged tokens into ``(entity_text, label)`` pairs.

    A span starts at a ``B-`` tag, and also at any non-``O`` tag that does
    not continue the currently open span (for example an ``I-X`` with no
    matching ``X`` span open). It is extended by ``I-`` tags carrying the same
    label and closed by ``O``, by the start of another span, or by the end of
    the input. Tokens of one span are joined with a single space.
    """
    entities = []
    span_tokens = []
    span_label = None

    def flush():
        """Emit the open span, if any, and reset the span state."""
        nonlocal span_tokens, span_label
        if span_tokens:
            entities.append((' '.join(span_tokens), span_label))
        span_tokens, span_label = [], None

    for token, tag in zip(tokens, tags):
        if tag == 'O':
            flush()
            continue
        prefix, label = tag.split('-', 1)
        if prefix == 'I' and span_label == label:
            span_tokens.append(token)
            continue
        # 'B-' tag, or a tag that cannot continue the open span: start anew.
        flush()
        span_tokens, span_label = [token], label

    flush()
    return entities


def predict_tags_for_sentence(tokens: Sequence[str]):
    """Predict one tag string per token of a single tokenized sentence.

    Uses the notebook-level ``elmo_embedder``, ``model``, ``DEVICE`` and
    ``idx2tag``; the model is switched to eval mode before prediction.
    """
    num_tokens = len(tokens)
    # Add a batch dimension of size one in front of the sentence embeddings.
    batch = elmo_embedder.embed(tokens).unsqueeze(0).to(DEVICE)
    lengths = torch.tensor([num_tokens], dtype=torch.long)
    model.eval()
    with torch.no_grad():
        logits = model(batch, lengths)
        best_ids = logits.argmax(dim=-1)[0][:num_tokens].cpu().tolist()
    return [idx2tag[tag_id] for tag_id in best_ids]


def predict_from_file(path: Path, save: bool = True):
    """Tag every non-empty line of a UTF-8 text file.

    Args:
        path: File with one text per line; blank lines are skipped.
        save: When true, also write the results as JSON to
            ``REPORTS_DIR / 'external_predictions_<file stem>.json'``.

    Returns:
        A list of dicts with the keys ``text``, ``tokens``,
        ``predicted_tags`` and ``entities``, one per processed line.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f'Файл {path} не найден.')

    results = []
    for raw_line in path.read_text(encoding='utf-8').splitlines():
        text = raw_line.strip()
        if not text:
            continue
        tokens = tokenize_text(text)
        if tokens:
            tags = predict_tags_for_sentence(tokens)
            entities = extract_entities(tokens, tags)
        else:
            # Nothing to feed to the model for a line without tokens.
            tags, entities = [], []
        results.append({'text': text, 'tokens': tokens, 'predicted_tags': tags, 'entities': entities})

    if save:
        out_path = REPORTS_DIR / f'external_predictions_{path.stem}.json'
        serialized = json.dumps(results, indent=2, ensure_ascii=False)
        out_path.write_text(serialized, encoding='utf-8')
        print('Сохранено в', out_path)
    return results


# Run inference on the sample file shipped with the lab and show every result.
default_text_path = LAB_DIR / 'sample_text.txt'
print('Файл по умолчанию:', default_text_path)
preview = predict_from_file(default_text_path)
for item in preview:
    print('---', item['text'], sep='\n')
    print('Сущности:', item['entities'] or '—')

Файл по умолчанию: /Users/il_dimas/Documents/Programming_projects/NLP_LABS/NLP_LAB_3/sample_text.txt
Сохранено в /Users/il_dimas/Documents/Programming_projects/NLP_LABS/NLP_LAB_3/reports/external_predictions_sample_text.json
---
В Саратове продолжают разрушаться казармы Деконского, построенные более 150 лет назад и являющиеся объектом культурного наследия регионального значения.
Сущности: [('В', 'CITY'), ('Саратове', 'MIDDLE_NAME'), ('продолжают', 'REGION'), ('разрушаться', 'MIDDLE_NAME'), ('казармы', 'STREET'), ('Деконского', 'STREET'), (',', 'STREET'), ('построенные', 'MIDDLE_NAME'), ('более', 'COUNTRY'), ('150', 'COUNTRY'), ('лет', 'REGION'), ('назад и', 'MIDDLE_NAME'), ('являющиеся', 'CITY'), ('объектом', 'CITY'), ('культурного', 'CITY'), ('наследия', 'STREET'), ('регионального', 'MIDDLE_NAME'), ('значения', 'MIDDLE_NAME'), ('.', 'FIRST_NAME')]
---
Сегодня в этом на месте убедился корреспондент ИА "Взгляд-инфо".
Сущности: [('Сегодня', 'REGION'), ('в этом', 'REGION'), ('на', 'REGION