## Import libraries

In [None]:
!pip install seqeval
!pip install datasets
!pip install ipymarkup
!pip install transformers
!pip install accelerate -U
!pip install transformers[torch]

!git clone https://github.com/AlexKly/Detailed-NER-Dataset-RU.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16165 sha256=2c5a22dc1306288fa6785a2057d9f86a4589d4ffe7c5a91c7cc9fb8742b88006
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.13.0-py3-none-any.whl (485 kB)
[2K     [90m

In [None]:
import numpy as np
import pandas as pd
import os, ast, torch, pathlib, datasets, ipymarkup, transformers

# Switch off the Weights & Biases integration through its environment variable.
# NOTE(review): the captured run output reports this variable as deprecated in
# favour of the `report_to` training argument.
os.environ['WANDB_DISABLED'] = 'true'

## Configs and parameters

In [None]:
# Paths and core settings.
# Everything is resolved relative to the current working directory, which is
# where the dataset repository is expected to be cloned.
_ROOT = pathlib.Path().resolve()
DIR_DETAILED_NER_DATASET_RU = _ROOT/'Detailed-NER-Dataset-RU'
DIR_DATASET = DIR_DETAILED_NER_DATASET_RU/'dataset'
DIR_OUTPUT = _ROOT/'output'
LABELS = [
    # LOC tags:
    'COUNTRY', 'REGION', 'DISTRICT', 'CITY', 'STREET', 'HOUSE',
    # PER tags:
    'LAST_NAME', 'FIRST_NAME', 'MIDDLE_NAME'
]
# Position prefixes combined with every label; the outside tag 'O' is appended last,
# so it gets the highest id (not id 0).
PARTS = ['B', 'I', 'L', 'U']
LABELS_LIST = [f'{p}-{l}' for l in LABELS for p in PARTS] + ['O']
# Tag name -> integer id, and the inverse lookup.
DOMAINS_MAP = {tag: idx for idx, tag in enumerate(LABELS_LIST)}
REVERSE_DOMAINS_MAP = {idx: tag for tag, idx in DOMAINS_MAP.items()}

# Data splitting: the remainder (1 - train_ratio - test_ratio) forms the third split.
train_ratio = 0.8
test_ratio = 0.1

# Model loading and train params:
modelname = 'Babelscape/wikineural-multilingual-ner'
batch_size = 16
device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = transformers.AutoTokenizer.from_pretrained(modelname)
model = transformers.AutoModelForTokenClassification.from_pretrained(
    modelname,
    num_labels=len(LABELS_LIST),
    id2label=REVERSE_DOMAINS_MAP,
    label2id=DOMAINS_MAP,
    # The checkpoint's classifier head has a different number of labels than
    # LABELS_LIST, so the head is re-initialised instead of raising a size error.
    ignore_mismatched_sizes=True
).to(device)
train_args = transformers.TrainingArguments(
    f'{modelname}-finetuned-ner',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.01,
    push_to_hub=False,
    # The string 'none' disables all logging integrations; passing None falls
    # back to the library default instead of disabling them.
    report_to='none',
)
data_collator = transformers.DataCollatorForTokenClassification(tokenizer)
# NOTE(review): `datasets.load_metric` emitted a deprecation warning in the captured
# run; consider migrating to the separate `evaluate` package when it can be added.
metric = datasets.load_metric('seqeval')

Downloading (…)okenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/709M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at Babelscape/wikineural-multilingual-ner and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([37]) in the model instantiated
- classifier.weight: found shape torch.Size([9, 768]) in the checkpoint and torch.Size([37, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  metric = datasets.load_metric('seqeval')


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

In [None]:
def compute_metrics(p: transformers.trainer_utils.EvalPrediction) -> dict:
    """ Compute sequence-labelling metrics for one evaluation pass.

    :param p: Pair of model outputs (per-token class scores) and label ids.
    :return: Overall precision, recall, F1 and accuracy as reported by the metric.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = list()
    true_labels = list()
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Positions labelled -100 (special tokens) are excluded from scoring.
        kept = [(pred_id, gold_id) for pred_id, gold_id in zip(predicted_row, gold_row) if gold_id != -100]
        true_predictions.append([LABELS_LIST[pred_id] for pred_id, _ in kept])
        true_labels.append([LABELS_LIST[gold_id] for _, gold_id in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)

    summary = dict()
    summary['precision'] = results['overall_precision']
    summary['recall'] = results['overall_recall']
    summary['f1'] = results['overall_f1']
    summary['accuracy'] = results['overall_accuracy']
    return summary

## Load data

In [None]:
# NOTE: unpickling can execute arbitrary code - only load this file from a trusted source.
df = pd.read_pickle(DIR_DATASET/'detailed-ner_dataset-ru.pickle')
# Both annotation columns are converted to their string form and parsed back
# into Python literals, so every row holds a plain Python object.
for column in ('tokens', 'ner_tags'):
    df[column] = df[column].astype(str).apply(ast.literal_eval)

## Split data

In [None]:
# Shuffle with a fixed seed so the partition is reproducible across kernel restarts.
# `df` itself is left untouched, so re-running this cell yields the same split.
df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Slice boundaries: [0, train_end) -> train, [train_end, test_end) -> test, rest -> val.
n_rows = df_shuffled.shape[0]
train_end = int(train_ratio * n_rows)
test_end = int((train_ratio + test_ratio) * n_rows)

train_ds = datasets.Dataset.from_pandas(
    df_shuffled.iloc[:train_end].reset_index(drop=True)
)
test_ds = datasets.Dataset.from_pandas(
    df_shuffled.iloc[train_end:test_end].reset_index(drop=True)
)
val_ds = datasets.Dataset.from_pandas(
    df_shuffled.iloc[test_end:].reset_index(drop=True)
)

# The three splits must partition the data exactly.
assert len(train_ds) + len(test_ds) + len(val_ds) == n_rows

## Data preprocessing

In [None]:
def tokenize_and_align_labels(
        ds: datasets.arrow_dataset.Dataset,
        label_all_tokens: bool = True,
        max_length: int = 512,
) -> transformers.tokenization_utils_base.BatchEncoding:
    """ Prepare a batch of examples before training the model.

    Tokenizes pre-split words and expands the word-level tags to token level.
    Sequences are not padded here: padding is left to the data collator, which
    pads each training batch only up to its own longest sequence.

    :param ds: Batch of examples with 'tokens' (words) and 'ner_tags' (one tag
        name per word), as passed by `Dataset.map(..., batched=True)`.
    :param label_all_tokens: If True, every sub-token of a word gets the word's
        label; if False, only the first sub-token does and the rest get -100.
    :param max_length: Maximum number of tokens kept per example (longer ones
        are truncated).
    :return: Tokenized samples with an added 'labels' entry.
    :raises KeyError: If a tag name is missing from DOMAINS_MAP.
    """
    tokenized_inputs = tokenizer(ds['tokens'], truncation=True, is_split_into_words=True, max_length=max_length)
    labels = list()
    for i, word_tags in enumerate(ds['ner_tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = list()
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are
            # automatically ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # The first token of each word always gets the word's label; the following
            # tokens of the same word get it only when label_all_tokens is set.
            elif word_idx != previous_word_idx or label_all_tokens:
                label_ids.append(DOMAINS_MAP[word_tags[word_idx]])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs['labels'] = labels

    return tokenized_inputs

In [None]:
# Tokenize every split with the same batched preprocessing function.
tokenized_train_ds, tokenized_test_ds, tokenized_val_ds = (
    split.map(tokenize_and_align_labels, batched=True)
    for split in (train_ds, test_ds, val_ds)
)

Map:   0%|          | 0/6025 [00:00<?, ? examples/s]

Map:   0%|          | 0/753 [00:00<?, ? examples/s]

Map:   0%|          | 0/754 [00:00<?, ? examples/s]

## Fine-tuning

In [None]:
%%time

trainer = transformers.Trainer(
    model,
    train_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_test_ds,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train and validate:
trainer.train()
trainer.evaluate()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.165882,0.707182,0.71243,0.709797,0.957667
2,0.295800,0.107805,0.798548,0.816327,0.807339,0.971415
3,0.099600,0.089499,0.807487,0.840445,0.823636,0.977472
4,0.050600,0.081117,0.821306,0.886827,0.85281,0.981828
5,0.050600,0.098756,0.796327,0.884972,0.838313,0.97543
6,0.025100,0.08222,0.835106,0.87384,0.854034,0.981556
7,0.015900,0.098189,0.814626,0.888683,0.850044,0.978833
8,0.009700,0.082318,0.865108,0.892393,0.878539,0.983666
9,0.009700,0.090862,0.806397,0.888683,0.845543,0.979446
10,0.006500,0.083708,0.863309,0.890538,0.876712,0.983462




CPU times: user 1h 29min 59s, sys: 1min 46s, total: 1h 31min 46s
Wall time: 1h 33min 42s


{'eval_loss': 0.08370821923017502,
 'eval_precision': 0.8633093525179856,
 'eval_recall': 0.8905380333951762,
 'eval_f1': 0.8767123287671232,
 'eval_accuracy': 0.9834615122847614,
 'eval_runtime': 13.2847,
 'eval_samples_per_second': 56.682,
 'eval_steps_per_second': 3.613,
 'epoch': 10.0}

## Save fine-tuned model

In [None]:
# Write the trainer's model to <DIR_OUTPUT>/ner_ru_model.
model_dir = DIR_OUTPUT/'ner_ru_model'
trainer.save_model(output_dir=str(model_dir))

## Load fine-tuned model and predict

In [None]:
# Reload tokenizer and model from the saved directory and wrap them in a NER pipeline.
saved_model_dir = DIR_OUTPUT/'ner_ru_model'
my_tokenizer = transformers.AutoTokenizer.from_pretrained(saved_model_dir)
my_model = transformers.AutoModelForTokenClassification.from_pretrained(saved_model_dir)
nlp = transformers.pipeline(
    task='ner',
    model=my_model,
    tokenizer=my_tokenizer,
    aggregation_strategy='average',
)

In [None]:
def predict(text: str, confidience: float = 0.5) -> None:
    """ Run the NER pipeline on a text and render the recognised spans.

    :param text: Input text to annotate.
    :param confidience: Minimum score (exclusive) a span needs in order to be
        shown. The parameter name keeps its original spelling so existing
        keyword calls continue to work.
    :return: None; the markup is displayed as a side effect.
    """
    # The model is configured with explicit tag names, so the outside tag is 'O'
    # (a generic 'LABEL_0' name never occurs, and id 0 is not the outside tag).
    spans_list = [
        (span['start'], span['end'], span['entity_group'])
        for span in nlp(text)
        if span['entity_group'] != 'O' and span['score'] > confidience
    ]
    ipymarkup.show_span_box_markup(text=text, spans=spans_list)

In [None]:
# Sample sentences covering person names and address components.
demo_texts = [
    'Сидоров Алексей посетил город Москва на прошлой неделе, но он живет по прописке на ул.Победы д.25 в г.Пенза',
    'Петров Алексей Юрьевич проживает по адресу: г.Москва, ул.Пушкина, д.228, но фактически его страна проживания США',
    'США и Польша - это две страны блока НАТО',
]
for demo_text in demo_texts:
    predict(text=demo_text)