## NER BiLSTM+CRF
https://github.com/deepmipt/ner

https://arxiv.org/pdf/1709.09686.pdf

In [5]:
import pandas as pd

In [4]:
# Read the whole input document into memory.
# The encoding is given explicitly so the result does not depend on the
# platform's locale default (the document contains Cyrillic text).
# NOTE(review): assumes doc.txt is stored as UTF-8 — confirm for your copy.
with open('./doc.txt', encoding='utf-8') as f:
    text = f.read()

In [17]:
import time

In [1]:
from ner.network import NER
from ner.corpus import Corpus
import json
from ner.utils import md5_hashsum, download_untar
from glob import glob


# Compare the MD5 hashsum of the files under model/ with the expected value;
# on a mismatch, download the pretrained archive and unpack it into model/.
if md5_hashsum(sorted(glob('model/*'))) != 'fd50a27b96b24cdabdda13795a3baae7':
    download_path = 'model/'
    download_url = 'http://lnsigo.mipt.ru/export/models/ner/ner_model_total_rus.tar.gz'
    download_untar(download_url, download_path)

# Read the network parameters stored next to the model
with open('model/params.json') as f:
    network_params = json.load(f)

# Build the corpus from the saved dictionaries file
corpus = Corpus(dicts_filepath='model/dict.txt')

# Instantiate the tagger from the pretrained weights.
# NOTE(review): `verbouse` (sic) is kept exactly as written — presumably the
# keyword as spelled by the ner library; confirm before renaming.
network = NER(
    corpus,
    verbouse=False,
    pretrained_model_filepath='model/ner_model',
    **network_params
)

Downloading from http://lnsigo.mipt.ru/export/models/ner/ner_model_total_rus.tar.gz to model/ner_model_total_rus.tar.gz


100%|█████████▉| 44.1M/44.3M [00:24<00:00, 1.17MB/s]

Extracting model/ner_model_total_rus.tar.gz archive into model/


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


INFO:tensorflow:Restoring parameters from model/ner_model


100%|██████████| 44.3M/44.3M [00:40<00:00, 1.17MB/s]

In [27]:
from collections import Counter

In [83]:
from ner.utils import tokenize, lemmatize


def predict(sentence, network):
    """Tag the tokens of ``sentence`` using ``network``.

    Args:
        sentence: Text to tag; it is passed to ``tokenize`` as-is.
        network: Object exposing ``predict_for_token_batch``.

    Returns:
        A ``(tokens, tags)`` pair: the tokens produced by ``tokenize`` and the
        first element of the batch prediction made on their lemmas.
    """
    raw_tokens = tokenize(sentence)

    # The network is fed lemmas rather than surface forms
    # (e.g. Russian 'был' -> 'быть', 'его' -> 'он').
    lemmas = lemmatize(raw_tokens)

    # Wrap the lemmas in a one-element batch and unwrap the single result.
    batch_tags = network.predict_for_token_batch([lemmas])
    return raw_tokens, batch_tags[0]
   

In [84]:
# Run predict() on the whole document in a single call and keep both returned values.
tokens, tags = predict(text, network)

In [91]:
# Map each purely alphabetic token to its predicted tag, skipping tokens
# tagged 'O'. A token that occurs more than once keeps the tag of its last
# occurrence.
NER_dict = dict(
    (word, label)
    for word, label in zip(tokens, tags)
    if label != 'O' and word.isalpha()
)

In [92]:
# Display the token -> tag mapping built above.
NER_dict

{'CRL': 'B-ORG',
 'Certificate': 'B-ORG',
 'Distribution': 'I-ORG',
 'Good': 'I-ORG',
 'Points': 'I-ORG',
 'Standing': 'I-ORG',
 'of': 'I-ORG',
 'АСТ': 'B-ORG',
 'Арбитражный': 'B-ORG',
 'Беларусь': 'I-LOC',
 'Белгородская': 'B-LOC',
 'Белгородской': 'B-LOC',
 'Воронеж': 'B-LOC',
 'ЗАО': 'B-ORG',
 'ИНН': 'B-PER',
 'ИТОГО': 'B-PER',
 'Казахстан': 'B-LOC',
 'М': 'B-PER',
 'Минздрава': 'B-ORG',
 'Минздравсоцразвития': 'B-ORG',
 'Минкомсвязи': 'B-ORG',
 'Москва': 'B-LOC',
 'НДС': 'I-PER',
 'Николаевич': 'I-PER',
 'ОГРН': 'I-PER',
 'ОКВЭД': 'B-PER',
 'Павел': 'I-PER',
 'Предрейсовый': 'B-ORG',
 'РФ': 'B-LOC',
 'Республики': 'B-LOC',
 'Рожков': 'B-PER',
 'России': 'I-ORG',
 'Российской': 'B-LOC',
 'Сi': 'I-ORG',
 'Сmin': 'B-ORG',
 'Сбербанк': 'B-ORG',
 'Таможенного': 'B-ORG',
 'ФНС': 'B-ORG',
 'Федерации': 'I-LOC',
 'ХХХ': 'I-PER',
 'ЦС': 'B-ORG',
 'ЭД': 'B-PER',
 'ЭДО': 'B-PER',
 'акт': 'B-ORG',
 'арбитражного': 'B-ORG',
 'арбитражный': 'B-ORG',
 'медицинский': 'I-ORG',
 'медицинского': 'I-

In [79]:
import re
# Phone-like pattern: a run of 11 or more characters drawn from digits,
# '+', '-', '(' and ')'. Written as a raw string so the backslash escapes
# reach the regex engine without triggering invalid-escape warnings.
# Note: any plain run of 11+ digits (e.g. a long numeric ID) also matches.
tel = re.compile(r'[+0-9\-\(\)]{11,}')

In [80]:
# List every substring of the document that matches the `tel` pattern.
tel.findall(text)

['8-473-260-95-53', '8(473)260-95-30', '8(473)260-95-30', '1027700132195']

In [81]:
# E-mail pattern for searching inside a larger text with findall():
#   local part  : letters, digits and _ . + -
#   '@'         : required separator
#   domain      : one label followed by one or more '.label' groups, so a
#                 sentence-ending period is not swallowed into the match.
# The pattern is deliberately unanchored: with ^...$ it could only match
# when the entire document was a single address.
email = re.compile(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+')

In [82]:
# List every substring of the document that matches the `email` pattern.
email.findall(text)

[]