In [3]:
import typing
import warnings

from matplotlib.pyplot import text
warnings.filterwarnings("ignore", category=DeprecationWarning) 

import typing
import tqdm
import os
import re
from pathlib import Path

from natasha import (
    Segmenter,
    MorphVocab,
    
    NewsEmbedding,
    NewsMorphTagger,
    NewsSyntaxParser,
    NewsNERTagger,
    NamesExtractor,
    PER,
    Doc
) 

segmenter = Segmenter()
morph_vocab = MorphVocab()

emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)
syntax_parser = NewsSyntaxParser(emb)
ner_tagger = NewsNERTagger(emb)

names_extractor = NamesExtractor(morph_vocab)


s = 'Назначить  Эрмиш  Ирину  Геннадьевну  на должность председателя \nкомитета  Ивановской  области,  по  социальной  защите  населения с \n16.01.2006  на  срок  полномочий'

In [28]:
doc = Doc(s)
doc.segment(segmenter) 
doc.tag_morph(morph_tagger)
for token in doc.tokens:
    token.lemmatize(morph_vocab)

doc.parse_syntax(syntax_parser)
doc.tag_ner(ner_tagger)
for span in doc.spans:
    span.normalize(morph_vocab)

names = []
# фамилию принимает за имя

for span in doc.spans:
    if span.type == PER:
        span.extract_fact(names_extractor)
        names.append(span)
        # print(span.fact.as_dict)
        # names.append({'name_raw':span.text,'name_norm':span.normal})

if len(names) == 2:
    if len(names[1].fact.as_dict) + len(names[0].fact.as_dict) == 3:
        concated_name_norm = names[0].normal + ' ' + names[1].normal 
        concated_name_norm = ' '.join(concated_name_norm.split())
        concated_name_raw = names[0].text + ' ' + names[1].text 
        concated_name_raw = ' '.join(concated_name_raw.split())
        names = {'name_raw':concated_name_raw,'name_norm':concated_name_raw}
else:
    names = [{'name_raw':' '.join(span.text.split()),'name_norm':' '.join(span.normal.split())} for span in names]

names

{'name_raw': 'Эрмиш Ирину Геннадьевну', 'name_norm': 'Эрмиш Ирину Геннадьевну'}

In [4]:
# regs = open('regions_n_links.txt', 'r').read()

import json
data = list(json.load( open('regions_n_links.json', 'r')).keys())

with open('regions.json', 'w') as f:
    json.dump(data, f)


In [9]:
def normalize_plz(s):
    doc = Doc(s)
    doc.segment(segmenter) 
    doc.tag_morph(morph_tagger)
    for token in doc.tokens:
        token.lemmatize(morph_vocab)

    doc.parse_syntax(syntax_parser)
    doc.tag_ner(ner_tagger)
    for span in doc.spans:
        span.normalize(morph_vocab)

    return doc.spans[0].normal

import ast
regs_n_links = open('regions_n_links.txt', 'r').read()
regs_n_links = ast.literal_eval(regs_n_links)
d = {}
for k,v in regs_n_links.items():
    try:
        new_reg = normalize_plz(k)    
    except Exception as x:
        new_reg = k    
    d[new_reg] = v

import json 
with open('regions_n_links.json','w') as f:
    json.dump(d,f)   


with open('regions_n_links.json','r') as f:
    print(json.load(f))



{'Белгородская область': 'http://pravo.gov.ru/proxy/ips/?searchres=&bpas=r013100&a3=&a3type=1&a3value=&a6=&a6type=1&a6value=&a15=&a15type=1&a15value=&a7type=3&a7from=&a7to=&a7date=01.01.2000&a8=&a8type=1&a1=&a0=%ED%E0%E7%ED%E0%F7%E8%F2%FC&a16=&a16type=1&a16value=&a17=&a17type=1&a17value=&a4=&a4type=1&a4value=&a23=&a23type=1&a23value=&textpres=&sort=7&x=39&y=14', 'Брянская область': 'http://pravo.gov.ru/proxy/ips/?searchres=&bpas=r013200&a3=&a3type=1&a3value=&a6=&a6type=1&a6value=&a15=&a15type=1&a15value=&a7type=3&a7from=&a7to=&a7date=01.01.2000&a8=&a8type=1&a1=&a0=%ED%E0%E7%ED%E0%F7%E8%F2%FC&a16=&a16type=1&a16value=&a17=&a17type=1&a17value=&a4=&a4type=1&a4value=&a23=&a23type=1&a23value=&textpres=&sort=7&x=39&y=14', 'Владимирская область': 'http://pravo.gov.ru/proxy/ips/?searchres=&bpas=r013300&a3=&a3type=1&a3value=&a6=&a6type=1&a6value=&a15=&a15type=1&a15value=&a7type=3&a7from=&a7to=&a7date=01.01.2000&a8=&a8type=1&a1=&a0=%ED%E0%E7%ED%E0%F7%E8%F2%FC&a16=&a16type=1&a16value=&a17=&a17type