In [87]:
import codecs
from collections import defaultdict

definitions = defaultdict(list)
with codecs.open(r'locale\ru\vocab\ozhegov.txt', 'r', 'utf8') as f:
    description = f.readline().split('|')
    states = [line.split('|') for line in f]
    for s in states:
        definitions[s[0]].append(s)

In [6]:
print (description)

['VOCAB', 'BASEFORM', 'PHONGL', 'GRCLASSGL', 'STYLGL', 'DEF', 'ANTI', 'LEGLEXAM\r\n']


In [88]:
for states in definitions.values():
    for i in range(1,len(states)):
        if states[i-1][4].startswith('!'):
            atakze = states[i][4].rfind(u'а также')
            s4 = states[i][4][atakze:] if atakze > 0 else states[i][4]
            states[i][4] =states[i-1][4][1:]+' '+ states[i-1][5]+', '+ s4

In [89]:
import re

abbrs = {r'\bobs\.?': u'устаревшее',
        r'\bустар\.': u'устаревшее',
        r'\bстар\.': u'в старину',
        r'\bColloq\.?': u'разговорное',
        r'\bразг\.': u'разговорное',
        r'\bшутл\.': u'шутлилое',
        r'\bофиц\.': u'официальное',
        r'\bвысок\.': u'высокое',
        r'\bупотр\.': u'употребляется',
        r'\bобл\.': u'местное',
        r'\bпрост\.': u'просторечное',
        r'-н\.': u'-нибудь',
        r'-л\.': u'-либо',
        r'\bнеодобр\.': u'неодобрительное',
        r'\bnon-st\.': u'',
        r'\bсо знач\.': u'со значением',
        r'\bперен\.': u'переносное',
        r'\bв знач\.': u'в значении',
        r'\bSpec': u'',
        r'\bLib': u'',
        r'\bирон\.': u'ироничное',
        r'\bспец\.': u'специальное',
        r'\bкнижн\.': u'книжное',
       }

for states in definitions.values():
    for s in states:
        for (abbr,value) in abbrs.items():
            for i in [4,5]:
                s[i] = re.sub(abbr, value, s[i], flags=re.IGNORECASE)
                s[i] = re.sub(r'\(-\w+\)', '', s[i])  #изобразить (-ать)
                s[i] = re.sub(r'==', u'то же, что и ', s[i])
                s[i] = re.sub(r'<=', 'от слова ', s[i])
                s[i] = re.sub(r'\d*\s*N\d*\W*\d*','', s[i], flags=re.IGNORECASE)
                s[i] = re.sub(r'[^а-яё0-9\",\.:\(\)\s-]+', '', s[i], flags=re.IGNORECASE)
                s[i] = re.sub(r'\(\s+', '(', s[i])
                s[i] = re.sub(r'\s+\)', ')', s[i])
                s[i] = re.sub(r'\s+', ' ', s[i])

In [92]:
with codecs.open(r'locale\ru\vocab\ozhegov.hint.txt', 'w+', 'utf8') as f:
    for word in sorted(definitions.keys()):
        for s in definitions[word]:
            worddef = (s[4]+' '+s[5]).strip()
            if worddef != "":
                f.write(s[0]+'|'+worddef+'\n')
        

Parsing orphography dictionary of Lopatin (to get nouns)

In [8]:
import pymorphy2
import codecs

morph = pymorphy2.MorphAnalyzer()

nouns = set()
adjvs = set()
adjends = [u'ый', u'ний', u'кий', u'ой', u'ая', u'яя', u'ое', u'ее']

def add(line):
    if not any([line.endswith(end) for end in adjends]):
        nouns.add(line)
    else:
        adjvs.add(line)

with codecs.open(r'locale\ru\vocab\lopatin.txt', 'r', 'utf8') as f:
    for line in f:
        line = line.strip()
        if len(line) == 0 or line[0].isupper() or ' ' in line:
            continue
        line = line.lower()
        if (line.endswith(u'цы') and line[0:-2]+u'ец' in nouns):
            continue
        forms = morph.parse(line)
        for form in forms:
            if form.score > 0.01:
                if ({'NOUN', 'nomn', 'sing'} in form.tag or 
                    ({'NOUN', 'nomn', 'plur'} in form.tag and {'NOUN', 'nomn', 'plur'} in form.normalized.tag)):
                    add(line)
                    break

In [9]:
with codecs.open(r'locale\ru\vocab\lopatin.noun.txt', 'w+', 'utf8') as f:
    for noun in sorted(set([n.replace('ё', 'е') for n in nouns])):
        f.write(noun + '\r\n')
with codecs.open(r'locale\ru\vocab\lopatin.noun-adj.txt', 'w+', 'utf8') as f:
    for word in sorted(set([n.replace('ё', 'е') for n in adjvs])):
        f.write(word + '\r\n')