In [1]:
from ufal.udpipe import Model, Pipeline
#import os
#import re
import sys
import json
#import wget

In [2]:
# One-off download of the pre-trained model (disabled; kept for reference):
#udpipe_url = 'https://rusvectores.org/static/models/udpipe_syntagrus.model'
#modelfile = wget.download(udpipe_url)
# Model file name used as the default ``modelfile`` argument of lemmatize_text.
modelfile = 'udpipe_syntagrus.model'

In [3]:
# Locations of the book catalogue, relative to the notebook.
dataFolderPath = '../data'
booksFolderPath = dataFolderPath + '/books'
booksJsonFilePath = booksFolderPath + '/books.json'

# Load the catalogue. The context manager closes the file even if parsing
# fails, and the explicit encoding keeps the Cyrillic titles independent of
# the platform's default locale encoding.
with open(booksJsonFilePath, encoding='utf-8') as booksJsonFile:
    booksJson = json.load(booksJsonFile)

# List the titles found in the catalogue.
for bookJson in booksJson:
    print(bookJson['name'])

Война и мир. Том 1
Идиот
Мастер и Маргарита
Палата № 6
Фома Гордеев
Что делать?
Отцы и дети
Обломов
Старуха Изергиль
Аэлита
12 стульев
Похождения бравого солдата Швейка
Записки сумасшедшего
История одного города
Большие Безобразия Маленького Папы
Белые ночи
Чушь собачья
Дар Шаванахолы
Игра в бисер
Король, дама, валет
Лолита
О дивный новый мир
Смерть африканского охотника
Записки Хендрика Груна из амстердамской богадельни
Деловые люди
Красота
Книга песка
Москва — Петушки
Непобедимое Солнце
Не позвать ли нам Дживса?
Пейзаж, нарисованный чаем
Понедельник начинается в субботу
Сказка о тройке
Последнее путешествие Ийона Тихого
С нами бот
Юмористические рассказы
Заповедник
Америка
Замок
Трое в лодке, не считая собаки
Золотой теленок
Сукины дети
Карамора
Приключения барона Мюнхаузена
Июнь
Собачье сердце
Вавилонские хроники / Обретение Энкиду


In [4]:
def num_replace(word):
    """Return a mask made of 'x' characters, one per character of ``word``."""
    mask_length = len(word)
    return 'x' * mask_length

def clean_token(token, misc):
    """Normalise a surface token.

    Returns None when the raw token is exactly 'Файл' and ``misc`` contains
    'SpaceAfter=No'; otherwise returns the token with surrounding whitespace
    stripped and all inner spaces removed.
    """
    # NOTE(review): presumably filters out a file-reference marker left in
    # the source texts -- confirm against the data.
    is_glued_file_marker = token == 'Файл' and 'SpaceAfter=No' in misc
    if is_glued_file_marker:
        return None
    return token.strip().replace(' ', '')


def clean_lemma(lemma, pos):
    """Normalise a lemma, or return None when it should be discarded.

    The lemma is stripped, spaces and underscores are removed and the result
    is lower-cased. Lemmas containing '|' or ending in '.jpg' / '.png' are
    rejected (None). For every part of speech except 'PUNCT', at most one
    leading guillemet, one trailing guillemet and then one trailing
    '!', '?', ',' or '.' are removed, in that order.
    """
    normalized = lemma.strip().replace(' ', '').replace('_', '').lower()

    if '|' in normalized or normalized.endswith(('.jpg', '.png')):
        return None

    # Punctuation lemmas are returned as they are after normalisation.
    if pos == 'PUNCT':
        return normalized

    if normalized.startswith(('«', '»')):
        normalized = normalized[1:]
    if normalized.endswith(('«', '»')):
        normalized = normalized[:-1]
    if normalized.endswith(('!', '?', ',', '.')):
        normalized = normalized[:-1]
    return normalized

In [5]:
def process_direct(pipeline, text='Строка', keep_pos=True, keep_punct=False):
    """Tag ``text`` with ``pipeline`` and return one string per kept token.

    The pipeline output is read as CoNLL-U: blank lines and lines starting
    with '#' are skipped, and every remaining line must split into exactly
    10 tab-separated columns. Each kept token is rendered as
    ``'<lemma>_<POS>~<original word + trailing whitespace>|'``.

    Args:
        pipeline: object whose ``process(text)`` returns the CoNLL-U text.
        text: raw text to analyse.
        keep_pos: when False, every entry is reduced to the part before its
            first '_' (the lemma).
        keep_punct: when False, entries whose second '_'-separated field
            equals 'PUNCT' are dropped (see the review note in the body).

    Returns:
        List of formatted entries, in token order. Lines with an unexpected
        column count, and tokens whose cleaned lemma or token is empty, are
        reported on stderr and skipped.
    """
    # MISC markers and the whitespace appended after the word. They are
    # tested in this order because matching is by substring: the longer
    # 'SpacesAfter=' patterns must be tried before their prefixes.
    # NOTE(review): '\s\n' maps to two spaces plus a newline -- confirm that
    # the extra space is intended.
    spacing_rules = (
        ('SpacesAfter=\\s\\s\\s', '   '),
        ('SpacesAfter=\\s\\s', '  '),
        ('SpacesAfter=\\s\\n', '  \n'),
        ('SpacesAfter=\\s', ' '),
        ('SpacesAfter=\\n', '\n'),
        ('SpaceAfter=No', ''),
    )

    # Run the pipeline; the result is CoNLL-U text.
    processed = pipeline.process(text)

    tagged_propn = []
    for row in processed.split('\n'):
        # Skip blank separators and service (comment) lines.
        if not row or row.startswith('#'):
            continue

        columns = row.split('\t')
        if len(columns) != 10:
            print('STRANGE TAG IN TAGGED!', file=sys.stderr)
            continue

        # Only the FORM, LEMMA, UPOS and MISC columns are used.
        word, lemma, pos, misc = columns[1], columns[2], columns[3], columns[9]
        token = clean_token(word, misc)
        lemma = clean_lemma(lemma, pos)
        if not lemma or not token:
            print('NO LEMMA OR NO TOKEN!', file=sys.stderr)
            continue

        # Replace digit-only numerals with an 'x' mask of the same length.
        if pos == 'NUM' and token.isdigit():
            lemma = num_replace(token)

        # A single space is the default when no marker is present.
        spaces_after = next(
            (whitespace for marker, whitespace in spacing_rules if marker in misc),
            ' ',
        )

        # The chunk keeps the raw word (not the cleaned token) plus spacing.
        tagged_propn.append('%s_%s~%s|' % (lemma, pos, word + spaces_after))

    if not keep_punct:
        # NOTE(review): entries look like 'lemma_POS~chunk|', so the field
        # compared here is 'POS~chunk|' and never equals 'PUNCT' -- this
        # filter currently removes nothing. Left unchanged because the
        # generated output currently contains the punctuation chunks;
        # confirm the intended behaviour before tightening it.
        tagged_propn = [word for word in tagged_propn if word.split('_')[1] != 'PUNCT']
    if not keep_pos:
        tagged_propn = [word.split('_')[0] for word in tagged_propn]
    return tagged_propn


In [6]:
def lemmatize_text(text, modelfile=modelfile):
    """Lemmatize ``text`` with the UDPipe model stored in ``modelfile``.

    Loads the model, builds a tokenize/tag/parse pipeline that emits CoNLL-U,
    runs ``process_direct`` on the text and concatenates the resulting
    entries into one string.

    Args:
        text: raw text to lemmatize.
        modelfile: path to the UDPipe model file.

    Returns:
        The entries produced by ``process_direct`` joined without separator.

    Raises:
        RuntimeError: if the model cannot be loaded from ``modelfile``.
    """
    print('Loading the model...')
    model = Model.load(modelfile)
    # Model.load signals failure by returning a null model instead of
    # raising, so fail here with a clear message rather than inside Pipeline.
    if not model:
        raise RuntimeError("Cannot load UDPipe model from file '%s'" % modelfile)
    process_pipeline = Pipeline(model, 'tokenize', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')

    print('Processing text...')
    output = process_direct(process_pipeline, text=text)
    print('Text done')
    return ''.join(output)

In [7]:
# Lemmatize every book of the catalogue and write the result next to the
# original text.
for bookJson in booksJson:
    name = bookJson['name']
    print('Name: ' + name)
    original_text_path = bookJson['originalTextPath']
    lemmatized_text_path = bookJson['lemmatizedTextPath']

    # The catalogue paths are relative (e.g. './txt/...'), so join them to
    # the books folder with an explicit separator; plain concatenation
    # yields '../data/books./txt/...'.
    print('Opening original text from file... ' + original_text_path)
    with open(booksFolderPath + '/' + original_text_path, 'r', encoding='utf-8') as original_text_file_handle:
        original_text = original_text_file_handle.read()

    print('Lemmatizing text...')
    lemmatized_text = lemmatize_text(original_text, modelfile)

    # Write with the same encoding the source was read in, instead of the
    # platform default, so the output is valid regardless of locale.
    print('Writing lemmatized text to file... ' + lemmatized_text_path)
    with open(booksFolderPath + '/' + lemmatized_text_path, 'w', encoding='utf-8') as lemmatized_text_file_handle:
        lemmatized_text_file_handle.write(lemmatized_text)
    print('Book done')

Name: Война и мир. Том 1
Opening original text from file... ./txt/voyna-i-mir-tom-1.src.txt
Lemmatizing text...
Loading the model...
Processing text...
Text done
Writing lemmatized text to file... ./txt/voyna-i-mir-tom-1.lemmatized.txt
Book done
Name: Идиот
Opening original text from file... ./txt/idiot.src.txt
Lemmatizing text...
Loading the model...
Processing text...


NO LEMMA OR NO TOKEN!


Text done
Writing lemmatized text to file... ./txt/idiot.lemmatized.txt
Book done
Name: Мастер и Маргарита
Opening original text from file... ./txt/master-i-margarita.src.txt
Lemmatizing text...
Loading the model...
Processing text...


NO LEMMA OR NO TOKEN!


Text done
Writing lemmatized text to file... ./txt/master-i-margarita.lemmatized.txt
Book done
Name: Палата № 6
Opening original text from file... ./txt/palata-6-sbornik.src.txt
Lemmatizing text...
Loading the model...
Processing text...
Text done
Writing lemmatized text to file... ./txt/palata-6-sbornik.lemmatized.txt
Book done
Name: Фома Гордеев
Opening original text from file... ./txt/foma-gordeev.src.txt
Lemmatizing text...
Loading the model...
Processing text...
Text done
Writing lemmatized text to file... ./txt/foma-gordeev.lemmatized.txt
Book done
Name: Что делать?
Opening original text from file... ./txt/сhto-delat.src.txt
Lemmatizing text...
Loading the model...
Processing text...


NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!


Text done
Writing lemmatized text to file... ./txt/сhto-delat.lemmatized.txt
Book done
Name: Отцы и дети
Opening original text from file... ./txt/otcy-i-dety.src.txt
Lemmatizing text...
Loading the model...
Processing text...


NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!


Text done
Writing lemmatized text to file... ./txt/otcy-i-dety.lemmatized.txt
Book done
Name: Обломов
Opening original text from file... ./txt/oblomov.src.txt
Lemmatizing text...
Loading the model...
Processing text...


NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!


Text done
Writing lemmatized text to file... ./txt/oblomov.lemmatized.txt
Book done
Name: Старуха Изергиль
Opening original text from file... ./txt/staruha-isergil.src.txt
Lemmatizing text...
Loading the model...
Processing text...
Text done
Writing lemmatized text to file... ./txt/staruha-isergil.lemmatized.txt
Book done
Name: Аэлита
Opening original text from file... ./txt/aelita.src.txt
Lemmatizing text...
Loading the model...
Processing text...
Text done
Writing lemmatized text to file... ./txt/aelita.lemmatized.txt
Book done
Name: 12 стульев
Opening original text from file... ./txt/12-stuljev.src.txt
Lemmatizing text...
Loading the model...
Processing text...
Text done
Writing lemmatized text to file... ./txt/12-stuljev.lemmatized.txt
Book done
Name: Похождения бравого солдата Швейка
Opening original text from file... ./txt/pohozhdeniya-bravogo-soldata-shveyka.src.txt
Lemmatizing text...
Loading the model...
Processing text...


NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA O

Text done
Writing lemmatized text to file... ./txt/pohozhdeniya-bravogo-soldata-shveyka.lemmatized.txt
Book done
Name: Записки сумасшедшего
Opening original text from file... ./txt/zapiski-sumasshedshego.src.txt
Lemmatizing text...
Loading the model...
Processing text...
Text done
Writing lemmatized text to file... ./txt/zapiski-sumasshedshego.lemmatized.txt
Book done
Name: История одного города
Opening original text from file... ./txt/istoriya-odnogo-goroda.src.txt
Lemmatizing text...
Loading the model...
Processing text...
Text done
Writing lemmatized text to file... ./txt/istoriya-odnogo-goroda.lemmatized.txt
Book done
Name: Большие Безобразия Маленького Папы
Opening original text from file... ./txt/bolshie-bezobraziya-malenkogo-papy.src.txt
Lemmatizing text...
Loading the model...
Processing text...
Text done
Writing lemmatized text to file... ./txt/bolshie-bezobraziya-malenkogo-papy.lemmatized.txt
Book done
Name: Белые ночи
Opening original text from file... ./txt/belye-nochi.src.

NO LEMMA OR NO TOKEN!


Text done
Writing lemmatized text to file... ./txt/chush-sobachya.lemmatized.txt
Book done
Name: Дар Шаванахолы
Opening original text from file... ./txt/dar-Shavanaholy.src.txt
Lemmatizing text...
Loading the model...
Processing text...


NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!


Text done
Writing lemmatized text to file... ./txt/dar-Shavanaholy.lemmatized.txt
Book done
Name: Игра в бисер
Opening original text from file... ./txt/igra-v-biser.src.txt
Lemmatizing text...
Loading the model...
Processing text...
Text done
Writing lemmatized text to file... ./txt/igra-v-biser.lemmatized.txt
Book done
Name: Король, дама, валет
Opening original text from file... ./txt/korol-dama-valet.src.txt
Lemmatizing text...
Loading the model...
Processing text...


NO LEMMA OR NO TOKEN!


Text done
Writing lemmatized text to file... ./txt/korol-dama-valet.lemmatized.txt
Book done
Name: Лолита
Opening original text from file... ./txt/lolita.src.txt
Lemmatizing text...
Loading the model...
Processing text...


NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!


Text done
Writing lemmatized text to file... ./txt/lolita.lemmatized.txt
Book done
Name: О дивный новый мир
Opening original text from file... ./txt/o-divnyy-novyy-mir.src.txt
Lemmatizing text...
Loading the model...
Processing text...
Text done
Writing lemmatized text to file... ./txt/o-divnyy-novyy-mir.lemmatized.txt
Book done
Name: Смерть африканского охотника
Opening original text from file... ./txt/smert-afrikanskogo-ohotnika.src.txt
Lemmatizing text...
Loading the model...
Processing text...
Text done
Writing lemmatized text to file... ./txt/smert-afrikanskogo-ohotnika.lemmatized.txt
Book done
Name: Записки Хендрика Груна из амстердамской богадельни
Opening original text from file... ./txt/zapiski-Hendrika-Gruna-iz-amsterdamskoy-bogadelni.src.txt
Lemmatizing text...
Loading the model...
Processing text...
Text done
Writing lemmatized text to file... ./txt/zapiski-Hendrika-Gruna-iz-amsterdamskoy-bogadelni.lemmatized.txt
Book done
Name: Деловые люди
Opening original text from file.

NO LEMMA OR NO TOKEN!


Processing text...
Text done
Writing lemmatized text to file... ./txt/kniga-peska.lemmatized.txt
Book done
Name: Москва — Петушки
Opening original text from file... ./txt/moskva-petushki.src.txt
Lemmatizing text...
Loading the model...
Processing text...
Text done
Writing lemmatized text to file... ./txt/moskva-petushki.lemmatized.txt
Book done
Name: Непобедимое Солнце
Opening original text from file... ./txt/nepobedimoe-solnce.src.txt
Lemmatizing text...
Loading the model...
Processing text...


NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!


Text done
Writing lemmatized text to file... ./txt/nepobedimoe-solnce.lemmatized.txt
Book done
Name: Не позвать ли нам Дживса?
Opening original text from file... ./txt/ne-pozvat-li-nam-Dzhivsa.src.txt
Lemmatizing text...
Loading the model...
Processing text...
Text done
Writing lemmatized text to file... ./txt/ne-pozvat-li-nam-Dzhivsa.lemmatized.txt
Book done
Name: Пейзаж, нарисованный чаем
Opening original text from file... ./txt/peyzazh-narisovannyy-chaem.src.txt
Lemmatizing text...
Loading the model...
Processing text...


NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!


Text done
Writing lemmatized text to file... ./txt/peyzazh-narisovannyy-chaem.lemmatized.txt
Book done
Name: Понедельник начинается в субботу
Opening original text from file... ./txt/ponedelnik-nachinaetsya-v-subbotu.src.txt
Lemmatizing text...
Loading the model...
Processing text...


NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!


Text done
Writing lemmatized text to file... ./txt/ponedelnik-nachinaetsya-v-subbotu.lemmatized.txt
Book done
Name: Сказка о тройке
Opening original text from file... ./txt/skazka-o-troyke.src.txt
Lemmatizing text...
Loading the model...
Processing text...


NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!


Text done
Writing lemmatized text to file... ./txt/skazka-o-troyke.lemmatized.txt
Book done
Name: Последнее путешествие Ийона Тихого
Opening original text from file... ./txt/poslednee-puteshestvie-Iyona-Tihogo.src.txt
Lemmatizing text...
Loading the model...
Processing text...
Text done
Writing lemmatized text to file... ./txt/poslednee-puteshestvie-Iyona-Tihogo.lemmatized.txt
Book done
Name: С нами бот
Opening original text from file... ./txt/s-nami-bot.src.txt
Lemmatizing text...
Loading the model...
Processing text...


NO LEMMA OR NO TOKEN!


Text done
Writing lemmatized text to file... ./txt/s-nami-bot.lemmatized.txt
Book done
Name: Юмористические рассказы
Opening original text from file... ./txt/yumoristicheskie-rasskazy(Teffi).src.txt
Lemmatizing text...
Loading the model...
Processing text...


NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!


Text done
Writing lemmatized text to file... ./txt/yumoristicheskie-rasskazy(Teffi).lemmatized.txt
Book done
Name: Заповедник
Opening original text from file... ./txt/zapovednik.src.txt
Lemmatizing text...
Loading the model...
Processing text...
Text done
Writing lemmatized text to file... ./txt/zapovednik.lemmatized.txt
Book done
Name: Америка
Opening original text from file... ./txt/аmerika.src.txt
Lemmatizing text...
Loading the model...
Processing text...
Text done
Writing lemmatized text to file... ./txt/аmerika.lemmatized.txt
Book done
Name: Замок
Opening original text from file... ./txt/zamok.src.txt
Lemmatizing text...
Loading the model...
Processing text...


NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!


Text done
Writing lemmatized text to file... ./txt/zamok.lemmatized.txt
Book done
Name: Трое в лодке, не считая собаки
Opening original text from file... ./txt/troe-v-lodke-ne-schitaya-sobaki.src.txt
Lemmatizing text...
Loading the model...
Processing text...
Text done
Writing lemmatized text to file... ./txt/troe-v-lodke-ne-schitaya-sobaki.lemmatized.txt
Book done
Name: Золотой теленок
Opening original text from file... ./txt/zolotoy-telenok.src.txt
Lemmatizing text...
Loading the model...
Processing text...


NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!


Text done
Writing lemmatized text to file... ./txt/zolotoy-telenok.lemmatized.txt
Book done
Name: Сукины дети
Opening original text from file... ./txt/sukiny-deti.src.txt
Lemmatizing text...
Loading the model...
Processing text...
Text done
Writing lemmatized text to file... ./txt/sukiny-deti.lemmatized.txt
Book done
Name: Карамора
Opening original text from file... ./txt/karamora.src.txt
Lemmatizing text...
Loading the model...


NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!


Processing text...
Text done
Writing lemmatized text to file... ./txt/karamora.lemmatized.txt
Book done
Name: Приключения барона Мюнхаузена
Opening original text from file... ./txt/priklyucheniya-barona-Myunghauzena.src.txt
Lemmatizing text...
Loading the model...
Processing text...
Text done
Writing lemmatized text to file... ./txt/priklyucheniya-barona-Myunghauzena.lemmatized.txt
Book done
Name: Июнь
Opening original text from file... ./txt/iyun.src.txt
Lemmatizing text...
Loading the model...
Processing text...


NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!


Text done
Writing lemmatized text to file... ./txt/iyun.lemmatized.txt
Book done
Name: Собачье сердце
Opening original text from file... ./txt/sobache-serdce.src.txt
Lemmatizing text...
Loading the model...
Processing text...
Text done
Writing lemmatized text to file... ./txt/sobache-serdce.lemmatized.txt
Book done
Name: Вавилонские хроники / Обретение Энкиду
Opening original text from file... ./txt/vavilonskie-hroniki-obretenie-enkidu.src.txt
Lemmatizing text...
Loading the model...
Processing text...
Text done
Writing lemmatized text to file... ./txt/vavilonskie-hroniki-obretenie-enkidu.lemmatized.txt
Book done


NO LEMMA OR NO TOKEN!
