In [1]:
import json
import os
import sys

from ufal.udpipe import Model, Pipeline
#import re
#import wget

In [2]:
# Disabled one-off download of the model file via wget (kept for reference):
#udpipe_url = 'https://rusvectores.org/static/models/udpipe_syntagrus.model'
#modelfile = wget.download(udpipe_url)
# Path of the UDPipe model file; it is relative, so it is resolved against the
# notebook's working directory.
modelfile = 'udpipe_syntagrus.model'

In [3]:
# Locations of the book corpus and of the JSON index that lists the books.
dataFolderPath = '../data'
booksFolderPath = dataFolderPath + '/books'
booksJsonFilePath = booksFolderPath + '/books.json'

# `with` closes the file even when parsing fails. The encoding is given explicitly
# so that the Cyrillic book names decode the same way on every platform instead of
# depending on the locale default.
with open(booksJsonFilePath, encoding='utf-8') as booksJsonFile:
    booksJson = json.load(booksJsonFile)

# Show the titles of all books found in the index.
for bookJson in booksJson:
    print(bookJson['name'])

Война и мир. Том I
Идиот
Мастер и Маргарита
Палата № 6
Повесть о разуме
Что делать?
Отцы и дети
Обломов
Старууха Изергиль. Карамола.
Аэлита


In [4]:
def num_replace(word):
    """Return a placeholder of ``'x'`` characters with the same length as ``word``."""
    return ''.join(['x'] * len(word))

def clean_token(token, misc):
    """Normalise a surface token.

    Returns ``None`` for the literal token ``'Файл'`` when ``misc`` contains
    ``'SpaceAfter=No'``; otherwise returns ``token`` with surrounding whitespace
    stripped and every space character removed.
    """
    # NOTE(review): the 'Файл' ("File") rule presumably drops markup remnants such as
    # "Файл:..." - confirm against the input data.
    if token == 'Файл' and 'SpaceAfter=No' in misc:
        return None
    return ''.join(token.strip().split(' '))


def clean_lemma(lemma, pos):
    """Normalise a lemma; return ``None`` when it should be discarded.

    The lemma is stripped, spaces and underscores are removed and the result is
    lower-cased. Lemmas containing ``'|'`` or ending in ``.jpg`` / ``.png`` are
    rejected. For anything that is not tagged ``PUNCT``, one leading and one trailing
    guillemet and then one trailing ``!``, ``?``, ``,`` or ``.`` are removed.
    """
    normalized = lemma.strip().replace(' ', '').replace('_', '').lower()

    if '|' in normalized or normalized.endswith(('.jpg', '.png')):
        return None

    # Punctuation lemmas are returned as they are.
    if pos == 'PUNCT':
        return normalized

    guillemets = ('«', '»')
    if normalized.startswith(guillemets):
        normalized = normalized[1:]
    if normalized.endswith(guillemets):
        normalized = normalized[:-1]
    if normalized.endswith(('!', '?', ',', '.')):
        normalized = normalized[:-1]
    return normalized

In [5]:
# Ordered (MISC marker, whitespace) pairs. The first marker contained in MISC wins, so a
# longer marker must be listed before any marker that is its prefix.
_SPACES_AFTER_MARKERS = (
    ('SpacesAfter=\\s\\s\\s', '   '),
    ('SpacesAfter=\\s\\s', '  '),
    # NOTE(review): "\s\n" is rendered as two spaces plus a newline; kept unchanged.
    ('SpacesAfter=\\s\\n', '  \n'),
    ('SpacesAfter=\\s', ' '),
    ('SpacesAfter=\\n', '\n'),
    ('SpaceAfter=No', ''),
)


def _spaces_after(misc):
    """Return the whitespace to append after a token, chosen from its MISC field."""
    for marker, whitespace in _SPACES_AFTER_MARKERS:
        if marker in misc:
            return whitespace
    return ' '  # no known marker: a single separating space


def process_direct(pipeline, text='Строка', keep_pos=True, keep_punct=False):
    """Tag ``text`` with ``pipeline`` and return one formatted string per token.

    Args:
        pipeline: object whose ``process(text)`` returns CoNLL-U formatted text.
        text: raw text to analyse.
        keep_pos: when false, each item is cut at its first ``'_'``, leaving only
            the lemma.
        keep_punct: intended to drop ``PUNCT`` tokens when false (see the note in
            the body: with the current item format nothing is dropped).

    Returns:
        A list of strings of the form ``'<lemma>_<POS>~<word><whitespace>|'``, where
        ``<word>`` is the token as it appears in the text and ``<whitespace>`` is
        derived from the token's MISC field. Rows that do not have exactly ten
        tab-separated fields, and tokens whose cleaned lemma or token is empty, are
        reported on stderr and skipped.
    """
    # Process the text; the result is in CoNLL-U format.
    processed = pipeline.process(text)

    tagged_propn = []
    for line in processed.split('\n'):
        # Skip empty lines and the '#' lines that carry service information.
        if not line or line.startswith('#'):
            continue

        fields = line.split('\t')
        if len(fields) != 10:
            print('STRANGE TAG IN TAGGED!', file=sys.stderr)
            continue

        # Of the ten columns only word form, lemma, POS tag and MISC are used.
        word, lemma, pos, misc = fields[1], fields[2], fields[3], fields[9]
        token = clean_token(word, misc)
        lemma = clean_lemma(lemma, pos)
        if not lemma or not token:
            print('NO LEMMA OR NO TOKEN!', file=sys.stderr)
            continue

        if pos == 'NUM' and token.isdigit():
            # Replace numbers with an 'xxxxx' placeholder of the same length.
            lemma = num_replace(token)

        # The chunk keeps the raw word (not the cleaned token) plus the whitespace
        # that follows it.
        chunk = word + _spaces_after(misc)
        tagged_propn.append('%s_%s~%s|' % (lemma, pos, chunk))

    if not keep_punct:
        # NOTE(review): every item looks like 'lemma_POS~chunk|', so the second
        # '_'-separated part always contains '~' and never equals 'PUNCT'; this
        # filter therefore removes nothing. It is kept as it was because the files
        # produced with the default arguments include punctuation chunks - confirm
        # the intended behaviour before tightening it.
        tagged_propn = [word for word in tagged_propn if word.split('_')[1] != 'PUNCT']
    if not keep_pos:
        tagged_propn = [word.split('_')[0] for word in tagged_propn]
    return tagged_propn


In [6]:
def lemmatize_text(text, modelfile=modelfile):
    """Lemmatise ``text`` with the UDPipe model stored in ``modelfile``.

    Args:
        text: raw text to process.
        modelfile: path of the UDPipe model file; defaults to the value the global
            ``modelfile`` had when this function was defined.

    Returns:
        The items returned by ``process_direct`` concatenated into one string.

    Raises:
        RuntimeError: if the model cannot be loaded from ``modelfile``.
    """
    print('Loading the model...')
    model = Model.load(modelfile)
    # Model.load signals failure with a falsy result instead of raising, so check it
    # here rather than failing obscurely inside Pipeline.
    if not model:
        raise RuntimeError('Cannot load UDPipe model from file: %s' % modelfile)
    process_pipeline = Pipeline(model, 'tokenize', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')

    print('Processing text...')
    output = process_direct(process_pipeline, text=text)
    print('Text done')
    return ''.join(output)

In [7]:
# Lemmatise every book listed in the index and store the result next to the original.
for bookJson in booksJson:
    name = bookJson['name']
    print('Name: ' + name)
    original_text_path = bookJson['originalTextPath']
    lemmatized_text_path = bookJson['lemmatizedTextPath']

    print('Opening original text from file... ' + original_text_path)
    # The index stores paths such as './txt/...'. Plain concatenation would give
    # '../data/books./txt/...', which does not name the books folder on POSIX
    # systems, so the parts are joined with a separator.
    with open(os.path.join(booksFolderPath, original_text_path), 'r',
              encoding='utf-8') as original_text_file_handle:
        original_text = original_text_file_handle.read()

    print('Lemmatizing text...')
    lemmatized_text = lemmatize_text(original_text, modelfile)

    # Written as UTF-8 to match how the original text is read, instead of relying on
    # the platform's default encoding.
    with open(os.path.join(booksFolderPath, lemmatized_text_path), 'w',
              encoding='utf-8') as lemmatized_text_file_handle:
        print('Writing lemmatized text to file... ' + lemmatized_text_path)
        lemmatized_text_file_handle.write(lemmatized_text)
    print('Book done')

Name: Война и мир. Том I
Opening original text from file... ./txt/1. voyna-i-mir-tom-1.txt
Lemmatizing text...
Loading the model...
Processing text...
Text done
Writing lemmatized text to file... ./txt/1. voyna-i-mir-tom-1.lemmatized.txt
Book done
Name: Идиот
Opening original text from file... ./txt/2. idiot.txt
Lemmatizing text...
Loading the model...
Processing text...


NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!


Text done
Writing lemmatized text to file... ./txt/2. idiot.lemmatized.txt
Book done
Name: Мастер и Маргарита
Opening original text from file... ./txt/3. master-i-margarita.txt
Lemmatizing text...
Loading the model...
Processing text...


NO LEMMA OR NO TOKEN!


Text done
Writing lemmatized text to file... ./txt/3. master-i-margarita.lemmatized.txt
Book done
Name: Палата № 6
Opening original text from file... ./txt/4. palata-6-sbornik.txt
Lemmatizing text...
Loading the model...
Processing text...
Text done
Writing lemmatized text to file... ./txt/4. palata-6-sbornik.lemmatized.txt
Book done
Name: Повесть о разуме
Opening original text from file... ./txt/5. povest-o-razume.txt
Lemmatizing text...
Loading the model...
Processing text...
Text done


NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!


Writing lemmatized text to file... ./txt/5. povest-o-razume.lemmatized.txt
Book done
Name: Что делать?
Opening original text from file... ./txt/6. сhto-delat.txt
Lemmatizing text...
Loading the model...
Processing text...

NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!



Text done
Writing lemmatized text to file... ./txt/6. сhto-delat.lemmatized.txt
Book done
Name: Отцы и дети
Opening original text from file... ./txt/7. otcy-i-dety.txt
Lemmatizing text...
Loading the model...
Processing text...


NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!


Text done
Writing lemmatized text to file... ./txt/7. otcy-i-dety.lemmatized.txt
Book done
Name: Обломов
Opening original text from file... ./txt/8. oblomov.txt
Lemmatizing text...
Loading the model...
Processing text...


NO LEMMA OR NO TOKEN!
NO LEMMA OR NO TOKEN!


Text done
Writing lemmatized text to file... ./txt/8. oblomov.lemmatized.txt
Book done
Name: Старууха Изергиль. Карамола.
Opening original text from file... ./txt/9. staruha-isergil.karamola.txt
Lemmatizing text...
Loading the model...
Processing text...
Text done
Writing lemmatized text to file... ./txt/9. staruha-isergil.karamola.lemmatized.txt
Book done
Name: Аэлита
Opening original text from file... ./txt/10. aelita.txt
Lemmatizing text...
Loading the model...
Processing text...
Text done
Writing lemmatized text to file... ./txt/10. aelita.lemmatized.txt
Book done
