In [10]:
import collections
import os
import glob
import numpy as np
import math
import re
import pandas as pd
from tqdm import tqdm

Автоматический поиск произведений в папках texts/converted/\*\* (вместо \*\* — имя автора) и создание словарей *название-путь\_к\_файлу* (`TITLE_PATH_DICT`), *название-автор* (`TITLE_AUTHOR_DICT`), *автор-множество\_названий\_произведений* (`AUTHOR__TITLE_SET__DICT`).

In [15]:
# Every .txt file below texts/converted/, at any depth.
# glob returns matches in arbitrary (filesystem-dependent) order, so sort them
# to make everything derived from this list reproducible across machines.
FILE_PATHS = sorted(glob.glob('texts/converted/**/*.txt', recursive=True))

In [16]:
def _title_and_author(path):
    """Return ``(title, author)`` for a text file path.

    The title is the file name without its final extension and the author is
    the name of the directory that directly contains the file. ``os.path`` is
    used instead of splitting on a hard-coded backslash so the lookup works
    with whatever separator the current platform (and glob) produces.
    """
    normalized = os.path.normpath(path)
    # splitext removes only the last extension, so dots inside a title survive.
    title = os.path.splitext(os.path.basename(normalized))[0]
    author = os.path.basename(os.path.dirname(normalized))
    return title, author


# title -> path of the file holding that text.
# NOTE(review): titles are used as keys on their own, so two files with the
# same name under different authors overwrite each other — confirm this
# cannot happen in the corpus.
TITLE_PATH_DICT = {_title_and_author(path)[0]: path for path in FILE_PATHS}
# title -> author (name of the containing directory).
TITLE_AUTHOR_DICT = dict(_title_and_author(path) for path in FILE_PATHS)

In [17]:
# Invert the title -> author mapping into author -> set of that author's titles.
AUTHOR__TITLE_SET__DICT = {}
for title, author in TITLE_AUTHOR_DICT.items():
    # setdefault creates the author's set the first time the author is seen.
    AUTHOR__TITLE_SET__DICT.setdefault(author, set()).add(title)

### Лемматизация и токенизация

In [9]:
# Number of lemmatized tokens written to each saved chunk file.
NUM_OF_WORDS = 5000

In [7]:
# Output root for the lemmatized chunks; one sub-folder per author is created
# below it. The chunk size is taken from NUM_OF_WORDS so that the folder name
# cannot drift from the actual chunk length.
NEW_FOLDER = f'texts/{NUM_OF_WORDS}_words_lemmatized/'

In [8]:
import re

from pymorphy2 import MorphAnalyzer
from nltk.corpus import stopwords

# Regex matching runs of characters that are replaced by a space before
# tokenizing: Latin letters, digits, ASCII punctuation, dashes and quotes.
# Written as a raw string with "[" escaped inside the class, so no invalid
# string escapes or "possible nested set" warnings are raised; the set of
# matched characters is unchanged.
patterns = r"[A-Za-z0-9!#$%&'()*+,.…/:;<=>?@\[\]^_`{|}~—–\"«»„“\-]+"
# Single MorphAnalyzer instance created once at module level;
# lemmatize() calls morph.normal_forms() on it for every token.
morph = MorphAnalyzer()

def lemmatize(doc, remove_stop_words=False):
    """Tokenize ``doc`` and return the normal form of every token.

    Characters matched by the module-level ``patterns`` regex are replaced
    with spaces, the text is split on whitespace, and each remaining token is
    replaced by the first normal form reported by ``morph``.

    Args:
        doc: Text to process.
        remove_stop_words: When true, tokens found in NLTK's Russian
            stop-word list are dropped before lemmatization.

    Returns:
        List of lemmas in their original order.
    """
    doc = re.sub(patterns, ' ', doc)
    # A set gives constant-time membership tests; the empty set disables
    # filtering without relying on substring semantics of an empty string.
    if remove_stop_words:
        stopwords_ru = frozenset(stopwords.words("russian"))
    else:
        stopwords_ru = frozenset()

    tokens = []
    # str.split() never yields empty or whitespace-padded items, so the
    # tokens need no additional strip().
    for token in doc.split():
        # Compare in lower case so capitalised stop words (e.g. at the start
        # of a sentence) are filtered as well.
        # NOTE(review): assumes the NLTK list itself is lower-case — confirm.
        if token.lower() in stopwords_ru:
            continue
        tokens.append(morph.normal_forms(token)[0])
    return tokens

In [12]:
# For every author: lemmatize each text, concatenate the tokens and save them
# as consecutive NUM_OF_WORDS-token chunks (0.npy, 1.npy, ...) under
# NEW_FOLDER/<author>/. Tokens left over after the author's last full chunk
# are not saved.
with tqdm(total=len(TITLE_PATH_DICT)) as pbar:
    for author, title_set in AUTHOR__TITLE_SET__DICT.items():
        i = 0  # index of the next chunk file for this author
        lemmatized = []  # tokens carried over between this author's texts
        new_dir = os.path.join(NEW_FOLDER, author)
        os.makedirs(new_dir, exist_ok=True)
        # Iterate titles in sorted order: the iteration order of a set of
        # strings can change between interpreter runs, and because leftover
        # tokens spill into the next text, that would change chunk contents.
        for title in sorted(title_set):
            path = TITLE_PATH_DICT[title]
            with open(path, 'r', encoding="utf8") as infile:
                text = infile.read()
            # The last 250 tokens of every text are discarded.
            # NOTE(review): presumably trims trailing non-author matter — confirm.
            lemmatized += lemmatize(text)[:-250]
            while len(lemmatized) >= NUM_OF_WORDS:
                np.save(os.path.join(new_dir, f"{i}.npy"), lemmatized[:NUM_OF_WORDS])
                del lemmatized[:NUM_OF_WORDS]
                i += 1
            pbar.update(1)

100%|████████████████████████████████████████████████████████████████████████████████| 131/131 [13:43<00:00,  6.29s/it]


### Нарезка по N предложений

In [12]:
# Number of sentences written to each saved chunk file.
NUM_OF_SENTENCES = 500

In [13]:
# Output root for the sentence chunks; one sub-folder per author is created
# below it. NOTE: this rebinds NEW_FOLDER, which the lemmatization section
# also assigns, so the folder used by a loop depends on which assignment ran
# last.
NEW_FOLDER = f'texts/{NUM_OF_SENTENCES}_sentences/'

In [19]:
def get_ind_of_sentence_end(string, start=0):
    """Return the index of the first sentence terminator at or after ``start``.

    The terminators are ``.``, ``?``, ``!`` and ``…``. The returned index is
    ``start`` plus the offset of the terminator within ``string[start:]``.
    Returns ``None`` when no terminator is found.
    """
    terminator_positions = (
        start + offset
        for offset, char in enumerate(string[start:])
        if char in ".?!…"
    )
    # next() with a default yields the first hit, or None for no hit at all.
    return next(terminator_positions, None)

def string_to_list_of_sentences(string):
    """Split ``string`` into sentences, each ending with its terminator(s).

    A sentence ends at the first ``.``, ``?``, ``!`` or ``…`` (the same
    terminators that ``get_ind_of_sentence_end`` looks for). A terminator that
    immediately follows another one (``...``, ``?!``) is appended to the
    previous sentence instead of starting a new one. Whitespace after a
    terminator stays at the start of the next sentence, and any text after the
    last terminator is dropped.

    The text is scanned once with a moving start index rather than being
    re-sliced after every sentence, which avoids copying the whole remainder
    of the text for each sentence found.

    Args:
        string: Text to split.

    Returns:
        List of sentence strings; empty if ``string`` has no terminator.
    """
    sentences = []
    sentence_start = 0
    for ind, char in enumerate(string):
        if char not in ".?!…":
            continue
        sentence = string[sentence_start:ind + 1]
        if len(sentence) == 1 and sentences:
            # Lone terminator right after another one: glue it to the
            # previous sentence ("Wait." + "." + "." -> "Wait...").
            sentences[-1] += sentence
        else:
            sentences.append(sentence)
        sentence_start = ind + 1
    return sentences

In [20]:
# For every author: split each text into sentences, concatenate them and save
# them as consecutive NUM_OF_SENTENCES-sentence chunks (0.npy, 1.npy, ...)
# under NEW_FOLDER/<author>/. Sentences left over after the author's last full
# chunk are not saved.
with tqdm(total=len(TITLE_PATH_DICT)) as pbar:
    for author, title_set in AUTHOR__TITLE_SET__DICT.items():
        i = 0  # index of the next chunk file for this author
        sentences = []  # sentences carried over between this author's texts
        new_dir = os.path.join(NEW_FOLDER, author)
        os.makedirs(new_dir, exist_ok=True)
        # Iterate titles in sorted order: the iteration order of a set of
        # strings can change between interpreter runs, and because leftover
        # sentences spill into the next text, that would change chunk contents.
        for title in sorted(title_set):
            path = TITLE_PATH_DICT[title]
            with open(path, 'r', encoding="utf8") as infile:
                text = infile.read()
            # The last 10 sentences of every text are discarded.
            # NOTE(review): presumably trims trailing non-author matter — confirm.
            sentences += string_to_list_of_sentences(text)[:-10]
            while len(sentences) >= NUM_OF_SENTENCES:
                np.save(os.path.join(new_dir, f"{i}.npy"), sentences[:NUM_OF_SENTENCES])
                del sentences[:NUM_OF_SENTENCES]
                i += 1
            pbar.update(1)

100%|████████████████████████████████████████████████████████████████████████████████| 131/131 [02:07<00:00,  1.03it/s]
