In [1]:
import pandas as pd
from itertools import chain
from pymystem3 import Mystem
from ai.nn.data.datasets import ListDataset, ApplyPreprocessors, to_list_dataset
from ai.nn.data.preprocessors import RemoveEmojis, RemoveHTML, RemoveSymbols
from joblib import Parallel, delayed

In [2]:
# Read only the comment-text column of the workbook and discard rows with no
# text. The surviving rows keep their original index labels, which a later
# cell relies on to align the lemmas with the full table.
data = pd.read_excel("data.xlsx")["snippet.textOriginal"].dropna()

In [3]:
# Literal substrings handed to RemoveSymbols. Both spellings of each escape
# are listed: "\t" is the actual control character, while "\\t" is the
# two-character text backslash + "t". Order and repeated entries are kept
# exactly as they were.
# NOTE(review): if RemoveSymbols applies the entries one after another, "//"
# can never match because "/" comes before it — confirm against RemoveSymbols.
_symbols_to_remove = [
    "\\t", "\\r",
    "\a", "\b", "\f", "\t", "\v", "\r", "\n",
    "\\n", "/",
    "\\a", "\\b", "\\f", "\\t", "\\v", "\\r",
    "//", "\\",
    "[", "]", "\"", "`",
    "\n",
]

# Cleaning steps applied, in this order, to every raw comment.
default_preprocessors = [RemoveEmojis(), RemoveHTML(), RemoveSymbols(_symbols_to_remove)]

In [4]:
# Wrap the raw comment values in a ListDataset and attach the default
# preprocessors to it.
_raw_comments = ListDataset(list(data))
dataset = ApplyPreprocessors(_raw_comments, default_preprocessors)

# NOTE(review): to_list_dataset presumably evaluates the preprocessing into a
# concrete ListDataset whose `.array` holds the cleaned texts — confirm.
comments = to_list_dataset(dataset).array

In [5]:
# Upper bound on the number of comments handed to lemmatize_batch in one call.
BATCH_SIZE = 1024

# Cut `comments` into consecutive slices of at most BATCH_SIZE items; the
# last slice is shorter when the total is not a multiple of BATCH_SIZE.
_batch_starts = range(0, len(comments), BATCH_SIZE)
comments_batched = [comments[start:start + BATCH_SIZE] for start in _batch_starts]

In [6]:
def lemmatize_batch(batch):
    """Lemmatize every text in ``batch`` with a freshly created Mystem analyzer.

    A new ``Mystem`` instance is created on each call, so the function does
    not depend on any analyzer state shared between calls.

    Args:
        batch: iterable of strings to lemmatize.

    Returns:
        list[str]: one string per input text, in input order. Each string is
        the concatenation of the tokens ``Mystem.lemmatize`` returned for that
        text, with trailing newline characters removed.
    """
    stem = Mystem()
    lemmatized = []
    for text in batch:
        tokens = stem.lemmatize(text)
        # pymystem3 ends each result with a "\n" token (see its README
        # example). Strip it so the newline does not leak into the output
        # column; rstrip is a no-op when no trailing newline is present.
        lemmatized.append("".join(tokens).rstrip("\n"))
    return lemmatized

In [7]:
# Run lemmatize_batch over the batches with three joblib workers. The result
# holds one list of lemmatized strings per batch.
_lemmatize_task = delayed(lemmatize_batch)
processed = Parallel(n_jobs=3)(_lemmatize_task(batch) for batch in comments_batched)

In [8]:
# Flatten the per-batch lists into one flat list of lemmatized strings.
# This cell rebinds `processed`, so a second run would receive the already
# flat list and `chain` would split every string into single characters.
# Wrapping bare strings keeps them intact, which makes the cell safe to re-run.
processed = list(chain.from_iterable(
    [item] if isinstance(item, str) else item
    for item in processed
))

In [9]:
# One-column frame of lemmatized text carrying the row labels of the
# non-empty comments, so it can be aligned with the full table by index.
# Naming the column and setting the index in the constructor (rather than
# assigning them afterwards) also works when `processed` is empty, where a
# column-less frame could not be given a column name.
# pandas raises ValueError if len(processed) differs from len(data.index).
_lemmas = pd.DataFrame({"lemmas": processed}, index=data.index)

In [10]:
# Load the complete workbook (every column); the lemmas are attached to this
# frame in the next cell.
full_data = pd.read_excel(io="data.xlsx")

In [11]:
# Attach the lemmas to the full table by row label. Rows whose comment text
# was missing have no entry in `_lemmas` and get NaN in the "lemmas" column.
# This cell rebinds `full_data`, so a "lemmas" column left over from a
# previous run is dropped first; otherwise re-running the cell would produce
# "lemmas_x" / "lemmas_y" instead of a single "lemmas" column.
# NOTE(review): this also discards a "lemmas" column that data.xlsx itself
# might contain — confirm the workbook has no such column.
full_data = pd.merge(
    full_data.drop(columns=["lemmas"], errors="ignore"),
    _lemmas,
    left_index=True,
    right_index=True,
    how="outer",
)

In [12]:
# Write the combined table (original columns plus "lemmas") to a new workbook.
full_data.to_excel(excel_writer="processed.xlsx")