In [119]:
from pymystem3 import Mystem
import re
import time
import os
from gensim.models.keyedvectors import KeyedVectors
from gensim.models import Word2Vec

In [20]:
texts = [
    "я такой крутой пришел домой",
    "мама мыла - раму ; рама что то еще делала",
    "моя рыба рубачит лучше меня"
]

In [22]:
mystem_part = {
    'A': 'ADJ', # прилагательное
    'ADV': 'ADV', # наречие
    'ADVPRO': 'ADV', # местоименное наречие
    'ANUM': 'ADJ', # числительное-прилагательное
    'APRO': 'DET', # местоимение-прилагательное
    'COM': 'ADJ', # часть композита - сложного слова
    'CONJ': 'SCONJ', # союз
    'INTJ': 'INTJ',	# междометие
    'NUM': 'NUM', # числительное
    'PART': 'PART',	# частица
    'PR': 'ADP', # предлог
    'S': 'NOUN', # существительное
    'SPRO': 'PRON',
    'V': 'VERB', # глагол
}

In [8]:
def load_stopwords():
    stopwords = set()

    with open('../../res/stopwords.txt', mode="r", encoding="utf8") as file:
        for line in file:
            stopwords.add(line.replace('\n', ''))

    return stopwords

In [89]:
def __mystem_analisys__(mystem, text):
    try:   
        res = mystem.analyze(text)
        stem_res = []
        for r in res:
            analysis = r['analysis']
            try:
                gr = analysis[0]
                lex = analysis[0]['lex']
                parts = gr['gr'].split("=")
                parts2 = parts[0].split(",")
                part = parts2[0]
                lex_pas = "%s_%s" % (lex, mystem_part[part])
                stem_res.append((lex, lex_pas))
            except Exception as e:
                stem_res.append((r['text'], r['text']))
        return stem_res
    except Exception as e:
        return list(map(lambda x: (x, x), mystem.lemmatize(text)))
        
    return stem_res

In [90]:
def nomalize(texts, with_pos=False):
    stopwords = load_stopwords()
    r = re.compile('^[А-ЯЙа-яй]*$')
    mystem = Mystem(entire_input=False)
    tokens_corpuse = []
    for text in texts:
        words = filter(lambda x: r.match(x[0]), __mystem_analisys__(mystem, text))
        tokens = filter(lambda w: w[0] not in stopwords, words)
        
        tokens_res = []
        if with_pos:
            tokens_res = list(map(lambda x: x[1], tokens))
        else:
            tokens_res = list(map(lambda x: x[0], tokens))
        
        tokens_corpuse.append(tokens_res)
    return tokens_corpuse

In [99]:
def init_word3vec():
    model_path = '/data/gensim/news_0_300_2.bin.gz'
    word_vectors = KeyedVectors.load_word2vec_format(model_path, binary=True)
    word_vectors.init_sims(replace=True)
    return word_vectors

In [100]:
def similar(word_vectors, word):
    try:
        sim = word_vectors.most_similar(positive=[w])
        return list(map(lambda x: x[0], sim))
    except Exception as e:
        return [word]

In [116]:
def extractSemanticGroup(tokens_matrix, with_pos=False):
    start_time = time.time()
    word_vectors = init_word3vec()
    print("init time: %s" % (time.time() - start_time))
    res_seq = []
    sem_groups = {}
    
    start_time = time.time()

    for words in tokens_matrix:
        res_tokens = []
        for token in words:
            if token not in sem_groups:
                sims = similar(word_vectors, token)
                sem_groups[token] = token
                res_tokens.append(token)
                for sim in sims:
                   sem_groups[sim] = token
            else:
                res_tokens.append(sem_groups[token])
        res_seq.append(res_tokens)
    print("for time: %s" % (time.time() - start_time))
    if not with_pos:
        res_seq = map(lambda x: list(map(lambda y: y.split("_")[0], x)), res_seq)

    return list(res_seq)

In [117]:
norm_res = nomalize(texts, True)

In [120]:
extractSemanticGroup(norm_res)

init time: 9.179507970809937
for time: 2.4080276489257812e-05


[['крутой', 'приходить', 'домой'],
 ['мама', 'мыть', 'рама', 'рама', 'делать'],
 ['рыба', 'рубачить']]