In [None]:
pip install gensim spacy



In [None]:
!python -m spacy download "ru_core_news_sm"

Collecting ru-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.8.0/ru_core_news_sm-3.8.0-py3-none-any.whl (15.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.3/15.3 MB[0m [31m29.9 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('ru_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import gensim.downloader as api
import spacy

nlp = spacy.load("ru_core_news_sm")

w2v = api.load(name="word2vec-ruscorpora-300")

PATH_TO_BOOK = "./book.txt"



In [12]:
def get_book_text(*, path: str) -> str:
  with open(file=path, mode="r", encoding="utf8") as file:
    return file.read()

def normolize_text(*, text: str) -> str:
  return text.replace("\n", "").lower()

In [None]:
from collections import Counter

tokens_counter = Counter()

def get_text_tokens(*, text: str) -> list[str]:
  doc = nlp(text=text)

  lemma_tokens = [f"{token.lemma_}_{token.pos_}" for token in doc if not token.is_stop and not token.is_punct]

  tokens_counter.update(lemma_tokens)

  return lemma_tokens


def get_most_popular(*, counter_struct: Counter, count: int) -> list[tuple[str, int]]:
  return counter_struct.most_common(count)

def get_most_unpopular(*, counter_struct: Counter, count: int) -> list[tuple[str, int]]:
  return counter_struct.most_common()[-count:]

book_text = get_book_text(path=PATH_TO_BOOK)
normolized_text = normolize_text(text=book_text)

tokenized_text = get_text_tokens(text=normolized_text)

five_most_popular = get_most_popular(
    counter_struct=tokens_counter,
    count=5
    )

five_most_unpopular = get_most_unpopular(
    counter_struct=tokens_counter,
    count=5
    )

print(five_most_popular)
print(five_most_unpopular)

[('коля_NOUN', 155), ('сказать_VERB', 123), ('лейтенант_NOUN', 65), ('товарищ_NOUN', 63), ('коля_PROPN', 62)]
[('променять_VERB', 1), ('оголенное_ADJ', 1), ('тоска_NOUN', 1), ('нестерпимо_ADV', 1), ('захотеться_VERB', 1)]


In [23]:
def find_most_similar_and_unsimilar_words(*, word: str, topn: int = 5) -> tuple[list[tuple[str, float]], list[tuple[str, float]]]:
  if word in w2v:
    most_similar = w2v.most_similar(positive=[word], topn=topn)
    most_unsimilar = w2v.most_similar(negative=[word], topn=topn)
    return most_similar, most_unsimilar
  else:
    return [], []


most_popular_similar_unsimilar = {
    token[0]: find_most_similar_and_unsimilar_words(word=token[0])
    for token in five_most_popular
}

most_unpopular_similar_unsimilar = {
    token[0]: find_most_similar_and_unsimilar_words(word=token[0])
    for token in five_most_unpopular
}

print(most_unpopular_similar_unsimilar)

print(most_popular_similar_unsimilar)

{'променять_VERB': ([('отписной_ADJ', 0.6108760833740234), ('покуплять_VERB', 0.5565236210823059), ('важский_ADJ', 0.5522485375404358), ('сазиков_NOUN', 0.5511545538902283), ('харатейный_ADJ', 0.5497941970825195)], [('шкурничество_NOUN', 0.1632651686668396), ('генерал-лейтенант::кейт_NOUN', 0.15048211812973022), ('стюардесса_NOUN', 0.14454029500484467), ('подташнивать_VERB', 0.14320816099643707), ('недисциплинированность_NOUN', 0.14041376113891602)]), 'оголенное_ADJ': ([], []), 'тоска_NOUN': ([('грусть_NOUN', 0.7532758116722107), ('тоскливый_ADJ', 0.7276431322097778), ('скука_NOUN', 0.70046067237854), ('отчаяние_NOUN', 0.6835792064666748), ('печаль_NOUN', 0.6749533414840698)], [('кимрский_ADJ', 0.11602100729942322), ('юнкор_NOUN', 0.09584140032529831), ('электротехнологический::персонал_NOUN', 0.09323256462812424), ('патентовед_NOUN', 0.09309279918670654), ('cit_NOUN', 0.08841274678707123)]), 'нестерпимо_ADV': ([('невыносимый_ADJ', 0.7199772596359253), ('нестерпимый_ADJ', 0.65321838855