In [201]:
import pandas as pd
import numpy as np
import re

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.preprocessing import MinMaxScaler

import nltk
# from nltk import word_tokenize - –Ω—É–∂–Ω–æ nltk.download('punkt')

from nltk import wordpunct_tokenize, wordnet
from nltk.stem import wordnet as WordNetLem
from nltk.stem import SnowballStemmer, StemmerI

import gensim
from gensim.corpora import Dictionary
from gensim.models import doc2vec

In [179]:
stop_words = \
    ['–∏', '–≤', '–≤–æ', '–Ω–µ', '—á—Ç–æ', '–æ–Ω', '–Ω–∞', '—è', '—Å', '—Å–æ', '–∫–∞–∫', '–∞', '—Ç–æ', '–≤—Å–µ', '–æ–Ω–∞', '—Ç–∞–∫', '–µ–≥–æ', 
     '–Ω–æ', '–¥–∞', '—Ç—ã', '–∫', '—É', '–∂–µ', '–≤—ã', '–∑–∞', '–±—ã', '–ø–æ', '—Ç–æ–ª—å–∫–æ', '–µ–µ', '–º–Ω–µ', '–±—ã–ª–æ', '–≤–æ—Ç', '–æ—Ç', 
     '–º–µ–Ω—è', '–µ—â–µ', '–Ω–µ—Ç', '–æ', '–∏–∑', '–µ–º—É', '—Ç–µ–ø–µ—Ä—å', '–∫–æ–≥–¥–∞', '–¥–∞–∂–µ', '–Ω—É', '–≤–¥—Ä—É–≥', '–ª–∏', '–µ—Å–ª–∏', '—É–∂–µ', 
     '–∏–ª–∏', '–Ω–∏', '–±—ã—Ç—å', '–±—ã–ª', '–Ω–µ–≥–æ', '–¥–æ', '–≤–∞—Å', '–Ω–∏–±—É–¥—å', '–æ–ø—è—Ç—å', '—É–∂', '–≤–∞–º', '–≤–µ–¥—å', '—Ç–∞–º', 
     '–ø–æ—Ç–æ–º', '—Å–µ–±—è', '–Ω–∏—á–µ–≥–æ', '–µ–π', '–º–æ–∂–µ—Ç', '–æ–Ω–∏', '—Ç—É—Ç', '–≥–¥–µ', '–µ—Å—Ç—å', '–Ω–∞–¥–æ', '–Ω–µ–π', '–¥–ª—è', '–º—ã', 
     '—Ç–µ–±—è', '–∏—Ö', '—á–µ–º', '–±—ã–ª–∞', '—Å–∞–º', '—á—Ç–æ–±', '–±–µ–∑', '–±—É–¥—Ç–æ', '—á–µ–≥–æ', '—Ä–∞–∑', '—Ç–æ–∂–µ', '—Å–µ–±–µ', '–ø–æ–¥', 
     '–±—É–¥–µ—Ç', '–∂', '—Ç–æ–≥–¥–∞', '–∫—Ç–æ', '—ç—Ç–æ—Ç', '—Ç–æ–≥–æ', '–ø–æ—Ç–æ–º—É', '—ç—Ç–æ–≥–æ', '–∫–∞–∫–æ–π', '—Å–æ–≤—Å–µ–º', '–Ω–∏–º', '–∑–¥–µ—Å—å', 
     '—ç—Ç–æ–º', '–æ–¥–∏–Ω', '–ø–æ—á—Ç–∏', '–º–æ–π', '—Ç–µ–º', '—á—Ç–æ–±—ã', '–Ω–µ–µ', '—Å–µ–π—á–∞—Å', '–±—ã–ª–∏', '–∫—É–¥–∞', '–∑–∞—á–µ–º', '–≤—Å–µ—Ö', 
     '–Ω–∏–∫–æ–≥–¥–∞', '–º–æ–∂–Ω–æ', '–ø—Ä–∏', '–Ω–∞–∫–æ–Ω–µ—Ü', '–¥–≤–∞', '–æ–±', '–¥—Ä—É–≥–æ–π', '—Ö–æ—Ç—å', '–ø–æ—Å–ª–µ', '–Ω–∞–¥', '–±–æ–ª—å—à–µ', '—Ç–æ—Ç', 
     '—á–µ—Ä–µ–∑', '—ç—Ç–∏', '–Ω–∞—Å', '–ø—Ä–æ', '–≤—Å–µ–≥–æ', '–Ω–∏—Ö', '–∫–∞–∫–∞—è', '–º–Ω–æ–≥–æ', '—Ä–∞–∑–≤–µ', '—Ç—Ä–∏', '—ç—Ç—É', '–º–æ—è', 
     '–≤–ø—Ä–æ—á–µ–º', '—Ö–æ—Ä–æ—à–æ', '—Å–≤–æ—é', '—ç—Ç–æ–π', '–ø–µ—Ä–µ–¥', '–∏–Ω–æ–≥–¥–∞', '–ª—É—á—à–µ', '—á—É—Ç—å', '—Ç–æ–º', '–Ω–µ–ª—å–∑—è', '—Ç–∞–∫–æ–π', 
     '–∏–º', '–±–æ–ª–µ–µ', '–≤—Å–µ–≥–¥–∞', '–∫–æ–Ω–µ—á–Ω–æ', '–≤—Å—é', '–º–µ–∂–¥—É']
stop_words.extend(['–æ—á–µ–Ω—å', '–æ–æ–æ—á–µ–Ω—å', '—ç—Ç–æ', '–¥–∞–Ω–Ω–æ–µ'])

# –ó–∞–≥—Ä—É–∑–∫–∞ –¥–∞–Ω–Ω—ã—Ö

In [163]:
df = pd.read_csv('data/coffee.csv')
df['rating'] = df['rating'].astype('float16')

In [164]:
df_tmp = df[df['rating'] < 3].iloc[:500]
df_tmp.shape

(500, 5)

In [203]:
df_tmp['text'].iloc[:3]

3     –°–∞–º—ã–π –±–æ–ª—å—à–æ–π –ø–ª—é—Å —ç—Ç–æ –º–µ—Å—Ç–æ—Ä–∞—Å–ø–æ–ª–æ–∂–µ–Ω–∏–µ, –Ω–∞–±–µ...
6     1. –î–æ—Å—Ç–∞–≤–∫–∞ –æ—á–µ–Ω—å –¥–æ–ª–≥–∞—è, –Ω–∞ —Ä–µ–∫–ª–∞–º–Ω–æ–π –±—Ä–æ—à—é—Ä–µ...
15    –ó–∞–∫–∞–∑–∞–ª–∏ —Ñ–æ-–±–æ –∏ —Ç–æ–º-—è–º\n–ö–æ—Ä–æ—á–µ –±–æ–ª—å—à–µ –Ω–µ –ø—Ä–∏–¥...
Name: text, dtype: object

In [206]:
# —Ä–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –æ—Ç–∑—ã–≤–æ–≤ –ø–æ —Ä–µ–π—Ç–∏–Ω–≥—É - –Ω–µ —Å–±–∞–ª–ª–∞–Ω—Å–∏—Ä–æ–≤–∞–Ω–Ω—ã–µ –∫–ª–∞—Å—Å—ã
df['rating'].value_counts()

5.0    88530
4.0    10811
1.0     8200
3.0     6335
2.0     3893
0.0       49
Name: rating, dtype: int64

In [205]:
# –≤—ã–¥–µ–ª–µ–Ω–∏–µ –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏–π –∏ –æ–±–æ—Ä–æ—Ç–æ–≤
pattern = r".+?[,.?!()]"
result = []
for text_x in df_tmp['text']:
    result_tmp = re.findall(pattern, text_x)
    result.append(result_tmp)
result

[['–°–∞–º—ã–π –±–æ–ª—å—à–æ–π –ø–ª—é—Å —ç—Ç–æ –º–µ—Å—Ç–æ—Ä–∞—Å–ø–æ–ª–æ–∂–µ–Ω–∏–µ,',
  ' –Ω–∞–±–µ—Ä–µ–∂–Ω–∞—è ,',
  ' —à–∏–∫–∞—Ä–Ω—ã–π –≤–∏–¥ –Ω–∞ –º–æ—Ä–µ!',
  ' –ö—Ä–∞—Å–∏–≤–æ,',
  ' —É—é—Ç–Ω–æ,',
  ' –≤–æ—Ç —Å–æ–±—Å—Ç–≤–µ–Ω–Ω–æ –ø–ª—é—Å—ã –∑–∞–∫–æ–Ω—á–∏–ª–∏—Å—å .',
  '. –æ–≥–æ—Ä—á–∞–µ—Ç –æ—Ç–Ω–æ—à–µ–Ω–∏–µ –∫ –ø–æ—Å–µ—Ç–∏—Ç–µ–ª—è–º,',
  ' –æ—Ñ–∏—Ü–∏–∞–Ω—Ç—ã –Ω–µ–ø—Ä–∏–≤–µ—Ç–ª–∏–≤—ã–µ,',
  ' –Ω–µ –∑–¥—Ä–∞–≤—Å—Ç–≤—É–π—Ç–µ –≤–∞–º,',
  ' –Ω–µ –¥–æ —Å–≤–∏–¥–∞–Ω–∏—è .',
  ' –õ–∏—Ü–∞ –Ω–µ–¥–æ–≤–æ–ª—å–Ω—ã–µ,',
  ' –Ω–µ–ø—Ä–∏—è—Ç–Ω—ã–µ,',
  ' –±–æ–ª—å—à–µ –Ω–µ —Ö–æ—á–µ—Ç—Å—è —Å–º–æ—Ç—Ä–µ—Ç—å –Ω–∞ —Ç–∞–∫–∏–µ!',
  ' –ö—É—Ö–Ω—è —Ç–æ–∂–µ –æ—Å—Ç–∞–≤–ª—è–µ—Ç –∂–µ–ª–∞—Ç—å –ª—É—á—à–µ–≥–æ,',
  ' –≤ –ª—é–ª–µ –∫–µ–±–∞–± –∫–æ—Å—Ç–∏ –ø–æ–ø–∞–¥–∞—é—Ç—Å—è,',
  ' —à–∞—à–ª—ã–∫ –∏–∑ –≥–æ–≤—è–¥–∏–Ω—ã —Å—É—Ö–æ–π –∏ –Ω–µ–≤–∫—É—Å–Ω—ã–π.',
  ' –ú—ã –Ω–∞ –æ—Ç–¥—ã—Ö–µ ,',
  ' –Ω–∞ –ø–æ–∑–∏—Ç–∏–≤–µ ,',
  ' –¥–µ–Ω–µ–≥ –Ω–µ –∂–∞–ª–µ–µ–º,',
  ' –Ω–æ —Ö–æ—á–µ—Ç—Å—è –ø—Ä–∏—Ö–æ–¥–∏—Ç—å —Ç—É–¥–∞ –≥–¥–µ –Ω–∞–º —Ä–∞–¥—ã!'],
 ['1

# –ü—Ä–µ–¥–æ–±—Ä–∞–±–æ—Ç–∫–∞ –¥–∞–Ω–Ω—ã—Ö 
- —Ç–æ–∫–µ–Ω–∏–∑–∞—Ü–∏—è gensim, –ø–æ—Ç–æ–º—É —á—Ç–æ —Ç–∞–º –±–µ–∑ –ø—É–Ω–∫—Ç—É–∞—Ü–∏–∏ –ø–æ–ª—É—á–∞–µ—Ç—Å—è
- —Å—Ç–µ–º–º–∏–Ω–≥/–ª–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏—è
- —É–¥–∞–ª–µ–Ω–∏–µ —Å—Ç–æ–ø —Å–ª–æ–≤
- —Å–æ–∑–¥–∞–Ω–∏–µ –±–∏/—Ç—Ä–∏-–≥—Ä–∞–º–º

In [170]:
# –ª–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏—è
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

In [171]:
# –µ—Å—Ç—å –ø—Ä–æ–±–ª–µ–º–∞ - n—á—Ç–æ –º–æ–∂–Ω–æ —É–ª—É—á—à–∏—Ç—å - –º–∞–ª–æ–≤–∞—Ç–æ –º–µ—Å—Ç–∞ –ø–æ—Å–∞–¥–∫–∏. n –∞–Ω–≥–ª–∏–π—Å–∫–∞—è –æ—Å—Ç–∞–µ—Ç—Å—è.
tokens = list(sent_to_words(df_tmp['text']))
tokens[0][:32]

['—Å–∞–º—ã–∏',
 '–±–æ–ª—å—à–æ–∏',
 '–ø–ª—é—Å',
 '—ç—Ç–æ',
 '–Ω–∞–±–µ—Ä–µ–∂–Ω–∞—è',
 '—à–∏–∫–∞—Ä–Ω—ã–∏',
 '–≤–∏–¥',
 '–Ω–∞',
 '–º–æ—Ä–µ',
 '–∫—Ä–∞—Å–∏–≤–æ',
 '—É—é—Ç–Ω–æ',
 '–≤–æ—Ç',
 '—Å–æ–±—Å—Ç–≤–µ–Ω–Ω–æ',
 '–ø–ª—é—Å—ã',
 '–∑–∞–∫–æ–Ω—á–∏–ª–∏—Å—å',
 '–æ–≥–æ—Ä—á–∞–µ—Ç',
 '–æ—Ç–Ω–æ—à–µ–Ω–∏–µ',
 '–ø–æ—Å–µ—Ç–∏—Ç–µ–ª—è–º',
 '–æ—Ñ–∏—Ü–∏–∞–Ω—Ç—ã',
 '–Ω–µ–ø—Ä–∏–≤–µ—Ç–ª–∏–≤—ã–µ',
 '–Ω–µ',
 '–∑–¥—Ä–∞–≤—Å—Ç–≤—É–∏—Ç–µ',
 '–≤–∞–º',
 '–Ω–µ',
 '–¥–æ',
 '—Å–≤–∏–¥–∞–Ω–∏—è',
 '–ª–∏—Ü–∞',
 '–Ω–µ–¥–æ–≤–æ–ª—å–Ω—ã–µ',
 '–Ω–µ–ø—Ä–∏—è—Ç–Ω—ã–µ',
 '–±–æ–ª—å—à–µ',
 '–Ω–µ',
 '—Ö–æ—á–µ—Ç—Å—è']

In [180]:
# —Å—Ç–µ–º–º–∏–Ω–≥ –∏ —É–¥–∞–ª–µ–Ω–∏–µ —Å—Ç–æ–ø —Å–ª–æ–≤
stemmer = SnowballStemmer('russian')

stem_nltk = []
for sentence in tokens:
    stem_nltk.append(list(stemmer.stem(word_x) for word_x in sentence if word_x not in stop_words))
stem_nltk[0], len(stem_nltk[0])

(['—Å–∞–º—ã',
  '–±–æ–ª—å—à–æ',
  '–ø–ª—é—Å',
  '–Ω–∞–±–µ—Ä–µ–∂–Ω',
  '—à–∏–∫–∞—Ä–Ω—ã',
  '–≤–∏–¥',
  '–º–æ—Ä',
  '–∫—Ä–∞—Å–∏–≤',
  '—É—é—Ç–Ω',
  '—Å–æ–±—Å—Ç–≤–µ–Ω',
  '–ø–ª—é—Å',
  '–∑–∞–∫–æ–Ω—á',
  '–æ–≥–æ—Ä—á–∞',
  '–æ—Ç–Ω–æ—à–µ–Ω',
  '–ø–æ—Å–µ—Ç–∏—Ç–µ–ª',
  '–æ—Ñ–∏—Ü–∏–∞–Ω—Ç',
  '–Ω–µ–ø—Ä–∏–≤–µ—Ç–ª–∏–≤',
  '–∑–¥—Ä–∞–≤—Å—Ç–≤—É',
  '—Å–≤–∏–¥–∞–Ω',
  '–ª–∏—Ü',
  '–Ω–µ–¥–æ–≤–æ–ª—å–Ω',
  '–Ω–µ–ø—Ä–∏—è—Ç–Ω',
  '—Ö–æ—á–µ—Ç',
  '—Å–º–æ—Ç—Ä–µ—Ç',
  '—Ç–∞–∫',
  '–∫—É—Ö–Ω',
  '–æ—Å—Ç–∞–≤–ª—è',
  '–∂–µ–ª–∞',
  '–ª—É—á—à',
  '–ª—é–ª',
  '–∫–µ–±–∞–±',
  '–∫–æ—Å—Ç',
  '–ø–æ–ø–∞–¥–∞',
  '—à–∞—à–ª—ã–∫',
  '–≥–æ–≤—è–¥–∏–Ω',
  '—Å—É—Ö–æ',
  '–Ω–µ–≤–∫—É—Å–Ω—ã',
  '–æ—Ç–¥—ã—Ö',
  '–ø–æ–∑–∏—Ç–∏–≤',
  '–¥–µ–Ω–µ–≥',
  '–∂–∞–ª–µ',
  '—Ö–æ—á–µ—Ç',
  '–ø—Ä–∏—Ö–æ–¥',
  '—Ç—É–¥',
  '–Ω–∞–º',
  '—Ä–∞–¥'],
 46)

# –ö–æ–¥–∏—Ä–æ–≤–∞–Ω–∏–µ —Ç–µ–∫—Å—Ç–∞
- CountVectorizer
- CountVectorizer(binary=True)
- Tfidf
- Doc2Vec

In [181]:
# –æ–±—ä–µ–¥–∏–Ω–∏–º —Ç–µ–∫—Å—Ç –≤ –æ—Ç–∑—ã–≤—ã
tok_stem_text = []
for word_x in range(len(stem_nltk)):
    tok_stem_text.append(" ".join(stem_nltk[word_x]))
tok_stem_text[0]

'—Å–∞–º—ã –±–æ–ª—å—à–æ –ø–ª—é—Å –Ω–∞–±–µ—Ä–µ–∂–Ω —à–∏–∫–∞—Ä–Ω—ã –≤–∏–¥ –º–æ—Ä –∫—Ä–∞—Å–∏–≤ —É—é—Ç–Ω —Å–æ–±—Å—Ç–≤–µ–Ω –ø–ª—é—Å –∑–∞–∫–æ–Ω—á –æ–≥–æ—Ä—á–∞ –æ—Ç–Ω–æ—à–µ–Ω –ø–æ—Å–µ—Ç–∏—Ç–µ–ª –æ—Ñ–∏—Ü–∏–∞–Ω—Ç –Ω–µ–ø—Ä–∏–≤–µ—Ç–ª–∏–≤ –∑–¥—Ä–∞–≤—Å—Ç–≤—É —Å–≤–∏–¥–∞–Ω –ª–∏—Ü –Ω–µ–¥–æ–≤–æ–ª—å–Ω –Ω–µ–ø—Ä–∏—è—Ç–Ω —Ö–æ—á–µ—Ç —Å–º–æ—Ç—Ä–µ—Ç —Ç–∞–∫ –∫—É—Ö–Ω –æ—Å—Ç–∞–≤–ª—è –∂–µ–ª–∞ –ª—É—á—à –ª—é–ª –∫–µ–±–∞–± –∫–æ—Å—Ç –ø–æ–ø–∞–¥–∞ —à–∞—à–ª—ã–∫ –≥–æ–≤—è–¥–∏–Ω —Å—É—Ö–æ –Ω–µ–≤–∫—É—Å–Ω—ã –æ—Ç–¥—ã—Ö –ø–æ–∑–∏—Ç–∏–≤ –¥–µ–Ω–µ–≥ –∂–∞–ª–µ —Ö–æ—á–µ—Ç –ø—Ä–∏—Ö–æ–¥ —Ç—É–¥ –Ω–∞–º —Ä–∞–¥'

In [182]:
# coding_frequency = CountVectorizer(analyzer='word',
#                                    # binary=True,
#                                    min_df=2,          # –º–∏–Ω–∏–º–∞–ª—å–Ω–æ–µ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ –≤—Ö–æ–∂–¥–µ–Ω–∏–π —Å–ª–æ–≤–∞
#                                    ngram_range=(2,3),   # –∫–∞–∫–∏–µ n-–≥—Ä–∞–º–º—ã —É—á–∏—Ç—ã–≤–∞—Ç—å
#                                    #stop_words=stopwords.words("russian")
#                                   )
#
#res_vectorizer = coding_frequency.fit_transform(tok_stem_text)
#
# —Ç–∞–±–ª–∏—Ü–∞ —á–∞—Å—Ç–æ—Ç—ã —Å–ª–æ–≤
#pd.DataFrame(res_vectorizer.toarray(), columns = coding_frequency.vocabulary_.keys())



coding_tfidf = TfidfVectorizer(min_df=2,          # –º–∏–Ω–∏–º–∞–ª—å–Ω–æ–µ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ –≤—Ö–æ–∂–¥–µ–Ω–∏–π —Å–ª–æ–≤–∞
                                ngram_range=(2,3),   # –∫–∞–∫–∏–µ n-–≥—Ä–∞–º–º—ã —É—á–∏—Ç—ã–≤–∞—Ç—å
                                #stop_words=stopwords.words("russian")
                                )

res_vectorizer = coding_tfidf.fit_transform(tok_stem_text)

# —Ç–∞–±–ª–∏—Ü–∞ —á–∞—Å—Ç–æ—Ç—ã —Å–ª–æ–≤
pd.DataFrame(res_vectorizer.toarray(), columns = coding_tfidf.vocabulary_.keys())

Unnamed: 0,–≤–∏–¥ –º–æ—Ä,–æ—Ç–Ω–æ—à–µ–Ω –ø–æ—Å–µ—Ç–∏—Ç–µ–ª,–æ—Å—Ç–∞–≤–ª—è –∂–µ–ª–∞,–∂–µ–ª–∞ –ª—É—á—à,–ª—é–ª –∫–µ–±–∞–±,—à–∞—à–ª—ã–∫ –≥–æ–≤—è–¥–∏–Ω,—Ö–æ—á–µ—Ç –ø—Ä–∏—Ö–æ–¥,–Ω–∞–º —Ä–∞–¥,–æ—Å—Ç–∞–≤–ª—è –∂–µ–ª–∞ –ª—É—á—à,–Ω–∞–ø–∏—Å–∞ –¥–æ—Å—Ç–∞–≤–∫,...,–≤—ã—Å—à —É—Ä–æ–≤–Ω,—Ç–∞–∫ –º–æ,–º—ã—Ç –ø–æ—Å—É–¥,–∑–∞–ø–µ—á–µ–Ω –∫–∞—Ä—Ç–æ—à–∫,–ø—Ä–∏–Ω–µ—Å–ª –∫–æ—Ñ,–ø—Ä–æ—Å–∏–¥–µ–ª –º–∏–Ω—É—Ç,–æ—Ñ–æ—Ä–º–ª–µ–Ω –∑–∞–∫–∞–∑,–≤–æ—Å–∫—Ä–µ—Å–µ–Ω –≤–µ—á–µ—Ä,–∑–∞–≤–µ–¥–µ–Ω –±—Ä–∞–ª,–∫–æ—Ç–ª–µ—Ç –¥–æ–º–∞—à–Ω
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.370297,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
497,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
498,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [187]:
# –î–ª—è –æ–±—É—á–µ–Ω–∏—è –º–æ–¥–µ–ª–∏ –Ω–∞–º –Ω—É–∂–µ–Ω —Å–ø–∏—Å–æ–∫ —Ü–µ–ª–µ–≤—ã—Ö –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤
def tagged_document(list_of_ListOfWords):
    for x, ListOfWords in enumerate(list_of_ListOfWords):
        yield doc2vec.TaggedDocument(ListOfWords, [x])

# –û–±–Ω–æ–≤–∏—Ç–µ –º–æ–¥–µ–ª—å

# –ò–Ω–∏—Ü–∏–∞–ª–∏–∑–∞—Ü–∏—è –º–æ–¥–µ–ª–∏
d2v_model = doc2vec.Doc2Vec(vector_size=30, # –¥–ª–∏–Ω–∞ –≤–µ–∫—Ç–æ—Ä–∞, –∫–æ—Ç–æ—Ä—ã–º –±—É–¥–µ—Ç –ø—Ä–µ–¥—Å—Ç–∞–≤–ª–µ–Ω–æ –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏–µ
                            min_count=2,    # min –∫–æ–ª-–≤–æ –≤—Å—Ç—Ä–µ—á–∞–Ω–∏—è —Å–ª–æ–≤–∞ –≤ –ø—Ä–ø–µ–¥–ª–æ–∂–µ–Ω–∏–∏ –¥–ª—è —É—á–µ—Ç–∞
                            epochs=30,      # –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ —ç–ø–æ—Ö
                           )
# –Ω–æ–≤—ã–µ –¥–∞–Ω–Ω—ã–µ
data_new = list(tagged_document(stem_nltk))
    
# —Ä–∞—Å—à–∏—Ä–∏—Ç—å —Å–ª–æ–≤–∞—Ä–Ω—ã–π –∑–∞–ø–∞—Å
d2v_model.build_vocab(data_new)
  
# –û–±—É—á–µ–Ω–∏–µ –º–æ–¥–µ–ª–∏ Doc2Vec
d2v_model.train(data_new, total_examples=d2v_model.corpus_count, epochs=d2v_model.epochs)
  
# –ê–Ω–∞–ª–∏–∑ –≤—ã—Ö–æ–¥–Ω—ã—Ö –¥–∞–Ω–Ω—ã—Ö
# analyze = d2v_model.infer_vector(['–ú–∞–º–∞ –º—ã–ª–∞ —Ä–∞–º—É'])
# analyze

doc2vec_vectorizer = np.array([d2v_model.infer_vector([text_x]) for text_x in tok_stem_text])

In [189]:
scal = MinMaxScaler()
doc2vec_vectorizer = scal.fit_transform(doc2vec_vectorizer)

# –ú–æ–¥–µ–ª–∏—Ä–æ–≤–∞–Ω–∏–µ
- LDA sklearn
- LDA gensim
- LSI sklearn

In [183]:
model = LatentDirichletAllocation(n_components=5,   # –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ —Ç–µ–º
                                  #learning_method='online',
                                  random_state=42,
                                  n_jobs=-1)

In [190]:
model.fit(doc2vec_vectorizer)   # –ø—Ä–∏–Ω–∏–º–∞–µ—Ç —Ä–µ–∑—É–ª—å—Ç–∞—Ç CountVectorizer –∏ –∞–Ω–∞–ª–æ–≥–∏—á–Ω—ã–µ

LatentDirichletAllocation(n_components=5, n_jobs=-1, random_state=42)

In [185]:
print("Perplexity", model.perplexity(res_vectorizer))
print("Log Likelihood", model.score(res_vectorizer))

Perplexity 7833.7254358155515
Log Likelihood -9009.372145278117


In [191]:
print("Perplexity", model.perplexity(doc2vec_vectorizer))
print("Log Likelihood", model.score(doc2vec_vectorizer))

Perplexity 39.98538330505931
Log Likelihood -27595.01211942672


In [192]:
result = pd.DataFrame(model.transform(doc2vec_vectorizer), columns=[str(i) for i in range(1, 6)])
thems = result.apply(lambda x: x.sort_values().index[-1], axis=1)
result['1'].sort_values(ascending=False)

440    0.958727
270    0.957811
88     0.955991
382    0.955927
438    0.955459
         ...   
268    0.010781
67     0.010685
186    0.010601
143    0.010188
488    0.010108
Name: 1, Length: 500, dtype: float64

In [195]:
df_tmp['text'].iloc[88]

'–ü–æ–∑–≤–æ–Ω–∏–ª–∏ –∑–∞–±—Ä–æ–Ω–∏—Ä–æ–≤–∞—Ç—å —Å—Ç–æ–ª–∏–∫, –ø–æ–∫—É—Ä–∏—Ç—å –∫–∞–ª—å—è–Ω –∫–æ–º–ø–∞–Ω–∏–µ–π, –¥–µ–≤—É—à–∫–∞ –∞–¥–º–∏–Ω–∏—Å—Ç—Ä–∞—Ç–æ—Ä —Å–∫–∞–∑–∞–ª–∞, —á—Ç–æ —Å—Ç–æ–ª–∏–∫ –µ—Å—Ç—å –∏ –≤—Å–µ –æ—Ç–ª–∏—á–Ω–æ)\\n–ú—ã —Å–æ–±—Ä–∞–ª–∏—Å—å —Å —Ä–µ–±—è—Ç–∞–º–∏, –µ—Ö–∞–ª–∏ –∏–∑ –ö–∞–ª–∏–Ω–∏–Ω–≥—Ä–∞–¥–∞, –ø—É—Ç—å –Ω–µ –±–ª–∏–∂–Ω–∏–π, 50 –∫–º)\\n–í –∏—Ç–æ–≥–µ –ø—Ä–∏–µ—Ö–∞–ª–∏ –∏ —Ä–µ–±—è—Ç–∞ –Ω–∞ –±–∞—Ä–µ —Ä–∞–∑–≤–µ—Ä–Ω—É–ª–∏ –Ω–∞—Å —Å–æ —Å–ª–æ–≤–∞–º–∏ ¬´–≤—Å–µ —Å—Ç–æ–ª–∏–∫–∏ –∑–∞–Ω—è—Ç—ã –∏ —É –Ω–∞—Å –∑–∞–∫–æ–Ω—á–∏–ª—Å—è —Ç–∞–±–∞–∫¬ª - –Ω–∞—Å—Ç—Ä–æ–µ–Ω–∏–µ –Ω–∞ –≤–µ—á–µ—Ä –±—ã–ª–æ –Ω–∞–º –æ–±–µ—Å–ø–µ—á–µ–Ω–æ, —É–µ—Ö–ª–∏ –∏–∑ –°–≤–µ—Ç–ª–æ–≥–æ—Ä—Å–∫–∞ —Å —É–∂–∞—Å–Ω—ã–º –Ω–∞—Å—Ç—Ä–æ–µ–Ω–∏–µ–º. –°–ø–∞—Å–∏–±–æ, –æ—Ç–ª–∏—á–Ω—ã–π —Å–µ—Ä–≤–∏—Å!\\n\n'

In [139]:
def show_topics( vectorizer_x=None, model=None, n_words=20):
    feature_names = np.array(vectorizer_x.get_feature_names_out())
    top_words = []

    for topic_weights in model.components_:
        top_keywords_locs = (-topic_weights).argsort()[:n_words]
        top_words.append(feature_names.take(top_keywords_locs))
    return top_words

In [186]:
pd.DataFrame(show_topics(coding_tfidf, model, 20))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,–ø—Ä–æ—Å—Ç –æ—Ç–≤—Ä–∞—Ç–∏—Ç–µ–ª—å–Ω,—Å–æ–≤–µ—Ç –∑–∞–≤–µ–¥–µ–Ω,—Ö–∞–º—Å–∫ –ø–æ–≤–µ–¥–µ–Ω,–∑–∞–∫–∞–∑ –º–∏–Ω—É—Ç,—Ü–µ–Ω –ø–æ–∏–¥,—É—Ä–æ–≤–µ–Ω –æ–±—Å–ª—É–∂–∏–≤–∞–Ω,–ø–∏—Ü—Ü –≤–∫—É—Å–Ω,–ø—Ä–æ—Å—Ç –Ω–µ–≤–æ–∑–º–æ–∂–Ω,–ø–µ—Ä–≤ –æ—á–µ—Ä–µ–¥,–∑–∞–∫–∞–∑ –æ—Ç–≤–µ—Ç,–ø–æ—Å—É–¥ –≥—Ä—è–∑–Ω,—Ñ–æ –±–æ,–≤—ã—Å–æ–∫ —Ü–µ–Ω,–∑–∞–±—Ä–æ–Ω–∏—Ä–æ–≤–∞ —Å—Ç–æ–ª,–¥–µ–Ω —Ä–æ–∂–¥–µ–Ω,–¥—Ä—É–≥ –º–µ—Å—Ç,–≥–æ—Ç–æ–≤ –≤–∫—É—Å–Ω,–Ω–µ–∫–æ—Ç–æ—Ä –≤—Ä–µ–º,–≤–µ—á–µ—Ä –∏—Å–ø–æ—Ä—á,–ø–æ–ª–æ–≤–∏–Ω —Å—Ç–æ–ª
1,–¥–∞–Ω –∑–∞–≤–µ–¥–µ–Ω,–æ—Å—Ç–∞–≤–ª—è –∂–µ–ª–∞ –ª—É—á—à,–µ–¥ –≤–∫—É—Å–Ω,–æ—Å—Ç–∞–≤–ª—è –∂–µ–ª–∞,–∂–µ–ª–∞ –ª—É—á—à,–¥–µ–Ω —Ä–æ–∂–¥–µ–Ω,–∫–Ω–∏–≥ –∂–∞–ª–æ–±,–Ω–∞–ø–∏—Å–∞ –¥–æ—Å—Ç–∞–≤–∫,–Ω–∞–º–Ω –≤–∫—É—Å–Ω,–≥–æ—Ä–∞–∑–¥ –≤–∫—É—Å–Ω,—Å–ª–µ–¥—É—é—â –¥–µ–Ω,–æ—Ç–¥–µ–ª—å–Ω –∏—Å—Ç–æ—Ä,–≤—Ä–µ–º –æ–∂–∏–¥–∞–Ω,–æ–∂–∏–¥–∞–Ω –∑–∞–∫–∞–∑,–≤–∫—É—Å–Ω –º—è—Å,–≤–∫—É—Å–Ω –∫—É—Ö–Ω,–≥–æ—Å—Ç —Ç–∞–∫,–º–∏–Ω—É—Ç –Ω–∞–º,–∑–∞–∫–∞–∑ –ø–µ—Ä–≤—ã,—Å–∏—Ç—É–∞—Ü –ø—Ä–æ—Å—Ç
2,—Å–∞–ª–∞—Ç —Ü–µ–∑–∞—Ä,–¥–∞–Ω –∑–∞–≤–µ–¥–µ–Ω,–∂–∏–≤ –º—É–∑—ã–∫,—É–∂–∞—Å–Ω –æ–±—Å–ª—É–∂–∏–≤–∞–Ω,–ø–æ–∏–¥ —Ç—É–¥,—Å–ª–æ–µ–Ω —Ç–µ—Å—Ç,—Ä–µ—à –ø–æ—Å–µ—Ç,–æ—Ç–≤—Ä–∞—Ç–∏—Ç–µ–ª—å–Ω –∫–∞—á–µ—Å—Ç–≤,–ª–∏—Å—Ç —Å–∞–ª–∞—Ç,–Ω–∞—à —Å—Ç–æ–ª,—Ç–∞–∫ –≤–æ–æ–±—â,–¥—Ä—É–≥ –≥–æ—Å—Ç,—É–∂–∞—Å–Ω –º–µ—Å—Ç,–æ–¥–Ω –º–µ—Å—Ç,–º–æ –æ—Ç–∑,—Ö—É–∂ —Å—Ç–∞–ª,–∫—Ä–∞—Å–∏–≤ –º–µ—Å—Ç,–≤–∫—É—Å –ø–æ—Ä—Ü,–ø—è—Ç –º–∏–Ω—É—Ç,–∑–∞–∫–∞–∑–∞ —Ä–æ–ª–ª
3,–µ–¥ –≤–∫—É—Å–Ω,—É–∂–∞—Å–Ω –µ–¥,–µ–¥–∏–Ω—Å—Ç–≤–µ–Ω–Ω—ã –ø–ª—é—Å,—Å–æ—Ç—Ä—É–¥–Ω–∏–∫ —Å–æ–∂–∞–ª–µ–Ω,–¥–æ–ª–≥ –æ–±—Å–ª—É–∂–∏–≤–∞–Ω,—Å—ã—Ä —Ç–µ—Å—Ç,—Å–∫–∞–∑–∞ –∑–∞–∫—Ä—ã—Ç,—Å—é–¥ —Ç–æ—á–Ω,–∫–∞—Ñ –∫–∞—Ñ,–ø—Ä–æ—Å—Ç —É–∂–∞—Å,–æ–¥–Ω —Å–ª–æ–≤,—Ä–æ–ª–ª –≤–∫—É—Å–Ω,–Ω–∞–ø–∏—Å–∞ —Å–∫–∞–∑–∞,–ø–æ—Å–µ—â–µ–Ω –¥–∞–Ω –∑–∞–≤–µ–¥–µ–Ω,–ø–æ—Å–µ—â–µ–Ω –¥–∞–Ω,—Å–∞–º –µ–¥,—Ö–æ–¥ —Ç—É–¥,–≤–µ –¥–µ–Ω,–∂–µ–ª–∞–Ω –ø—Ä–∏—Ö–æ–¥,–¥–∞–Ω –∑–∞–≤–µ–¥–µ–Ω
4,–∫–∞—Ä—Ç–æ—à–∫ —Ñ—Ä–∏,–æ–¥–Ω –∑–≤–µ–∑–¥,–ø—Ä–æ—Å—Ç —É–∂–∞—Å,–¥–µ—Ç—Å–∫ –∫–æ–º–Ω–∞—Ç,—Ç–∞–∫ –æ—â—É—â–µ–Ω,—Å–¥–µ–ª–∞ –∑–∞–∫–∞–∑,—É–∂–∞—Å–Ω –æ–±—Å–ª—É–∂–∏–≤–∞–Ω,–ø–æ–∫ —Å–∞–º,–∫—É—à–∞ —Å–∞–º,—Ç—É–¥ –Ω–æ–≥–æ,–∑–∞–∫–∞–∑ –∂–¥–∞–ª,–µ–¥ –ø–æ–Ω—Ä–∞–≤,–∏—Å–ø–æ—Ä—á–µ–Ω –Ω–∞—Å—Ç—Ä–æ–µ–Ω,—Å–≤–æ –∑–∞–∫–∞–∑,—Ç–∞–∫ –æ–±—Å–ª—É–∂–∏–≤–∞–Ω,—Ö–∞–º—Å–∫ —Ç–æ–Ω,—É–∂–∞—Å–Ω –º–µ—Å—Ç,–∑–Ω–∞–µ—Ç —Ç–∞–∫,–æ—Å—Ç–∞–≤ –æ—Ç–∑,–∑–∞–≤–µ–¥–µ–Ω —Ö–æ—Ä–æ—à


## gensim

In [None]:
dictionary = corpora.Dictionary(processed_data)
corpus = [dictionary.doc2bow(l) for l in processed_data]

# –ú–µ—Ç—Ä–∏–∫–∏

# –°–±–æ—Ä –≤—Å–µ–≥–æ –≤ Pipeline