In [73]:
import pandas as pd
import numpy as np
from pprint import pprint
import re

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.preprocessing import MinMaxScaler

import nltk
# from nltk import word_tokenize - нужно nltk.download('punkt')

from nltk import wordpunct_tokenize, wordnet
from nltk.stem import wordnet as WordNetLem
from nltk.stem import SnowballStemmer, StemmerI

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary
from gensim.models import doc2vec
from gensim.models import LdaModel, LdaMulticore
from gensim.models import LsiModel
from gensim import models

In [42]:
stop_words = \
    ['и', 'в', 'во', 'не', 'что', 'он', 'на', 'я', 'с', 'со', 'как', 'а', 'то', 'все', 'она', 'так', 'его', 
     'но', 'да', 'ты', 'к', 'у', 'же', 'вы', 'за', 'бы', 'по', 'только', 'ее', 'мне', 'было', 'вот', 'от', 
     'меня', 'еще', 'нет', 'о', 'из', 'ему', 'теперь', 'когда', 'даже', 'ну', 'вдруг', 'ли', 'если', 'уже', 
     'или', 'ни', 'быть', 'был', 'него', 'до', 'вас', 'нибудь', 'опять', 'уж', 'вам', 'ведь', 'там', 
     'потом', 'себя', 'ничего', 'ей', 'может', 'они', 'тут', 'где', 'есть', 'надо', 'ней', 'для', 'мы', 
     'тебя', 'их', 'чем', 'была', 'сам', 'чтоб', 'без', 'будто', 'чего', 'раз', 'тоже', 'себе', 'под', 
     'будет', 'ж', 'тогда', 'кто', 'этот', 'того', 'потому', 'этого', 'какой', 'совсем', 'ним', 'здесь', 
     'этом', 'один', 'почти', 'мой', 'тем', 'чтобы', 'нее', 'сейчас', 'были', 'куда', 'зачем', 'всех', 
     'никогда', 'можно', 'при', 'наконец', 'два', 'об', 'другой', 'хоть', 'после', 'над', 'больше', 'тот', 
     'через', 'эти', 'нас', 'про', 'всего', 'них', 'какая', 'много', 'разве', 'три', 'эту', 'моя', 
     'впрочем', 'хорошо', 'свою', 'этой', 'перед', 'иногда', 'лучше', 'чуть', 'том', 'нельзя', 'такой', 
     'им', 'более', 'всегда', 'конечно', 'всю', 'между']
stop_words.extend(['очень', 'ооочень', 'это', 'данное'])

# Загрузка данных

In [2]:
df = pd.read_csv('data/coffee.csv')
df['rating'] = df['rating'].astype('float16')

In [45]:
df_tmp = df.iloc[:500]

# Токенизация, стемминг, удаление стоп слов

In [41]:
# лемматизация
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

In [63]:
# есть проблема - nчто можно улучшить - маловато места посадки. n английская остается.
tokenized = list(sent_to_words(df_tmp['text']))
tokenized[0][:10]

['глубинка',
 'страны',
 'во',
 'всех',
 'своих',
 'проявлениях',
 'ассортимент',
 'столовскии',
 'интерьер',
 'качество']

In [64]:
# стемминг и удаление стоп слов
stemmer = SnowballStemmer('russian')

stemming = []
for sentence in tokenized:
    stemming.append(list(stemmer.stem(word_x) for word_x in sentence if word_x not in stop_words))
stemming[0][:10], len(stemming[0])

(['глубинк',
  'стран',
  'сво',
  'проявлен',
  'ассортимент',
  'столовск',
  'интерьер',
  'качеств',
  'цен',
  'приемлем'],
 47)

# Кодирование (Tfidf / doc2vec)

## Sklearn Tfidf

In [102]:
# объединим текст в отзывы
tok_stem_text = []
for word_x in stemming:
    tok_stem_text.append(" ".join(word_x))
tok_stem_text[0]

'глубинк стран сво проявлен ассортимент столовск интерьер качеств цен приемлем средн бюджетно столово ссср чист нов хотел трасс поел желудок бастова знач риск оправда номер ночлег аналогичн толк пластиков окн нормальн закр штор окн тюл перв этаж плох мал заглядыва туалет душ совок повтор трасс руб соидет'

In [53]:
coding_tfidf = TfidfVectorizer(min_df=2,          # минимальное количество вхождений слова
                                ngram_range=(2,3),   # какие n-граммы учитывать
                                #stop_words=stopwords.words("russian")
                                )

res_vectorizer = coding_tfidf.fit_transform(tok_stem_text)

# таблица частоты слов
pd.DataFrame(res_vectorizer.toarray(), columns = coding_tfidf.vocabulary_.keys()).head()

Unnamed: 0,цен приемлем,перв этаж,пил коф,пробова десерт,нчто улучш,удобн расположен,мест мал,вкусн коф,имен туд,вид мор,...,центр москв,атмосферны рестора,питер москв,the right,рок ролл,вкусн пив,нотдельн спасиб,мог сказа,кухн большо,похож муз
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.352424,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## gensim Tfidf

In [69]:
#  сохранение извлеченных токенов в словарь
my_dictionary = corpora.Dictionary(stemming)
print(my_dictionary)

# преобразование слов в Bag of Word
bow_corpus =[my_dictionary.doc2bow(doc, allow_update = True) for doc in tokenized]
# print(bow_corpus)

Dictionary(3751 unique tokens: ['аналогичн', 'ассортимент', 'бастова', 'бюджетно', 'глубинк']...)


In [71]:
# Вес слова в корпусе Bag of Word перед применением TF-IDF (частота слов):
word_weight =[]
for doc in bow_corpus:
    for id, freq in doc:
        word_weight.append([my_dictionary[id], freq])
# print(word_weight)

In [75]:
# Вес слов после применения TF-IDF:
# создать модель TF-IDF
tfIdf = models.TfidfModel(bow_corpus, smartirs ='ntc')
  
# TF-IDF вес слова
weight_tfidf =[]
for doc in tfIdf[bow_corpus]:
    for id, freq in doc:
        weight_tfidf.append([my_dictionary[id], np.around(freq, decimals=3)])
# print(weight_tfidf)

In [95]:
# corpus - токенизированный текст
lexicon = gensim.corpora.Dictionary(corpus) 
tfidf   = gensim.models.TfidfModel(dictionary = lexicon, normalize = True) 
vectors = [tfidf[lexicon.doc2bow(doc)] for doc in corpus]

NameError: name 'tokenize' is not defined

## gensim doc2vec

In [87]:
# Для обучения модели нам нужен список целевых документов
def tagged_document(list_of_ListOfWords):
    for x, ListOfWords in enumerate(list_of_ListOfWords):
        yield doc2vec.TaggedDocument(ListOfWords, [x])

# Обновите модель

# Инициализация модели
d2v_model = doc2vec.Doc2Vec(vector_size=30, # длина вектора, которым будет представлено предложение
                            min_count=2,    # min кол-во встречания слова в прпедложении для учета
                            epochs=30,      # количество эпох
                           )
# новые данные
data_new = list(tagged_document(stem_nltk))
    
# расширить словарный запас
d2v_model.build_vocab(data_new)
  
# Обучение модели Doc2Vec
d2v_model.train(data_new, total_examples=d2v_model.corpus_count, epochs=d2v_model.epochs)
  
# Анализ выходных данных
# analyze = d2v_model.infer_vector(['Мама мыла раму'])
# analyze

doc2vec_vectorizer = np.array([d2v_model.infer_vector([text_x]) for text_x in tok_stem_text])

In [88]:
doc2vec_vectorizer

array([[ 0.00495361,  0.0052808 ,  0.01092705, ..., -0.00824159,
        -0.00615475, -0.00055768],
       [-0.0140129 , -0.00800666, -0.00662626, ..., -0.00389153,
        -0.0033671 ,  0.00423033],
       [-0.0021455 , -0.00324717,  0.00090764, ...,  0.01602945,
         0.0151157 , -0.00076643],
       ...,
       [ 0.00360031, -0.00615906,  0.00997726, ..., -0.0113132 ,
         0.00513004,  0.00636086],
       [ 0.00088269,  0.01320916, -0.00771323, ..., -0.01584101,
         0.0090939 , -0.00289908],
       [-0.00350721,  0.01143662,  0.01065524, ...,  0.00695964,
         0.01551486,  0.00590714]], dtype=float32)

In [None]:
scal = MinMaxScaler()
doc2vec_vectorizer = scal.fit_transform(doc2vec_vectorizer)

# Моделирование

In [86]:
# Create Dictionary
id2word = corpora.Dictionary(stem_nltk)

# Create Corpus
texts = stem_nltk

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[0])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 2), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 2), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1)]


In [91]:
dictionary = corpora.Dictionary(stem_nltk)
corpus = [[(i, vec_x_x) for i, vec_x_x in zip(range(30), vec_x)] for vec_x in doc2vec_vectorizer]
corpus[0]

[(0, 0.0049536107),
 (1, 0.0052808006),
 (2, 0.010927051),
 (3, -0.0015454899),
 (4, -0.0076337834),
 (5, 0.0069790105),
 (6, 0.012038878),
 (7, 0.0021290223),
 (8, -0.0003728688),
 (9, 0.009557775),
 (10, 0.013048987),
 (11, 0.0136705395),
 (12, 0.00030434132),
 (13, 0.0007641832),
 (14, 0.016414156),
 (15, 0.011137237),
 (16, -0.0064205476),
 (17, 0.005839382),
 (18, 0.008994835),
 (19, -0.01347044),
 (20, 0.0018960933),
 (21, -0.010848733),
 (22, -0.007888592),
 (23, 0.008643001),
 (24, -0.001959622),
 (25, -0.0028713148),
 (26, 0.0057159048),
 (27, -0.008241594),
 (28, -0.006154752),
 (29, -0.0005576819)]

In [92]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [93]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.000*"заня" + 0.000*"замен" + 0.000*"крабов" + 0.000*"име" + 0.000*"им" + '
  '0.000*"заяв" + 0.000*"лезут" + 0.000*"вилко" + 0.000*"выдернут" + '
  '0.000*"мрачн"'),
 (1,
  '0.000*"заня" + 0.000*"замен" + 0.000*"крабов" + 0.000*"име" + 0.000*"им" + '
  '0.000*"заяв" + 0.000*"лезут" + 0.000*"вилко" + 0.000*"выдернут" + '
  '0.000*"мрачн"'),
 (2,
  '0.000*"заня" + 0.000*"замен" + 0.000*"крабов" + 0.000*"име" + 0.000*"им" + '
  '0.000*"заяв" + 0.000*"лезут" + 0.000*"вилко" + 0.000*"выдернут" + '
  '0.000*"мрачн"'),
 (3,
  '0.000*"заня" + 0.000*"замен" + 0.000*"крабов" + 0.000*"име" + 0.000*"им" + '
  '0.000*"заяв" + 0.000*"лезут" + 0.000*"вилко" + 0.000*"выдернут" + '
  '0.000*"мрачн"'),
 (4,
  '0.000*"заня" + 0.000*"замен" + 0.000*"крабов" + 0.000*"име" + 0.000*"им" + '
  '0.000*"заяв" + 0.000*"лезут" + 0.000*"вилко" + 0.000*"выдернут" + '
  '0.000*"мрачн"'),
 (5,
  '0.000*"заня" + 0.000*"замен" + 0.000*"крабов" + 0.000*"име" + 0.000*"им" + '
  '0.000*"заяв" + 0.000*"лезут" + 0

In [94]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=stem_nltk, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -42.654178619384574

Coherence Score:  0.8954992094499092
