In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.decomposition import LatentDirichletAllocation

import nltk
# from nltk import word_tokenize - нужно nltk.download('punkt')

from nltk import wordpunct_tokenize, wordnet
from nltk.stem import wordnet as WordNetLem
from nltk.stem import SnowballStemmer, StemmerI

import gensim
from gensim.corpora import Dictionary
from gensim.models import doc2vec

In [140]:
stop_words = \
    []


# Загрузка данных

In [3]:
df = pd.read_csv('data/coffee.csv')
df['rating'] = df['rating'].astype('float16')

In [141]:
df_tmp = df.iloc[:500]
df_tmp.shape

(500, 5)

In [142]:
df_tmp['text'].iloc[5]

'Положительное место с хорошими блюдами , только персонал отличился высокомерием и надменностью, не рекомендую к посещению, останется неприятный осадок \n'

# Предобработка данных 
- токенизация gensim, потому что там без пунктуации получается
- стемминг/лемматизация
- удаление стоп слов
- создание би/три-грамм

In [118]:
# лемматизация
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

In [139]:
tokens = list(sent_to_words(df_tmp['text']))
tokens[5][:32]

['удобное',
 'расположение',
 'работает',
 'круглосуточно',
 'внутри',
 'очень',
 'мало',
 'места',
 'туалет',
 'так',
 'себе',
 'но',
 'выбор',
 'шаверм',
 'их',
 'вкус',
 'покрывает',
 'все',
 'недостатки']

In [131]:
# стемминг и удаление стоп слов
stemmer = SnowballStemmer('russian')

stem_nltk = []
for sentence in tokens:
    stem_nltk.append(list(stemmer.stem(word_x) for word_x in sentence if word_x not in stop_words))
stem_nltk[0]

['пил',
 'коф',
 'рим',
 'париж',
 'вкусн',
 'капуч',
 'фундучн',
 'молок',
 'фирмен',
 'сливк',
 'джинж',
 'пробова',
 'десерт',
 'очен',
 'необычн',
 'ребят',
 'барист',
 'больш',
 'молодц',
 'нчто',
 'улучш',
 'маловат',
 'мест',
 'посадко',
 'придума']

# Кодирование текста
- CountVectorizer
- CountVectorizer(binary=True)
- Tfidf
- Doc2Vec

In [40]:
# coding_frequency = CountVectorizer(analyzer='word',
#                                    # binary=True,
#                                    min_df=2,          # минимальное количество вхождений слова
#                                    ngram_range=(2,3),   # какие n-граммы учитывать
#                                    #stop_words=stopwords.words("russian")
#                                   )


# coding_tfidf = TfidfVectorizer(# min_df=2,          # минимальное количество вхождений слова
#                                 ngram_range=(2,3),   # какие n-граммы учитывать
#                                 #stop_words=stopwords.words("russian")
#                                 )

    

In [56]:
# Для обучения модели нам нужен список целевых документов
def tagged_document(list_of_ListOfWords):
    for x, ListOfWords in enumerate(list_of_ListOfWords):
        yield doc2vec.TaggedDocument(ListOfWords, [x])

# Обновите модель

# Инициализация модели
d2v_model = doc2vec.Doc2Vec(vector_size=30, # длина вектора, которым будет представлено предложение
                            min_count=2,    # min кол-во встречания слова в прпедложении для учета
                            epochs=30,      # количество эпох
                           )
# новые данные
data_new = list(tagged_document(stem_nltk))
    
# расширить словарный запас
d2v_model.build_vocab(data_new)
  
# Обучение модели Doc2Vec
d2v_model.train(data_new, total_examples=d2v_model.corpus_count, epochs=d2v_model.epochs)
  
# Анализ выходных данных
analyze = d2v_model.infer_vector(['Мама мыла раму'])
analyze    

array([-0.00506188,  0.01428302,  0.00100153, -0.01065414, -0.00283817,
        0.0035261 , -0.01456202,  0.00311686, -0.01392682,  0.01010637,
        0.00846607,  0.01304028, -0.01377159,  0.01310898, -0.0103531 ,
        0.0023483 , -0.0076668 ,  0.00474966,  0.0120494 , -0.00812947,
       -0.00716008,  0.0026629 ,  0.01313637, -0.01159228, -0.0071721 ,
        0.01278654, -0.01153663, -0.01072278, -0.0152599 ,  0.01348103],
      dtype=float32)

In [43]:
tok_stem_text = []
for word_x in range(len(stem_nltk)):
    tok_stem_text.append(" ".join(stem_nltk[word_x]))
tok_stem_text[0]

'глубинк стран во всех сво проявлен ассортимент столовск интерьер качеств цен приемлем для средн бюджетно столово ссср не все чист все не нов что бы вы хотел на трасс поел желудок не бастова знач риск был оправда номер для ночлег аналогичн толк пластиков окн нормальн не закр штор на окн нет тольк тюл для перв этаж плох мал ли кто заглядыва туалет душ совок но повтор для трасс да за руб соидет'

In [62]:
doc2vec_vectorizer = np.array([d2v_model.infer_vector([text_x]) for text_x in tok_stem_text])

In [65]:
from sklearn.preprocessing import MinMaxScaler

In [69]:
scal = MinMaxScaler()
doc2vec_vectorizer = scal.fit_transform(doc2vec_vectorizer)

In [44]:
res_vectorizer = coding_tfidf.fit_transform(tok_stem_text)

In [49]:
# таблица частоты слов
pd.DataFrame(res_vectorizer.toarray(), columns = coding_tfidf.vocabulary_.keys())

Unnamed: 0,глубинк стран,стран во,во всех,всех сво,сво проявлен,проявлен ассортимент,ассортимент столовск,столовск интерьер,интерьер качеств,качеств цен,...,посл прохожден магазин,прохожден магазин за,магазин за покупк,за покупк заход,покупк заход эт,заход эт каф,эт каф заход,каф заход убед,заход убед вам,убед вам понрав
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
497,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
498,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Моделирование
- LDA sklearn
- LDA gensim
- LSI sklearn

In [70]:
model = LatentDirichletAllocation(n_components=5,   # количество тем
                                  #learning_method='online',
                                  random_state=42,
                                  n_jobs=-1)

In [78]:
model.fit(doc2vec_vectorizer)   # принимает результат CountVectorizer и аналогичные

LatentDirichletAllocation(n_components=5, n_jobs=-1, random_state=42)

In [39]:
print("Perplexity", model.perplexity(res_vectorizer))
print("Log Likelihood", model.score(res_vectorizer))

Perplexity 2634.328178659377
Log Likelihood -36940.238464014314


In [73]:
print("Perplexity", model.perplexity(doc2vec_vectorizer))
print("Log Likelihood", model.score(doc2vec_vectorizer))

Perplexity 40.0481816580527
Log Likelihood -27659.74708201889


In [89]:
result = pd.DataFrame(model.transform(doc2vec_vectorizer), columns=[str(i) for i in range(1, 6)])
thems = result.apply(lambda x: x.sort_values().index[-1], axis=1)
thems

0      2
1      5
2      2
3      1
4      5
      ..
495    5
496    2
497    4
498    5
499    4
Length: 500, dtype: object

In [91]:
thems == '2'

0       True
1      False
2       True
3      False
4      False
       ...  
495    False
496     True
497    False
498    False
499    False
Length: 500, dtype: bool

In [111]:
df_tmp[['text']][thems == '2']

Unnamed: 0,text
0,Глубинка страны во всех своих проявлениях. Асс...
2,"Не очень удобное расположение, от метро идти м..."
5,"Положительное место с хорошими блюдами , тольк..."
10,Удобное расположение👍. Работает круглосуточно...
12,Лучший шашлык в Звенигороде.\nПриветливый и ве...
...,...
478,"Великолепное место, очень необычное. На террит..."
483,Одно из любимых мест в Москве.здесь осень атмо...
485,"Самый любимый ресторан, вмещающий в себя насто..."
491,Всегда всё очень вкусно! Заказываем только мир...


## gensim

In [None]:
dictionary = corpora.Dictionary(processed_data)
corpus = [dictionary.doc2bow(l) for l in processed_data]

# Метрики

# Сбор всего в Pipeline