In [37]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.decomposition import LatentDirichletAllocation

import nltk
from nltk import word_tokenize#  - нужно nltk.download('punkt')

from nltk import wordpunct_tokenize, wordnet
from nltk.stem import wordnet as WordNetLem
from nltk.stem import SnowballStemmer, StemmerI

import gensim
from gensim.corpora import Dictionary
from gensim.models import doc2vec
from gensim.models import CoherenceModel

In [47]:
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [50]:
stop_words_eng = stopwords.words('english')
# for i in stop_words_eng:
#   print(i, end=', ')
print(stop_words_eng)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [51]:
stop_words_ru = stopwords.words('russian')
# for i in stop_words_eng:
#   print(i, end=', ')
print(stop_words_ru)

['и', 'в', 'во', 'не', 'что', 'он', 'на', 'я', 'с', 'со', 'как', 'а', 'то', 'все', 'она', 'так', 'его', 'но', 'да', 'ты', 'к', 'у', 'же', 'вы', 'за', 'бы', 'по', 'только', 'ее', 'мне', 'было', 'вот', 'от', 'меня', 'еще', 'нет', 'о', 'из', 'ему', 'теперь', 'когда', 'даже', 'ну', 'вдруг', 'ли', 'если', 'уже', 'или', 'ни', 'быть', 'был', 'него', 'до', 'вас', 'нибудь', 'опять', 'уж', 'вам', 'ведь', 'там', 'потом', 'себя', 'ничего', 'ей', 'может', 'они', 'тут', 'где', 'есть', 'надо', 'ней', 'для', 'мы', 'тебя', 'их', 'чем', 'была', 'сам', 'чтоб', 'без', 'будто', 'чего', 'раз', 'тоже', 'себе', 'под', 'будет', 'ж', 'тогда', 'кто', 'этот', 'того', 'потому', 'этого', 'какой', 'совсем', 'ним', 'здесь', 'этом', 'один', 'почти', 'мой', 'тем', 'чтобы', 'нее', 'сейчас', 'были', 'куда', 'зачем', 'всех', 'никогда', 'можно', 'при', 'наконец', 'два', 'об', 'другой', 'хоть', 'после', 'над', 'больше', 'тот', 'через', 'эти', 'нас', 'про', 'всего', 'них', 'какая', 'много', 'разве', 'три', 'эту', 'моя', 'впр

In [2]:
corpus = ["The elephant sneezed at the sight of potatoes.",
          "Bats can see via echolocation. See the bat sight sneeze!",
          "Wondering, she opened the door to the studio."]

# corpus = ['Мамы нету, она ушла на улицу, а я бегал.',]

# Предобработка

## Токенизация nltk

In [None]:
# выделение слов, lower()
tokens = [
    [word_x.lower() for word_x in nltk.wordpunct_tokenize(text_x)]
    for text_x in corpus]
tokens[0]

['the', 'elephant', 'sneezed', 'at', 'the', 'sight', 'of', 'potatoes', '.']

##  Токенизация gensim с удалением пунктуации

In [3]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

In [4]:
tokens = list(sent_to_words(corpus))
tokens[0]

['the', 'elephant', 'sneezed', 'at', 'the', 'sight', 'of', 'potatoes']

- Попробовать word_tokenize (для этого нужно nltk.download('punkt'))

## Стемминг nltk

In [None]:
# stemmer = SnowballStemmer('russian')
stemmer = SnowballStemmer('english')
processed_data = []
for sentence in tokens:
    processed_data.append(list(stemmer.stem(word_x) for word_x in sentence))
processed_data[0]

['the', 'eleph', 'sneez', 'at', 'the', 'sight', 'of', 'potato']

## Лемматизация nltk

In [9]:
nltk.download('wordnet')# Нужно
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

# подготовка данных
processed_data = []
for x, doc in enumerate(tokens):
    doc_out = []
    for word in doc:
        lemmatized_word = lemmatizer.lemmatize(word)  # лемматизация
        if lemmatized_word:
            #print (lemmatized_word)
            doc_out.append(lemmatized_word)

    processed_data.append(doc_out) # processed_data это список слов
processed_data

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


[['the', 'elephant', 'sneezed', 'at', 'the', 'sight', 'of', 'potato'],
 ['bat',
  'can',
  'see',
  'via',
  'echolocation',
  'see',
  'the',
  'bat',
  'sight',
  'sneeze'],
 ['wondering', 'she', 'opened', 'the', 'door', 'to', 'the', 'studio']]

## Создание би/три грамм gensim

In [11]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(processed_data, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[processed_data], threshold=100)

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[processed_data[2]]])

['wondering', 'she', 'opened', 'the', 'door', 'to', 'the', 'studio']


- разобраться с параметрами и тем, как именно он разделяет на би-граммы и три-граммы
- разобраться, как делать skip-n-граммы

In [12]:
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

In [13]:
make_bigrams(processed_data)

[['the', 'elephant', 'sneezed', 'at', 'the', 'sight', 'of', 'potato'],
 ['bat',
  'can',
  'see',
  'via',
  'echolocation',
  'see',
  'the',
  'bat',
  'sight',
  'sneeze'],
 ['wondering', 'she', 'opened', 'the', 'door', 'to', 'the', 'studio']]

# Кодирование текста

## sk -> Count_Vectorizer()
- учится на списке текста, а не списке слов
- binary = True - активация бинарного учета слов есть/нет

In [15]:
coding_frequency = CountVectorizer(analyzer='word',
                                   # binary=True,
                                   # min_df=2,          # минимальное количество вхождений слова
                                   ngram_range=(2,3),   # какие n-граммы учитывать
                                   #stop_words=stopwords.words("russian")
                                  )


In [16]:
tok_stem_text = []
for i in range(len(processed_data)):
    tok_stem_text.append(" ".join(processed_data[i]))
tok_stem_text

['the elephant sneezed at the sight of potato',
 'bat can see via echolocation see the bat sight sneeze',
 'wondering she opened the door to the studio']

In [17]:
res_vectorizer = coding_frequency.fit_transform(tok_stem_text)

In [18]:
# таблица частоты слов
pd.DataFrame(res_vectorizer.toarray(), columns = coding_frequency.vocabulary_.keys())

Unnamed: 0,the elephant,elephant sneezed,sneezed at,at the,the sight,sight of,of potato,the elephant sneezed,elephant sneezed at,sneezed at the,...,the door,door to,to the,the studio,wondering she opened,she opened the,opened the door,the door to,door to the,to the studio
0,1,1,0,0,0,0,0,0,0,0,...,1,1,1,0,0,0,0,0,0,0
1,0,0,1,1,1,1,1,1,0,0,...,0,0,0,0,0,0,1,1,0,0
2,0,0,0,0,0,0,0,0,1,1,...,0,0,0,1,1,1,0,0,1,1


## sk - Tfidf_Vectorizer()

In [19]:
coding_tfidf = TfidfVectorizer(# min_df=2,          # минимальное количество вхождений слова
                                ngram_range=(2,3),   # какие n-граммы учитывать
                                #stop_words=stopwords.words("russian")
                                )


In [20]:
res_vectorizer = coding_tfidf.fit_transform(tok_stem_text)
# таблица частоты слов
pd.DataFrame(res_vectorizer.toarray(), columns = coding_tfidf.vocabulary_.keys())

Unnamed: 0,the elephant,elephant sneezed,sneezed at,at the,the sight,sight of,of potato,the elephant sneezed,elephant sneezed at,sneezed at the,...,the door,door to,to the,the studio,wondering she opened,she opened the,opened the door,the door to,door to the,to the studio
0,0.27735,0.27735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.27735,0.27735,0.27735,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.242536,0.242536,0.242536,0.242536,0.242536,0.242536,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.242536,0.242536,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.27735,0.27735,...,0.0,0.0,0.0,0.27735,0.27735,0.27735,0.0,0.0,0.27735,0.27735


## gensim - doc2bow () -- Частота слов

In [27]:
# Create Dictionary
id2word = Dictionary(processed_data)

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in processed_data]

# View
for corpus_x in corpus:
    print(corpus_x)



[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 2)]
[(4, 1), (6, 1), (7, 2), (8, 1), (9, 1), (10, 2), (11, 1), (12, 1)]
[(6, 2), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1)]


In [26]:
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('at', 1),
  ('elephant', 1),
  ('of', 1),
  ('potato', 1),
  ('sight', 1),
  ('sneezed', 1),
  ('the', 2)]]

## gensim - Doc2Vec()
- понять, как именно происходит обучение
- на что влияет количество эпох, вообще параметры Doc2Vec
- увидеть логику в весах (похожие отзывы должны сходиться)

In [28]:
# Для обучения модели нам нужен список целевых документов
def tagged_document(list_of_ListOfWords):
    for x, ListOfWords in enumerate(list_of_ListOfWords):
        yield doc2vec.TaggedDocument(ListOfWords, [x])

In [29]:
# тренировочные данные
data_train = list(tagged_document(processed_data[:2]))
data_train

[TaggedDocument(words=['the', 'elephant', 'sneezed', 'at', 'the', 'sight', 'of', 'potato'], tags=[0]),
 TaggedDocument(words=['bat', 'can', 'see', 'via', 'echolocation', 'see', 'the', 'bat', 'sight', 'sneeze'], tags=[1])]

In [30]:
# Обновите модель

# Инициализация модели
d2v_model = doc2vec.Doc2Vec(vector_size=8, # длина вектора, которым будет представлено предложение
                            min_count=1,    # min кол-во встречания слова в прпедложении для учета
                            epochs=30,      # количество эпох
                           )
# новые данные
data_new = list(tagged_document(processed_data))

# расширить словарный запас
d2v_model.build_vocab(data_new)

# Обучение модели Doc2Vec
d2v_model.train(data_new, total_examples=d2v_model.corpus_count, epochs=d2v_model.epochs)

# Анализ выходных данных
analyze = d2v_model.infer_vector(['the', 'eleph', 'sneez', 'at', 'the', 'sight', 'of', 'potato'])
analyze

array([ 0.00599566, -0.05792361, -0.06131075,  0.02410918, -0.02576405,
       -0.00572477, -0.02444453,  0.03992262], dtype=float32)

In [31]:
d2v_model.infer_vector(['a', 'eleph', 'sneez', 'at', 'the', 'sight', 'of', 'potato'])

array([-0.01691687, -0.00139169,  0.00394075, -0.02051513, -0.00307254,
       -0.0126081 , -0.03165334, -0.0232961 ], dtype=float32)

# Модели

## sk -> LDA()

In [32]:
model_type = LatentDirichletAllocation(n_components=5,   # количество тем
                                      #learning_method='online',
                                      random_state=42,
                                      n_jobs=-1)

In [33]:
model_type.fit(res_vectorizer)   # принимает результат CountVectorizer и аналогичные

In [34]:
print("Log Likelihood", model_type.score(res_vectorizer))
print("Perplexity", model_type.perplexity(res_vectorizer))

Log Likelihood -71.5153058127203
Perplexity 549.8728032364252


## sk -> LSI()

## gensim -> LDA() + метрики coherance и perplexy

In [41]:
# stemmer = SnowballStemmer('russian')
# stemmer = SnowballStemmer('english')
# stem_nltk = []
# for sentence in tokens:
#     stem_nltk.append(list(stemmer.stem(word_x) for word_x in sentence))
# stem_nltk[0]


nltk.download('wordnet')# Нужно
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

# подготовка данных
processed_data = []
for x, doc in enumerate(tokens):
    doc_out = []
    for word in doc:
        lemmatized_word = lemmatizer.lemmatize(word)  # лемматизация
        if lemmatized_word:
            #print (lemmatized_word)
            doc_out.append(lemmatized_word)

    processed_data.append(doc_out) # processed_data это список слов

# словарь из стемминга
id2word = Dictionary(processed_data)

# кодируем корпус
corpus = [id2word.doc2bow(text) for text in processed_data]

model=gensim.models.ldamodel.LdaModel(corpus=corpus,
                                      id2word=id2word,
                                      num_topics=3,
                                      random_state=42,
                                      update_every=1,
                                      # chunksize=100,
                                      # passes=10,
                                      # alpha='auto',
                                      #per_word_topics=True,
                                     )
coherencemodel = CoherenceModel(model=model, texts=processed_data, dictionary=id2word, coherence='c_v')
print("Coherence: ", coherencemodel.get_coherence())
print('Perplexity: ', model.log_perplexity(corpus))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Coherence:  0.2714203823462893
Perplexity:  -3.817860328234159
