In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords as nltk_stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
import re
import numpy as np
import pandas as pd
from pprint import pprint
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
# spacy for lemmatization
import spacy
# Plotting tools
!pip install pyLDAvis
import pyLDAvis
import pyLDAvis.gensim  # don't skip this

import matplotlib.pyplot as plt
%matplotlib inline


Collecting pyLDAvis
[?25l  Downloading https://files.pythonhosted.org/packages/24/38/6d81eff34c84c9158d3b7c846bff978ac88b0c2665548941946d3d591158/pyLDAvis-3.2.2.tar.gz (1.7MB)
[K     |████████████████████████████████| 1.7MB 10.7MB/s 
Collecting funcy
  Downloading https://files.pythonhosted.org/packages/66/89/479de0afbbfb98d1c4b887936808764627300208bb771fcd823403645a36/funcy-1.15-py2.py3-none-any.whl
Building wheels for collected packages: pyLDAvis
  Building wheel for pyLDAvis (setup.py) ... [?25l[?25hdone
  Created wheel for pyLDAvis: filename=pyLDAvis-3.2.2-py2.py3-none-any.whl size=135593 sha256=d2975ecb326f43658091b92ee2a6db9613f35a35fcb18ed1a80f5965e37a01e3
  Stored in directory: /root/.cache/pip/wheels/74/df/b6/97234c8446a43be05c9a8687ee0db1f1b5ade5f27729187eae
Successfully built pyLDAvis
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-1.15 pyLDAvis-3.2.2


  from collections import Iterable


In [None]:
# Russian stop-word list from NLTK, stored as a set because the token filter
# only needs membership tests against it.
stopwords = set(nltk_stopwords.words('russian'))

In [None]:
import sqlite3

In [None]:
# Open the SQLite database file with the jokes.
db_path = "/content/drive/MyDrive/anekdot-ru.db"
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

# Load only the `text` column of the rows dated within January 2019.
january_query = "SELECT text FROM anecdot WHERE date BETWEEN '2019-01-01' and '2019-01-31'"
df_anek = pd.read_sql_query(january_query, conn)

In [None]:
df_anek

Unnamed: 0,text
0,1 января — День святого рассола и минералки.
1,"Уровень тишины 1 января: слышно, как у соседей..."
2,В 8 часов утра 1 января собачники могли бы зах...
3,"- А я вот загадал на Новый Год желание, чтобы ..."
4,Новый год без ели – что Дед Мороз без хмеля!\n...
...,...
1995,"Это как надо было не хотеть дочку, чтобы назва..."
1996,"Всё было отлично, но потом в комнату зашёл экс..."
1997,Хитрожопый краб торговал своими клешнями. Это ...
1998,"- А ведь я полюбил тебя, дорогая, за твою кули..."


In [None]:
def preprocess(data, stop_words=None):
  """Lower-case, tokenize and stop-word-filter a collection of texts.

  Args:
    data: iterable of documents. Entries that are not strings (for example
      None/NaN coming from NULL database rows) produce an empty token list,
      so the result stays aligned with the input one-to-one.
    stop_words: optional container of tokens to drop. When omitted, the
      notebook-level ``stopwords`` set is used.

  Returns:
    A list with one list of tokens per input document. Tokens are the
    ``\\w+`` runs of the lower-cased text that are not stop words.
  """
  if stop_words is None:
    # Backward-compatible default: the global set built earlier in the notebook.
    stop_words = stopwords

  data_tokenized = []
  for text in data:
    if not isinstance(text, str):
      # Keep a placeholder instead of crashing on .lower() for missing text.
      data_tokenized.append([])
      continue
    tokens = re.findall(r'\w+', text.lower())
    data_tokenized.append([w for w in tokens if w not in stop_words])

  return data_tokenized

anekdots = preprocess(df_anek['text'])

In [None]:
anekdots[:2]

[['1', 'января', 'день', 'святого', 'рассола', 'минералки'],
 ['уровень',
  'тишины',
  '1',
  'января',
  'слышно',
  'соседей',
  'сверху',
  'вибрирует',
  'телефон']]

In [None]:
!pip install pymorphy2
import pymorphy2
morph = pymorphy2.MorphAnalyzer()

In [None]:
# Lemmatize every tokenized joke with pymorphy2, dropping purely numeric tokens.
# Each distinct word is parsed once and its lemma (normal form of the first
# parse) is cached, because the same words recur across many documents.
lemma_cache = {}
anek_lem = []
for tokens in anekdots:
  lemmas = []
  for word in tokens:
    if word.isdigit():
      continue
    if word not in lemma_cache:
      lemma_cache[word] = morph.parse(word)[0].normal_form
    lemmas.append(lemma_cache[word])
  anek_lem.append(lemmas)

In [None]:
anek_lem[:2]

[['январь', 'день', 'святой', 'рассол', 'минералка'],
 ['уровень',
  'тишина',
  'январь',
  'слышный',
  'сосед',
  'сверху',
  'вибрировать',
  'телефон']]

In [None]:
# The lemmatized documents are the texts the model is built on.
texts = anek_lem
# Dictionary: maps each distinct lemma to an integer id.
id2word = corpora.Dictionary(texts)
# Bag-of-words corpus: one list of (token id, count) pairs per document.
corpus = list(map(id2word.doc2bow, texts))
# Show the first two documents in bag-of-words form.
print(corpus[:2])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)], [(4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1)]]


In [None]:
# Readable view of the first bag-of-words: token ids replaced by the tokens themselves.
[[(id2word[token_id], count) for token_id, count in bow] for bow in corpus[:1]]

[[('день', 1), ('минералка', 1), ('рассол', 1), ('святой', 1), ('январь', 1)]]

In [None]:
# Exploratory 30-topic LDA model trained on the full bag-of-words corpus.
# random_state is fixed (same seed as the other models in this notebook) so
# that the topics and the scores computed from them can be reproduced.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=30,
                                           random_state=100,
                                           passes=20,
                                           alpha='auto',
                                           per_word_topics=True)

In [None]:
# Print the ten highest-weighted words of every topic, numbering topics from 1.
# num_topics=-1 asks gensim for all topics, whatever the model size, instead
# of a hard-coded count that has to be kept in sync with the model.
for topic_idx, description in lda_model.print_topics(num_topics=-1, num_words=10):
  print('Topic', topic_idx + 1, ':', description)

Topic 1 : 0.014*"это" + 0.009*"писать" + 0.009*"год" + 0.008*"письмо" + 0.007*"человек" + 0.006*"мужик" + 0.006*"свой" + 0.005*"дедушка" + 0.005*"россия" + 0.005*"секс"
Topic 2 : 0.013*"это" + 0.005*"человек" + 0.005*"похожий" + 0.005*"ранний" + 0.004*"сделать" + 0.004*"новый" + 0.004*"год" + 0.004*"ещё" + 0.004*"вместо" + 0.004*"почему"
Topic 3 : 0.009*"свой" + 0.009*"всё" + 0.007*"это" + 0.006*"который" + 0.005*"работать" + 0.005*"один" + 0.005*"новый" + 0.005*"водка" + 0.005*"любить" + 0.004*"путин"
Topic 4 : 0.014*"это" + 0.007*"который" + 0.007*"человек" + 0.006*"почему" + 0.006*"страна" + 0.006*"ввести" + 0.006*"венесуэла" + 0.005*"знать" + 0.005*"власть" + 0.005*"против"
Topic 5 : 0.010*"это" + 0.006*"путин" + 0.005*"человек" + 0.005*"украина" + 0.005*"делать" + 0.005*"два" + 0.005*"знать" + 0.004*"россия" + 0.004*"мой" + 0.004*"свой"
Topic 6 : 0.010*"новый" + 0.009*"год" + 0.007*"россия" + 0.005*"япония" + 0.005*"время" + 0.005*"говорить" + 0.005*"маленький" + 0.004*"экономика"

In [None]:
# log_perplexity returns the per-word likelihood bound, not the perplexity
# itself: a higher bound (closer to 0) is better. Per the gensim documentation
# the perplexity is 2 ** (-bound), and for that number lower is better.
per_word_bound = lda_model.log_perplexity(corpus)
print('\nPer-word likelihood bound: ', per_word_bound)
print('Perplexity: ', np.exp2(-per_word_bound))
# Compute Coherence Score (c_v) against the lemmatized texts.
coherence_model_lda = CoherenceModel(model=lda_model, texts=anek_lem, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

# Values recorded from an earlier run:
# Per-word likelihood bound:  -9.259945161246456
# Coherence Score:  0.4773445677408298

In [None]:
# Switch pyLDAvis to notebook rendering, then build the visualisation data for
# the current lda_model from the bag-of-words corpus and its dictionary.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
# Leaving `vis` as the last expression makes the notebook display it.
vis

In [None]:
def compute_coherence_values(corpus, dictionary, k, a, b, texts=None):
    """Train one LDA model and return its c_v coherence score.

    Args:
        corpus: bag-of-words corpus the model is trained on.
        dictionary: gensim dictionary used both for training and for scoring.
        k: number of topics.
        a: document-topic prior passed to the model as ``alpha``.
        b: topic-word prior passed to the model as ``eta``.
        texts: tokenized documents used to score coherence. When omitted, the
            notebook-level ``anek_lem`` is used, which keeps existing calls
            working unchanged.

    Returns:
        The c_v coherence of the trained model.
    """
    if texts is None:
        # Backward-compatible default: the full set of lemmatized documents.
        texts = anek_lem

    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k,
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b)

    # Score with the dictionary that was passed in, not the global id2word,
    # so the function gives consistent results for any dictionary argument.
    coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')

    return coherence_model_lda.get_coherence()

In [None]:
# Document count of the bag-of-words corpus, followed by a check of the count's type.
num_of_docs = len(corpus)
type(num_of_docs)

int

In [None]:
import itertools

import numpy as np
import tqdm

grid = {}
grid['Validation_Set'] = {}
# Topics range
min_topics = 2
max_topics = 11
step_size = 1
topics_range = range(min_topics, max_topics, step_size)
# Alpha parameter. Rounding removes the float drift of np.arange
# (0.9099999999999999 -> 0.91) so the saved values are the intended grid points.
alpha = list(np.round(np.arange(0.01, 1, 0.3), 2))
alpha.append('symmetric')
alpha.append('asymmetric')
# Beta parameter (same numeric grid, plus the symmetric prior)
beta = list(np.round(np.arange(0.01, 1, 0.3), 2))
beta.append('symmetric')
# Validation sets: the first 75% of the documents and the full corpus
num_of_docs = len(corpus)
corpus_sets = [gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)),
               corpus]
corpus_title = ['75% Corpus', '100% Corpus']
model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

# One model is trained per (corpus, topics, alpha, beta) combination; the
# progress-bar total is derived from the grid instead of being hard-coded.
total_runs = len(corpus_sets) * len(topics_range) * len(alpha) * len(beta)

# Can take a long time to run
with tqdm.tqdm(total=total_runs) as pbar:  # the bar is closed even if a run fails
    # product() iterates in the same order as nested loops:
    # corpus outermost, then topics, alpha and beta innermost.
    for (title, corpus_set), k, a, b in itertools.product(
            zip(corpus_title, corpus_sets), topics_range, alpha, beta):
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus_set, dictionary=id2word,
                                      k=k, a=a, b=b)
        # Save the model results
        model_results['Validation_Set'].append(title)
        model_results['Topics'].append(k)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(b)
        model_results['Coherence'].append(cv)

        pbar.update(1)

# Persist the whole grid so it can be inspected without re-running the search.
pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False)

In [None]:
# Show the last few grid-search rows as a table. Only the final expression of
# a cell is displayed, so listing the columns one per line would show just one
# of them (as an unabridged list).
pd.DataFrame(model_results).tail()

In [None]:
# Reload the grid-search results from the same relative path they were written
# to, so the cell does not depend on a particular working directory.
# read_csv already returns a DataFrame; no extra wrapping is needed.
result = pd.read_csv('lda_tuning_results.csv')
result

Unnamed: 0,Validation_Set,Topics,Alpha,Beta,Coherence
0,75% Corpus,2,0.01,0.01,0.185373
1,75% Corpus,2,0.01,0.31,0.220443
2,75% Corpus,2,0.01,0.61,0.207812
3,75% Corpus,2,0.01,0.9099999999999999,0.178013
4,75% Corpus,2,0.01,symmetric,0.187350
...,...,...,...,...,...
535,100% Corpus,10,asymmetric,0.01,0.468723
536,100% Corpus,10,asymmetric,0.31,0.557989
537,100% Corpus,10,asymmetric,0.61,0.463407
538,100% Corpus,10,asymmetric,0.9099999999999999,0.394659


In [None]:
# Alpha and Beta are compared as strings: besides numbers, these columns also
# hold labels such as 'symmetric', so the reloaded values are text.
filt = result['Beta'].eq('0.01')
result[filt]
# Rows whose alpha is 0.01; as the last expression, this is what the cell displays.
filt2 = result['Alpha'].eq('0.01')
result[filt2]

Unnamed: 0,Validation_Set,Topics,Alpha,Beta,Coherence
0,75% Corpus,2,0.01,0.01,0.185373
1,75% Corpus,2,0.01,0.31,0.220443
2,75% Corpus,2,0.01,0.61,0.207812
3,75% Corpus,2,0.01,0.9099999999999999,0.178013
4,75% Corpus,2,0.01,symmetric,0.187350
...,...,...,...,...,...
510,100% Corpus,10,0.01,0.01,0.384688
511,100% Corpus,10,0.01,0.31,0.449954
512,100% Corpus,10,0.01,0.61,0.446260
513,100% Corpus,10,0.01,0.9099999999999999,0.370303


In [None]:
# Retrain a single-core LDA model with one fixed hyper-parameter set:
# 8 topics, alpha 0.61, eta 0.31, seeded for repeatable results.
final_lda_params = dict(
    num_topics=8,
    random_state=100,
    chunksize=100,
    passes=20,
    alpha=0.61,
    eta=0.31,
)
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word, **final_lda_params)

In [None]:
# Rebuild the pyLDAvis view for the lda_model currently in scope (the 8-topic
# model retrained above) using the same corpus and dictionary.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
# The bare name as the last expression is what renders the widget.
vis

In [None]:
# List the ten highest-weighted words of each of the 8 topics, numbered from 1.
for topic_idx, description in lda_model.print_topics(num_topics=8, num_words=10):
  print('Topic', topic_idx + 1, ':', description)

Topic 1 : 0.014*"сказать" + 0.011*"женщина" + 0.009*"мужик" + 0.008*"мужчина" + 0.008*"жена" + 0.007*"какой" + 0.007*"думать" + 0.007*"знать" + 0.006*"должный" + 0.006*"коррупция"
Topic 2 : 0.017*"россия" + 0.013*"человек" + 0.011*"стать" + 0.010*"деньга" + 0.007*"сам" + 0.006*"новость" + 0.006*"сенатор" + 0.006*"ребёнок" + 0.006*"который" + 0.006*"правительство"
Topic 3 : 0.027*"говорить" + 0.011*"самый" + 0.011*"ещё" + 0.009*"давать" + 0.009*"ты" + 0.008*"хороший" + 0.007*"купить" + 0.007*"начать" + 0.006*"найти" + 0.005*"дело"
Topic 4 : 0.024*"свой" + 0.009*"украина" + 0.008*"сделать" + 0.005*"сша" + 0.005*"москва" + 0.005*"интересно" + 0.004*"таки" + 0.004*"гражданин" + 0.003*"выборы" + 0.003*"имя"
Topic 5 : 0.014*"всё" + 0.013*"венесуэла" + 0.012*"президент" + 0.011*"яйцо" + 0.010*"весь" + 0.008*"мадурый" + 0.008*"путин" + 0.008*"тот" + 0.008*"каждый" + 0.007*"народ"
Topic 6 : 0.017*"хотеть" + 0.013*"время" + 0.013*"делать" + 0.010*"очень" + 0.009*"русский" + 0.008*"спрашивать" + 