In [1]:
import pandas as pd
import numpy as np

import tomotopy as to
import tomotopy.coherence as coherence

import time

## Loading data & creating a list of documents

In [2]:
# Column labels applied to the lemmatized-text dataset when it is read
names = [
    'id', 'headline', 'description', 'article_body', 'all_text', 'text_lem',
    'date', 'newspaper', 'city', 'md_index', 'url',
]

# Read the spreadsheet, using its first column as the row index
data = pd.read_excel('SV-women-data-lem.xlsx', names=names, index_col=0)
data

Unnamed: 0,id,headline,description,article_body,all_text,text_lem,date,newspaper,city,md_index,url
0,1,«Женщины для утех» по-прежнему нужны: Япония и...,Во время колонизации Кореи Японией в XX веке к...,Во время колонизации Кореи Японией в XX веке к...,«Женщины для утех» по-прежнему нужны: Япония и...,женщина утеха нужный япония китай борьба время...,2016-01-03,ИА Regnum,Москва,2.312,http://regnum.ru/news/2048238.html
1,2,В Норвегии мигрантов научат не насиловать женщин,Наплыв беженцев из Сирии и других стран Ближне...,\n ...,В Норвегии мигрантов научат не насиловать женщ...,норвегия мигрант научить насиловать беженец си...,2016-01-04,Известия (iz.ru),Москва,1.113,http://izvestia.ru/news/600956
2,3,В Кельне задержали первого подозреваемого в на...,\tПрокуратура Кельна сообщила о задержании пер...,"Сейчас+6˚CСейчас в Санкт-ПетербургеОблачно, Бе...",В Кельне задержали первого подозреваемого в на...,кёльн задержать первый подозревать нападение п...,2016-01-05,Фонтанка (fontanka.ru),Санкт-Петербург,1.186,http://www.fontanka.ru/2016/01/05/077/
3,4,В Дании полицейские научат беженцев вежливому ...,В одной из пяти административных областей Дани...,В одной из пяти административных областей Дани...,В Дании полицейские научат беженцев вежливому ...,дания полицейский научить беженец вежливый обр...,2016-01-05,Российская газета (rg.ru),Москва,1.266,http://www.rg.ru/2016/01/05/keln-site.html
4,5,А каждого Михеля демократично переименуют в Му...,"Наш спецкор Дарья Асламова выясняет, в чем при...",В теплом кафе мой собеседник небрежно скидывае...,А каждого Михеля демократично переименуют в Му...,михель демократично переименовать спецкор дарь...,2016-01-06,Комсомольская правда (kp.ru),Москва,0.960,http://www.kp.ru/daily/26476/3347751/
...,...,...,...,...,...,...,...,...,...,...,...
15337,15338,"«Фантазируя об изнасиловании, женщина чувствуе...","Почему во время месячных хочется больше секса,...",\n— Получаю особенное удовольствие от секса во...,"«Фантазируя об изнасиловании, женщина чувствуе...",фантазировать изнасилование женщина чувствоват...,2023-12-28,Газета.Ru,Москва,1.963,https://www.gazeta.ru/culture/news/2023/12/28/...
15338,15339,Обвиняемого в домогательствах екатеринбургског...,Обвиняемого в домогательствах екатеринбургског...,6 октября – ИА SM.News. Обвиняемого в домогате...,Обвиняемого в домогательствах екатеринбургског...,обвинять домогательство екатеринбургский масса...,2023-12-29,Eadaily.com,Москва,1.469,https://eadaily.com/ru/news/2023/12/29/prosto-...
15339,15340,Анджелина Джоли и Гвинет Пэлтроу обвинили Вайн...,Звезды Голливуда Анджелина Джоли и Гвинет Пэлт...,\n ...,Анджелина Джоли и Гвинет Пэлтроу обвинили Вайн...,анджелина джоля гвинет пэлтроу обвинить вайншт...,2023-12-29,Известия (iz.ru),Москва,1.200,https://iz.ru/1628155/2023-12-29/brata-maikla-...
15340,15341,"Адвокат Людмила Айвар рассказала, как российск...","Адвокат Людмила Айвар рассказала, как российск...","7 марта 2023, 16:59 — Общественная служба но...","Адвокат Людмила Айвар рассказала, как российск...",адвокат людмила айвар рассказать российский же...,2023-12-30,Общественная служба новостей (osnmedia.ru),Москва,0.959,https://www.osnmedia.ru/kultura/bolee-600-deya...


In [3]:
# Keep only the lemmatized-text column, as a one-column DataFrame
docs = data['text_lem'].to_frame()

In [4]:
# Join every document into one space-separated string, split it back into
# tokens and keep the distinct non-empty ones
joined_text = docs.text_lem.str.cat(sep=' ').strip()
uniq_words = [token for token in set(joined_text.split(' ')) if token]
uniq_words_len = len(uniq_words)
print('The number of unique words is', uniq_words_len)

The number of unique words is 63240


In [5]:
# One token list per non-missing document (whitespace tokenization)
words_in_docs = [text.split() for text in docs.text_lem.dropna().values]

## Main attributes

In [6]:
# Reference notes only: the triple-quoted string below is never executed.
# It lists model accessors for inspecting a trained topic model.
# NOTE(review): `mdl` is not defined anywhere in the visible notebook;
# presumably a placeholder for any trained model (e.g. `lda`) - confirm.
'''
# Topic distribution for each document (probabilities)
mdl.docs[0].get_topic_dist() 

# Word distribution for each topic (only probabilities)
mdl.get_topic_word_dist(topic_id = 0)

# Word distribution for each topic (words + probabilities)
mdl.get_topic_words(topic_id=0, top_n=50)

# Number of words allocated to each topic (num)
mdl.get_count_by_topics()

# hLDA Words distribution for each of the live topics (only probabilities)
words_topics_distr = list(map(lambda x: hlda.get_topic_word_dist(x) if hlda.is_live_topic(x) else [], range(hlda.k)))
'''

# Ending the cell with a print keeps the string above from being the cell's
# last expression, so it is not echoed as the cell output.
print('')




## Setting hyperparameters

In [7]:
# Random seed passed to every model constructor below
seed = 12345

# Candidate value grids (commented out, kept for reference)
#num_topics = list(np.arange(5, 30, 5)) + list(np.arange(30, 110, 10))
#eta_hlda = [0.0001, 0.001, 0.01, 0.2,  0.3, 0.5, 0.7, 1]

# Topic count: `k` for LDA/CTM and `k2` for PA/HPA
num_topics = 40

# Priors shared by the models
# https://ethen8181.github.io/machine-learning/clustering/topic_model/LDA.html
alpha = 0.001
eta = 0.00001

# Settings used only by the hierarchical LDA model
eta_hlda = 0.2
gamma = 0.001

## Latent Dirichlet Allocation (LDAModel)

In [8]:
%%time

# Build the LDA model with `num_topics` topics and the shared priors.
lda = to.LDAModel(k=num_topics, alpha=alpha, eta=eta, seed=seed)

# add_doc is called only for its side effect, so use a plain loop instead
# of materialising a throw-away list with list(map(...)).
for doc_words in words_in_docs:
    lda.add_doc(doc_words)

# 100 sampling iterations on a single worker.
lda.train(workers=1, iter=100)

# summary() writes the report itself and returns None; wrapping it in
# print() would append a stray "None" line, so call it directly (as the
# other model cells do) and emit the blank separator line separately.
lda.summary(topic_word_top_n=30)
print()

# Coherence over the top 50 words of each topic.
ch = coherence.Coherence(lda, top_n=50)
print('-----------------------------------------------')
print('Coherence score =', ch.get_score(), end='\n\n')

<Basic Info>
| LDAModel (current version: 0.12.7)
| 15342 docs, 2850809 words
| Total Vocabs: 63240, Used Vocabs: 63240
| Entropy of words: 8.23205
| Entropy of term-weighted words: 8.23205
| Removed Vocabs: <NA>
|
<Training Info>
| Iterations: 100, Burn-in steps: 0
| Optimization Interval: 10
| Log-likelihood per word: -8.85266
|
<Initial Parameters>
| tw: TermWeight.ONE
| min_cf: 0 (minimum collection frequency of words)
| min_df: 0 (minimum document frequency of words)
| rm_top: 0 (the number of top words to be removed)
| k: 40 (the number of topics between 1 ~ 32767)
| alpha: [0.001] (hyperparameter of Dirichlet distribution for document-topic, given as a single `float` in case of symmetric prior and as a list with length `k` of `float` in case of asymmetric prior.)
| eta: 1e-05 (hyperparameter of Dirichlet distribution for topic-word)
| seed: 12345 (random seed)
| trained in version 0.12.7
|
<Parameters>
| alpha (Dirichlet prior on the per-document topic distributions)
|  [0.21325

## Hierarchical LDA (HLDAModel)

In [9]:
%%time

# Hierarchical LDA: three-level hierarchy with its own topic-word prior
hlda = to.HLDAModel(
    depth=3,
    alpha=alpha,
    eta=eta_hlda,
    gamma=gamma,
    seed=seed,
)

# Feed the tokenized documents to the model one by one
for doc_words in words_in_docs:
    hlda.add_doc(doc_words)

# 100 iterations on a single worker, then show the model report
hlda.train(iter=100, workers=1)
hlda.summary(topic_word_top_n=30)

# Coherence over the top 50 words of each topic
ch = coherence.Coherence(hlda, top_n=50)
print('-----------------------------------------------')
print('Coherence score =', ch.get_score(), end='\n\n')

<Basic Info>
| HLDAModel (current version: 0.12.7)
| 15342 docs, 2850809 words
| Total Vocabs: 63240, Used Vocabs: 63240
| Entropy of words: 8.23205
| Entropy of term-weighted words: 8.23205
| Removed Vocabs: <NA>
|
<Training Info>
| Iterations: 100, Burn-in steps: 0
| Optimization Interval: 10
| Log-likelihood per word: -8.32406
|
<Initial Parameters>
| tw: TermWeight.ONE
| min_cf: 0 (minimum collection frequency of words)
| min_df: 0 (minimum document frequency of words)
| rm_top: 0 (the number of top words to be removed)
| depth: 3 (the maximum depth level of hierarchy between 2 ~ 32767)
| alpha: [0.001] (hyperparameter of Dirichlet distribution for document-depth level, given as a single `float` in case of symmetric prior and as a list with length `depth` of `float` in case of asymmetric prior.)
| eta: 0.2 (hyperparameter of Dirichlet distribution for topic-word)
| gamma: 0.001 (concentration coeficient of Dirichlet Process)
| seed: 12345 (random seed)
| trained in version 0.12.7
|

## Pachinko Allocation (PAModel)

In [10]:
%%time

# Pachinko Allocation: 3 super topics over `num_topics` sub topics
pa = to.PAModel(
    k1=3,
    k2=num_topics,
    alpha=alpha,
    subalpha=alpha,
    eta=eta,
    seed=seed,
)

# Register each tokenized document with the model
for doc_words in words_in_docs:
    pa.add_doc(doc_words)

# Train for 100 iterations on one worker and show the report
pa.train(iter=100, workers=1)
pa.summary(topic_word_top_n=30)

# Coherence computed with the top 50 words per topic
ch = coherence.Coherence(pa, top_n=50)
print('-----------------------------------------------')
print('Coherence score =', ch.get_score(), end='\n\n')

<Basic Info>
| PAModel (current version: 0.12.7)
| 15342 docs, 2850809 words
| Total Vocabs: 63240, Used Vocabs: 63240
| Entropy of words: 8.23205
| Entropy of term-weighted words: 8.23205
| Removed Vocabs: <NA>
|
<Training Info>
| Iterations: 100, Burn-in steps: 0
| Optimization Interval: 1
| Log-likelihood per word: -10.20365
|
<Initial Parameters>
| tw: TermWeight.ONE
| min_cf: 0 (minimum collection frequency of words)
| min_df: 0 (minimum document frequency of words)
| rm_top: 0 (the number of top words to be removed)
| k1: 3 (the number of super topics between 1 ~ 32767)
| k2: 40 (the number of sub topics between 1 ~ 32767)
| alpha: [0.001] (initial hyperparameter of Dirichlet distribution for document-super topic, given as a single `float` in case of symmetric prior and as a list with length `k1` of `float` in case of asymmetric prior.)
| subalpha: [0.001] (initial hyperparameter of Dirichlet distribution for super-sub topic, given as a single `float` in case of symmetric prior a

## Hierarchical PA (HPAModel)

In [11]:
%%time

# Hierarchical Pachinko Allocation with the same k1/k2 layout as the PA cell
hpa = to.HPAModel(
    k1=3,
    k2=num_topics,
    alpha=alpha,
    subalpha=alpha,
    eta=eta,
    seed=seed,
)

# Add the tokenized documents to the model
for doc_words in words_in_docs:
    hpa.add_doc(doc_words)

# Single-worker training for 100 iterations, followed by the report
hpa.train(iter=100, workers=1)
hpa.summary(topic_word_top_n=30)

# Coherence based on the 50 highest-ranked words of each topic
ch = coherence.Coherence(hpa, top_n=50)
print('-----------------------------------------------')
print('Coherence score =', ch.get_score(), end='\n\n')

<Basic Info>
| HPAModel (current version: 0.12.7)
| 15342 docs, 2850809 words
| Total Vocabs: 63240, Used Vocabs: 63240
| Entropy of words: 8.23205
| Entropy of term-weighted words: 8.23205
| Removed Vocabs: <NA>
|
<Training Info>
| Iterations: 100, Burn-in steps: 0
| Optimization Interval: 1
| Log-likelihood per word: -9.88751
|
<Initial Parameters>
| tw: TermWeight.ONE
| min_cf: 0 (minimum collection frequency of words)
| min_df: 0 (minimum document frequency of words)
| rm_top: 0 (the number of top words to be removed)
| k1: 3 (the number of super topics between 1 ~ 32767)
| k2: 40 (the number of sub topics between 1 ~ 32767)
| alpha: [0.001] (initial hyperparameter of Dirichlet distribution for document-topic, given as a single `float` in case of symmetric prior and as a list with length `k1 + 1` of `float` in case of asymmetric prior.)
| subalpha: [0.001] (initial hyperparameter of Dirichlet distribution for super-sub topic, given as a single `float` in case of symmetric prior and

## Correlated Topic Model (CTModel)

In [12]:
%%time

# Correlated Topic Model with `num_topics` topics; `alpha` is passed as the
# smoothing value and `eta` as the topic-word prior.
ctm = to.CTModel(k=num_topics, smoothing_alpha=alpha, eta=eta, seed=seed)

# add_doc is called only for its side effect, so use a plain loop instead
# of materialising a throw-away list with list(map(...)).
for doc_words in words_in_docs:
    ctm.add_doc(doc_words)

# 100 iterations on a single worker, then show the model report.
ctm.train(workers=1, iter=100)
ctm.summary(topic_word_top_n=30)

# Score the CTM itself. The original passed `pa` here (copy-paste from the
# Pachinko cell), so the value reported belonged to the PA model.
ch = coherence.Coherence(ctm, top_n=50)
print('-----------------------------------------------')
print('Coherence score =', ch.get_score(), end='\n\n')

<Basic Info>
| CTModel (current version: 0.12.7)
| 15342 docs, 2850809 words
| Total Vocabs: 63240, Used Vocabs: 63240
| Entropy of words: 8.23205
| Entropy of term-weighted words: 8.23205
| Removed Vocabs: <NA>
|
<Training Info>
| Iterations: 100, Burn-in steps: 0
| Optimization Interval: 2
| Log-likelihood per word: -7.00474
|
<Initial Parameters>
| tw: TermWeight.ONE
| min_cf: 0 (minimum collection frequency of words)
| min_df: 0 (minimum document frequency of words)
| rm_top: 0 (the number of top words to be removed)
| k: 40 (the number of topics between 1 ~ 32767)
| smoothing_alpha: [0.001] (small smoothing value for preventing topic counts to be zero, given as a single `float` in case of symmetric and as a list with length `k` of `float` in case of asymmetric.)
| eta: 1e-05 (hyperparameter of Dirichlet distribution for topic-word)
| seed: 12345 (random seed)
| trained in version 0.12.7
|
<Parameters>
| prior_mean (Prior mean of Logit-normal for the per-document topic distribution