In [1]:
import tomotopy as to
import pandas as pd
import numpy as np

In [2]:
# Read the lemmatized dataset. The CSV is read without a header row and its
# column is exposed under the name 'docs'.
df = pd.read_csv(
    'Amazon_lem.csv',
    header=None,
    names=['docs'],
)


In [3]:
# Collect the distinct words of the corpus and count them.
# Tokenization uses str.split() with no argument (any run of whitespace),
# the same rule used to build the per-document word lists for the model, so
# the vocabulary size reported here matches what the model is trained on.
# Missing documents (NaN) are skipped; split() never yields empty tokens.
uniq_words = list({word for doc in df.docs.dropna() for word in doc.split()})
uniq_words_len = len(uniq_words)
print('the number of unique words', uniq_words_len)

the number of unique words 50965


In [4]:
# Tokenize every non-missing document on whitespace, giving one list of
# words per document.
words_in_docs = [text.split() for text in df.docs.dropna().values]

['cs', 'okstate', 'chong', 'kermit', 'available', 'windows', 'article', 'steve', 'frampton', 'wondering', 'kermit', 'package', 'actual', 'package', 'usual', 'ftp', 'sites', 'chong']


In [5]:
# ---- hLDA experiment configuration ----

level = 3          # depth of the hierarchical model
alpha = 0.001      # initial value of the alpha parameter

# Values of the eta parameter to try.
myeta = [0.001, 0.01, 0.2, 0.3, 0.5, 0.7, 1]
# Values of the gamma parameter to try.
mygamma = [0.001]

mruns = 20         # number of runs of the same model

# CSV file that collects the results.
myfile = 'HLDA_Renyi_Amazon.csv'

In [6]:
def get_topic_info(x):
    """Describe topic ``x`` of the module-level ``hlda`` model.

    Returns an empty dict when ``hlda.is_live_topic(x)`` is false. Otherwise
    returns a dict with the topic's parent ('parent_topic'), its children
    ('children_topics') and the value of ``hlda.level(x)`` plus one ('level').
    """
    if not hlda.is_live_topic(x):
        return {}

    info = {}
    info['parent_topic'] = hlda.parent_topic(x)
    info['children_topics'] = hlda.children_topics(x)
    info['level'] = hlda.level(x) + 1
    return info

In [7]:
import os
import time


def _renyi_entropy(probs, num_words, threshold):
    """Compute the Renyi entropy for the topics of one hierarchy level.

    Parameters
    ----------
    probs : 2-D numpy array with one row per vocabulary word and one column
        per topic of the level (word probabilities of each topic).
    num_words : number of words in the vocabulary.
    threshold : only probabilities strictly greater than this value are
        counted.

    Returns
    -------
    The Renyi entropy, or NaN when the level has fewer than two topics
    (the formula divides by ``num_topics - 1`` and by ``num_topics``).
    """
    num_topics = probs.shape[1]
    if num_topics < 2:
        return float('nan')

    significant = probs > threshold
    word_ratio = np.count_nonzero(significant)
    sum_prob = probs[significant].sum()

    # Shannon entropy
    shannon = np.log(word_ratio / (num_words * num_topics))
    # Internal energy
    energy = -np.log(sum_prob / num_topics)
    # Free energy
    free_energy = energy - shannon * num_topics
    # Renyi entropy
    return free_energy / (num_topics - 1)


start = time.time()

# Write the CSV header only when the results file is missing or empty, so
# re-running this cell keeps appending rows without duplicating the header.
if not os.path.exists(myfile) or os.path.getsize(myfile) == 0:
    header_columns = ['gamma', 'eta', 'alpha',
                      'Num topics (lev 1)', 'Renyi(lev 1)',
                      'Num topics (lev 2)', 'Renyi1(lev 2)',
                      'perplexity', 'Log-likelihood']
    with open(myfile, 'a') as file:
        file.write(';'.join(header_columns) + '\n')

for myrun in range(mruns):
    for gamma in mygamma:
        for eta in myeta:
            # Initialize the model with the current parameters
            hlda = to.HLDAModel(depth=level, alpha=alpha, gamma=gamma, eta=eta)
            # Add the documents to the model
            for doc in words_in_docs:
                hlda.add_doc(doc)

            # Train the model
            hlda.train(workers=8, iter=100)

            # Word distribution of every topic id in [0, k); topics that are
            # not live get an empty list and are dropped below as all-NaN
            # columns.
            words_topics_distr = [
                hlda.get_topic_word_dist(topic_id) if hlda.is_live_topic(topic_id) else []
                for topic_id in range(hlda.k)
            ]

            # Words x topics table: index = vocabulary words, columns = topic
            # ids converted to strings.
            df_words_probs = pd.DataFrame(words_topics_distr).T.dropna(how='all', axis=1)
            df_words_probs.insert(0, 'words', hlda.vocabs)
            df_words_probs.set_index('words', inplace=True)
            df_words_probs.columns = df_words_probs.columns.astype(str)
            # Not used below; kept so it can be inspected after the run.
            topic_info = pd.DataFrame(list(map(get_topic_info, range(hlda.k)))).dropna()

            num_word = df_words_probs.shape[0]
            thresh = 1 / num_word

            # Level of every remaining topic, as reported by the model.
            topic_levels = {col: hlda.level(int(col)) for col in df_words_probs.columns}

            # ------------------------------------------------------------
            # Renyi entropy for the topics whose hlda.level() is 1
            level1_id = [col for col, lvl in topic_levels.items() if lvl == 1]
            ilevel1 = len(level1_id)
            print('topic number (level 1)= ', ilevel1)
            Renyi1 = _renyi_entropy(df_words_probs[level1_id].values, num_word, thresh)

            # ------------------------------------------------------------
            # Renyi entropy for the topics whose hlda.level() is 2. It is
            # computed from the level-2 probabilities only.
            Renyi2 = 0
            ilevel2 = 0
            if level > 2:
                level2_id = [col for col, lvl in topic_levels.items() if lvl == 2]
                ilevel2 = len(level2_id)
                print('topics num (lev 2): ', ilevel2)
                Renyi2 = _renyi_entropy(df_words_probs[level2_id].values, num_word, thresh)

            # ------------------------------------------------------------
            # Append the results of this run to the CSV file
            row = [gamma, eta, alpha, ilevel1, Renyi1, ilevel2, Renyi2,
                   hlda.perplexity, hlda.ll_per_word]
            with open(myfile, 'a') as file:
                file.write(';'.join(str(value) for value in row) + '\n')

            print('------------------------------')


stop = time.time()
print('time of execution (sec)', stop - start)
print('---------------------------')

расчет запустился
topic number (level 1)=  375
topics num (lev 2):  1015
gamma :  0.001
eta :  0.001
------------------------------
topic number (level 1)=  63
topics num (lev 2):  260
gamma :  0.001
eta :  0.01
------------------------------
topic number (level 1)=  8
topics num (lev 2):  15
gamma :  0.001
eta :  0.2
------------------------------
topic number (level 1)=  6
topics num (lev 2):  11
gamma :  0.001
eta :  0.3
------------------------------
topic number (level 1)=  4
topics num (lev 2):  6
gamma :  0.001
eta :  0.5
------------------------------
topic number (level 1)=  3
topics num (lev 2):  5
gamma :  0.001
eta :  0.7
------------------------------
topic number (level 1)=  2
topics num (lev 2):  4
gamma :  0.001
eta :  1
------------------------------
topic number (level 1)=  309
topics num (lev 2):  987
gamma :  0.001
eta :  0.001
------------------------------
topic number (level 1)=  87
topics num (lev 2):  303
gamma :  0.001
eta :  0.01
-----------------------------