# Code for numerical experiments with hPAM model

## Import of necessary packages

In [3]:
import tomotopy as to
import pandas as pd
import numpy as np

## Loading data

In [6]:
# Read the lemmatized corpus (for example, the Amazon dataset) from a CSV file.
# The file is read without a header row and the text column is named 'docs'.
df = pd.read_csv(
    'Amazon_lem.csv',
    names=['docs'],
    header=None,
)

## Extracting vocabulary

In [5]:
# Build the vocabulary: every distinct whitespace-separated token of the corpus.
# str.split() without an argument is used (instead of split(' ')) so that the
# vocabulary is tokenized the same way as the documents passed to the model;
# it also never produces empty strings, so no extra filtering is needed.
# Sorting makes the word order reproducible (set iteration order is not).
uniq_words = sorted(set(df.docs.str.cat(sep=' ').split()))
uniq_words_len = len(uniq_words)
print('the number of unique words is', uniq_words_len)

the number of unique words 31486


## Transformation of documents

In [25]:
# Tokenize every non-missing document into a list of its whitespace-separated words
words_in_docs = [doc.split() for doc in df.docs.dropna().values]

## Setting hPAM model parameters

In [26]:
# Parameter grid for the experiment loop

# Numbers of super topics (second hierarchical level):
# the even values 2..60 plus the single odd value 33
level1_topics = [*range(2, 33, 2), 33, *range(34, 61, 2)]

# Numbers of subtopics (third hierarchical level): the same grid, kept as a separate list
level2_topics = [*range(2, 33, 2), 33, *range(34, 61, 2)]

# Values of parameter eta
eta_level = [
    0.001, 0.01, 0.1,
    0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8,
    1,
]

# Initial value of parameter alpha
alpha = 1e-4

# Name of the file for recording the results
myfile = 'HPAM_newRenyi_WoS_lem_run1.csv'

## Training hPAM models in a loop for selected parameters and calculating Renyi entropy, log-likelihood, and perplexity

In [1]:
# Train hPAM models over the parameter grid and calculate Renyi entropy for each level of the model

import time
# wall-clock start of the experiment; the elapsed time is reported after the loops finish
start = time.time()

def process_words_probs(x):
    """Split a sequence of (word, probability) pairs into two parallel lists.

    Returns a tuple ``(words, probs)``: ``words`` holds the first element of
    every pair and ``probs`` the second, both in input order.
    """
    words = [pair[0] for pair in x]
    probs = [pair[1] for pair in x]
    return words, probs

# Append the header row (semicolon-separated column names) to the results file
with open(myfile, 'a') as file:
    file.write(';'.join([
        'eta',
        'alpha',
        'Num topics (lev 1)',
        'WRD (lev 1)',
        'prob_sum (lev 1)',
        'Renyi(lev 1)',
        'Num topics (lev 2)',
        'WRD (lev 2)',
        'prob_sum (lev 2)',
        'Renyi(lev 2)',
        'perplexity',
        'Log-likelihood',
    ]) + '\n')

def _renyi_level_stats(level_probs, num_topics, thresh):
    """Compute the Renyi-entropy statistics of one hierarchical level.

    Parameters:
        level_probs: 2-D numpy array with one row per vocabulary word and one
            column per topic of the level (word probabilities).
        num_topics: number of topics on this level.
        thresh: probability threshold; only words whose maximum probability
            over the level's topics exceeds it are counted.

    Returns:
        A tuple ``(word_count, prob_sum, renyi)``: the number of words above
        the threshold, the sum of their maximum probabilities, and the Renyi
        entropy of the level.
    """
    num_word = level_probs.shape[0]
    # maximum probability of each word over the topics of this level
    max_probs = level_probs.max(axis=1)
    # keep only the probabilities larger than the threshold
    selected = max_probs[max_probs > thresh]
    word_count = len(selected)
    prob_sum = selected.sum()
    # Shannon entropy
    shannon = np.log(word_count / (num_word * num_topics))
    # Internal energy
    energy = -np.log(prob_sum / num_topics)
    # Free energy
    free_energy = energy - shannon * num_topics
    # Renyi entropy; with a single topic the denominator (num_topics - 1)
    # would be zero, so num_topics is used instead
    if num_topics == 1:
        renyi = free_energy / num_topics
    else:
        renyi = free_energy / (num_topics - 1)
    return word_count, prob_sum, renyi


for eta in eta_level:
    # loop over the number of super topics (second hierarchical level)
    for ilevel1 in level1_topics:
        # loop over the number of subtopics (third hierarchical level)
        for ilevel2 in level2_topics:
            # Model initialization
            hpam = to.HPAModel(k1=ilevel1, k2=ilevel2, alpha=alpha, eta=eta)
            for doc_words in words_in_docs:
                hpam.add_doc(doc_words)

            # train the model with a fixed number of iterations
            hpam.train(iter=100, workers=10)

            # topic ids run from 0 to ilevel1 + ilevel2
            num_all_topics = 1 + ilevel1 + ilevel2

            # list of (word, probability) lists, one per topic id
            words_topics_distr = [
                hpam.get_topic_words(topic_id, uniq_words_len)
                for topic_id in range(num_all_topics)
            ]

            # word-by-topic probability table: rows are vocabulary words,
            # columns are topic ids; words not returned for a topic stay at 0
            df_words_probs = pd.DataFrame(
                np.zeros((uniq_words_len, num_all_topics)),
                index=pd.Index(uniq_words, name='words'),
            )
            for i, words_topics_col in enumerate(words_topics_distr):
                words, probs = process_words_probs(words_topics_col)
                df_words_probs.loc[words, i] = probs

            prob_matrix = df_words_probs.values
            num_word = prob_matrix.shape[0]
            # threshold: one over the vocabulary size
            thresh = 1 / num_word

            # Column 0 is used by neither level (presumably the root topic,
            # see the tomotopy HPAModel documentation); columns 1..ilevel1
            # are the super topics and the remaining columns the subtopics.

            # Renyi entropy for the second hierarchical level (super topics)
            word_ratio1, sum_prob1, Renyi1 = _renyi_level_stats(
                prob_matrix[:, 1:ilevel1 + 1], ilevel1, thresh)

            # Renyi entropy for the third hierarchical level (subtopics)
            word_ratio2, sum_prob2, Renyi2 = _renyi_level_stats(
                prob_matrix[:, ilevel1 + 1:], ilevel2, thresh)

            print('eta: ', eta, ' ','topics level2: ', ilevel2, ' ', word_ratio2, ' ', sum_prob2, ' ', Renyi2)

            # calculate \rho and \tilde{P} for level 2
            wrd1_lev1 = word_ratio1 / (num_word * ilevel1)
            sum_prob1_lev1 = sum_prob1 / ilevel1

            # calculate \rho and \tilde{P} for level 3
            wrd1_lev2 = word_ratio2 / (num_word * ilevel2)
            sum_prob2_lev2 = sum_prob2 / ilevel2

            # append one result row per trained model; the file is reopened on
            # every iteration, so finished rows are on disk even if a later
            # model fails
            row = [
                eta, alpha,
                ilevel1, wrd1_lev1, sum_prob1_lev1, Renyi1,
                ilevel2, wrd1_lev2, sum_prob2_lev2, Renyi2,
                hpam.perplexity, hpam.ll_per_word,
            ]
            with open(myfile, 'a') as file:
                file.write(';'.join(map(str, row)) + '\n')
                                                                                                                       
        
        
# Report the total wall-clock time of the experiment
stop = time.time()
print(f'time of execution (sec) {stop - start}')
print('---------------------------')

NameError: name 'myfile' is not defined