In [1]:
import sys
sys.path.append('../')
import LLDA as llda

import os
import json

In [2]:
def get_leafs(term_tree):
    """Return the lower-cased keys of all leaf entries in a nested term tree.

    The tree is walked iteratively with an explicit stack. For every key of
    every visited mapping:

    * a truthy value is pushed onto the stack and visited later as a subtree;
    * a falsy value (for example an empty dict or ``None``) marks the key as
      a leaf, and its lower-cased form is added to the result.

    Args:
        term_tree: the root mapping of the tree.

    Returns:
        set: lower-cased leaf keys; duplicates across branches collapse.
    """
    leaf_terms = set()
    pending = [term_tree]

    while pending:
        node = pending.pop()
        for name in node:
            subtree = node[name]
            if not subtree:
                # Nothing nested below this key, so it is a leaf term.
                leaf_terms.add(name.lower())
                continue
            pending.append(subtree)
    return leaf_terms

dataset_dir = "./dataset"

# Each entry is a (document text, list of lower-cased leaf terms) pair.
labeled_documents = []

# os.listdir() returns entries in arbitrary order. Sorting keeps the document
# order stable between runs, which matters because later cells select
# documents by position (labeled_documents[0] / labeled_documents[1:]).
for doi in sorted(os.listdir(dataset_dir)):
    text_path = os.path.join(dataset_dir, doi, f"{doi}.txt")
    terms_path = os.path.join(dataset_dir, doi, f"{doi}.json")

    # A document is only usable when both its text and its term tree exist;
    # check both before reading anything.
    if not (os.path.exists(text_path) and os.path.exists(terms_path)):
        continue

    # Context managers close the file handles as soon as the content is read
    # instead of leaving them open until garbage collection.
    with open(text_path, encoding="utf-8", errors="ignore") as text_file:
        text = text_file.read()
    with open(terms_path) as terms_file:
        gt_term_tree = json.load(terms_file)

    gt_terms = get_leafs(gt_term_tree)

    # Set iteration order for strings can differ between interpreter runs
    # (hash randomisation), so store the labels in sorted order.
    labeled_documents.append((text, sorted(gt_terms)))

In [3]:
# Build a new Labeled LDA model from every document except the first one.
# Hyper-parameter combinations that were tried earlier:
#   alpha_vector="50_div_K", eta_vector=0.001
#   alpha_vector=0.02,       eta_vector=0.002
training_documents = labeled_documents[1:]
llda_model = llda.LldaModel(labeled_documents=training_documents, alpha_vector=0.01)
print(llda_model)

# Training: run one iteration per call so perplexity and delta beta can be
# reported after each one (a single-call alternative would be
# llda_model.training(iteration=10, log=True)).
# Stop once the iteration counter reaches max_iterations or the model reports
# convergence on beta, whichever happens first.
max_iterations = 20
beta_delta = 0.01
converged = False
while not converged:
    print("iteration %s sampling..." % (llda_model.iteration + 1))
    llda_model.training(1)
    print("after iteration: %s, perplexity: %s" % (llda_model.iteration, llda_model.perplexity()))
    print("delta beta: %s" % llda_model.delta_beta)
    if llda_model.iteration == max_iterations:
        break
    converged = llda_model.is_convergent(method="beta", delta=beta_delta)


Labeled-LDA Model:
	K = 143
	M = 137
	T = 106523
	WN = 1091712
	LN = 358
	alpha = 0.01
	eta = 0.001
	perplexity = 1925.7680167071444
	
iteration 1 sampling...
gibbs sample count:  1091712
after iteration: 1, perplexity: 1740.6882151379436
delta beta: 102.83698105452528
iteration 2 sampling...
gibbs sample count:  1091712
after iteration: 2, perplexity: 1709.899807776168
delta beta: 54.67326460224616
iteration 3 sampling...
gibbs sample count:  1091712
after iteration: 3, perplexity: 1698.980268614891
delta beta: 45.86089945716773
iteration 4 sampling...
gibbs sample count:  1091712
after iteration: 4, perplexity: 1695.2869054529367
delta beta: 42.52762792704342
iteration 5 sampling...
gibbs sample count:  1091712
after iteration: 5, perplexity: 1693.563905203378
delta beta: 41.027677151263305
iteration 6 sampling...
gibbs sample count:  1091712
after iteration: 6, perplexity: 1692.5695891442679
delta beta: 39.27094403700969
iteration 7 sampling...
gibbs sample count:  1091712
after it

In [5]:
# The model above was built from labeled_documents[1:], so the first entry
# was not part of the training data; use its text as the inference input.
held_out_example = labeled_documents[0]
document = held_out_example[0]

# NOTE(review): `iteration` and `times` are passed straight through to
# LldaModel.inference; presumably sampling iterations and number of repeated
# samples -- confirm against the LLDA module.
topics = llda_model.inference(
    document=document,
    iteration=100,
    times=10,
)

In [6]:
# Show the inferred topics followed by the ground-truth terms stored for the
# same document, so the two can be compared by eye.
ground_truth_terms = labeled_documents[0][1]
print(topics)
print(ground_truth_terms)

[('common_topic', 0.581244299599682), ('touch screens', 0.09407234690489696), ('logic', 0.05190369519506322), ('ethnographic studies', 0.043698934907629515), ('cultural characteristics', 0.04263040798647536), ('computing / technology policy', 0.04163820441683221), ('project management techniques', 0.03576130635048434), ('interaction design', 0.026449857466140963), ('religious orientation', 0.017901642096907696), ('hci theory, concepts and models', 0.016680468472731518), ('personal computers and pc applications', 0.01603171998488792), ('empirical studies in hci', 0.014085474521357132), ('sound and music computing', 0.006224169315722972), ('seniors', 0.004010792121903643), ('object oriented languages', 0.0027896184977274628), ('social network analysis', 0.0014157981705292593), ('real-time systems', 0.0010723430887297086), ('agile software development', 0.0004617562766416183), ('information systems applications', 0.0004235946008861127), ('geographic characteristics', 0.0003472712493751013