In [3]:
import sys
import LLDA as llda

import spacy
import pytextrank

import os
import json

In [4]:
dataset_dir = "./Data Collection/dataset"

In [5]:
def get_leafs(term_tree):
    terms = set()
    queue = [term_tree]

    while(queue):
        curr_dict = queue.pop()
        for key in curr_dict:
            if(curr_dict[key]):
                queue.append(curr_dict[key])
            else:
                terms.add(key.lower())
    return terms

labeled_documents = []

for doi in os.listdir(dataset_dir):
    text_path = os.path.join(dataset_dir, doi, f"{doi}.txt")
    if(not os.path.exists(text_path)):
        continue
    text = open(text_path, encoding="utf-8", errors="ignore").read()
    
    terms_path = os.path.join(dataset_dir, doi, f"{doi}.json")
    if(not os.path.exists(terms_path)):
        continue
    
    gt_term_tree = json.load(open(terms_path))
    gt_terms = get_leafs(gt_term_tree)
    
    labeled_documents.append((text, list(gt_terms)))

In [None]:
summarized_documents = []


In [10]:

# load a spaCy model, depending on language, scale, etc.
nlp = spacy.load("en_core_web_sm")
# add PyTextRank to the spaCy pipeline
nlp.add_pipe("topicrank")

for text, terms in labeled_documents[202:]:
    doc = nlp(text)
    
    tr = doc._.textrank

    # Print summary of article
    summary = " ".join([str(line) for line in tr.summary()])
    
    summarized_documents.append((summary, terms))
    

In [17]:
N = len(summarized_documents)
training = summarized_documents[:int(N*0.7)]

test = summarized_documents[int(N*0.7):]

In [22]:
# new a Labeled LDA model
# llda_model = llda.LldaModel(labeled_documents=training, alpha_vector="50_div_K", eta_vector=0.001)
# llda_model = llda.LldaModel(labeled_documents=training, alpha_vector=0.02, eta_vector=0.002)
llda_model = llda.LldaModel(labeled_documents=training, alpha_vector=0.01)

# training
# llda_model.training(iteration=10, log=True)
while True:
    print("iteration %s sampling..." % (llda_model.iteration + 1))
    llda_model.training(1)
    print("delta beta: %s" % llda_model.delta_beta)
    if llda_model.iteration == 400:
        break
    if llda_model.is_convergent(method="beta", delta=0.01):
        break

iteration 1 sampling...
gibbs sample count:  28072
delta beta: 224.216060419976
iteration 2 sampling...
gibbs sample count:  28072
delta beta: 143.2637424663921
iteration 3 sampling...
gibbs sample count:  28072
delta beta: 130.46703830819138
iteration 4 sampling...
gibbs sample count:  28072
delta beta: 124.64138270453839
iteration 5 sampling...
gibbs sample count:  28072
delta beta: 120.49203957465848
iteration 6 sampling...
gibbs sample count:  28072
delta beta: 119.02042666887128
iteration 7 sampling...
gibbs sample count:  28072
delta beta: 118.7701054428324
iteration 8 sampling...
gibbs sample count:  28072
delta beta: 115.47599110919231
iteration 9 sampling...
gibbs sample count:  28072
delta beta: 114.92105010176043
iteration 10 sampling...
gibbs sample count:  28072
delta beta: 117.39705320136383
iteration 11 sampling...
gibbs sample count:  28072
delta beta: 113.6471852727032
iteration 12 sampling...
gibbs sample count:  28072
delta beta: 112.61244140150752
iteration 13 sampl

In [23]:
# Save model
# save to disk
save_model_dir = "./saved_model"
llda_model.save_model_to_dir(save_model_dir)

'charmap' codec can't encode characters in position 6-7: character maps to <undefined>: Write [{'alpha_ve...] to file [./saved_model\llda_model.json] error: json.dump error


In [24]:
def iou(pred, exp):
    # print(len(exp), len(pred))
    # print(pred, exp)
    return len(pred & exp) / len(pred | exp) * 100

def accuracy(pred, exp):
    return len(pred & exp) / len(exp) * 100

accuracies = []
ious = []
preds = []
for document, gt_terms in test:
    topics = llda_model.inference(document=document, iteration=100, times=10)
    terms = {term for term,conf in topics}
    
    gt_terms = set(gt_terms)
    
    acc = accuracy(terms, gt_terms)
    iou_score = iou(terms, gt_terms)
    
    print("Accuracy:", acc)
    print("IOU:", iou_score)
    
    accuracies.append(acc)
    ious.append(iou_score)
    preds.append(terms)

Accuracy: 27.27272727272727
IOU: 1.25
Accuracy: 100.0
IOU: 0.8620689655172413
Accuracy: 50.0
IOU: 0.4291845493562232
Accuracy: 100.0
IOU: 1.293103448275862
Accuracy: 100.0
IOU: 1.293103448275862
Accuracy: 33.33333333333333
IOU: 0.4273504273504274
Accuracy: 66.66666666666666
IOU: 0.8583690987124464
Accuracy: 100.0
IOU: 1.293103448275862
Accuracy: 60.0
IOU: 1.282051282051282
Accuracy: 100.0
IOU: 0.43103448275862066
Accuracy: 50.0
IOU: 0.8547008547008548
Accuracy: 100.0
IOU: 1.7241379310344827
Accuracy: 100.0
IOU: 0.43103448275862066
Accuracy: 100.0
IOU: 0.43103448275862066
Accuracy: 100.0
IOU: 0.43103448275862066
Accuracy: 100.0
IOU: 0.43103448275862066
Accuracy: 100.0
IOU: 0.43103448275862066
Accuracy: 100.0
IOU: 0.43103448275862066
Accuracy: 100.0
IOU: 0.8620689655172413
Accuracy: 100.0
IOU: 0.43103448275862066
Accuracy: 0.0
IOU: 0.0
Accuracy: 100.0
IOU: 0.43103448275862066
Accuracy: 66.66666666666666
IOU: 2.553191489361702
Accuracy: 66.66666666666666
IOU: 0.8583690987124464
Accuracy: 

In [25]:
total_acc = sum(accuracies)/len(accuracies)
total_iou = sum(ious)/len(ious)

print("Total Accuracy:", total_acc)
print("Total IOU:", total_iou)

Total Accuracy: 78.46875
Total IOU: 0.9661216196599403
