In [None]:
!pip install topmine

In [None]:
!python -m spacy download en_core_web_sm

In [7]:
from topmine.phrase_mining import PhraseMining
from topmine.phrase_lda import PhraseLDA
from pathlib import Path
import time
import json
import spacy

### Data preprocessing

In [None]:
# Load the corpus: every file found in each sub-folder of `in_path` becomes one
# document. `doc_list[i]` holds the text and `doc_names[i]` the matching name.
in_path = 'Data/Stems'
doc_list = []
doc_names = []

# iterdir() yields entries in arbitrary order, so sort at both levels to make
# the document order (and therefore every downstream index) reproducible.
for folder in sorted(Path(in_path).iterdir()):
    # Iterate the folder entry itself instead of rebuilding its path from
    # `folder.stem`, which would drop everything after a dot in the folder name.
    if not folder.is_dir():
        continue  # stray files next to the sub-folders are not document folders

    for item in sorted(folder.iterdir()):
        if not item.is_file():
            continue
        # Strips the last four characters of the file name
        # (presumably a ".txt"-style extension -- TODO confirm).
        doc_names.append(item.name[:-4])
        with open(item, 'r', errors="ignore") as fin:
            # NOTE(review): only the first line of each file is read; this
            # assumes each file stores its whole document on one line -- confirm.
            doc_list.append(fin.readline())

### PhraseMining

In [None]:
# Build the phrase miner over the loaded documents (phrases of up to 4 tokens).
a = PhraseMining(doc_list, max_phrase_size=4)

# Time the mining step; it produces the partitioned documents and the
# index -> vocabulary lookup consumed by the cells below.
start = time.time()
partitioned_docs, index_vocab = a.mine()
stop = time.time()

elapsed = stop - start
print(elapsed)

### PhraseLDA

In [None]:
# Configure PhraseLDA on the mined phrases.
model = PhraseLDA(
    partitioned_docs=partitioned_docs,
    index_vocab=index_vocab,
    num_topics=20,
    iterations=100,
    optimization_iterations=10,
)

# Time the model run; `lda` keeps whatever `run()` returns for the cells below.
start = time.time()
lda = model.run()
stop = time.time()

elapsed = stop - start
print(elapsed)

### Getting a list of topic phrases

#### Getting document topics

In [11]:
# First element of the PhraseLDA run result. It is indexed further down as
# doc_phrase_topics[doc_id][phrase_id] and compared against a topic id, so it
# presumably holds the topic assigned to every phrase of every document.
doc_phrase_topics = lda[0]

In [12]:
# Dominant topic per document: the position of the largest entry in that
# document's row of `n_d_t_phrases` (the first such position when tied).
# NOTE(review): the rows are presumably per-document topic counts -- confirm.
per_doc_topic_counts = vars(model)['n_d_t_phrases']
topics = [counts.index(max(counts)) for counts in per_doc_topic_counts]

#### Getting the final results

In [None]:
# For every document, collect the lemmas of its single-word phrases whose
# assigned topic equals the document's dominant topic, sorted alphabetically.
doc_phrases = []
nlp = spacy.load('en_core_web_sm')

# Running the spaCy pipeline is by far the most expensive step here, and the
# same vocabulary word recurs across many documents, so lemmatise each distinct
# word only once and reuse the result.
lemma_cache = {}

for doc_id in range(len(doc_list)):
    dominant_topic = topics[doc_id]
    lemmas = []
    for phrase_id in range(len(doc_phrase_topics[doc_id])):
        phrase_tokens = partitioned_docs[doc_id][phrase_id]
        # Keep only one-word phrases that were assigned the dominant topic.
        if len(phrase_tokens) != 1:
            continue
        if doc_phrase_topics[doc_id][phrase_id] != dominant_topic:
            continue
        word = index_vocab[phrase_tokens[0]]
        if word not in lemma_cache:
            # Lemma of the first spaCy token of the vocabulary entry.
            lemma_cache[word] = nlp(word)[0].lemma_
        lemmas.append(lemma_cache[word])
    lemmas.sort()
    doc_phrases.append(lemmas)

### Calculating metrics

In [33]:
# Reference answers used by the metrics cell; the JSON is looked up there by
# document-name prefix. `answers` stays None if opening or parsing fails.
answers = None

with open("Data/Answers.json", 'r') as answers_file:
    answers = json.loads(answers_file.read())

In [None]:
# Macro-averaged precision / recall over all documents, plus the F1 score of
# those two averages. Extracted phrases (`doc_phrases[i]`) are compared with the
# reference keywords stored in `answers` under the first 8 characters of the
# document name.
N = len(doc_phrases)

accuracy = 0.0  # never computed in this cell; kept so existing references still resolve
precision = 0.0
recall = 0.0

for i in range(N):
    doc_name = doc_names[i][:8]
    # Compare against the extracted phrases, not the document *name*: the
    # original tested `word in doc_names[i]` (a substring check on the file
    # name) and derived FP from the length of that name.
    # Sets are used so a phrase extracted several times counts only once.
    predicted = set(doc_phrases[i])
    expected = set(answers[doc_name])

    TP = len(predicted & expected)
    FP = len(predicted) - TP
    FN = len(expected) - TP

    # A document with no predictions (or no reference keywords) contributes 0
    # instead of raising ZeroDivisionError.
    if TP + FP:
        precision += TP / (TP + FP)
    if TP + FN:
        recall += TP / (TP + FN)

if N:
    precision /= N
    recall /= N

# F1 is defined as 0 when both precision and recall are 0.
if precision + recall:
    F1_score = 2 * precision * recall / (precision + recall)
else:
    F1_score = 0.0