In [1]:
import os.path

DATASET = 'clicr'
DATASET_PATH = '../datasets/'+DATASET+'/cases-titles.txt'

OUTPUT_PATH = 'output/'
if not os.path.exists(OUTPUT_PATH): os.mkdir(OUTPUT_PATH)
OUTPUT_PATH += DATASET + '/'
if not os.path.exists(OUTPUT_PATH): os.mkdir(OUTPUT_PATH)

with open(DATASET_PATH) as f:
    sentences = f.readlines()

## Gemsim LDA

In [2]:
N_TOPICS = 7
TOP_WORDS = 10
MAX_DF = 0.99

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel, LdaModel

tfidf_model = TfidfVectorizer(max_df=MAX_DF)

tfidf = tfidf_model.fit_transform(sentences)
terms = tfidf_model.get_feature_names_out()
terms_by_sentence = tfidf_model.inverse_transform(tfidf)

dictionary = Dictionary(terms_by_sentence)
corpus = [dictionary.doc2bow(text) for text in terms_by_sentence]
texts = [[dictionary[word_id] for word_id, freq in doc] for doc in corpus]

In [4]:
model = LdaModel(corpus, N_TOPICS, dictionary)

cm = CoherenceModel(model=model, texts=texts, corpus=corpus, dictionary=dictionary, coherence='c_v', topn=TOP_WORDS)
# cm = CoherenceModel(topics=[topic for topic in model.show_topics()], texts=texts, corpus=corpus, dictionary=dictionary, coherence='c_v', topn=TOP_WORDS)

coherence = cm.get_coherence()  # get coherence value
coherence_per_topic = cm.get_coherence_per_topic()

In [5]:
print('Coerência total: '+str(coherence))  
with open(OUTPUT_PATH + 'ldaresults.txt', "a") as file:
    print('Hyper-parameters: ', file=file) 
    print('Resulting Topics: '+str(N_TOPICS) + '\t TfIdf Threshold: '+str(MAX_DF) + '\t Top Words: '+str(TOP_WORDS), file=file)   
    print('', file=file)

    for i in range(len(model.get_topics())):
        print('Topic '+str(i), file=file)    
        print('Top Words: '+model.print_topic(i,topn=TOP_WORDS), file=file)
        print('Coherence: '+str(coherence_per_topic[i]), file=file)
        print('', file=file)
    
        print('Topic '+str(i))    
        print('Top Words: '+model.print_topic(i,topn=TOP_WORDS))
        print('Coherence: '+str(coherence_per_topic[i]))
        print('')
      
    print('Total Coherence: '+str(coherence), file=file)  
    print('----------------------------------------------------------------------------', file=file)  

Coerência total: 0.3941785698230483
Topic 0
Top Words: 0.035*"of" + 0.020*"in" + 0.020*"with" + 0.019*"and" + 0.018*"to" + 0.016*"as" + 0.015*"acute" + 0.014*"the" + 0.013*"presenting" + 0.012*"an"
Coherence: 0.3483961073440106

Topic 1
Top Words: 0.032*"of" + 0.017*"the" + 0.015*"after" + 0.013*"and" + 0.013*"following" + 0.013*"for" + 0.013*"injury" + 0.011*"with" + 0.009*"in" + 0.009*"acute"
Coherence: 0.3498337459605543

Topic 2
Top Words: 0.051*"of" + 0.033*"and" + 0.032*"in" + 0.026*"the" + 0.016*"to" + 0.014*"with" + 0.012*"case" + 0.008*"for" + 0.008*"treatment" + 0.008*"review"
Coherence: 0.43258555819863664

Topic 3
Top Words: 0.056*"of" + 0.040*"in" + 0.032*"an" + 0.026*"the" + 0.024*"with" + 0.019*"and" + 0.016*"unusual" + 0.014*"patient" + 0.011*"case" + 0.010*"cause"
Coherence: 0.4103051129050274

Topic 4
Top Words: 0.038*"in" + 0.034*"of" + 0.028*"with" + 0.019*"old" + 0.018*"an" + 0.017*"year" + 0.015*"and" + 0.012*"the" + 0.012*"cell" + 0.012*"carcinoma"
Coherence: 0.4

## Printar os 10 melhores e os 10 piores tópicos

In [6]:
from numpy import argsort

sorted_coherences = sorted(coherence_per_topic)
args = argsort(coherence_per_topic)

j=0
for i in args:
    j+=1
    # print(i)
    # print(args[i])
    # print(coherence_per_topic[args[i]])
    if (j<10 or j>190):
        print('Tópico '+str(i))
        print('coherence: ' + str(coherence_per_topic[i]))
        print('Top Words: '+model.print_topic(args[i],topn=20))

Tópico 0
coherence: 0.3483961073440106
Top Words: 0.035*"of" + 0.020*"in" + 0.020*"with" + 0.019*"and" + 0.018*"to" + 0.016*"as" + 0.015*"acute" + 0.014*"the" + 0.013*"presenting" + 0.012*"an" + 0.011*"rare" + 0.008*"patient" + 0.007*"cause" + 0.007*"secondary" + 0.007*"small" + 0.007*"thrombosis" + 0.006*"by" + 0.006*"artery" + 0.006*"due" + 0.005*"induced"
Tópico 1
coherence: 0.3498337459605543
Top Words: 0.032*"of" + 0.017*"the" + 0.015*"after" + 0.013*"and" + 0.013*"following" + 0.013*"for" + 0.013*"injury" + 0.011*"with" + 0.009*"in" + 0.009*"acute" + 0.008*"by" + 0.008*"an" + 0.007*"pancreatitis" + 0.006*"induced" + 0.006*"traumatic" + 0.006*"abdominal" + 0.005*"haematoma" + 0.005*"replacement" + 0.005*"syndrome" + 0.005*"high"
Tópico 6
coherence: 0.38107575818146494
Top Words: 0.038*"in" + 0.034*"of" + 0.028*"with" + 0.019*"old" + 0.018*"an" + 0.017*"year" + 0.015*"and" + 0.012*"the" + 0.012*"cell" + 0.012*"carcinoma" + 0.009*"to" + 0.008*"for" + 0.007*"child" + 0.007*"as" + 0.0