In [1]:
import os.path

# Name of the dataset to analyse; it selects both the input file and the
# per-dataset output sub-directory.
DATASET = 'clicr'
DATASET_PATH = '../datasets/'+DATASET+'/titles.txt'

# Results are written under output/<DATASET>/. makedirs creates the parent
# 'output/' directory as needed, and exist_ok=True makes the cell safe to
# re-run (no check-then-create race, no error when the directory exists).
OUTPUT_PATH = 'output/' + DATASET + '/'
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Each line of the file is treated as one document; readlines() keeps the
# trailing newline of every line.
with open(DATASET_PATH) as f:
    sentences = f.readlines()

## Gensim LDA

In [8]:
# --- Vectorizer settings (passed to TfidfVectorizer as max_df / min_df) ---
# Document-frequency bounds used to filter the vocabulary.
MIN_DF = 1
MAX_DF = 0.95

# --- Topic-model settings ---
# Number of topics requested from the LDA model.
N_TOPICS = 7
# Number of top words per topic used when computing coherence (topn).
TOP_WORDS = 20

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel, LdaModel

# Fit a TF-IDF vectorizer on the raw lines; the document-frequency bounds
# decide which terms make it into the vocabulary.
tfidf_model = TfidfVectorizer(min_df=MIN_DF, max_df=MAX_DF)
tfidf = tfidf_model.fit_transform(sentences)
terms = tfidf_model.get_feature_names_out()

# Map each row of the TF-IDF matrix back to the vocabulary terms present in
# that document.
terms_by_sentence = tfidf_model.inverse_transform(tfidf)

# Build the gensim dictionary and bag-of-words corpus from those term lists.
dictionary = Dictionary(terms_by_sentence)
corpus = list(map(dictionary.doc2bow, terms_by_sentence))

# Rebuild the tokenised documents from the bag-of-words ids; the counts are
# not needed here, only the tokens.
texts = [[dictionary[token_id] for token_id, _ in bow] for bow in corpus]

In [10]:
# LDA training is stochastic; fixing random_state makes the topics and the
# coherence scores below reproducible across kernel restarts.
model = LdaModel(
    corpus=corpus,
    num_topics=N_TOPICS,
    id2word=dictionary,
    random_state=42,
)

# Score the fitted model with the 'c_v' coherence measure, using the top
# TOP_WORDS words of every topic.
cm = CoherenceModel(
    model=model,
    texts=texts,
    corpus=corpus,
    dictionary=dictionary,
    coherence='c_v',
    topn=TOP_WORDS,
)
coherence = cm.get_coherence()  # overall coherence value for the model
coherence_per_topic = cm.get_coherence_per_topic()  # indexed by topic id

In [11]:
print('Coerência total: '+str(coherence))

# Append mode: every run adds a new report to the same file, so results of
# successive experiments accumulate instead of being overwritten.
with open(OUTPUT_PATH + 'ldaresults.txt', "a") as file:
    print('Hyper-parameters: ', file=file)
    print('Resulting Topics: '+str(N_TOPICS) + '\t TfIdf Threshold: '+str(MAX_DF) + '\t Top Words: '+str(TOP_WORDS), file=file)
    print('', file=file)

    for i in range(len(model.get_topics())):
        # Build the per-topic report once and emit it to both the results
        # file and the notebook output. The number of words shown follows
        # TOP_WORDS so it always matches the header above and the topn used
        # for the coherence computation (it was previously hard-coded to 20).
        report_lines = [
            'Topic '+str(i),
            'Top Words: '+model.print_topic(i, topn=TOP_WORDS),
            'Coherence: '+str(coherence_per_topic[i]),
            '',
        ]
        for line in report_lines:
            print(line, file=file)
            print(line)

    print('Total Coherence: '+str(coherence), file=file)
    print('----------------------------------------------------------------------------', file=file)

Coerência total: 0.24707072645089134
Topic 0
Top Words: 0.025*"in" + 0.024*"of" + 0.023*"and" + 0.021*"with" + 0.013*"cancer" + 0.010*"diagnosis" + 0.010*"the" + 0.009*"syndrome" + 0.008*"for" + 0.008*"an" + 0.008*"disease" + 0.007*"patient" + 0.007*"breast" + 0.007*"after" + 0.006*"case" + 0.005*"recurrent" + 0.005*"positive" + 0.005*"presenting" + 0.005*"delayed" + 0.005*"valve"
Coherence: 0.24535025168705923

Topic 1
Top Words: 0.045*"in" + 0.042*"with" + 0.029*"patient" + 0.027*"and" + 0.021*"of" + 0.013*"an" + 0.012*"as" + 0.010*"pulmonary" + 0.009*"syndrome" + 0.009*"to" + 0.009*"presenting" + 0.008*"acute" + 0.008*"cell" + 0.008*"carcinoma" + 0.006*"after" + 0.006*"for" + 0.006*"treatment" + 0.006*"spinal" + 0.005*"cause" + 0.005*"rare"
Coherence: 0.2402498703554657

Topic 2
Top Words: 0.039*"of" + 0.023*"syndrome" + 0.023*"the" + 0.022*"in" + 0.020*"with" + 0.015*"and" + 0.010*"an" + 0.009*"as" + 0.009*"acute" + 0.009*"case" + 0.008*"to" + 0.008*"for" + 0.008*"two" + 0.006*"ven