In [1]:
import os.path

DATASET = 'clicr'
DATASET_PATH = '../datasets/'+DATASET+'/cases-titles.txt'

OUTPUT_PATH = 'output/'
if not os.path.exists(OUTPUT_PATH): os.mkdir(OUTPUT_PATH)
OUTPUT_PATH += DATASET + '/'
if not os.path.exists(OUTPUT_PATH): os.mkdir(OUTPUT_PATH)

with open(DATASET_PATH) as f:
    sentences = f.readlines()

## Gemsim LDA

In [2]:
k = 3
MAX_DF = 0.1

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel, LdaModel

tfidf_model = TfidfVectorizer(max_df=MAX_DF, smooth_idf=True, use_idf=True)

tfidf = tfidf_model.fit_transform(sentences)
terms = tfidf_model.get_feature_names_out()
terms_by_sentence = tfidf_model.inverse_transform(tfidf)

print("Total Words: " , len(terms))

dictionary = Dictionary(terms_by_sentence)
corpus = [dictionary.doc2bow(text) for text in terms_by_sentence]

Total Words:  12126


In [4]:
model = LdaModel(corpus, k, id2word=dictionary)

TOP_WORDS = 50

topics = []
for topic in model.show_topics(num_topics=-1, formatted=False, num_words = TOP_WORDS):
    words = []
    for i in range(TOP_WORDS):
        words.append(topic[:][:][:][:][:][1][i][0])
    topics.append(words)
    
cm = CoherenceModel(topics=topics, texts=terms_by_sentence, dictionary=dictionary, coherence='c_v', topn=TOP_WORDS)

coherence = cm.get_coherence() 
coherence_per_topic = cm.get_coherence_per_topic()

In [5]:
print('Total Coherence: '+str(coherence))  

with open(OUTPUT_PATH + 'ldaresults.txt', "a") as file:
    print('Hyper-parameters: ', file=file) 
    print('Resulting Topics: '+str(k) + '\t TfIdf Threshold: '+str(MAX_DF) + '\t Top Words: '+str(TOP_WORDS), file=file)   
    print('', file=file)
    print('Total Coherence: '+str(coherence), file=file)  
    
    for i in range(len(model.get_topics())):
        print('Topic '+str(i), file=file) 
        print('Top words: ',topics[i][:TOP_WORDS], file=file)
        print('Coherence: '+str(coherence_per_topic[i]), file=file)
        print('', file=file)
    
        print('Topic '+str(i))    
        print('Vocabulary: ', len(topics[i]))
        print('Top words: ',topics[i][:TOP_WORDS])
        print('Coherence: '+str(coherence_per_topic[i]))
        print('')
      
    print('----------------------------------------------------------------------------', file=file)  

Total Coherence: 0.26198984067217296
Topic 0
Vocabulary:  50
Top words:  ['rare', 'to', 'as', 'for', 'artery', 'case', 'presentation', 'acute', 'unusual', 'cause', 'after', 'syndrome', 'presenting', 'by', 'complication', 'patient', 'following', 'treatment', 'induced', 'secondary', 'disease', 'bilateral', 'coronary', 'anterior', 'pulmonary', 'obstruction', 'associated', 'aneurysm', 'due', 'aortic', 'severe', 'injury', 'cardiac', 'right', 'diagnosis', 'from', 'fracture', 'traumatic', 'type', 'report', 'valve', 'malignant', 'on', 'perforation', 'bowel', 'lymphoma', 'stroke', 'cell', 'dislocation', 'pancreatitis']
Coherence: 0.23658422318064676

Topic 1
Vocabulary:  50
Top words:  ['disease', 'to', 'patient', 'unusual', 'treatment', 'following', 'old', 'management', 'acute', 'year', 'is', 'for', 'diagnosis', 'as', 'syndrome', 'tumour', 'diagnostic', 'after', 'presenting', 'case', 'associated', 'child', 'not', 'mimicking', 'or', 'non', 'sinus', 'cause', 'due', 'bilateral', 'trauma', 'it', '

-----------------

## Evaluating random topics

In [6]:
import random
import time
# print(model.topics[0].words[-10:])

TOP_WORDS = 50

random_topics = []
for i in range(k):
    r = terms.copy()
    random.shuffle(r)
    random_topics.append(r[:TOP_WORDS])

for t in random_topics:
    print(t[:TOP_WORDS])

random_cm = CoherenceModel(topics=random_topics, texts=terms_by_sentence, dictionary=dictionary, coherence="c_v",topn=TOP_WORDS)

print(random_cm.get_coherence())
print(random_cm.get_coherence_per_topic())

['modalities' 'phosphaturic' 'lumbosciatic' 'congenital' 'duplication'
 'rendition' 'mycobacterium' 'nivolumab' 'trip' 'foods' 'atopic'
 'appropriate' 'transatrial' 'sigmoidorectal' 'straddling' 'trapdoor'
 'generation' 'triplet' 'astrocytic' 'leak' 'femoroacetabular'
 'intramucosal' 'beanbag' 'folate' 'minor' 'intrathyroidal' 'outlet'
 'lactam' 'hypersomnolence' 'wineskins' 'svad' 'sensorineural'
 'interventricular' 'iga' 'amelonatic' 'synuclein' 'fibroelastoma'
 'infantile' 'approaches' 'lamellar' 'breaks' 'refers' 'shunting' 'iib'
 'inspire' 'xerophthalmia' 'kiss' 'negativity' 'individualised' 'pong']
['overload' 'poly' 'uncovered' 'pucker' 'trifascicular' 'nuclei'
 'temozolomide' 'placement' 'prudent' 'alien' 'substitution'
 'perforations' 'malformations' 'atopic' '54' 'janeway' 'lymphomatoid'
 'fifty' 'convulsant' 'intensive' 'supply' 'mononucleosis' 'buttocks'
 'carcinoid' 'glaucoma' 'coils' 'preserve' 'microfistulae' 'pence'
 'lipoarabinomannan' 'microcatheter' 'osteonecrosis' '

-------------

## Printing the 10 best and worses topics in ascending sort

In [7]:
from numpy import argsort

sorted_coherences = sorted(coherence_per_topic)
args = argsort(coherence_per_topic)

j=0
for i in args:
    j+=1
    # print(i)
    # print(args[i])
    # print(coherence_per_topic[args[i]])
    if (j<10 or j>190):
        print('Tópico '+str(i))
        print('coherence: ' + str(coherence_per_topic[i]))
        print('Top Words: '+model.print_topic(args[i],topn=TOP_WORDS))

Tópico 2
coherence: 0.1892304232436122
Top Words: 0.008*"disease" + 0.008*"to" + 0.007*"patient" + 0.007*"unusual" + 0.007*"treatment" + 0.007*"following" + 0.006*"old" + 0.006*"management" + 0.006*"acute" + 0.005*"year" + 0.005*"is" + 0.005*"for" + 0.005*"diagnosis" + 0.005*"as" + 0.005*"syndrome" + 0.004*"tumour" + 0.004*"diagnostic" + 0.004*"after" + 0.004*"presenting" + 0.004*"case" + 0.004*"associated" + 0.004*"child" + 0.003*"not" + 0.003*"mimicking" + 0.003*"or" + 0.003*"non" + 0.003*"sinus" + 0.003*"cause" + 0.003*"due" + 0.003*"bilateral" + 0.003*"trauma" + 0.003*"it" + 0.003*"nerve" + 0.002*"surgical" + 0.002*"use" + 0.002*"disorder" + 0.002*"clinical" + 0.002*"sign" + 0.002*"pulmonary" + 0.002*"two" + 0.002*"boy" + 0.002*"spontaneous" + 0.002*"pregnancy" + 0.002*"unilateral" + 0.002*"important" + 0.002*"gastrointestinal" + 0.002*"resistant" + 0.002*"successful" + 0.002*"adult" + 0.002*"on"
Tópico 0
coherence: 0.23658422318064676
Top Words: 0.019*"patient" + 0.013*"syndrome" 