In [None]:
!pip install wordcloud
!pip install gensim
!pip install -U sentence-transformers

In [2]:
class Topic:
    """A single cluster of sentences together with its term-level description."""

    def __init__(self, cluster):
        """Create an empty topic bound to a cluster label.

        Args:
            cluster: Identifier of the cluster this topic stands for.
        """
        self.cluster = cluster

        # Scalar state.
        self.coherence = 0.0          # coherence score assigned to this topic
        self.sentences_dump = str()   # all member sentences concatenated into one string

        # Containers; each attribute gets its own fresh list.
        self.sentences = list()       # sentences assigned to this cluster
        self.words = list()           # terms associated with this topic
        self.texts = list()           # not populated by the visible notebook cells
        self.sorted_terms = list()    # vocabulary terms ordered by this topic's weights
        
class Collection:
    """Container for the whole corpus and the topics derived from it."""

    def __init__(self):
        """Create an empty collection; every field is filled in by later cells."""
        self.sentences = []             # input sentences, one entry per document
        self.words = []                 # vocabulary terms considered across all topics
        self.topics = []                # Topic instances, one per cluster
        self.topics_coherence = 0.0     # aggregate coherence over all topics
        # Declared here so the attribute exists before the validation step
        # assigns it; reading it early no longer raises AttributeError.
        self.coherence_per_topic = []   # coherence score of each topic, in topic order


# Step 1: Sentences Modeling

In [3]:
# Name of the sentence-embedding model. It is appended to the local models
# directory to load the encoder and is also used as the last component of the
# output directory.
MODEL = 'biobert-nli'

In [4]:
import os.path
import pandas as pd
from sentence_transformers import SentenceTransformer

DATASET = 'clicr'
DATASET_PATH = '../datasets/'+DATASET+'/cases-titles.txt'

# Read one sentence per line. Trailing newlines are stripped and blank lines
# are skipped so that neither ends up in the embeddings or in the per-topic text.
# NOTE(review): assumes the dataset file is UTF-8 encoded — adjust if it is not.
collection = Collection()
with open(DATASET_PATH, encoding='utf-8') as f:
    collection.sentences = [line.strip() for line in f if line.strip()]

# Results go to output/<dataset>/<model>/. makedirs creates every missing
# intermediate directory and is a no-op when the directory already exists.
OUTPUT_PATH = 'output/' + DATASET + '/' + MODEL + '/'
os.makedirs(OUTPUT_PATH, exist_ok=True)

In [5]:
# Alternative kept for reference (disabled): build the encoder directly from
# the MODEL identifier instead of a local directory, i.e.
# SentenceTransformer(MODEL), and encode the same sentences.

# Load the encoder from the local models directory and embed every sentence
# of the collection.
MODEL_PATH = f'../../models/{MODEL}'
model = SentenceTransformer(MODEL_PATH)
embeddings = model.encode(collection.sentences)

# Step 2: Sentences Aggregation

In [6]:
# Distance threshold passed to AgglomerativeClustering; together with
# n_clusters=None it determines how many clusters result.
# The trailing list is presumably the set of values tried with the
# biobert-nli model — TODO confirm.
DISTANCE_THRESHOLD =  200  # 300 200 190 150 130 125 120 105 103 44.3  31.2 25.8 biobert-nli

In [7]:
from sklearn.cluster import AgglomerativeClustering

# Ward-linkage agglomerative clustering over the sentence embeddings. The
# number of clusters is not fixed up front (n_clusters=None); it is read back
# from the fitted model afterwards.
clustering_model = AgglomerativeClustering(
    n_clusters=None,
    distance_threshold=DISTANCE_THRESHOLD,
    linkage='ward',
).fit(embeddings)

n_clusters = clustering_model.n_clusters_

# Report the setting next to the outcome so runs are easy to compare.
print('Distance Threshold: ', DISTANCE_THRESHOLD)
print('Resulting Clusters:',  n_clusters)

Distance Threshold:  200
Resulting Clusters: 3


# Step 3: Representing Topics

In [8]:
# Passed as max_df to TfidfVectorizer. The vectorizer is fitted on one
# concatenated text per topic, so the "documents" it counts are topics.
MAX_DF = 0.99

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd

# One empty Topic per cluster, indexed by cluster label.
collection.topics = [Topic(cluster_id) for cluster_id in range(n_clusters)]

# Route every sentence to the topic of the cluster it was assigned to, and
# grow that topic's single concatenated text (each sentence followed by a space).
for position, sentence in enumerate(collection.sentences):
    owner = collection.topics[clustering_model.labels_[position]]
    owner.sentences.append(sentence)
    owner.sentences_dump += sentence + " "

# TF-IDF over the topics: each topic's concatenated text is one document.
tfidf_model = TfidfVectorizer(max_df=MAX_DF)
topic_documents = [topic.sentences_dump for topic in collection.topics]
c_tfidf = tfidf_model.fit_transform(topic_documents)
tfidf_matrix = c_tfidf.toarray()

terms = tfidf_model.get_feature_names_out()
collection.words = terms

# Per topic, the terms that have a non-zero entry in its TF-IDF row.
terms_by_topic = tfidf_model.inverse_transform(c_tfidf)

for topic, topic_terms in zip(collection.topics, terms_by_topic):
    # argsort is ascending, so the highest-weighted terms sit at the END of
    # sorted_terms; consumers take the top-k with a negative slice.
    ascending_order = np.argsort(tfidf_matrix[topic.cluster])
    topic.sorted_terms = terms[ascending_order].tolist()
    topic.words = topic_terms

# Step 4: Validation

In [10]:
# Number of top terms per topic: passed as topn to the coherence model and
# used to slice each topic's top words in the results file.
TOP_WORDS = 10

In [None]:
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary

# Build the gensim vocabulary and one bag-of-words entry per topic from the
# terms that have a non-zero TF-IDF weight in that topic.
dictionary = Dictionary(terms_by_topic)
corpus = [dictionary.doc2bow(t) for t in terms_by_topic] # in BoW format.
# NOTE(review): the reference texts below are the de-duplicated term lists of
# each topic, not the original tokenized sentences — confirm this is the
# intended reference corpus for the c_v measure.
texts = [[dictionary[word_id] for word_id, freq in c] for c in corpus]

# CoherenceModel keeps only the FIRST `topn` entries of every topic it is
# given, so each topic must be passed ranked by weight. topic.words is in
# vocabulary-column order, so the ranking is taken from topic.sorted_terms
# (ascending by TF-IDF) walked backwards. Only terms actually present in the
# topic are kept. This scores the same words that are reported as "Top Words".
top_words_per_topic = []
for topic in collection.topics:
    present_terms = set(topic.words)
    ranked_terms = [term for term in reversed(topic.sorted_terms) if term in present_terms]
    top_words_per_topic.append(ranked_terms[:TOP_WORDS])

cm = CoherenceModel(topics=top_words_per_topic, texts=texts, corpus=corpus, dictionary=dictionary, coherence='c_v', topn=TOP_WORDS)

collection.topics_coherence = cm.get_coherence()
collection.coherence_per_topic = cm.get_coherence_per_topic()

# Scores come back in the same order as the topics that were passed in.
for topic, coherence in zip(collection.topics, collection.coherence_per_topic):
    topic.coherence = coherence

print('Total coherence: ', collection.topics_coherence)
print('Coherence by topic: ', collection.coherence_per_topic)

In [12]:
# Append this run's hyper-parameters and per-topic summary to the results
# file, so successive runs accumulate in one place.
results_file = OUTPUT_PATH + '/[minimal]topic-attention-results.txt'
with open(results_file, "a") as report:
    print('Hyper-parameters: ', file=report)
    print(
        f'Language Model: {MODEL!s}'
        f'\t Distance Threshold: {DISTANCE_THRESHOLD!s}'
        f'\t Resulting Topics: {n_clusters!s}'
        f'\t TfIdf Threshold: {MAX_DF!s}'
        f'\t Top Words: {TOP_WORDS!s}',
        file=report,
    )
    print(file=report)

    for topic in collection.topics:
        print(f'Topic {topic.cluster!s}', file=report)
        print(f'Documents length: {len(topic.sentences)!s}', file=report)
        print(f'Terms length: {len(topic.words)!s}', file=report)

        print(f'Coherence: {topic.coherence!s}', file=report)
        # sorted_terms is ascending, so the last TOP_WORDS entries are the
        # highest-weighted terms.
        print(f'Top Words: {topic.sorted_terms[-TOP_WORDS:]!s}', file=report)
        print(file=report)

    print(f'Total Coherence: {collection.topics_coherence!s}', file=report)
    print(f'Total Terms considered: {len(collection.words)!s}', file=report)
    print('----------------------------------------------------------------------------', file=report)