In [1]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
from gensim.models.coherencemodel import CoherenceModel
from gensim.test.utils import common_corpus, common_dictionary
from gensim.models import HdpModel
from gensim.corpora.dictionary import Dictionary
from gensim.models import Word2Vec
from hdbscan import HDBSCAN
from umap import UMAP
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from scipy.spatial.distance import squareform
from scipy.cluster import hierarchy as sch
from collections import defaultdict
from tqdm import tqdm
import re
import tomotopy as tp
from operator import itemgetter
import operator
from gensim.models import KeyedVectors
import warnings
import pickle
from collections import Counter
import os
warnings.filterwarnings('ignore')
import torch
from itertools import product
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

2024-11-22 13:48:47.519098: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Dataset identifier: it is interpolated into the ../Datasets/<name>/ and ../models/<name>/
# paths used by the cells below, so change it here to evaluate another dataset.
dataset_name = '20'

In [3]:
# Load the stop-word list (one word per line). The `with` block closes the file handle
# deterministically; the previous bare open() left it to the garbage collector.
stop_words_file = '../Datasets/stopwords_en.txt'
with open(stop_words_file, encoding="utf-8") as stop_words_handle:
    stop_words = [line.strip() for line in stop_words_handle]

# For covid dataset
# stop_words_file = '../Datasets/stopwords_kor.txt'
# with open(stop_words_file, encoding="utf-8") as stop_words_handle:
#     stop_words = [line.strip() for line in stop_words_handle]

In [4]:
# Read the dataset once and derive every corpus view the metrics need from it.
data = pd.read_feather(f'../Datasets/{dataset_name}/{dataset_name}.ftr')

# `texts` feeds gensim (Dictionary / CoherenceModel); `docs` feeds BERTopic's hierarchy code.
texts = data['words']
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(tokens) for tokens in texts]
docs = data['document'].to_list()

## For covid dataset
# data = pd.read_feather(f'../Datasets/{dataset_name}/{dataset_name}.ftr')
# texts =data.okt
# dictionary = Dictionary(texts)
# corpus = [dictionary.doc2bow(text) for text in texts]
# docs = data.corrected_twit.to_list()

In [5]:
# Load the saved Word2Vec model for this dataset. Its `.wv` keyed vectors supply the
# word-to-word similarities used by get_word_embedding_based_similarity.
embedding_model = Word2Vec.load(f'../models/{dataset_name}/word2vec.model')

In [6]:
class CustomTokenizer:
    """Callable tokenizer: lower-case, strip non-letters, drop stop words, then stem.

    `tagger` is any object exposing a `stem(token)` method. Stop words are read
    from the module-level `stop_words` collection and are matched against the
    un-stemmed token.
    """

    def __init__(self, tagger):
        self.tagger = tagger

    def __call__(self, text):
        # Keep only ASCII letters and whitespace after lower-casing.
        letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())

        stems = []
        for word in letters_only.split():
            if word in stop_words:
                continue
            stems.append(self.tagger.stem(word))
        return stems

## For covid dataset
# class CustomTokenizer:
#     def __init__(self, tagger):
#         self.tagger = tagger
#     def __call__(self, sent):
#         #sent = sent[:1000000]
#         hangul = re.compile('[^ 0-9가-힣+]')
#         sent = hangul.sub(' ', sent)
#         sent = " ".join(sent.split())
#         word_tokens = self.tagger.pos(sent, stem=True)
#         temp = [word[0] for word in word_tokens if (word[1] =='Adjective' or  word[1] =='Noun')]
#         result = [word for word in temp if (len(word) > 1  and ( not word in stop_words))]

#         return result

In [7]:
def assign_topic(doc, topics):
    """Return the id of the topic whose ranked word list best matches `doc`.

    Each topic word is weighted by its reversed position in the topic's list
    (first word gets `len - 1`, last word gets 0). A document's score for a
    topic is the sum of those weights over every token occurrence in `doc`.
    Ties go to the topic that comes first in `topics`; an empty `topics`
    mapping raises ValueError from `max`.
    """
    scores = {}
    for topic_id, topic_words in topics.items():
        # Pair the reversed word list with 0, 1, 2, ... so earlier words weigh more.
        weights = dict(zip(reversed(topic_words), range(len(topic_words))))
        scores[topic_id] = sum(weights.get(token, 0) for token in doc)

    # max() keeps the first key on ties, matching dict insertion order.
    return max(scores, key=scores.get)

In [8]:
def get_word_embedding_based_similarity(topic_dict, topic1, topic2):
    """Mean pairwise word-embedding similarity between two topics' word lists.

    Every word of `topic1` is compared with every word of `topic2` using the
    module-level `embedding_model.wv.similarity`. Pairs containing a word that
    is missing from the embedding vocabulary (KeyError) contribute 0 but still
    count towards the average.

    Parameters
    ----------
    topic_dict : dict
        Maps topic id -> list of topic words.
    topic1, topic2 : hashable
        Ids of the two topics to compare.

    Returns
    -------
    float
        Sum of pair similarities divided by the number of word pairs,
        or 0.0 when either word list is empty.
    """
    topic_a = topic_dict.get(topic1)
    topic_b = topic_dict.get(topic2)

    # Normalise by the true number of pairs. The previous code divided by
    # len(topic_a) ** 2, which is wrong (and asymmetric) whenever the two
    # topics have different numbers of words.
    pair_count = len(topic_a) * len(topic_b)
    if pair_count == 0:
        return 0.0

    total = 0
    for word_a in topic_a:
        for word_b in topic_b:
            try:
                total += embedding_model.wv.similarity(word_a, word_b)
            except KeyError:
                # Out-of-vocabulary word: treat the pair as similarity 0.
                continue

    return total / pair_count

In [9]:
## similarity
def get_similarity(topic_dict, topn_dict):
    """Average parent-child similarity over every (parent, child) edge in `topn_dict`.

    For each edge the embedding-based similarity of the two word lists is used.
    When at least half of the parent's words appear at the *same position* in
    the child's list, the edge's similarity is halved. Raises ZeroDivisionError
    if `topn_dict` contains no edges.
    """
    edge_scores = []
    for parent_id, child_ids in topn_dict.items():
        parent_words = topic_dict.get(parent_id)
        for child_id in child_ids:
            child_words = topic_dict.get(child_id)

            # Count words that coincide position-by-position in both lists.
            same_position = sum(p == c for p, c in zip(parent_words, child_words))

            score = get_word_embedding_based_similarity(topic_dict, parent_id, child_id)
            if same_position >= len(parent_words) / 2:
                # Near-duplicate of the parent: penalise by halving.
                score = score / 2
            edge_scores.append(score)

    return sum(edge_scores) / len(edge_scores)

In [10]:
## diversity
def get_diversity(topic_dict, topn_dict):
    """Average sibling dissimilarity (1 - similarity) across all parents in `topn_dict`.

    For every parent with a child list whose length is not 1, each ordered pair of
    children at different positions contributes `1 - similarity`. Raises
    ZeroDivisionError when no such pair exists.
    """
    distances = []
    for siblings in topn_dict.values():
        if len(siblings) == 1:
            # A lone child has no sibling to be diverse from.
            continue
        for left_pos, left_topic in enumerate(siblings):
            for right_pos, right_topic in enumerate(siblings):
                if left_pos == right_pos:
                    continue
                distances.append(
                    1 - get_word_embedding_based_similarity(topic_dict, left_topic, right_topic)
                )

    return sum(distances) / len(distances)

In [11]:
## coherence
def get_coherence(topic_dict, topn_dict):
    """NPMI coherence of every topic appearing in `topn_dict`, rescaled via (x + 1) / 2.

    The topic ids are the union of all keys and all child ids in `topn_dict`;
    their word lists come from `topic_dict`. Coherence is computed by gensim's
    CoherenceModel with `coherence='c_npmi'` against the module-level `texts`
    and `dictionary`.
    """
    # De-duplicate ids: a topic may be both a child of one entry and a key of another.
    topic_ids = set()
    for parent_id, child_ids in topn_dict.items():
        topic_ids.update([parent_id] + child_ids)

    word_lists = [topic_dict.get(topic_id) for topic_id in topic_ids]

    npmi = CoherenceModel(
        topics=word_lists,
        texts=texts,
        dictionary=dictionary,
        coherence='c_npmi',
    ).get_coherence()

    # Shift and scale so the returned value is (npmi + 1) / 2.
    return (npmi + 1) / 2

In [12]:
## HARIN metric
def HARIN(topic_dict,topn_dict):
    """Compute the three HARIN components for one topic hierarchy.

    Returns a `(similarity, diversity, coherence)` tuple; combining them into a
    single score is left to the caller.
    """
    similarity = get_similarity(topic_dict, topn_dict)
    diversity = get_diversity(topic_dict, topn_dict)
    coherence = get_coherence(topic_dict, topn_dict)
    return similarity, diversity, coherence

In [13]:
# Hierarchy extraction for the BERTopic model
def get_hierarchy(model, k):
    """Build a table of hierarchical topic merges for a fitted BERTopic model.

    Rows `model._outliers` .. `k` of `model.c_tf_idf_` are clustered with Ward
    linkage over cosine distances. For every merge step a parent topic is
    created and named from the c-TF-IDF words of the merged documents.

    Parameters
    ----------
    model : fitted BERTopic-like object
        Must expose `c_tf_idf_`, `_outliers`, `topics_`, `vectorizer_model`,
        `ctfidf_model`, `get_topic` and `_extract_words_per_topic`.
    k : int
        Upper bound (inclusive) on the c-TF-IDF row index used for clustering.

    Returns
    -------
    pandas.DataFrame
        One row per linkage merge with columns Parent_ID, Parent_Name, Topics,
        Child_Left_ID, Child_Left_Name, Child_Right_ID, Child_Right_Name and
        Distance, sorted by Parent_ID descending; the three ID columns are
        cast to str after sorting.

    Notes
    -----
    Reads the module-level `docs` list to rebuild the per-topic documents.
    """
    distance_function = lambda x: 1 - cosine_similarity(x)
    linkage_function = lambda x: sch.linkage(x, 'ward', optimal_ordering=True)

    # Pairwise cosine distance between the selected topic rows
    embeddings = model.c_tf_idf_[model._outliers:k+1]
    X = distance_function(embeddings)

    # Force exact zeros on the diagonal, then convert the square matrix to the
    # 1-D condensed form
    np.fill_diagonal(X, 0)
    X = squareform(X)

    # Use the 1-D condensed distance matrix as an input instead of the raw distance matrix
    Z = linkage_function(X)

    # Calculate basic bag-of-words to be iteratively merged later
    documents = pd.DataFrame({"Document": docs,
                              "ID": range(len(docs)),
                              "Topic": model.topics_})
    documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
    # NOTE(review): the result of this filter is not assigned, so rows with
    # Topic == -1 are NOT removed from documents_per_topic here.
    documents_per_topic.loc[documents_per_topic.Topic != -1, :]
    # Label-based slice: keeps rows with index labels 0 .. k+1 (inclusive)
    documents_per_topic = documents_per_topic.loc[:k+1]
    clean_documents = documents_per_topic.Document.values

    # get_feature_names_out replaces the get_feature_names method that
    # scikit-learn deprecated in 1.0
    words = model.vectorizer_model.get_feature_names_out()

    # One bag-of-words row per row of documents_per_topic
    bow = model.vectorizer_model.transform(clean_documents)

    # Extract clusters
    hier_topics = pd.DataFrame(columns=["Parent_ID", "Parent_Name", "Topics",
                                        "Child_Left_ID", "Child_Left_Name",
                                        "Child_Right_ID", "Child_Right_Name"])
    for index in (range(len(Z))):

        # Flat cluster labels obtained by cutting the dendrogram at this merge's distance
        clusters = sch.fcluster(Z, t=Z[index][2], criterion='distance') - model._outliers
        # NOTE(review): cluster_df is built but not referenced again in this loop.
        cluster_df = pd.DataFrame({"Topic": range(len(clusters)), "Cluster": clusters})
        cluster_df = cluster_df.groupby("Cluster").agg({'Topic': lambda x: list(x)}).reset_index()
        # Despite the name, this is the length of the label array, i.e. the
        # number of clustered rows (leaves), not the number of distinct clusters
        nr_clusters = len(clusters)

        # Follow left children down the linkage until an original leaf is reached;
        # that leaf identifies the cluster containing every topic in this merge
        topic = None
        val = Z[index][0]
        while topic is None:
            if val - len(clusters) < 0:
                topic = int(val)
            else:
                val = Z[int(val - len(clusters))][0]
        clustered_topics = [i for i, x in enumerate(clusters) if x == clusters[topic]]

        # Group bow per cluster, calculate c-TF-IDF and extract words
        # NOTE(review): clustered_topics holds positions within `embeddings`; they are
        # used both as row positions in `bow` and as topic ids in `documents.Topic`.
        # Presumably these coincide - verify when model._outliers != 0.
        grouped = csr_matrix(bow[clustered_topics].sum(axis=0))
        c_tf_idf = model.ctfidf_model.fit_transform(grouped)
        selection = documents.loc[documents.Topic.isin(clustered_topics), :]
        # All selected documents are relabelled as a single topic 0
        # (assignment on a .loc selection; may emit SettingWithCopyWarning)
        selection.Topic = 0
        words_per_topic = model._extract_words_per_topic(words, selection, c_tf_idf)

        # Extract parent's name and ID (name = up to 20 top words joined by "_")
        parent_id = index + len(clusters)
        parent_name = "_".join([x[0] for x in words_per_topic[0]][:20])

        # Left child: ids below nr_clusters are original topics, otherwise the id
        # refers to an earlier merge, i.e. row (id - nr_clusters) of hier_topics
        Z_id = Z[index][0]
        child_left_id = Z_id if Z_id - nr_clusters < 0 else Z_id - nr_clusters

        if Z_id - nr_clusters < 0:
            # Z_id comes straight from the linkage matrix (a float value)
            child_left_name = "_".join([x[0] for x in model.get_topic(Z_id)][:20])
        else:
            child_left_name = hier_topics.iloc[int(child_left_id)].Parent_Name

        # Right child: same rule as for the left child
        Z_id = Z[index][1]
        child_right_id = Z_id if Z_id - nr_clusters < 0 else Z_id - nr_clusters

        if Z_id - nr_clusters < 0:
            child_right_name = "_".join([x[0] for x in model.get_topic(Z_id)][:20])
        else:
            child_right_name = hier_topics.iloc[int(child_right_id)].Parent_Name

        # Save results; the stored child ids are the raw linkage ids, not the
        # offset-adjusted child_left_id / child_right_id computed above
        hier_topics.loc[len(hier_topics), :] = [parent_id, parent_name,
                                                clustered_topics,
                                                int(Z[index][0]), child_left_name,
                                                int(Z[index][1]), child_right_name]

    hier_topics["Distance"] = Z[:, 2]
    # Sort while Parent_ID is still numeric, then stringify the id columns
    hier_topics = hier_topics.sort_values("Parent_ID", ascending=False)
    hier_topics[["Parent_ID", "Child_Left_ID", "Child_Right_ID"]] = hier_topics[["Parent_ID", "Child_Left_ID", "Child_Right_ID"]].astype(str)
    return hier_topics

In [14]:
def find_parent(dictionary, value):
    """Return the first key of `dictionary` whose value collection contains `value`.

    Keys are scanned in insertion order; `None` is returned when no collection
    contains `value`. Note the `dictionary` parameter shadows the module-level
    gensim `dictionary` inside this function.
    """
    return next(
        (key for key, members in dictionary.items() if value in members),
        None,
    )

In [15]:
## bertopic
def _bertopic_topic_words(model, hier_topics, topic_id):
    """Word list for `topic_id`: the merged-topic name if one exists, else the model's own words."""
    parent_names = hier_topics[hier_topics['Parent_ID'] == topic_id]['Parent_Name']
    if len(parent_names) > 0:
        # Merged topics are named "w1_w2_..." by get_hierarchy.
        return parent_names.values[0].split('_')
    # Not a merged topic: fall back to the original BERTopic topic.
    return [word[0] for word in model.get_topic(int(topic_id))]


def _bertopic_children(hier_topics, topic_id):
    """(left, right) child ids of merged topic `topic_id`; IndexError if it has no row."""
    row = hier_topics[hier_topics['Parent_ID'] == topic_id]
    return row['Child_Left_ID'].values[0], row['Child_Right_ID'].values[0]


def get_bertopics(topk):
    """Extract the top three levels of the BERTopic hierarchy.

    Loads the saved BERTopic model for `dataset_name`, builds the merge table
    with `get_hierarchy(model, topk)` and walks it from the root (first row)
    down two levels: root -> 2 children -> 4 grandchildren.

    Returns
    -------
    (topic_dict, topn_dict)
        `topic_dict` maps each of the 7 topic ids (int) to its word list;
        `topn_dict` maps the root and its two children to `[left, right]`.

    Raises
    ------
    IndexError
        If either child of the root is an original (unmerged) topic and so has
        no row of its own in the hierarchy table.
    """
    umap_model = UMAP(n_neighbors=15, n_components=5,
          min_dist=0.0, metric='cosine', random_state=33)
    custom_tokenizer = CustomTokenizer(PorterStemmer())
    vectorizer = CountVectorizer(tokenizer = custom_tokenizer)
    # NOTE(review): if `load` is a classmethod (as in the BERTopic API), the instance
    # configured here is discarded and the returned model carries whatever settings
    # were saved on disk - confirm that is intended.
    model = BERTopic(
     top_n_words=30,
     umap_model = umap_model,
     vectorizer_model = vectorizer,
     ).load(f'../models/{dataset_name}/bertopic')

    hier_topics = get_hierarchy(model, topk)

    # The table is sorted by Parent_ID descending, so the first row is the root merge.
    root_row = hier_topics.iloc[0]
    root = root_row['Parent_ID']
    lid = root_row['Child_Left_ID']
    rid = root_row['Child_Right_ID']

    topn_dict = {int(root): [int(lid), int(rid)]}
    topic_dict = {}
    for topic_id in (root, lid, rid):
        topic_dict[int(topic_id)] = _bertopic_topic_words(model, hier_topics, topic_id)

    # Second level: expand the root's left child, then its right child.
    for parent_id in (lid, rid):
        left_id, right_id = _bertopic_children(hier_topics, parent_id)
        topn_dict[int(parent_id)] = [int(left_id), int(right_id)]
        for topic_id in (left_id, right_id):
            topic_dict[int(topic_id)] = _bertopic_topic_words(model, hier_topics, topic_id)

    return topic_dict,topn_dict

In [16]:
## hlda
def get_hldas(topk):
    """Extract topic words and the top-k leaf hierarchy from the saved hLDA model.

    The tree is walked two levels below the root (topic 0). The `topk`
    second-level topics with the most documents are kept and grouped under
    their parents; the root then lists those parents as its children.

    Returns
    -------
    (topic_dict, topn_dict)
        `topic_dict` maps every visited topic id to its top-20 words;
        `topn_dict` maps parent id -> list of selected child ids, plus
        `0 -> [selected parents]`.
    """
    model = tp.HLDAModel().load(f'../models/{dataset_name}/hlda.bin')

    def top_words(topic_id):
        # get_topic_words yields (word, weight) pairs; keep the word only.
        return [words[0] for words in model.get_topic_words(topic_id, top_n=20)]

    # Single traversal collects both the word lists and the leaf document counts
    # (previously two identical traversals plus an unused running total).
    topic_dict = {0: top_words(0)}
    topic_numdoc = {}
    for depth1 in model.children_topics(0):
        topic_dict[depth1] = top_words(depth1)
        for topic in model.children_topics(depth1):
            topic_dict[topic] = top_words(topic)
            topic_numdoc[topic] = model.num_docs_of_topic(topic)

    # Keep the topk most populated second-level topics (stable sort: ties keep tree order).
    largest = sorted(topic_numdoc.items(), key=itemgetter(1), reverse=True)[:topk]

    topn_dict = {}
    for child, _ in largest:
        topn_dict.setdefault(model.parent_topic(child), []).append(child)

    # The root's children are exactly the parents that survived the top-k cut.
    topn_dict[0] = list(topn_dict.keys())

    return topic_dict, topn_dict

In [17]:
def get_cluhtm_topics(line):
    """Parse the bracketed word list out of one line of the CluHTM structure file.

    Single quotes are stripped, then the text between the first "[" and the
    next "]" is split on ", ". Raises IndexError if the line has no "[".
    """
    after_bracket = line.replace('\'','').split("[")[1]
    inside = after_bracket.partition("]")[0]
    return inside.split(", ")

In [18]:
## cluhtm
def get_cluhtms(topk):
    """Build topic words and the top-k hierarchy from the CluHTM structure file.

    The structure file encodes depth by leading tabs: no tab = level 0,
    one tab = level 1, two tabs = level 2 (leaf). Topic ids are assigned so
    that level-0 ids occupy `[0, n0)`, level-1 ids `[n0, n0+n1)` and level-2
    ids `[n0+n1, n0+n1+n2)`, each range counted downwards in file order.

    Every document of the dataset is assigned to its best-matching leaf topic
    (`assign_topic`); the `topk` most frequent leaves are kept and linked to
    their parents, and those parents to their own parents.

    Returns
    -------
    (topic_dict, topn_dict)
        `topic_dict` maps every topic id to its word list;
        `topn_dict` maps parent id -> list of selected child ids.
    """
    # `with` closes the handle; the previous bare open() leaked it.
    with open(f'../models/{dataset_name}/hierarchical_structure_cluhtm.txt', 'r') as structure_file:
        readdata = structure_file.readlines()
    raw = pd.read_feather(f'../Datasets/{dataset_name}/{dataset_name}.ftr')

    # First pass: count the topics on each level so id ranges can be laid out.
    num_depth0 = 0
    num_depth1 = 0
    num_depth2 = 0
    for line in readdata:
        if '\t' not in line: num_depth0+=1
        elif '\t\t'not in line: num_depth1+=1
        else: num_depth2+=1

    # Turn the counts into the (exclusive) upper bound of each level's id range.
    num_depth1 += num_depth0
    num_depth2 += num_depth1

    topic_dict={}
    parent_dict={}
    leaf_topics={}

    # Second pass: hand out ids (counting down) and record parent -> children links.
    # The most recently seen id on the level above is the current line's parent.
    for line in readdata:
        if '\t' not in line:
            num_depth0 -=1
            topic_dict[num_depth0] = get_cluhtm_topics(line)

        elif '\t\t'not in line:
            num_depth1 -=1
            topic_dict[num_depth1] = get_cluhtm_topics(line)
            parent_dict.setdefault(num_depth0, []).append(num_depth1)
        else:
            num_depth2 -=1
            leaf_words = get_cluhtm_topics(line)
            leaf_topics[num_depth2] = leaf_words
            # Separate list object, as before, so the two dicts never alias.
            topic_dict[num_depth2] = list(leaf_words)
            parent_dict.setdefault(num_depth1, []).append(num_depth2)

    # Label every document with its best leaf topic. Built in one step instead of
    # the chained assignment `raw.label[i] = ...`, which pandas does not guarantee
    # to write through (and which is a no-op under copy-on-write).
    labels = pd.Series([assign_topic(doc_words, leaf_topics) for doc_words in raw['words']])

    # value_counts() sorts by frequency, so the first topk entries are the largest leaves.
    res = labels.value_counts().iloc[:topk].index

    topn_dict = {}
    for child in res:
        topn_dict.setdefault(find_parent(parent_dict, child), []).append(child)

    # Link each selected parent to its own parent (snapshot the keys: the dict grows).
    for i in list(topn_dict.keys()):
        topn_dict.setdefault(find_parent(parent_dict, i), []).append(i)

    return topic_dict, topn_dict

In [19]:
def get_hyhtm_topics(line):
    """Parse one line of the HyHTM structure file into its list of topic words.

    Single quotes, tabs and newlines are removed; the remainder is split on
    single spaces (so consecutive spaces yield empty strings).
    """
    cleaned = line.replace('\'','').replace('\t','').replace('\n','')
    return cleaned.split(" ")

In [20]:
## hyhtm
def get_hyhtms(topk):
    """Build topic words and the top-k hierarchy from the HyHTM structure file.

    The structure file encodes depth by leading tabs: no tab = L0 (root),
    one tab = L1 (parent), two tabs = L2 (child/leaf). Topic ids are assigned
    so that L0 ids occupy `[0, n0)`, L1 ids `[n0, n0+n1)` and L2 ids
    `[n0+n1, n0+n1+n2)`, each range counted downwards in file order.

    Every document of the dataset is assigned to its best-matching leaf topic
    (`assign_topic`); the `topk` most frequent leaves are kept and linked to
    their parents, and those parents to their own parents.

    Returns
    -------
    (topic_dict, topn_dict)
        `topic_dict` maps every topic id to its word list;
        `topn_dict` maps parent id -> list of selected child ids.
    """
    # `with` closes the handle; the previous bare open() leaked it.
    with open(f'../models/{dataset_name}/hierarchical_structure_hyhtm.txt', 'r') as structure_file:
        readdata = structure_file.readlines()
    raw = pd.read_feather(f'../Datasets/{dataset_name}/{dataset_name}.ftr')

    # First pass: count the topics on each level so id ranges can be laid out.
    num_depth0 = 0
    num_depth1 = 0
    num_depth2 = 0
    for line in readdata:
        if '\t' not in line: num_depth0+=1
        elif '\t\t'not in line: num_depth1+=1
        else: num_depth2+=1

    # Turn the counts into the (exclusive) upper bound of each level's id range.
    num_depth1 += num_depth0
    num_depth2 += num_depth1

    topic_dict={}
    parent_dict={}
    leaf_topics={}

    # Second pass: hand out ids (counting down) and record parent -> children links.
    # The most recently seen id on the level above is the current line's parent.
    for line in readdata:
        # depth0 : L0 topic(root topic)
        if '\t' not in line:
            num_depth0 -=1
            topic_dict[num_depth0] = get_hyhtm_topics(line)

        # depth1 : L1 topic(parent topic)
        elif '\t\t'not in line:
            num_depth1 -=1
            topic_dict[num_depth1] = get_hyhtm_topics(line)
            parent_dict.setdefault(num_depth0, []).append(num_depth1)

        # depth2 : L2 topic(child topic)
        else:
            num_depth2 -=1
            leaf_words = get_hyhtm_topics(line)
            leaf_topics[num_depth2] = leaf_words
            # Separate list object, as before, so the two dicts never alias.
            topic_dict[num_depth2] = list(leaf_words)
            parent_dict.setdefault(num_depth1, []).append(num_depth2)

    # Label every document with its best leaf topic. Built in one step instead of
    # the chained assignment `raw.label[i] = ...`, which pandas does not guarantee
    # to write through (and which is a no-op under copy-on-write).
    labels = pd.Series([assign_topic(doc_words, leaf_topics) for doc_words in raw['words']])

    # value_counts() sorts by frequency, so the first topk entries are the largest leaves.
    res = labels.value_counts().iloc[:topk].index

    topn_dict = {}
    for child in res:
        topn_dict.setdefault(find_parent(parent_dict, child), []).append(child)

    # Link each selected parent to its own parent (snapshot the keys: the dict grows).
    for i in list(topn_dict.keys()):
        topn_dict.setdefault(find_parent(parent_dict, i), []).append(i)

    return topic_dict, topn_dict

In [21]:
## Calculate the HARIN score
# Weight applied to each of the three components in the combined score. Kept at the
# value used so far (note the three weights sum to 0.99, not 1) so results are unchanged.
HARIN_WEIGHT = 0.33

# One extractor per model; each returns (topic_dict, topn_dict) for a given k.
# A dispatch table replaces the previous if / elif / if / elif chain, whose second
# `if` broke the chain and only worked because the model names are mutually exclusive.
topic_extractors = {
    'bertopic': get_bertopics,
    'cluhtm': get_cluhtms,
    'hlda': get_hldas,
    'hyhtm': get_hyhtms,
}
models = ['bertopic', 'cluhtm', 'hlda', 'hyhtm']

output = []
for model_name in models:
    print(model_name)
    extract_topics = topic_extractors[model_name]
    for topk in range(10,60,10):
        topic_dict, topn_dict = extract_topics(topk)

        # Calculate the baseline metrics
        s, d, c = HARIN(topic_dict, topn_dict)
        output.append({'model': model_name, 'k': topk, 's' : s, 'd' : d, 'c' : c, 's+d' : s+d, 's+c' : s+c, 'c+d' : c+d, 's+c+d': s+c+d, 'HARIN': (HARIN_WEIGHT * s) + (HARIN_WEIGHT * d)+ (HARIN_WEIGHT * c)})

# Build the frame once from the collected records; the empty frame that used to be
# pre-allocated was overwritten here and lacked the 's+c+d' column.
score_df = pd.DataFrame(output)

bertopic
cluhtm
hlda
hyhtm
