In [1]:
import pandas as pd
import scipy.stats as stats
import numpy as np
import ast

In [7]:
# Quick sanity check: load the reuters layout results and look at their size.
# Note that `df` is rebound further down to the concatenation of all corpora.
df = pd.read_csv(
    "results_cluster/cur_res/full_res_reuters.csv"
)
df.shape

(12892, 14)

In [None]:
def compute_accuracy(df):
    """Return one aggregated accuracy score per row of ``df``.

    The score is a weighted sum of three columns of ``df``:
    0.25 * "Trustworthiness" + 0.25 * "Continuity"
    + 0.5 * (0.5 * ("Shephard Diagram Correlation" + 1)),
    rounded to two decimals. The ``0.5 * (x + 1)`` term maps a value in
    [-1, 1] onto [0, 1].

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the three columns named above.

    Returns
    -------
    list
        Rounded scores, in row order of ``df``.
    """
    trust_values = df["Trustworthiness"].tolist()
    continuity_values = df["Continuity"].tolist()
    shephard_values = df["Shephard Diagram Correlation"].tolist()

    return [
        round(0.25*trust + 0.25*continuity + 0.5*(0.5*(shephard + 1)), 2)
        for trust, continuity, shephard in zip(trust_values, continuity_values, shephard_values)
    ]

In [None]:
def compute_perception(df):
    """Return one aggregated perception score per row of ``df``.

    The score is the mean of the "7-Neighborhood Hit" and
    "Distance consistency" columns, rounded to two decimals.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the two columns named above.

    Returns
    -------
    list
        Rounded scores, in row order of ``df``.
    """
    hit_values = df["7-Neighborhood Hit"].tolist()
    consistency_values = df["Distance consistency"].tolist()

    return [
        round(0.5*hit + 0.5*consistency, 2)
        for hit, consistency in zip(hit_values, consistency_values)
    ]

## Generate Entries for Topic Models

In [None]:
def generate_dataframe(corpus_file_layouts, corpus_file_model):
    """Build the per-layout table for topic-model embeddings of one corpus.

    Reads the layout results CSV, drops rows whose ``TM`` is 'bert', 'bow'
    or 'tfidf', and adds these columns:

    * ``TM_short``   - ``TM`` with the '_linear_combined' suffix removed
    * ``n_topics``   - the token following 'n_topics_' in ``Experiment``
                       (kept as a string)
    * ``accuracy``   - result of ``compute_accuracy``
    * ``perception`` - result of ``compute_perception``
    * ``coherence_c_v``, ``coherence_u_mass``, ``coherence_c_uci``,
      ``coherence_c_npmi`` - taken from the first row of the model
      evaluation CSV whose ``model_type`` equals ``TM_short`` and whose
      ``n_topics`` equals ``int(n_topics)``

    Parameters
    ----------
    corpus_file_layouts : str
        Path of the layout results CSV.
    corpus_file_model : str
        Path of the model evaluation CSV.

    Returns
    -------
    pandas.DataFrame
        The filtered layout rows with the added columns.

    Raises
    ------
    IndexError
        If a layout row has no matching (model_type, n_topics) entry in the
        model evaluation CSV.
    """
    df_layouts_all = pd.read_csv(corpus_file_layouts)

    # only keep the layouts with a topic model, i.e., TM is either lda, lsi, or nmf
    excluded_embeddings = ['bert', 'bow', 'tfidf']
    # .copy() makes the filtered frame independent of df_layouts_all, so the
    # column assignments below neither trigger SettingWithCopyWarning nor get lost.
    df_layouts = df_layouts_all[~df_layouts_all['TM'].isin(excluded_embeddings)].copy()

    # detect the model, i.e., linear combined must be removed
    list_TM_short = [TM.replace('_linear_combined', '') for TM in df_layouts["TM"].tolist()]
    df_layouts["TM_short"] = list_TM_short

    # detect the number of topics
    list_n_topics = [name.split("n_topics_")[1].split("_")[0] for name in df_layouts["Experiment"].tolist()]
    df_layouts['n_topics'] = list_n_topics

    # compute the aggregated accuracy
    df_layouts["accuracy"] = compute_accuracy(df_layouts)

    # compute the aggregated perception
    df_layouts["perception"] = compute_perception(df_layouts)

    # derive the coherence measures
    df_model = pd.read_csv(corpus_file_model)
    coherence_columns = ['coherence_c_v', 'coherence_u_mass', 'coherence_c_uci', 'coherence_c_npmi']

    # Index the model evaluations once instead of re-filtering df_model for
    # every layout row; setdefault keeps the first row per key.
    coherence_by_model = {}
    model_records = df_model[["model_type", "n_topics"] + coherence_columns].itertuples(index=False, name=None)
    for record in model_records:
        coherence_by_model.setdefault((record[0], record[1]), record[2:])

    coherence_values = {column: [] for column in coherence_columns}
    for model_type, n_topics in zip(list_TM_short, list_n_topics):
        key = (model_type, int(n_topics))
        if key not in coherence_by_model:
            # IndexError is kept so the failure type matches the previous
            # "list index out of range", but now names the missing entry.
            raise IndexError(
                f"no model evaluation for model_type={model_type!r}, "
                f"n_topics={n_topics} in {corpus_file_model}"
            )
        for column, value in zip(coherence_columns, coherence_by_model[key]):
            coherence_values[column].append(value)

    for column in coherence_columns:
        df_layouts[column] = coherence_values[column]

    return df_layouts

In [None]:
# Build the topic-model layout table for each of the six corpora.
df_emails = generate_dataframe(
    corpus_file_layouts="results_cluster/cur_res/full_res_emails.csv",
    corpus_file_model="model_evaluations/model_evaluations/emails_model_evaluation.csv",
)
df_20newsgroups = generate_dataframe(
    corpus_file_layouts="results_cluster/cur_res/full_res_20_newsgroups.csv",
    corpus_file_model="model_evaluations/model_evaluations/20_newsgroups_model_evaluation.csv",
)
df_bbc = generate_dataframe(
    corpus_file_layouts="results_cluster/cur_res/full_res_bbc_news.csv",
    corpus_file_model="model_evaluations/model_evaluations/bbc_news_model_evaluation.csv",
)
df_lyrics = generate_dataframe(
    corpus_file_layouts="results_cluster/cur_res/full_res_lyrics.csv",
    corpus_file_model="model_evaluations/model_evaluations/lyrics_model_evaluation.csv",
)
df_reuters = generate_dataframe(
    corpus_file_layouts="results_cluster/cur_res/full_res_reuters.csv",
    corpus_file_model="model_evaluations/model_evaluations/reuters_model_evaluation.csv",
)
df_7categories = generate_dataframe(
    corpus_file_layouts="results_cluster/cur_res/full_res_seven_categories.csv",
    corpus_file_model="model_evaluations/model_evaluations/seven_categories_model_evaluation.csv",
)

In [None]:
# Stack the per-corpus topic-model tables into one frame with a fresh index.
list_df = [
    df_emails,
    df_20newsgroups,
    df_bbc,
    df_lyrics,
    df_reuters,
    df_7categories,
]
df = pd.concat(list_df, axis=0, ignore_index=True)
df.shape

In [None]:
# Kendall's tau between each layout quality measure (accuracy, perception) and
# each topic coherence measure, computed per (topic model, DR technique) pair.
columns = ['embedding', 'DR', 'layout_quality_measure', 'embedding_quality_measure', 'value']

TM_list = ['lda', 'lsi', 'lsi_tfidf', 'nmf', 'nmf_tfidf']
DR_list = ['mds', 'som', 'tsne', 'umap']

# (label written to the result, column read from df); the order of these two
# lists fixes the row order within each (TM, DR) group.
embedding_measures = [
    ('c_v', 'coherence_c_v'),
    ('u_mass', 'coherence_u_mass'),
    ('c_uci', 'coherence_c_uci'),
    ('c_npmi', 'coherence_c_npmi'),
]
layout_measures = [
    ('acc', 'accuracy'),
    ('per', 'perception'),
]

# Collect plain records and build the frame once; growing a DataFrame with
# pd.concat inside the loop copies all previous rows on every iteration.
result_rows = []
for TM in TM_list:
    for DR in DR_list:
        df_selected = df[(df["TM_short"] == TM) & (df["DR"] == DR)]

        for embedding_label, embedding_column in embedding_measures:
            embedding_values = df_selected[embedding_column].tolist()

            for layout_label, layout_column in layout_measures:
                layout_values = df_selected[layout_column].tolist()
                # element [0] of the kendalltau result is the tau statistic
                tau = stats.kendalltau(layout_values, embedding_values)[0]
                result_rows.append({
                    'embedding': TM,
                    'DR': DR,
                    'layout_quality_measure': layout_label,
                    'embedding_quality_measure': embedding_label,
                    'value': tau,
                })

df_result = pd.DataFrame(result_rows, columns=columns)

In [None]:
# Display the Kendall's tau table computed for the topic-model embeddings.
df_result

## BERT Models

In [None]:
# https://www.sbert.net/docs/sentence_transformer/pretrained_models.html
# Reference scores of the sentence-transformer models used in the experiments;
# the model names carry the 'bert_' prefix used in the experiment names.
df_quality_bert = pd.DataFrame({
    'model': [
        'bert_all-MiniLM-L6-v2',
        'bert_all-distilroberta-v1',
        'bert_all-mpnet-base-v2',
        'bert_paraphrase-MiniLM-L3-v2',
        'bert_paraphrase-albert-small-v2',
    ],
    # all-distilroberta-v1 is 68.73; it was previously entered as 68.06, a
    # duplicate of the all-MiniLM-L6-v2 score, which created an artificial tie
    # in the rank correlation. NOTE(review): re-check against the URL above.
    'performance_sentence_embeddings': [68.06, 68.73, 69.57, 62.29, 64.46],
    'performance_semantic_search': [49.54, 50.94, 57.02, 39.19, 40.04],
})
df_quality_bert.head(5)

In [None]:
def get_model(experiment):
    """Return the first known model name contained in ``experiment``.

    The candidates are checked in a fixed order by substring test. If none of
    them occurs in ``experiment``, ``None`` is returned.
    """
    known_models = (
        'bert_all-MiniLM-L6-v2',
        'bert_all-distilroberta-v1',
        'bert_all-mpnet-base-v2',
        'bert_paraphrase-MiniLM-L3-v2',
        'bert_paraphrase-albert-small-v2',
    )
    return next((candidate for candidate in known_models if candidate in experiment), None)

In [None]:
def get_model_performance_sentence_embeddings(model):
    """Return the 'performance_sentence_embeddings' value of ``model``.

    Looks up the first row of ``df_quality_bert`` whose 'model' equals
    ``model``; raises IndexError if there is no such row.
    """
    is_requested_model = df_quality_bert["model"] == model
    matching_scores = df_quality_bert[is_requested_model]["performance_sentence_embeddings"]
    return matching_scores.tolist()[0]

In [None]:
def get_model_performance_semantic_search(model):
    """Return the 'performance_semantic_search' value of ``model``.

    Looks up the first row of ``df_quality_bert`` whose 'model' equals
    ``model``; raises IndexError if there is no such row.
    """
    is_requested_model = df_quality_bert["model"] == model
    matching_scores = df_quality_bert[is_requested_model]["performance_semantic_search"]
    return matching_scores.tolist()[0]

In [None]:
def generate_dataframe_bert(corpus_file_layouts):
    """Build the per-layout table for the BERT embeddings of one corpus.

    Reads the layout results CSV, keeps the rows whose ``TM`` is 'bert', and
    adds these columns:

    * ``performance_sentence_embedding`` and ``performance_semantic_search`` -
      reference scores of the model detected in ``Experiment`` via
      ``get_model``
    * ``accuracy``   - result of ``compute_accuracy``
    * ``perception`` - result of ``compute_perception``

    Parameters
    ----------
    corpus_file_layouts : str
        Path of the layout results CSV.

    Returns
    -------
    pandas.DataFrame
        The BERT layout rows with the added columns.
    """
    df_layouts_all = pd.read_csv(corpus_file_layouts)

    # only keep the layouts with BERT
    # .copy() makes the filtered frame independent of df_layouts_all, so the
    # column assignments below neither trigger SettingWithCopyWarning nor get lost.
    df_layouts = df_layouts_all[df_layouts_all['TM'].isin(['bert'])].copy()

    # detect the model
    experiments_list = df_layouts["Experiment"].tolist()
    model_list = [get_model(experiment) for experiment in experiments_list]
    performance_sentence_embedding_list = [get_model_performance_sentence_embeddings(model) for model in model_list]
    performance_semantic_search_list = [get_model_performance_semantic_search(model) for model in model_list]

    df_layouts["performance_sentence_embedding"] = performance_sentence_embedding_list
    df_layouts["performance_semantic_search"] = performance_semantic_search_list

    # compute the aggregated accuracy
    df_layouts["accuracy"] = compute_accuracy(df_layouts)

    # compute the aggregated perception
    df_layouts["perception"] = compute_perception(df_layouts)

    return df_layouts

In [None]:
# Build the BERT layout table for each of the six corpora.
df_emails_bert = generate_dataframe_bert(
    corpus_file_layouts="results_cluster/cur_res/full_res_emails.csv",
)
df_20newsgroups_bert = generate_dataframe_bert(
    corpus_file_layouts="results_cluster/cur_res/full_res_20_newsgroups.csv",
)
df_bbc_bert = generate_dataframe_bert(
    corpus_file_layouts="results_cluster/cur_res/full_res_bbc_news.csv",
)
df_lyrics_bert = generate_dataframe_bert(
    corpus_file_layouts="results_cluster/cur_res/full_res_lyrics.csv",
)
df_reuters_bert = generate_dataframe_bert(
    corpus_file_layouts="results_cluster/cur_res/full_res_reuters.csv",
)
df_7categories_bert = generate_dataframe_bert(
    corpus_file_layouts="results_cluster/cur_res/full_res_seven_categories.csv",
)

In [None]:
# Stack the per-corpus BERT tables into one frame with a fresh index.
list_df_bert = [
    df_emails_bert,
    df_20newsgroups_bert,
    df_bbc_bert,
    df_lyrics_bert,
    df_reuters_bert,
    df_7categories_bert,
]
df_bert = pd.concat(list_df_bert, axis=0, ignore_index=True)
df_bert.shape

In [None]:
# Preview the first rows of the combined BERT layout table.
df_bert.head()

In [None]:
# Kendall's tau between each layout quality measure (accuracy, perception) and
# each reference score of the BERT models, computed per DR technique.
columns = ['embedding', 'DR', 'layout_quality_measure', 'embedding_quality_measure', 'value']

TM_list = ['bert']
DR_list = ['mds', 'som', 'tsne', 'umap']

# (label written to the result, column read from df_bert); note that the
# sentence-embedding label is plural while the column name is singular.
embedding_measures = [
    ('performance_sentence_embeddings', 'performance_sentence_embedding'),
    ('performance_semantic_search', 'performance_semantic_search'),
]
layout_measures = [
    ('acc', 'accuracy'),
    ('per', 'perception'),
]

# Collect plain records and build the frame once; growing a DataFrame with
# pd.concat inside the loop copies all previous rows on every iteration.
bert_result_rows = []
for TM in TM_list:
    for DR in DR_list:
        # df_bert only holds BERT layouts, so filtering by DR is sufficient
        df_selected = df_bert[df_bert["DR"] == DR]

        for embedding_label, embedding_column in embedding_measures:
            embedding_values = df_selected[embedding_column].tolist()

            for layout_label, layout_column in layout_measures:
                layout_values = df_selected[layout_column].tolist()
                # element [0] of the kendalltau result is the tau statistic
                tau = stats.kendalltau(layout_values, embedding_values)[0]
                bert_result_rows.append({
                    'embedding': TM,
                    'DR': DR,
                    'layout_quality_measure': layout_label,
                    'embedding_quality_measure': embedding_label,
                    'value': tau,
                })

df_bert_result = pd.DataFrame(bert_result_rows, columns=columns)
        

In [None]:
# Display the Kendall's tau table computed for the BERT embeddings.
df_bert_result

In [None]:
# Combine the topic-model and BERT correlation tables into one frame.
df_total = pd.concat(
    [df_result, df_bert_result],
    ignore_index=True,
)
df_total.shape

In [None]:
# Persist the combined correlation table. No `index` argument is passed, so
# pandas' default applies and the row index is written as an unnamed first column.
df_total.to_csv("results_kendallstau.csv")