In [21]:
import pandas as pd
import statistics

In [11]:
def extend_with_category(df):
    # Get the unique values in n_topics and sort them
    unique_values = sorted(df['n_topics'].unique())
    
    # Create a mapping from n_topics values to categories
    categories = ['a', 'b', 'c', 'd', 'e']
    value_to_category = {val: cat for val, cat in zip(unique_values, categories)}
    
    # Map the n_topics values to the categories
    df['n_topics_category'] = df['n_topics'].map(value_to_category)
    
    return df

In [15]:
models_20newsgroups = pd.read_csv("model_evaluations/model_evaluations/20_newsgroups_model_evaluation.csv")
models_20newsgroups = extend_with_category(models_20newsgroups)

models_bbcnews = pd.read_csv("model_evaluations/model_evaluations/bbc_news_model_evaluation.csv")
models_bbcnews = extend_with_category(models_bbcnews)

models_emails = pd.read_csv("model_evaluations/model_evaluations/emails_model_evaluation.csv")
models_emails = extend_with_category(models_emails)

models_lyrics = pd.read_csv("model_evaluations/model_evaluations/lyrics_model_evaluation.csv")
models_lyrics = extend_with_category(models_lyrics)

models_reuters = pd.read_csv("model_evaluations/model_evaluations/reuters_model_evaluation.csv")
models_reuters = extend_with_category(models_reuters)

models_7categories = pd.read_csv("model_evaluations/model_evaluations/seven_categories_model_evaluation.csv")
models_7categories = extend_with_category(models_7categories)

In [18]:
list_models = [models_20newsgroups, models_bbcnews, models_emails, models_lyrics, models_reuters, models_7categories]
models = pd.concat(list_models, axis=0, ignore_index=True)
models.head()

Unnamed: 0,model_type,n_topics,alpha_lda,coherence_u_mass,coherence_c_v,coherence_c_uci,coherence_c_npmi,perplexity,n_topics_category
0,lda,10,auto,-1.647771,0.431769,-0.403714,0.000311,254.612658,a
1,lda,15,auto,-1.691589,0.425324,-0.329512,0.005311,285.102388,b
2,lda,20,auto,-2.084296,0.450647,-0.600277,0.009207,302.912619,c
3,lda,25,auto,-1.98001,0.425205,-0.466475,0.003768,329.199261,d
4,lda,30,auto,-2.358449,0.439588,-0.678974,0.005376,357.273727,e


In [24]:
columns = ['model', 'category', 'coherence_u_mass', 'coherence_c_v', 'coherence_c_uci', 'coherence_c_npmi']
df_result = pd.DataFrame(columns = columns)

category_list = ['a', 'b', 'c', 'd', 'e']
model_type_list = ['lda', 'lsi', 'lsi_tfidf', 'nmf', 'nmf_tfidf']

for category in category_list:
    for model_type in model_type_list:
        selected_models = models[(models["model_type"] == model_type) & (models['n_topics_category'] == category)]
        coherence_u_mass_avg = statistics.mean(selected_models["coherence_u_mass"].tolist())
        coherence_c_v_avg = statistics.mean(selected_models["coherence_c_v"].tolist())
        coherence_c_uci_avg = statistics.mean(selected_models["coherence_c_uci"].tolist())
        coherence_c_npmi_avg = statistics.mean(selected_models["coherence_c_npmi"].tolist())
        new_row = {'model':[model_type], 'category': [category], 'coherence_u_mass': [coherence_u_mass_avg], 'coherence_c_v': [coherence_c_v_avg], 'coherence_c_uci': [coherence_c_uci_avg], 'coherence_c_npmi': [coherence_c_npmi_avg]}
        df_result = pd.concat([df_result, pd.DataFrame(new_row)], ignore_index = True)
        #print(new_row)

df_result.head(15)

  df_result = pd.concat([df_result, pd.DataFrame(new_row)], ignore_index = True)


Unnamed: 0,model,category,coherence_u_mass,coherence_c_v,coherence_c_uci,coherence_c_npmi
0,lda,a,-1.567271,0.373511,-0.347025,-0.002286
1,lsi,a,-1.903042,0.428277,-0.44801,0.012848
2,lsi_tfidf,a,-2.336276,0.473144,-0.76026,0.023398
3,nmf,a,-1.538304,0.425854,-0.058815,0.020887
4,nmf_tfidf,a,-2.191367,0.438625,-0.81511,0.008344
5,lda,b,-1.740625,0.372081,-0.411464,-0.000788
6,lsi,b,-2.159265,0.411283,-0.70792,0.000884
7,lsi_tfidf,b,-2.902788,0.442486,-1.331937,0.001286
8,nmf,b,-1.788738,0.422933,-0.196007,0.01944
9,nmf_tfidf,b,-2.846206,0.417609,-1.517888,-0.015605
