In [1]:
import pandas as pd
import numpy as np

In [2]:
def categorize_n_topics(df):
    """Label each row with a letter encoding the rank of its ``n_topics`` value.

    The distinct ``n_topics`` values of ``df`` are ranked in ascending order
    and lettered from ``'a'`` upward (smallest -> ``'a'``, next -> ``'b'``,
    and so on). The ranking is computed from the frame passed in, so the same
    letter can stand for different topic counts in different frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``n_topics`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame equal to ``df`` plus a ``category`` column (replacing any
        existing column of that name). The input frame is not modified, so
        re-running a calling cell gives the same result. Rows whose
        ``n_topics`` is missing get a missing ``category``.

    Raises
    ------
    KeyError
        If ``df`` has no ``n_topics`` column.
    """
    # NaN compares False against everything, so sorting a list that contains
    # it yields an unreliable order; rank only the non-missing values.
    ordered_values = sorted(df['n_topics'].dropna().unique())

    # Labels are consecutive code points starting at 'a'. With more than 26
    # distinct values the labels continue past 'z' into non-letter characters.
    mapping = {value: chr(ord('a') + rank) for rank, value in enumerate(ordered_values)}

    # assign() builds a new frame instead of writing into the caller's object.
    return df.assign(category=df['n_topics'].map(mapping))

In [3]:
# Evaluation CSVs, one per dataset (going by the file names). The order here
# decides which file ends up in df1 .. df6.
evaluation_csv_paths = (
    "model_evaluations/model_evaluations/20_newsgroups_model_evaluation.csv",
    "model_evaluations/model_evaluations/bbc_news_model_evaluation.csv",
    "model_evaluations/model_evaluations/emails_model_evaluation.csv",
    "model_evaluations/model_evaluations/lyrics_model_evaluation.csv",
    "model_evaluations/model_evaluations/reuters_model_evaluation.csv",
    "model_evaluations/model_evaluations/seven_categories_model_evaluation.csv",
)

# Read every file and add its 'category' column; categories are ranked
# separately within each file because each frame is categorized on its own.
df1, df2, df3, df4, df5, df6 = [
    categorize_n_topics(pd.read_csv(csv_path)) for csv_path in evaluation_csv_paths
]

In [4]:
# Stack the six per-dataset evaluation frames row-wise into one frame.
# concat keeps each frame's own row labels by default, so index values
# repeat across datasets in the combined frame.
evaluation_frames = [df1, df2, df3, df4, df5, df6]
df = pd.concat(evaluation_frames)

# Preview the first ten rows of the combined frame.
df.head(10)

Unnamed: 0,model_type,n_topics,alpha_lda,coherence_u_mass,coherence_c_v,coherence_c_uci,coherence_c_npmi,perplexity,category
0,lda,10,auto,-1.647771,0.431769,-0.403714,0.000311,254.612658,a
1,lda,15,auto,-1.691589,0.425324,-0.329512,0.005311,285.102388,b
2,lda,20,auto,-2.084296,0.450647,-0.600277,0.009207,302.912619,c
3,lda,25,auto,-1.98001,0.425205,-0.466475,0.003768,329.199261,d
4,lda,30,auto,-2.358449,0.439588,-0.678974,0.005376,357.273727,e
5,lsi,10,,-2.456908,0.399068,-0.921959,-0.006494,-1.0,a
6,lsi,15,,-2.302925,0.404467,-0.796444,-0.005401,-1.0,b
7,lsi,20,,-2.532389,0.449883,-0.867506,0.006636,-1.0,c
8,lsi,25,,-2.575373,0.388208,-1.101582,-0.018556,-1.0,d
9,lsi,30,,-2.551562,0.39124,-1.025485,-0.012735,-1.0,e


In [5]:
# Mean c_v coherence for every (model_type, category) pair, averaged over all
# rows of the combined frame `df`.
c_v_by_group = df.groupby(['model_type', 'category'])['coherence_c_v']
df_coherence_c_v = c_v_by_group.mean().reset_index()

# Preview the summary table (at most 15 rows).
df_coherence_c_v.head(15)

Unnamed: 0,model_type,category,coherence_c_v
0,lda,a,0.373511
1,lda,b,0.372081
2,lda,c,0.372714
3,lda,d,0.367659
4,lda,e,0.365466
5,lsi,a,0.428277
6,lsi,b,0.411283
7,lsi,c,0.381732
8,lsi,d,0.34411
9,lsi,e,0.337499


In [6]:
# Save the c_v summary table. index=True (the pandas default, spelled out
# here) also writes the 0..n-1 row index as an unnamed first column.
df_coherence_c_v.to_csv("coherence_c_v.csv", index=True)

In [7]:
# Mean u_mass coherence for every (model_type, category) pair, averaged over
# all rows of the combined frame `df`.
mean_u_mass = df.groupby(['model_type', 'category'])['coherence_u_mass'].mean()

# Store -1/mean rather than the raw mean: for a negative mean this gives a
# positive score that grows as the mean gets closer to zero.
# NOTE(review): a group mean of exactly 0 would produce inf here - confirm
# that cannot occur in the evaluation data.
df_coherence_u_mass = (-1 / mean_u_mass).reset_index()

# Preview the summary table (at most 15 rows).
df_coherence_u_mass.head(15)

Unnamed: 0,model_type,category,coherence_u_mass
0,lda,a,0.638052
1,lda,b,0.574506
2,lda,c,0.352882
3,lda,d,0.473312
4,lda,e,0.413598
5,lsi,a,0.525474
6,lsi,b,0.463121
7,lsi,c,0.336224
8,lsi,d,0.409444
9,lsi,e,0.399022


In [8]:
# Save the transformed u_mass summary table. index=True (the pandas default,
# spelled out here) also writes the 0..n-1 row index as an unnamed first column.
df_coherence_u_mass.to_csv("coherence_u_mass.csv", index=True)