In [None]:
import os
import json
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Loading files

In [None]:
root_dir = 'simulations'
# os.listdir returns entries in arbitrary order; sort them so that the order
# of `dataframes` (and of every table derived from it) is reproducible.
simulation_folders = sorted(os.listdir(root_dir))
dataframes = {}

for folder in simulation_folders:
    folder_path = os.path.join(root_dir, folder)

    # Only sub-directories are treated as simulations; stray files are skipped.
    if not os.path.isdir(folder_path):
        continue

    # Sorted for the same reason as above: stable row order inside each frame.
    json_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.json'))

    metrics = []
    for json_file in json_files:
        json_file_path = os.path.join(folder_path, json_file)

        # Read explicitly as UTF-8 instead of relying on the platform's
        # locale encoding, which differs between operating systems.
        with open(json_file_path, 'r', encoding='utf-8') as f:
            json_data = json.load(f)

        items = json_data['data']['items']

        # One row per JSON file: metric title -> value.
        # The 'Time to discovery' item is deliberately left out.
        metric_data = {item['title']: item['value'] for item in items if item['title'] != 'Time to discovery'}
        metrics.append(metric_data)

    # One DataFrame per simulation folder, keyed by the folder name.
    df = pd.DataFrame(metrics)
    dataframes[folder] = df

# Access the DataFrame for a specific simulation folder like this:
# dataframes['Appenzeller-Herzog_2019_-m_logistic_-e_tfidf']

In [None]:
def average_metrics(df):
    """Collapse every metric column of ``df`` into a flat ``{name: mean}`` dict.

    Numeric columns are averaged directly and stored under the column name.

    Object columns are expected to hold, per row, a list of ``[key, value]``
    entries. Values are grouped by ``key`` across all rows, averaged, and
    stored under ``"<column> <key>"``.

    Rows in which an object-column metric is missing (None/NaN) or is an
    empty list are ignored instead of raising ``TypeError``.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per run, one column per metric.

    Returns
    -------
    dict
        Metric name -> mean value, rounded to 2 decimals.
    """
    metric_averages = {}
    for column in df.columns:
        if df[column].dtype == "object":
            # First dropna(): rows where this metric is absent altogether.
            # Second dropna(): explode() emits NaN for rows holding an empty
            # list, and NaN cannot be indexed like a [key, value] entry.
            metric_list = df[column].dropna().explode().dropna().tolist()
            metric_dict = {}
            for metric in metric_list:
                metric_dict.setdefault(metric[0], []).append(metric[1])
            for key, values in metric_dict.items():
                new_col_name = f"{column} {key}"
                metric_averages[new_col_name] = round(sum(values) / len(values), 2)
        else:
            metric_averages[column] = round(df[column].mean(), 2)
    return metric_averages

    
# split up the name into categories
def split_name(name):
    """Split a simulation folder name into (dataset, model, embedding).

    The expected layout is ``<dataset>_-m_<model>_-e_<embedding>``, e.g.
    ``Appenzeller-Herzog_2019_-m_logistic_-e_tfidf``.

    The name is cut on the full ``_-m_`` and ``_-e_`` delimiters, and the
    ``_-e_`` delimiter is only searched for after the model flag. A dataset
    name that itself contains ``-m`` or ``-e`` (for instance a hyphenated
    author name) is therefore still parsed correctly.

    Parameters
    ----------
    name : str
        Simulation folder name.

    Returns
    -------
    tuple of str
        ``(dataset, model, embedding)``.

    Raises
    ------
    ValueError
        If ``name`` does not contain both delimiters in that order.
    """
    dataset, model_flag, remainder = name.partition("_-m_")
    model, embedding_flag, embedding = remainder.partition("_-e_")
    # str.partition returns an empty separator when the delimiter is absent.
    if not model_flag or not embedding_flag:
        raise ValueError(
            f"Cannot parse simulation name {name!r}: "
            "expected '<dataset>_-m_<model>_-e_<embedding>'"
        )
    return dataset, model, embedding

In [None]:
# Build one summary record per simulation: the three components parsed from
# the folder name, followed by that simulation's averaged metrics.
result_list = []
for sim_name, sim_frame in dataframes.items():
    dataset, model, embedding = split_name(sim_name)
    record = {'dataset': dataset, 'model': model, 'fe': embedding}
    record.update(average_metrics(sim_frame))
    result_list.append(record)

df = pd.DataFrame(result_list)

# Persist the same records to disk as JSON.
with open("results.json", "w") as out_file:
    json.dump(result_list, out_file)

In [None]:
# Add integer-coded copies of the model and feature-extraction columns.
# The single encoder is re-fitted for each column, so after this cell `le`
# only remembers the classes of the last column ('fe').
le = LabelEncoder()
for source_col, encoded_col in (('model', 'model_encoded'), ('fe', 'fe_encoded')):
    df[encoded_col] = le.fit_transform(df[source_col])

# Data Science part

In [None]:
# Display the summary table: one row per simulation folder.
df

In [None]:
# Mean of every numeric column per feature-extraction method.
# numeric_only=True is required on pandas >= 2.0: the frame still contains
# string columns ('dataset', 'model'), and averaging those raises TypeError.
df.groupby("fe").mean(numeric_only=True)

In [None]:
# Mean of every numeric column per model.
# numeric_only=True is required on pandas >= 2.0: the frame still contains
# string columns ('dataset', 'fe'), and averaging those raises TypeError.
df.groupby("model").mean(numeric_only=True)

In [None]:
# Per-dataset means, best "Work Saved over Sampling 0.95" first.
# numeric_only=True is required on pandas >= 2.0: the frame still contains
# string columns ('model', 'fe'), and averaging those raises TypeError.
(
    df.groupby("dataset")
    .mean(numeric_only=True)
    .sort_values(by="Work Saved over Sampling 0.95", ascending=False)
)

In [None]:
# All summary rows belonging to the Chou_2003 dataset.
df.loc[df["dataset"].eq("Chou_2003")]

In [None]:
# Bos_2018 only: mean of every numeric column per model.
# numeric_only=True is required on pandas >= 2.0: the string columns
# ('dataset', 'fe') cannot be averaged and would raise TypeError.
df[df["dataset"] == "Bos_2018"].groupby("model").mean(numeric_only=True)

In [None]:
# Bos_2018 only: mean of every numeric column per feature-extraction method.
# numeric_only=True is required on pandas >= 2.0: the string columns
# ('dataset', 'model') cannot be averaged and would raise TypeError.
df[df["dataset"] == "Bos_2018"].groupby("fe").mean(numeric_only=True)

In [None]:
# Correlate the integer-coded model / feature-extraction columns with the
# 'Recall 0.1' metric (Series.corr, default method).
# NOTE(review): the integer codes come from LabelEncoder and presumably carry
# no meaningful order, so the size and sign of these coefficients should be
# interpreted with care -- confirm this is the intended analysis.
recall_col = df['Recall 0.1']

# correlation between the encoded model choice and Recall 0.1
model_corr = df['model_encoded'].corr(recall_col)
# correlation between the encoded feature-extraction choice and Recall 0.1
fe_corr = df['fe_encoded'].corr(recall_col)

print('Correlation coefficient for model selection:', model_corr)
print('Correlation coefficient for feature engineering:', fe_corr)