In [9]:
import pandas as pd 
import wandb
from tqdm import tqdm
import requests
import os
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn import metrics
import matplotlib.pyplot as plt
import scipy.stats as stats
import matplotlib.colors as mcolors

In [10]:
from read_wandb import wandb_results
api = wandb_results("NLP2024_PROJECT_207897091_322720103", wandb_username="noa-levi")

BASE_METRIC = "accuracy_per_mean_user_and_bot"

In [11]:
directory = 'sweeps_csvs'
if not os.path.exists(directory):
    os.makedirs(directory)

In [12]:
df = api.get_sweeps_results(["crl4evai"], metric=BASE_METRIC) 

config_cols = [c for c in df.columns if "config_" in c and c!="config_wandb_run_id" and c!="config_online_simulation_size"]
HPT_cols = [col for col in config_cols if df[col].nunique() > 1 and col != "config_expert_reliability_path"]
print(config_cols)
print(HPT_cols)


Total number of sweeps: 1
Download sweep_id='crl4evai' data...


100%|██████████| 107/107 [00:02<00:00, 43.84it/s]

['config_seed', 'config_task', 'config_agent', 'config_layers', 'config_dropout', 'config_features', 'config_input_dim', 'config_max_games', 'config_REVIEW_DIM', 'config_hidden_dim', 'config_output_dim', 'config_strategies', 'config_force_train', 'config_ENV_HPT_mode', 'config_architecture', 'config_basic_nature', 'config_total_epochs', 'config_FEATURES_PATH', 'config_bots_per_user', 'config_save_artifacts', 'config_zero_knowledge', 'config_online_sim_type', 'config_use_user_vector', 'config_human_train_size', 'config_loss_weight_type', 'config_ENV_LEARNING_RATE', 'config_personas_balanced', 'config_transformer_nheads', 'config_SIMULATION_EFs_PATH', 'config_OFFLINE_SIM_DATA_PATH', 'config_favorite_topic_method', 'config_personas_group_number', 'config_offline_simulation_size', 'config_simulation_bot_per_user', 'config_simulation_signal_error', 'config_simulation_user_improve', 'config_online_simulation_factor']
['config_seed', 'config_basic_nature', 'config_online_simulation_factor']





In [13]:
df[HPT_cols]

Unnamed: 0,config_seed,config_basic_nature,config_online_simulation_factor
0,2,27,4
1,1,27,4
2,5,27,0
3,4,27,0
4,3,27,0
...,...,...,...
102,5,17,0
103,4,17,0
104,3,17,0
105,2,17,0


In [14]:
numeric_cols = df.select_dtypes(include=np.number).columns
df_numeric = df[list(numeric_cols) + [col for col in HPT_cols if col not in numeric_cols]]

grouped = df_numeric.groupby([c for c in HPT_cols if c != "config_seed"])

mean_df = grouped.mean()
std_df = grouped.std()

for col in config_cols:
    if col not in mean_df.columns:
        mean_df[col] = df[col]

best_col = mean_df[[c for c in mean_df.columns if (BASE_METRIC in c and BASE_METRIC[-4:] == c.split("_epoch")[0][-4:])]].idxmax(axis=1)

best_col

config_basic_nature  config_online_simulation_factor
17                   0                                   ENV_Test_accuracy_per_mean_user_and_bot_epoch5
                     4                                  ENV_Test_accuracy_per_mean_user_and_bot_epoch13
18                   0                                   ENV_Test_accuracy_per_mean_user_and_bot_epoch5
                     4                                   ENV_Test_accuracy_per_mean_user_and_bot_epoch7
19                   0                                   ENV_Test_accuracy_per_mean_user_and_bot_epoch5
                     4                                   ENV_Test_accuracy_per_mean_user_and_bot_epoch5
20                   0                                   ENV_Test_accuracy_per_mean_user_and_bot_epoch5
                     4                                   ENV_Test_accuracy_per_mean_user_and_bot_epoch9
21                   0                                   ENV_Test_accuracy_per_mean_user_and_bot_epoch5
           

In [15]:
def result_metric(sweeps, group_name, drop_list=[0], drop_HPT=False, metric=BASE_METRIC, epoch="best"):
    df = api.get_sweeps_results(sweeps, metric=metric) 

    config_cols = [c for c in df.columns if "config_" in c and c!="config_wandb_run_id" and c!="config_online_simulation_size"]
    HPT_cols = [col for col in config_cols if df[col].nunique() > 1 and col != "config_expert_reliability_path"]
    print(HPT_cols)
    if drop_HPT:
        df=df.drop([c for c in HPT_cols if not c in ["config_LLM_SIM_SIZE", "config_seed"]], axis=1)
        HPT_cols = ["config_LLM_SIM_SIZE", "config_seed"]
    
    # Remove non-numeric columns before computing mean and std
    numeric_cols = df.select_dtypes(include=np.number).columns
    df_numeric = df[list(numeric_cols) + [col for col in HPT_cols if col not in numeric_cols]]

    grouped = df_numeric.groupby([c for c in HPT_cols if c != "config_seed"])

    mean_df = grouped.mean()
    std_df = grouped.std()
    
    # Re-add non-numeric columns before computing best_col
    for col in config_cols:
        if col not in mean_df.columns:
            mean_df[col] = df[col]

    if epoch=="best":
        best_col = mean_df[[c for c in mean_df.columns if (metric in c and metric[-4:] == c.split("_epoch")[0][-4:])]].idxmax(axis=1)
    else:
        best_col = mean_df[[c for c in mean_df.columns if f"{metric}_epoch{epoch}" in c]].idxmax(axis=1)
    
    result = grouped.apply(lambda x: x[best_col.loc[x.name]].values)
    means = grouped.apply(lambda x: x[best_col.loc[x.name]].mean())
    stds = grouped.apply(lambda x: x[best_col.loc[x.name]].std())


    df_cols = {'mean': means, 'std': stds, 'values': result.values}
    if epoch == "best": df_cols['epoch'] = best_col.apply(lambda x: int(x.split("epoch")[1]) if "epoch" in x else "last")

    df_cols['CI'] = result.apply(lambda x: bootstrap_ci(x))

    summary_df = pd.DataFrame(df_cols, index=best_col.index)

    ##############################################################################################
    summary_df['CI_length'] = summary_df['CI'].apply(lambda x: x[1] - x[0])
    summary_df['Min_Max'] = summary_df['values'].apply(lambda x: (max(x), min(x)))
    ##############################################################################################

    for d in drop_list:
        if d in summary_df.index:
            summary_df=summary_df.drop(d)
    if len(summary_df.index.names) == 1:
        return summary_df.rename_axis(group_name)
    else:
        return summary_df

def bootstrap_ci(data, n_bootstrap=1000, ci=0.95):
    bootstrapped_means = []
    for _ in range(n_bootstrap):
        sample = np.random.choice(data, size=len(data), replace=True)
        bootstrapped_means.append(np.mean(sample))
    lower_bound = np.percentile(bootstrapped_means, (1 - ci) / 2 * 100)
    upper_bound = np.percentile(bootstrapped_means, (1 + ci) / 2 * 100)
    return lower_bound, upper_bound


# For HyperParameterTuning

For every configuration that you test in the sweep, you will receive in the table the average, standard deviation, all the values obtained for the different seed values, and also the confidence interval within which the result is located at a confidence level of 95%.

When epoch="best" is defined, you can check in which epoch the best result is obtained. If epoch=5 is defined, you will receive the result obtained for epoch number 5.

You can test multiple sweeps simultaneously by entering them into the list found in the first element of the function result_metric.

In [16]:
# Create the directory if it doesn't exist
directory = 'sweeps_csvs'
if not os.path.exists(directory):
    os.makedirs(directory)

HPT_results_df = result_metric(["ovva21dw"], "option", drop_HPT=False, epoch="best")
HPT_results_df

Total number of sweeps: 1
Download sweep_id='ovva21dw' data...


['config_seed', 'config_basic_nature', 'config_online_simulation_factor']


Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,values,epoch,CI,CI_length,Min_Max
config_basic_nature,config_online_simulation_factor,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
17,0,0.789159,0.010007,"[0.7929287973550296, 0.7818916404961866, 0.801...",5,"(0.7808092709481611, 0.7964406128113167)",0.015631,"(0.8016956662531911, 0.7764086389631965)"
17,4,0.796194,0.011693,"[0.8019694066805008, 0.7868901298140516, 0.810...",13,"(0.7873978439547618, 0.8049904166561334)",0.017593,"(0.810896158615232, 0.7819940037282526)"
18,0,0.789159,0.010007,"[0.7929287973550296, 0.7818916404961866, 0.801...",5,"(0.7814496250843695, 0.7964235953981135)",0.014974,"(0.8016956662531911, 0.7764086389631965)"
18,4,0.796089,0.01299,"[0.8030776889613664, 0.7842413216572973, 0.812...",7,"(0.7863019370433715, 0.806019258816541)",0.019717,"(0.8121422701810458, 0.7813968915327314)"
19,0,0.789159,0.010007,"[0.7929287973550296, 0.7818916404961866, 0.801...",5,"(0.7818810005943119, 0.7964116458819326)",0.014531,"(0.8016956662531911, 0.7764086389631965)"
19,4,0.794848,0.003705,"[0.7982036627297778, 0.7887782426621658, 0.795...",8,"(0.7915764459696831, 0.7972339972443427)",0.005658,"(0.7982036627297778, 0.7887782426621658)"
20,0,0.789159,0.010007,"[0.7929287973550296, 0.7818916404961866, 0.801...",5,"(0.7818939217385784, 0.7964235953981135)",0.01453,"(0.8016956662531911, 0.7764086389631965)"
20,4,0.797374,0.009449,"[0.8003521474502342, 0.7867783567635749, 0.810...",9,"(0.7905102040464633, 0.8060388101867273)",0.015529,"(0.8107795609368641, 0.7867783567635749)"
21,0,0.789159,0.010007,"[0.7929287973550296, 0.7818916404961866, 0.801...",5,"(0.7818939217385784, 0.7966382607951921)",0.014744,"(0.8016956662531911, 0.7764086389631965)"
21,4,0.797378,0.011228,"[0.8062198400532707, 0.7799454924675318, 0.806...",8,"(0.7879674158130631, 0.8052751881758126)",0.017308,"(0.806972399012572, 0.7799454924675318)"


In [17]:
# sorted(HPT_results_df['mean'], reverse=True)

df_reset = HPT_results_df.copy().reset_index()

new_df = df_reset[['config_basic_nature', 'mean', 'std', 'CI_length']]
new_df = new_df[new_df['config_basic_nature'].isin([17,18, 19,20,21,22,23,24, 25,26,27,28])]
new_df.sort_values(by='mean', ignore_index=True, ascending=False)


Unnamed: 0,config_basic_nature,mean,std,CI_length
0,27,0.799531,0.015058,0.021295
1,24,0.798344,0.011427,0.017875
2,26,0.797757,0.009352,0.013794
3,21,0.797378,0.011228,0.017308
4,20,0.797374,0.009449,0.015529
5,25,0.796537,0.011355,0.01777
6,17,0.796194,0.011693,0.017593
7,18,0.796089,0.01299,0.019717
8,19,0.794848,0.003705,0.005658
9,22,0.794805,0.004497,0.006725


In [18]:
test_results_df = result_metric(["crl4evai"], "option", drop_HPT=False, epoch="best")
test_results_df

Total number of sweeps: 1
Download sweep_id='crl4evai' data...
['config_seed', 'config_basic_nature', 'config_online_simulation_factor']


Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,values,epoch,CI,CI_length,Min_Max
config_basic_nature,config_online_simulation_factor,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
17,0,0.789159,0.010007,"[0.7929287973550296, 0.7818916404961866, 0.801...",5,"(0.7818939217385784, 0.7964119446198371)",0.014518,"(0.8016956662531911, 0.7764086389631965)"
17,4,0.796194,0.011693,"[0.8019694066805008, 0.7868901298140516, 0.810...",13,"(0.7873978439547618, 0.8049904166561334)",0.017593,"(0.810896158615232, 0.7819940037282526)"
18,0,0.789159,0.010007,"[0.7929287973550296, 0.7818916404961866, 0.801...",5,"(0.7808092709481611, 0.7964235953981135)",0.015614,"(0.8016956662531911, 0.7764086389631965)"
18,4,0.796089,0.01299,"[0.8030776889613664, 0.7842413216572973, 0.812...",7,"(0.7861726419314914, 0.8060053403124445)",0.019833,"(0.8121422701810458, 0.7813968915327314)"
19,0,0.789159,0.010007,"[0.7929287973550296, 0.7818916404961866, 0.801...",5,"(0.7807973214319804, 0.7964406128113167)",0.015643,"(0.8016956662531911, 0.7764086389631965)"
19,4,0.794267,0.005904,"[0.7965240328496532, 0.7904291665211234, 0.792...",5,"(0.7900641156099578, 0.7992622867184197)",0.009198,"(0.8033696675215694, 0.7884507116098983)"
20,0,0.789159,0.010007,"[0.7929287973550296, 0.7818916404961866, 0.801...",5,"(0.7814660444211954, 0.7964235953981135)",0.014958,"(0.8016956662531911, 0.7764086389631965)"
20,4,0.797374,0.009449,"[0.8003521474502342, 0.7867783567635749, 0.810...",9,"(0.7901369834956126, 0.8045231128448862)",0.014386,"(0.8107795609368641, 0.7867783567635749)"
21,0,0.789159,0.010007,"[0.7929287973550296, 0.7818916404961866, 0.801...",5,"(0.7808092709481611, 0.7964235953981135)",0.015614,"(0.8016956662531911, 0.7764086389631965)"
21,4,0.797378,0.011228,"[0.8062198400532707, 0.7799454924675318, 0.806...",8,"(0.7882501295045837, 0.8054219371728761)",0.017172,"(0.806972399012572, 0.7799454924675318)"


# Result for a specific epoch

In [19]:
sweep_results = result_metric(["5cq55yww"], "LLMs", drop_HPT=False, epoch=9)
sweep_results

Total number of sweeps: 1
Download sweep_id='5cq55yww' data...


['config_seed', 'config_basic_nature', 'config_online_simulation_factor']


Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,values,CI,CI_length,Min_Max
config_basic_nature,config_online_simulation_factor,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
25,0,0.787543,0.01109,"[0.7871779380666905, 0.7806213851453786, 0.806...","(0.7805279009522431, 0.796994618765841)",0.016467,"(0.8063335926191515, 0.7782800605163822)"
25,4,0.79442,0.008211,"[0.791596350880113, 0.7965605298326591, 0.8003...","(0.7875751960492146, 0.8005797073511378)",0.013005,"(0.8019937428220748, 0.7815947062796551)"
26,0,0.787543,0.01109,"[0.7871779380666905, 0.7806213851453786, 0.806...","(0.7805279009522431, 0.7973600202139047)",0.016832,"(0.8063335926191515, 0.7782800605163822)"
26,4,0.791804,0.008177,"[0.7948121342670303, 0.7835750277399327, 0.804...","(0.785869219389558, 0.7979093566718739)",0.01204,"(0.8044571613025505, 0.7835750277399327)"
27,0,0.787543,0.01109,"[0.7871779380666905, 0.7806213851453786, 0.806...","(0.7806190578068335, 0.7979217893662413)",0.017303,"(0.8063335926191515, 0.7782800605163822)"
27,4,0.791322,0.011624,"[0.8011903814003528, 0.7744913239002988, 0.801...","(0.781872375204576, 0.8000321374984735)",0.01816,"(0.8015505945219533, 0.7744913239002988)"
28,0,0.787543,0.01109,"[0.7871779380666905, 0.7806213851453786, 0.806...","(0.7801531302362803, 0.7969852494979419)",0.016832,"(0.8063335926191515, 0.7782800605163822)"
28,4,0.793883,0.004404,"[0.7982562824572048, 0.7898357035510667, 0.798...","(0.7901947760683602, 0.7973897637404075)",0.007195,"(0.7982562824572048, 0.7889286385504325)"
