In [1]:
import pandas as pd 
import wandb
from tqdm import tqdm
import requests
import os
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn import metrics
import matplotlib.pyplot as plt
import scipy.stats as stats
import matplotlib.colors as mcolors

In [2]:
from read_wandb import wandb_results
api = wandb_results("NLP2024_PROJECT_207897091_322720103", wandb_username="noa-levi")

BASE_METRIC = "accuracy_per_mean_user_and_bot"

In [3]:
df = api.get_sweeps_results(["emo2h460"], metric=BASE_METRIC) 

config_cols = [c for c in df.columns if "config_" in c and c!="config_wandb_run_id" and c!="config_online_simulation_size"]
HPT_cols = [col for col in config_cols if df[col].nunique() > 1 and col != "config_expert_reliability_path"]
print(config_cols)
print(HPT_cols)


Total number of sweeps: 1
Download sweep_id='emo2h460' data...


100%|██████████| 80/80 [00:01<00:00, 41.16it/s]


OSError: Cannot save file into a non-existent directory: 'sweeps_csvs'

In [8]:
df[HPT_cols]

Unnamed: 0,config_seed,config_basic_nature,config_truth_margin_type,config_reliability_threshold
0,5,19,random,8
1,5,19,constant,8
2,4,19,random,8
3,4,19,constant,8
4,3,19,random,8
...,...,...,...,...
85,3,17,constant,6
86,2,17,random,6
87,2,17,constant,6
88,1,17,random,6


In [13]:
numeric_cols = df.select_dtypes(include=np.number).columns
df_numeric = df[list(numeric_cols) + [col for col in HPT_cols if col not in numeric_cols]]

grouped = df_numeric.groupby([c for c in HPT_cols if c != "config_seed"])

mean_df = grouped.mean()
std_df = grouped.std()

for col in config_cols:
    if col not in mean_df.columns:
        mean_df[col] = df[col]

best_col = mean_df[[c for c in mean_df.columns if (BASE_METRIC in c and BASE_METRIC[-4:] == c.split("_epoch")[0][-4:])]].idxmax(axis=1)

best_col

config_basic_nature  config_truth_margin_type  config_reliability_threshold
17                   constant                  6                                ENV_Test_accuracy_per_mean_user_and_bot_epoch6
                                               7                                ENV_Test_accuracy_per_mean_user_and_bot_epoch3
                                               8                                ENV_Test_accuracy_per_mean_user_and_bot_epoch3
                     random                    6                               ENV_Test_accuracy_per_mean_user_and_bot_epoch12
                                               7                                ENV_Test_accuracy_per_mean_user_and_bot_epoch9
                                               8                               ENV_Test_accuracy_per_mean_user_and_bot_epoch12
18                   constant                  6                               ENV_Test_accuracy_per_mean_user_and_bot_epoch11
                                   

In [20]:
def result_metric(sweeps, group_name, drop_list=[0], drop_HPT=False, metric=BASE_METRIC, epoch="best"):
    df = api.get_sweeps_results(sweeps, metric=metric) 

    config_cols = [c for c in df.columns if "config_" in c and c!="config_wandb_run_id" and c!="config_online_simulation_size"]
    HPT_cols = [col for col in config_cols if df[col].nunique() > 1 and col != "config_expert_reliability_path"]
    print(HPT_cols)
    if drop_HPT:
        df=df.drop([c for c in HPT_cols if not c in ["config_LLM_SIM_SIZE", "config_seed"]], axis=1)
        HPT_cols = ["config_LLM_SIM_SIZE", "config_seed"]
    
    # Remove non-numeric columns before computing mean and std
    numeric_cols = df.select_dtypes(include=np.number).columns
    df_numeric = df[list(numeric_cols) + [col for col in HPT_cols if col not in numeric_cols]]

    grouped = df_numeric.groupby([c for c in HPT_cols if c != "config_seed"])

    mean_df = grouped.mean()
    std_df = grouped.std()
    
    # Re-add non-numeric columns before computing best_col
    for col in config_cols:
        if col not in mean_df.columns:
            mean_df[col] = df[col]

    if epoch=="best":
        best_col = mean_df[[c for c in mean_df.columns if (metric in c and metric[-4:] == c.split("_epoch")[0][-4:])]].idxmax(axis=1)
    else:
        best_col = mean_df[[c for c in mean_df.columns if f"{metric}_epoch{epoch}" in c]].idxmax(axis=1)
    
    result = grouped.apply(lambda x: x[best_col.loc[x.name]].values)
    means = grouped.apply(lambda x: x[best_col.loc[x.name]].mean())
    stds = grouped.apply(lambda x: x[best_col.loc[x.name]].std())


    df_cols = {'mean': means, 'std': stds, 'values': result.values}
    if epoch == "best": df_cols['epoch'] = best_col.apply(lambda x: int(x.split("epoch")[1]) if "epoch" in x else "last")

    df_cols['CI'] = result.apply(lambda x: bootstrap_ci(x))

    summary_df = pd.DataFrame(df_cols, index=best_col.index)

    ##############################################################################################
    summary_df['CI_length'] = summary_df['CI'].apply(lambda x: x[1] - x[0])
    summary_df['Min_Max'] = summary_df['values'].apply(lambda x: (max(x), min(x)))
    ##############################################################################################

    for d in drop_list:
        if d in summary_df.index:
            summary_df=summary_df.drop(d)
    if len(summary_df.index.names) == 1:
        return summary_df.rename_axis(group_name)
    else:
        return summary_df

def bootstrap_ci(data, n_bootstrap=1000, ci=0.95):
    bootstrapped_means = []
    for _ in range(n_bootstrap):
        sample = np.random.choice(data, size=len(data), replace=True)
        bootstrapped_means.append(np.mean(sample))
    lower_bound = np.percentile(bootstrapped_means, (1 - ci) / 2 * 100)
    upper_bound = np.percentile(bootstrapped_means, (1 + ci) / 2 * 100)
    return lower_bound, upper_bound


# For HyperParameterTuning

For every configuration that you test in the sweep, you will receive in the table the average, standard deviation, all the values obtained for the different seed values, and also the confidence interval within which the result is located at a confidence level of 95%.

When epoch="best" is defined, you can check in which epoch the best result is obtained. If epoch=5 is defined, you will receive the result obtained for epoch number 5.

You can test multiple sweeps simultaneously by entering them into the list found in the first element of the function result_metric.

In [28]:
# Create the directory if it doesn't exist
directory = 'sweeps_csvs'
if not os.path.exists(directory):
    os.makedirs(directory)

HPT_results_df = result_metric(["2twzh8vq"], "option", drop_HPT=False, epoch="best")
HPT_results_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,mean,std,values,epoch,CI,CI_length,Min_Max
config_basic_nature,config_truth_margin_type,config_reliability_threshold,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
17,constant,6,0.794732,0.00464,"[0.8023727908157532, 0.7925199849190669, 0.792...",6,"(0.7917418913948676, 0.7985280382286175)",0.006786,"(0.8023727908157532, 0.790454288894145)"
17,constant,7,0.79587,0.008608,"[0.8062167349693287, 0.7921460575778261, 0.802...",3,"(0.7894474394624009, 0.802598654911652)",0.013151,"(0.8062167349693287, 0.7843165546503027)"
17,constant,8,0.791285,0.009769,"[0.7929908464419881, 0.786502977272025, 0.8066...",3,"(0.7842055867580097, 0.7992628640914253)",0.015057,"(0.8066947875070776, 0.780511370025345)"
17,random,6,0.794468,0.004165,"[0.7942746153814207, 0.7899814649173569, 0.801...",12,"(0.7916898114942839, 0.7981270999704113)",0.006437,"(0.8012722614582204, 0.7899814649173569)"
17,random,7,0.794812,0.009999,"[0.8025933136822736, 0.7855209317317611, 0.806...",9,"(0.7869164030772569, 0.8026863419650615)",0.01577,"(0.8069770221829002, 0.7845721143559714)"
17,random,8,0.791226,0.008316,"[0.7966144206626143, 0.781897911165404, 0.7930...",12,"(0.7847603651082119, 0.7969721646251393)",0.012212,"(0.8011049869697098, 0.781897911165404)"
18,constant,6,0.796967,0.005942,"[0.8009055147881209, 0.7921887938408031, 0.804...",11,"(0.7923576579280868, 0.8015754753425176)",0.009218,"(0.8043600883269144, 0.7900322657381553)"
18,constant,7,0.792664,0.005866,"[0.7909618315484146, 0.7916888928066996, 0.801...",18,"(0.7881600564717288, 0.7974812182935246)",0.009321,"(0.8015851223708361, 0.7853769091098419)"
18,constant,8,0.795432,0.011932,"[0.7889268595554985, 0.7888217405263205, 0.810...",12,"(0.7865454855130991, 0.8042980814412207)",0.017753,"(0.8102791855052849, 0.783131102993267)"
18,random,6,0.799666,0.006243,"[0.8023617376059781, 0.791867099606318, 0.8030...",18,"(0.7948228993992298, 0.8043447873205551)",0.009522,"(0.8066739568840647, 0.791867099606318)"


In [38]:
# sorted(HPT_results_df['mean'], reverse=True)

df_reset = HPT_results_df.copy().reset_index()

new_df = df_reset[['config_basic_nature', 'config_truth_margin_type', 'config_reliability_threshold', 'mean', 'std', 'CI_length']]
new_df = new_df[new_df['config_basic_nature'].isin([18, 19])]
new_df.sort_values(by='mean', ignore_index=True, ascending=False)


Unnamed: 0,config_basic_nature,config_truth_margin_type,config_reliability_threshold,mean,std,CI_length
0,18,random,6,0.799666,0.006243,0.009522
1,19,constant,7,0.799067,0.010508,0.015667
2,19,constant,6,0.798749,0.009063,0.013877
3,18,random,7,0.798691,0.00671,0.010582
4,19,constant,8,0.797543,0.008053,0.011967
5,19,random,7,0.797532,0.008642,0.013403
6,18,constant,6,0.796967,0.005942,0.009218
7,19,random,8,0.796805,0.012582,0.019132
8,19,random,6,0.796472,0.008178,0.012078
9,18,constant,8,0.795432,0.011932,0.017753


In [None]:
test_results_df = result_metric(["050g0cug"], "option", drop_HPT=False, epoch="best")
test_results_df

# Result for a specific epoch

In [8]:
sweep_results = result_metric(["kb9be58j"], "LLMs", drop_HPT=False, epoch=10)
sweep_results

Total number of sweeps: 1
Download sweep_id='kb9be58j' data...
['config_seed', 'config_features', 'config_input_dim', 'config_REVIEW_DIM', 'config_FEATURES_PATH', 'config_online_simulation_factor']


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,mean,std,values,CI
config_features,config_input_dim,config_REVIEW_DIM,config_FEATURES_PATH,config_online_simulation_factor,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
BERT,49,36,data/BERT_PCA_36.csv,0,0.791324,0.00413,"[0.7913288142079798, 0.7964738446311359, 0.794...","(0.7883271049308528, 0.7945211780625762)"
BERT,49,36,data/BERT_PCA_36.csv,4,0.787746,0.002023,"[0.7853011998466384, 0.7880448817556057, 0.790...","(0.7861906690533982, 0.7893304571102691)"
EFs,50,37,data/EFs_by_GPT35.csv,0,0.795179,0.004638,"[0.788716651726378, 0.7969832115134915, 0.7940...","(0.7914422896969608, 0.7990928008535583)"
EFs,50,37,data/EFs_by_GPT35.csv,4,0.804494,0.004319,"[0.8113387801279366, 0.8016887599672099, 0.802...","(0.8015579276967888, 0.80837621364379)"
GPT4,49,36,data/GPT4_PCA_36.csv,0,0.792382,0.006579,"[0.7926385318762957, 0.8021579175889718, 0.783...","(0.7874851948109957, 0.7979366627582867)"
GPT4,49,36,data/GPT4_PCA_36.csv,4,0.789859,0.001764,"[0.7906707039977815, 0.7882686273661637, 0.789...","(0.7885350982715412, 0.7912191251647598)"
