In [58]:
import pandas as pd 
import wandb
from tqdm import tqdm
import requests
import os
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn import metrics
import matplotlib.pyplot as plt
import scipy.stats as stats
import matplotlib.colors as mcolors

In [59]:
from read_wandb import wandb_results
api = wandb_results("NLP2024_PROJECT_207897091_322720103", wandb_username="noa-levi")

BASE_METRIC = "accuracy_per_mean_user_and_bot"
sweep_id = "5b4snpv8"

In [60]:
def result_metric(sweeps, group_name, drop_list=[0], drop_HPT=False, metric=BASE_METRIC, epoch="best"):
    df = api.get_sweeps_results(sweeps, metric=metric) 

    config_cols = [c for c in df.columns if "config_" in c and c!="config_wandb_run_id" and c!="config_online_simulation_size"]
    HPT_cols = [col for col in config_cols if df[col].nunique() > 1 and col != "config_expert_reliability_path"]
    print(HPT_cols)
    if drop_HPT:
        df=df.drop([c for c in HPT_cols if not c in ["config_LLM_SIM_SIZE", "config_seed"]], axis=1)
        HPT_cols = ["config_LLM_SIM_SIZE", "config_seed"]
    
    # Remove non-numeric columns before computing mean and std
    numeric_cols = df.select_dtypes(include=np.number).columns
    df_numeric = df[list(numeric_cols) + [col for col in HPT_cols if col not in numeric_cols]]

    grouped = df_numeric.groupby([c for c in HPT_cols if c != "config_seed"])

    mean_df = grouped.mean()
    std_df = grouped.std()
    
    # Re-add non-numeric columns before computing best_col
    for col in config_cols:
        if col not in mean_df.columns:
            mean_df[col] = df[col]

    if epoch=="best":
        best_col = mean_df[[c for c in mean_df.columns if (metric in c and metric[-4:] == c.split("_epoch")[0][-4:])]].idxmax(axis=1)
    else:
        best_col = mean_df[[c for c in mean_df.columns if f"{metric}_epoch{epoch}" in c]].idxmax(axis=1)
    
    result = grouped.apply(lambda x: x[best_col.loc[x.name]].values)
    means = grouped.apply(lambda x: x[best_col.loc[x.name]].mean())
    stds = grouped.apply(lambda x: x[best_col.loc[x.name]].std())


    df_cols = {'mean': means, 'std': stds, 'values': result.values}
    if epoch == "best": df_cols['epoch'] = best_col.apply(lambda x: int(x.split("epoch")[1]) if "epoch" in x else "last")

    df_cols['CI'] = result.apply(lambda x: bootstrap_ci(x))

    summary_df = pd.DataFrame(df_cols, index=best_col.index)

    ##############################################################################################
    summary_df['CI_length'] = summary_df['CI'].apply(lambda x: x[1] - x[0])
    summary_df['Min_Max'] = summary_df['values'].apply(lambda x: (max(x), min(x)))
    ##############################################################################################

    for d in drop_list:
        if d in summary_df.index:
            summary_df=summary_df.drop(d)
    if len(summary_df.index.names) == 1:
        return summary_df.rename_axis(group_name)
    else:
        return summary_df

def bootstrap_ci(data, n_bootstrap=1000, ci=0.95):
    bootstrapped_means = []
    for _ in range(n_bootstrap):
        sample = np.random.choice(data, size=len(data), replace=True)
        bootstrapped_means.append(np.mean(sample))
    lower_bound = np.percentile(bootstrapped_means, (1 - ci) / 2 * 100)
    upper_bound = np.percentile(bootstrapped_means, (1 + ci) / 2 * 100)
    return lower_bound, upper_bound


# Create the directory if it doesn't exist
directory = 'sweeps_csvs'
if not os.path.exists(directory):
    os.makedirs(directory)

HPT_results_df = result_metric([sweep_id], "option", drop_HPT=False, epoch="best")
HPT_results_df


Total number of sweeps: 1
Download sweep_id='5b4snpv8' data...


['config_seed', 'config_basic_nature']


Unnamed: 0_level_0,mean,std,values,epoch,CI,CI_length,Min_Max
option,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
21,0.833078,0.003191,"[0.83598019938932, 0.832545889016663, 0.830392...",16,"(0.8305528838936489, 0.8356035344570991)",0.005051,"(0.836755692245096, 0.8297169948684581)"
30,0.834124,0.001149,"[0.8338939209224127, 0.8332705109967075, 0.833...",8,"(0.8333200927097224, 0.8351071248154778)",0.001787,"(0.8359459785937888, 0.8330827603163922)"
34,0.835844,0.001082,"[0.8353927249580638, 0.8366281573954447, 0.837...",7,"(0.8349914504558905, 0.8367337235167442)",0.001742,"(0.837076539698503, 0.8343121233086479)"


In [61]:
df = api.get_sweeps_results([sweep_id], metric="accuracy_per_mean_user_and_bot")
df.columns

Total number of sweeps: 1
Download sweep_id='5b4snpv8' data...


Index(['name', 'config_seed', 'config_task', 'config_agent', 'config_layers',
       'config_dropout', 'config_features', 'config_input_dim',
       'config_max_games', 'config_REVIEW_DIM',
       ...
       'ENV_Test_accuracy_per_mean_user_and_bot_epoch21',
       'ENV_Test_Probability to choose the right action_epoch5',
       'ENV_Train_Weighted probability to choose the right action:_epoch21',
       'ENV_Test_accuracy_strategy_23_epoch15',
       'ENV_Test_accuracy_strategy_93_epoch15',
       'ENV_Test_Weighted right action_epoch24',
       'ENV_Online Simulation_Right action_epoch15',
       'ENV_Test_TotalLoss_epoch1',
       'ENV_Online Simulation_Weighted probability to choose the right action:_epoch4',
       '_wandb.runtime'],
      dtype='object', length=954)

In [62]:
epoch_acc = [f'ENV_Test_accuracy_per_mean_user_and_bot_epoch{i}' for i in range(20)]
cols_to_keep = epoch_acc + ['config_online_simulation_factor']
df = df[cols_to_keep]

In [63]:
result = df.groupby('config_online_simulation_factor').mean().reset_index()
result = result[epoch_acc]
result

Unnamed: 0,ENV_Test_accuracy_per_mean_user_and_bot_epoch0,ENV_Test_accuracy_per_mean_user_and_bot_epoch1,ENV_Test_accuracy_per_mean_user_and_bot_epoch2,ENV_Test_accuracy_per_mean_user_and_bot_epoch3,ENV_Test_accuracy_per_mean_user_and_bot_epoch4,ENV_Test_accuracy_per_mean_user_and_bot_epoch5,ENV_Test_accuracy_per_mean_user_and_bot_epoch6,ENV_Test_accuracy_per_mean_user_and_bot_epoch7,ENV_Test_accuracy_per_mean_user_and_bot_epoch8,ENV_Test_accuracy_per_mean_user_and_bot_epoch9,ENV_Test_accuracy_per_mean_user_and_bot_epoch10,ENV_Test_accuracy_per_mean_user_and_bot_epoch11,ENV_Test_accuracy_per_mean_user_and_bot_epoch12,ENV_Test_accuracy_per_mean_user_and_bot_epoch13,ENV_Test_accuracy_per_mean_user_and_bot_epoch14,ENV_Test_accuracy_per_mean_user_and_bot_epoch15,ENV_Test_accuracy_per_mean_user_and_bot_epoch16,ENV_Test_accuracy_per_mean_user_and_bot_epoch17,ENV_Test_accuracy_per_mean_user_and_bot_epoch18,ENV_Test_accuracy_per_mean_user_and_bot_epoch19
0,0.823235,0.827054,0.829843,0.831069,0.831783,0.831242,0.832187,0.833371,0.832795,0.833014,0.832363,0.833242,0.832658,0.8329,0.832261,0.83316,0.832671,0.832739,0.832959,0.831521


In [68]:
half_ratio = result.iloc[0]
if len(result) >= 2:
    one_ratio = result.iloc[1]
else:
    one_ratio = None
two_ratio = result.iloc[2]
four_ratio = result.iloc[3]
x_axis = range(20)
plt.plot(x_axis, half_ratio, label='0.5', marker='^')
plt.plot(x_axis, one_ratio, label='1', marker='o')
plt.plot(x_axis, two_ratio, label='2', marker='v')
plt.plot(x_axis, four_ratio, label='4', marker='s')
plt.title(f'enter title here')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

IndexError: single positional indexer is out-of-bounds