In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import json
import re
import copy

# NOTE(review): `%cd ..` is relative to the current working directory, so
# re-running this cell without restarting the kernel climbs one more level.
%cd ..
# Wildcard import: every public name of `utils` lands in the notebook
# namespace, so the origin of helper functions is not visible from here.
from utils import *

# Reload imported modules automatically before each cell is executed.
%load_ext autoreload 
%autoreload 2
# Render inline matplotlib figures at high-DPI ("retina") resolution.
%config InlineBackend.figure_format = 'retina'

/scratch2/jsalle/ObsScaling


In [6]:
# --- Base benchmarks (MMLU, GSM8K, etc.) -----------------------------------
instruct_llm_benchmark_eval = load_instruct_llm_benchmark_eval()

# --- Agent benchmarks ------------------------------------------------------
eval_result_path = "./eval_results/instruct_llm_agent_eval.csv"
agent_eval = pd.read_csv(eval_result_path)

# Inner join on the model name: only models present in both tables survive.
instruct_llm_eval_with_agent = pd.merge(
    left=instruct_llm_benchmark_eval,
    right=agent_eval,
    on="Model",
)
eval_with_agent_models = (
    instruct_llm_eval_with_agent['Model'].drop_duplicates().tolist()
)

# --- SAD benchmarks --------------------------------------------------------
# The SAD file names its key column 'model'; align it with the other tables.
sad_eval = pd.read_csv("./eval_results/sad_benchmark_results.csv")
sad_eval = sad_eval.rename(columns={'model': 'Model'})
sad_eval_models = sad_eval['Model'].drop_duplicates().tolist()

In [3]:
# Reshape the SAD results from long format (one row per Model/type/variant)
# into a wide table: one row per model, one "<type>_<variant>" column per
# (type, variant) pair, in order of first appearance in `sad_eval`.
sad_models = sad_eval["Model"].unique().tolist()
sad_types = sad_eval["type"].unique().tolist()
sad_variants = sad_eval["variant"].unique().tolist()

# Single pass over the table instead of re-filtering the whole frame for every
# (model, type, variant) triple. `setdefault` keeps the FIRST score seen for a
# triple, matching the previous `.values[0]` behaviour on duplicate rows.
sad_score_lookup = {}
for sad_key_and_score in zip(
    sad_eval["Model"], sad_eval["type"], sad_eval["variant"], sad_eval["score"]
):
    sad_score_lookup.setdefault(sad_key_and_score[:3], sad_key_and_score[3])

# Build plain records and construct the frame once. Appending with pd.concat
# inside the loop copied the accumulated frame on every iteration and started
# from an empty DataFrame, which recent pandas versions warn about.
sad_records = []
for model in sad_models:
    temp_dict = {"Model": model}
    for t in sad_types:
        for v in sad_variants:
            # A combination that was never evaluated becomes NaN instead of
            # aborting the whole reshape with an IndexError.
            temp_dict[f'{t}_{v}'] = sad_score_lookup.get((model, t, v), np.nan)
    sad_records.append(temp_dict)

sad_intermediary_df = pd.DataFrame(
    sad_records,
    columns=["Model"] + [f'{t}_{v}' for t in sad_types for v in sad_variants],
)

In [4]:
# Replace spaces in column labels with underscores (e.g. "Plain Prompt" ->
# "Plain_Prompt"); labels without a space are left alone. All renames are
# collected first and applied to the existing frame in a single call.
space_free_labels = {
    label: label.replace(' ', '_')
    for label in sad_intermediary_df.columns
    if ' ' in label
}
sad_intermediary_df.rename(columns=space_free_labels, inplace=True)

sad_intermediary_df

Unnamed: 0,Model,SAD_Plain_Prompt,SAD_Situating_Prompt,SAD-lite_Plain_Prompt,SAD-lite_Situating_Prompt,SAD-mini_Plain_Prompt,SAD-mini_Situating_Prompt,facts_Plain_Prompt,facts_Situating_Prompt,influence_Plain_Prompt,...,introspection_Plain_Prompt,introspection_Situating_Prompt,stages_Plain_Prompt,stages_Situating_Prompt,self-recognition_Plain_Prompt,self-recognition_Situating_Prompt,id-leverage_Plain_Prompt,id-leverage_Situating_Prompt,anti-imitation_Plain_Prompt,anti-imitation_Situating_Prompt
0,llama-2-7b,0.328841,0.306368,0.364873,0.340016,0.471816,0.418833,0.385027,0.360935,0.53125,...,0.379519,0.373241,0.374375,0.364375,0.5,0.5,0.038503,0.02406,0.093214,0.090714
1,llama-2-7b-chat,0.301525,0.313693,0.309089,0.331158,0.387865,0.452488,0.391385,0.441408,0.384375,...,0.215519,0.213593,0.347656,0.352656,0.5,0.5,0.255311,0.187394,0.016429,0.016429
2,llama-2-13b,0.323594,0.334734,0.353455,0.365446,0.42175,0.449539,0.376201,0.408362,0.45,...,0.350194,0.346815,0.355625,0.37625,0.5,0.5,0.034209,0.027695,0.198929,0.205893
3,llama-2-13b-chat,0.349043,0.376247,0.355007,0.390173,0.489223,0.552331,0.486278,0.541054,0.51875,...,0.227463,0.2205,0.396406,0.408281,0.5125,0.56875,0.286907,0.252642,0.015,0.0425
4,llama-2-70b,0.322603,0.351961,0.354197,0.391297,0.495903,0.585435,0.428984,0.50902,0.5625,...,0.229713,0.218074,0.380625,0.36625,0.500625,0.5,0.024703,0.024309,0.131071,0.133571
5,llama-2-70b-chat,0.365819,0.402261,0.374817,0.416498,0.509007,0.618683,0.505382,0.583412,0.55625,...,0.228065,0.228546,0.384375,0.423125,0.54125,0.52875,0.317556,0.328511,0.027857,0.032857
6,llama-3-70b-chat,0.451451,0.493406,0.452624,0.500807,0.60639,0.691563,0.565677,0.641054,0.653125,...,0.34813,0.358046,0.469062,0.484375,0.656875,0.736875,0.444964,0.439295,0.022321,0.022321
7,davinci-002,0.293974,0.300627,0.323768,0.331345,0.4135,0.429911,0.361013,0.386524,0.421875,...,0.239769,0.234178,0.366875,0.35625,0.505625,0.50625,0.008729,0.011011,0.153929,0.153929
8,gpt-3.5-turbo-0613,0.358278,0.384165,0.368927,0.405154,0.468703,0.546873,0.510929,0.572572,0.476562,...,0.237456,0.248857,0.39875,0.413906,0.510625,0.508125,0.306299,0.311186,0.067321,0.067321
9,gpt-4-base,0.367979,0.392918,0.40069,0.426456,0.571579,0.621953,0.481007,0.621154,0.621875,...,0.344156,0.350357,0.473125,0.433125,0.459375,0.444375,0.028103,0.03945,0.168214,0.168214


In [5]:
# Inner join of the base benchmark table with the SAD results on the model name.
# NOTE(review): this rebinds `instruct_llm_eval_with_agent`, which the loading
# cell bound to the agent-benchmark merge, and joins against the long-format
# `sad_eval` (one row per model/type/variant) rather than the wide
# `sad_intermediary_df` - confirm that both are intended.
instruct_llm_eval_with_agent = pd.merge(
    left=instruct_llm_benchmark_eval,
    right=sad_eval,
    on="Model",
    how="inner",
)

In [4]:
# Distinct model names in the merged table, in order of first appearance.
instruct_llm_eval_with_agent['Model'].drop_duplicates().tolist()

['gpt-4-0613',
 'claude-2.0',
 'claude-1.3',
 'gpt-3.5-turbo-0613',
 'claude-instant-1.1',
 'codellama-34b-instruct',
 'vicuna-13b-v1.5',
 'llama-2-70b-chat',
 'llama-2-13b-chat',
 'vicuna-33b-v1.3',
 'openchat-13b-v3.2',
 'wizardlm-13b-v1.2',
 'codellama-13b-instruct',
 'vicuna-7b-v1.5',
 'guanaco-65b',
 'codellama-7b-instruct',
 'wizardlm-30b-v1.0',
 'guanaco-33b',
 'koala-13b',
 'llama-2-7b-chat',
 'dolly-v2-12b',
 'oasst-sft-4-pythia-12b-epoch-3.5',
 'gpt-4-0314',
 'deepseek-llm-67b-chat',
 'lemur-70b-chat-v1',
 'mistral-7b-instruct-v0.1',
 'vicuna-13b-16k']

In [7]:
# Display the distinct model names found in the SAD results table.
sad_eval_models

['llama-2-7b',
 'llama-2-7b-chat',
 'llama-2-13b',
 'llama-2-13b-chat',
 'llama-2-70b',
 'llama-2-70b-chat',
 'llama-3-70b-chat',
 'davinci-002',
 'gpt-3.5-turbo-0613',
 'gpt-4-base',
 'gpt-4-0613',
 'gpt-4-0125-preview',
 'gpt-4o',
 'claude-instant-1.2',
 'claude-2.1',
 'claude-3-haiku',
 'claude-3-sonnet',
 'claude-3.5-sonnet',
 'claude-3-opus']

In [8]:
# Load the base-LLM benchmark table from CSV.
# NOTE(review): `eval_result_path` is rebound here; the loading cell pointed
# it at the agent-benchmark CSV.
eval_result_path = "./eval_results/base_llm_benchmark_eval.csv"
base_eval = pd.read_csv(filepath_or_buffer=eval_result_path)

In [9]:
# Distinct base-model identifiers, in order of first appearance.
base_eval['Model'].drop_duplicates().tolist()

['meta-llama/Llama-2-7b-hf',
 'meta-llama/Llama-2-13b-hf',
 'meta-llama/Llama-2-70b-hf',
 'huggyllama/llama-7b',
 'huggyllama/llama-13b',
 'huggyllama/llama-30b',
 'huggyllama/llama-65b',
 'meta-llama/Meta-Llama-3-70B',
 'meta-llama/Meta-Llama-3-8B',
 'Qwen/Qwen1.5-72B',
 'Qwen/Qwen1.5-32B',
 'Qwen/Qwen1.5-14B',
 'Qwen/Qwen1.5-7B',
 'Qwen/Qwen1.5-4B',
 'Qwen/Qwen1.5-1.8B',
 'Qwen/Qwen1.5-0.5B',
 'Qwen/Qwen-72B',
 'Qwen/Qwen-14B',
 'Qwen/Qwen-7B',
 'mistralai/Mistral-7B-v0.1',
 'mistralai/Mixtral-8x7B-v0.1',
 '01-ai/Yi-6B',
 '01-ai/Yi-34B',
 'google/gemma-7b',
 'google/gemma-2b',
 'tiiuae/falcon-180B',
 'tiiuae/falcon-40b',
 'tiiuae/falcon-7b',
 'tiiuae/falcon-rw-1b',
 'microsoft/phi-2',
 'microsoft/phi-1_5',
 'EleutherAI/pythia-1b-deduped',
 'EleutherAI/pythia-410m-deduped',
 'EleutherAI/pythia-6.9b-deduped',
 'EleutherAI/pythia-2.8b-deduped',
 'EleutherAI/pythia-12b-deduped',
 'EleutherAI/pythia-70m-deduped',
 'EleutherAI/pythia-1.4b-deduped',
 'EleutherAI/pythia-160m-deduped',
 'bigs