In [1]:
import utils
import pandas as pd

# Evaluated model; its name is also a directory level inside `file_path` below.
model_name = "mixtral-8x7b-instruct-v01"

# Exactly one `file_path` assignment is active; the commented-out lines hold alternative paths.
# file_path = f"eval_outputs/{model_name}/[RESORT_baselines]/ITER/+cons_principle_instruct"
# file_path = f"eval_outputs/{model_name}/[seed=Principle_Instruct]/[resort_human_eval_30]-generated_refined_responses"
# file_path = f"eval_outputs/{model_name}/[seed=RESORT_constitutions]/[resort_human_eval_30]-generated_refined_responses"
file_path = f"eval_outputs/{model_name}/[seed=1_RESORT_constitutions_per_dimension]/[resort_human_eval_30]-generated_refined_responses"
# file_path = f"eval_outputs/{model_name}/[seed=none]/[resort_human_eval_30]-generated_refined_responses"


# Each criterion file is reduced to one row per ("Reddit ID", "appraisal_dimension_id") pair.
dedup_keys = ["Reddit ID", "appraisal_dimension_id"]

eval_1_standard_alignment = pd.read_json(
    f"../{file_path}/eval-1_standard_alignment.jsonl", lines=True
).drop_duplicates(subset=dedup_keys)
eval_2_empathy = pd.read_json(
    f"../{file_path}/eval-2_empathy.jsonl", lines=True
).drop_duplicates(subset=dedup_keys)
eval_3_harmful = pd.read_json(
    f"../{file_path}/eval-3_harmful.jsonl", lines=True
).drop_duplicates(subset=dedup_keys)
eval_4_factuality = pd.read_json(
    f"../{file_path}/eval-4_factuality.jsonl", lines=True
).drop_duplicates(subset=dedup_keys)

# Sanity check: every criterion must contribute exactly 30 de-duplicated rows.
row_counts = [
    len(eval_1_standard_alignment),
    len(eval_2_empathy),
    len(eval_3_harmful),
    len(eval_4_factuality),
]
print (*row_counts)
assert all(count == 30 for count in row_counts)

30 30 30 30


In [2]:
# Collapse the `evaluation_raw_output-*` columns into a single `evaluation_raw_output` column:
# back-filling along each row moves the first non-null value into the leftmost column, which is
# then selected. The column assignment mutates each frame in place, so the module-level names
# see the new column.
# No sorting is done here: rebinding the loop variable would leave those names untouched, and the
# frames are sorted by the explicit reassignments in a later cell.
for df in [eval_1_standard_alignment, eval_2_empathy, eval_3_harmful, eval_4_factuality]:
    raw_output_cols = [col for col in df.columns if col.startswith('evaluation_raw_output-')]
    df['evaluation_raw_output'] = df[raw_output_cols].bfill(axis=1).iloc[:, 0]

In [3]:
# Turn each frame's raw evaluator output into a score using the extractor for its criterion.
# The new column is written onto the existing frames in place.
for scored_frame, extract_score in [
    (eval_1_standard_alignment, utils.extract_score_criterion_1_standard_alignment),
    (eval_2_empathy, utils.extract_score_criterion_2_empathy),
    (eval_3_harmful, utils.extract_score_criterion_3_harmful),
    (eval_4_factuality, utils.extract_score_criterion_4_factuality),
]:
    scored_frame["evaluation_raw_output_score"] = scored_frame["evaluation_raw_output"].apply(extract_score)

In [4]:
# Put every frame into the same row order (by post, then appraisal dimension) with a fresh
# 0..n-1 index, and rebind the four names to the sorted copies.
sort_keys = ["Reddit ID", "appraisal_dimension_id"]
(
    eval_1_standard_alignment,
    eval_2_empathy,
    eval_3_harmful,
    eval_4_factuality,
) = [
    unsorted_frame.sort_values(by=sort_keys).reset_index(drop=True)
    for unsorted_frame in (
        eval_1_standard_alignment,
        eval_2_empathy,
        eval_3_harmful,
        eval_4_factuality,
    )
]

In [5]:
from termcolor import colored

# Header: which model and which result directory the numbers below belong to.
print (colored(f"Model: {model_name}", "red"))
print (colored(f"File Path: {file_path}", "yellow"))

# One green line per criterion with the unrounded mean of its extracted scores.
for criterion_label, scored_frame in [
    ("Standard Alignment", eval_1_standard_alignment),
    ("Empathy", eval_2_empathy),
    ("Harmful", eval_3_harmful),
    ("Factuality", eval_4_factuality),
]:
    print (colored(f"{criterion_label}: {scored_frame['evaluation_raw_output_score'].mean()}", "green"))

[31mModel: mixtral-8x7b-instruct-v01[0m
[33mFile Path: eval_outputs/mixtral-8x7b-instruct-v01/[seed=1_RESORT_constitutions_per_dimension]/[resort_human_eval_30]-generated_refined_responses[0m
[32mStandard Alignment: 8.033333333333333[0m
[32mEmpathy: 4.766666666666667[0m
[32mHarmful: 0.0[0m
[32mFactuality: 0.9333333333333333[0m


## Measure significance

In [6]:
# Guard for "Run All": the cells below compare the frames loaded from `file_path` against the
# vanilla and self-refine baselines, so stop here when `file_path` itself names one of them.
# ValueError (a bad configuration value) is raised instead of SystemError, which Python reserves
# for interpreter-internal errors.
if "self-refine" in file_path or "vanilla" in file_path:
    raise ValueError("We don't evaluate the baseline models")

In [7]:
### ------ vanilla baseline
vanilla_baseline_path = f"eval_outputs/{model_name}/[RESORT_baselines]/INDV/vanilla"

# A row is identified by its (post, appraisal dimension) pair. All four criteria are de-duplicated
# on that key, matching how the method's own frames are loaded; a full-row comparison would keep
# two rows for one key whenever any other column differs.
vanilla_key_columns = ["Reddit ID", "appraisal_dimension_id"]

vanilla_baseline_eval_1_standard_alignment = pd.read_json(f"../{vanilla_baseline_path}/eval-1_standard_alignment.jsonl", lines=True).drop_duplicates(subset=vanilla_key_columns)
vanilla_baseline_eval_2_empathy = pd.read_json(f"../{vanilla_baseline_path}/eval-2_empathy.jsonl", lines=True).drop_duplicates(subset=vanilla_key_columns)
vanilla_baseline_eval_3_harmful = pd.read_json(f"../{vanilla_baseline_path}/eval-3_harmful.jsonl", lines=True).drop_duplicates(subset=vanilla_key_columns)
vanilla_baseline_eval_4_factuality = pd.read_json(f"../{vanilla_baseline_path}/eval-4_factuality.jsonl", lines=True).drop_duplicates(subset=vanilla_key_columns)

# Sanity check: every criterion must contribute exactly 30 de-duplicated rows.
print (len(vanilla_baseline_eval_1_standard_alignment), len(vanilla_baseline_eval_2_empathy), len(vanilla_baseline_eval_3_harmful), len(vanilla_baseline_eval_4_factuality))
assert len(vanilla_baseline_eval_1_standard_alignment) == len(vanilla_baseline_eval_2_empathy) == len(vanilla_baseline_eval_3_harmful) == len(vanilla_baseline_eval_4_factuality) == 30

# Collapse the `evaluation_raw_output-*` columns into one column holding the first non-null value
# of each row. The assignment mutates each frame in place. Sorting is left to the explicit
# reassignments at the end of this cell, because rebinding the loop variable would not affect
# the module-level names.
for df in [
    vanilla_baseline_eval_1_standard_alignment, vanilla_baseline_eval_2_empathy,
    vanilla_baseline_eval_3_harmful, vanilla_baseline_eval_4_factuality]:
    raw_output_cols = [col for col in df.columns if col.startswith('evaluation_raw_output-')]
    df['evaluation_raw_output'] = df[raw_output_cols].bfill(axis=1).iloc[:, 0]

# Extract a score from the raw evaluator output with the extractor for each criterion.
vanilla_baseline_eval_1_standard_alignment["evaluation_raw_output_score"] = vanilla_baseline_eval_1_standard_alignment["evaluation_raw_output"].apply(utils.extract_score_criterion_1_standard_alignment)
vanilla_baseline_eval_2_empathy["evaluation_raw_output_score"] = vanilla_baseline_eval_2_empathy["evaluation_raw_output"].apply(utils.extract_score_criterion_2_empathy)
vanilla_baseline_eval_3_harmful["evaluation_raw_output_score"] = vanilla_baseline_eval_3_harmful["evaluation_raw_output"].apply(utils.extract_score_criterion_3_harmful)
vanilla_baseline_eval_4_factuality["evaluation_raw_output_score"] = vanilla_baseline_eval_4_factuality["evaluation_raw_output"].apply(utils.extract_score_criterion_4_factuality)

# Sort by the row key and reset the index so rows can later be compared position by position.
vanilla_baseline_eval_1_standard_alignment = vanilla_baseline_eval_1_standard_alignment.sort_values(by=vanilla_key_columns).reset_index(drop=True)
vanilla_baseline_eval_2_empathy = vanilla_baseline_eval_2_empathy.sort_values(by=vanilla_key_columns).reset_index(drop=True)
vanilla_baseline_eval_3_harmful = vanilla_baseline_eval_3_harmful.sort_values(by=vanilla_key_columns).reset_index(drop=True)
vanilla_baseline_eval_4_factuality = vanilla_baseline_eval_4_factuality.sort_values(by=vanilla_key_columns).reset_index(drop=True)

30 30 30 30


In [8]:
### ------ self refine baseline
self_refine_baseline_path = f"eval_outputs/{model_name}/[RESORT_baselines]/ITER/self-refine"

# A row is identified by its (post, appraisal dimension) pair. All four criteria are de-duplicated
# on that key, matching how the method's own frames are loaded; a full-row comparison would keep
# two rows for one key whenever any other column differs.
self_refine_key_columns = ["Reddit ID", "appraisal_dimension_id"]

self_refine_baseline_eval_1_standard_alignment = pd.read_json(f"../{self_refine_baseline_path}/eval-1_standard_alignment.jsonl", lines=True).drop_duplicates(subset=self_refine_key_columns)
self_refine_baseline_eval_2_empathy = pd.read_json(f"../{self_refine_baseline_path}/eval-2_empathy.jsonl", lines=True).drop_duplicates(subset=self_refine_key_columns)
self_refine_baseline_eval_3_harmful = pd.read_json(f"../{self_refine_baseline_path}/eval-3_harmful.jsonl", lines=True).drop_duplicates(subset=self_refine_key_columns)
self_refine_baseline_eval_4_factuality = pd.read_json(f"../{self_refine_baseline_path}/eval-4_factuality.jsonl", lines=True).drop_duplicates(subset=self_refine_key_columns)

# Sanity check: every criterion must contribute exactly 30 de-duplicated rows.
print (len(self_refine_baseline_eval_1_standard_alignment), len(self_refine_baseline_eval_2_empathy), len(self_refine_baseline_eval_3_harmful), len(self_refine_baseline_eval_4_factuality))
assert len(self_refine_baseline_eval_1_standard_alignment) == len(self_refine_baseline_eval_2_empathy) == len(self_refine_baseline_eval_3_harmful) == len(self_refine_baseline_eval_4_factuality) == 30

# Collapse the `evaluation_raw_output-*` columns into one column holding the first non-null value
# of each row. The assignment mutates each frame in place. Sorting is left to the explicit
# reassignments at the end of this cell, because rebinding the loop variable would not affect
# the module-level names.
for df in [
    self_refine_baseline_eval_1_standard_alignment, self_refine_baseline_eval_2_empathy,
    self_refine_baseline_eval_3_harmful, self_refine_baseline_eval_4_factuality]:
    raw_output_cols = [col for col in df.columns if col.startswith('evaluation_raw_output-')]
    df['evaluation_raw_output'] = df[raw_output_cols].bfill(axis=1).iloc[:, 0]

# Extract a score from the raw evaluator output with the extractor for each criterion.
self_refine_baseline_eval_1_standard_alignment["evaluation_raw_output_score"] = self_refine_baseline_eval_1_standard_alignment["evaluation_raw_output"].apply(utils.extract_score_criterion_1_standard_alignment)
self_refine_baseline_eval_2_empathy["evaluation_raw_output_score"] = self_refine_baseline_eval_2_empathy["evaluation_raw_output"].apply(utils.extract_score_criterion_2_empathy)
self_refine_baseline_eval_3_harmful["evaluation_raw_output_score"] = self_refine_baseline_eval_3_harmful["evaluation_raw_output"].apply(utils.extract_score_criterion_3_harmful)
self_refine_baseline_eval_4_factuality["evaluation_raw_output_score"] = self_refine_baseline_eval_4_factuality["evaluation_raw_output"].apply(utils.extract_score_criterion_4_factuality)

# Sort by the row key and reset the index so rows can later be compared position by position.
self_refine_baseline_eval_1_standard_alignment = self_refine_baseline_eval_1_standard_alignment.sort_values(by=self_refine_key_columns).reset_index(drop=True)
self_refine_baseline_eval_2_empathy = self_refine_baseline_eval_2_empathy.sort_values(by=self_refine_key_columns).reset_index(drop=True)
self_refine_baseline_eval_3_harmful = self_refine_baseline_eval_3_harmful.sort_values(by=self_refine_key_columns).reset_index(drop=True)
self_refine_baseline_eval_4_factuality = self_refine_baseline_eval_4_factuality.sort_values(by=self_refine_key_columns).reset_index(drop=True)

30 30 30 30


In [9]:
from scipy.stats import ttest_rel

# Header for the significance report: the model in red, the evaluated method's path in green.
for banner_text, banner_color in (
    (f"Model: {model_name}", "red"),
    (f"Method: {file_path}", "green"),
):
    print (colored(banner_text, banner_color))

def p_value_sign(p_value, sign="*", alpha=0.05):
    """Return a significance marker for a p-value.

    Args:
        p_value: The p-value to check.
        sign: Marker to return when the result is significant.
        alpha: Significance threshold; the comparison is strict (`p_value < alpha`).

    Returns:
        `sign` if `p_value` is strictly below `alpha`, otherwise an empty string.
        A NaN p-value compares False against the threshold and therefore yields "".
    """
    return sign if p_value < alpha else ""

# Paired t-tests of the evaluated method against each baseline, one printed line per criterion.
# Markers appended to the rounded mean: "*" when p_value_sign flags the comparison against the
# vanilla baseline, "^" when it flags the comparison against the self-refine baseline.
key_columns = ["Reddit ID", "appraisal_dimension_id"]
score_column = "evaluation_raw_output_score"

# (printed label, method frame, vanilla baseline frame, self-refine baseline frame)
criterion_comparisons = [
    ("1. Standard Alignment", eval_1_standard_alignment, vanilla_baseline_eval_1_standard_alignment, self_refine_baseline_eval_1_standard_alignment),
    ("2. Empathy", eval_2_empathy, vanilla_baseline_eval_2_empathy, self_refine_baseline_eval_2_empathy),
    ("3. Harmful", eval_3_harmful, vanilla_baseline_eval_3_harmful, self_refine_baseline_eval_3_harmful),
    ("4. Factuality", eval_4_factuality, vanilla_baseline_eval_4_factuality, self_refine_baseline_eval_4_factuality),
]

for label, method_df, vanilla_df, self_refine_df in criterion_comparisons:
    # ttest_rel pairs observations by position, so all three frames must list the same
    # (post, appraisal dimension) keys in the same order before their scores are compared.
    method_keys = method_df[key_columns]
    assert self_refine_df[key_columns].equals(method_keys), "The DataFrames are not equal."
    assert vanilla_df[key_columns].equals(method_keys), "The DataFrames are not equal."

    method_scores = method_df[score_column]
    # Only the p-value is needed; the test statistic is not used.
    p_vanilla = ttest_rel(method_scores, vanilla_df[score_column]).pvalue
    p_self_refine = ttest_rel(method_scores, self_refine_df[score_column]).pvalue
    # A NaN p-value compares False inside p_value_sign, so no marker is printed for it.
    print (f"{label}: {round(method_scores.mean(), 2)}" + p_value_sign(p_vanilla, "*") + p_value_sign(p_self_refine, "^"))

[31mModel: mixtral-8x7b-instruct-v01[0m
[32mMethod: eval_outputs/mixtral-8x7b-instruct-v01/[seed=1_RESORT_constitutions_per_dimension]/[resort_human_eval_30]-generated_refined_responses[0m
1. Standard Alignment: 8.03*^
2. Empathy: 4.77*^
3. Harmful: 0.0
4. Factuality: 0.93^
