In [1]:
import os
# Must be set before any CUDA-backed library is imported further down.
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
import ast
import json
import numpy as np
import pandas as pd
from utils import get_cosine_similarity_metrics, get_relevant_ratio, list_s3_prefix, get_file_from_s3
from transformers import AutoTokenizer, AutoModel

# Collect the distinct American Trends Panel folder names found under human_resp/.
surveys = set()
for path in list_s3_prefix("human_resp/"):
    if path.startswith("human_resp/American_Trends_Panel"):
        # The folder name is the second path component.
        surveys.add(path.split("/")[1])
surveys = sorted(surveys)

# The .npy file stores a pickled Python object (hence allow_pickle=True and .item()).
# NOTE(review): unpickling executes arbitrary code -- only load this from a trusted bucket.
question_topic_mapping = np.load(get_file_from_s3('human_resp/topic_mapping.npy'), allow_pickle=True).item()

# survey name -> set of 'fg' topics over all questions listed in that survey's info.csv
survey_topics_mapping = {}
for survey in surveys:
    info_df = pd.read_csv(get_file_from_s3(f"human_resp/{survey}/info.csv"))

    fg_topics = set()
    for question, references in zip(info_df['question'], info_df['references']):
        # `references` is the textual repr of a sequence of strings. It is parsed
        # with ast.literal_eval instead of eval so that CSV content coming from S3
        # can never be executed as code.
        ref = f"[{'/'.join(ast.literal_eval(references))}]"
        # Lookup key format: "<question> [<opt1>/<opt2>/...]"
        key = ' '.join((question, ref))
        # Only the 'fg' topics are kept; the 'cg' list was accumulated but never read.
        fg_topics.update(question_topic_mapping[key]['fg'])

    survey_topics_mapping[survey] = fg_topics
survey_topics_mapping.keys()

dict_keys(['American_Trends_Panel_W26', 'American_Trends_Panel_W27', 'American_Trends_Panel_W29', 'American_Trends_Panel_W32', 'American_Trends_Panel_W34', 'American_Trends_Panel_W36', 'American_Trends_Panel_W41', 'American_Trends_Panel_W42', 'American_Trends_Panel_W43', 'American_Trends_Panel_W45', 'American_Trends_Panel_W49', 'American_Trends_Panel_W50', 'American_Trends_Panel_W54', 'American_Trends_Panel_W82', 'American_Trends_Panel_W92'])

In [2]:
# Spot-check: show the topic set stored for one survey (American_Trends_Panel_W26).
print(survey_topics_mapping['American_Trends_Panel_W26'])

{'crime/security: terrorism', 'crime/security: justice system', 'crime/security: crime', 'community health', 'healthcare system: healthcare system other', 'personal finance', 'job/career', 'crime/security: guns'}


In [3]:

def _mean_of(records, key):
    """Return the arithmetic mean of ``record[key]`` over ``records``; NaN if ``records`` is empty."""
    if not records:
        return float('nan')
    return sum(record[key] for record in records) / len(records)


def get_exp(exp, eval_metrics=False):
    """Summarise the pipeline log of one experiment.

    Reads ``sm_local/outputs_{exp}/loggings.json`` and averages the per-entry
    statistics stored under its ``extraction``, ``clustering``, ``summarizing``
    and ``cleaning`` keys.

    Args:
        exp: Experiment name; selects the ``sm_local/outputs_{exp}`` directory.
        eval_metrics: When true, additionally call
            ``get_cosine_similarity_metrics`` and ``get_relevant_ratio`` on the
            output directory. That path reads the notebook-level ``tokenizer``,
            ``model`` and ``survey_topics_mapping`` variables, so they must be
            defined before the call. When false, the diversity and relevance
            entries are the placeholder value 0.

    Returns:
        dict with the keys ``avg_ext_*``, ``avg_cluster_*``, ``avg_sum_*``,
        ``avg_clean_*``, ``diversity_percentile_metrics_{low,mid,high}`` and
        ``relevant_ratio``. An average over an empty log section is NaN rather
        than a ``ZeroDivisionError``.

    Raises:
        FileNotFoundError: if the log file does not exist.
        KeyError: if a log section or a per-entry field is missing.
    """
    root_dir = f'sm_local/outputs_{exp}'

    # Parsed log of the run.
    with open(f"{root_dir}/loggings.json", 'r') as f:
        loggings = json.load(f)

    extraction = loggings['extraction']
    # The valid ratio is weighted by the number of questions each entry used.
    total_questions = sum(entry['num_questions_used'] for entry in extraction)
    if total_questions:
        avg_ext_valid = sum(entry['valid_ratio'] * entry['num_questions_used'] for entry in extraction) / total_questions
    else:
        avg_ext_valid = float('nan')
    avg_ext_num_persona = _mean_of(extraction, 'total_num_personas_extracted')
    avg_ext_time = _mean_of(extraction, 'extraction_time')

    clustering = loggings['clustering']
    avg_cluster_time = _mean_of(clustering, 'clustering_time')

    summarizing = loggings['summarizing']
    avg_cluster_generated = _mean_of(summarizing, 'num_of_clusters')
    avg_sum_persona_generated = _mean_of(summarizing, 'num_of_personas')
    avg_sum_valid = _mean_of(summarizing, 'valid_ratio')
    avg_sum_time = _mean_of(summarizing, 'summarizing_time')

    cleaning = loggings['cleaning']
    # Mean of the per-entry success flag, i.e. the fraction of successful entries.
    avg_clean_valid = _mean_of(cleaning, 'is_successful')
    avg_clean_num_persona = _mean_of(cleaning, 'num_final_personas')
    avg_clean_time = _mean_of(cleaning, 'cleaning_time')

    # Optional evaluation metrics; placeholders are reported when disabled.
    diversity_percentile_metrics = {'low': 0, 'mid': 0, 'high': 0}
    relevant_ratio = 0
    if eval_metrics:
        diversity_percentile_metrics = get_cosine_similarity_metrics(root_dir, tokenizer, model)
        relevant_ratio = get_relevant_ratio(root_dir, survey_topics_mapping)

    res = {
        "avg_ext_num_persona": avg_ext_num_persona,
        "avg_ext_valid": avg_ext_valid,
        "avg_ext_time": avg_ext_time,
        "avg_cluster_generated": avg_cluster_generated,
        "avg_cluster_time": avg_cluster_time,
        "avg_sum_persona_generated": avg_sum_persona_generated,
        "avg_sum_valid": avg_sum_valid,
        "avg_sum_time": avg_sum_time,
        "avg_clean_valid": avg_clean_valid,
        "avg_clean_num_persona": avg_clean_num_persona,
        "avg_clean_time": avg_clean_time,
        "diversity_percentile_metrics_low": diversity_percentile_metrics["low"],
        "diversity_percentile_metrics_mid": diversity_percentile_metrics["mid"],
        "diversity_percentile_metrics_high": diversity_percentile_metrics["high"],
        "relevant_ratio": relevant_ratio
    }
    return res

In [4]:
EVAL_METRICS=True
if EVAL_METRICS:
    # Embedding model handed to get_cosine_similarity_metrics by get_exp.
    tokenizer = AutoTokenizer.from_pretrained('Salesforce/SFR-Embedding-2_R')
    model = AutoModel.from_pretrained('Salesforce/SFR-Embedding-2_R', device_map='auto')

# One row per experiment variant:
# (clustering algorithm, cluster count or None, survey grouping, extraction type, exp-name suffix)
EXPERIMENT_GRID = [
    ('kmeans', 10, 'single', 'example', ''),
    ('kmeans', 10, 'single', 'description', ''),
    ('kmeans', 20, 'single', 'example', ''),
    ('gmm', 10, 'single', 'example', ''),
    ('dbscan', None, 'single', 'example', '_test'),
    ('kmeans', 10, 'same_topic', 'example', ''),
]

# Keys of the get_exp result, in the column order of the printed report.
REPORT_FIELDS = [
    'avg_ext_valid', 'avg_ext_num_persona', 'avg_ext_time',
    'avg_cluster_generated', 'avg_cluster_time',
    'avg_sum_valid', 'avg_sum_persona_generated', 'avg_sum_time',
    'avg_clean_valid', 'avg_clean_num_persona', 'avg_clean_time',
    'diversity_percentile_metrics_low', 'diversity_percentile_metrics_mid',
    'diversity_percentile_metrics_high', 'relevant_ratio',
]

# Print one ';'-separated line per (llm, variant) pair.
for llm in ['haiku', 'sonnet']:
    for algorithm, n_clusters, survey, extraction_type, suffix in EXPERIMENT_GRID:
        # A variant without a cluster count has no number in its directory name
        # and is reported with -1 in the cluster-count column.
        count_in_name = '' if n_clusters is None else n_clusters
        count_in_report = -1 if n_clusters is None else n_clusters
        exp = f'{llm}_{algorithm}{count_in_name}_{survey}_{extraction_type}{suffix}'
        res = get_exp(exp, eval_metrics=EVAL_METRICS)
        columns = [llm, algorithm, count_in_report, survey, extraction_type]
        columns += [res[field] for field in REPORT_FIELDS]
        print(';'.join(f"{value}" for value in columns))



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 13/13 [02:08<00:00,  9.91s/it]


haiku;kmeans;10;single;example;1.0;737.6;383.64743700027464;10.0;81.11092033386231;0.9600000000000002;41.46666666666667;29.767044099171958;0.8666666666666667;33.0;35.05690800348918;0.672620415687561;0.7050308585166931;0.7606400847434998;0.9838383838383838


100%|██████████| 14/14 [01:13<00:00,  5.25s/it]


haiku;kmeans;10;single;description;1.0;290.0;183.5846312046051;10.0;37.716484785079956;1.0;26.2;18.240769831339517;0.9333333333333333;21.4;19.837557538350424;0.6895028054714203;0.7286495089530944;0.7700246155261994;0.9813084112149533


 20%|██        | 2/10 [00:17<01:08,  8.60s/it]


KeyboardInterrupt: 

In [9]:
def get_cleaning_stats(exp):
    """Return ``(entry_count, success_fraction)`` for the cleaning stage of ``exp``.

    The figures come from the ``cleaning`` list inside
    ``sm_local/outputs_{exp}/loggings.json``; the fraction is the sum of the
    entries' ``is_successful`` flags divided by the number of entries.
    """
    log_path = f"sm_local/outputs_{exp}/loggings.json"
    with open(log_path, 'r') as log_file:
        parsed_log = json.load(log_file)

    cleaning_entries = parsed_log['cleaning']
    entry_count = len(cleaning_entries)

    # Tally the success flags one entry at a time.
    successes = 0
    for entry in cleaning_entries:
        successes += entry['is_successful']

    return entry_count, successes / entry_count


# Experiment-name templates to report on; `{llm}` is filled in per model below.
CLEANING_EXP_TEMPLATES = [
    '{llm}_kmeans10_single_example',
    '{llm}_kmeans10_single_description',
    '{llm}_kmeans20_single_example',
    '{llm}_gmm10_single_example',
    '{llm}_dbscan_single_example_test',
    '{llm}_kmeans10_same_topic_example',
]

# Print the number of cleaning entries and the success fraction for every experiment.
for llm in ['haiku', 'sonnet']:
    for template in CLEANING_EXP_TEMPLATES:
        exp = template.format(llm=llm)
        clean_len, success_rate = get_cleaning_stats(exp)
        print(f"{exp}: {clean_len},\n {success_rate}")
    

haiku_kmeans10_single_example: 15,
 0.8666666666666667
haiku_kmeans10_single_description: 15,
 0.9333333333333333
haiku_kmeans20_single_example: 15,
 0.6666666666666666
haiku_gmm10_single_example: 15,
 0.8
haiku_dbscan_single_example_test: 15,
 0.9333333333333333
haiku_kmeans10_same_topic_example: 3,
 0.6666666666666666
sonnet_kmeans10_single_example: 15,
 0.8666666666666667
sonnet_kmeans10_single_description: 15,
 0.9333333333333333
sonnet_kmeans20_single_example: 15,
 0.8666666666666667
sonnet_gmm10_single_example: 15,
 0.8666666666666667
sonnet_dbscan_single_example_test: 15,
 1.0
sonnet_kmeans10_same_topic_example: 3,
 1.0


haiku_kmeans10_single_example: 15, 0.8666666666666667
haiku_kmeans10_single_description: 15, 0.9333333333333333
haiku_kmeans20_single_example: 15, 0.6666666666666666
haiku_gmm10_single_example: 15, 0.8
haiku_dbscan_single_example_test: 15, 0.9333333333333333
haiku_kmeans10_same_topic_example: 3, 0.6666666666666666
sonnet_kmeans10_single_example: 15, 0.8666666666666667
sonnet_kmeans10_single_description: 15, 0.9333333333333333
sonnet_kmeans20_single_example: 15, 0.8666666666666667
sonnet_gmm10_single_example: 15, 0.8666666666666667
sonnet_dbscan_single_example_test: 15, 1.0
sonnet_kmeans10_same_topic_example: 3, 1.0


In [4]:
# import pandas as pd

# # get file path under sm_local
# model = 'haiku'
# for kmeans in ['10']:
#     for level in ['high','mid','low']:
#         file_name = f"sm_local/outputs_{model}_kmeans{kmeans}_single_example/cleaned/cleaned_{level}_level_personas_American_Trends_Panel_W26.json"
#         df = pd.read_json(file_name)
#         df.to_csv(f'{model}_kmeans_{kmeans}_single_example_W26_{level}.csv')

In [5]:
# # list all folders in 'pump/sm_local'

# import os

# folders = [_ for _ in os.listdir('sm_local') if _.startswith('outputs') and 'meanshift' not in _]
# folders

In [6]:
# for config in folders:
#     cleaned_dir = f"sm_local/{config}/cleaned"
#     print(cleaned_dir)

#     for filename in os.listdir(cleaned_dir):
#         exp = filename[:-5]
#         os.makedirs(f"sm_local/results/{config}/", exist_ok=True)
#         df = pd.read_json(f"sm_local/{config}/cleaned/{exp}.json")
#         df.to_csv(f"sm_local/results/{config}/{exp}.csv")
#         df.to_json(f"sm_local/results/{config}/{exp}.json", indent=4, orient="records")

# Result Variance

In [77]:
import json
import os
import pandas as pd

import numpy as np  # only needed by the commented-out aggregate columns below

# A result file is included only if its name contains every one of these tags.
REQUIRED_TAGS = ['v7', '0804']
OUTPUT_DIR = 'opinions_qa/output'

# experiment label -> ["=<correct>/<total>", accuracy formatted to 4 decimals]
res = {}
for filename in os.listdir(OUTPUT_DIR):
    if not all(tag in filename for tag in REQUIRED_TAGS):
        continue
    # The label is the filename without its first 9 and last 5 characters
    # (presumably a fixed-width prefix and the ".json" extension -- TODO confirm).
    exp = filename[9:][:-5]

    with open(f"{OUTPUT_DIR}/{filename}", 'r') as f:
        data = json.load(f)
    # Count the correct answers once and reuse the figure for both cells.
    num_correct = sum(item['is_correct'] for item in data)
    res[exp] = [f"={num_correct}/{len(data)}", f"{num_correct/len(data):.4f}"]
    # res[exp] = [f"{sum([_['is_correct'] for _ in data])/len(data):.4f}"]

# One column per experiment, ordered by label, then flipped so each experiment is a row.
df = pd.DataFrame(res)
df = df[sorted(df.columns)]

df = df.transpose()

# df['avg'] = df.apply(lambda x: np.mean(x[[0,1,2]]), axis=1)
# df['var'] = df.apply(lambda x: np.var(x[[0,1,2]]), axis=1)
# df['std'] = df.apply(lambda x: np.std(x[[0,1,2]]), axis=1)

df

Unnamed: 0,0,1
v7_vanilla_demo_persona_alllevels_allpersonas_namedescvalue_sonnetinfer_American_Trends_Panel_W26_run2,=604/1090,0.5541
v7_vanilla_no_history_demo_persona_alllevels_allpersonas_namedescvalue_sonnetinfer_American_Trends_Panel_W26,=547/1090,0.5018
v7_vanilla_no_history_persona_alllevels_allpersonas_namedescvalue_sonnetinfer_American_Trends_Panel_W26,=532/1090,0.4881
v7_vanilla_persona_alllevels_allpersonas_namedesccandvalue_sonnetinfer_American_Trends_Panel_W26,=604/1090,0.5541
v7_vanilla_persona_alllevels_allpersonas_namedescvalue_sonnetinfer_American_Trends_Panel_W26,=605/1090,0.555
