In [1]:
import os
import json
import numpy as np
import pandas as pd
from utils import get_llm_response, get_file_from_s3, get_topics, list_s3_prefix
from tqdm import tqdm
from dataclasses import dataclass


@dataclass
class PersonaDimension:
    """One persona aspect that may influence how a person answers a survey question.

    The LLM responses handled in this notebook are written as lists of
    ``PersonaDimension(...)`` constructor calls, so the field names here must
    match the keyword arguments used in those responses.
    """

    name: str  # a concise name of the persona aspect
    description: str  # a detailed description of the persona aspect
    level: str  # the abstractness level of this persona dimension, choose from ["low", "mid", "high"]
    # The responses pass a list of strings here (e.g. ["optimistic", "pessimistic"]),
    # so the field is annotated as a list rather than a single string.
    candidate_values: list  # the candidate values of this persona dimension

# Load Data

In [2]:
# Collect the distinct survey folder names (second path component) of every
# S3 key that belongs to an American Trends Panel survey.
surveys = {
    key.split("/")[1]
    for key in list_s3_prefix("human_resp/")
    if key.startswith("human_resp/American_Trends_Panel")
}
# Keep only the first survey in sorted order.
surveys = sorted(surveys)[:1]
surveys

['American_Trends_Panel_W26']

In [3]:
# Load the question -> topics mapping. The .npy file holds a single pickled
# Python object, so allow_pickle=True is needed and .item() unwraps it.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# from a bucket you trust.
mapping = np.load(get_file_from_s3('human_resp/topic_mapping.npy'), allow_pickle=True).item()
mapping

{'How much of a problem, if any, would you say people saying things that are very offensive to others are in the country today? [Major problem/Minor problem/Not a problem/Refused]': {'fg': ['discrimination: other'],
  'cg': ['discrimination']},
 'How important is being black to how you think about yourself? [Extremely important/Very important/Moderately important/Only a little important/Not at all important/Refused]': {'fg': ['race',
   'self-perception and values'],
  'cg': ['race', 'self-perception and values']},
 'How important is being Hispanic to how you think about yourself? [Extremely important/Very important/Moderately important/Only a little important/Not at all important/Refused]': {'fg': ['race',
   'self-perception and values'],
  'cg': ['race', 'self-perception and values']},
 'How important is being white to how you think about yourself? [Extremely important/Very important/Moderately important/Only a little important/Not at all important/Refused]': {'fg': ['race',
   'sel

# Extraction: Get personas from questions

In [4]:
def get_personas_from_survey(info_df, survey, prompt_name, output_dir, print_result=False,
                             model_id='anthropic.claude-3-haiku-20240307-v1:0'):
    """Ask the LLM for persona dimensions for every question of one survey.

    For each row of ``info_df`` the template ``prompts/{prompt_name}.txt`` is
    filled with the question, its options and its fine-/coarse-grained topics,
    sent to the LLM, and the reply is checked by evaluating it as a Python
    expression. Topics are looked up in the notebook-level ``mapping``.

    Two JSON files in ``output_dir`` are rewritten after every row, so an
    interrupted run leaves the rows processed so far on disk:

    * ``personas_extracted_from_question_{survey}.json`` -- one record per row
    * ``logs_{survey}.json`` -- number of rows processed and ratio of valid replies

    Args:
        info_df: DataFrame with a ``question`` and a ``references`` column.
        survey: Survey name; only used to build the output file names.
        prompt_name: Base name (without ``.txt``) of the template in ``prompts/``.
        output_dir: Directory for the output files; created if missing.
        print_result: If true, print every reply that evaluated successfully.
        model_id: Model identifier forwarded to ``get_llm_response``.

    Returns:
        The list of per-row records, each a dict with the keys ``valid``,
        ``error_msg``, ``input_dict`` and ``response``.
    """
    with open(f'prompts/{prompt_name}.txt') as f:
        prompt_template = f.read()

    # Loop-invariant setup: create the output directory and build the paths once.
    os.makedirs(output_dir, exist_ok=True)
    personas_path = f'{output_dir}/personas_extracted_from_question_{survey}.json'
    logs_path = f'{output_dir}/logs_{survey}.json'

    res = []
    logs = {}
    valid_cnt = 0
    for _, row in tqdm(info_df.iterrows(), total=len(info_df)):
        topics = get_topics(mapping, row['question'])
        input_dict = {
            "topic_fg": topics['fg'],
            "topic_cg": topics['cg'],
            "question": row['question'],
            "options": row['references'],
        }
        prompt = prompt_template.format(**input_dict)
        # The reply is prefilled with '['; the bracket is prepended again here
        # (presumably the helper returns only the continuation -- TODO confirm).
        response = '[' + get_llm_response(prompt, model_id=model_id, prefill='[')

        error_msg = None
        try:
            # SECURITY(review): eval() runs whatever code the model returned.
            # It is kept because the replies are PersonaDimension(...) calls,
            # but it should only be used with trusted models/prompts.
            eval(response)
        except Exception as e:
            print(e)
            valid = False
            error_msg = str(e)
        else:
            valid = True
            valid_cnt += 1
            if print_result:
                print(response)

        res.append({
            'valid': valid,
            'error_msg': error_msg,
            'input_dict': str(input_dict),
            'response': response,
        })

        logs['res_len'] = len(res)
        logs['valid_ratio'] = valid_cnt / len(res)

        # Checkpoint after every row so partial results survive an interruption.
        with open(personas_path, 'w') as f:
            json.dump(res, f, indent=4)
        with open(logs_path, 'w') as f:
            json.dump(logs, f, indent=4)

    return res


In [6]:
# Run the persona extraction for every selected survey, reading each survey's
# question table (info.csv) from S3.
for survey in surveys:
    file_key = f"human_resp/{survey}/info.csv"
    info_df = pd.read_csv(get_file_from_s3(file_key))
    get_personas_from_survey(
        info_df,
        survey,
        prompt_name='get_personas_from_questions',
        output_dir='sm_local/outputs/extraction',
        print_result=True,
    )


  1%|▏         | 1/78 [00:04<05:45,  4.49s/it]

[
    PersonaDimension(
        name="Geographic Location",
        description="the specific area or neighborhood where the person lives and their perception of its safety from crime",
        level="low",
        candidate_values=["high-crime area", "low-crime area"]
    ),
    PersonaDimension(
        name="Personal Experiences",
        description="the person's past experiences with crime or safety issues in their local community",
        level="low",
        candidate_values=["experienced crime", "not experienced crime"]
    ),
    PersonaDimension(
        name="Risk Perception and Assessment",
        description="how the person evaluates the likelihood and potential impact of crime in their local community",
        level="mid",
        candidate_values=["high perception of risk", "low perception of risk"]
    ),
    PersonaDimension(
        name="Emotional State",
        description="the person's feelings of safety, fear, or concern about crime in their local community",


  3%|▎         | 2/78 [00:10<06:56,  5.48s/it]

[
    PersonaDimension(
        name="Perception of Crime and Safety",
        description="how the person perceives the overall level of crime and safety in the world compared to the past",
        level="mid",
        candidate_values=["perceives the world as safer", "perceives the world as more dangerous", "perceives the world as neither safer nor more dangerous"]
    ),
    PersonaDimension(
        name="Risk Assessment",
        description="the person's ability to evaluate and assess the risks and threats in their environment",
        level="mid",
        candidate_values=["high risk assessment", "low risk assessment"]
    ),
    PersonaDimension(
        name="Optimism/Pessimism",
        description="the person's overall outlook on the state of the world and its trajectory",
        level="high",
        candidate_values=["optimistic", "pessimistic"]
    ),
    PersonaDimension(
        name="Personal Experiences",
        description="the person's own experiences with crime 

  4%|▍         | 3/78 [00:15<06:29,  5.20s/it]

[
    PersonaDimension(
        name="Geographic Location",
        description="living in an area with a higher or lower risk of home burglaries",
        level="low",
        candidate_values=["high-crime area", "low-crime area"]
    ),
    PersonaDimension(
        name="Personal Experiences",
        description="having been a victim of a home burglary in the past",
        level="low",
        candidate_values=["experienced burglary", "not experienced burglary"]
    ),
    PersonaDimension(
        name="Home Security Measures",
        description="the level of security measures taken to protect one's home",
        level="low",
        candidate_values=["high security", "low security"]
    ),
    PersonaDimension(
        name="Risk Perception and Assessment",
        description="how they evaluate the likelihood and potential impact of a home burglary",
        level="mid",
        candidate_values=["high perception of risk", "low perception of risk"]
    ),
    PersonaDimensio

  4%|▍         | 3/78 [00:16<06:56,  5.55s/it]


KeyboardInterrupt: 