In [4]:
import os
os.environ['HF_HOME'] = '/mnt/sagemaker-nvme/cache'
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
import torch
import pandas as pd
from sklearn.cluster import KMeans
from utils import last_token_pool, get_detailed_instruct, get_llm_response
from transformers import AutoTokenizer, AutoModel


def merge_personas(exp_name):

    file_path = f'{exp_name}.csv'
    data = pd.read_csv(file_path)
    
    def get_formatted_persona_dim(row):
        task = 'Given a persona dimension description, retrieve semantically similar persona dimension descriptions.'
        persona = f"{row['name']}: {row['description']}. Candidate values: {row['candidate_values']}"
        return get_detailed_instruct(task, persona)
    
    data['formatted'] = data.apply(get_formatted_persona_dim, axis=1)
    
    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained('Salesforce/SFR-Embedding-2_R')
    model = AutoModel.from_pretrained('Salesforce/SFR-Embedding-2_R', device_map='auto')
    
    # Get the embeddings
    max_length = 4096
    input_texts = data['formatted'].to_list()
    batch_dict = tokenizer(input_texts, max_length=max_length, padding=True, truncation=True, return_tensors="pt")
    
    with torch.no_grad():
        outputs = model(**batch_dict)
        embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
        # print(embeddings)

    # Clustering
    num_clusters = 20
    clustering_model = KMeans(n_clusters=num_clusters)
    clustering_model.fit(embeddings)
    data['cluster'] = clustering_model.labels_
    data = data.sort_values(by='cluster')
    data.to_csv(f'{exp_name}_clustered.csv')

    for idx in range(num_clusters):
        print(idx)
        for _, row in enumerate(data[data['cluster'] == idx]['formatted']):
            print(row.split('\n')[1])
        print('\n\n')


In [None]:
merge_personas('low')

In [None]:
merge_personas('low_simple')

In [1]:
import os
os.environ['HF_HOME'] = '/mnt/sagemaker-nvme/cache'
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'

import torch
import pandas as pd
from sklearn.cluster import KMeans
from utils import last_token_pool, get_detailed_instruct, PersonaDimension
from transformers import AutoTokenizer, AutoModel

exp_name = "low"
file_path = f'{exp_name}.csv'
data = pd.read_csv(file_path)


def get_formatted_persona_dim(row):
    task = 'Given a persona dimension description, retrieve semantically similar persona dimension descriptions.'
    persona = f"{row['name']}: {row['description']}. Candidate values: {row['candidate_values']}"
    return get_detailed_instruct(task, persona)

data['formatted'] = data.apply(get_formatted_persona_dim, axis=1)

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('Salesforce/SFR-Embedding-2_R')
model = AutoModel.from_pretrained('Salesforce/SFR-Embedding-2_R', device_map='auto')

# Get the embeddings
max_length = 4096
input_texts = data['formatted'].to_list()
batch_dict = tokenizer(input_texts, max_length=max_length, padding=True, truncation=True, return_tensors="pt")

with torch.no_grad():
    outputs = model(**batch_dict)
    embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
    # print(embeddings)

# Clustering
num_clusters = 20
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(embeddings)
data['cluster'] = clustering_model.labels_
data = data.sort_values(by='cluster')
data.to_csv(f'{exp_name}_clustered.csv')

for idx in range(num_clusters):
    print(idx)
    for _, row in enumerate(data[data['cluster'] == idx]['formatted']):
        print(row.split('\n')[1])
    print('\n\n')


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

0
Recreational Interests: the individual's engagement in outdoor activities and sports that involve the use of firearms. Candidate values: ['active in hunting/shooting sports', 'not active in hunting/shooting sports']
Recreational Interests: the person's hobbies and leisure activities, which may have included the use of air guns. Candidate values: ['active participation', 'occasional participation', 'no participation']
Frequency of Gun Usage: how often the person engages in shooting or visiting gun ranges. Candidate values: ['often', 'sometimes', 'rarely', 'never']
Recreational Interests: whether the person has a hobby or interest in shooting sports. Candidate values: ['has shooting interest', 'lacks shooting interest']
Recreational Interests: the person's personal interests and hobbies, such as target shooting, hunting, or other sport-related activities involving guns. Candidate values: ['enjoys sport shooting', 'does not enjoy sport shooting']
Hobby and Interests: having a personal i

In [1]:
import torch
import pandas as pd
from sklearn.cluster import KMeans
from utils import last_token_pool, get_detailed_instruct, get_llm_response, PersonaDimension
from transformers import AutoTokenizer, AutoModel

with open('prompts/summarize_clustered_personas.txt') as f:
    prompt_template = f.read()
data = pd.read_csv('low_clustered.csv')

In [2]:
data[data['cluster'].astype(str)  == '19'][:3]

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,name,description,level,candidate_values,original_question,formatted,cluster
201,41.0,124,Personal Experiences,having been a victim of crime or feeling unsaf...,low,"['experienced crime/violence', 'not experience...",Please indicate whether the following is a maj...,Instruct: Given a persona dimension descriptio...,19
202,178.0,539,Personal Experiences,"the person's past experiences with crime, viol...",low,"['experienced crime/violence', 'not experience...","How often, if ever, do you carry a handgun or ...",Instruct: Given a persona dimension descriptio...,19
203,14.0,51,Personal Experiences,having been a victim of violent crime in the past,low,"['victim of violent crime', 'not a victim of v...","How much, if at all, do you worry about the fo...",Instruct: Given a persona dimension descriptio...,19


In [5]:
from tqdm import trange
import json

num_clusters = 20

res = []
for idx in trange(num_clusters):
    eval_res = []
    for _, row in data[data['cluster'].astype(str) == str(idx)].iterrows():
        persona = PersonaDimension(**row[['name', 'description', 'level', 'candidate_values']].to_dict())
        eval_res.append(persona)
    
    prompt = prompt_template.format(persona_dimensions='[\n' + ',\n'.join(repr(dim) for dim in eval_res) + '\n]')
    response = get_llm_response(prompt, prefill='[')
    response = '[' + response
    
    try:
        response = eval(response)
        for dim in response:
            res.append(dim)
        print(response)
        record_res = [str(dim) for dim in res]
        with open('merged.json', 'w') as f:
            json.dump(record_res, f, indent=4)
    except:
        print(f"cluster {idx} failed")
        print(response)
        continue

  5%|▌         | 1/20 [00:02<00:51,  2.73s/it]

[    PersonaDimension(
        name="Firearm-Related Recreational Interests",
        description="The individual's involvement in recreational activities and hobbies related to firearms, such as hunting, shooting sports, gun collecting, and visiting gun ranges.",
        level="low",
        candidate_values=['active participant', 'occasional participant', 'non-participant']
    )]


 10%|█         | 2/20 [00:07<01:13,  4.06s/it]

[    PersonaDimension(
        name="Occupation and Social Circle",
        description="The participant's occupation and social environment, including whether they work in a field related to guns, hunting, or shooting sports, and whether their friends or social circle is exposed to gun ownership.",
        level="low",
        candidate_values=['high gun exposure', 'moderate gun exposure', 'low gun exposure']
    )]


 15%|█▌        | 3/20 [00:11<01:06,  3.91s/it]

[    PersonaDimension(
        name="Geographic Location",
        description="The region or area where the person lives or grew up, which may influence their exposure to and attitudes towards guns, as well as the prevalence of gun ownership, accessibility, and related activities such as hunting.",
        level="low",
        candidate_values=['urban area', 'rural area', 'suburban area']
    )]


 20%|██        | 4/20 [00:19<01:26,  5.38s/it]

[    PersonaDimension(
        name="Personal Experiences with Firearms",
        description="The individual's past experiences, either positive or negative, with firearms, gun ownership, gun violence, or gun-related incidents, which may shape their attitudes, beliefs, and behaviors regarding gun use and storage.",
        level="low",
        candidate_values=['positive experiences', 'negative experiences', 'no experiences']
    )]


 25%|██▌       | 5/20 [00:23<01:16,  5.12s/it]

[    PersonaDimension(
        name="Geographic Location and Gun Culture",
        description="The participant's geographic location and the prevalence of gun ownership, gun culture, gun-related incidents, and attitudes towards gun ownership in that area.",
        level="low",
        candidate_values=['high gun ownership area', 'low gun ownership area', 'high-risk area', 'low-risk area', 'strict gun laws', 'lenient gun laws', 'high gun show availability', 'low gun show availability', 'positive attitude towards gun owners', 'negative attitude towards gun owners']
    )]


 30%|███       | 6/20 [00:28<01:08,  4.92s/it]

[    PersonaDimension(
        name="Household Firearm Ownership",
        description="The presence and extent of firearm ownership within the individual's household, including personal ownership, ownership by other household members, and the number of firearms owned.",
        level="low",
        candidate_values=['multiple gun owner', 'single gun owner', 'non-gun owner']
    )]


 35%|███▌      | 7/20 [00:33<01:05,  5.01s/it]

[    PersonaDimension(
        name="Experiences with Gun Violence",
        description="The individual's direct or indirect experiences with gun violence, gun-related incidents, accidents, threats, intimidation, illegal gun access, or being affected by mass shootings or terrorist attacks involving guns.",
        level="low",
        candidate_values=['affected', 'unaffected']
    )]


 40%|████      | 8/20 [00:37<00:54,  4.58s/it]

[    PersonaDimension(
        name="Parental Status",
        description="Whether the individual has children, particularly those under the age of 18, living in their home.",
        level="low",
        candidate_values=['has children under 18', 'no children under 18']
    )]


 45%|████▌     | 9/20 [00:40<00:47,  4.27s/it]

[    PersonaDimension(
        name="Personal Experiences with Firearms",
        description="The participant's personal experiences, exposure, and familiarity with firearms, including gun ownership, hunting, shooting sports, self-defense situations, and childhood experiences related to guns.",
        level="low",
        candidate_values=['extensive experience', 'moderate experience', 'limited experience', 'no experience']
    )]


 50%|█████     | 10/20 [00:42<00:35,  3.57s/it]

[    PersonaDimension(
        name="Community and Civic Engagement",
        description="The participant's level of involvement, familiarity, and interest in their local community, as well as issues related to crime, security, and the justice system.",
        level="low",
        candidate_values=['high engagement', 'low engagement']
    )]


 55%|█████▌    | 11/20 [00:46<00:31,  3.54s/it]

[    PersonaDimension(
        name="Household Composition and Security",
        description="The composition of the participant's household, including the presence of children and the level of security measures taken to protect the home.",
        level="low",
        candidate_values=['household with children and high security', 'household with children and low security', 'household without children and high security', 'household without children and low security']
    )]


 60%|██████    | 12/20 [00:49<00:27,  3.41s/it]

[    PersonaDimension(
        name="Household Composition and Vulnerability",
        description="The makeup of the participant's household, including the presence and ages of children, elderly, or other vulnerable individuals, as well as the number and types of people living in the home.",
        level="low",
        candidate_values=['household with vulnerable individuals', 'household without vulnerable individuals', 'single-person household', 'multi-person household']
    )]


 65%|██████▌   | 13/20 [00:56<00:31,  4.53s/it]

[    PersonaDimension(
        name="Occupational Risk",
        description="The person's occupation and its potential risk of being targeted by violence, terrorism, or mass shootings, as well as the stability and job security of the field.",
        level="low",
        candidate_values=['high-risk occupation', 'low-risk occupation']
    ),     PersonaDimension(
        name="Financial Stability",
        description="The individual's current financial status, ability to pay bills, and overall financial stability.",
        level="low",
        candidate_values=['financially stable', 'financially unstable']
    ),     PersonaDimension(
        name="Health and Healthcare",
        description="The person's current health condition, medical history, and the type and extent of their health insurance coverage.",
        level="low",
        candidate_values=['good health with comprehensive coverage', 'poor health with limited coverage']
    )]


 70%|███████   | 14/20 [00:59<00:24,  4.07s/it]

[    PersonaDimension(
        name="Socioeconomic and Demographic Factors",
        description="The individual's economic and social position, as well as demographic characteristics such as age, gender, and geographic location, which may influence their perspective on gun ownership and its relationship with economic opportunities and gun violence.",
        level="low",
        candidate_values=['high socioeconomic status', 'low socioeconomic status', 'urban', 'rural', 'male', 'female', 'young', 'old']
    )]


 75%|███████▌  | 15/20 [01:04<00:21,  4.39s/it]

[    PersonaDimension(
        name="Crime and Safety Media Consumption",
        description="The person's habits and preferences in consuming media related to crime and safety topics.",
        level="low",
        candidate_values=['high consumption', 'low consumption']
    )]


 80%|████████  | 16/20 [01:11<00:20,  5.15s/it]

[    PersonaDimension(
        name="Media Consumption Related to Firearms",
        description="The frequency and intensity of consuming various forms of media related to firearms, gun culture, and gun violence, including news, entertainment, video games, and online forums.",
        level="low",
        candidate_values=['high consumption', 'moderate consumption', 'low consumption']
    )]


 85%|████████▌ | 17/20 [01:15<00:14,  4.74s/it]

[    PersonaDimension(
        name="Occupation and Job-Related Gun Ownership",
        description="The participant's occupation or profession that may require or encourage gun ownership and usage for safety, security, or job-related purposes.",
        level="low",
        candidate_values=['law enforcement', 'military', 'security', 'high-risk occupation', 'job requires gun ownership', 'job does not require gun ownership']
    )]


 90%|█████████ | 18/20 [01:18<00:08,  4.32s/it]

[    PersonaDimension(
        name="Firearm-Related Lifestyle and Hobbies",
        description="The individual's daily activities, routines, hobbies, and leisure pursuits that involve or are related to firearms, such as hunting, sport shooting, collecting guns, or occupations that necessitate gun use.",
        level="low",
        candidate_values=['active involvement', 'occasional involvement', 'no involvement']
    )]


 95%|█████████▌| 19/20 [01:22<00:04,  4.15s/it]

[    PersonaDimension(
        name="Perceived Risk in Geographic Location",
        description="The perceived level of risk or safety in the participant's current living area or the area where they grew up, including risks related to crime, violence, gun incidents, terrorist attacks, economic instability, and the potential need for self-defense.",
        level="low",
        candidate_values=['high-risk area', 'low-risk area']
    )]


100%|██████████| 20/20 [01:25<00:00,  4.28s/it]

[    PersonaDimension(
        name="Personal Experiences with Crime and Safety",
        description="The participant's direct experiences as a victim of crime, violence, or safety issues, including violent crimes, burglaries, or feeling unsafe in their local community.",
        level="low",
        candidate_values=['victim of violent crime', 'victim of burglary', 'experienced crime/safety issues', 'not experienced crime/safety issues']
    )]





In [6]:
with open('prompts/clean_summarized_personas.txt') as f:
    prompt_template = f.read()

In [9]:
prompt = prompt_template.format(persona_dimensions='[\n' + ',\n'.join(repr(dim) for dim in res) + '\n]')
response = get_llm_response(prompt, prefill='[', max_tokens=2000)
response = '[' + response