# Create a preprocessed DataFrame of personas from PersonaHub

Data: https://huggingface.co/datasets/proj-persona/PersonaHub

In [19]:
import pandas as pd

In [20]:
import os
from huggingface_hub import login

# Read the Hugging Face access token from the environment and authenticate with it.
# When the variable is unset, None is handed to login() unchanged.
HF_TOKEN = os.environ.get('HF_TOKEN')
login(token=HF_TOKEN)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /pfs/work7/workspace/scratch/ma_janjung-master-thesis\token
Login successful


## Load data

In [21]:
# Load the PersonaHub JSON Lines file (one JSON record per line) from the Hugging Face Hub.
df = pd.read_json(
    path_or_buf="hf://datasets/proj-persona/PersonaHub/persona.jsonl",
    lines=True,
)

In [22]:
# Report how many rows were loaded, then preview the first five.
print('N =', df.shape[0])
df.head(n=5)

N = 200000


Unnamed: 0,persona
0,A Political Analyst specialized in El Salvador...
1,A legal advisor who understands the legal impl...
2,A maternal health advocate focused on raising ...
3,A school basketball team captain who believes ...
4,A determined basketball player who aspires to ...


In [23]:
# Expose the row index as an explicit id column (for easier matching later on)
# and keep only the id and the persona text, in that order.
df = df.assign(persona_id=df.index)[["persona_id", "persona"]]

In [24]:
# Randomly sample 300 personas. The fixed random_state makes the draw repeatable,
# so re-running the notebook selects the same personas every time.
df_sample = df.sample(n=300, random_state=42)
print('N =', len(df_sample))

N = 300


## Create system prompt based on persona

In [25]:
# Lowercase only the first character of each persona so it reads naturally
# after "You are "; empty/falsy values are passed through unchanged.
df_sample["persona"] = df_sample["persona"].map(
    lambda text: text[:1].lower() + text[1:] if text else text
)

In [26]:
def create_prompt(row):
    """Build the system-prompt message list for a single persona.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the persona description under the key ``"persona"``.

    Returns
    -------
    list of dict
        A one-element chat message list:
        ``[{"role": "system", "content": "You are <persona>. Answer and behave accordingly."}]``.
    """
    # The template below adds its own period after the persona. Strip surrounding
    # whitespace and any trailing period(s) first so a persona that already ends
    # with "." does not produce ".." in the prompt.
    persona = str(row["persona"]).strip().rstrip(".").rstrip()
    prompt = [{"role":"system", "content":f"You are {persona}. Answer and behave accordingly."}]

    return prompt

In [27]:
# Build one system-prompt message list per row and store it in a new column.
df_sample["persona_prompt"] = df_sample.apply(func=create_prompt, axis="columns")

In [28]:
# Preview the first five sampled rows, now including the prompt column.
df_sample.head(n=5)

Unnamed: 0,persona_id,persona,persona_prompt
85519,85519,a patient who participated in a clinical trial...,"[{'role': 'system', 'content': 'You are a pati..."
126701,126701,an introverted accountant who is extremely det...,"[{'role': 'system', 'content': 'You are an int..."
179838,179838,a graphics programmer who shares and discusses...,"[{'role': 'system', 'content': 'You are a grap..."
108473,108473,a crime novel enthusiast university lecturer w...,"[{'role': 'system', 'content': 'You are a crim..."
160632,160632,a curious and creative high school student who...,"[{'role': 'system', 'content': 'You are a curi..."


In [29]:
# Show the full prompt of the first sampled row.
df_sample["persona_prompt"].iloc[0]

[{'role': 'system',
  'content': 'You are a patient who participated in a clinical trial and experienced adverse effects due to research misconduct. Answer and behave accordingly.'}]

In [30]:
# Check the Python type of the value stored in the prompt column.
type(df_sample["persona_prompt"].iloc[0])

list

In [31]:
# Make sure the output directory exists first: to_json does not create missing
# parent directories and would otherwise raise on a fresh checkout.
os.makedirs("../input_data", exist_ok=True)
# Written with pandas' default DataFrame orient ("columns").
df_sample.to_json("../input_data/persona_hub.json")