In [1]:
from datasets import load_dataset
import json
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from json_repair import repair_json
import pandas as pd
from prutils.prevaluation import PrEvaluation

# Run identifier: used as the filename prefix for the results CSV and the error log.
name = "essays_llama"

# Essays dataset with Big 5 annotations; downloads are cached next to the notebook folder.
dataset = load_dataset(
    path="jingjietan/essays-big5",
    cache_dir="../datasets",
)

In [2]:
def get_single_sample_json(id, user_text):
    """Build the two-turn chat prompt for one essay.

    Args:
        id: Sample identifier. Accepted for call-site compatibility; it is not
            used when building the messages.
        user_text: The essay text to be analysed.

    Returns:
        A list of two chat messages, both with role "user": the first carries
        the Big 5 instructions and the JSON template, the second carries
        ``user_text`` unchanged.
    """
    trait_names = (
        "Openness",
        "Conscientiousness",
        "Extraversion",
        "Agreeableness",
        "Neuroticism",
    )
    # One template row per trait; rows are comma-separated, with no trailing comma.
    trait_rows = ",\n".join(
        f"    '{trait}': {{'evidence': '', 'label': '0/1'}}" for trait in trait_names
    )

    # The trailing empty entries reproduce the blank line that ends the prompt.
    instructions = "\n".join([
        "Analyze the user's personality using the Big 5 framework based on the given text.",
        "Output the result in this JSON format:",
        "{",
        "  'dimensions': {",
        trait_rows,
        "  }",
        "}",
        "",
        "For each dimension, provide:",
        "- evidence: Analysis of the text reflecting the trait. Give examples (content and/or tone) from the user's message. Avoid using punctuation",
        "- label: Provide only a value (0 for low; or 1 for high) to indicate the traits.",
        "",
        "",
    ])

    instruction_turn = {"role": "user", "content": instructions}
    essay_turn = {"role": "user", "content": user_text}
    return [instruction_turn, essay_turn]

In [3]:
# Structured-output request: the reply must be a JSON object with a
# "dimensions" member holding one {evidence, label} record per Big 5 trait,
# where evidence is a string and label is the integer 0 or 1.
response_format = {
    "type": "json_schema",
    "json_schema": {
        "name": "user_personality_analysis",
        "strict": True,
        "schema": {
            "type": "object",
            "properties": {
                "dimensions": {
                    "type": "object",
                    "properties": {
                        # Every trait shares the same sub-schema; a fresh dict
                        # is built for each one so no objects are shared.
                        trait: {
                            "type": "object",
                            "properties": {
                                "evidence": {"type": "string"},
                                "label": {"type": "integer", "enum": [0, 1]},
                            },
                            "required": ["evidence", "label"],
                        }
                        for trait in (
                            "Openness",
                            "Conscientiousness",
                            "Extraversion",
                            "Agreeableness",
                            "Neuroticism",
                        )
                    },
                    "required": [
                        "Openness",
                        "Conscientiousness",
                        "Extraversion",
                        "Agreeableness",
                        "Neuroticism",
                    ],
                }
            },
            "required": ["dimensions"],
        },
    },
}


In [5]:
# Materialise the held-out split as a pandas DataFrame for row-wise access.
testset = pd.DataFrame(dataset['test'])


In [6]:
# Smoke test: send a single structured-output chat request for the first test
# essay to the OpenAI-compatible endpoint before starting the full run.
from openai import OpenAI

# The endpoint is on localhost; "test" is a placeholder key
# (presumably ignored by the local server - confirm).
client = OpenAI(
    api_key="test",
    base_url="http://localhost:1234/v1",
)

first_row = testset.iloc[0]
messages = get_single_sample_json(first_row['__index_level_0__'], first_row['text'])

# NOTE(review): this request targets a different model than the full
# evaluation loop - confirm that is intentional.
response = client.chat.completions.create(
    model="mistral-7b-instruct-v0.3",
    messages=messages,
    response_format=response_format,
)

In [None]:
from tqdm import tqdm

# Maps each Big 5 key in the model's JSON reply to the column prefix used in `df`.
TRAIT_PREFIXES = {
    "Openness": "o",
    "Conscientiousness": "c",
    "Extraversion": "e",
    "Agreeableness": "a",
    "Neuroticism": "n",
}

# Column order of the results frame: sample id, model evidence/label per
# trait, then the dataset's gold labels.
RESULT_COLUMNS = ["id","o_evi", "o_type", "c_evi", "c_type", "e_evi", "e_type", "a_evi", "a_type", "n_evi", "n_type", "O", "C", "E", "A", "N"]


def _flatten_prediction(raw_content):
    """Turn one raw model reply into flat ``<prefix>_evi`` / ``<prefix>_type`` fields.

    Args:
        raw_content: The message content returned by the chat completion
            (expected to be a JSON string with a top-level "dimensions" object).

    Returns:
        dict with keys o_evi, o_type, c_evi, c_type, e_evi, e_type, a_evi,
        a_type, n_evi, n_type.

    Raises:
        json.JSONDecodeError: the reply is not valid JSON.
        KeyError: "dimensions", a trait, or one of its fields is missing.
        TypeError: the reply is None or has an unexpected structure.
    """
    dimensions = json.loads(raw_content)["dimensions"]
    flat = {}
    for trait, prefix in TRAIT_PREFIXES.items():
        flat[f"{prefix}_evi"] = dimensions[trait]["evidence"]
        flat[f"{prefix}_type"] = dimensions[trait]["label"]
    return flat


# Rows are collected in a plain list and converted to a DataFrame once at the
# end: DataFrame.append no longer exists in pandas >= 2.0, and growing a frame
# row by row is quadratic. If the run is interrupted, the partial results are
# still available in `records`.
records = []

for _, line in tqdm(testset.iterrows(), total=testset.shape[0]):
    custom_id = line["__index_level_0__"]
    messages = get_single_sample_json(custom_id, line['text'])
    response = client.chat.completions.create(
        model="meta-llama-3.1-8b-instruct",
        messages=messages,
        response_format=response_format
    )

    raw_content = response.choices[0].message.content

    try:
        prediction = _flatten_prediction(raw_content)
    except (json.JSONDecodeError, KeyError, TypeError):
        # Only malformed replies are treated as recoverable; anything else
        # (including KeyboardInterrupt) propagates. The untouched raw reply is
        # logged so the failure can be inspected later.
        with open(name + '_error.txt', 'a', encoding='utf-8') as f:
            f.write(str(raw_content))
            f.write('\n\n')
        continue

    # Gold labels are read from the row being processed instead of
    # re-filtering the whole test set once per trait.
    records.append({
        "id": custom_id,
        **prediction,
        "O": line["O"],
        "C": line["C"],
        "E": line["E"],
        "A": line["A"],
        "N": line["N"],
    })

# Passing `columns` fixes the column order and keeps the schema even if no
# reply could be parsed.
df = pd.DataFrame(records, columns=RESULT_COLUMNS)


In [7]:
# Normalise the model's labels: "1" -> 1, "0" -> 0, anything else -> -1
# (marks a reply that was neither), stored as integers.
for pred_col in ("o_type", "c_type", "e_type", "a_type", "n_type"):
    df[pred_col] = (
        df[pred_col]
        .apply(lambda raw: {"1": 1, "0": 0}.get(str(raw), -1))
        .astype(int)
    )

# Gold labels from the dataset are cast to integers as well.
for gold_col in ("O", "C", "E", "A", "N"):
    df[gold_col] = df[gold_col].astype(int)



In [8]:
# Persist the per-sample predictions and gold labels as "<name>.csv" (row index omitted).
df.to_csv(f"{name}.csv", index=False)

In [9]:
# Optional: uncomment to reload previously saved results from the CSV
# import pandas as pd
# df = pd.read_csv(name + '.csv')


In [None]:
# Score each trait separately: one PrEvaluation per trait, fed the model's
# label column and the matching gold column, then printed.
evaluators = {}
for trait, pred_col in (
    ("O", "o_type"),
    ("C", "c_type"),
    ("E", "e_type"),
    ("A", "a_type"),
    ("N", "n_type"),
):
    evaluator = PrEvaluation(trait)
    evaluator.push([df[pred_col].to_list()], [df[trait].to_list()])
    evaluator.print_performance()
    evaluators[trait] = evaluator

# Keep the per-trait evaluator objects available under their single-letter names.
O, C, E, A, N = (evaluators[trait] for trait in ("O", "C", "E", "A", "N"))