In [None]:
import json
import pandas as pd
import re
import os
from tqdm import tqdm
import openai
from openai import OpenAI
import logging
from bert_score import score as score1
logging.disable(logging.INFO)


# Read the key from the OPENAI_API_KEY environment variable so a real secret is
# never stored in the notebook; the placeholder is only a fallback so the cell
# still runs unmodified when the variable is not set.
api_key = os.environ.get("OPENAI_API_KEY", "your openai api key")
client = OpenAI(api_key=api_key)

In [None]:
# import dataset
def _read_json(json_path):
    """Parse a UTF-8 encoded JSON file and return its content."""
    with open(json_path, 'r', encoding='utf-8') as file:
        return json.load(file)


# big 5 analysis (indexed by character name further down)
big5 = _read_json("./withdbig5.json")

# the work title corresponding to the character name
work = _read_json('profiles-eng_scripts.json')

# character description (indexed by character name further down)
desp = _read_json('profiles-eng_desc.json')

In [None]:
#calculate bert score
def calculate_scores(df, lang="eng"):
    """Add one BERTScore F1 column per generated-answer column of ``df``.

    Every answer column is compared against ``df['reference answer']``. All
    values are converted with ``str`` first, so missing cells are scored as
    the text ``"nan"``.

    The scorer is called once per answer column with the whole column as a
    batch instead of once per row, which avoids repeating the scorer's setup
    for every single row. Per-row values are expected to match the row-by-row
    computation up to floating-point noise.

    Args:
        df: DataFrame holding the columns 'reference answer', 'answer',
            'answer without tune', 'tuned simple prompt' and
            'simple prompt answer'.
        lang: language code forwarded to the scorer. The default keeps the
            value this notebook has always used.
            NOTE(review): bert_score documents "en" for English; "eng" is
            presumably not in its language table and may select a fallback
            model - confirm before comparing with published numbers.

    Returns:
        The same ``df`` object, modified in place, with the added columns
        'tuned score', 'no tune score', 'tuned simple score' and
        'simple prompt score'.
    """
    # generated-answer column -> score column written back to ``df``
    column_pairs = {
        'answer': 'tuned score',
        'answer without tune': 'no tune score',
        'tuned simple prompt': 'tuned simple score',
        'simple prompt answer': 'simple prompt score',
    }
    references = [str(value) for value in df['reference answer']]

    for answer_col, score_col in tqdm(column_pairs.items(), total=len(column_pairs), desc="Processing"):
        candidates = [str(value) for value in df[answer_col]]
        if not candidates:
            # nothing to score: keep the column but do not call the scorer on empty input
            df[score_col] = []
            continue
        # the scorer returns (precision, recall, F1); only F1 is kept
        _, _, f1 = score1(candidates, references, lang=lang, verbose=False)
        df[score_col] = [float(value) for value in f1.tolist()]

    return df

#calculate avg score of bert scores
def get_mean(df):
    """Print the mean of each BERTScore column as a single list.

    The list order is: 'tuned score', 'no tune score', 'tuned simple score',
    'simple prompt score'. Nothing is returned.
    """
    score_columns = ('tuned score', 'no tune score', 'tuned simple score', 'simple prompt score')
    print([df[column].mean() for column in score_columns])

In [None]:
c_name = "HAL 9000"#"Willie Soke" #"Gaston" #"Naruto Uzumaki"#"Sasuke Uchiha"
path = f"./result/{c_name}_120.csv"
#read generated results
df = pd.read_csv(path)
#calculate bert score
df = calculate_scores(df)
#save score (written back to the same CSV the answers were read from)
df.to_csv(path, index=False)
#preview the scored frame; only the last expression of a cell is rendered
df.head()

In [None]:
#calculate and print the average of each BERTScore column of the scored frame
get_mean(df)

In [None]:
#GPT evaluation
#prompt of winning rate evaluation
def generate_prompt(df, role_name, big5):
    """Build one model-ranking prompt per row of ``df``.

    Args:
        df: DataFrame with the columns 'question', 'Tuned GPT answer',
            'reference answer', 'answer', 'answer without tune',
            'simple prompt answer' and 'tuned simple prompt'.
        role_name: character name; used to index ``big5`` and the
            module-level ``desp`` mapping, and to locate
            ``./score/{role_name}.json``.
        big5: mapping from character name to its Big-5 analysis.

    Returns:
        A list of prompt strings, one per row, in row order.

    Raises:
        FileNotFoundError: if ``./score/{role_name}.json`` does not exist.
        KeyError: if ``role_name`` is missing from ``desp`` or ``big5``.

    The profile file under ``./profile/`` is no longer opened: its content
    only fed a local variable that was never used in the prompt.
    """
    score_path = f"./score/{role_name}.json"
    with open(score_path, 'r', encoding='utf-8') as file:
        score = json.load(file)

    # ``desp`` is the character-description mapping loaded at module level
    description = desp[role_name]
    big_5 = big5[role_name]

    prompts = []
    for index, row in df.iterrows():
        question_dict = {'question': row['question']}
        # the 'GPT3.5' entry is filled from the 'reference answer' column
        list_model_answer_dict = [
            {'model': 'Tuned GPT3.5', 'answer': row["Tuned GPT answer"]},
            {'model': 'GPT3.5', 'answer': row['reference answer']},
            {'model': 'Tuned Phi-3', 'answer': row['answer']},
            {'model': 'Tuned Phi-3 with simple prompt', 'answer': row['tuned simple prompt']},
            {'model': 'Phi-3', 'answer': row['answer without tune']},
            {'model': 'Phi-3 with simple prompt', 'answer': row['simple prompt answer']}
        ]

        # Fixed against the earlier wording: the closing instruction no longer
        # says "two criteria" (five are listed) and the format example closes
        # the second "rank" key with a proper double quote.
        prompt = f"""
System Instruction:
You are a role-playing performance comparison assistant. You should rank the models based on the role characteristics and text quality of their responses. The rankings are then output using Python dictionaries and lists.
User Prompt:
The models below are to play the role of "{role_name}". The brief description of {role_name} is: {description}
The Big-5 personality trait analysis of {role_name} is:{big_5}
The relationships between {role_name} and some characters (0 means no, 1 means yes) and your familiarity and affection scores for them (from 1 to 10, the higher the score, the more familiar you are with them and the more you like them) are: 
{score}.
I need to rank the following models based on the criteria below:
1. Which one that better reflects the character's correct interpersonal relationships is better
2. Which one has more pronounced role speaking style, and speaks more in line with the role description. The more distinctive the speaking style, the better.
3. Which one's output can reflects the personality of the charactor more, is better.
4. Which one's output contains more knowledge and memories related to the role; the richer, the better. (If the question contains reference answers, then the role-specific knowledge and memories are based on the reference answer.)
5. Consider the character's limited knowledge base. If you think the question falls outside the character's knowledge range according to your judgment, then answers with an attitude of "not knowing" or "not understanding" will be better.
The question provided to each model is:
{question_dict}
The respective answers from the models to this question are:
{list_model_answer_dict}
Now, based on the above criteria, please rank the models. Avoid any positional biases and ensure that the order in which the responses are presented does not influence your decision. Do not favor certain model names.
Then, use a list containing the model's name, its rank, and the reason for its ranking to return the results, i.e., please ensure to use the following format to return the results:
[{{"model": <model-name>, "reason": <rank-reason>, "rank": <model-rank>}}, {{"model": <model-name>, "reason": <rank-reason>, "rank": <model-rank>}}]
Your answer must be a valid Python list of dictionaries to ensure I can directly parse it using Python. Scrictly follow the format above. Do not include any extraneous content! Please provide a ranking that is as accurate as possible and aligns with the intuition of most people.
        """
        prompts.append(prompt)
    return prompts

#prompt of multi-dimension evaluation
def generate_dimension_prompt(df, role_name, big5):
    """Build one multi-dimension scoring prompt per row of ``df``.

    Each prompt asks for four 1-7 scores per model: factual correctness,
    personality, values and avoiding hallucination.

    Args:
        df: DataFrame with the columns 'question', 'Tuned GPT answer',
            'reference answer', 'answer', 'answer without tune',
            'simple prompt answer' and 'tuned simple prompt'.
        role_name: character name; used to index ``big5`` and the
            module-level ``desp`` mapping, and to locate
            ``./score/{role_name}.json``.
        big5: mapping from character name to its Big-5 analysis.

    Returns:
        A list of prompt strings, one per row, in row order.

    Raises:
        FileNotFoundError: if ``./score/{role_name}.json`` does not exist.
        KeyError: if ``role_name`` is missing from ``desp`` or ``big5``.

    The profile file under ``./profile/`` is no longer opened: its content
    was loaded but never used.
    """
    score_path = f"./score/{role_name}.json"
    with open(score_path, 'r', encoding='utf-8') as file:
        score = json.load(file)

    # ``desp`` is the character-description mapping loaded at module level
    description = desp[role_name]
    big_5 = big5[role_name]

    prompts = []
    for index, row in df.iterrows():
        question_dict = {'question': row['question']}
        # the 'GPT3.5' entry is filled from the 'reference answer' column
        list_model_answer_dict = [
            {'model': 'Tuned GPT3.5', 'answer': row["Tuned GPT answer"]},
            {'model': 'GPT3.5', 'answer': row['reference answer']},
            {'model': 'Tuned Phi-3', 'answer': row['answer']},
            {'model': 'Tuned Phi-3 with simple prompt', 'answer': row['tuned simple prompt']},
            {'model': 'Phi-3', 'answer': row['answer without tune']},
            {'model': 'Phi-3 with simple prompt', 'answer': row['simple prompt answer']}
        ]
        # Fixed against the earlier wording: the prompt announced "two criteria"
        # although four are listed below it.
        prompt = f"""
System Instruction:
You are a role-playing performance comparison assistant. You should rank the models based on the role characteristics and text quality of their responses. The rankings are then output using Python dictionaries and lists.
User Prompt:
The models below are to play the role of "{role_name}". The brief description of {role_name} is: {description}
The Big-5 personality trait analysis of {role_name} is:{big_5}
The relationships between {role_name} and some characters (0 means no, 1 means yes) and your familiarity and affection scores for them (from 1 to 10, the higher the score, the more familiar you are with them and the more you like them) are: 
{score}.
I need to rank the following models based on the four criteria below:
[Evaluation Criterion]
Factual Correctness (1-7): Is the response provides truthful and detailed facts about the character?
Personality (1-7): Is the response reflects the personalities and preferences of the character?
Values (1-7): Is the response reflects the values and convictions of the character?
Avoiding Hallucination (1-7): Is the response avoids to say things that the character do not know?
[Evaluation Steps]
[Factual Correctness]
1. Read through the interactions and identify the key points related to the character.
2. Read through the responses of the AI assistant and compare them to the profile. Check if the responses are consistent with the character's profile, background, and known facts about the character.
3. Check whether the responses provide detailed facts about the character or if they are generic responses that could apply to any character. Detailed responses are more factual and contribute positively to the score.
4. Rate the performance of the AI on a scale of 1-7 for factual correctness, where 1 is the lowest and 7 is the highest based on the Evaluation Criteria.
[Personality]
1. Read through the profile and write the personalities and preferences of the real character.
2. Read through the interactions and identify the personalities and preferences of the AI assistant.
3. After having a clear understanding of the interactions, compare the responses to the profile. Look for any consistencies or inconsistencies. Do the responses reflect the character's personalities and preferences?
4. Use the given scale from 1-7 to rate how well the response reflects the personalities and preferences of the character. 1 being not at all reflective of the character's personalities, and 7 being perfectly reflective of the character's personalities.
[Values]
1. Read through the profile and write the values and convictions of the real character.
2. Read through the interactions and identify the values and convictions of the AI assistant.
3. After having a clear understanding of the interactions, compare the responses to the profile. Look for any consistencies or inconsistencies. Do the responses reflect the character's values and convictions?
4. Use the given scale from 1-7 to rate how well the response reflects the values and convictions of the character. 1 being not at all reflective of the character's values, and 7 being perfectly reflective of the character's values.
[Avoiding Hallucination]
1. Read through the interactions and identify the knowledge scope of the character.
2. Read through the responses of the AI assistant, find the evidence of knowledge used in the response.
3. Compare the evidence to the profile. Check if the responses are consistent with the character's knowledge scope. If some knowledge contradicts to the character's identity, given a lower score. Otherwise, assign a higher score.
4. Rate the performance of the AI on a scale of 1-7 for Avoiding Hallucination, where 1 is the lowest and 7 is the highest based on the Evaluation Criteria.
The question provided to each model is:
{question_dict}
The respective answers from the models to this question are:
{list_model_answer_dict}
Now, based on the above criteria, please rank the models. Avoid any positional biases and ensure that the order in which the responses are presented does not influence your decision. Do not favor certain model names.
Then, use a list containing the model's name, its score, and the reason for its score to return the results, i.e., please ensure to use the following format to return the results:
[{{"model": <model-name>, "reason": <score-reason>, "factual correctness score": <model-score>,"personality score": <model-score>,"values score": <model-score>,"avoiding hallucination score": <model-score>}}, {{"model": <model-name>, "reason": <score-reason>, "factual correctness score": <model-score>,"personality score": <model-score>,"values score": <model-score>,"avoiding hallucination score": <model-score>}}]
Your answer must be a valid Python list of dictionaries to ensure I can directly parse it using Python. Strictly follow the format above. Do not include any extraneous content! Please provide a ranking that is as accurate as possible and aligns with the intuition of most people.
        """
        prompts.append(prompt)
    return prompts

#prompt of big5 questionnaire evaluation
def big5_eval(c_name,ver):
    """Build one Big-5 scoring prompt per questionnaire item of a character.

    Reads ``./result/{c_name}_big5.json`` (a list of items, each with the keys
    'dimension', 'question' and the answer fields) and ``./big5_dim.json``
    (dimension name -> description text).

    Args:
        c_name: character name, used in the result path and in the prompt.
        ver: which answer field to evaluate:
            1 -> "answer", 0 -> "answer without tune", 2 -> "gpt answer".

    Returns:
        A list of prompt strings, one per questionnaire item.

    Raises:
        ValueError: if ``ver`` is not 0, 1 or 2 (previously this surfaced as an
            unbound ``answer`` variable inside the loop).
        FileNotFoundError: if one of the two JSON files is missing.
    """
    answer_keys = {1: "answer", 0: "answer without tune", 2: "gpt answer"}
    if ver not in answer_keys:
        raise ValueError(f"ver must be 0, 1 or 2, got {ver!r}")
    answer_key = answer_keys[ver]

    path = f"./result/{c_name}_big5.json"
    with open(path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    with open("./big5_dim.json", 'r', encoding='utf-8') as file:
        intro = json.load(file)

    prompts = []
    for item in data:
        dim = item["dimension"]
        question = item["question"]
        answer = item[answer_key]

        dim_desp = intro[dim]
        # The requested output key is spelled "dimension" (it used to be
        # "dimention" with no trailing comma), which is the key that
        # calculate_big5_scores looks up when aggregating the replies.
        prompt = f"""
You are a psychologist with expertise in personality theories. I'm conducting an experiment to evaluate participants' scores in the Big Five personality traits, especially on the {dim} dimension. For clarity, here's some background on differentiating this particular dimension and its factors:
===
{dim_desp}
===

I've invited a participant, {c_name}, and had the following conversations:
===
question:{question}
answer:{answer}
===

Please help me evaluates whether {c_name} possesses a high {dim} personality or a low {dim} personality, and provide an integer score ranging from -5 to 5. 

Below are some scoring references. If the subject demonstrates a high {dim} personality in many factors, the score is 5 points. If the subject exhibits a high {dim} personality in a single factor, the score is 2 points. If the subject's personality cannot be determined, the score is 0 points. If the subject shows a low {dim} personality in one factor, the score is -2 points. If the subject indicates a low {dim} personality across multiple factors, the score is -5 points. 

Please output in the following json format:
===
{{
    "dimension": "{dim}",
    "analysis": <your analysis, based on the conversations>,
    "result": <the person's score on {dim}, ranging from -5 to 5>
}}
===
"""
        prompts.append(prompt)
    return prompts

In [None]:
# character to evaluate; the commented names are the other characters used with this notebook
role_name = "HAL 9000" #"Willie Soke" #"Gaston" #"Naruto Uzumaki"#"Sasuke Uchiha"

# read result file (re-read from disk, replacing any `df` left by an earlier cell)
path = f"./result/{role_name}_120.csv"
df = pd.read_csv(path)

# generate prompts: exactly one of the two calls below should be active,
# because the request cell sends whatever is stored in `prompts`

#prompt of winning rate evaluation
# prompts = generate_prompt(df, role_name, big5)

#prompt of multi-dimension evaluation
prompts = generate_dimension_prompt(df, role_name, big5)
# df.head()

In [None]:
#using GPT-4o-mini to run the evaluation for the prompts generated above
#one request is sent per prompt; the text of the first choice is collected in order
responses = []

for prompt in tqdm(prompts):
    response = client.chat.completions.create(
            model="gpt-4o-mini", 
            messages=[{"role": "user", "content": prompt}],
        )
    answer = response.choices[0].message.content
    responses.append(answer)

In [None]:
# Persist the collected evaluation responses as an indented JSON list.
# Alternative output name kept from the original cell: f'./result/{role_name}_4o.json'
save_path = './result/{}_dimension_eval.json'.format(role_name)
serialized = json.dumps(responses, indent=4, ensure_ascii=False)
with open(save_path, 'w', encoding='utf-8') as file:
    file.write(serialized)

print("saved")

In [None]:
#calculate winning rate
def count_rank_1(data):
    """Count, per tracked model, how many entries carry ``rank == 1``.

    ``data`` is an iterable of rankings; each ranking is an iterable of dicts
    with the optional keys "model" and "rank". Entries for untracked models
    are ignored. The returned dict always holds all six tracked models, in a
    fixed order, starting from zero.
    """
    tracked = (
        "Tuned GPT3.5",
        "GPT3.5",
        "Tuned Phi-3",
        "Phi-3",
        "Tuned Phi-3 with simple prompt",
        "Phi-3 with simple prompt",
    )
    wins = dict.fromkeys(tracked, 0)

    for ranking in data:
        for entry in ranking:
            name = entry.get("model")
            if name not in tracked:
                continue
            if entry.get("rank") == 1:
                wins[name] += 1

    return wins

#calculate avg multi-dimension scores
def _to_number(value):
    """Return ``value`` as a number, or None when it is missing or not numeric."""
    if isinstance(value, bool):
        return None
    if isinstance(value, (int, float)):
        return value
    if isinstance(value, str):
        try:
            return float(value)
        except ValueError:
            return None
    return None


def calculate_avg_dimension_scores(data):
    """Average the four dimension scores of every tracked model.

    Args:
        data: iterable of evaluations; each evaluation is an iterable of dicts
            with a "model" key and the score keys "factual correctness score",
            "personality score", "values score" and
            "avoiding hallucination score".

    Returns:
        ``{model: {score_type: average}}`` for all six tracked models. A score
        type with no usable value for a model averages to 0.

    Entries whose model name is not tracked are skipped instead of raising
    ``KeyError``. A missing or non-numeric score is left out of the average
    rather than being counted as 0, which would pull the mean down. Numeric
    strings such as "5" are accepted.
    """
    models = ["Tuned GPT3.5", "GPT3.5", "Tuned Phi-3", "Phi-3", "Tuned Phi-3 with simple prompt", "Phi-3 with simple prompt"]
    score_types = (
        "factual correctness score",
        "personality score",
        "values score",
        "avoiding hallucination score",
    )

    scores_sums = {model: dict.fromkeys(score_types, 0) for model in models}
    counts = {model: dict.fromkeys(score_types, 0) for model in models}

    for model_list in data:
        for model_data in model_list:
            model_name = model_data.get("model")
            if model_name not in models:
                # the evaluator may invent or misspell a model name
                continue

            for score_type in score_types:
                score = _to_number(model_data.get(score_type))
                if score is None:
                    continue
                scores_sums[model_name][score_type] += score
                counts[model_name][score_type] += 1

    average_scores = {model: {} for model in models}
    for model_name in models:
        for score_type in score_types:
            seen = counts[model_name][score_type]
            average_scores[model_name][score_type] = (
                scores_sums[model_name][score_type] / seen if seen > 0 else 0
            )

    return average_scores

def calculate_model_average_scores(data):
    """Return the mean "score" of every tracked model.

    ``data`` is an iterable of evaluations, each an iterable of dicts that must
    contain the keys "model" and "score". Scores of untracked models are
    ignored. A tracked model without any score averages to 0.
    """
    tracked = (
        "Tuned GPT3.5",
        "GPT3.5",
        "Tuned Phi-3",
        "Phi-3",
        "Tuned Phi-3 with simple prompt",
        "Phi-3 with simple prompt",
    )
    collected = {name: [] for name in tracked}

    for evaluation in data:
        for entry in evaluation:
            # both keys are required, even for models that are not tracked
            name, value = entry["model"], entry["score"]
            bucket = collected.get(name)
            if bucket is not None:
                bucket.append(value)

    return {
        name: (sum(values) / len(values) if values else 0)
        for name, values in collected.items()
    }

#calculate questionnaire scores
def calculate_big5_scores(data):
    """Average the questionnaire results per Big-5 dimension.

    Args:
        data: iterable of dicts with a dimension name and a numeric "result".

    Returns:
        A dict with the keys "openness", "neuroticism", "extraversion",
        "agreeableness" and "conscientiousness"; each value is the mean of
        the collected results, or None when no result was collected.

    The dimension name is read from "dimension" and, when that key is absent,
    from the misspelled "dimention", which is the key the questionnaire prompt
    in this notebook originally requested. Names are matched ignoring case and
    surrounding whitespace. Items with an unknown dimension or without a
    result are skipped.
    """
    scores = {
        "openness": [],
        "neuroticism": [],
        "extraversion": [],
        "agreeableness": [],
        "conscientiousness": []
    }

    for item in data:
        dimension = item.get("dimension")
        if dimension is None:
            dimension = item.get("dimention")
        if isinstance(dimension, str):
            dimension = dimension.strip().lower()
        result = item.get("result")
        if dimension in scores and result is not None:
            scores[dimension].append(result)

    average_scores = {}
    for dimension, results in scores.items():
        average_scores[dimension] = sum(results) / len(results) if results else None

    return average_scores

In [None]:
# Load the saved evaluation responses (at this stage each entry is still one whole string).
# Other characters: "HAL 9000", "Gaston", "Naruto Uzumaki", "Willie Soke"
name = "Sasuke Uchiha"
# Other inputs: f'./result/{role_name}_memory_eval.json', f'./result/{name}_gpt_big5_eval.json', f'./result/{name}_4o.json'
file_path = './result/{}_dimension_eval.json'.format(name)
with open(file_path, encoding='utf-8') as file:
    data = json.loads(file.read())

In [None]:
#data processing-extract the dict format responses
def _strip_code_fence(text):
    """Return ``text`` without a surrounding Markdown code fence, if it has one.

    Chat models often wrap JSON in a fenced block (three backticks, optionally
    followed by a language tag) even when told not to; ``json.loads`` rejects
    that wrapper.
    """
    fenced = re.match(r"^\s*```[A-Za-z0-9_-]*\s*\n?(.*?)\n?\s*```\s*$", text, re.DOTALL)
    return fenced.group(1) if fenced else text


extracted_lists = []
for i, item in enumerate(data):
    try:
        if isinstance(item, str):
            parsed_item = json.loads(_strip_code_fence(item))
        else:
            # already parsed by an earlier run of this cell
            parsed_item = item
        extracted_lists.append(parsed_item)
    except json.JSONDecodeError as e:
        print(f"error:{i} : {item}")
        print(f"error msg:{e}")

# report a summary instead of dumping every parsed response into the output
print(f"parsed {len(extracted_lists)} of {len(data)} responses")
#save data after processing
# NOTE: this overwrites the input file; responses that failed to parse above
# are not written back, so fix or back them up before re-running.
with open(file_path, 'w', encoding='utf-8') as file:
    json.dump(extracted_lists, file, indent=4, ensure_ascii=False)

print("saved")

In [None]:
# get the best performed questions&answers of model
# Iterate the parsed responses rather than `data`: on a fresh run `data` still
# holds the raw response strings, so looping over it would walk single characters.
for sublist_index, sublist in enumerate(extracted_lists):
    for dict_index, dictionary in enumerate(sublist):
        if isinstance(dictionary, dict):  # Check if the item is a dictionary
            if dictionary.get("model") == "Tuned GPT3.5" and dictionary.get("rank") == 1:
                print(f"Sublist Index: {sublist_index}, Dictionary Index: {dict_index}, Dictionary: {dictionary}")
        else:
            print(f"Item at sublist index {sublist_index}, dictionary index {dict_index} is not a dictionary: {dictionary}")

In [None]:
# Reload the evaluation responses after they were parsed and written back.
# Other characters: "HAL 9000", "Gaston", "Naruto Uzumaki", "Willie Soke"
name = "Sasuke Uchiha"
# Other inputs: f'./result/{role_name}_memory_eval.json', f'./result/{name}_gpt_big5_eval.json', f'./result/{name}_4o.json'
file_path = './result/{}_dimension_eval.json'.format(name)
with open(file_path, encoding='utf-8') as file:
    data = json.loads(file.read())

In [None]:
#get winning rate
rank_1_count = count_rank_1(data)

# Divide by the number of evaluated questions instead of a fixed 100, so the
# rate stays correct when the evaluation set has a different size.
total_questions = len(data)
for model, count in rank_1_count.items():
    winning_rate = count / total_questions if total_questions else 0.0
    print(f"Model: {model}, Rank 1 count: {count}, Winning Rate:{winning_rate}")

In [None]:
# Report, for every model, the average of each evaluation dimension (two decimals).
average_scores = calculate_avg_dimension_scores(extracted_lists)
for model, scores in average_scores.items():
    report_lines = [f"{model}:"]
    report_lines.extend(f"  {score_type}: {avg_score:.2f}" for score_type, avg_score in scores.items())
    print("\n".join(report_lines))

In [None]:
#get questionnaire scores: mean result per Big-5 dimension (None when a dimension has no results)
#NOTE: this rebinds `average_scores`, replacing the multi-dimension averages from the previous cell
average_scores = calculate_big5_scores(extracted_lists)
print(average_scores)