In [1]:
import autogen
import json
import os
from sentence_transformers import SentenceTransformer
import numpy as np

# Load Data

In [2]:
# Root folder of the saved AgentEval criteria suggestions, relative to this notebook.
CRITERIA_ROOT = '../test/test_files/agenteval-in-out'


def _load_criteria_dir(directory):
    """Load every parseable JSON document stored directly inside `directory`.

    Files are visited in sorted name order. `os.listdir` returns entries in
    arbitrary order, and later cells slice the resulting list by position,
    so sorting keeps those slices identical from run to run.

    Args:
        directory (str): Folder holding one JSON document per file.

    Returns:
        list: The parsed documents, in file-name order. Files whose content
        cannot be decoded are reported and skipped.
    """
    loaded = []
    for file_name in sorted(os.listdir(directory)):
        path = os.path.join(directory, file_name)
        with open(path, 'r') as fptr:
            try:
                loaded.append(json.load(fptr))
            except ValueError as err:
                # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
                # Loading stays best-effort, but dropped files are no longer silent.
                print(f'Skipping {path}: {err}')
    return loaded


criteria_dict = {'autogen': [],
                 'react': [],
                 'vanilla_solver': [],
                 'task_based': []}

# Solution-based criteria: one sub-folder per agent.
for agent in ['autogen', 'react', 'vanilla_solver']:
    criteria_dict[agent] = _load_criteria_dir(
        os.path.join(CRITERIA_ROOT, 'solution_based', 'pointwise', agent))

# Task-based criteria live in a single flat folder.
criteria_dict['task_based'] = _load_criteria_dir(os.path.join(CRITERIA_ROOT, 'task_based'))

In [3]:
# Show the loaded criteria (a bare last expression is rendered as the cell output).
criteria_dict

{'autogen': [{'accuracy': {'description': 'The correctness of the solution provided for the math problem.',
    'accepted_values': ['incorrect', 'partially_correct', 'correct']},
   'completeness': {'description': 'The extent to which the solution covers all necessary steps and details.',
    'accepted_values': ['incomplete', 'mostly_complete', 'complete']},
   'clarity': {'description': 'How well the solution is explained and easy to understand.',
    'accepted_values': ['confusing', 'somewhat_clear', 'very_clear']},
   'presentation': {'description': 'The organization and presentation of the solution, including proper use of notation, symbols, and formatting.',
    'accepted_values': ['poor', 'fair', 'excellent']},
   'efficiency': {'description': 'The conciseness of the solution and the use of the most efficient method to solve the problem.',
    'accepted_values': ['inefficient',
     'moderately_efficient',
     'highly_efficient']}},
  {'accuracy': {'description': 'The correctnes

# Synonym Detection

In [4]:
def find_synonymous_in_list(word_list, threshold=0.75, model=None):
    """Group near-synonymous phrases using sentence-embedding cosine similarity.

    The list is scanned in order. Each phrase that has not already been
    attached to an earlier phrase becomes a key; its value is the sorted,
    de-duplicated list of all other entries whose cosine similarity with it
    is strictly greater than `threshold`. Phrases attached to a key are not
    given a key of their own.

    Args:
        word_list (list[str]): Phrases to compare.
        threshold (float): Similarity above which two phrases count as synonymous.
        model: Optional object exposing `encode(list_of_str, ...)` that returns one
            embedding row per phrase. When omitted, the pre-trained
            'paraphrase-MiniLM-L6-v2' SentenceTransformer is loaded. Passing a model
            lets callers reuse one instance across several calls.

    Returns:
        dict[str, list[str]]: Representative phrase -> sorted list of its synonyms
        (empty list when it has none). Empty input yields an empty dict.
    """
    if len(word_list) == 0:
        return {}

    if model is None:
        model = SentenceTransformer('paraphrase-MiniLM-L6-v2')  # Load a pre-trained SentenceTransformer model

    # Ask for NumPy output so the NumPy maths below never has to operate on
    # torch tensors (which cannot be handed to NumPy when they live on a GPU).
    embeddings = np.asarray(model.encode(word_list, convert_to_numpy=True))

    # Normalise each row once; the matrix product then holds every pairwise cosine.
    row_norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    unit_rows = embeddings / row_norms
    similarity = unit_rows @ unit_rows.T

    claimed = set()  # phrases already used as a key or listed as someone's synonym
    synonymous_dict = {}

    for i, word in enumerate(word_list):
        if word in claimed:
            continue
        matches = {
            other_word
            for j, other_word in enumerate(word_list)
            if j != i and similarity[i, j] > threshold
        }
        claimed.update(matches)
        claimed.add(word)
        synonymous_dict[word] = sorted(matches)  # sorted for a consistent output

    return synonymous_dict

# Filter Criteria using LLM

## V1: Tell it to extract k criteria (k = 25)

In [5]:
config_list = autogen.config_list_from_json(
    "OAI_CONFIG_LIST",
)

%env AUTOGEN_USE_DOCKER=0

env: AUTOGEN_USE_DOCKER=0


In [6]:
# System prompt for the criteria-summarizer agent: it is asked to condense a list
# of suggested criteria dictionaries into a single dictionary of 25 distinct
# criteria, returned as JSON only.
# NOTE(review): the prompt text contains misspellings ("dinstinguishable",
# "quantifieable", "decription", "preferrably mlti-graded"). They are left as-is
# here because the text is sent to the model verbatim; consider correcting them
# and regenerating the outputs.
criteria_summarizer_message_base = """You are a helpful assistant. You suggest criteria for evaluating different tasks. They should be dinstinguishable, quantifieable and not redundant.
    A criteria dictionary is a dictionary where the keys are the criteria. 
    The value of each key is a dictionary as follows {"description": criteria description , "accepted_values": possible accepted inputs for this key}
    You will be given a list of criteria dictionaries that others have suggested. They will be of varying qualities, and some of them will be synonymous.
    You should pick the best 25 distinct criteria for the task, and each criterion's corresponding best decription and range of accepted values.
    Your output should be a criteria dictionary containing the 25 distinct criteria you have picked.
    Make sure the keys are criteria for assessing the given task.  "accepted_values" include the acceptable inputs for each key that are fine-grained and preferrably mlti-graded levels. "description" includes the criterion description.
    Return only the dictionary, and in json format."""

In [7]:
# Proxy that submits prompts on the user's behalf: it never asks a human for
# input and sends no automatic replies, so each chat ends after one answer.
criteria_summarizer_user = autogen.UserProxyAgent(
    name="criteria_summarizer_user",
    human_input_mode="NEVER",
    max_consecutive_auto_reply=0,
)

# LLM-backed assistant driven by the summarizer system prompt.
criteria_summarizer = autogen.AssistantAgent(
    name="criteria_summarizer",
    system_message=criteria_summarizer_message_base,
    llm_config={"config_list": config_list},
)

def generate_summarized_criteria(message):
    """
    Ask the criteria summarizer to process `message` and hand back its reply text.

    The user proxy opens a chat with the summarizer agent; once the chat has
    ended, the "content" field of the proxy's last message is returned.

    Args:
    - message (str): Prompt forwarded to the criteria summarizer.

    Returns:
    - str: Content of the last message in the conversation.
    """
    criteria_summarizer_user.initiate_chat(criteria_summarizer, message=message)
    final_message = criteria_summarizer_user.last_message()
    return final_message["content"]

  super().__init__(


In [8]:
def get_sys_msg(crit_dict):
    """Build the prompt that presents the task and the suggested criteria.

    Args:
        crit_dict: Suggested criteria; its string form is appended directly
            after the "Suggested criteria" label.

    Returns:
        str: Three newline-terminated lines: task name, task description,
        and the suggested criteria.
    """
    prompt_lines = (
        "Task: Math problem solving.",
        "Task description: Given any question, the system needs to  solve the problem as consisely and  acccurately as possible.",
        f"Suggested criteria{crit_dict}",
        "",  # yields the trailing newline
    )
    return "\n".join(prompt_lines)

In [9]:
# First pass: summarize the first 25 'autogen' criteria dictionaries.
summarized_criteria_group1 = generate_summarized_criteria(get_sys_msg(criteria_dict['autogen'][:25]))

[33mcriteria_summarizer_user[0m (to criteria_summarizer):

Task: Math problem solving.
Task description: Given any question, the system needs to  solve the problem as consisely and  acccurately as possible.
Suggested criteria[{'accuracy': {'description': 'The correctness of the solution provided for the math problem.', 'accepted_values': ['incorrect', 'partially_correct', 'correct']}, 'completeness': {'description': 'The extent to which the solution covers all necessary steps and details.', 'accepted_values': ['incomplete', 'mostly_complete', 'complete']}, 'clarity': {'description': 'How well the solution is explained and easy to understand.', 'accepted_values': ['confusing', 'somewhat_clear', 'very_clear']}, 'presentation': {'description': 'The organization and presentation of the solution, including proper use of notation, symbols, and formatting.', 'accepted_values': ['poor', 'fair', 'excellent']}, 'efficiency': {'description': 'The conciseness of the solution and the use of the m

In [10]:
# Parse the summarizer's JSON reply in place.
# Guarded so that re-running this cell after a successful parse is a no-op:
# json.loads raises TypeError when handed the already-parsed object.
if isinstance(summarized_criteria_group1, str):
    summarized_criteria_group1 = json.loads(summarized_criteria_group1)

In [11]:
# convert to a list of words
conversion_dict_group1 = {}
for criteria in summarized_criteria_group1:
    conversion_dict_group1[' '.join(criteria.lower().split('_'))] = criteria

group1_list_of_criteria = list(conversion_dict_group1.keys())
    
group1_list_of_criteria

['accuracy',
 'completeness',
 'clarity',
 'presentation',
 'efficiency',
 'conciseness',
 'steps delineation',
 'response time',
 'steps logic',
 'notations',
 'step explanation',
 'error diagnosis',
 'clear explanation',
 'solution depth',
 'use of methods',
 'level appropriateness',
 'error handling',
 'symbol consistency',
 'creativity',
 'understandability',
 'calculation error',
 'terminology',
 'reliability',
 'adherence to standards']

In [12]:
synonymous_dict = find_synonymous_in_list(group1_list_of_criteria)
# Report the input size, the number of surviving keys, and the grouping itself.
print(
    f"Number of criteria {len(group1_list_of_criteria)}",
    f"Number of key in synonimous dict {len(synonymous_dict)}",
    synonymous_dict,
    sep="\n",
)

Number of criteria 24
Number of key in synonimous dict 24
{'accuracy': [], 'completeness': [], 'clarity': [], 'presentation': [], 'efficiency': [], 'conciseness': [], 'steps delineation': [], 'response time': [], 'steps logic': [], 'notations': [], 'step explanation': [], 'error diagnosis': [], 'clear explanation': [], 'solution depth': [], 'use of methods': [], 'level appropriateness': [], 'error handling': [], 'symbol consistency': [], 'creativity': [], 'understandability': [], 'calculation error': [], 'terminology': [], 'reliability': [], 'adherence to standards': []}


In [13]:
# Second pass: summarize the remaining 'autogen' criteria dictionaries (index 25 onward).
summarized_criteria_group2 = generate_summarized_criteria(get_sys_msg(criteria_dict['autogen'][25:]))

[33mcriteria_summarizer_user[0m (to criteria_summarizer):

Task: Math problem solving.
Task description: Given any question, the system needs to  solve the problem as consisely and  acccurately as possible.
Suggested criteria[{'Accuracy': {'description': 'The solution provided must accurately solve the given math problem and provide the correct final answer.', 'accepted_values': ['Correct', 'Partially correct', 'Incorrect']}, 'Clarity': {'description': 'The solution should be clear and well-organized, using proper mathematical notation and language.', 'accepted_values': ['Highly clear', 'Moderately clear', 'Unclear']}, 'Completeness': {'description': 'The solution should include all necessary steps and explain the reasoning behind each step.', 'accepted_values': ['Complete', 'Partially complete', 'Incomplete']}, 'Efficiency': {'description': 'The solution should use an efficient and concise approach to solve the problem.', 'accepted_values': ['Highly efficient', 'Moderately efficient

In [14]:
# Parse the summarizer's JSON reply in place.
# Guarded so that re-running this cell after a successful parse is a no-op:
# json.loads raises TypeError when handed the already-parsed object.
if isinstance(summarized_criteria_group2, str):
    summarized_criteria_group2 = json.loads(summarized_criteria_group2)

In [15]:
# convert to a list of words
conversion_dict_group2 = {}
for criteria in summarized_criteria_group2:
    conversion_dict_group2[' '.join(criteria.lower().split('_'))] = criteria

group2_list_of_criteria = list(conversion_dict_group2.keys())
    
group2_list_of_criteria

['accuracy',
 'clarity',
 'completeness',
 'efficiency',
 'relevance',
 'notation quality',
 'response time',
 'simplification',
 'step explanation',
 'handling constraints',
 'description',
 'speed',
 'solution conciseness',
 'problem difficulty',
 'appropriate methods',
 'problem type',
 'problem level',
 'adaptability',
 'method appropriateness',
 'completeness metric',
 'step by step logic',
 'use of appropriate methods',
 'solution approach',
 'presentation',
 'correct reasoning']

In [16]:
synonymous_dict = find_synonymous_in_list(group2_list_of_criteria)
# Report the input size, the number of surviving keys, and the grouping itself.
print(
    f"Number of criteria {len(group2_list_of_criteria)}",
    f"Number of key in synonimous dict {len(synonymous_dict)}",
    synonymous_dict,
    sep="\n",
)

Number of criteria 25
Number of key in synonimous dict 21
{'accuracy': [], 'clarity': [], 'completeness': ['completeness metric'], 'efficiency': [], 'relevance': [], 'notation quality': [], 'response time': [], 'simplification': [], 'step explanation': [], 'handling constraints': [], 'description': [], 'speed': [], 'solution conciseness': ['solution approach'], 'problem difficulty': [], 'appropriate methods': ['method appropriateness', 'use of appropriate methods'], 'problem type': [], 'problem level': [], 'adaptability': [], 'step by step logic': [], 'presentation': [], 'correct reasoning': []}


In [17]:
# Final pass: summarize the two per-group summaries together.
final_summarized_criteria = generate_summarized_criteria(get_sys_msg([summarized_criteria_group1, summarized_criteria_group2]))

[33mcriteria_summarizer_user[0m (to criteria_summarizer):

Task: Math problem solving.
Task description: Given any question, the system needs to  solve the problem as consisely and  acccurately as possible.
Suggested criteria[{'accuracy': {'description': 'The correctness of the solution provided for the math problem.', 'accepted_values': ['0% - Completely incorrect', '25% - Partially correct', '50% - Mostly correct', '75% - Almost correct', '100% - Completely correct']}, 'completeness': {'description': 'The extent to which the solution covers all aspects of the problem.', 'accepted_values': ['0% - Not complete', '25% - Partially complete', '50% - Mostly complete', '75% - Almost complete', '100% - Fully complete']}, 'clarity': {'description': 'The ease with which the solution can be understood by the target audience.', 'accepted_values': ['0% - Not clear', '25% - Somewhat clear', '50% - Fairly clear', '75% - Mostly clear', '100% - Very clear']}, 'presentation': {'description': 'The or

In [18]:
# Parse the summarizer's JSON reply in place.
# Guarded so that re-running this cell after a successful parse is a no-op:
# json.loads raises TypeError when handed the already-parsed object.
if isinstance(final_summarized_criteria, str):
    final_summarized_criteria = json.loads(final_summarized_criteria)

In [19]:
# convert to a list of words
conversion_dict_final = {}
for criteria in final_summarized_criteria:
    conversion_dict_final[' '.join(criteria.lower().split('_'))] = criteria

final_list_of_criteria = list(conversion_dict_final.keys())
    
final_list_of_criteria

['efficiency',
 'accuracy',
 'completeness',
 'clarity',
 'presentation',
 'steps delineation',
 'response time',
 'notations',
 'step explanation',
 'error handling',
 'use of methods',
 'level appropriateness',
 'solution depth',
 'terminology',
 'reliability',
 'calculation error',
 'creativity',
 'relevance',
 'notation quality',
 'simplification',
 'handling constraints',
 'problem type',
 'adaptability',
 'problem level',
 'solution approach',
 'correct reasoning']

In [20]:
synonymous_dict = find_synonymous_in_list(final_list_of_criteria)
# Report the input size, the number of surviving keys, and the grouping itself.
print(
    f"Number of criteria {len(final_list_of_criteria)}",
    f"Number of key in synonimous dict {len(synonymous_dict)}",
    synonymous_dict,
    sep="\n",
)

Number of criteria 26
Number of key in synonimous dict 25
{'efficiency': [], 'accuracy': [], 'completeness': [], 'clarity': [], 'presentation': [], 'steps delineation': [], 'response time': [], 'notations': ['notation quality'], 'step explanation': [], 'error handling': [], 'use of methods': [], 'level appropriateness': [], 'solution depth': [], 'terminology': [], 'reliability': [], 'calculation error': [], 'creativity': [], 'relevance': [], 'simplification': [], 'handling constraints': [], 'problem type': [], 'adaptability': [], 'problem level': [], 'solution approach': [], 'correct reasoning': []}


# Save data

In [21]:
# Keep one entry per key of synonymous_dict; each definition is looked up via
# the original key spelling recorded in conversion_dict_final.
final_criteria = {
    name: final_summarized_criteria[conversion_dict_final[name]]
    for name in synonymous_dict
}

In [22]:
# Show the filtered criteria that the next cell writes to disk.
final_criteria

{'efficiency': {'description': 'The conciseness of the solution and the use of the most efficient method to solve the problem.',
  'accepted_values': ['inefficient',
   'moderately_efficient',
   'highly_efficient']},
 'accuracy': {'description': 'The correctness of the solution provided for the math problem.',
  'accepted_values': ['0% - Completely incorrect',
   '25% - Partially correct',
   '50% - Mostly correct',
   '75% - Almost correct',
   '100% - Completely correct']},
 'completeness': {'description': 'The extent to which the solution covers all aspects of the problem.',
  'accepted_values': ['0% - Not complete',
   '25% - Partially complete',
   '50% - Mostly complete',
   '75% - Almost complete',
   '100% - Fully complete']},
 'clarity': {'description': 'The ease with which the solution can be understood by the target audience.',
  'accepted_values': ['0% - Not clear',
   '25% - Somewhat clear',
   '50% - Fairly clear',
   '75% - Mostly clear',
   '100% - Very clear']},
 'pre

In [23]:
# Persist the filtered criteria as pretty-printed JSON.
# Create the target folder first: open(..., "w") does not create missing
# parent directories and would raise FileNotFoundError without it.
os.makedirs("output", exist_ok=True)
with open("output/final_filtered_criteria.json", "w") as outfile:
    json.dump(final_criteria, outfile, indent=4)