In [1]:

import nest_asyncio
nest_asyncio.apply()

import autogen
import json
import pandas as pd
import concurrent.futures
from datetime import datetime
from typing import Union, List, Dict

In [2]:
questions = [
"""
Create a comprehensive table comparing representation learning models for polymers. Include the following information for each model:

Model name
Year introduced
Key authors/research group
Type of representation (e.g., graph-based, sequence-based, 3D structure-based)
Input features used
Dimensionality of the learned representation
Training data type and size
Key applications (e.g., property prediction, polymer design)
Notable advantages
Limitations or challenges

After the table, provide a brief analysis (2-3 paragraphs) of the current trends in polymer representation learning, highlighting the most promising approaches and areas for future research. Please cite relevant papers for each model and in your analysis.
""",
"""
Create a comparative analysis of machine learning models used for predicting glass transition temperature (Tg) and melting temperature (Tm) of polymers. Include the following in your analysis:

A table comparing at least 5 different models, with the following information for each:
a) Model name and type (e.g., Random Forest, Neural Network, etc.)
b) Year introduced
c) Key authors/research group
d) Input features or representation used
e) Dataset size and composition
f) Accuracy metrics for Tg prediction (e.g., R², RMSE, MAE)
g) Accuracy metrics for Tm prediction (e.g., R², RMSE, MAE)
h) Computational complexity or training time (if available)
A brief discussion (2-3 paragraphs) on:
a) The overall trends in prediction accuracy for Tg vs Tm
b) Factors that contribute to higher accuracy in certain models
c) Challenges in predicting these properties accurately
d) Recent advancements or novel approaches in the field
A short section on potential areas for improvement and future research directions.

Please provide citations for the models and studies mentioned in your analysis.
"""
]

models = ['gpt-4', 'gpt-4o', 'gpt-4o-mini', 'gpt-35-turbo', 'gpt-35-turbo-16k']

# input_dir = '/home/azureuser/autogen_uscases/graphrag_memo/output/20240830-145930/artifacts'
input_dir = '/home/azureuser/autogen_uscases/autosearch/notebooks/graphrag/output/20240903-050706/artifacts'
# input_dir = '/home/azureuser/autogen_uscases/graphrag_memo/output/20240830-145930/artifacts'
lancedb_uri = "/home/azureuser/autogen_uscases/autosearch/notebooks/graphrag/output/20240903-050706/artifacts/lancedb"
embedding_model = 'text-embedding-3-small'



In [3]:
config_list = autogen.config_list_from_json(
    "../OAI_CONFIG_LIST-sweden-505",
    file_location=".",
    filter_dict={
        "model": ["gpt-4"]
    },
)

llm_config = {
    "config_list": config_list,
    "timeout": 120,
}

# Create the GPT-4 agent
gpt4_agent = autogen.AssistantAgent(
    name="gpt4_evaluator",
    llm_config=llm_config
)

# Create a user proxy agent
user_proxy = autogen.UserProxyAgent(
    name="user_proxy",
    human_input_mode="NEVER",
    max_consecutive_auto_reply=10,
    is_termination_msg=lambda x: x.get("content", "").rstrip().endswith("TERMINATE"),
    code_execution_config=False,
)

# The create_evaluation_prompt, parse_evaluation_table, is_header_or_separator_row, and parse_score functions remain the same
def create_evaluation_prompt(result: Dict) -> str:
    prompt = f"""
    Evaluate and compare the following search results for the question: "{result['question']}"
    
    Model used: {result['model']}
    
    Local search result:
    {result['local_result']}
    
    Global search result:
    {result['global_result']}
    
    Please provide a detailed evaluation considering the following criteria:
    1. Relevance: How well does each result address the question?
    2. Completeness: Which result provides more comprehensive information?
    3. Accuracy: Are there any apparent errors or inconsistencies in either result?
    4. Clarity: Which result is easier to understand?
    5. Unique insights: Does either result provide unique information not present in the other?
    6. Coherence: How well-structured and logically connected is the information in each result?
    7. Source diversity: Does the result appear to draw from a variety of sources or perspectives?
    
    For each criterion, provide a brief explanation of your reasoning, followed by a score for both Local and Global results on a scale of 1-10.
    
    After your analysis, provide an overall score for each result, taking into account all the above criteria.
    
    Finally, conclude with a brief statement on which result (Local or Global) you think is better for this particular question and why.
    
    At the end of your analysis, provide a structured table with all the numerical scores using the following format:

    EVALUATION TABLE
    | Criterion        | Local Score | Global Score |
    |------------------|-------------|--------------|
    | Relevance        |             |              |
    | Completeness     |             |              |
    | Accuracy         |             |              |
    | Clarity          |             |              |
    | Unique Insights  |             |              |
    | Coherence        |             |              |
    | Source Diversity |             |              |
    | Overall          |             |              |

    Fill in the table with your numerical scores, ensuring all scores are between 1 and 10.

    End your evaluation with the word TERMINATE.
    """
    return prompt

import math
import re

def parse_evaluation_table(evaluation: str) -> Dict[str, Dict[str, float]]:
    try:
        table_text = evaluation.split("EVALUATION TABLE")[1].split("TERMINATE")[0].strip()
        lines = table_text.split("\n")
        scores = {}
        for line in lines:
            parts = [part.strip() for part in line.split("|") if part.strip()]
            if len(parts) >= 3 and not is_header_or_separator_row(parts):
                criterion = parts[0].lower().replace(" ", "_")
                try:
                    local_score = parse_score(parts[-2])  # Second to last part
                    global_score = parse_score(parts[-1])  # Last part
                    scores[criterion] = {"local": local_score, "global": global_score}
                except ValueError as e:
                    print(f"Warning: {str(e)} for criterion {criterion}")
        
        if not scores:
            raise ValueError("No valid scores found in the evaluation table")
        
        return scores
    except Exception as e:
        print(f"Error parsing evaluation table: {str(e)}")
        return {}

def is_header_or_separator_row(parts: List[str]) -> bool:
    # Check if it's a header row
    if any(header.lower() in " ".join(parts).lower() for header in ["criterion", "local score", "global score"]):
        return True
    
    # Check if it's a separator row (contains only dashes, possibly with colons for alignment)
    if all(re.match(r'^[-:]+$', part) for part in parts):
        return True
    
    return False

def extract_score(score_dict: Dict[str, str], model_name: str) -> Union[float, None]:
    # Normalize the model name
    model_name_lower = model_name.lower()
    
    # Define possible column names
    possible_names = [
        f"{model_name} Score",
        model_name,
        f"{model_name_lower} score",
        "Local Score" if "local" in model_name_lower else "Global Score"
    ]
    
    # Try to find a matching key
    for name in possible_names:
        if name in score_dict:
            return parse_score_value(score_dict[name])
    
    # If no exact match, try partial matches
    for key in score_dict.keys():
        if model_name_lower in key.lower():
            return parse_score_value(score_dict[key])
    
    # If still no match, return None
    return None

def parse_score_value(value: str) -> Union[float, None]:
    # Remove any asterisks or other non-numeric characters
    clean_value = re.sub(r'[^0-9./NA]', '', value.upper())
    
    if clean_value in ['NA', 'N/A', '']:
        return None
    elif '/' in clean_value:  # Handle fractional scores
        num, denom = clean_value.split('/')
        return float(num) / float(denom)
    elif '.' in clean_value:  # Handle decimal scores
        return float(clean_value)
    elif clean_value.isdigit():  # Handle integer scores
        return float(clean_value)
    else:
        # If it's a complex string (e.g., "1 (approx.)"), try to extract the first number
        match = re.search(r'\d+(\.\d+)?', value)
        if match:
            return float(match.group())
        else:
            return None 
    
    
def evaluate_single_result(result: Dict) -> Dict:
    print("=============================================")
    print(f"Evaluating question: {result['question']}, model: {result['model']}")
    print("=============================================")
    prompt = create_evaluation_prompt(result)
    
    user_proxy.initiate_chat(
        gpt4_agent,
        message=prompt,
    )
    
    evaluation = user_proxy.last_message()["content"]
    
    scores = parse_evaluation_table(evaluation)
    
    if not scores:
        print(f"Warning: Failed to parse scores for question: {result['question']}, model: {result['model']}")
        scores = {criterion: {"local": 0, "global": 0} for criterion in 
                  ["relevance", "completeness", "accuracy", "clarity", "unique_insights", "coherence", "source_diversity", "overall"]}
    
    return {
        "question": result["question"],
        "model": result["model"],
        "local_result": result["local_result"],
        "global_result": result["global_result"],
        "prompt": prompt,
        "evaluation": evaluation,
        "scores": scores
    }

def evaluate_results(results: Union[List[Dict], pd.DataFrame]) -> pd.DataFrame:
    evaluations = []
    
    def evaluate_and_store(result: Dict) -> None:
        evaluation = evaluate_single_result(result)
        evaluations.append(evaluation)
    
    # Convert to DataFrame if it's a list
    if isinstance(results, list):
        results_df = pd.DataFrame(results)
    else:
        results_df = results
    
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
        futures = [executor.submit(evaluate_and_store, row.to_dict()) for _, row in results_df.iterrows()]
        concurrent.futures.wait(futures)
    
    # Convert evaluations to DataFrame
    eval_df = pd.DataFrame(evaluations)
    
    # Expand the scores dictionary into separate columns
    score_df = pd.json_normalize(eval_df['scores'])
    score_df.columns = [f'score_{col}' for col in score_df.columns]
    
    # Combine the evaluation DataFrame with the scores DataFrame
    eval_df = pd.concat([eval_df.drop(columns=['scores']), score_df], axis=1)
    
    return eval_df

In [4]:

results_json = 'results-20240830-145930.json'
results = pd.read_json(results_json)

# Run the evaluation
evaluations_df = evaluate_results(results)

# Generate a unique filename with timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f'evaluations_{timestamp}.csv'

# Save the evaluations to a new CSV file
evaluations_df.to_csv(filename, index=False)

print(f"Evaluations completed and saved to {filename}")

# If you still want to save as JSON
json_filename = f'evaluations_{timestamp}.json'
evaluations_df.to_json(json_filename, orient='records', indent=4)
print(f"Evaluations also saved as JSON to {json_filename}")

Evaluating question: 
Create a comprehensive table comparing representation learning models for polymers. Include the following information for each model:

Model name
Year introduced
Key authors/research group
Type of representation (e.g., graph-based, sequence-based, 3D structure-based)
Input features used
Dimensionality of the learned representation
Training data type and size
Key applications (e.g., property prediction, polymer design)
Notable advantages
Limitations or challenges

After the table, provide a brief analysis (2-3 paragraphs) of the current trends in polymer representation learning, highlighting the most promising approaches and areas for future research. Please cite relevant papers for each model and in your analysis.
, model: gpt-4
[33muser_proxy[0m (to gpt4_evaluator):


    Evaluate and compare the following search results for the question: "
Create a comprehensive table comparing representation learning models for polymers. Include the following information for 

In [5]:
def compare_local_global_per_model(evaluations_df: pd.DataFrame) -> pd.DataFrame:
    # Melt the DataFrame to get all scores in one column
    melted = pd.melt(evaluations_df, 
                     id_vars=['model'], 
                     value_vars=[col for col in evaluations_df.columns if col.startswith('score_')],
                     var_name='criterion', 
                     value_name='score')
    
    # Split the criterion into base criterion and search type
    melted[['criterion', 'search_type']] = melted['criterion'].str.split('_', n=2, expand=True)[1].str.split('.', n=2, expand=True)
    
    # Pivot the DataFrame to get local and global scores side by side
    pivoted = melted.pivot_table(values='score', 
                                 index=['model', 'search_type'], 
                                 columns='criterion', 
                                 aggfunc='mean')
    
    # Swap levels to get search_type as the first level
    pivoted = pivoted.swaplevel(0, 1, axis=0)
    
    # Sort index to group local and global together
    pivoted = pivoted.sort_index()
    
    return pivoted


In [6]:
def compare_across_models(evaluations_df: pd.DataFrame, search_type: str) -> pd.DataFrame:
    # Filter columns for the specified search type
    score_columns = [col for col in evaluations_df.columns if col.startswith(f'score_') and col.endswith(f'.{search_type}')]
    
    # Select relevant columns and group by model
    grouped = evaluations_df[['model'] + score_columns].groupby('model')
    
    # Calculate mean scores for each model
    mean_scores = grouped.mean()
    
    # Reset index to make 'model' a regular column
    mean_scores = mean_scores.reset_index()
    
    # Rename columns for clarity
    mean_scores.columns = [col.replace(f'score_', '').replace(f'.{search_type}', '') for col in mean_scores.columns]
    
    return mean_scores


In [7]:
df_questions = evaluations_df[evaluations_df['question'] == questions[0]]

# Compare Local and Global search for each model
local_global_comparison = compare_local_global_per_model(df_questions)
print(f"question: {questions[0]}")
print("\nLocal vs Global comparison for each model:")
local_global_comparison

question: 
Create a comprehensive table comparing representation learning models for polymers. Include the following information for each model:

Model name
Year introduced
Key authors/research group
Type of representation (e.g., graph-based, sequence-based, 3D structure-based)
Input features used
Dimensionality of the learned representation
Training data type and size
Key applications (e.g., property prediction, polymer design)
Notable advantages
Limitations or challenges

After the table, provide a brief analysis (2-3 paragraphs) of the current trends in polymer representation learning, highlighting the most promising approaches and areas for future research. Please cite relevant papers for each model and in your analysis.


Local vs Global comparison for each model:


Unnamed: 0_level_0,criterion,accuracy,clarity,coherence,completeness,overall,relevance
search_type,model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
global,gpt-35-turbo,,2.0,1.0,1.0,,1.0
global,gpt-35-turbo-16k,10.0,10.0,1.0,1.0,,1.0
global,gpt-4,10.0,4.0,2.0,2.0,3.0,3.0
global,gpt-4-mini,,1.0,1.0,1.0,1.0,1.0
global,gpt-4o,1.0,1.0,1.0,1.0,1.0,1.0
local,gpt-35-turbo,7.0,8.0,8.0,5.0,,7.0
local,gpt-35-turbo-16k,7.0,8.0,7.0,5.0,,6.0
local,gpt-4,10.0,7.0,6.0,4.0,5.0,5.0
local,gpt-4-mini,,8.0,7.0,5.0,6.0,7.0
local,gpt-4o,5.0,6.0,6.0,4.0,5.0,4.0


In [23]:
df_questions = evaluations_df[evaluations_df['question'] == questions[1]]

# Compare Local and Global search for each model
local_global_comparison = compare_local_global_per_model(df_questions)
print(f"question: {questions[0]}")
print("\nLocal vs Global comparison for each model:")
local_global_comparison

question: 
Create a comprehensive table comparing representation learning models for polymers. Include the following information for each model:

Model name
Year introduced
Key authors/research group
Type of representation (e.g., graph-based, sequence-based, 3D structure-based)
Input features used
Dimensionality of the learned representation
Training data type and size
Key applications (e.g., property prediction, polymer design)
Notable advantages
Limitations or challenges

After the table, provide a brief analysis (2-3 paragraphs) of the current trends in polymer representation learning, highlighting the most promising approaches and areas for future research. Please cite relevant papers for each model and in your analysis.


Local vs Global comparison for each model:


Unnamed: 0_level_0,criterion,accuracy,clarity,coherence,completeness,overall,relevance
search_type,model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
global,gpt-35-turbo,6.0,5.0,3.0,2.0,3.1,3.0
global,gpt-35-turbo-16k,5.0,6.0,5.0,3.0,4.3,5.0
global,gpt-4,5.0,6.0,5.0,2.0,3.7,3.0
global,gpt-4-mini,,1.0,1.0,1.0,1.0,1.0
global,gpt-4o,,,,1.0,1.0,1.0
local,gpt-35-turbo,8.0,8.0,8.0,7.0,7.4,8.0
local,gpt-35-turbo-16k,4.0,7.0,7.0,5.0,5.4,6.0
local,gpt-4,7.0,9.0,9.0,8.0,8.1,9.0
local,gpt-4-mini,6.0,6.0,6.0,5.0,5.6,7.0
local,gpt-4o,5.0,7.0,7.0,5.0,6.0,6.0


In [22]:
# use iwidget to display the comparison
# local_global_comparison.loc[local_global_comparison.index.get_level_values('model') == 'gpt-4']

from IPython.display import display
import ipywidgets as widgets
from ipywidgets import interact, interact_manual

@interact
def show_comparison(model=models):
    display(local_global_comparison.loc[local_global_comparison.index.get_level_values('model') == model])



interactive(children=(Dropdown(description='model', options=('gpt-4', 'gpt-4o', 'gpt-4o-mini', 'gpt-35-turbo',…

In [9]:
df_questions = evaluations_df[evaluations_df['question'] == questions[0]]

# Compare local scores across all models
local_comparison = compare_across_models(df_questions, 'local')
local_comparison

Unnamed: 0,model,relevance,completeness,accuracy,clarity,unique_insights,coherence,source_diversity,overall
0,gpt-35-turbo,7.0,5.0,7.0,8.0,6.0,8.0,4.0,
1,gpt-35-turbo-16k,6.0,5.0,7.0,8.0,6.0,7.0,3.0,
2,gpt-4,5.0,4.0,10.0,7.0,3.0,6.0,1.0,5.0
3,gpt-4-mini,7.0,5.0,,8.0,5.0,7.0,,6.0
4,gpt-4o,4.0,4.0,5.0,6.0,5.0,6.0,3.0,5.0


In [10]:
df_questions = evaluations_df[evaluations_df['question'] == questions[1]]

# Compare local scores across all models
local_comparison = compare_across_models(df_questions, 'local')
local_comparison

Unnamed: 0,model,relevance,completeness,accuracy,clarity,unique_insights,coherence,source_diversity,overall
0,gpt-35-turbo,8.0,7.0,8.0,8.0,7.0,8.0,6.0,7.4
1,gpt-35-turbo-16k,6.0,5.0,4.0,7.0,6.0,7.0,3.0,5.4
2,gpt-4,9.0,8.0,7.0,9.0,8.0,9.0,7.0,8.1
3,gpt-4-mini,7.0,5.0,6.0,6.0,5.0,6.0,4.0,5.6
4,gpt-4o,6.0,5.0,5.0,7.0,6.0,7.0,5.0,6.0


In [11]:
df_questions = evaluations_df[evaluations_df['question'] == questions[0]]

# Compare global scores across all models
local_comparison = compare_across_models(df_questions, 'global')
local_comparison

Unnamed: 0,model,relevance,completeness,accuracy,clarity,unique_insights,coherence,source_diversity,overall
0,gpt-35-turbo,1.0,1.0,,2.0,1.0,1.0,1.0,
1,gpt-35-turbo-16k,1.0,1.0,10.0,10.0,1.0,1.0,1.0,
2,gpt-4,3.0,2.0,10.0,4.0,1.0,2.0,1.0,3.0
3,gpt-4-mini,1.0,1.0,,1.0,1.0,1.0,,1.0
4,gpt-4o,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [12]:
df_questions = evaluations_df[evaluations_df['question'] == questions[1]]

# Compare global scores across all models
local_comparison = compare_across_models(df_questions, 'global')
local_comparison

Unnamed: 0,model,relevance,completeness,accuracy,clarity,unique_insights,coherence,source_diversity,overall
0,gpt-35-turbo,3.0,2.0,6.0,5.0,1.0,3.0,2.0,3.1
1,gpt-35-turbo-16k,5.0,3.0,5.0,6.0,3.0,5.0,3.0,4.3
2,gpt-4,3.0,2.0,5.0,6.0,3.0,5.0,2.0,3.7
3,gpt-4-mini,1.0,1.0,,1.0,1.0,1.0,1.0,1.0
4,gpt-4o,1.0,1.0,,,1.0,,1.0,1.0


In [13]:

# Compare Local and Global search for each model
local_global_comparison = compare_local_global_per_model(evaluations)
print("\nLocal vs Global comparison for each model:")
print(json.dumps(local_global_comparison, indent=2))

# Compare Local search across all models
local_comparison = compare_across_models(evaluations, "local")
print("\nLocal search comparison across all models:")
print(json.dumps(local_comparison, indent=2))

# Compare Global search across all models
global_comparison = compare_across_models(evaluations, "global")
print("\nGlobal search comparison across all models:")
print(json.dumps(global_comparison, indent=2))

# Calculate overall average scores
def calculate_average_scores(evaluations):
    total_scores = {
        "local": defaultdict(float),
        "global": defaultdict(float)
    }
    count = len(evaluations)

    for eval in evaluations:
        for criterion, scores in eval["scores"].items():
            total_scores["local"][criterion] += scores["local"]
            total_scores["global"][criterion] += scores["global"]

    avg_scores = {
        "local": {criterion: score / count for criterion, score in total_scores["local"].items()},
        "global": {criterion: score / count for criterion, score in total_scores["global"].items()}
    }

    return avg_scores

# Calculate and print overall average scores
average_scores = calculate_average_scores(evaluations)
print("\nOverall Average Scores:")
print(json.dumps(average_scores, indent=2))

NameError: name 'evaluations' is not defined

In [None]:
import math
import re

def parse_evaluation_table(evaluation: str) -> Dict[str, Dict[str, float]]:
    try:
        table_text = evaluation.split("EVALUATION TABLE")[1].split("TERMINATE")[0].strip()
        lines = table_text.split("\n")
        scores = {}
        for line in lines:
            parts = [part.strip() for part in line.split("|") if part.strip()]
            if len(parts) >= 3 and not is_header_or_separator_row(parts):
                criterion = parts[0].lower().replace(" ", "_")
                try:
                    local_score = parse_score(parts[-2])  # Second to last part
                    global_score = parse_score(parts[-1])  # Last part
                    scores[criterion] = {"local": local_score, "global": global_score}
                except ValueError as e:
                    print(f"Warning: {str(e)} for criterion {criterion}")
        
        if not scores:
            raise ValueError("No valid scores found in the evaluation table")
        
        return scores
    except Exception as e:
        print(f"Error parsing evaluation table: {str(e)}")
        return {}

def is_header_or_separator_row(parts: List[str]) -> bool:
    # Check if it's a header row
    if any(header.lower() in " ".join(parts).lower() for header in ["criterion", "local score", "global score"]):
        return True
    
    # Check if it's a separator row (contains only dashes, possibly with colons for alignment)
    if all(re.match(r'^[-:]+$', part) for part in parts):
        return True
    
    return False

def parse_score(score_str: str) -> float:
    score_str = score_str.strip().lower()
    if score_str in ['n/a', ''] or not re.search(r'\d', score_str):
        return math.nan  # Use NaN to represent N/A, empty scores, or non-numeric entries
    try:
        score = float(score_str)
        if 1 <= score <= 10:
            return score
        else:
            raise ValueError(f"Score out of range: {score}")
    except ValueError:
        raise ValueError(f"Invalid score format: {score_str}")