In [None]:
import openai
import csv
from time import sleep
import numpy as np

# Create the OpenAI client used by every request in this script.
# The API key is deliberately not written into the source: when `api_key` is
# omitted, the SDK reads it from the OPENAI_API_KEY environment variable and
# raises an error if that variable is not set. Export the key before running.
client = openai.OpenAI()

# Method name for each candidate number (1-based). The numbers are the ones
# used for the "Output<N>" labels printed and written further down.
output_names = dict(enumerate(
    (
        "MindMap",
        "GPT3.5",
        "BM25_retrieval",
        "Embedding_retrieval",
        "KG_retrieval",
        "KG_self-consistency",
        "KG_TreeOfThoughts",
        "GPT4",
    ),
    start=1,
))

def prompt_ranking(reference_text, outputs, model_name):
    """Ask a chat model to rank the eight candidate answers against a reference.

    Args:
        reference_text: Gold-label answer inserted into the reference slot of
            the prompt.
        outputs: Mapping with the keys "output1" ... "output8"; each value is
            inserted into the correspondingly numbered candidate slot.
        model_name: Model identifier forwarded to the chat completions call.

    Returns:
        The content of the first returned choice with surrounding whitespace
        stripped. The prompt asks for a comma-separated ranking such as
        "5,8,6,7,3,2,4,1", but the reply is returned unparsed and unvalidated.
    """
    ranking_request = """
You are an expert medical evaluator.

Your task is to compare 8 candidate outputs against the following reference (gold label) answer:
[Reference Answer]
{reference}

Here are the candidate outputs:
1. {output1}
2. {output2}
3. {output3}
4. {output4}
5. {output5}
6. {output6}
7. {output7}
8. {output8}

Evaluation criteria:
✅ Accuracy: How factually correct is the output compared to the reference?
✅ Coverage: Does the output include all key drug and test recommendations?
✅ Relevance: Are the details consistent with the disease context?

Instructions:
- Rank the outputs from best (most aligned with the reference) to worst.
- Ignore language style, grammar, or fluency. Focus only on factual alignment.
- Strictly output only the final ranking as 8 numbers separated by commas.
- No explanations, no extra text, no notes.

Example format (for outputs 5,8,6,7,3,2,4,1 in order):
5,8,6,7,3,2,4,1
""".format(reference=reference_text, **outputs)

    # Fixed system message followed by the filled-in evaluation request.
    conversation = [
        {"role": "system", "content": "You are an excellent evaluator."},
        {"role": "user", "content": ranking_request},
    ]
    completion = client.chat.completions.create(
        model=model_name,
        messages=conversation,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()

# Both CSV files live in the same results directory.
results_dir = r'D:\liulanqi1\MindMap-main\MindMap-main\实验结果'
input_file = results_dir + '\\' + 'output.csv'                    # CSV that is read
output_file_details = results_dir + '\\' + 'detailed_rankings.csv'  # per-row rankings written here

# Number of independent ranking requests sent per (row, judge model) pair;
# the resulting rank positions are averaged.
num_votes = 5

# Judge models as (API model name, label used in the printed and written results).
model_labels = [
    ("gpt-4o", "GPT_4o"),
    ("gpt-3.5-turbo", "GPT_3.5_turbo"),
]

def _parse_ranking(ranking_str, num_outputs=8):
    """Parse a model reply such as "5,8,6,7,3,2,4,1" into a validated ranking.

    Args:
        ranking_str: Raw text returned by the judge model.
        num_outputs: Number of candidates that must appear in the ranking.

    Returns:
        A list of ints, best first, that is a permutation of 1..num_outputs.

    Raises:
        ValueError: If an item is not an integer, or the reply is not a
            permutation of 1..num_outputs (wrong count, duplicate, or a
            number out of range).
    """
    ranking = [int(item.strip()) for item in ranking_str.split(',')]
    if sorted(ranking) != list(range(1, num_outputs + 1)):
        raise ValueError(
            f"expected a permutation of 1..{num_outputs}, got {ranking_str!r}"
        )
    return ranking


# Number of candidates per row; must match the numbered slots that
# prompt_ranking fills in ("output1" ... "output8").
num_outputs = len(output_names)
# row[1] holds the reference answer and row[2:] the candidates, so a usable
# row needs at least this many columns.
min_columns = num_outputs + 2

with open(input_file, 'r', newline="", encoding='utf-8') as f_input, \
     open(output_file_details, 'w', newline='', encoding='utf-8') as f_output:

    reader = csv.reader(f_input)
    writer = csv.writer(f_output)

    # Skip the input header line; the default avoids StopIteration on an empty file.
    header = next(reader, None)
    writer.writerow(["Model", "RowNumber", "FinalRanking (with names)", "AverageRanks (with names)"])  # output file header

    # Per judge model: sum over processed rows of each candidate's average rank.
    all_results = {model_label: np.zeros(num_outputs) for _, model_label in model_labels}
    count_rows = 0

    for row in reader:
        if len(row) < min_columns:
            # csv.reader yields [] for blank lines; skip those silently and
            # report any other short row instead of crashing with IndexError.
            if row:
                print(f"Skipping line {reader.line_num}: expected at least "
                      f"{min_columns} columns, got {len(row)}")
            continue

        count_rows += 1
        reference_text = row[1].strip("\n")
        # "output1" comes from row[2], ..., "output8" from row[9].
        outputs = {
            f"output{k}": row[k + 1].strip("\n")
            for k in range(1, num_outputs + 1)
        }

        for model_name, model_label in model_labels:
            rank_sums = np.zeros(num_outputs)
            for _ in range(num_votes):
                # Retry until the model returns a valid ranking. Validation
                # happens before rank_sums is touched, so a bad reply can
                # never leave a partially counted vote behind.
                while True:
                    try:
                        ranking_str = prompt_ranking(reference_text, outputs, model_name)
                        ranking = _parse_ranking(ranking_str, num_outputs)
                    except Exception as e:
                        # Deliberately broad: API errors and malformed replies
                        # are both handled by waiting and asking again.
                        print(f"Error with {model_name}: {e}, retrying after sleep...")
                        sleep(40)
                    else:
                        break

                # Position 1 is the best rank; add it to the candidate's total.
                for position, output_num in enumerate(ranking, start=1):
                    rank_sums[output_num - 1] += position

            avg_ranks = rank_sums / num_votes
            all_results[model_label] += avg_ranks

            # Lower average rank is better; sorted() is stable, so ties keep
            # the original candidate order.
            sorted_outputs = sorted(range(1, num_outputs + 1), key=lambda x: avg_ranks[x - 1])
            final_ranking_str = ', '.join(f"{output_names[x]}(Output{x})" for x in sorted_outputs)
            avg_ranks_str = ', '.join(
                f"{output_names[k]}(Output{k}): {avg:.2f}"
                for k, avg in enumerate(avg_ranks, start=1)
            )

            print(f"{model_label} Row {count_rows} Final Ranking: {final_ranking_str}")
            writer.writerow([model_label, count_rows, final_ranking_str, avg_ranks_str])

    # After processing all rows, calculate final average and print
    if count_rows == 0:
        # Nothing to average; dividing by zero would only print NaN values.
        print("No data rows were processed; skipping the overall summary.")
    else:
        for model_label, summed_ranks in all_results.items():
            final_avg = summed_ranks / count_rows
            sorted_outputs = sorted(range(1, num_outputs + 1), key=lambda x: final_avg[x - 1])

            print(f"=== {model_label} 按平均排名排序 ===")
            for i, output_idx in enumerate(sorted_outputs, 1):
                print(f"{i}. {output_names[output_idx]} (Output{output_idx}) 平均排名: {final_avg[output_idx - 1]:.2f}")
            print()

GPT_4o Row 1 Final Ranking: Embedding_retrieval(Output4), BM25_retrieval(Output3), KG_retrieval(Output5), MindMap(Output1), KG_TreeOfThoughts(Output7), KG_self-consistency(Output6), GPT3.5(Output2), GPT4(Output8)
GPT_3.5_turbo Row 1 Final Ranking: KG_self-consistency(Output6), Embedding_retrieval(Output4), BM25_retrieval(Output3), KG_retrieval(Output5), MindMap(Output1), GPT3.5(Output2), KG_TreeOfThoughts(Output7), GPT4(Output8)
GPT_4o Row 2 Final Ranking: KG_self-consistency(Output6), KG_retrieval(Output5), GPT4(Output8), Embedding_retrieval(Output4), GPT3.5(Output2), BM25_retrieval(Output3), MindMap(Output1), KG_TreeOfThoughts(Output7)
GPT_3.5_turbo Row 2 Final Ranking: GPT3.5(Output2), BM25_retrieval(Output3), GPT4(Output8), MindMap(Output1), Embedding_retrieval(Output4), KG_self-consistency(Output6), KG_TreeOfThoughts(Output7), KG_retrieval(Output5)
GPT_4o Row 3 Final Ranking: GPT3.5(Output2), GPT4(Output8), KG_retrieval(Output5), KG_self-consistency(Output6), BM25_retrieval(Output