In [6]:
import math
import random
import threading

import pandas as pd
from statsmodels.stats.proportion import proportion_confint
from pprint import pprint
import json

from utils.utils import PChatResponse

# NOTE(review): `lock` is not referenced anywhere in the visible part of this notebook;
# presumably a leftover from the Document Chat application code — confirm before removing.
lock = threading.Lock()

# LLM identifiers that can be drawn as candidates by select_two_candidates();
# get_leaderboard(open_source_only=True) also uses this list as its allow-list.
HP_LLM = [
    "mistralai--mistral-large-instruct",
    "mistralai--mistral-small-instruct",
    "meta--llama3.1-70b-instruct",
]
# top_k values that can be drawn as candidates when RAG is enabled.
HP_TOP_K = [4, 6, 8, 10, 12, 14]

# Pairwise feedback export. The cells below read the columns `choices`,
# `feedbackID` and `selectedChoiceIndex` from it.
feedback_data = pd.read_csv("llm_feedback_2025-08-27.csv")

# 1. Code for Choice Sampling
This is the code used in Document Chat to sample pairs of candidate configurations (LLM and `top_k`).

In [8]:
def select_two_candidates(rag_enabled: bool = True) -> list[dict]:
    """Draw two distinct candidate configurations for a pairwise comparison.

    Args:
        rag_enabled: When true, both the LLM and ``top_k`` are drawn at random
            from ``HP_LLM`` / ``HP_TOP_K`` and the second candidate is redrawn
            until it differs from the first. When false, two different LLMs are
            drawn and both candidates use the fixed ``top_k`` of 12.

    Returns:
        A two-element list of dicts, each with the keys ``"llm"`` and ``"top_k"``.
    """
    if rag_enabled:
        first = {"llm": random.choice(HP_LLM), "top_k": random.choice(HP_TOP_K)}
        second = first
        # Rejection sampling: keep drawing until the pair is not identical.
        while second == first:
            second = {"llm": random.choice(HP_LLM), "top_k": random.choice(HP_TOP_K)}
        return [first, second]

    default_top_k = 12  # 12 is the default top_k in Document Chat
    # random.sample draws without replacement, so the two LLMs always differ.
    return [{"llm": llm, "top_k": default_top_k} for llm in random.sample(HP_LLM, 2)]


# Example draw with RAG enabled (the default). No random seed is set, so the
# printed pair differs between runs.
print(select_two_candidates())

[{'llm': 'meta--llama3.1-70b-instruct', 'top_k': 14}, {'llm': 'mistralai--mistral-small-instruct', 'top_k': 4}]


# 2. Mask Feedback Data

In [7]:
def mask(row):
    """Replace the confidential fields of every choice in a feedback row.

    The ``choices`` cell may hold the list of choices JSON-encoded once or
    twice (a JSON string that itself contains JSON). Both forms are accepted,
    so the function can be re-applied to data it has already masked.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) with a ``"choices"`` entry.

    Returns:
        The masked list of choices as a singly JSON-encoded string.
    """
    choices = json.loads(row["choices"])
    if isinstance(choices, str):
        # Double-encoded input: the first decode only yields the inner JSON text.
        choices = json.loads(choices)

    for choice in choices:
        choice["response"] = "--confidential--"
        choice["response_markdown"] = "--confidential--"
        choice["span_tree"] = "--confidential--"
        choice["nodes"] = []
    return json.dumps(choices)


# Mask every row's `choices` column, then write the frame back to disk.
# NOTE(review): this writes to the same path the data was loaded from, so the
# unmasked export is overwritten in place and cannot be recovered from this file.
feedback_data["choices"] = feedback_data.apply(mask, axis=1)
feedback_data.to_csv("llm_feedback_2025-08-27.csv", index=False)

# 3. Compile Leaderboard
This code is also taken from Document Chat, where it is used to display an in-app leaderboard of the models.

In [8]:
def get_leaderboard(open_source_only=False):
    """Compile win-rate and Elo leaderboards from the pairwise feedback data.

    Iterates over the module-level ``feedback_data`` frame in row order. Each
    row holds a JSON list of choices and the index of the selected choice: the
    selected choice is the winner, the last remaining choice is the loser.
    Results are tracked in three categories: per LLM, per ``top_k`` and per
    "LLM / top_k" combination. Elo ratings depend on the order of the rows.

    Args:
        open_source_only: If true, comparisons in which either model is not
            listed in ``HP_LLM`` are ignored.

    Returns:
        dict with the keys ``"n_feedbacks"`` (number of comparisons that passed
        all filters and were evaluated), ``"llm"``, ``"top_k"`` and ``"total"``.
        Each leaderboard maps a contestant to ``total``, ``wins``, ``win_rate``,
        the Wilson intervals ``win_rate_ci_90`` / ``win_rate_ci_95`` and ``elo``,
        sorted by Elo in descending order.
    """
    stats = {"llms": {}, "top_k": {}, "total": {}}
    elo_scores = {"llms": {}, "top_k": {}, "total": {}}
    initial_elo = 1500
    k_factor = 32

    def update_stats(stat_dict, winner, loser):
        """Add one decided comparison to the win/total counters in ``stat_dict``."""
        if winner == loser:
            # A contestant cannot beat itself; such pairs carry no information.
            return

        for contestant in (winner, loser):
            stat_dict.setdefault(contestant, {"wins": 0, "total": 0})

        stat_dict[winner]["wins"] += 1
        stat_dict[winner]["total"] += 1
        stat_dict[loser]["total"] += 1

    def update_elo(cat, winner, loser):
        """Apply one Elo rating update for ``winner`` and ``loser`` in category ``cat``."""
        if winner == loser:
            return
        winner_elo = elo_scores[cat].get(winner, initial_elo)
        loser_elo = elo_scores[cat].get(loser, initial_elo)

        # Expected scores under the logistic Elo model (400-point scale).
        expected_winner = 1 / (1 + math.pow(10, (loser_elo - winner_elo) / 400))
        expected_loser = 1 - expected_winner

        # Actual score is 1 for the winner and 0 for the loser.
        elo_scores[cat][winner] = winner_elo + k_factor * (1 - expected_winner)
        elo_scores[cat][loser] = loser_elo + k_factor * (0 - expected_loser)

    n_feedbacks = 0

    for _, f in feedback_data.iterrows():
        choices = [PChatResponse.model_validate(c) for c in json.loads(f.choices)]
        if any(choice.hp_candidate is None for choice in choices):
            print(f"Feedback entry {f.feedbackID} has no HP candidate in choices")
            continue

        winning_choice = choices.pop(f.selectedChoiceIndex)
        losing_choice = choices.pop()

        winner_llm = winning_choice.hp_candidate["llm"]
        winner_top_k = winning_choice.hp_candidate["top_k"]
        loser_llm = losing_choice.hp_candidate["llm"]
        loser_top_k = losing_choice.hp_candidate["top_k"]

        if open_source_only and (winner_llm not in HP_LLM or loser_llm not in HP_LLM):
            continue

        # Count only comparisons that passed every filter, so that n_feedbacks
        # matches the data the leaderboard is actually built from.
        n_feedbacks += 1

        # NOTE(review): the blanking below only triggers when BOTH choices report
        # a non-empty `used_tools`; if either is empty or missing, top_k is still
        # evaluated — confirm this is intended.
        if (
            winning_choice.used_tools
            and losing_choice.used_tools
            and not any(
                tool.tool_name == "RetrievalAugmentedGeneration"
                for tool in winning_choice.used_tools + losing_choice.used_tools
            )
        ):
            # If no choice used RAG, we ignore the top_k evaluation
            winner_top_k = "blank"
            loser_top_k = "blank"

        winner_total = f"{winner_llm} / {winner_top_k}"
        loser_total = f"{loser_llm} / {loser_top_k}"

        matchups = (
            ("llms", winner_llm, loser_llm),
            ("top_k", winner_top_k, loser_top_k),
            ("total", winner_total, loser_total),
        )
        for cat, winner, loser in matchups:
            update_stats(stats[cat], winner, loser)
            update_elo(cat, winner, loser)

    def compile_results(category):
        """Build the Elo-sorted result table for one category."""
        results = {}
        for key, value in stats[category].items():
            wins = value["wins"]
            total = value["total"]
            elo = elo_scores[category].get(key, initial_elo)

            if total > 0:
                win_rate = wins / total
                # Wilson confidence interval for win rate
                win_rate_ci_90 = proportion_confint(wins, total, alpha=0.10, method="wilson")
                win_rate_ci_95 = proportion_confint(wins, total, alpha=0.05, method="wilson")
            else:
                win_rate = 0
                win_rate_ci_90 = (0, 0)
                win_rate_ci_95 = (0, 0)

            results[key] = {
                "total": total,
                "wins": wins,
                "win_rate": win_rate,
                "win_rate_ci_90": win_rate_ci_90,
                "win_rate_ci_95": win_rate_ci_95,
                "elo": elo,
            }

        return dict(sorted(results.items(), key=lambda x: x[1]["elo"], reverse=True))

    return {
        "n_feedbacks": n_feedbacks,
        "llm": compile_results("llms"),
        "top_k": compile_results("top_k"),
        "total": compile_results("total"),
    }


# Leaderboard restricted to comparisons in which both models are listed in HP_LLM.
pprint(get_leaderboard(open_source_only=True))

{'llm': {'meta--llama3.1-70b-instruct': {'elo': 1460.3983304415642,
                                         'total': 63,
                                         'win_rate': 0.3968253968253968,
                                         'win_rate_ci_90': (0.3017061963698504,
                                                            0.5004413954983711),
                                         'win_rate_ci_95': (0.28531911519046455,
                                                            0.5201908148834256),
                                         'wins': 25},
         'mistralai--mistral-large-instruct': {'elo': 1512.1507213085692,
                                               'total': 80,
                                               'win_rate': 0.5375,
                                               'win_rate_ci_90': (0.44608599623212136,
                                                                  0.6264605314783243),
                                               'win_r