In [None]:
# pip install -U plotly kaleido

In [9]:
import json
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go


# Category labels in question-id order: each label covers a run of ten
# consecutive question ids, the first run starting at id 81 (see get_model_df).
CATEGORIES = ["Writing", "Roleplay", "Reasoning", "Math", "Coding", "Extraction", "STEM", "Humanities"]


def get_model_df(path="gpt-4_single.jsonl"):
    """Load judgment records from a JSON-lines file into a DataFrame.

    Every non-blank line is parsed as one JSON object. A "category" field is
    added to each record, derived from its "question_id": ids are bucketed in
    runs of ten starting at 81 and mapped onto ``CATEGORIES`` in order.

    Args:
        path: JSON-lines file to read. Defaults to "gpt-4_single.jsonl",
            the file the notebook originally hard-coded.

    Returns:
        pandas.DataFrame with one row per record, including the added
        "category" column.

    Raises:
        IndexError: if a record's "question_id" does not map onto an entry
            of ``CATEGORIES``.
    """
    q2result = []
    # The context manager closes the handle even if parsing fails midway.
    with open(path, "r", encoding="utf-8") as fin:
        for line in fin:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            obj = json.loads(line)
            bucket = (obj["question_id"] - 81) // 10
            # A negative bucket would otherwise wrap around and silently
            # assign the wrong category, so reject it explicitly.
            if not 0 <= bucket < len(CATEGORIES):
                raise IndexError(
                    "question_id %r is outside the range covered by CATEGORIES"
                    % (obj["question_id"],)
                )
            obj["category"] = CATEGORIES[bucket]
            q2result.append(obj)
    return pd.DataFrame(q2result)

def toggle(res_str):
    """Return the outcome as seen from the opposite side.

    "win" becomes "loss" and "loss" becomes "win"; any other value
    yields "tie".
    """
    return "loss" if res_str == "win" else ("win" if res_str == "loss" else "tie")

# Load all judgment records once; the aggregation cell below filters this
# frame by its "model", "category" and "score" columns.
df = get_model_df()

In [None]:
all_models = df["model"].unique()
print(all_models)

# Drop rows with a negative score up front: per the original note these are
# score-format errors (<1% of cases) and must not enter the averages.
valid_rows = df[df["score"] >= 0]

# One {"model", "category", "score"} record per (model, category) pair, where
# "score" is the mean of the remaining scores for that pair.
scores_all = []
for model in all_models:
    rows_for_model = valid_rows[valid_rows["model"] == model]
    for cat in CATEGORIES:
        res = rows_for_model[rows_for_model["category"] == cat]
        score = res["score"].mean()
        scores_all.append({"model": model, "category": cat, "score": score})

In [None]:
# Show the full list of {"model", "category", "score"} records in scores_all.
print(scores_all)

In [14]:
# Models to draw on the radar chart; their position in this list drives the
# sort order of the rows handed to plotly below.
target_models = ['gpt-3.5-turbo', 'gemma-7b-it', 'gemma-2b-it','gpt-4-turbo-preview', 'qwen1.5-14b-chat']

# Keep only the score records that belong to one of the target models.
scores_target = [entry for entry in scores_all if entry["model"] in target_models]
print(scores_target)

# Sort by position in target_models, reversed. sorted() is stable, so the
# category order within each model is left as it was in scores_all.
scores_target = sorted(scores_target, key=lambda x: target_models.index(x["model"]), reverse=True)

# scores_target is already restricted to target_models, so no second filter
# is needed. Naming the columns keeps the frame well-formed (and the "model"
# lookup below valid) even when no target model has any score.
df_score = pd.DataFrame(scores_target, columns=["model", "category", "score"])

# Raw model identifiers -> display names used in the chart legend.
rename_map = { "gpt-3.5-turbo": "GPT-3.5-turbo",
               "gemma-7b-it": "Gemma-7B-IT",
               "gpt-4": "GPT-4",
               "gemma-2b-it": "Gemma-2B-IT",
               "gpt-4-turbo-preview": "GPT-4-turbo-preview",
               "qwen1.5-14b-chat": "Qwen1.5-14B-Chat",
             }

# Rename in the "model" column only, in a single pass, instead of running an
# in-place replace over every column of the frame once per key.
df_score["model"] = df_score["model"].replace(rename_map)

# One closed polar trace per model, with the spokes in CATEGORIES order.
fig = px.line_polar(
    df_score,
    r='score',
    theta='category',
    line_close=True,
    category_orders={"category": CATEGORIES},
    color='model',
    markers=True,
    color_discrete_sequence=px.colors.qualitative.Pastel,
)

fig.show()

[{'model': 'gpt-3.5-turbo', 'category': 'Writing', 'score': 8.75}, {'model': 'gpt-3.5-turbo', 'category': 'Roleplay', 'score': 8.1}, {'model': 'gpt-3.5-turbo', 'category': 'Reasoning', 'score': 3.25}, {'model': 'gpt-3.5-turbo', 'category': 'Math', 'score': 4.5}, {'model': 'gpt-3.5-turbo', 'category': 'Coding', 'score': 7.2}, {'model': 'gpt-3.5-turbo', 'category': 'Extraction', 'score': 8.15}, {'model': 'gpt-3.5-turbo', 'category': 'STEM', 'score': 8.5}, {'model': 'gpt-3.5-turbo', 'category': 'Humanities', 'score': 9.6}, {'model': 'gemma-7b-it', 'category': 'Writing', 'score': 4.3}, {'model': 'gemma-7b-it', 'category': 'Roleplay', 'score': 4.25}, {'model': 'gemma-7b-it', 'category': 'Reasoning', 'score': 2.3}, {'model': 'gemma-7b-it', 'category': 'Math', 'score': 3.9}, {'model': 'gemma-7b-it', 'category': 'Coding', 'score': 2.85}, {'model': 'gemma-7b-it', 'category': 'Extraction', 'score': 5.5}, {'model': 'gemma-7b-it', 'category': 'STEM', 'score': 5.45}, {'model': 'gemma-7b-it', 'categ