In [370]:
from pathlib import Path
import mlflow

TSP_EXPERIMENTS = {
    603996644784169297: "hynky-TSP-Verbal",
    193975243763088023: "hynky-TSP-Culture",
    842945248455049410: "hynky-TSP-Critical",
    996016737840831842: "hynky-TSP-Analytical"
}

TSP_runs = mlflow.search_runs(experiment_names=TSP_EXPERIMENTS.values())
klokan_runs = mlflow.search_runs(experiment_names=["hynky-klokan-qa-train"])

TSP_runs['params.split'] = TSP_runs['params.dataset_split'].where(TSP_runs['params.dataset_split'].notnull(), TSP_runs['params.split'])

In [372]:
TSP_runs[TSP_runs["params.model_name"] == "anthropic/claude-2.1"]["params.split"]


0       Culture
1        Verbal
2      Critical
3    Analytical
Name: params.split, dtype: object

In [373]:
## General table
res = TSP_runs[["metrics.accuracy", "params.split", "params.model_name"]].pivot_table(index="params.model_name", columns="params.split", values="metrics.accuracy")

# Rename the params.split to task
res = res.rename(columns={"Analytical": "analytical", "Critical": "critical", "Culture": "culture", "Verbal": "verbal"})

# Join with Klokan
klokan = klokan_runs[["metrics.accuracy", "params.model_name"]].pivot_table(index="params.model_name", values="metrics.accuracy")
klokan.rename(columns={"metrics.accuracy": "klokan"}, inplace=True)

joined = res.join(klokan)
# Rename index params.models_name to model_name
joined.index.rename('model_name', inplace=True)



In [374]:
joined

Unnamed: 0_level_0,analytical,critical,culture,verbal,klokan
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
anthropic/claude-2.1,0.380403,0.644991,0.798177,0.633634,0.382388
google/gemini-pro,0.268012,0.599297,0.782552,0.576577,0.235223
mistralai/mixtral-8x7b-instruct,0.244957,0.483304,0.643229,0.369369,0.253317
openai/gpt-3.5-turbo,0.277618,0.465729,0.682292,0.408408,0.314837
openai/gpt-4-1106-preview,0.479347,0.766257,0.916667,0.720721,0.47889


In [375]:

import os

# Create the directory if it does not exist
if not os.path.exists("leaderboard"):
    os.makedirs("leaderboard")


joined.to_csv("leaderboard/table.csv")

In [473]:

import pandas as pd

# Klokan spider preparation

# Function to process each artifact and calculate accuracy
def process_artifact(artifact_info):
    artifact_info = artifact_info.replace("file:", "")
    try:
        dataset_path = Path(artifact_info)
        if (dataset_path / "dataset.json").exists():
            with open(dataset_path / "dataset.json", "r") as f:
                x = pd.read_json(f, lines=True)
        elif (dataset_path / "dataset.csv").exists():
            x = pd.read_json(dataset_path / "dataset.csv", lines=True)
        else:
            print(f"Neither dataset.json nor dataset.csv could be found in {dataset_path}")
            raise FileNotFoundError("Neither dataset.json nor dataset.csv could be found.")
    except:
        return None
    # Parse into a real DataFrame
    real_df = pd.DataFrame.from_records(x["data"][0], columns=x["columns"][0])

    # Add a column indicating whether the answer was correct
    real_df["is_correct"] = real_df.apply(lambda row: row["answer"] == row["correct_answer"], axis=1)

    # Group by 'category' and calculate mean accuracy
    category_accuracy = real_df.groupby("category")["is_correct"].mean()

    return category_accuracy

# Process each artifact and store the results
all_category_accuracies = [(process_artifact(artifact[1]["artifact_uri"]), artifact[1].get("params.model_name")) for artifact in klokan_runs.iterrows()]
combined_accuracies = pd.concat([x[0].rename(x[1]) for x in all_category_accuracies], axis=1).T
df = combined_accuracies
df = df* 100

# Rename the columns
categories = [
    "Elementary 2-3",
    "Elementary 4-5",
    "Elementary 6-7",
    "Elementary 8-9",
    "High School 1-2",
    "High School 3-4"
]

df.columns = categories
df.sort_index(inplace=True)

df.to_csv("leaderboard/klokan.csv")


In [474]:
import pandas as pd
import plotly.graph_objects as go
import pandas as pd

import matplotlib.pyplot as plt

# Combine all category accuracies into a single DataFrame


categories = df.columns.tolist()
categories = [*categories, categories[0]]  # Ensure the graph is circular by appending the start to the end
colors = [
    '#1f77b4',  # muted blue
    '#ff7f0e',  # safety orange
    '#2ca02c',  # cooked asparagus green
    '#d62728',  # brick red
    '#9467bd',  # muted purple
    '#8c564b',  # chestnut brown
    '#e377c2',  # raspberry yogurt pink
    '#7f7f7f',  # middle gray
    '#bcbd22',  # curry yellow-green
    '#17becf'   # blue-teal
]

# Setting for 1000x1000
fig_1000 = go.Figure()

for i, (idx, row) in enumerate(df.iterrows()):
    row = row.tolist() + [row[0]]  # Ensure the graph is circular by appending the start to the end
    color = colors[i]
    fig_1000.add_trace(go.Scatterpolar(
        r=row,
        theta=categories,
        opacity=0.4,
        name=idx,
        line=dict(color=color, width=4),  # Adjust line width for better visibility
    ))

fig_1000.update_layout(
    width=600,
    height=628,
    polar=dict(
        angularaxis=dict(
            gridwidth=2,  # Increase line width for better visibility
            rotation=90,
        direction='clockwise',

        ),
        radialaxis=dict(

            visible=True,
            range=[0, 100],
            angle=45,
            tickangle=45,
            tickvals=[0, 25, 50, 75, 100],
            ticktext=["0%", "25%", "50%", "75%", "100%"],
        ),
    ),
    title_text='Klokan-QA - Accuracy',
    title_x=0.5,
    title_y=0.97,
    title_xanchor='center',
    title_yanchor='top',
    title_font_size=24,
    title_font_color='#333333',
    font=dict(
        family='Arial',
        size=16,
        color='#333333'
    ),
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=-0.45,
        xanchor="center",
        x=0.5
    )
)







Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`



In [378]:
df = pd.pivot_table(TSP_runs, values='metrics.accuracy', index=['params.model_name'], columns=['params.split'], aggfunc=np.mean)
# Convert the values to percentages
df = df * 100

# Rename splits Culture -> Cultural overview
# Analytical -> Analytical thinking
# Verbal -> Verbal thinking
# Critical -> Critical thinking

df.rename(columns={
    'Culture': 'Cultural',
}, inplace=True)

# Sort the rows by alpah
# Sort the DataFrame by model name in alphabetical order
df.sort_index(inplace=True)
df.index.name = ""
df.to_csv("leaderboard/tsp.csv")



The provided callable <function mean at 0x103874c10> is currently using DataFrameGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "mean" instead.



In [471]:
import plotly.graph_objects as go
import pandas as pd


categories = df.columns.tolist()
categories = [*categories, categories[0]]  # Ensure the graph is circular by appending the start to the end
colors = [
    '#1f77b4',  # muted blue
    '#ff7f0e',  # safety orange
    '#2ca02c',  # cooked asparagus green
    '#d62728',  # brick red
    '#9467bd',  # muted purple
    '#8c564b',  # chestnut brown
    '#e377c2',  # raspberry yogurt pink
    '#7f7f7f',  # middle gray
    '#bcbd22',  # curry yellow-green
    '#17becf'   # blue-teal
]

# Setting for 1000x1000
fig_1000 = go.Figure()

for i, (idx, row) in enumerate(df.iterrows()):
    row = row.tolist() + [row[0]]  # Ensure the graph is circular by appending the start to the end
    color = colors[i]
    fig_1000.add_trace(go.Scatterpolar(
        r=row,
        theta=categories,
        opacity=0.4,
        name=idx,
        line=dict(color=color, width=4),  # Adjust line width for better visibility
    ))

fig_1000.update_layout(
    width=600,
    height=628,
    polar=dict(
        angularaxis=dict(
            gridwidth=2,  # Increase line width for better visibility
        ),
        radialaxis=dict(
            visible=True,
            range=[0, 100],
            angle=45,
            tickangle=45,
            tickvals=[0, 25, 50, 75, 100],
            ticktext=["0%", "25%", "50%", "75%", "100%"],
        ),
    ),
    title_text='TSP-QA - Accuracy',
    title_x=0.5,
    title_y=0.97,
    title_xanchor='center',
    title_yanchor='top',
    title_font_size=24,
    title_font_color='#333333',
    font=dict(
        family='Arial',
        size=16,
        color='#333333'
    ),
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=-0.45,
        xanchor="center",
        x=0.5
    )
)




Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`



In [380]:
from datasets import  load_dataset

x = load_dataset("hynky/TSP")

In [446]:
q_names = []
vals = []
for dst in x.values():
    lst = [
        x+y for x,y in zip(dst["questionName"], dst["file"])
    ]
    q_names.extend(lst)
    vals.extend(dst)



In [450]:
import numpy as np

# get non uniques ones

from collections import Counter
name_counts = Counter(q_names)
non_unique_names = [name for name, count in name_counts.items() if count > 1]
print(non_unique_names)

# Get indexes of non unique names
non_unique_indexes = [i for i, name in enumerate(q_names) if name in non_unique_names]
print(non_unique_indexes)


['TSP 2010 - varianta 03, otázka č. 4 / 70./Users/hynekkdylicek/Projects/MUNI-TSP/downloads/tsp2010_v03.qdefx', 'TSP 2010 - varianta 03, otázka č. 2 / 70./Users/hynekkdylicek/Projects/MUNI-TSP/downloads/tsp2010_v03.qdefx', 'TSP 2010 - varianta 03, otázka č. 3 / 70./Users/hynekkdylicek/Projects/MUNI-TSP/downloads/tsp2010_v03.qdefx']
[1381, 1399, 1435, 1489, 1533, 1558]


In [469]:
vals[1399]

{'question': 'Vyberte nesprávné tvrzení.',
 'questionTopic': 'Kritické myšlení',
 'questionName': 'TSP 2010 - varianta 03, otázka č. 2 / 70.',
 'file': '/Users/hynekkdylicek/Projects/MUNI-TSP/downloads/tsp2010_v03.qdefx',
 'correct_answer': 'A',
 'answers.A': 'Z ustanovení vyplývá, že před výpovědí nájmu bez přivolení soudu musí pronajímatel doručit nájemci písemnou výstrahu.',
 'answers.B': 'Z ustanovení vyplývá, že za okolností stanovených Občanským zákoníkem má nájemce při výpovědi z nájmu nárok na náhradní bydlení.',
 'answers.C': 'Ustanovení připouští, že nájemce může dostat výpověď z nájmu bez přivolení soudu také v případě, že dluží platby za užívání bytu za leden, červenec a září.',
 'answers.D': 'Ustanovení připouští, že občané mohou bez vážných důvodů užívat dva a více bytů.',
 'answers.E': 'Ustanovení připouští, že výpověď z nájmu lze podat i z jiných než uvedených důvodů.',
 '__index_level_0__': 1869}

In [468]:
vals[1558]

{'question': 'Zpráva z Internetu o podpoře v nezaměstnanosti: „Do 50 let náleží nezaměstnanému podpora 6 měsíců, od 50 do 55 let pak 9 měsíců a starší 55 let mohou pobírat podporu až 12 měsíců. V prvních dvou měsících je vyplácena podpora 80 % z průměrného čistého výdělku, od třetího měsíce pak 55 % z průměrného čistého výdělku. Podpora v nezaměstnanosti má strop 13 307 Kč měsíčně.“ Vyberte správné tvrzení.',
 'questionTopic': 'Kritické myšlení',
 'questionName': 'TSP 2010 - varianta 03, otázka č. 2 / 70.',
 'file': '/Users/hynekkdylicek/Projects/MUNI-TSP/downloads/tsp2010_v03.qdefx',
 'correct_answer': 'D',
 'answers.A': 'Maximální celková částka podpory vyplacená během nezaměstnanosti je pro všechny nezaměstnané stejná.',
 'answers.B': 'Čím je věk občana nižší, tím nižší celková částka podpory mu bude v případě nezaměstnanosti vyplacena.',
 'answers.C': 'Z ustanovení nelze činit závěry, neboť z něj není patrné, co přesně se rozumí průměrným čistým výdělkem.',
 'answers.D': 'Dva občan