In [1]:
import pathlib
import sys

def _add_src_to_path():
    """Put the directory two levels above the working directory on ``sys.path``.

    The directory is appended (lowest import priority) and only when it is not
    already listed, so re-running the cell never adds a duplicate entry.
    Presumably this is what lets the ``src.*`` imports in later cells resolve.
    """
    grandparent_dir = f"{pathlib.Path.cwd().parent.parent}"
    if grandparent_dir in sys.path:
        return
    sys.path.append(grandparent_dir)


_add_src_to_path()

In [2]:
import pandas as pd

from src.llm_experiments import data_analysis_util

from src.ttt import ttt_board


def do_analysis():
    """Evaluate every recorded data item and collect the results in a DataFrame.

    Each row holds the fields returned by the record's ``model_dump()`` plus
    three derived columns:

    * ``move_num``     -- ``move_count`` of the board rebuilt from ``record.board``
    * ``eval``         -- name of the evaluation returned by the analyzer
    * ``eval_comment`` -- message returned by the analyzer alongside the evaluation

    Returns:
        A ``pandas.DataFrame`` with one row per item yielded by
        ``data_analysis_util.data_iterator()``.
    """
    analyzer = data_analysis_util.DataAnalyzer()
    rows: list[dict[str, str | int]] = []
    for record in data_analysis_util.data_iterator():
        row = record.model_dump()
        # `verdict` rather than `eval`, to avoid shadowing the builtin.
        verdict, verdict_msg = analyzer.analyse(record)
        board = ttt_board.BoardState.from_array(record.board)
        row["move_num"] = board.move_count
        row["eval"] = verdict.name
        row["eval_comment"] = verdict_msg
        rows.append(row)
    return pd.DataFrame(rows)


# Run the analysis once and keep the resulting frame in `_DF` for the cells
# below; the bare `_DF` expression on the last line renders it as the cell output.
_DF = do_analysis()
_DF

Unnamed: 0,timestamp,board,error,ai_model,ai_whose_move,ai_move,duration_ms,move_num,eval,eval_comment
0,1759983604,O..|.O.|X.X,,OpenAI gpt-4.1,O,"[O, ., ., ., O, X, X, ., X]",2029,4,BEST_MOVE,Continues to win. X wins in 2 moves.
1,1759983605,..X|..X|OO.,,OpenAI gpt-4.1,O,"[., ., X, ., X, X, O, O, .]",888,4,BLUNDER,Hands a won game to the opponent. O wins in 1 ...
2,1759983607,OX.|...|...,,OpenAI gpt-4.1,O,"[O, X, X, ., ., ., ., ., .]",1998,2,BLUNDER,"Was draw, now opponent wins. O wins in 5 moves."
3,1759983608,XO.|XO.|..X,,OpenAI gpt-4.1,ended,,1815,5,ASSUMED_ENDED,Incorrectly determined that game ended.
4,1759983610,...|.O.|..X,,OpenAI gpt-4.1,O,"[., ., ., ., O, ., ., X, X]",1354,2,BEST_MOVE,Best move. Drawn position.
...,...,...,...,...,...,...,...,...,...,...
495,1760025490,...|..O|..X,,OpenAI o3-mini,O,"[., ., ., ., X, O, ., ., X]",12495,2,BEST_MOVE,Best move. X wins in 4 moves.
496,1760025497,...|...|X..,,OpenAI o3-mini,X,"[., ., ., ., O, ., X, ., .]",7104,1,BEST_MOVE,Best move. Drawn position.
497,1760025509,..X|...|.XO,illegal JSON,OpenAI o3-mini,,,11838,3,PARSE_ERROR,illegal JSON
498,1760025515,...|..X|...,,OpenAI o3-mini,X,"[., ., ., ., O, X, ., ., .]",6314,1,BEST_MOVE,Best move. Drawn position.


In [3]:
import plotly.express as px

from src.ttt import ttt_evaluator


def eval_colors(values: set[str]) -> list[tuple[str, str]]:
    good = ["BEST_MOVE", "GOOD_MOVE", "NOT_EVALUATED"]
    good = [x for x in good if x in values]
    bad = [x for x in values if x not in good]
    result: list[tuple[str, str]] = []
    name_to_value = {e.name: e.value for e in ttt_evaluator.TttEvaluation}
    for labels, colors in [
        (good, ["#0000ff", "#2F94FF"]),
        # (bad, px.colors.sequential.Oranges[::2]),
        (bad, px.colors.diverging.RdYlGn),
    ]:
        print(labels)
        assert len(labels) <= len(colors), f"Need more colors for {labels}, {colors=}"
        labels.sort(key=lambda label: name_to_value[label], reverse=True)
        for label, color in zip(labels, colors):
            result.append((label, color))
    return result


def _plot_eval_counts(df: pd.DataFrame, model_order: list[str]) -> None:
    """Show a horizontal bar chart of row counts per model, colored by evaluation."""
    eval_order, eval_palette = zip(*eval_colors(set(df["eval"])))
    counts = df.groupby(["eval", "ai_model"]).size().reset_index(name="count")
    fig = px.bar(
        counts,
        x="count",
        y="ai_model",
        color="eval",
        orientation="h",
        category_orders={
            "eval": eval_order,
            "ai_model": model_order,
        },
        color_discrete_map=dict(zip(eval_order, eval_palette)),
        width=800,
        height=330,
        labels={
            "eval": "Move Evaluation Legend:",
            "ai_model": "Model",
            "count": "# Tic-tac-toe Positions Queried",
        },
    )
    fig.update_layout(
        plot_bgcolor="white",
        paper_bgcolor="white",
        xaxis=dict(showgrid=False),
        yaxis=dict(showgrid=False),
    )
    fig.show()


def _plot_mean_latency(df: pd.DataFrame, model_order: list[str]) -> None:
    """Show a horizontal bar chart of the mean response duration per model, in seconds."""
    mean_latency = df.groupby(["ai_model"])["duration_ms"].mean().reset_index()
    # `duration_ms` is in milliseconds; the axis label below is in seconds.
    mean_latency["Duration"] = mean_latency["duration_ms"] / 1000
    fig = px.bar(
        mean_latency,
        y="ai_model",
        x="Duration",
        orientation="h",
        category_orders={
            "ai_model": model_order,
        },
        labels={
            "ai_model": "Model",
            "Duration": "Average Response Latency (s)",
        },
        width=800,
        height=320,
    )
    fig.update_layout(
        bargap=0.6,
        plot_bgcolor="lightgray",
        paper_bgcolor="white",
    )
    fig.update_traces(marker_color="gray")
    fig.show()


def plotly_chart(df: pd.DataFrame) -> None:
    """Render the two summary charts for an analysis DataFrame.

    The first chart counts rows per model, split and colored by the ``eval``
    column. The second shows the mean of ``duration_ms`` per model, converted
    to seconds. Both charts list the models in the same fixed order.

    Args:
        df: Frame with at least the columns ``eval``, ``ai_model`` and
            ``duration_ms``.
    """
    model_order = [
        "OpenAI gpt-3.5-turbo",
        "OpenAI gpt-4.1",
        "OpenAI o3-mini",
        "OpenAI o3",
    ]
    _plot_eval_counts(df, model_order)
    _plot_mean_latency(df, model_order)


# Draw both summary charts from the analysis frame built in the earlier cell.
plotly_chart(_DF)

['BEST_MOVE', 'GOOD_MOVE']
['INVALID_BOARD', 'ASSUMED_ENDED', 'PARSE_ERROR', 'BLUNDER', 'ILLEGAL_MOVE']


In [4]:
# Show the rows whose evaluation is WRONG_PLAYER (an empty table means there are none).
_DF[_DF['eval'] == 'WRONG_PLAYER']

Unnamed: 0,timestamp,board,error,ai_model,ai_whose_move,ai_move,duration_ms,move_num,eval,eval_comment
