# Largest Jump Benchmark

In [3]:
# add parent directory to path
import sys
sys.path.append('..')

import copy
import random
import numpy as np

import t2ebm
from t2ebm import graphs
from t2ebm import prompts

## Create the benchmark

In [4]:
# Read the benchmark graphs from the pickle file on disk. The loaded object is
# later iterated as (graph, graph_as_text) pairs.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
import pickle

with open("all_graphs.pkl", "rb") as pickle_file:
    all_graphs = pickle.load(pickle_file)

In [6]:
def graph_largest_jump(graph):
    """Find the largest absolute change in score between neighbouring intervals.

    Args:
        graph: object with ``x_vals`` (a sequence of intervals, each indexable
            so that element ``[1]`` can be read) and ``scores`` (one value per
            interval).

    Returns:
        A tuple ``(position, size)``: ``position`` is element ``[1]`` of the
        interval on the left-hand side of the largest jump (presumably its
        upper bound), ``size`` is the absolute difference between the two
        neighbouring scores. The first maximum wins when several are equal.
    """
    num_intervals = len(graph.x_vals)
    jumps = []
    for right in range(1, num_intervals):
        left = right - 1
        # jumps[left] is the step taken when moving from interval `left`
        # to interval `right`.
        jumps.append(np.abs(graph.scores[right] - graph.scores[left]))
    winner = np.argmax(jumps)
    boundary = graph.x_vals[winner][1]
    return boundary, jumps[winner]

In [49]:
# Fixed text placed before the graph in every prompt. The irregular indentation
# inside the literal is part of the prompt text and is kept exactly as is.
prompt_intro = """Below is the graph of a generalized additive model (GAM). The graph is presented as a JSON object with keys representing the x-axis and values representing the y-axis. For continuous features, the keys are intervals that represent ranges where the function predicts the same value. For categorical features, each key represents a possible value that the feature can take.
        
    The graph is provided in the following format:
        - The name of the feature depicted in the graph
        - The type of the feature (continuous, categorical, or boolean)
        - Mean values
        - Lower bounds of confidence interval (optional)
        - Upper bounds of confidence interval (optional)\n\n"""

# Fixed text placed after the graph: the actual task description.
prompt_task = """Within the different intervals, the graph predicts the same mean value. This means that there are discontinuous 'jumps' in the mean of the graph in between the intervals.

We are now looking for the position of the largest jump in the graph. We are looking for both positive and negative jumps, that is we care about the absolute magnitude of the jump.

What is the x-axis position of the largest jump in the graph? Think step by step."""

# Build one (prompt, solution) pair per continuous graph.
questions = []
for graph, graph_as_text in all_graphs:
    # only continuous graphs
    if graph.feature_type != "continuous":
        continue
    prompt = f"{prompt_intro}Here is the graph:\n\n{graph_as_text}\n{prompt_task}"
    # The solution is computed from the graph parsed back out of the text
    # (presumably so that it matches the values the model sees -- verify).
    parsed_graph = graphs.text_to_graph(graph_as_text)
    jump_position, _ = graph_largest_jump(parsed_graph)
    questions.append((prompt, str(jump_position)))

In [50]:
# subset 100 random questions
# The subset is drawn from a dedicated, seeded generator so that re-running the
# notebook from a fresh kernel selects the same questions in the same order
# (an unseeded shuffle produced a different benchmark on every run).
SUBSET_SEED = 0
SUBSET_SIZE = 100
# min(...) keeps the behaviour of the original slice when fewer than
# SUBSET_SIZE questions are available: all of them are kept.
questions = random.Random(SUBSET_SEED).sample(
    questions, k=min(SUBSET_SIZE, len(questions))
)

In [56]:
# print a random question
# random.choice picks an example without reshuffling `questions`, so looking at
# a sample no longer changes the order of the benchmark list itself.
# (`random` is already imported in the first cell of the notebook.)
sample_question, sample_solution = random.choice(questions)
print(sample_question)
print('SOLUTION: ', sample_solution)

Below is the graph of a generalized additive model (GAM). The graph is presented as a JSON object with keys representing the x-axis and values representing the y-axis. For continuous features, the keys are intervals that represent ranges where the function predicts the same value. For categorical features, each key represents a possible value that the feature can take.
        
    The graph is provided in the following format:
        - The name of the feature depicted in the graph
        - The type of the feature (continuous, categorical, or boolean)
        - Mean values
        - Lower bounds of confidence interval (optional)
        - Upper bounds of confidence interval (optional)

Here is the graph:

This graph represents a continuous-valued feature. The keys are intervals that represent ranges where the function predicts the same value.

Feature Name: compactness_worst
Feature Type: continuous
Means: {"(0.02729, 0.049945)": -0.0578, "(0.049945, 0.06971)": -0.0099, "(0.06971, 0.

In [55]:
# Write the (question, solution) pairs to disk as indented JSON.
import json

with open("../benchmark/jumps.json", "w") as out_file:
    json.dump(obj=questions, fp=out_file, indent=2)

## Run the benchmark

In [62]:
# Read the saved benchmark back from disk. After the JSON round trip every
# entry is a two-element list: [question, solution].
# `json` is imported in the cell that writes this file.
with open("../benchmark/jumps.json", "r") as in_file:
    questions = json.loads(in_file.read())

In [63]:
# Sanity check: number of questions loaded from the JSON file.
len(questions)

100

In [64]:
# Send every benchmark question to the model and print its answer next to the
# expected solution, one separator line per question.
system_msg = "You are an expert statistician and data scientist. You interpret global explanations produced by a generalized additive model (GAM). You answer all questions to the best of your ability, combining the data contained in the graph, any data set description you are given, and your knowledge about the real world."
model_name = 'gpt-3.5-turbo-0125'
separator = '-' * 80
for entry in questions:
    # entry[0] is the prompt text, entry[1] the expected jump position.
    prompt, solution = entry[0], entry[1]
    messages = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": prompt},
    ]
    response = t2ebm.utils.openai_completion_query(model_name, messages, temperature=0.0)
    print('LLM RESPONSE: ', response)
    print('SOLUTION: ', solution)
    print(separator)

LLM RESPONSE:  To identify the position of the largest jump in the graph, we need to compare the mean values within each interval and calculate the absolute difference between adjacent intervals. The largest jump will be the one with the highest absolute difference in mean values.

Let's calculate the absolute differences between adjacent intervals:

1. Absolute difference between intervals (1.0, 1.5) and (1.5, 2.5):
   Absolute difference = |0.534 - (-0.009)| = 0.543

2. Absolute difference between intervals (1.5, 2.5) and (2.5, 3.0):
   Absolute difference = |(-0.532) - 0.534| = 1.066

Based on these calculations, the largest jump in the graph occurs between the intervals (1.5, 2.5) and (2.5, 3.0) with an absolute difference of 1.066.

Therefore, the x-axis position of the largest jump in the graph is between Pclass values of 1.5 and 2.5.
SOLUTION:  2.5
--------------------------------------------------------------------------------
LLM RESPONSE:  To find the x-axis position of the l

KeyboardInterrupt: 

In [None]:
# 24 mistakes for gpt-4-turbo-2024-04-09

In [None]:
# TODO: record the number of mistakes for gpt-3.5-turbo-0125