# Wide Confidence Interval Benchmark

In [5]:
# add parent directory to path
import sys
sys.path.append('..')

import copy
import random
import numpy as np

import matplotlib.pyplot as plt

import t2ebm
from t2ebm import graphs
from t2ebm import prompts

## Create the benchmark

In [1]:
# Load the pre-computed list of (graph, graph_as_text) pairs from the local pickle file.
# NOTE: unpickling can execute arbitrary code -- only load an all_graphs.pkl you trust.
import pickle

with open("all_graphs.pkl", mode="rb") as graphs_file:
    all_graphs = pickle.load(graphs_file)

  " a simplification level of 10\%. This graph is too complex to"


In [2]:
def widest_confidence_interval(graph):
    """Return the x-axis position with the widest confidence interval.

    The entry of ``graph.x_vals`` at the index of the largest value in
    ``graph.stds`` is returned. If several entries share the maximum, the
    first one wins (``np.argmax`` semantics). Only the position is returned,
    not the width itself.

    NOTE(review): this uses the standard deviation as the measure of interval
    width, i.e. it presumably relies on the confidence bounds being symmetric
    around the mean -- confirm against how the bounds are generated.

    Args:
        graph: object exposing ``stds`` and ``x_vals`` as sequences that share
            the same indexing.

    Returns:
        The element of ``graph.x_vals`` at the position of the largest std.

    Raises:
        ValueError: if ``graph.stds`` is empty.
    """
    if len(graph.stds) == 0:
        raise ValueError("graph has no confidence intervals (empty stds)")
    # int() turns the numpy integer into a plain Python index.
    idx = int(np.argmax(graph.stds))
    return graph.x_vals[idx]

In [35]:
# Fixed prompt text shared by every question; only the serialized graph that is
# placed between the two parts differs from question to question.
prompt_intro = """Below is the graph of a generalized additive model (GAM). The graph is presented as a JSON object with keys representing the x-axis and values representing the y-axis. For continuous features, the keys are intervals that represent ranges where the function predicts the same value. For categorical features, each key represents a possible value that the feature can take.
        
    The graph is provided in the following format:
        - The name of the feature depicted in the graph
        - The type of the feature (continuous, categorical, or boolean)
        - Mean values
        - Lower bounds of confidence interval (optional)
        - Upper bounds of confidence interval (optional)\n\n"""
prompt_task = """Every interval in the graph comes with a lower and and an upper confidence bound.

We are now looking for the interval in the graph that has the widest confidence bound. 

What is the x-axis interval that has the widest confidence bound? Think step by step."""

# Each benchmark entry is a (prompt, solution) pair; the solution is the string
# form of the x-axis position returned by widest_confidence_interval.
questions = []
for graph, graph_as_text in all_graphs:
    # graphs of non-continuous features are not part of this benchmark
    if graph.feature_type == "continuous":
        prompt = prompt_intro + f"Here is the graph:\n\n{graph_as_text}\n" + prompt_task
        # The solution is computed on the graph parsed back from its text form,
        # i.e. from the same text that is embedded in the prompt.
        parsed_graph = graphs.text_to_graph(graph_as_text)
        solution = widest_confidence_interval(parsed_graph)
        questions.append((prompt, str(solution)))

In [36]:
# subset 100 random questions
# A dedicated, seeded generator makes the selected subset reproducible across
# kernel restarts and leaves the global `random` state untouched.
# Re-running this cell on the already truncated list only reorders it; the
# same 100 questions are kept.
subset_rng = random.Random(0)
subset_rng.shuffle(questions)
questions = questions[:100]

In [41]:
# print a random question
import random

# random.choice picks one entry without reordering `questions`; shuffling here
# would silently change the order of the list that is written to the benchmark
# JSON file afterwards.
sample_question, sample_solution = random.choice(questions)
print(sample_question)
print('SOLUTION: ', sample_solution)

Below is the graph of a generalized additive model (GAM). The graph is presented as a JSON object with keys representing the x-axis and values representing the y-axis. For continuous features, the keys are intervals that represent ranges where the function predicts the same value. For categorical features, each key represents a possible value that the feature can take.
        
    The graph is provided in the following format:
        - The name of the feature depicted in the graph
        - The type of the feature (continuous, categorical, or boolean)
        - Mean values
        - Lower bounds of confidence interval (optional)
        - Upper bounds of confidence interval (optional)

Here is the graph:

This graph represents a continuous-valued feature. The keys are intervals that represent ranges where the function predicts the same value.

Feature Name: smoothness_mean
Feature Type: continuous
Means: {"(0.05263, 0.0706)": -0.835, "(0.0706, 0.07455500000000001)": -0.769, "(0.07455

In [42]:
# Persist the benchmark as JSON; each entry is serialized as a
# two-element array [question, solution].
import json

with open("../benchmark/wide-confidence.json", mode="w") as benchmark_file:
    json.dump(questions, benchmark_file, indent=2)

## Run the benchmark

In [59]:
# Read the saved benchmark back from disk; the last expression displays the
# number of loaded questions.
with open("../benchmark/wide-confidence.json", mode="r") as benchmark_file:
    questions = json.load(benchmark_file)
len(questions)

100

In [64]:
# Model queried for both turns of every conversation.
MODEL_NAME = 'gpt-3.5-turbo-0125'
# NOTE(review): only questions from index 54 onwards are evaluated -- this looks
# like a leftover resume offset from an interrupted run; confirm whether the
# full benchmark should start at 0.
FIRST_QUESTION_INDEX = 54

system_msg = "You are an expert statistician and data scientist. You interpret global explanations produced by a generalized additive model (GAM). You answer all questions to the best of your ability, combining the data contained in the graph, any data set description you are given, and your knowledge about the real world."

for question in questions[FIRST_QUESTION_INDEX:]:
    prompt, solution = question
    # Turn 1: the model reasons step by step about the benchmark question.
    messages = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": prompt},
    ]
    response = t2ebm.utils.openai_completion_query(MODEL_NAME, messages, temperature=0.0)
    # Turn 2: feed the reasoning back and ask for the interval only, so that the
    # printed answer can be compared with the solution.
    messages.append({"role": "assistant", "content": response})
    messages.append({"role": "user", "content": "Thanks. Now summarize your response by answering again with the interval."})
    response = t2ebm.utils.openai_completion_query(MODEL_NAME, messages, temperature=0.0)
    print('LLM RESPONSE: ', response)
    print('SOLUTION: ', solution)
    print('-'*80)

LLM RESPONSE:  To find the interval with the widest confidence bound, we need to calculate the difference between the upper and lower bounds for each interval and identify the interval where this difference is the largest.

Here are the steps to find the interval with the widest confidence bound:

1. **Extract the Lower and Upper Bounds**: For each interval, retrieve the lower and upper bounds from the provided data.

2. **Calculate the Width of the Confidence Interval**: For each interval, subtract the lower bound from the upper bound to get the width of the confidence interval.

3. **Compare the Widths**: Identify the interval with the maximum width.

Let's perform these calculations:

- For interval `(0.07117, 0.09376000000000001)`: Width = -0.336 - (-2.26) = 1.924
- For interval `(0.09376000000000001, 0.099705)`: Width = -0.19 - (-2.132) = 1.942
- For interval `(0.099705, 0.10519999999999999)`: Width = -0.039 - (-2.009) = 1.97
- For interval `(0.10519999999999999, 0.10825)`: Width 