# Wide Confidence Interval Benchmark

In [5]:
# add parent directory to path
import sys
sys.path.append('..')

import copy
import random
import numpy as np

import matplotlib.pyplot as plt

import t2ebm
from t2ebm import graphs
from t2ebm import prompts

## Create the benchmark

In [1]:
# load graphs (pickle)
import pickle

with open("all_graphs.pkl", "rb") as f:
    all_graphs = pickle.load(f)

  " a simplification level of 10\%. This graph is too complex to"


In [2]:
def widest_confidence_interval(graph):
    """returns the position and the size of the largest jump in the graph"""
    idx = np.argmax(graph.stds)
    return graph.x_vals[idx]

In [35]:
questions = []
for graph, graph_as_text in all_graphs:
    # only continuous graphs
    if graph.feature_type != "continuous":
        continue
    question = """Below is the graph of a generalized additive model (GAM). The graph is presented as a JSON object with keys representing the x-axis and values representing the y-axis. For continuous features, the keys are intervals that represent ranges where the function predicts the same value. For categorical features, each key represents a possible value that the feature can take.
        
    The graph is provided in the following format:
        - The name of the feature depicted in the graph
        - The type of the feature (continuous, categorical, or boolean)
        - Mean values
        - Lower bounds of confidence interval (optional)
        - Upper bounds of confidence interval (optional)\n\n"""
    question += f"Here is the graph:\n\n{graph_as_text}\n"
    question += """Every interval in the graph comes with a lower and and an upper confidence bound.

We are now looking for the interval in the graph that has the widest confidence bound. 

What is the x-axis interval that has the widest confidence bound? Think step by step."""
    graph_ = graphs.text_to_graph(graph_as_text)
    pos = widest_confidence_interval(graph_)
    questions.append((question, str(pos)))

In [36]:
# subset 100 random questions
random.shuffle(questions)
questions = questions[:100]

In [41]:
# print a random question
import random
random.shuffle(questions)
print(questions[0][0])
print('SOLUTION: ', questions[0][1])

Below is the graph of a generalized additive model (GAM). The graph is presented as a JSON object with keys representing the x-axis and values representing the y-axis. For continuous features, the keys are intervals that represent ranges where the function predicts the same value. For categorical features, each key represents a possible value that the feature can take.
        
    The graph is provided in the following format:
        - The name of the feature depicted in the graph
        - The type of the feature (continuous, categorical, or boolean)
        - Mean values
        - Lower bounds of confidence interval (optional)
        - Upper bounds of confidence interval (optional)

Here is the graph:

This graph represents a continuous-valued feature. The keys are intervals that represent ranges where the function predicts the same value.

Feature Name: smoothness_mean
Feature Type: continuous
Means: {"(0.05263, 0.0706)": -0.835, "(0.0706, 0.07455500000000001)": -0.769, "(0.07455

In [42]:
# save the questions to json
import json
with open("../benchmark/wide-confidence.json", "w") as f:
    json.dump(questions, f, indent=2)

# Run the benchmark

In [66]:
# load the json file
with open("../benchmark/wide-confidence.json", "r") as f:
    questions = json.load(f)
len(questions)

100

In [65]:
system_msg = "You are an expert statistician and data scientist. You interpret global explanations produced by a generalized additive model (GAM). You answer all questions to the best of your ability, combining the data contained in the graph, any data set description you are given, and your knowledge about the real world."
for question in questions[54:]:
    messages = [{"role": "system", "content": system_msg}, {"role": "user", "content": question[0]}]
    response = t2ebm.utils.openai_completion_query('gpt-3.5-turbo-0125', messages, temperature=0.0)
    messages.append({"role": "assistant", "content": response})
    messages.append({"role": "user", "content": "Thanks. Now summarize your response by answering again with the interval."})
    response = t2ebm.utils.openai_completion_query('gpt-3.5-turbo-0125', messages, temperature=0.0)
    print('LLM RESPONSE: ', response)
    print('SOLUTION: ', question[1])
    print('-'*80)

LLM RESPONSE:  The x-axis interval that has the widest confidence bound is (0.17825, 0.19574999999999998).
SOLUTION:  (0.17825, 0.19574999999999998)
--------------------------------------------------------------------------------
LLM RESPONSE:  The x-axis interval that has the widest confidence bound is "(5.45, 5.55)" with a width of 1.007.
SOLUTION:  (5.45, 5.55)
--------------------------------------------------------------------------------
LLM RESPONSE:  The x-axis interval that has the widest confidence bound is "(1.0, 1.5)".
SOLUTION:  (1.0, 1.5)
--------------------------------------------------------------------------------
LLM RESPONSE:  The x-axis interval that has the widest confidence bound is "(38.5, 48.5)" with a width of 57155.6.
SOLUTION:  (38.5, 48.5)
--------------------------------------------------------------------------------
LLM RESPONSE:  The x-axis interval with the widest confidence bound is "(387.6646, 512.3292)".
SOLUTION:  (0.0, 6.325)
---------------------

### gemini

In [68]:
import google.generativeai as genai
import os

genai.configure(api_key=os.environ['GEMINI_API_KEY'])

import time

model = genai.GenerativeModel(model_name="gemini-1.5-pro-latest")

for question in questions:
    messages = [{'role':'user', 'parts': [question[0]]}]
    response = model.generate_content(
                messages,
                generation_config=genai.types.GenerationConfig(
                    candidate_count=1,
                    max_output_tokens=500,
                    temperature=0.2),
            )
    try:
        response = response.text
    except:
        print(f"Gemini: Invalid response with parts {response.parts}.")
        print(response)
        response = ""
    print('LLM RESPONSE: ', response)
    print('SOLUTION: ', question[1])
    print('-'*80)
    # sleep 20 sec to avoid rate limit
    time.sleep(20)

LLM RESPONSE:  Here's how to find the interval with the widest confidence bound:

1. **Calculate the width of each confidence interval:** 
   - For each interval, subtract the lower bound from the upper bound. 
   - For example, for the interval (0.05263, 0.0706): -0.216 - (-1.454) = 1.238

2. **Compare the widths:**
   - Go through all the calculated widths and find the largest one.

3. **Identify the corresponding interval:**
   - The interval associated with the largest width is the interval with the widest confidence bound.

Let's calculate for the first few intervals:

- (0.05263, 0.0706): -0.216 - (-1.454) = **1.238**
- (0.0706, 0.07455500000000001): -0.178 - (-1.359) = **1.181**
- (0.07455500000000001, 0.07589499999999999): -0.151 - (-1.244) = **1.093** 
- ... and so on

You would need to continue this calculation for all intervals.  The interval with the largest difference between its upper and lower bound is the interval with the widest confidence bound. 

SOLUTION:  (0.05263,

DeadlineExceeded: 504 Deadline Exceeded