# Monotonicity Benchmark

In [15]:
# add parent directory to path
import sys
sys.path.append('..')

import copy
import random

import t2ebm
from t2ebm import graphs
from t2ebm import prompts

## Create the benchmark

In [3]:
# Load the graphs stored in all_graphs.pkl (later cells unpack each entry as
# a (graph, graph_as_text) pair).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only use an all_graphs.pkl that comes from a trusted source.
import pickle

with open("all_graphs.pkl", "rb") as graphs_file:
    all_graphs = pickle.load(graphs_file)

In [73]:
def is_monotone_increasing(graph):
    """Check that the graph's scores never drop from one bin to the next.

    Every position of ``graph.x_vals`` after the first is compared with its
    predecessor in ``graph.scores``. Equal neighbouring scores are allowed, so
    a flat graph (or one with fewer than two bins) counts as increasing.

    Return: True/False
    """
    num_bins = len(graph.x_vals)
    # A single strict drop between neighbours is enough to reject the graph.
    return not any(
        graph.scores[pos] < graph.scores[pos - 1] for pos in range(1, num_bins)
    )

def is_monotone_decreasing(graph):
    """Check that the graph's scores never rise from one bin to the next.

    Every position of ``graph.x_vals`` after the first is compared with its
    predecessor in ``graph.scores``. Equal neighbouring scores are allowed, so
    a flat graph (or one with fewer than two bins) counts as decreasing.

    Return: True/False
    """
    num_bins = len(graph.x_vals)
    # A single strict rise between neighbours is enough to reject the graph.
    return not any(
        graph.scores[pos] > graph.scores[pos - 1] for pos in range(1, num_bins)
    )

def invert_graph(graph: t2ebm.graphs.EBMGraph):
    """Build a mirrored copy of ``graph`` whose scores have the opposite sign.

    The input graph is left untouched: the negation is applied to a deep copy,
    which is then returned.
    """
    flipped = copy.deepcopy(graph)
    # unary minus: assumes `scores` supports negation (e.g. a numpy array) — TODO confirm
    flipped.scores = -flipped.scores
    return flipped

In [107]:
# Upper limit on the number of "Not monotone" questions. The limit is applied
# in iteration order, so the first non-monotone graphs in all_graphs are kept.
MAX_NON_MONOTONE = 62

# Prompt text that is identical for every question; only the graph changes.
QUESTION_INTRO = """Below is the graph of a generalized additive model (GAM). The graph is presented as a JSON object with keys representing the x-axis and values representing the y-axis. For continuous features, the keys are intervals that represent ranges where the function predicts the same value. For categorical features, each key represents a possible value that the feature can take.
        
    The graph is provided in the following format:
        - The name of the feature depicted in the graph
        - The type of the feature (continuous, categorical, or boolean)
        - Mean values
        - Lower bounds of confidence interval (optional)
        - Upper bounds of confidence interval (optional)\n\n"""
QUESTION_TASK = "Your task is to determine if the graph is\na) monotone increasing\nb) monotone decreasing\nc) not monotone.\n\nWhat is the correct answer? Think step by step."

# Each entry of `questions` is a (prompt, solution) pair, where solution is one
# of "Increasing", "Decreasing" or "Not monotone".
questions = []
num_non_monotone = 0
for graph, graph_as_text in all_graphs:
    # only continuous graphs
    if graph.feature_type != "continuous":
        continue
    # The label is computed on the graph re-parsed from its text form, i.e. on
    # the same text that is embedded in the prompt.
    graph_ = graphs.text_to_graph(graph_as_text)
    # "Increasing" is tested first, so a graph that satisfies both checks
    # (e.g. constant scores) is labelled "Increasing".
    if is_monotone_increasing(graph_):
        solution = "Increasing"
    elif is_monotone_decreasing(graph_):
        solution = "Decreasing"
    elif num_non_monotone < MAX_NON_MONOTONE:
        num_non_monotone += 1
        solution = "Not monotone"
    else:
        # non-monotone quota already filled: drop this graph
        continue
    # Build the prompt only for graphs that actually enter the benchmark.
    question = QUESTION_INTRO + f"Here is the graph:\n\n{graph_as_text}\n\n" + QUESTION_TASK
    questions.append((question, solution))

In [108]:
# subset 100 random questions
# A dedicated, seeded generator makes the selected subset reproducible across
# kernel restarts (an unseeded shuffle yields a different benchmark each run).
# The slice keeps everything when there are fewer than 100 questions.
SUBSET_SEED = 0
random.Random(SUBSET_SEED).shuffle(questions)
questions = questions[:100]

In [111]:
# print a random question
import random
# random.choice picks an example without reordering `questions` in place, so
# this display-only cell has no side effect on the benchmark data.
example = random.choice(questions)
print(example[0])
print('SOLUTION: ', example[1])

Below is the graph of a generalized additive model (GAM). The graph is presented as a JSON object with keys representing the x-axis and values representing the y-axis. For continuous features, the keys are intervals that represent ranges where the function predicts the same value. For categorical features, each key represents a possible value that the feature can take.
        
    The graph is provided in the following format:
        - The name of the feature depicted in the graph
        - The type of the feature (continuous, categorical, or boolean)
        - Mean values
        - Lower bounds of confidence interval (optional)
        - Upper bounds of confidence interval (optional)

Here is the graph:

This graph represents a continuous-valued feature. The keys are intervals that represent ranges where the function predicts the same value.

Feature Name: FoodCourt
Feature Type: continuous
Means: {"(0.0, 593.5)": -0.177, "(593.5, 779.5)": 0.043, "(779.5, 1341.5)": 0.27, "(1341.5, 2

In [112]:
# number of questions whose solution is "Not monotone" (shown as the cell output)
sum(1 for entry in questions if entry[1] == "Not monotone")

62

In [114]:
# Write the benchmark to disk as JSON; json serialises each (question, solution)
# tuple as a two-element array.
import json
with open("../benchmark/monotonicity.json", "w") as out_file:
    json.dump(questions, out_file, indent=2)

# Run the benchmark

In [119]:
# Read the benchmark back from the JSON file; JSON arrays are decoded as Python
# lists, and the cells below read index 0 as the question and index 1 as the solution.
with open("../benchmark/monotonicity.json", "r") as in_file:
    questions = json.load(in_file)

In [123]:
# Two-turn evaluation: the model first answers the full question, then is asked
# to condense its answer to one of the three labels, which is printed next to
# the reference solution.
system_msg = "You are an expert statistician and data scientist. You interpret global explanations produced by a generalized additive model (GAM). You answer all questions to the best of your ability, combining the data contained in the graph, any data set description you are given, and your knowledge about the real world."
gpt_model = 'gpt-3.5-turbo-0125'
for question in questions:
    # turn 1: system prompt + the benchmark question
    messages = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": question[0]},
    ]
    response = t2ebm.utils.openai_completion_query(gpt_model, messages, temperature=0.0)
    # turn 2: feed the answer back and request a one-label summary
    messages += [
        {"role": "assistant", "content": response},
        {"role": "user", "content": "Thanks. Now summarize your response by answering with 'Increasing', 'Decreasing', or 'Not monotone'."},
    ]
    response = t2ebm.utils.openai_completion_query(gpt_model, messages, temperature=0.0)
    print('LLM RESPONSE: ', response)
    print('SOLUTION: ', question[1])
    print('-'*80)

LLM RESPONSE:  Increasing
SOLUTION:  Not monotone
--------------------------------------------------------------------------------
LLM RESPONSE:  Not monotone
SOLUTION:  Not monotone
--------------------------------------------------------------------------------
LLM RESPONSE:  Decreasing
SOLUTION:  Not monotone
--------------------------------------------------------------------------------
LLM RESPONSE:  Not monotone
SOLUTION:  Not monotone
--------------------------------------------------------------------------------
LLM RESPONSE:  Increasing
SOLUTION:  Increasing
--------------------------------------------------------------------------------
LLM RESPONSE:  Increasing
SOLUTION:  Increasing
--------------------------------------------------------------------------------
LLM RESPONSE:  Increasing
SOLUTION:  Increasing
--------------------------------------------------------------------------------
LLM RESPONSE:  Increasing
SOLUTION:  Increasing
-------------------------------------

### Gemini

In [122]:
import google.generativeai as genai
import os

# The API key is read from the environment; a missing GEMINI_API_KEY raises KeyError here.
genai.configure(api_key=os.environ['GEMINI_API_KEY'])

import time

model = genai.GenerativeModel(model_name="gemini-1.5-pro-latest")

# Ask Gemini each benchmark question (single turn, no system message) and print
# its full answer next to the reference solution.
for question in questions:
    messages = [{'role':'user', 'parts': [question[0]]}]
    response = model.generate_content(
        messages,
        generation_config=genai.types.GenerationConfig(
            candidate_count=1,
            max_output_tokens=500,
            temperature=0.2),
    )
    try:
        response = response.text
    except Exception as err:
        # `response.text` fails when the reply carries no usable text
        # (presumably a ValueError for blocked/empty replies — TODO confirm
        # against the installed google-generativeai version).
        # Catching Exception rather than using a bare `except:` keeps
        # KeyboardInterrupt/SystemExit working, and printing the error avoids
        # touching `response.parts`, which may itself raise on such replies.
        print(f"Gemini: Invalid response ({err}).")
        response = ""
    print('LLM RESPONSE: ', response)
    print('SOLUTION: ', question[1])
    print('-'*80)
    # sleep 20 sec to avoid rate limit
    time.sleep(20)

LLM RESPONSE:  Here's how to determine the monotonicity of the graph:

1. **Understand Monotonicity:**
   - **Monotone Increasing:** A function is monotone increasing if, as the input increases, the output either increases or stays the same. It never decreases.
   - **Monotone Decreasing:** A function is monotone decreasing if, as the input increases, the output either decreases or stays the same. It never increases.

2. **Analyze the "Means" data:**  We need to see if the mean values consistently increase or decrease as the input intervals increase. Here's a breakdown:

   | Interval        | Mean     | 
   |-----------------|----------|
   | (0.0, 593.5)     | -0.177   | 
   | (593.5, 779.5)    | 0.043    | 
   | (779.5, 1341.5)   | 0.27     |
   | (1341.5, 2175.5)  | 0.543    |
   | ...             | ...      | 

3. **Identify the Trend:** As you move from left to right across the intervals, the mean values are generally increasing. There's a small dip between the intervals (5218.5,