In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from openai import OpenAI
from dotenv import load_dotenv
import os
import random
import numpy as np
import regex as re
import pandas as pd
import random
from scipy.stats import sem, t
from formal_lang_refactored import ExperimentRunner
import matplotlib.pyplot as plt


# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# The project stores the key under OPENAI_KEY; mirror it to OPENAI_API_KEY.
# os.environ only accepts strings, so a missing OPENAI_KEY must not be copied
# blindly (assigning None raises an opaque TypeError).
_openai_key = os.getenv('OPENAI_KEY')
if _openai_key is not None:
    os.environ['OPENAI_API_KEY'] = _openai_key
elif 'OPENAI_API_KEY' not in os.environ:
    raise RuntimeError(
        "No OpenAI credentials found: set OPENAI_KEY (e.g. in .env) "
        "or export OPENAI_API_KEY before running this notebook."
    )

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
def compute_confidence_interval(data, confidence=0.95):
    """Return the sample mean and a Student-t confidence interval for it.

    Args:
        data: Sequence of numeric observations. It is treated as one flat
            sample.
        confidence: Coverage probability of the interval (default 0.95).

    Returns:
        A ``(mean, (lower, upper))`` tuple.

        * Empty input: ``(nan, (nan, nan))``.
        * A single observation: the interval is undefined, so
          ``(mean, (nan, nan))`` is returned.
        * All observations identical (standard error of 0): the interval
          collapses to ``(mean, mean)``.
    """
    values = np.asarray(data, dtype=float).ravel()
    n = values.size

    if n == 0:
        return np.nan, (np.nan, np.nan)

    mean = np.mean(values)

    # With one observation there are zero degrees of freedom; sem() and
    # t.interval() would only produce NaN plus RuntimeWarnings.
    if n < 2:
        return mean, (np.nan, np.nan)

    std_err = sem(values)

    # t.interval requires scale > 0; a zero standard error would yield
    # (nan, nan) and an "invalid value" warning instead of the obvious
    # degenerate interval.
    if std_err == 0:
        return mean, (mean, mean)

    interval = t.interval(confidence, n - 1, loc=mean, scale=std_err)
    return mean, interval


# Sweep axes: every combination of these values is evaluated once.
length_thresholds = [5, 10, 20, 30, 40, 50]
num_people = [3, 4, 5, 6]
num_few_shot = [0, 2, 3]
locations = [4, 8, 16]
types = ["cot", "standard"]

# Models evaluated at every grid point.
models = ["gpt-3.5-turbo", "gpt-4"]

# results[model][parameter] accumulates one
# (parameter value, mean accuracy, confidence interval) tuple per run.
results = {
    name: {key: [] for key in ("length_threshold", "num_people", "num_few_shot", "num_locs")}
    for name in models
}

# Full grid in nested-loop order: response type varies slowest,
# few-shot count varies fastest.
param_grid = [
    (rt, nl, lt, pc, fs)
    for rt in types
    for nl in locations
    for lt in length_thresholds
    for pc in num_people
    for fs in num_few_shot
]

for response_type, num_locs, length_threshold, people_count, few_shot in param_grid:
    print([response_type, num_locs, length_threshold, people_count, few_shot])

    people = [f"Person_{i}" for i in range(people_count)]
    # num_locs - 1 numbered holes plus the final "field" location.
    holes = [f"hole_{i+1}" for i in range(num_locs - 1)]

    runner = ExperimentRunner(
        people=people,
        locations=holes + ["field"],
        relation="jumps_in",
        models=models,
        num_trials=20,  # Adjust trials as needed
        length_threshold=length_threshold,
        trial_seed=50,
        params={"prompt": str(few_shot), "type": response_type},
    )
    # Indexed by model name below.
    accuracies = runner.model_evals()

    for model in models:
        mean_acc, conf_int = compute_confidence_interval(accuracies[model])
        # File the same measurement under each swept parameter's value.
        swept_values = (
            ("length_threshold", length_threshold),
            ("num_people", people_count),
            ("num_few_shot", few_shot),
            ("num_locs", num_locs),
        )
        for param_name, param_value in swept_values:
            results[model][param_name].append((param_value, mean_acc, conf_int))




['cot', 4, 5, 3, 0]
Beginning...
Trial  0
Trial  5
Trial  10
Trial  15
gpt-3.5-turbo Avg: 0.4
gpt-3.5-turbo confidence interval: (nan, nan)
gpt-4 Avg: 0.0
gpt-4 confidence interval: (nan, nan)
Experiments finished!
['cot', 4, 5, 3, 2]
Beginning...
Trial  0


  lower_bound = _a * scale + loc
  upper_bound = _b * scale + loc


Trial  5
Trial  10
Trial  15
gpt-3.5-turbo Avg: 0.45
gpt-3.5-turbo confidence interval: (nan, nan)
gpt-4 Avg: 0.3
gpt-4 confidence interval: (nan, nan)
Experiments finished!
['cot', 4, 5, 3, 3]
Beginning...
Trial  0
Trial  5
Trial  10
Trial  15
gpt-3.5-turbo Avg: 0.5
gpt-3.5-turbo confidence interval: (nan, nan)
gpt-4 Avg: 0.35
gpt-4 confidence interval: (nan, nan)
Experiments finished!
['cot', 4, 5, 4, 0]
Beginning...
Trial  0
Trial  5
Trial  10
Trial  15
gpt-3.5-turbo Avg: 0.35
gpt-3.5-turbo confidence interval: (nan, nan)
gpt-4 Avg: 0.1
gpt-4 confidence interval: (nan, nan)
Experiments finished!
['cot', 4, 5, 4, 2]
Beginning...
Trial  0
Trial  5
Trial  10
Trial  15
gpt-3.5-turbo Avg: 0.3
gpt-3.5-turbo confidence interval: (nan, nan)
gpt-4 Avg: 0.35
gpt-4 confidence interval: (nan, nan)
Experiments finished!
['cot', 4, 5, 4, 3]
Beginning...
Trial  0
Trial  5
Trial  10
Trial  15
gpt-3.5-turbo Avg: 0.35
gpt-3.5-turbo confidence interval: (nan, nan)
gpt-4 Avg: 0.7
gpt-4 confidence inter

KeyboardInterrupt: 

In [None]:
# Visualization
def plot_trends(data, xlabel, ylabel, title, models, param_name):
    """Plot mean accuracy against one swept parameter, one line per model.

    Args:
        data: Mapping ``data[model][param_name]`` -> list of tuples whose
            first element is the parameter value and whose second element
            is the accuracy recorded for one run.
        xlabel: Label for the x axis.
        ylabel: Label for the y axis.
        title: Figure title.
        models: Iterable of model names to draw.
        param_name: Key selecting which parameter's tuples to plot.

    The stored lists hold one entry per run, so a given parameter value
    appears many times and in sweep order. Entries sharing an x value are
    averaged and the x values are sorted, so each model is drawn as a single
    left-to-right trend line rather than a line that doubles back on itself.
    """
    plt.figure(figsize=(8, 6))
    for model in models:
        # Pool the accuracies of every run that used the same x value.
        pooled = {}
        for point in data[model][param_name]:
            pooled.setdefault(point[0], []).append(point[1])

        x_vals = sorted(pooled)
        y_vals = [sum(pooled[x]) / len(pooled[x]) for x in x_vals]

        # Markers keep a model visible even when only one x value exists
        # (e.g. after a partial run), where a bare line would draw nothing.
        plt.plot(x_vals, y_vals, marker="o", label=model)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()
    plt.grid(True)
    plt.show()

# Plot trends: one figure per parameter collected in `results`
# ("length_threshold", "num_people", "num_few_shot", "num_locs").
plot_trends(results, "Length Threshold", "Accuracy", "Accuracy vs Length Threshold", models, "length_threshold")
plot_trends(results, "Number of People", "Accuracy", "Accuracy vs Number of People", models, "num_people")
plot_trends(results, "Few-shot Examples", "Accuracy", "Accuracy vs Few-shot Examples", models, "num_few_shot")
plot_trends(results, "Number of Locations", "Accuracy", "Accuracy vs Number of Locations", models, "num_locs")