In [1]:
from dataclasses import dataclass
import sample
import json
import pandas as pd
from solver import perform_one_token_cpc, perform_cot_cpc
from solver import perform_one_token_cpc, perform_cot_cpc

from llm import LLM
from openai import OpenAI

from dotenv import load_dotenv
load_dotenv()

True

When the AI considers whether or not to step back, does its one-word answer differ from its chain-of-thought (CoT) answer?

In [2]:
# Load the passages file. The context manager closes the file handle as soon as
# parsing finishes, and the explicit encoding keeps decoding independent of the
# platform's default locale.
with open("data/passages1.json", encoding="utf-8") as passages_file:
    passages = json.load(passages_file)

Each 'passage' is a lengthy text where we are reasoning through a problem.
Consider progressively larger context parts of each passage (that is, we are 'checking in' as we proceed through reading the passage), and for each context part, ask the LLM if the current approach is working or not.

In [None]:
# One-row smoke test: run both CPC checks on every checkpoint of the first
# "coding_interviews" passage and print the context followed by each result.
llm = LLM("gpt-3.5-turbo")
first_passage = passages["coding_interviews"][0]
for context in sample.checkpoints(first_passage, 1000):
    one_token_answer = perform_one_token_cpc(llm, context)
    cot_thoughts, cot_answer = perform_cot_cpc(llm, context)
    # The trailing empty string reproduces the blank separator line between checkpoints.
    print(context.text, one_token_answer, cot_thoughts, cot_answer, "", sep="\n")

In [19]:
def experiment1(llm, passages, checkpoint_size=1000):
    """Run both CPC checks on every checkpoint of every passage and tabulate the results.

    Args:
        llm: Model handle, passed through unchanged to ``perform_one_token_cpc``
            and ``perform_cot_cpc``.
        passages: Mapping of category name to a list of passages.
        checkpoint_size: Second argument forwarded to ``sample.checkpoints``
            (presumably the checkpoint spacing -- TODO confirm against ``sample``).
            Defaults to 1000, the value that used to be hard-coded.

    Returns:
        A ``pd.DataFrame`` with one row per checkpoint and the columns
        ``category``, ``context``, ``one_token_cpc_result``,
        ``cot_cpc_thoughts`` and ``cot_cpc_result``. The frame is empty (but
        still has these columns) when no checkpoints are produced.
    """
    columns = ["category", "context", "one_token_cpc_result", "cot_cpc_thoughts", "cot_cpc_result"]
    # Collect plain dicts and build the frame once at the end; concatenating a
    # one-row frame per checkpoint re-copies all previous rows on every iteration.
    records = []
    for category, passages_list in passages.items():
        for passage_index, passage in enumerate(passages_list):
            print(f"Category {category}: passage {passage_index} of {len(passages_list)}...")
            for context in sample.checkpoints(passage, checkpoint_size):
                one_token_cpc_result = perform_one_token_cpc(llm, context)
                cot_cpc_thoughts, cot_cpc_result = perform_cot_cpc(llm, context)
                records.append({
                    "category": category,
                    "context": context.text,
                    "one_token_cpc_result": one_token_cpc_result,
                    "cot_cpc_thoughts": cot_cpc_thoughts,
                    "cot_cpc_result": cot_cpc_result,
                })
    # Passing ``columns`` fixes the column order and keeps the schema for an empty run.
    return pd.DataFrame(records, columns=columns)

In [20]:
# Run the full experiment once per model, GPT-3.5 first and then GPT-4,
# keeping each result table under its own name for the scoring cells below.
gpt35_llm = LLM("gpt-3.5-turbo")
experiment1_gpt3 = experiment1(gpt35_llm, passages)

gpt4_llm = LLM("gpt-4")
experiment1_gpt4 = experiment1(gpt4_llm, passages)

Category proofs: passage 0 of 10...
Category proofs: passage 1 of 10...
Category proofs: passage 2 of 10...
Category proofs: passage 3 of 10...
Category proofs: passage 4 of 10...
Category proofs: passage 5 of 10...
Category proofs: passage 6 of 10...
Category proofs: passage 7 of 10...
Category proofs: passage 8 of 10...
Category proofs: passage 9 of 10...
Category coding_interviews: passage 0 of 10...
Category coding_interviews: passage 1 of 10...
Category coding_interviews: passage 2 of 10...
Category coding_interviews: passage 3 of 10...
Category coding_interviews: passage 4 of 10...
Category coding_interviews: passage 5 of 10...
Category coding_interviews: passage 6 of 10...
Category coding_interviews: passage 7 of 10...
Category coding_interviews: passage 8 of 10...
Category coding_interviews: passage 9 of 10...
Category proofs: passage 0 of 10...
Category proofs: passage 1 of 10...
Category proofs: passage 2 of 10...
Category proofs: passage 3 of 10...
Category proofs: passage 4

For each result, determine whether it is good (the two CPC methods agreed) or bad (they disagreed).

In [21]:
def score(df):
    """Mark each row by whether the one-token and CoT answers agree, and print the mean.

    The comparison is case-insensitive. A ``score`` column is written onto
    ``df`` itself (1.0 when ``one_token_cpc_result`` equals ``cot_cpc_result``,
    0.0 otherwise), so the caller's frame is modified in place.

    Args:
        df: Frame with string columns ``one_token_cpc_result`` and
            ``cot_cpc_result``. An empty frame is accepted.

    Returns:
        The same ``df`` object, now carrying the ``score`` column.
    """
    # Vectorized comparison instead of a row-wise apply: apply(axis=1) hands back
    # a DataFrame rather than a Series for an empty frame, which breaks the
    # column assignment below.
    one_token_answers = df["one_token_cpc_result"].str.lower()
    cot_answers = df["cot_cpc_result"].str.lower()
    # A missing answer never compares equal, so such rows score 0.0.
    df["score"] = (one_token_answers == cot_answers).astype(float)
    print(df["score"].mean())
    return df

In [23]:
# Score both result tables. score() writes a "score" column onto the frame it is
# given and prints the mean of that column.
score(experiment1_gpt3)
# Last expression in the cell, so the returned GPT-4 frame is also displayed.
score(experiment1_gpt4)

0.7012987012987013
0.8311688311688312


Unnamed: 0,category,context,one_token_cpc_result,cot_cpc_thoughts,cot_cpc_result,score
0,proofs,"For this problem, I need to prove that for any...",Yes,"Yes, it may be easier to use a direct proof to...",No,0.0
1,proofs,"For this problem, I need to prove that for any...",Yes,"Yes, it seems like our current approach is not...",No,0.0
2,proofs,"For this problem, I need to prove that for any...",No,"No, there is no need to change the approach. T...",No,1.0
3,proofs,"For this problem, I need to prove that for any...",No,There is no need to change to a different appr...,No,1.0
4,proofs,"Hmm, I need to prove this statement about prim...",No,"No, there's no need to change the approach. Th...",No,1.0
...,...,...,...,...,...,...
72,coding_interviews,"For this interview question, I need to write a...",No,"No, your current approach seems sound and effi...",No,1.0
73,coding_interviews,"For this coding interview question, I need to ...",Yes,"Yes, we should use a different approach. A mor...",No,0.0
74,coding_interviews,"For this coding interview question, I need to ...",No,"No, there isn't a need to change the approach ...",No,1.0
75,coding_interviews,"For this coding interview question, I need to ...",No,The current approach is working well and shoul...,No,1.0


In [24]:
# Write each result table to disk with to_csv's default options.
# NOTE: the target file names carry no ".csv" extension.
for results_frame, output_path in (
    (experiment1_gpt3, "experiment1_20narratives_gpt35"),
    (experiment1_gpt4, "experiment1_20narratives_gpt4"),
):
    results_frame.to_csv(output_path)