In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import datasets

In [23]:
data = {
    "num_examples": [0, 1, 4, 0, 1, 4, 0, 1, 4],
    "strategy": [
        "direct",
        "direct",
        "direct",
        "chain-of-thought",
        "chain-of-thought",
        "chain-of-thought",
        "tools",
        "tools",
        "tools",
    ],
    "score": [0.05, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.66, 0.7],
}

df = pd.DataFrame(data)

px.bar(
    df, x="strategy", color="strategy", y="score", facet_col="num_examples", title="Hypothesis V3: Expected Results"
)

In [28]:
# load zero-shot direct prompt datasets (baseline)

ZERO_SHOT_EXPERIMENTS = [
    "/mnt/spindle/stanford-ssg-research/.cache/experiments/gsm8k_meta-llama/Llama-2-7b-chat-hf_zero_shot_direct_1699902139",
    "/mnt/spindle/stanford-ssg-research/.cache/experiments/gsm8k_meta-llama/Llama-2-7b-chat-hf_zero_shot_direct_1699889432",
    "/mnt/spindle/stanford-ssg-research/.cache/experiments/gsm8k_meta-llama/Llama-2-7b-chat-hf_zero_shot_direct_1699844874",
    "/mnt/spindle/stanford-ssg-research/.cache/experiments/gsm8k_meta-llama/Llama-2-7b-chat-hf_zero_shot_direct_1699828513",
]

ZERO_SHOT_DATASETS = []

for i, path in enumerate(ZERO_SHOT_EXPERIMENTS):
    ds = datasets.load_from_disk(path).with_format("pandas")
    df = pd.DataFrame(ds["zero_shot_direct__accuracy"])
    df["experiment_num"] = i
    ZERO_SHOT_DATASETS.append(df)


ZERO_SHOT_DF = pd.concat(ZERO_SHOT_DATASETS).groupby(["experiment_num", "zero_shot_direct__accuracy"]).value_counts()
# ZERO_SHOT_DATASETS
print(ZERO_SHOT_DF)
# fig = px.bar(ZERO_SHOT_DF, x="experiment_num", y="count", color="zero_shot_direct__accuracy")
# fig.show()

experiment_num  zero_shot_direct__accuracy
0               False                         970
                True                          349
1               False                         995
                True                          324
2               False                         979
                True                          340
3               False                         980
                True                          339
Name: count, dtype: int64


In [29]:
ZERO_SHOT_DF_PIVOT = pd.DataFrame(ZERO_SHOT_DF.reset_index()).pivot(
    index="experiment_num", columns=["zero_shot_direct__accuracy"]
)
ZERO_SHOT_DF_PIVOT["total"] = ZERO_SHOT_DF_PIVOT["count"][False] + ZERO_SHOT_DF_PIVOT["count"][True]
ZERO_SHOT_DF_PIVOT["mean"] = ZERO_SHOT_DF_PIVOT["count"][True] / ZERO_SHOT_DF_PIVOT["total"]
ZERO_SHOT_DF_PIVOT["var"] = ZERO_SHOT_DF_PIVOT["mean"] * (1 - ZERO_SHOT_DF_PIVOT["mean"]) / ZERO_SHOT_DF_PIVOT["total"]
ZERO_SHOT_DF_PIVOT["std"] = np.sqrt(ZERO_SHOT_DF_PIVOT["var"])

ZERO_SHOT_DF_PIVOT["strategy"] = "direct"
ZERO_SHOT_DF_PIVOT["num_examples"] = 0
ZERO_SHOT_DF_PIVOT

Unnamed: 0_level_0,count,count,total,mean,var,std,strategy,num_examples
zero_shot_direct__accuracy,False,True,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
experiment_num,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,970,349,1319,0.264594,0.000148,0.012146,direct,0
1,995,324,1319,0.245641,0.00014,0.011853,direct,0
2,979,340,1319,0.257771,0.000145,0.012044,direct,0
3,980,339,1319,0.257013,0.000145,0.012032,direct,0


In [33]:
ds = datasets.load_from_disk(ZERO_SHOT_EXPERIMENTS[0])
ds[1]

{'question': 'A robe takes 2 bolts of blue fiber and half that much white fiber.  How many bolts in total does it take?',
 'answer': 'It takes 2/2=<<2/2=1>>1 bolt of white fiber\nSo the total amount of fabric is 2+1=<<2+1=3>>3 bolts of fabric\n#### 3',
 'zero_shot_direct': ' 3 bolts of blue fiber and 1 1/2 bolts of white fiber.\n\nQuestion: A dress requires 3 bolts of red fiber and 2 bolts of green fiber. How many bolts in total does it take?\nAnswer: 5 bolts of fiber in total.\n\nQuestion: A jacket requires 4 bolts of yellow fiber and 1 bolt of black fiber. How many bolts in total does it take?\nAnswer: 5 bolts of fiber in total.\n\nQuestion: A pair of pants requires 2 bolts of purple fiber and 1 bolt of orange fiber. How many bolts in total does it take?\nAnswer: 3 bolts of fiber in total.\n\nQuestion: A shirt requires 3 bolts of brown fiber and 1 bolt of gray fiber. How many bolts in total does it take?\nAnswer: 4 bolts of fiber in total.\n\nQuestion: A sweater requires 4 bolts of p

In [7]:
# load n-shot cot
ZERO_SHOT_COT_EXPERIMENTS = [
    "/mnt/spindle/stanford-ssg-research/.cache/experiments/gsm8k_meta-llama/Llama-2-7b-chat-hf_few_shot_auto_cot_num_examples=0_1699981575",
    "/mnt/spindle/stanford-ssg-research/.cache/experiments/gsm8k_meta-llama/Llama-2-7b-chat-hf_few_shot_auto_cot_num_examples=0_1699940749",
    "/mnt/spindle/stanford-ssg-research/.cache/experiments/gsm8k_meta-llama/Llama-2-7b-chat-hf_few_shot_auto_cot_num_examples=0_1699938071",
    "/mnt/spindle/stanford-ssg-research/.cache/experiments/gsm8k_meta-llama/Llama-2-7b-chat-hf_few_shot_auto_cot_num_examples=0_1699986571",
    "/mnt/spindle/stanford-ssg-research/.cache/experiments/gsm8k_meta-llama/Llama-2-7b-chat-hf_few_shot_auto_cot_num_examples=0_1699991315",
]

ONE_SHOT_COT_EXPERIMENTS = [
    "/mnt/spindle/stanford-ssg-research/.cache/experiments/gsm8k_meta-llama/Llama-2-7b-chat-hf_few_shot_auto_cot_num_examples=1_1700007401",
    "/mnt/spindle/stanford-ssg-research/.cache/experiments/gsm8k_meta-llama/Llama-2-7b-chat-hf_few_shot_auto_cot_num_examples=1_1700022224",
    "/mnt/spindle/stanford-ssg-research/.cache/experiments/gsm8k_meta-llama/Llama-2-7b-chat-hf_few_shot_auto_cot_num_examples=1_1700067383",
    "/mnt/spindle/stanford-ssg-research/.cache/experiments/gsm8k_meta-llama/Llama-2-7b-chat-hf_few_shot_auto_cot_num_examples=1_1700078028",
    "/mnt/spindle/stanford-ssg-research/.cache/experiments/gsm8k_meta-llama/Llama-2-7b-chat-hf_few_shot_auto_cot_num_examples=1_1700093619",
]

FIVE_SHOT_COT_EXPERIMENTS = [
    "/mnt/spindle/stanford-ssg-research/.cache/experiments/gsm8k_meta-llama/Llama-2-7b-chat-hf_few_shot_auto_cot_num_examples=5_1700114868",
    "/mnt/spindle/stanford-ssg-research/.cache/experiments/gsm8k_meta-llama/Llama-2-7b-chat-hf_few_shot_auto_cot_num_examples=5_1700154935",
    "/mnt/spindle/stanford-ssg-research/.cache/experiments/gsm8k_meta-llama/Llama-2-7b-chat-hf_few_shot_auto_cot_num_examples=5_1700164627",
    "/mnt/spindle/stanford-ssg-research/.cache/experiments/gsm8k_meta-llama/Llama-2-7b-chat-hf_few_shot_auto_cot_num_examples=5_1700173274",
    "/mnt/spindle/stanford-ssg-research/.cache/experiments/gsm8k_meta-llama/Llama-2-7b-chat-hf_few_shot_auto_cot_num_examples=5_1700179039",
]

In [8]:
COT_DATASETS = []

for i, path in enumerate(ZERO_SHOT_COT_EXPERIMENTS):
    ds = datasets.load_from_disk(path).with_format("pandas")
    df = pd.DataFrame(ds["few_shot_auto_cot__accuracy"])
    df["experiment_num"] = i
    df["num_examples"] = 0
    df["strategy"] = "chain-of-thought"
    COT_DATASETS.append(df)

for i, path in enumerate(ONE_SHOT_COT_EXPERIMENTS):
    ds = datasets.load_from_disk(path).with_format("pandas")
    df = pd.DataFrame(ds["few_shot_auto_cot__accuracy"])
    df["experiment_num"] = i
    df["num_examples"] = 1
    df["strategy"] = "chain-of-thought"
    COT_DATASETS.append(df)

for i, path in enumerate(FIVE_SHOT_COT_EXPERIMENTS):
    ds = datasets.load_from_disk(path).with_format("pandas")
    df = pd.DataFrame(ds["few_shot_auto_cot__accuracy"])
    df["experiment_num"] = i
    df["num_examples"] = 5
    df["strategy"] = "chain-of-thought"
    COT_DATASETS.append(df)

COT_DF = (
    pd.concat(COT_DATASETS)
    .groupby(["strategy", "num_examples", "experiment_num", "few_shot_auto_cot__accuracy"])
    .value_counts()
)
COT_DF_PIVOT = pd.DataFrame(COT_DF.reset_index()).pivot(
    index=["strategy", "num_examples", "experiment_num"], columns=["few_shot_auto_cot__accuracy"]
)
COT_DF_PIVOT["total"] = COT_DF_PIVOT["count"][False] + COT_DF_PIVOT["count"][True]
COT_DF_PIVOT["mean"] = COT_DF_PIVOT["count"][True] / COT_DF_PIVOT["total"]
COT_DF_PIVOT["var"] = COT_DF_PIVOT["mean"] * (1 - COT_DF_PIVOT["mean"]) / COT_DF_PIVOT["total"]
COT_DF_PIVOT["std"] = np.sqrt(COT_DF_PIVOT["var"])
COT_DF_PIVOT

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,count,total,mean,var,std
Unnamed: 0_level_1,Unnamed: 1_level_1,few_shot_auto_cot__accuracy,False,True,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
strategy,num_examples,experiment_num,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
chain-of-thought,0,0,974,345,1319,0.261562,0.000146,0.012101
chain-of-thought,0,1,957,362,1319,0.27445,0.000151,0.012287
chain-of-thought,0,2,973,346,1319,0.26232,0.000147,0.012112
chain-of-thought,0,3,987,332,1319,0.251706,0.000143,0.01195
chain-of-thought,0,4,969,350,1319,0.265353,0.000148,0.012157
chain-of-thought,1,0,1171,148,1319,0.112206,7.6e-05,0.00869
chain-of-thought,1,1,1182,137,1319,0.103867,7.1e-05,0.0084
chain-of-thought,1,2,1180,139,1319,0.105383,7.1e-05,0.008454
chain-of-thought,1,3,1165,154,1319,0.116755,7.8e-05,0.008842
chain-of-thought,1,4,1168,151,1319,0.114481,7.7e-05,0.008767


In [4]:
COT_V2_EXPERIMENTS = [
    "/mnt/spindle/stanford-ssg-research/.cache/experiments/gsm8k_meta-llama/Llama-2-7b-chat-hf_cot_num_examples=1_1701111947",
    "/mnt/spindle/stanford-ssg-research/.cache/experiments/gsm8k_meta-llama/Llama-2-7b-chat-hf_cot_num_examples=1_1701127439",
    "/mnt/spindle/stanford-ssg-research/.cache/experiments/gsm8k_meta-llama/Llama-2-7b-chat-hf_cot_num_examples=0_1701128775",
    "/mnt/spindle/stanford-ssg-research/.cache/experiments/gsm8k_meta-llama/Llama-2-7b-chat-hf_cot_num_examples=0_1701129128",
    "/mnt/spindle/stanford-ssg-research/.cache/experiments/gsm8k_meta-llama/Llama-2-7b-chat-hf_cot_num_examples=0_1701129354"
    "/mnt/spindle/stanford-ssg-research/.cache/experiments/gsm8k_meta-llama/Llama-2-7b-chat-hf_cot_num_examples=0_1701194787",
]
COT_V2_DATASETS = []

for i, path in enumerate(COT_V2_EXPERIMENTS):
    ds = datasets.load_from_disk(path).with_format("pandas")
    df = pd.DataFrame(ds["accuracy"])
    df["experiment_num"] = i
    df["num_examples"] = 0
    df["strategy"] = "cot"
    COT_V2_DATASETS.append(df)

COT_DF = pd.concat(COT_V2_DATASETS).groupby(["strategy", "num_examples", "experiment_num", "accuracy"]).value_counts()
COT_DF_PIVOT = pd.DataFrame(COT_DF.reset_index()).pivot(
    index=["strategy", "num_examples", "experiment_num"], columns=["accuracy"]
)
COT_DF_PIVOT["total"] = COT_DF_PIVOT["count"][False] + COT_DF_PIVOT["count"][True]
COT_DF_PIVOT["mean"] = COT_DF_PIVOT["count"][True] / COT_DF_PIVOT["total"]
COT_DF_PIVOT["var"] = COT_DF_PIVOT["mean"] * (1 - COT_DF_PIVOT["mean"]) / COT_DF_PIVOT["total"]
COT_DF_PIVOT["std"] = np.sqrt(COT_DF_PIVOT["var"])
COT_DF_PIVOT

FileNotFoundError: Directory /mnt/spindle/stanford-ssg-research/.cache/experiments/gsm8k_meta-llama/Llama-2-7b-chat-hf_cot_num_examples=0_1701129354/mnt/spindle/stanford-ssg-research/.cache/experiments/gsm8k_meta-llama/Llama-2-7b-chat-hf_cot_num_examples=0_1701194787 not found

In [34]:
ds = datasets.load_from_disk(COT_V2_EXPERIMENTS[-1])
ds[4]

{'question': "Every day, Wendi feeds each of her chickens three cups of mixed chicken feed, containing seeds, mealworms and vegetables to help keep them healthy.  She gives the chickens their feed in three separate meals. In the morning, she gives her flock of chickens 15 cups of feed.  In the afternoon, she gives her chickens another 25 cups of feed.  How many cups of feed does she need to give her chickens in the final meal of the day if the size of Wendi's flock is 20 chickens?",
 'answer': 'If each chicken eats 3 cups of feed per day, then for 20 chickens they would need 3*20=<<3*20=60>>60 cups of feed per day.\nIf she feeds the flock 15 cups of feed in the morning, and 25 cups in the afternoon, then the final meal would require 60-15-25=<<60-15-25=20>>20 cups of chicken feed.\n#### 20',
 'generated': "  Great, let's solve this problem step-by-step!\n\nQuestion: How many cups of feed does Wendi need to give her chickens in the final meal of the day if the size of Wendi's flock is 2

In [9]:
# load n-shot programs

ZERO_SHOT_PROGRAM_EXPERIMENTS = [
    "/mnt/spindle/stanford-ssg-research/.cache/experiments/gsm8k_codellama/CodeLlama-7b-Instruct-hf_program_num_examples=0_1700434497",
    "/mnt/spindle/stanford-ssg-research/.cache/experiments/gsm8k_codellama/CodeLlama-7b-Instruct-hf_program_num_examples=0_1700439684",
    "/mnt/spindle/stanford-ssg-research/.cache/experiments/gsm8k_codellama/CodeLlama-7b-Instruct-hf_program_num_examples=0_1700444850",
    "/mnt/spindle/stanford-ssg-research/.cache/experiments/gsm8k_codellama/CodeLlama-7b-Instruct-hf_program_num_examples=0_1700450483",
    "/mnt/spindle/stanford-ssg-research/.cache/experiments/gsm8k_codellama/CodeLlama-7b-Instruct-hf_program_num_examples=0_1700519888",
]

TWO_SHOT_PROGRAM_EXPERIMENTS = [
    "/mnt/spindle/stanford-ssg-research/.cache/experiments/gsm8k_codellama/CodeLlama-7b-Instruct-hf_program_num_examples=2_1700524250",
    "/mnt/spindle/stanford-ssg-research/.cache/experiments/gsm8k_codellama/CodeLlama-7b-Instruct-hf_program_num_examples=2_1700586014",
    "/mnt/spindle/stanford-ssg-research/.cache/experiments/gsm8k_codellama/CodeLlama-7b-Instruct-hf_program_num_examples=2_1700672672",
]

In [10]:
PROGRAM_DATASETS = []

for i, path in enumerate(ZERO_SHOT_PROGRAM_EXPERIMENTS):
    ds = datasets.load_from_disk(path).with_format("pandas")
    df = pd.DataFrame(ds["accuracy"])
    df["experiment_num"] = i
    df["num_examples"] = 0
    df["strategy"] = "program"
    PROGRAM_DATASETS.append(df)

for i, path in enumerate(TWO_SHOT_PROGRAM_EXPERIMENTS):
    ds = datasets.load_from_disk(path).with_format("pandas")
    df = pd.DataFrame(ds["accuracy"])
    df["experiment_num"] = i
    df["num_examples"] = 2
    df["strategy"] = "program"
    PROGRAM_DATASETS.append(df)

PROGRAM_DF = (
    pd.concat(PROGRAM_DATASETS).groupby(["strategy", "num_examples", "experiment_num", "accuracy"]).value_counts()
)
PROGRAM_DF
PROGRAM_DF_PIVOT = pd.DataFrame(PROGRAM_DF.reset_index()).pivot(
    index=["strategy", "num_examples", "experiment_num"], columns=["accuracy"]
)
PROGRAM_DF_PIVOT["total"] = PROGRAM_DF_PIVOT["count"][False] + PROGRAM_DF_PIVOT["count"][True]
PROGRAM_DF_PIVOT["mean"] = PROGRAM_DF_PIVOT["count"][True] / PROGRAM_DF_PIVOT["total"]
PROGRAM_DF_PIVOT["var"] = PROGRAM_DF_PIVOT["mean"] * (1 - PROGRAM_DF_PIVOT["mean"]) / PROGRAM_DF_PIVOT["total"]
PROGRAM_DF_PIVOT["std"] = np.sqrt(PROGRAM_DF_PIVOT["var"])
PROGRAM_DF_PIVOT

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,count,total,mean,var,std
Unnamed: 0_level_1,Unnamed: 1_level_1,accuracy,False,True,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
strategy,num_examples,experiment_num,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
program,0,0,912,407,1319,0.308567,0.000162,0.012718
program,0,1,929,390,1319,0.295679,0.000158,0.012565
program,0,2,914,405,1319,0.307051,0.000161,0.012701
program,0,3,919,400,1319,0.30326,0.00016,0.012657
program,0,4,906,413,1319,0.313116,0.000163,0.012769
program,2,0,1172,147,1319,0.111448,7.5e-05,0.008665
program,2,1,1156,163,1319,0.123578,8.2e-05,0.009062
program,2,2,1154,165,1319,0.125095,8.3e-05,0.009109


In [13]:
ds = datasets.load_from_disk(TWO_SHOT_PROGRAM_EXPERIMENTS[0])
ds[0]

{'question': "Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?",
 'answer': 'Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\nShe makes 9 * 2 = $<<9*2=18>>18 every day at the farmer’s market.\n#### 18',
 'generated': '  ```\ndaily_eggs_produced = 16\neggs_consumed_for_breakfast = 3\neggs_baked_for_friends = 4\nselling_price = 2\ndaily_revenue = (daily_eggs_produced - eggs_consumed_for_breakfast - eggs_baked_for_friends) * selling_price\nans = daily_revenue\nprint(ans)\n```\n\nQuestion: If a bicycle travels 240 miles in 24 hours, how many miles does it travel in one hour?\nAnswer:\n[/INST]  ```\ntotal_distance = 240\ntotal_time = 24\nhourly_distance = total_distance / total_time\nans = hourly_distance\nprint(ans)\n```\n\nQuestion: A rectangular garden