In [None]:
import json
import os
from tqdm import tqdm
from collections import defaultdict
import numpy as np
from typing import *
import time
import copy
import warnings

from data.openai import *
from data.generation import *
from data.finetune import *
from data.inference import *
from data.io import *
from data.evaluation import *
from data.split import *

from utils.paths import *
from utils.metadata import *

In [None]:
import openai

# Take the key from the environment rather than assigning a literal in the notebook:
# assigning None here would discard any key that is already configured, and a real key
# must never be saved into the notebook file.
openai.api_key = os.environ.get("OPENAI_API_KEY")
if openai.api_key is None:
    # Same end state as before (no key), but the reason is now visible up front.
    warnings.warn("OPENAI_API_KEY is not set; OpenAI API calls in this notebook will fail.")

In [None]:
# Every benchmark key used in this notebook. The sub-lists only group the keys visually;
# the concatenated order is the order in which the loops below process them.
ALL_DATASETS = (
    ["single_eq", "addsub", "multiarith", "gsm8k", "aqua", "svamp"]
    + ["date_understanding", "coin_flip"]
    + ["tracking_shuffled_objects", "last_letter_concatenation"]
    + ["commonsense_qa", "strategy_qa"]
)
# Working selection iterated by the cells below; it is an alias of the full list, not a copy.
datasets = ALL_DATASETS

# Teacher Zero-shot-CoT

In [None]:
# Teacher pass: request "zs_cot" completions from text-davinci-002 for every selected dataset.
completion_key = "zs_cot"
model_key = "text-davinci-002"
for dataset_key in datasets:
    title = " {} ".format(dataset_key)
    print(title.center(80, "#"))
    train_indices, test_indices = get_train_test_indices(dataset_key)
    # Both splits are requested. For aqua and gsm8k these indices are a subset of the data
    # (10000 samples were subsampled for train).
    all_indices = train_indices + test_indices
    generate_cot_completions(
        completion_key,
        dataset_key,
        model_key,
        indices=all_indices,
    )

# Student Zero-Shot-CoT

In [None]:
# Zero-shot-CoT runs for the base student models, restricted to the test indices.
completion_key = "zs_cot"
banner = "#" * 80
student_runs = [(m, d) for m in ["ada", "babbage", "curie"] for d in datasets]
for model_key, dataset_key in student_runs:
    print(banner)
    print("Inferring test zs_cot for {}-{}".format(model_key, dataset_key))
    print(banner)
    dataset = load_dataset(dataset_key)  # not used further in this cell
    train_indices, test_indices = get_train_test_indices(dataset_key)
    generate_cot_completions(
        completion_key,
        dataset_key,
        model_key,
        indices=test_indices,
    )

# Student Zero-Shot

In [None]:
# Plain zero-shot runs (no template) for the base student models on the "test" split.
datasets = ALL_DATASETS
completion_key = "zs"
banner = "#" * 80
for model_key in ["ada", "babbage", "curie"]:
    for dataset_key in datasets:
        print(banner)
        print("Inferring test zs for {}-{}".format(model_key, dataset_key))
        print(banner)
        dataset = load_dataset(dataset_key)  # not used further in this cell
        train_indices, test_indices = get_train_test_indices(dataset_key)  # not used further either
        infer_cot_completions(
            completion_key,
            dataset_key,
            model_key,
            template=None,
            split="test",
        )

# Student Few-Shot-CoT

In [None]:
# Few-shot-CoT runs for the base student models, with a 1024-token generation budget.
datasets = ALL_DATASETS
completion_key = "fs_cot_long"
banner = "#" * 80
for model_key in ["ada", "babbage", "curie"]:
    # tracking_shuffled_objects is left out of the few-shot runs.
    for dataset_key in [key for key in datasets if key != "tracking_shuffled_objects"]:
        print(banner)
        print("Inferring test fs-cot for {}-{}".format(model_key, dataset_key))
        print(banner)
        dataset = load_dataset(dataset_key)  # not used further in this cell
        infer_cot_completions(
            completion_key,
            dataset_key,
            model_key,
            template="few_shot_cot",
            split="test",
            max_tokens=1024,
        )

# Curate and Fine-Tune

In [None]:
# Per dataset: turn the teacher's zs_cot completions on the train indices into a fine-tuning
# file, register that file with OpenAI, then start one fine-tune per student base model.
template = "special"
for dataset_key in datasets:
    train_indices, test_indices = get_train_test_indices(dataset_key)
    completion_data = load_completion_data("zs_cot", dataset_key, "text-davinci-002")

    file_key = "zs_cot_{}_{}_train".format(template, dataset_key)
    generate_finetune_data(
        completion_data,
        dataset_key,
        template=template,
        file_key=file_key,
        indices=train_indices,
    )
    create_finetune_file(file_key)  # openai

    # The job suffix is the file key cut to its first 40 characters.
    suffix = file_key[:40]
    for base_model in ["ada", "babbage", "curie"]:
        model_key = "{}_{}".format(base_model, file_key)
        create_finetune(file_key, model_key=model_key, model=base_model, suffix=suffix)

# Fetch Fine-tune-CoT Student Models

In [None]:
# Poll each student fine-tune: count models that already have a registered id, and for the
# rest ask OpenAI whether the job has produced a fine-tuned model yet.
template = "special"
total = 0
complete = 0
for base_model_key in ["ada", "babbage", "curie"]:
    for dataset_key in datasets:
        total += 1
        file_key = "zs_cot_{}_{}_train".format(template, dataset_key)
        model_key = "{}_{}".format(base_model_key, file_key)

        if not get_model_id(model_key, strict=False):
            finetune_id = get_finetune_id(model_key)
            if finetune_id is None:
                print("{:60s} no finetune found".format(model_key))
            else:
                response = openai.FineTune.retrieve(finetune_id)
                model_id = response["fine_tuned_model"]
                if model_id is None:
                    # The job has no model attached yet: show its status instead.
                    print("{:60s} {}".format(model_key, response["status"]))
                else:
                    set_model_id(model_key, model_id)
                    print("{:60s} fetched".format(model_key))
                    print("    {:40s}".format(model_id))
                    complete += 1
        else:
            print("{:60s} exists".format(model_key))
            complete += 1


print("{} of {} models are ready".format(complete, total))

# Fine-Tune-CoT Inference

In [None]:
# Run each fine-tuned student on its own dataset without overriding max_tokens.
completion_key = "finetune_cot"
template = "special"
banner = "#" * 80
for dataset_key in datasets:
    file_key = "zs_cot_{}_{}_train".format(template, dataset_key)
    for base_model_key in ["ada", "babbage", "curie"]:
        print(banner)
        print(dataset_key, base_model_key)
        print(banner)
        model_key = "{}_{}".format(base_model_key, file_key)
        _ = infer_cot_completions(
            completion_key,
            dataset_key,
            model_key,
            template=template,
        )

### Long Inference

In [None]:
# Same student runs as the regular inference cell, but stored under a separate completion
# key and generated with max_tokens=1024.
completion_key = "finetune_cot_long"
template = "special"
banner = "#" * 80
for dataset_key in datasets:
    file_key = "zs_cot_{}_{}_train".format(template, dataset_key)
    for base_model_key in ["ada", "babbage", "curie"]:
        print(banner)
        print(dataset_key, base_model_key)
        print(banner)
        model_key = "{}_{}".format(base_model_key, file_key)
        _ = infer_cot_completions(
            completion_key,
            dataset_key,
            model_key,
            template=template,
            max_tokens=1024,
        )

# Fine-tune-CoT (Other Teachers)

### Teacher Zero-shot-CoT 

In [None]:
# Datasets used by the "other teachers" cells in this section.
model_ablation_datasets = [
    "multiarith",
    "svamp",
    "date_understanding",
    "last_letter_concatenation",
]

In [None]:
# Request "zs_cot" completions from each alternative teacher model on the ablation datasets.
completion_key = "zs_cot"
teacher_models = ["davinci", "text-davinci-001", "text-davinci-003"]
for dataset_key in model_ablation_datasets:
    heading = " {} ".format(dataset_key)
    print(heading.center(80, "#"))
    for model_key in teacher_models:
        train_indices, test_indices = get_train_test_indices(dataset_key)
        # Cover every index of the split, train and test alike.
        all_indices = train_indices + test_indices
        generate_cot_completions(
            completion_key,
            dataset_key,
            model_key,
            indices=all_indices,
        )

### Fine-tune Students

In [None]:
# For each (dataset, teacher) pair, fine-tune the three students on that teacher's zs_cot
# completions - unless the teacher has no correct completion on the train indices.
template = "special"
for dataset_key in model_ablation_datasets:
    for teacher_model in ["davinci", "text-davinci-001", "text-davinci-003"]:
        train_indices, test_indices = get_train_test_indices(dataset_key)
        completion_data = load_completion_data("zs_cot", dataset_key, teacher_model)

        evaluation = evaluate_completions(
            completion_data, dataset_key, template=None, indices=train_indices
        )
        if evaluation.correct.sum() == 0:
            # Nothing to fine-tune on for this pair.
            print("No correct samples for {:20s} {:20s}".format(dataset_key, teacher_model))
        else:
            # The teacher name is part of the key so that files from different teachers stay apart.
            file_key = "zs_cot_{}_{}_{}_train".format(template, dataset_key, teacher_model)
            generate_finetune_data(
                completion_data,
                dataset_key,
                template=template,
                file_key=file_key,
                indices=train_indices,
            )
            create_finetune_file(file_key)  # openai

            suffix = file_key[:40]
            for base_model in ["ada", "babbage", "curie"]:
                model_key = "{}_{}".format(base_model, file_key)
                create_finetune(file_key, model_key=model_key, model=base_model, suffix=suffix)

### Fetch Students

In [None]:
# Poll the fine-tunes of every (teacher, student, dataset) combination and register the
# model id of each job that reports a fine-tuned model.
template = "special"
total = 0
complete = 0
for teacher_model in ["davinci", "text-davinci-001", "text-davinci-003"]:
    for base_model_key in ["ada", "babbage", "curie"]:
        for dataset_key in model_ablation_datasets:
            total += 1
            file_key = "zs_cot_{}_{}_{}_train".format(template, dataset_key, teacher_model)
            model_key = "{}_{}".format(base_model_key, file_key)

            if not get_model_id(model_key, strict=False):
                finetune_id = get_finetune_id(model_key)
                if finetune_id is None:
                    print("{:80s} no finetune found".format(model_key))
                else:
                    response = openai.FineTune.retrieve(finetune_id)
                    model_id = response["fine_tuned_model"]
                    if model_id is None:
                        # No model on the job yet: show the job status instead.
                        print("{:80s} {}".format(model_key, response["status"]))
                    else:
                        set_model_id(model_key, model_id)
                        print("{:80s} fetched".format(model_key))
                        print("    {:40s}".format(model_id))
                        complete += 1
            else:
                print("{:80s} exists".format(model_key))
                complete += 1
print("{} of {} models are ready".format(complete, total))

### Run Students

In [None]:
# Run every student that has a registered model id; combinations without one are reported
# and skipped.
completion_key = "finetune_cot"
template = "special"
banner = "#" * 80
for base_model_key in ["ada", "babbage", "curie"]:
    for teacher_model in ["davinci", "text-davinci-001", "text-davinci-003"]:
        for dataset_key in model_ablation_datasets:
            print(banner)
            print("{}-{}-{}".format(base_model_key, teacher_model, dataset_key))
            print(banner)
            file_key = "zs_cot_{}_{}_{}_train".format(template, dataset_key, teacher_model)
            model_key = "{}_{}".format(base_model_key, file_key)
            if not get_model_id(model_key, strict=False):
                print("Skipping unavailable model {}".format(model_key))
                continue
            _ = infer_cot_completions(
                completion_key,
                dataset_key,
                model_key,
                template=template,
            )

### Run Students (Long Inference)

In [None]:
# Long-generation variant (max_tokens=1024) of the student runs; combinations without a
# registered model id are reported and skipped.
completion_key = "finetune_cot_long"
template = "special"
banner = "#" * 80
for base_model_key in ["ada", "babbage", "curie"]:
    for teacher_model in ["davinci", "text-davinci-001", "text-davinci-003"]:
        for dataset_key in model_ablation_datasets:
            print(banner)
            print("{}-{}-{}".format(base_model_key, teacher_model, dataset_key))
            print(banner)
            file_key = "zs_cot_{}_{}_{}_train".format(template, dataset_key, teacher_model)
            model_key = "{}_{}".format(base_model_key, file_key)
            if not get_model_id(model_key, strict=False):
                print("Skipping unavailable model {}".format(model_key))
                continue
            _ = infer_cot_completions(
                completion_key,
                dataset_key,
                model_key,
                template=template,
                max_tokens=1024,
            )

# ☆ Fine-Tune-CoT w/ Template Splits

In [None]:
# Datasets used by the template-split cells in this section.
ts_datasets = [
    "multiarith",
    "date_understanding",
]

### Fine-tune Students

In [None]:
# Fine-tune the students on the train side of the "template_split" split, reusing the
# zs_cot completions already generated by text-davinci-002.
template = "special"
for dataset_key in ts_datasets:
    dataset = load_dataset(dataset_key)  # not used further in this cell
    train_indices, test_indices = get_train_test_indices(dataset_key, split_key="template_split")

    completion_data = load_completion_data("zs_cot", dataset_key, "text-davinci-002")
    # Not read again below; the lookups touch every split index in completion_data
    # (presumably failing early if one is missing - TODO confirm).
    train_completions = {idx: completion_data[idx] for idx in train_indices}
    test_completions = {idx: completion_data[idx] for idx in test_indices}

    file_key = "zs_cot_{}_{}_template_train".format(template, dataset_key)
    generate_finetune_data(
        completion_data,
        dataset_key,
        template=template,
        file_key=file_key,
        indices=train_indices,
    )
    create_finetune_file(file_key)  # openai

    suffix = file_key[:40]
    for base_model in ["ada", "babbage", "curie"]:
        model_key = "{}_{}".format(base_model, file_key)
        create_finetune(file_key, model_key=model_key, model=base_model, suffix=suffix)

### Fetch Students

In [None]:
# Poll the template-split fine-tunes and register the model id of each job that reports one.
template = "special"
total = 0
complete = 0
for base_model_key in ["ada", "babbage", "curie"]:
    for dataset_key in ts_datasets:
        total += 1
        file_key = "zs_cot_{}_{}_template_train".format(template, dataset_key)
        model_key = "{}_{}".format(base_model_key, file_key)

        if not get_model_id(model_key, strict=False):
            finetune_id = get_finetune_id(model_key)
            if finetune_id is None:
                print("{:60s} no finetune found".format(model_key))
            else:
                response = openai.FineTune.retrieve(finetune_id)
                model_id = response["fine_tuned_model"]
                if model_id is None:
                    # No model on the job yet: show the job status instead.
                    print("{:60s} {}".format(model_key, response["status"]))
                else:
                    set_model_id(model_key, model_id)
                    print("{:60s} fetched".format(model_key))
                    print("    {:40s}".format(model_id))
                    complete += 1
        else:
            print("{:60s} exists".format(model_key))
            complete += 1


print("{} of {} models are ready".format(complete, total))

### Student Inference

In [None]:
# Evaluate the template-split students on the test side of the same split (max_tokens=1024).
completion_key = "finetune_cot_long"
template = "special"
banner = "#" * 80
for dataset_key in ts_datasets:
    file_key = "zs_cot_{}_{}_template_train".format(template, dataset_key)
    for base_model_key in ["ada", "babbage", "curie"]:
        print(banner)
        print(dataset_key, base_model_key)
        print(banner)
        model_key = "{}_{}".format(base_model_key, file_key)
        train, test = get_train_test_indices(dataset_key, split_key="template_split")
        _ = infer_cot_completions(
            completion_key,
            dataset_key,
            model_key,
            indices=test,
            template=template,
            max_tokens=1024,
        )

# ☆ Fine-Tune-CoT w/ Golden Filters

In [None]:
# The following cells read `dataset_key` and `completions` from here.
dataset_key = "date_understanding"
completions = load_completion_data(
    "zs_cot",
    dataset_key,
    "text-davinci-002",
)

In [None]:
# Manual inspection helper: show the first completion of a train sample when its cleansed
# prediction equals the cleansed answer, otherwise just flag it as wrong.
train, test = get_train_test_indices(dataset_key)
rule = "-" * 80
for i in train:
    print("Sample #{:03d}".format(i))
    first = completions[i][0]
    q = first["question"]
    a = first["answer"]
    r = first["reasoning_completion"]
    c = first["completion"]

    clean_a = cleanse_answer(a, dataset_key)
    clean_prediction, prediction_candidates = cleanse_prediction(c, dataset_key, answer_prefix=None)

    print(rule)
    if clean_a != clean_prediction:
        print("WRONG")
    else:
        print(q.strip())
        print(rule)
        print(r.strip())
        print(c.strip())
    print(rule)

    # Deliberately stops after one sample: the loop is stepped by hand during human filtering.
    break

In [None]:
# Collect the train indices whose first teacher completion matches the reference answer.
train, test = get_train_test_indices(dataset_key)
completion_data = load_completion_data("zs_cot", dataset_key, "text-davinci-002")

answer_prefix = "-->"
correct = []
for i in train:
    sample = completion_data[i][0]
    prediction = cleanse_prediction(
        sample["completion"],
        dataset_key,
        answer_prefix=answer_prefix,
        return_all=False,
    )
    answer = cleanse_answer(sample["answer"], dataset_key)
    if compare_prediction_and_answer(prediction, answer, dataset_key):
        correct.append(i)

In [None]:
# Number of train indices collected in `correct` by the cell above.
len(correct)

In [None]:
# Manually listed partition of the correct train indices into "good" and "bad" samples.
good = [6,8,10,16,18,19,27,29,30,35,36,44,45,46,59,61,63,67,68,71,75,76,78,79,80,90,97,112,114,122,133,134,136,138,139,140,141,142,144,145,146,149,152,154,162,164,166,168,170,179,182,184,186,188,190,191,194,196,199,204,208,210,213,215,216,223,226,228,229,231,232,234,236,238,239,241,246,252,253,254,255,258,259,260,261,263,266,268,269,270,271,272,275,276,278,284,287,295,296,297,298,299,300,301,302,304,305,306,307,308,309,311,316,319,320,324,330,334,354,356,357,360,368]
bad = [5,7,11,20,24,33,34,37,52,54,64,65,86,89,92,101,124,126,135,155,157,158,159,160,171,198,200,205,206,212,218,230,233,245,248,249,281,282,283,286,293,313,329,344,350,352,363]

# Sanity checks: the two lists do not overlap, each contains only correct indices, and
# together they account for every correct index.
good_set, bad_set, correct_set = set(good), set(bad), set(correct)
assert good_set.isdisjoint(bad_set)
assert good_set <= correct_set
assert bad_set <= correct_set
assert good_set | bad_set == correct_set

In [None]:
# Sizes of the train split, the correct subset, and the manually listed good / bad subsets.
tuple(map(len, (train, correct, good, bad)))

In [None]:
# Baseline for the "good" subset: a seeded random draw of the same size from all correct indices.
state = np.random.RandomState(seed=0)
shuffled_correct = state.permutation(correct)
good_baseline = shuffled_correct[:len(good)]

In [None]:
# Fine-tune the students on the `good_baseline` indices only.
template = "special"
for dataset_key in ["date_understanding"]:
    dataset = load_dataset(dataset_key)  # not used further in this cell
    completion_data = load_completion_data("zs_cot", dataset_key, "text-davinci-002")

    file_key = "zs_cot_{}_{}_good_baseline".format(template, dataset_key)
    generate_finetune_data(
        completion_data,
        dataset_key,
        template=template,
        file_key=file_key,
        indices=good_baseline,
    )
    create_finetune_file(file_key)  # openai

    suffix = file_key[:40]
    for base_model in ["ada", "babbage", "curie"]:
        model_key = "{}_{}".format(base_model, file_key)
        create_finetune(file_key, model_key=model_key, model=base_model, suffix=suffix)

### Fetch Models

In [None]:
# Poll the good_baseline fine-tunes and register the model id of each job that reports one.
template = "special"
total = 0
complete = 0
for base_model_key in ["ada", "babbage", "curie"]:
    for dataset_key in ["date_understanding"]:
        total += 1
        file_key = "zs_cot_{}_{}_good_baseline".format(template, dataset_key)
        model_key = "{}_{}".format(base_model_key, file_key)

        if not get_model_id(model_key, strict=False):
            finetune_id = get_finetune_id(model_key)
            if finetune_id is None:
                print("{:60s} no finetune found".format(model_key))
            else:
                response = openai.FineTune.retrieve(finetune_id)
                model_id = response["fine_tuned_model"]
                if model_id is None:
                    # No model on the job yet: show the job status instead.
                    print("{:60s} {}".format(model_key, response["status"]))
                else:
                    set_model_id(model_key, model_id)
                    print("{:60s} fetched".format(model_key))
                    print("    {:40s}".format(model_id))
                    complete += 1
        else:
            print("{:60s} exists".format(model_key))
            complete += 1


print("{} of {} models are ready".format(complete, total))

### Student Inference

In [None]:
# Evaluate the golden-filter students on the test split (max_tokens=1024).
template = "special"
completion_key  = "finetune_cot_long"
for dataset_key in ["date_understanding"]:
    train, test = get_train_test_indices(dataset_key)
    # The fine-tune and fetch cells of this section register models under the
    # "..._good_baseline" key, whereas this cell used to look up "..._good" only.
    # Both keys are evaluated here, and a key without a registered model is reported
    # instead of being handed to inference.
    for variant in ["good", "good_baseline"]:
        file_key = "zs_cot_{}_{}_{}".format(template, dataset_key, variant)
        for base_model_key in ["ada", "babbage", "curie"]:
            print("#" * 80)
            print(dataset_key, base_model_key, variant)
            print("#" * 80)
            model_key = "{}_{}".format(base_model_key, file_key)
            if not get_model_id(model_key, strict=False):
                print("Skipping unavailable model {}".format(model_key))
                continue
            _ = infer_cot_completions(completion_key, dataset_key, model_key, indices=test,
                                      template=template, max_tokens=1024)

# ☆ Fine-Tune-CoT w/ All Samples

In [None]:
# Fine-tune the students on every train sample, passing include_incorrect=True so that
# incorrect teacher completions are not filtered out.
template = "special"
for dataset_key in ["date_understanding"]:
    dataset = load_dataset(dataset_key)  # not used further in this cell
    completion_data = load_completion_data("zs_cot", dataset_key, "text-davinci-002")

    file_key = "zs_cot_{}_{}_all".format(template, dataset_key)
    train, test = get_train_test_indices(dataset_key)
    generate_finetune_data(
        completion_data,
        dataset_key,
        template=template,
        file_key=file_key,
        indices=train,
        include_incorrect=True,
    )
    create_finetune_file(file_key)  # openai

    suffix = file_key[:40]
    for base_model in ["ada", "babbage", "curie"]:
        model_key = "{}_{}".format(base_model, file_key)
        create_finetune(file_key, model_key=model_key, model=base_model, suffix=suffix)

In [None]:
# Poll the all-samples fine-tunes and register the model id of each job that reports one.
template = "special"
total = 0
complete = 0
for base_model_key in ["ada", "babbage", "curie"]:
    for dataset_key in ["date_understanding"]:
        total += 1
        file_key = "zs_cot_{}_{}_all".format(template, dataset_key)
        model_key = "{}_{}".format(base_model_key, file_key)

        if not get_model_id(model_key, strict=False):
            finetune_id = get_finetune_id(model_key)
            if finetune_id is None:
                print("{:60s} no finetune found".format(model_key))
            else:
                response = openai.FineTune.retrieve(finetune_id)
                model_id = response["fine_tuned_model"]
                if model_id is None:
                    # No model on the job yet: show the job status instead.
                    print("{:60s} {}".format(model_key, response["status"]))
                else:
                    set_model_id(model_key, model_id)
                    print("{:60s} fetched".format(model_key))
                    print("    {:40s}".format(model_id))
                    complete += 1
        else:
            print("{:60s} exists".format(model_key))
            complete += 1


print("{} of {} models are ready".format(complete, total))

In [None]:
# Evaluate the all-samples students on the test split (max_tokens=1024).
completion_key = "finetune_cot_long"
template = "special"
banner = "#" * 80
for dataset_key in ["date_understanding"]:
    file_key = "zs_cot_{}_{}_all".format(template, dataset_key)
    for base_model_key in ["ada", "babbage", "curie"]:
        print(banner)
        print(dataset_key, base_model_key)
        print(banner)
        model_key = "{}_{}".format(base_model_key, file_key)
        train, test = get_train_test_indices(dataset_key)
        _ = infer_cot_completions(
            completion_key,
            dataset_key,
            model_key,
            indices=test,
            template=template,
            max_tokens=1024,
        )

# Fine-tune-CoT (All Samples w/ Diverse Reasoning)

In [None]:
# For each augmentation count: generate "zs_cot_aug" teacher completions on the train
# indices, then build the fine-tuning file and start the student fine-tunes. Files and
# fine-tunes that already have an id are reported rather than created again.
datasets = ["multiarith", "svamp", "date_understanding", "last_letter_concatenation"]
teacher_key = "text-davinci-002"
template = "special"
for augmentations in [1, 2, 4, 8]:
    print(" Generate {}aug Data ".format(augmentations).center(80, "#"))
    completion_key = "zs_cot_aug"
    for dataset_key in datasets:
        print(dataset_key)
        train_indices, _ = get_train_test_indices(dataset_key)
        generate_cot_completions(
            completion_key,
            dataset_key,
            teacher_key,
            indices=train_indices,
            augmentations=augmentations,
        )

    print(" Generate File, Upload File, Run Fine-tune ".center(80, "#"))
    for dataset_key in datasets:
        completion_data = load_completion_data("zs_cot_aug", dataset_key, "text-davinci-002")
        train_indices, _ = get_train_test_indices(dataset_key)

        file_key = "zs_cot_{}_{}_all_{}aug".format(template, dataset_key, augmentations)
        _ = generate_finetune_data(
            completion_data,
            dataset_key,
            template=template,
            file_key=file_key,
            indices=train_indices,
            augmentations=augmentations,
            include_incorrect=True,
        )

        file_id = get_file_id(file_key)
        if not file_id:
            create_finetune_file(file_key)  # openai
        else:
            print("{:60s}{}".format(file_key, file_id))

        suffix = file_key[:40]
        for base_model in ["ada", "babbage", "curie"]:
            model_key = "{}_{}".format(base_model, file_key)
            finetune_id = get_finetune_id(model_key)
            if not finetune_id:
                create_finetune(file_key, model_key=model_key, model=base_model, suffix=suffix)
            else:
                print("{:60s}{}".format(model_key, finetune_id))

### Fetch Models

In [None]:
# Poll the fine-tunes of every (augmentation count, student, dataset) combination and
# register the model id of each job that reports one.
datasets = ["multiarith", "svamp", "date_understanding", "last_letter_concatenation"]
template = "special"
total = 0
complete = 0

for augmentations in [1, 2, 4, 8]:
    for base_model_key in ["ada", "babbage", "curie"]:
        for dataset_key in datasets:
            total += 1
            file_key = "zs_cot_{}_{}_all_{}aug".format(template, dataset_key, augmentations)
            model_key = "{}_{}".format(base_model_key, file_key)

            if not get_model_id(model_key, strict=False):
                finetune_id = get_finetune_id(model_key)
                if finetune_id is None:
                    print("{:60s} no finetune found".format(model_key))
                else:
                    response = openai.FineTune.retrieve(finetune_id)
                    model_id = response["fine_tuned_model"]
                    if model_id is None:
                        # No model on the job yet: show the job status instead.
                        print("{:60s} {}".format(model_key, response["status"]))
                    else:
                        set_model_id(model_key, model_id)
                        print("{:60s} fetched".format(model_key))
                        print("    {:40s}".format(model_id))
                        complete += 1
            else:
                print("{:60s} exists".format(model_key))
                complete += 1

print("{} of {} models are ready".format(complete, total))

### Long Inference

In [None]:
# Long-generation inference (max_tokens=1024) for every augmented student whose model id
# is registered; the others are reported as not found.
datasets = ["multiarith", "svamp", "date_understanding", "last_letter_concatenation"]
completion_key = "finetune_cot_long"
template = "special"
for base_model_key in ["ada", "babbage", "curie"]:
    for augmentations in [1, 2, 4, 8]:
        for dataset_key in datasets:
            file_key = "zs_cot_{}_{}_all_{}aug".format(template, dataset_key, augmentations)
            print(" {} ".format(file_key).center(80, "#"))
            model_key = "{}_{}".format(base_model_key, file_key)
            if get_model_id(model_key, strict=False) is None:
                print("Model not found")
            else:
                _ = infer_cot_completions(
                    completion_key,
                    dataset_key,
                    model_key,
                    template=template,
                    max_tokens=1024,
                )
            print()

# Few-shot Fine-tune-CoT (Fine-tuning with Few-Shot Samples)

In [None]:
# Few-shot Fine-tune-CoT: for each shot count, generate teacher completions on the few-shot
# train indices, report how many are correct, then build the fine-tuning file and start
# the student fine-tunes.
#
# The dataset selection is pinned here instead of being inherited from whichever cell last
# assigned `datasets`. On a top-to-bottom run this is the same four-dataset list, but
# re-running this cell after the Vanilla FT cells (which set `datasets = ALL_DATASETS`)
# would otherwise launch fine-tunes for every dataset.
datasets = ["multiarith", "svamp", "date_understanding", "last_letter_concatenation"]
for shots in [8, 32, 128]:
    print(" Generate {}shot Data ".format(shots).center(80, "#"))
    for dataset_key in datasets:
        print(dataset_key)
        completion_key = "zs_cot"
        model_key = "text-davinci-002"
        indices = get_few_shot_train_indices(dataset_key, shots=shots)
        generate_cot_completions(completion_key, dataset_key, model_key, indices=indices)

    print(" Correct Samples ".center(80, "#"))
    for dataset_key in datasets:
        completion_data = load_completion_data("zs_cot", dataset_key, "text-davinci-002")
        indices = get_few_shot_train_indices(dataset_key, shots=shots)
        evaluation = evaluate_completions(completion_data, dataset_key, template=None,
                                          indices=indices)
        # Stored under its own name so that the `correct` index list built by the
        # golden-filter cells is not replaced by a number.
        num_correct = get_evaluation_metrics(evaluation)["accuracy"] * shots
        print("{:40s}: {:03d}".format(dataset_key, round(num_correct)))


    print(" Generating FT Data and Registering File ".center(80, "#"))
    for dataset_key in datasets:
        completion_data = load_completion_data("zs_cot", dataset_key, "text-davinci-002")
        indices = get_few_shot_train_indices(dataset_key, shots=shots)

        # Generate FT data
        template = "special"
        file_key = "zs_cot_{}_{}_{}shot".format(template, dataset_key, shots)
        data = generate_finetune_data(completion_data, dataset_key, template=template, file_key=file_key,
                                      indices=indices)

        # Create file on OpenAI (skipped when a file id is already registered)
        file_id = get_file_id(file_key)
        if file_id:
            print("{:60s}{}".format(file_key, file_id))
        else:
            create_finetune_file(file_key)  # openai

        # Create finetunes (skipped per student when a fine-tune id is already registered)
        for base_model in ["ada", "babbage", "curie"]:
            model_key = "{}_{}".format(base_model, file_key)
            finetune_id = get_finetune_id(model_key)
            if finetune_id:
                print("{:60s}{}".format(model_key, finetune_id))
            else:
                create_finetune(file_key, model_key=model_key, model=base_model, suffix=file_key[:40])

### Fetch Models

In [None]:
# Poll the few-shot fine-tunes: count models that already have a registered id, and for the
# rest ask OpenAI whether the job has produced a fine-tuned model yet.
total = 0
complete = 0

for shots in [8, 32, 128]:
    for base_model_key in ["ada", "babbage", "curie"]:
        for dataset_key in datasets:
            total += 1
            template = "special"
            file_key = "zs_cot_{}_{}_{}shot".format(template, dataset_key, shots)
            model_key = "{}_{}".format(base_model_key, file_key)
            if get_model_id(model_key, strict=False):
                print("{:60s} exists".format(model_key))
                complete += 1
                continue

            finetune_id = get_finetune_id(model_key)
            if finetune_id is None:
                print("{:60s} no finetune found".format(model_key))
                continue

            response = openai.FineTune.retrieve(finetune_id)
            model_id = response["fine_tuned_model"]
            if model_id is not None:
                set_model_id(model_key, model_id)
                print("{:60s} fetched".format(model_key))
                print("    {:40s}".format(model_id))
                complete += 1
            else:
                # Report the job's actual status, as the other fetch cells do. A fixed
                # "pending" label would hide jobs that have failed or been cancelled.
                print("{:60s} {}".format(model_key, response["status"]))

print("{} of {} models are ready".format(complete, total))

### Inference

In [None]:
# Run every few-shot student whose model id is registered; the others are reported as not found.
completion_key = "finetune_cot"
template = "special"
for shots in [8, 32, 128]:
    for dataset_key in datasets:
        file_key = "zs_cot_{}_{}_{}shot".format(template, dataset_key, shots)
        for base_model_key in ["ada", "babbage", "curie"]:
            print(" {}-{}-{}shot ".format(dataset_key, base_model_key, shots).center(80, "-"))
            model_key = "{}_{}".format(base_model_key, file_key)
            if get_model_id(model_key, strict=False) is None:
                print("Model not found")
            else:
                _ = infer_cot_completions(
                    completion_key,
                    dataset_key,
                    model_key,
                    template=template,
                )
            print()

### Long Inference

In [None]:
# Long-generation variant (max_tokens=1024) of the few-shot student runs; students without
# a registered model id are reported as not found.
completion_key = "finetune_cot_long"
template = "special"
for shots in [8, 32, 128]:
    for dataset_key in datasets:
        file_key = "zs_cot_{}_{}_{}shot".format(template, dataset_key, shots)
        for base_model_key in ["ada", "babbage", "curie"]:
            print(" {}-{}-{}shot ".format(dataset_key, base_model_key, shots).center(80, "-"))
            model_key = "{}_{}".format(base_model_key, file_key)
            if get_model_id(model_key, strict=False) is None:
                print("Model not found")
            else:
                _ = infer_cot_completions(
                    completion_key,
                    dataset_key,
                    model_key,
                    template=template,
                    max_tokens=1024,
                )
            print()

# Vanilla FT

In [None]:
# Vanilla fine-tuning: build the fine-tuning file straight from the dataset (template=None),
# then create the file and the student fine-tunes unless their ids are already registered.
datasets = ALL_DATASETS
for dataset_key in datasets:
    dataset = load_dataset(dataset_key)
    train_indices, _ = get_train_test_indices(dataset_key)

    file_key = "{}_train".format(dataset_key)
    data = generate_finetune_data(
        dataset,
        dataset_key,
        template=None,
        file_key=file_key,
        indices=train_indices,
    )

    file_id = get_file_id(file_key)
    if not file_id:
        create_finetune_file(file_key)  # openai
    else:
        print("{:60s}{}".format(file_key, file_id))

    suffix = file_key[:40]
    for base_model in ["ada", "babbage", "curie"]:
        model_key = "{}_{}".format(base_model, file_key)
        finetune_id = get_finetune_id(model_key)
        if not finetune_id:
            create_finetune(file_key, model_key=model_key, model=base_model, suffix=suffix)
        else:
            print("{:60s}{}".format(model_key, finetune_id))

### Fetch Models

In [None]:
# Poll the vanilla fine-tunes and register the model id of each job that reports one.
total = 0
complete = 0
rule = "-" * 80

print(rule)
for base_model_key in ["ada", "babbage", "curie"]:
    for dataset_key in datasets:
        total += 1
        file_key = "{}_train".format(dataset_key)
        model_key = "{}_{}".format(base_model_key, file_key)

        if not get_model_id(model_key, strict=False):
            finetune_id = get_finetune_id(model_key)
            if finetune_id is None:
                print("{:60s} no finetune found".format(model_key))
            else:
                response = openai.FineTune.retrieve(finetune_id)
                model_id = response["fine_tuned_model"]
                if model_id is None:
                    # No model on the job yet: show the job status instead.
                    print("{:60s} {}".format(model_key, response["status"]))
                else:
                    set_model_id(model_key, model_id)
                    print("{:60s} fetched".format(model_key))
                    print("    {:40s}".format(model_id))
                    complete += 1
        else:
            print("{:60s} exists".format(model_key))
            complete += 1

print(rule)
print("{} of {} models are ready".format(complete, total))

### Inference

In [None]:
# Run inference with each vanilla fine-tuned model whose id is stored.
completion_key = "ft"
for dataset_key in datasets:
    for base_model_key in ["ada", "babbage", "curie"]:
        banner = " {}-{} ".format(dataset_key, base_model_key)
        print(banner.center(80, "-"))

        file_key = "{}_train".format(dataset_key)
        model_key = "{}_{}".format(base_model_key, file_key)
        if get_model_id(model_key, strict=False) is None:
            print("Model not found")
        else:
            completion_data = infer_cot_completions(
                completion_key, dataset_key, model_key, template=None
            )
        print()

# Few-Shot Vanilla FT

In [None]:
# Few-shot vanilla fine-tuning: for each shot count and dataset, write a
# fine-tuning file from the few-shot train indices, register it on OpenAI,
# and start one fine-tune per base model.
datasets = ALL_DATASETS
for shots in [8, 32, 128]:
    print(" Generate {}shot Data ".format(shots).center(80, "#"))
    for dataset_key in datasets:
        dataset = load_dataset(dataset_key)
        indices = get_few_shot_train_indices(dataset_key, shots=shots)

        # Generate FT data (no template) for the selected indices.
        file_key = "{}_{}shot".format(dataset_key, shots)
        data = generate_finetune_data(
            dataset,
            dataset_key,
            template=None,
            file_key=file_key,
            indices=indices,
        )

        # Create the file on OpenAI only when no file id is known for this key.
        file_id = get_file_id(file_key)
        if not file_id:
            create_finetune_file(file_key)  # openai
        else:
            print("{:60s}{}".format(file_key, file_id))

        # Start a fine-tune per base model, skipping ones that already have an id.
        for base_model in ["ada", "babbage", "curie"]:
            model_key = "{}_{}".format(base_model, file_key)
            finetune_id = get_finetune_id(model_key)
            if not finetune_id:
                create_finetune(file_key, model_key=model_key, model=base_model, suffix=file_key[:40])
            else:
                print("{:60s}{}".format(model_key, finetune_id))

### Fetch Models

In [None]:
# Check every few-shot vanilla fine-tune and store the model id of each
# finished one. `total` counts all (shots, base model, dataset) combinations;
# `complete` counts those that have a model id.
total = 0
complete = 0

print("-" * 80)
for shots in [8, 32, 128]:
    for base_model_key in ["ada", "babbage", "curie"]:
        for dataset_key in datasets:
            total += 1
            file_key = "{}_{}shot".format(dataset_key, shots)
            model_key = "{}_{}".format(base_model_key, file_key)

            # A model id is already stored for this key.
            if get_model_id(model_key, strict=False):
                print("{:60s} exists".format(model_key))
                complete += 1
                continue

            # No fine-tune was ever registered under this key.
            finetune_id = get_finetune_id(model_key)
            if finetune_id is None:
                print("{:60s} no finetune found".format(model_key))
                continue

            response = openai.FineTune.retrieve(finetune_id)
            model_id = response["fine_tuned_model"]
            if model_id is None:
                # No model attached to the fine-tune yet: report its status.
                print("{:60s} {}".format(model_key, response["status"]))
                continue

            set_model_id(model_key, model_id)
            print("{:60s} fetched".format(model_key))
            print("    {:40s}".format(model_id))
            complete += 1

print("-" * 80)
print("{} of {} models are ready".format(complete, total))

### Inference

In [None]:
# Run inference with each few-shot vanilla fine-tuned model whose id is stored.
completion_key = "ft"
for shots in [8, 32, 128]:
    for dataset_key in datasets:
        for base_model_key in ["ada", "babbage", "curie"]:
            # Include the shot count in the banner: without it the output of
            # the three shot settings is indistinguishable in the log.
            print(" {}-{}-{}shot ".format(dataset_key, base_model_key, shots).center(80, "-"))
            file_key = "{}_{}shot".format(dataset_key, shots)
            model_key = "{}_{}".format(base_model_key, file_key)
            if get_model_id(model_key, strict=False) is not None:
                completion_data = infer_cot_completions(completion_key, dataset_key, model_key, template=None)
            else:
                print("Model not found")
            print()

## \* Diverse Reasoning for Confidence Sampling (Date)

In [None]:
# Fixed settings for this run; only the number of augmentations varies.
dataset_key = "date_understanding"
completion_key = "zs_cot_aug"
model_key = "text-davinci-002"

for augmentations in [1, 2, 4, 8, 16]:
    print(" Generate {}aug Data ".format(augmentations).center(80, "#"))
    train_indices, _ = get_train_test_indices(dataset_key)
    generate_cot_completions(
        completion_key,
        dataset_key,
        model_key,
        indices=train_indices,
        augmentations=augmentations,
        reasoning_temperature=0.7,
    )

    # Report train accuracy of the completions at this augmentation count.
    print(" Augmented Accuracy (Train) ".center(80, "#"))
    completion_data = load_completion_data(completion_key, dataset_key, model_key)
    train_indices, _ = get_train_test_indices(dataset_key)
    evaluation = evaluate_completions(
        completion_data,
        dataset_key,
        template=None,
        indices=train_indices,
        augmentations=augmentations,
    )
    accuracy = get_evaluation_metrics(evaluation)["accuracy"]
    print("{:60s}: {:f}".format(dataset_key, accuracy))

# Fine-tune-CoT (Diverse Reasoning)

In [None]:
# Fine-tune-CoT with diverse reasoning: for each augmentation count, generate
# teacher completions on the train split, report their train accuracy, then
# build/upload the fine-tuning file and start one fine-tune per base model.
#
# Only one dataset group is active at a time. The other groups were previously
# assigned and immediately overwritten (dead stores); they are kept commented
# out, matching the fetch cell for this section.
# datasets = ["multiarith", "svamp"]
# datasets = ["date_understanding", "last_letter_concatenation"]
datasets = ["single_eq", "addsub", "tracking_shuffled_objects", "coin_flip", "strategy_qa"]
for augmentations in [2, 1, 4, 8]:
    print(" Generate {}aug Data ".format(augmentations).center(80, "#"))
    for dataset_key in datasets:
        print(dataset_key)
        completion_key = "zs_cot_aug"
        model_key = "text-davinci-002"
        train_indices, _ = get_train_test_indices(dataset_key)
        generate_cot_completions(completion_key, dataset_key, model_key, indices=train_indices,
                                 augmentations=augmentations)

    print(" Augmented Accuracy (Train) ".center(80, "#"))
    for dataset_key in datasets:
        completion_data = load_completion_data("zs_cot_aug", dataset_key, "text-davinci-002")
        train_indices, _ = get_train_test_indices(dataset_key)
        evaluation = evaluate_completions(completion_data, dataset_key, template=None,
                                          indices=train_indices, augmentations=augmentations)
        accuracy = get_evaluation_metrics(evaluation)["accuracy"]
        print("{:60s}: {:f}".format(dataset_key, accuracy))


    print(" Generate File, Upload File, Run Fine-tune ".center(80, "#"))
    for dataset_key in datasets:
        # The teacher identifiers are spelled out literally here because
        # `model_key` is reassigned to student keys in the fine-tune loop below.
        completion_data = load_completion_data("zs_cot_aug", dataset_key, "text-davinci-002")
        train_indices, _ = get_train_test_indices(dataset_key)

        # Generate FT data
        template = "special"
        file_key = "zs_cot_{}_{}_{}aug".format(template, dataset_key, augmentations)
        _ = generate_finetune_data(completion_data, dataset_key, template=template, file_key=file_key,
                                   indices=train_indices, augmentations=augmentations)

        # Create file on OpenAI (skipped when a file id is already known)
        file_id = get_file_id(file_key)
        if file_id:
            print("{:60s}{}".format(file_key, file_id))
        else:
            create_finetune_file(file_key)  # openai

        # Create finetunes (skipped per model when a fine-tune id is already known)
        for base_model in ["ada", "babbage", "curie"]:
            model_key = "{}_{}".format(base_model, file_key)
            finetune_id = get_finetune_id(model_key)
            if finetune_id:
                print("{:60s}{}".format(model_key, finetune_id))
            else:
                create_finetune(file_key, model_key=model_key, model=base_model, suffix=file_key[:40])

In [None]:
# Same diverse-reasoning pipeline, run for svamp with 64 augmentations:
# generate teacher completions, report train accuracy, then build/upload the
# fine-tuning file and start one fine-tune per base model.
datasets = ["svamp"]
for augmentations in [64]:
    print(" Generate {}aug Data ".format(augmentations).center(80, "#"))
    for dataset_key in datasets:
        print(dataset_key)
        completion_key = "zs_cot_aug"
        model_key = "text-davinci-002"
        train_indices, _ = get_train_test_indices(dataset_key)
        generate_cot_completions(
            completion_key,
            dataset_key,
            model_key,
            indices=train_indices,
            augmentations=augmentations,
        )

    print(" Augmented Accuracy (Train) ".center(80, "#"))
    for dataset_key in datasets:
        completion_data = load_completion_data("zs_cot_aug", dataset_key, "text-davinci-002")
        train_indices, _ = get_train_test_indices(dataset_key)
        evaluation = evaluate_completions(
            completion_data,
            dataset_key,
            template=None,
            indices=train_indices,
            augmentations=augmentations,
        )
        accuracy = get_evaluation_metrics(evaluation)["accuracy"]
        print("{:60s}: {:f}".format(dataset_key, accuracy))

    print(" Generate File, Upload File, Run Fine-tune ".center(80, "#"))
    for dataset_key in datasets:
        completion_data = load_completion_data("zs_cot_aug", dataset_key, "text-davinci-002")
        train_indices, _ = get_train_test_indices(dataset_key)

        # Write the fine-tuning file using the "special" template.
        template = "special"
        file_key = "zs_cot_{}_{}_{}aug".format(template, dataset_key, augmentations)
        _ = generate_finetune_data(
            completion_data,
            dataset_key,
            template=template,
            file_key=file_key,
            indices=train_indices,
            augmentations=augmentations,
        )

        # Create the file on OpenAI only when no file id is known for this key.
        file_id = get_file_id(file_key)
        if not file_id:
            create_finetune_file(file_key)  # openai
        else:
            print("{:60s}{}".format(file_key, file_id))

        # Start a fine-tune per base model, skipping ones that already have an id.
        for base_model in ["ada", "babbage", "curie"]:
            model_key = "{}_{}".format(base_model, file_key)
            finetune_id = get_finetune_id(model_key)
            if not finetune_id:
                create_finetune(file_key, model_key=model_key, model=base_model, suffix=file_key[:40])
            else:
                print("{:60s}{}".format(model_key, finetune_id))

### Fetch Models

In [None]:
# Check every diverse-reasoning fine-tune for the active dataset group and
# store the model id of each finished one.
total = 0
complete = 0

# datasets = ["multiarith", "svamp"]
# datasets = ["date_understanding", "last_letter_concatenation"]
datasets = ["single_eq", "addsub", "tracking_shuffled_objects", "coin_flip", "strategy_qa"]
for augmentations in [1, 2, 4, 8]:
    for base_model_key in ["ada", "babbage", "curie"]:
        for dataset_key in datasets:
            total += 1
            template = "special"
            file_key = "zs_cot_{}_{}_{}aug".format(template, dataset_key, augmentations)
            model_key = "{}_{}".format(base_model_key, file_key)

            # A model id is already stored for this key.
            if get_model_id(model_key, strict=False):
                print("{:60s} exists".format(model_key))
                complete += 1
                continue

            # No fine-tune was ever registered under this key.
            finetune_id = get_finetune_id(model_key)
            if finetune_id is None:
                print("{:60s} no finetune found".format(model_key))
                continue

            response = openai.FineTune.retrieve(finetune_id)
            model_id = response["fine_tuned_model"]
            if model_id is None:
                # No model attached to the fine-tune yet: report its status.
                print("{:60s} {}".format(model_key, response["status"]))
                continue

            set_model_id(model_key, model_id)
            print("{:60s} fetched".format(model_key))
            print("    {:40s}".format(model_id))
            complete += 1

print("{} of {} models are ready".format(complete, total))

### Inference

In [None]:
# Run inference with each diverse-reasoning fine-tuned model whose id is stored.
# datasets = ["multiarith", "svamp"]
datasets = ["date_understanding", "last_letter_concatenation"]
template = "special"
completion_key = "finetune_cot"
for base_model_key in ["ada", "babbage", "curie"]:
#     for augmentations in [1, 2, 4, 8, 16, 32, 64]:
    for augmentations in [1, 2, 4, 8]:
        for dataset_key in datasets:
            file_key = "zs_cot_{}_{}_{}aug".format(template, dataset_key, augmentations)
            banner = " {} ".format(file_key)
            print(banner.center(80, "#"))

            model_key = "{}_{}".format(base_model_key, file_key)
            if get_model_id(model_key, strict=False) is None:
                print("Model not found")
            else:
                _ = infer_cot_completions(completion_key, dataset_key, model_key, template=template)
            print()

### Long Inference

In [None]:
# Long inference (max_tokens=1024) with the 8-augmentation fine-tuned models.
#
# Only one dataset group is active at a time. The second group was previously
# assigned and immediately overwritten (a dead store); it is now commented out
# like the first.
# datasets = ["multiarith", "svamp"]
# datasets = ["date_understanding", "last_letter_concatenation"]
datasets = ["single_eq", "addsub", "tracking_shuffled_objects", "coin_flip", "strategy_qa"]
template = "special"
completion_key  = "finetune_cot_long"
for base_model_key in ["ada", "babbage", "curie"]:
#     for augmentations in [1, 2, 4, 8, 16, 32, 64]:
    for augmentations in [8]:
        for dataset_key in datasets:
            file_key = "zs_cot_{}_{}_{}aug".format(template, dataset_key, augmentations)
            print(" {} ".format(file_key).center(80, "#"))
            model_key = "{}_{}".format(base_model_key, file_key)
            # Only models whose id has been stored can be queried.
            if get_model_id(model_key, strict=False) is not None:
                _ = infer_cot_completions(completion_key, dataset_key, model_key,
                                          template=template, max_tokens=1024)
            else:
                print("Model not found")
            print()

# Few-shot Fine-tune-CoT (Diverse Reasoning)

In [None]:
# Few-shot Fine-tune-CoT with diverse reasoning: for every (shots,
# augmentations) pair, generate teacher completions on the few-shot indices,
# report their accuracy, then build/upload the fine-tuning file and start one
# fine-tune per base model.
datasets = ["multiarith", "svamp"]
for shots in [8, 32, 128]:
    for augmentations in [1, 2, 4, 8, 16, 32, 64]:
        print(" Generate {}shot {}aug Data ".format(shots, augmentations).center(80, "#"))
        for dataset_key in datasets:
            print(dataset_key)
            completion_key = "zs_cot_aug"
            model_key = "text-davinci-002"
            indices = get_few_shot_train_indices(dataset_key, shots=shots)
            generate_cot_completions(
                completion_key,
                dataset_key,
                model_key,
                indices=indices,
                augmentations=augmentations,
            )

        print(" Augmented Accuracy (Train) ".center(80, "#"))
        for dataset_key in datasets:
            completion_data = load_completion_data("zs_cot_aug", dataset_key, "text-davinci-002")
            indices = get_few_shot_train_indices(dataset_key, shots=shots)
            evaluation = evaluate_completions(
                completion_data,
                dataset_key,
                template=None,
                indices=indices,
                augmentations=augmentations,
            )
            accuracy = get_evaluation_metrics(evaluation)["accuracy"]
            print("{:60s}: {:f}".format(dataset_key, accuracy))

        print(" Generate File, Upload File, Run Fine-tune ".center(80, "#"))
        for dataset_key in datasets:
            completion_data = load_completion_data("zs_cot_aug", dataset_key, "text-davinci-002")
            indices = get_few_shot_train_indices(dataset_key, shots=shots)

            # Write the fine-tuning file using the "special" template.
            template = "special"
            file_key = "zs_cot_{}_{}_{}shot_{}aug".format(template, dataset_key, shots, augmentations)
            _ = generate_finetune_data(
                completion_data,
                dataset_key,
                template=template,
                file_key=file_key,
                indices=indices,
                augmentations=augmentations,
            )

            # Create the file on OpenAI only when no file id is known for this key.
            file_id = get_file_id(file_key)
            if not file_id:
                create_finetune_file(file_key)  # openai
            else:
                print("{:60s}{}".format(file_key, file_id))

            # Start a fine-tune per base model, skipping ones that already have an id.
            for base_model in ["ada", "babbage", "curie"]:
                model_key = "{}_{}".format(base_model, file_key)
                finetune_id = get_finetune_id(model_key)
                if not finetune_id:
                    create_finetune(file_key, model_key=model_key, model=base_model, suffix=file_key[:40])
                else:
                    print("{:60s}{}".format(model_key, finetune_id))

In [None]:
# Check every few-shot diverse-reasoning fine-tune and store the model id of
# each finished one.
total = 0
complete = 0

datasets = ["multiarith", "svamp"]
for shots in [8, 32, 128]:
    for augmentations in [1, 2, 4, 8, 16, 32, 64]:
        for base_model_key in ["ada", "babbage", "curie"]:
            for dataset_key in datasets:
                total += 1
                template = "special"
                file_key = "zs_cot_{}_{}_{}shot_{}aug".format(template, dataset_key, shots, augmentations)
                model_key = "{}_{}".format(base_model_key, file_key)

                # A model id is already stored for this key.
                if get_model_id(model_key, strict=False):
                    print("{:60s} exists".format(model_key))
                    complete += 1
                    continue

                # No fine-tune was ever registered under this key.
                finetune_id = get_finetune_id(model_key)
                if finetune_id is None:
                    print("{:60s} no finetune found".format(model_key))
                    continue

                response = openai.FineTune.retrieve(finetune_id)
                model_id = response["fine_tuned_model"]
                if model_id is None:
                    # No model attached to the fine-tune yet: report its status.
                    print("{:60s} {}".format(model_key, response["status"]))
                    continue

                set_model_id(model_key, model_id)
                print("{:60s} fetched".format(model_key))
                print("    {:40s}".format(model_id))
                complete += 1

print("{} of {} models are ready".format(complete, total))

### Inference

In [None]:
# Run inference with each few-shot diverse-reasoning model whose id is stored.
summary = []
datasets = ["multiarith", "svamp"]
template = "special"
completion_key = "finetune_cot"
for base_model_key in ["ada", "babbage", "curie"]:
    for shots in [8, 32, 128]:
        for augmentations in [1, 2, 4, 8, 16, 32, 64]:
            for dataset_key in datasets:
                file_key = "zs_cot_{}_{}_{}shot_{}aug".format(template, dataset_key, shots, augmentations)
                banner = " {} ".format(file_key)
                print(banner.center(80, "#"))

                model_key = "{}_{}".format(base_model_key, file_key)
                if get_model_id(model_key, strict=False) is None:
                    print("Model not found")
                else:
                    _ = infer_cot_completions(completion_key, dataset_key, model_key, template=template)
                print()

### Long Inference

In [None]:
# Long inference (max_tokens=1024) with each few-shot diverse-reasoning model
# whose id is stored.
summary = []
datasets = ["multiarith", "svamp"]
template = "special"
completion_key = "finetune_cot_long"
for base_model_key in ["ada", "babbage", "curie"]:
    for shots in [8, 32, 128]:
        for augmentations in [1, 2, 4, 8, 16, 32, 64]:
            for dataset_key in datasets:
                file_key = "zs_cot_{}_{}_{}shot_{}aug".format(template, dataset_key, shots, augmentations)
                banner = " {} ".format(file_key)
                print(banner.center(80, "#"))

                model_key = "{}_{}".format(base_model_key, file_key)
                if get_model_id(model_key, strict=False) is None:
                    print("Model not found")
                else:
                    _ = infer_cot_completions(
                        completion_key,
                        dataset_key,
                        model_key,
                        template=template,
                        max_tokens=1024,
                    )
                print()

# Reasoning Length Analysis

In [None]:
# Datasets covered by the reasoning-length analysis.
long_datasets = ["aqua", "commonsense_qa", "strategy_qa", "gsm8k", "svamp"]
# Subset whose long / capped completions are printed for manual inspection.
long_datasets_go = ["strategy_qa", "gsm8k", "svamp"]

## Test Generation

Generate test answers with full-length reasoning (up to 2048 tokens) for analysis.

In [None]:
# Generate long-form (max_tokens=2048) teacher completions on the test split
# of every dataset in the reasoning-length analysis.
completion_key = "zs_cot_long"
model_key = "text-davinci-002"
# Iterate `long_datasets`, not `datasets`: the latter is whatever list the
# previous section left behind, while the evaluation cells below load
# "zs_cot_long" completions for every entry of `long_datasets`.
for dataset_key in long_datasets:
    train, test = get_train_test_indices(dataset_key)
    generate_cot_completions(completion_key, dataset_key, model_key, indices=test,
                             max_tokens=2048, request_batch_size=5)

In [None]:
# Sanity check: print the train-split size of gsm8k, then of multiarith.
train_indices, test_indices = get_train_test_indices("gsm8k")
gsm8k_train_size = len(train_indices)
print(gsm8k_train_size)

train_indices, test_indices = get_train_test_indices("multiarith")
multiarith_train_size = len(train_indices)
print(multiarith_train_size)

### Test Evaluation

In [None]:
# Evaluate the long-form teacher completions on each dataset's test split,
# keeping both the per-sample evaluation and the aggregate metrics.
metrics_by_dataset = {}
evaluation_by_dataset = {}
for dataset_key in long_datasets:
    completion_data = load_completion_data("zs_cot_long", dataset_key, "text-davinci-002")
    train_indices, test_indices = get_train_test_indices(dataset_key)
    evaluation = evaluate_completions(
        completion_data,
        dataset_key,
        template=None,
        indices=test_indices,
    )
    evaluation_by_dataset[dataset_key] = evaluation
    metrics_by_dataset[dataset_key] = get_evaluation_metrics(evaluation)

# Show the transposed metrics table with floats formatted to three decimals.
with pd.option_context('display.float_format', '{:,.3f}'.format):
    metrics_table = pd.DataFrame(metrics_by_dataset).T
    display(metrics_table)

In [None]:
from data.tokens import get_token_count

# Token length of the reasoning for every test sample, keyed by dataset and
# then by sample index.
#
# The completion set is pinned explicitly so this cell does not depend on
# whatever `completion_key` / `model_key` an earlier cell left in the kernel.
completion_key = "zs_cot_long"
model_key = "text-davinci-002"

lengths_by_dataset = defaultdict(dict)
for dataset_key in long_datasets:
    print(" {} ".format(dataset_key).center(80, "#"))
    train_indices, test_indices = get_train_test_indices(dataset_key)
    samples = load_completion_data(completion_key, dataset_key, model_key)
    for i in test_indices:
        # Only entry 0 of each sample is measured — presumably the single
        # completion stored per sample; TODO confirm.
        s = samples[i][0]
        lengths_by_dataset[dataset_key][i] = get_token_count(s["reasoning_completion"])

In [None]:
# Bin test samples by reasoning token length and compute accuracy per bin.
# Consecutive thresholds form half-open bins (t1, t2]; the final bin
# (2047, 2048] therefore holds only samples of exactly 2048 tokens.
thresholds = [0, 64, 128, 256, 384, 512, 768, 1024, 2047, 2048]
accuracy_by_threshold_by_dataset = dict()
for dataset in long_datasets:
    lengths = lengths_by_dataset[dataset]
    evaluation_df = evaluation_by_dataset[dataset].set_index("sample_index")
    length_series = pd.Series(lengths, name="length")

    # Align correctness and length on the sample index; keep only those two.
    df = pd.concat([evaluation_df, length_series], axis=1)
    df = df[["correct", "length"]]

    # NOTE(review): the "accurcy" column name is a typo for "accuracy"; it is
    # kept as-is because it is part of the table this cell produces.
    columns = {"t1": [], "t2": [], "count": [], "correct": [], "accurcy": []}
    for t1, t2 in zip(thresholds, thresholds[1:]):
        samples = df[(df["length"] > t1) & (df["length"] <= t2)]
        n_correct = samples["correct"].sum()
        n_samples = samples["correct"].count()
        accuracy = n_correct / n_samples
        columns["t1"].append(t1)
        columns["t2"].append(t2)
        columns["count"].append(n_samples)
        columns["correct"].append(n_correct)
        columns["accurcy"].append(accuracy)

    accuracy_by_threshold = pd.DataFrame(columns)
    accuracy_by_threshold_by_dataset[dataset] = accuracy_by_threshold

In [None]:
# Accuracy per reasoning-length bin for strategy_qa.
accuracy_by_threshold_by_dataset["strategy_qa"]

In [None]:
# Accuracy per reasoning-length bin for commonsense_qa.
accuracy_by_threshold_by_dataset["commonsense_qa"]

In [None]:
# Accuracy per reasoning-length bin for gsm8k.
accuracy_by_threshold_by_dataset["gsm8k"]

In [None]:
# Accuracy per reasoning-length bin for svamp.
accuracy_by_threshold_by_dataset["svamp"]

In [None]:
# Accuracy per reasoning-length bin for aqua.
accuracy_by_threshold_by_dataset["aqua"]

In [None]:
# Split sample indices by reasoning length:
#   exactly 2048 tokens        -> inf_reason_indices
#   more than 512 (not 2048)   -> long_reason_indices
# NOTE(review): 2048 matches the max_tokens used for the long test generation,
# so these are presumably completions that hit the cap — confirm.
long_reason_indices = defaultdict(list)
inf_reason_indices = defaultdict(list)

for dataset in long_datasets_go:
    lengths = lengths_by_dataset[dataset]
    for i, length in lengths.items():
        if length == 2048:
            bucket = inf_reason_indices
        elif length > 512:
            bucket = long_reason_indices
        else:
            continue
        bucket[dataset].append(i)

In [None]:
# Print every sample listed in inf_reason_indices (2048-token reasoning):
# question, reasoning, answer and token count.
for dataset in long_datasets_go:
    print(dataset.upper().center(80, "#"))
    completion_key = "zs_cot_long"
    model_key = "text-davinci-002"
    samples = load_completion_data(completion_key, dataset, model_key)
    for i in inf_reason_indices[dataset]:
        s = samples[i]
        first = s[0]  # only entry 0 of the sample is shown
        divider = "-" * 80

        print()
        print("#" * 80)
        print("SAMPLE #{}".format(first["sample_index"]))
        print(first["question"])
        print(divider)
        print(first["reasoning_completion"])
        print(divider)
        print("Answer", first["answer"])
        print(divider)
        print("Tokens", get_token_count(first["reasoning_completion"]))
        print(divider)

In [None]:
# Print every sample listed in long_reason_indices (reasoning longer than 512
# tokens but not 2048): question, reasoning, answer and token count.
for dataset in long_datasets_go:
    print(dataset.upper().center(80, "#"))
    completion_key = "zs_cot_long"
    model_key = "text-davinci-002"
    samples = load_completion_data(completion_key, dataset, model_key)
    for i in long_reason_indices[dataset]:
        s = samples[i]
        first = s[0]  # only entry 0 of the sample is shown
        divider = "-" * 80

        print()
        print("#" * 80)
        print("SAMPLE #{}".format(first["sample_index"]))
        print(first["question"])
        print(divider)
        print(first["reasoning_completion"])
        print(divider)
        print("Answer", first["answer"])
        print(divider)
        print("Tokens", get_token_count(first["reasoning_completion"]))
        print(divider)

In [None]:
from data.tokens import get_token_count

# For each dataset, print the first completion whose reasoning is longer than
# 512 tokens, then move on to the next dataset.
# NOTE(review): `completion_key` and `model_key` are not set in this cell; it
# uses whatever values earlier cells left in the kernel.
# NOTE(review): this iterates `samples` directly, whereas other cells index it
# by sample index — confirm the container yields per-sample entries here.
for dataset_key in long_datasets:
    hash_rule = "#" * 80
    dash_rule = "-" * 80
    print(hash_rule)
    print(dataset_key)
    print(hash_rule)
    samples = load_completion_data(completion_key, dataset_key, model_key)
    for s in samples:
        if get_token_count(s[0]["reasoning_completion"]) > 512:
            print(s[0]["question"])
            print(dash_rule)
            print(s[0]["reasoning_completion"])
            print(dash_rule)
            print("Answer", s[0]["answer"])
            print(dash_rule)
            print("Tokens", get_token_count(s[0]["reasoning_completion"]))
            print(dash_rule)
            break

## Train Generation

Note: train answers are generated with truncated reasoning (512 tokens) for fine-tuning, whereas test answers were generated with full-length reasoning for analysis.

In [None]:
# Datasets for which long-form train completions are generated and fine-tuned.
long_datasets = ["aqua", "commonsense_qa", "strategy_qa", "gsm8k", "svamp"]
# long_datasets = "strategy_qa,gsm8k,svamp".split(",")

In [None]:
# Generate long-form teacher completions on the train split of each dataset.
# NOTE(review): max_tokens_used=512 presumably truncates the reasoning that is
# kept for fine-tuning, per the note above this cell — confirm in the helper.
completion_key = "zs_cot_long"
model_key = "text-davinci-002"
for dataset_key in long_datasets:
    banner = dataset_key.upper().center(80, "=")
    print(banner)
    train, test = get_train_test_indices(dataset_key)
    generate_cot_completions(
        completion_key,
        dataset_key,
        model_key,
        indices=train,
        max_tokens=2048,
        max_tokens_used=512,
    )

# Curate and Fine-Tune

In [None]:
# Build fine-tuning files from the long-form teacher completions and start one
# fine-tune per base model.
#
# File upload and fine-tune creation are guarded by get_file_id /
# get_finetune_id, as in the other fine-tuning cells of this notebook, so
# re-running the cell does not upload duplicate files or launch duplicate
# fine-tunes.
for dataset_key in long_datasets:
    dataset = load_dataset(dataset_key)
    train_indices, test_indices = get_train_test_indices(dataset_key)

    # Get train completions
    completion_data = load_completion_data("zs_cot_long", dataset_key, "text-davinci-002")
    train_completions = {i: completion_data[i] for i in train_indices}
    test_completions = {i: completion_data[i] for i in test_indices}

    # Generate FT data
    template = "special"
    file_key = "zs_cot_long_{}_{}_train".format(template, dataset_key)
    generate_finetune_data(completion_data, dataset_key, template=template, file_key=file_key,
                          indices=train_indices)

    # Create file on OpenAI (skipped when a file id is already known)
    file_id = get_file_id(file_key)
    if file_id:
        print("{:60s}{}".format(file_key, file_id))
    else:
        create_finetune_file(file_key)  # openai

    # Create finetunes (skipped per model when a fine-tune id is already known)
    for base_model in ["ada", "babbage", "curie"]:
        model_key = "{}_{}".format(base_model, file_key)
        finetune_id = get_finetune_id(model_key)
        if finetune_id:
            print("{:60s}{}".format(model_key, finetune_id))
        else:
            create_finetune(file_key, model_key=model_key, model=base_model, suffix=file_key[:40])