In [9]:
import re

import numpy as np
import pandas as pd

# Fine-tuning runs to compare. Every run name shares the same base-model
# prefix and "-ft" suffix, so only the distinguishing part is spelled out.
model_list = [
    f'debug-mpt-7b-base-{variant}-ft'
    for variant in (
        'gsm8k',
        'metamathqa',
        'metamathqa-bs48',
        'metamathqa-fixed-packing-ratio',
    )
]

class bcolors:
    """ANSI escape sequences for colouring/styling text printed to a terminal.

    Wrap text as ``f"{bcolors.OKGREEN}text{bcolors.ENDC}"``: the leading code
    switches the style on and ``ENDC`` switches it off again.
    """
    HEADER = '\033[95m'     # bright magenta
    OKBLUE = '\033[94m'     # bright blue
    OKCYAN = '\033[96m'     # bright cyan
    OKGREEN = '\033[92m'    # bright green; used for correct answers
    WARNING = '\033[93m'    # bright yellow
    FAIL = '\033[91m'       # bright red; used for wrong generations
    ENDC = '\033[0m'        # reset: ends any colour/style started earlier
    BOLD = '\033[1m'        # bold text
    UNDERLINE = '\033[4m'   # underlined text

# Few-shot count; it is part of the generations file name
# ("<model>_<num_few_shot>_shot_gsm8k_generations.csv") that gets loaded.
num_few_shot = 8

def pretty_print(df, split='train', idx=None):
    """Print one example's prompt, reference answer and generation, colour-coded.

    The example counts as correct when the final reference answer (the text
    after the last ``####`` in the ``answer`` column) appears anywhere in the
    generation. This is a loose substring check, so it can report false
    positives (e.g. the same number appearing in an intermediate step).

    Args:
        df: DataFrame with ``prompt``, ``answer`` and ``model_generation`` columns.
        split: Label of the split being shown; only used in the printed header.
        idx: Positional row index to show. A random row is picked when ``None``.

    Returns:
        bool: ``True`` if the final reference answer was found in the generation.

    Raises:
        ValueError: If ``idx`` is ``None`` and ``df`` has no rows to pick from.
    """
    if idx is None:
        if len(df) == 0:
            raise ValueError("Cannot pick a random row from an empty DataFrame.")
        print("Index not passed, picking a random row.")
        idx = np.random.randint(0, len(df))
    print(f"Showing {split} example at index {idx}")
    row = df.iloc[idx]
    print(f"Prompt:\n{row['prompt']}\n\n")
    print(f"Correct Answer:\n{bcolors.OKGREEN}{row['answer']}{bcolors.ENDC}\n\n")

    actual_answer = row['answer'].split('####')[-1].strip()
    # pandas reads an empty CSV cell back as NaN (a float), which would make
    # the `in` test below raise TypeError; treat it as an empty generation.
    generation = row['model_generation']
    generation = '' if pd.isna(generation) else str(generation)

    # An empty reference answer is a substring of every string, so it must not
    # count as a match (and would make `.replace` decorate every character).
    is_correct = bool(actual_answer) and actual_answer in generation
    if is_correct:
        highlighted = f"{bcolors.UNDERLINE}{bcolors.OKGREEN}{actual_answer}{bcolors.ENDC}"
        pretty_generation = generation.replace(actual_answer, highlighted)
        print(f"Generated Answer:\n{pretty_generation}\n\n")
        print(f"{bcolors.OKGREEN}It actually got it right!{bcolors.ENDC}")
    else:
        # print generated answers in red
        print(f"Generated Answer:\n{bcolors.FAIL}{generation}{bcolors.ENDC}\n\n")
        print(f"{bcolors.FAIL}Boo. Got it wrong.{bcolors.ENDC}")
    return is_correct

In [10]:
# Which entry of model_list to inspect, and the few-shot count of the
# generations file to load for it.
model_idx = 3
num_few_shot = 8

def get_model_generations(model_idx=0, num_few_shot=0, data_dir='/mnt/workdisk/kartik/llm-foundry'):
    """Load the saved GSM8K generations CSV for one model.

    The file read is
    ``<data_dir>/<model name>_<num_few_shot>_shot_gsm8k_generations.csv``.

    Args:
        model_idx: Index into ``model_list`` of the model to load.
        num_few_shot: Few-shot count that is part of the file name.
        data_dir: Directory holding the generations CSVs. Defaults to the
            location that used to be hard-coded, so existing calls read the
            same file; pass another directory to run on a different machine.

    Returns:
        pandas.DataFrame: The CSV contents as parsed by ``pd.read_csv``.
    """
    model_name = model_list[model_idx]
    filename = f'{data_dir}/{model_name}_{num_few_shot}_shot_gsm8k_generations.csv'
    print(f"Inspecting Model {model_name} with {num_few_shot} shot generations.")
    return pd.read_csv(filename)

# Load the generations for the model / few-shot setting selected at the top of this cell.
df = get_model_generations(model_idx=model_idx, num_few_shot=num_few_shot)

Inspecting Model debug-mpt-7b-base-metamathqa-fixed-packing-ratio-ft with 8 shot generations.


#### I'm fairly convinced that the MetaMathQA model is the best one, but it doesn't know when to stop generating after giving its answer.

In [17]:
def get_accuracy_stats(df, ramble_threshold=30):
    """Print how often the generations contain the reference final answer.

    Each row falls into exactly one of three buckets:

    * correct: the generation contains ``#### <answer>`` as a complete
      number (not merely as the prefix of a longer one such as ``#### 123``
      for the answer ``12``);
    * somewhat correct: the answer text appears somewhere in the generation
      (looser check, may be a false positive);
    * wrong: neither of the above.

    A correct row additionally counts as a "ramble" when more than
    ``ramble_threshold`` characters follow the last ``#### <answer>``.

    Args:
        df: DataFrame with ``answer`` and ``model_generation`` columns. The
            final reference answer is the text after the last ``####`` in
            ``answer``.
        ramble_threshold: Number of trailing characters tolerated after the
            final answer before the generation is counted as rambling.

    Returns:
        None. The four counts are printed.
    """
    # check metamath accuracy
    num_correct = 0
    num_maybe_correct = 0
    num_rambles = 0
    num_wrong = 0
    for answer, generation in zip(df['answer'], df['model_generation']):
        actual_answer = answer.split('####')[-1].strip()
        # pandas reads an empty CSV cell back as NaN (a float), which would
        # make the substring checks raise TypeError; treat it as empty text.
        generation = '' if pd.isna(generation) else str(generation)

        exact_matches = []
        if actual_answer:  # an empty answer would trivially match everything
            # The lookahead rejects a match that is only the prefix of a
            # longer number ("12" inside "123", "12.5" or "12,000") while
            # still accepting trailing punctuation such as "12." or "12,".
            pattern = rf'#### {re.escape(actual_answer)}(?!\d|[.,]\d)'
            exact_matches = list(re.finditer(pattern, generation))

        if exact_matches:
            num_correct += 1
            trailing_text = generation[exact_matches[-1].end():]
            if len(trailing_text) > ramble_threshold:
                num_rambles += 1
        elif actual_answer and actual_answer in generation:
            # looser check. might be a false positive
            num_maybe_correct += 1
        else:
            num_wrong += 1
    print(f"Model gets {num_correct} correct out of {len(df)}.")
    print(f"Model rambles {num_rambles} times after getting the right answer.")
    print(f"Model gets {num_maybe_correct} somewhat correct out of {len(df)}.")
    print(f"Model gets {num_wrong} wrong out of {len(df)}.")

In [18]:
# Reload the 8-shot generations of model_list[3] so the evaluation cells below start from a known frame.
df = get_model_generations(model_idx=3, num_few_shot=8)

Inspecting Model debug-mpt-7b-base-metamathqa-fixed-packing-ratio-ft with 8 shot generations.


In [20]:
# Score only the rows of the loaded generations whose 'split' column is 'train'.
print(f"Evaluating {model_list[3]} on train set")
get_accuracy_stats(df[df['split'] == 'train'])

Evaluating debug-mpt-7b-base-metamathqa-fixed-packing-ratio-ft on train set
Model gets 61 correct out of 100.
Model rambles 52 times after getting the right answer.
Model gets 10 somewhat correct out of 100.
Model gets 29 wrong out of 100.


In [21]:
# Score only the rows of the loaded generations whose 'split' column is 'test'.
print(f"Evaluating {model_list[3]} on test set")
get_accuracy_stats(df[df['split'] == 'test'])

Evaluating debug-mpt-7b-base-metamathqa-fixed-packing-ratio-ft on test set
Model gets 36 correct out of 100.
Model rambles 33 times after getting the right answer.
Model gets 18 somewhat correct out of 100.
Model gets 46 wrong out of 100.


In [24]:
# Run the train/test evaluation for every model in model_list on its 8-shot generations.
# NOTE: the loop rebinds the notebook-level `model_idx` and `df`; once it
# finishes they refer to the last model in model_list.
for model_idx in range(len(model_list)):
    df = get_model_generations(model_idx=model_idx, num_few_shot=8)
    print("---------------------------------------------------------------")
    print(f"Evaluating {model_list[model_idx]} on train set")
    get_accuracy_stats(df[df['split'] == 'train'])

    print(f"Evaluating {model_list[model_idx]} on test set")
    get_accuracy_stats(df[df['split'] == 'test'])
    print("---------------------------------------------------------------\n\n")

Inspecting Model debug-mpt-7b-base-gsm8k-ft with 8 shot generations.
---------------------------------------------------------------
Evaluating debug-mpt-7b-base-gsm8k-ft on train set
Model gets 6 correct out of 100.
Model rambles 3 times after getting the right answer.
Model gets 24 somewhat correct out of 100.
Model gets 70 wrong out of 100.
Evaluating debug-mpt-7b-base-gsm8k-ft on test set
Model gets 15 correct out of 100.
Model rambles 7 times after getting the right answer.
Model gets 22 somewhat correct out of 100.
Model gets 63 wrong out of 100.
---------------------------------------------------------------


Inspecting Model debug-mpt-7b-base-metamathqa-ft with 8 shot generations.
---------------------------------------------------------------
Evaluating debug-mpt-7b-base-metamathqa-ft on train set
Model gets 42 correct out of 100.
Model rambles 0 times after getting the right answer.
Model gets 14 somewhat correct out of 100.
Model gets 44 wrong out of 100.
Evaluating debug-m

## Conclusion: The MetaMathQA model is actually the best model, despite the formatting. It just needs to learn to stop generating after producing the answer. Also, an incorrect packing ratio seems to be the reason it was rambling. Once that is changed to auto, changing the batch size/format affects accuracy but doesn't change the rambling behavior.