In [5]:
import pandas as pd
import numpy as np

# Model names under inspection. get_model_generations indexes into this list
# (by position) and uses the selected name as the prefix of the CSV filename.
model_list = [
    'mpt-7b-base-gsm8k-ft',
    'mpt-7b-chat-gsm8k-ft-hf',
    'mpt-7b-base-metamathqa-ft',
]

class bcolors:
    """ANSI escape sequences for colouring and decorating terminal text.

    Prefix a string with one of the codes and finish it with ``ENDC``, e.g.
    ``f"{bcolors.OKGREEN}ok{bcolors.ENDC}"``.
    """

    # Bright foreground colours.
    HEADER = '\x1b[95m'     # magenta
    OKBLUE = '\x1b[94m'     # blue
    OKCYAN = '\x1b[96m'     # cyan
    OKGREEN = '\x1b[92m'    # green
    WARNING = '\x1b[93m'    # yellow
    FAIL = '\x1b[91m'       # red

    # Attribute control.
    ENDC = '\x1b[0m'        # reset every colour/attribute
    BOLD = '\x1b[1m'
    UNDERLINE = '\x1b[4m'

num_few_shot = 0

def pretty_print(df, split='train', idx=None):
    """Print one example (prompt, reference answer, generation) with colours.

    Correctness is judged with a deliberately loose check: the text after the
    last ``####`` in the reference answer only has to occur *somewhere* in the
    generation. A generation that states the right number and then continues
    to a different final answer is therefore still reported as correct.

    Args:
        df: DataFrame with 'prompt', 'answer' and 'model_generation' columns.
            If it also has a 'split' column, that column restricts random
            sampling (see ``split``).
        split: Split label shown in the output. When ``idx`` is None and
            ``df`` has a 'split' column, the random row is drawn only from
            rows whose 'split' equals this value, so the label matches the
            row that is displayed.
        idx: Positional row index into ``df``. If None, a random row is used.

    Returns:
        bool: True if the reference final answer occurs in the generation.

    Raises:
        ValueError: If ``idx`` is None and there is no row to sample from.
    """
    has_split_column = 'split' in df.columns

    if idx is None:
        print("Index not passed, picking a random row.")
        if has_split_column:
            # Sample only among rows of the requested split; otherwise a
            # "test" request could silently display a train row.
            candidates = np.flatnonzero((df['split'] == split).to_numpy())
        else:
            candidates = np.arange(len(df))
        if len(candidates) == 0:
            raise ValueError(f"No rows to sample from for split {split!r}.")
        idx = int(np.random.choice(candidates))
    print(f"Showing {split} example at index {idx}")
    row = df.iloc[idx]

    if has_split_column and row['split'] != split:
        # An explicit idx is honoured as given, but flag a mismatching label.
        print(f"{bcolors.WARNING}Warning: row {idx} is labelled {row['split']!r} in the data, not {split!r}.{bcolors.ENDC}")

    # Empty CSV cells are read back by pandas as NaN (a float); normalise to
    # strings so the substring logic below cannot raise.
    answer = '' if pd.isna(row['answer']) else str(row['answer'])
    generation = '' if pd.isna(row['model_generation']) else str(row['model_generation'])

    print(f"Prompt:\n{row['prompt']}\n\n")
    print(f"Correct Answer:\n{bcolors.OKGREEN}{answer}{bcolors.ENDC}\n\n")

    actual_answer = answer.split('####')[-1].strip()
    # An empty reference answer would match every string; treat it as wrong.
    is_correct = bool(actual_answer) and actual_answer in generation

    if is_correct:
        # Underline every occurrence of the reference answer in green.
        pretty_generation = generation.replace(actual_answer, f"{bcolors.UNDERLINE}{bcolors.OKGREEN}{actual_answer}{bcolors.ENDC}")
        print(f"Generated Answer:\n{pretty_generation}\n\n")
        print(f"{bcolors.OKGREEN}It actually got it right!{bcolors.ENDC}")
    else:
        # Print the whole generation in red.
        print(f"Generated Answer:\n{bcolors.FAIL}{generation}{bcolors.ENDC}\n\n")
        print(f"{bcolors.FAIL}Boo. Got it wrong.{bcolors.ENDC}")
    return is_correct

In [11]:
model_idx = 0
num_few_shot = 0

def get_model_generations(model_idx=0, num_few_shot=0,
                          data_dir='/mnt/workdisk/kartik/llm-foundry'):
    """Load the saved GSM8K generations CSV for one model / few-shot setting.

    Args:
        model_idx: Position of the model name in the module-level
            ``model_list``.
        num_few_shot: Few-shot count of the run; it is part of the filename.
        data_dir: Directory containing the generation CSVs. Defaults to the
            location that used to be hard-coded, so existing calls behave the
            same; pass another directory to run the notebook elsewhere.

    Returns:
        pandas.DataFrame: Contents of
        ``<data_dir>/<model>_<num_few_shot>_shot_gsm8k_generations.csv``.

    Raises:
        IndexError: If ``model_idx`` is out of range for ``model_list``.
        FileNotFoundError: If the CSV does not exist (raised by pandas).
    """
    import os  # local import so this cell does not depend on the import cell

    model_name = model_list[model_idx]
    filename = os.path.join(data_dir, f'{model_name}_{num_few_shot}_shot_gsm8k_generations.csv')
    print(f"Inspecting Model {model_name} with {num_few_shot} shot generations.")
    return pd.read_csv(filename)

df = get_model_generations(model_idx=model_idx, num_few_shot=num_few_shot)

Inspecting Model mpt-7b-base-gsm8k-ft with 0 shot generations.


In [12]:
pretty_print(df, split='train', idx=0)

Showing train example at index 0
Prompt:
Q: Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?


Correct Answer:
[92mNatalia sold 48/2 = <<48/2=24>>24 clips in May.
Natalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.
#### 72[0m


Generated Answer:

Natalia sold 48 clips in April, and sold half as many clips in May, so she sold 48/2 = <<48/2=24>>24 clips in May.
She sold a total of 48+24 = <<48+24=[4m[92m72[0m>>[4m[92m72[0m clips in April and May.
Natalia sold a total of [4m[92m72[0m clips in April and May.
This is [4m[92m72[0m/2=<<[4m[92m72[0m/2=36>>36 clips.
Natalia sold a total of 36 clips altogether in April and May.


[92mIt actually got it right![0m


True

In [13]:
df = get_model_generations(model_idx=1, num_few_shot=8)
pretty_print(df, split='train', idx=0)

Inspecting Model mpt-7b-chat-gsm8k-ft-hf with 8 shot generations.
Showing train example at index 0
Prompt:
Q: Bob is tilling a plot of his garden. The plot is 110 feet wide by 120 feet long. His tiller digs a swath two feet wide, and he can till 1 foot of ground in about 2 seconds. How long will it take him to till this plot of land, in minutes?
A: If Bob goes along the side that's 120 feet long, he will till 110 / 2 = 55 rows.
Each of these rows are 120 feet long, so he will push the tiller a total of 120 * 55 = <<120*55=6600>>6,600 feet.
He tills 1 linear foot of ground every 2 seconds, so it will take him 2 * 6,600 = 13,200 seconds to till this plot
13,200 seconds is 13,2000 / 60 = <<13200/60=220>>220 minutes
#### 220
Q: Earl has $90; Fred has $48; Greg has $36. Earl owes Fred $28. Fred owes Greg $32. Greg owes Earl $40. When all debts are paid, how much will Greg and Earl have together in dollars?
A: Earl will have $90 - $28 = $<<90-28=62>>62 after paying off his debt to Fred.
Fred

True

In [14]:
pretty_print(df, split='train')

Index not passed, picking a random row.
Showing train example at index 110
Prompt:
Q: Bob is tilling a plot of his garden. The plot is 110 feet wide by 120 feet long. His tiller digs a swath two feet wide, and he can till 1 foot of ground in about 2 seconds. How long will it take him to till this plot of land, in minutes?
A: If Bob goes along the side that's 120 feet long, he will till 110 / 2 = 55 rows.
Each of these rows are 120 feet long, so he will push the tiller a total of 120 * 55 = <<120*55=6600>>6,600 feet.
He tills 1 linear foot of ground every 2 seconds, so it will take him 2 * 6,600 = 13,200 seconds to till this plot
13,200 seconds is 13,2000 / 60 = <<13200/60=220>>220 minutes
#### 220
Q: Earl has $90; Fred has $48; Greg has $36. Earl owes Fred $28. Fred owes Greg $32. Greg owes Earl $40. When all debts are paid, how much will Greg and Earl have together in dollars?
A: Earl will have $90 - $28 = $<<90-28=62>>62 after paying off his debt to Fred.
Fred will have $48 + $28 = $

False

In [15]:
pretty_print(df, split='train')

Index not passed, picking a random row.
Showing train example at index 159
Prompt:
Q: Bob is tilling a plot of his garden. The plot is 110 feet wide by 120 feet long. His tiller digs a swath two feet wide, and he can till 1 foot of ground in about 2 seconds. How long will it take him to till this plot of land, in minutes?
A: If Bob goes along the side that's 120 feet long, he will till 110 / 2 = 55 rows.
Each of these rows are 120 feet long, so he will push the tiller a total of 120 * 55 = <<120*55=6600>>6,600 feet.
He tills 1 linear foot of ground every 2 seconds, so it will take him 2 * 6,600 = 13,200 seconds to till this plot
13,200 seconds is 13,2000 / 60 = <<13200/60=220>>220 minutes
#### 220
Q: Earl has $90; Fred has $48; Greg has $36. Earl owes Fred $28. Fred owes Greg $32. Greg owes Earl $40. When all debts are paid, how much will Greg and Earl have together in dollars?
A: Earl will have $90 - $28 = $<<90-28=62>>62 after paying off his debt to Fred.
Fred will have $48 + $28 = $

False

In [16]:
pretty_print(df, split='train')

Index not passed, picking a random row.
Showing train example at index 122
Prompt:
Q: Bob is tilling a plot of his garden. The plot is 110 feet wide by 120 feet long. His tiller digs a swath two feet wide, and he can till 1 foot of ground in about 2 seconds. How long will it take him to till this plot of land, in minutes?
A: If Bob goes along the side that's 120 feet long, he will till 110 / 2 = 55 rows.
Each of these rows are 120 feet long, so he will push the tiller a total of 120 * 55 = <<120*55=6600>>6,600 feet.
He tills 1 linear foot of ground every 2 seconds, so it will take him 2 * 6,600 = 13,200 seconds to till this plot
13,200 seconds is 13,2000 / 60 = <<13200/60=220>>220 minutes
#### 220
Q: Earl has $90; Fred has $48; Greg has $36. Earl owes Fred $28. Fred owes Greg $32. Greg owes Earl $40. When all debts are paid, how much will Greg and Earl have together in dollars?
A: Earl will have $90 - $28 = $<<90-28=62>>62 after paying off his debt to Fred.
Fred will have $48 + $28 = $

True

#### NOTE: The chat model doesn't really seem to have a pattern in the kinds of errors it's making. Also, note that I'm inspecting outputs on the train set, i.e. examples that it should have memorized.
1. I see errors where it "keeps talking" even though it has output the correct answer.
2. I see errors where it gets the problem -> formulation step wrong.
3. Finally, I also see a few errors where it just gets the math wrong, although, surprisingly, this seems to be the rarest case.
(my takeaway from this is that we don't necessarily need to focus on giving it numbers)

In [18]:
df = get_model_generations(model_idx=2, num_few_shot=8)

Inspecting Model mpt-7b-base-metamathqa-ft with 8 shot generations.


In [19]:
pretty_print(df, split='train')

Index not passed, picking a random row.
Showing train example at index 27
Prompt:
Q: Bob is tilling a plot of his garden. The plot is 110 feet wide by 120 feet long. His tiller digs a swath two feet wide, and he can till 1 foot of ground in about 2 seconds. How long will it take him to till this plot of land, in minutes?
A: If Bob goes along the side that's 120 feet long, he will till 110 / 2 = 55 rows.
Each of these rows are 120 feet long, so he will push the tiller a total of 120 * 55 = <<120*55=6600>>6,600 feet.
He tills 1 linear foot of ground every 2 seconds, so it will take him 2 * 6,600 = 13,200 seconds to till this plot
13,200 seconds is 13,2000 / 60 = <<13200/60=220>>220 minutes
#### 220
Q: Earl has $90; Fred has $48; Greg has $36. Earl owes Fred $28. Fred owes Greg $32. Greg owes Earl $40. When all debts are paid, how much will Greg and Earl have together in dollars?
A: Earl will have $90 - $28 = $<<90-28=62>>62 after paying off his debt to Fred.
Fred will have $48 + $28 = $<

True

#### Oh wow. This is an interesting example. The model got the answer wrong because it kept talking. I also see that it has gotten used to the metamathqa format and is using that to decide when to emit EOS. So it went on to say "The answer is: true" and only then stopped.

In [21]:
pretty_print(df, split='train')

Index not passed, picking a random row.
Showing train example at index 60
Prompt:
Q: Bob is tilling a plot of his garden. The plot is 110 feet wide by 120 feet long. His tiller digs a swath two feet wide, and he can till 1 foot of ground in about 2 seconds. How long will it take him to till this plot of land, in minutes?
A: If Bob goes along the side that's 120 feet long, he will till 110 / 2 = 55 rows.
Each of these rows are 120 feet long, so he will push the tiller a total of 120 * 55 = <<120*55=6600>>6,600 feet.
He tills 1 linear foot of ground every 2 seconds, so it will take him 2 * 6,600 = 13,200 seconds to till this plot
13,200 seconds is 13,2000 / 60 = <<13200/60=220>>220 minutes
#### 220
Q: Earl has $90; Fred has $48; Greg has $36. Earl owes Fred $28. Fred owes Greg $32. Greg owes Earl $40. When all debts are paid, how much will Greg and Earl have together in dollars?
A: Earl will have $90 - $28 = $<<90-28=62>>62 after paying off his debt to Fred.
Fred will have $48 + $28 = $<

True

#### Woo hoo! I think I see a pattern. I think the model is having trouble generating EOS at the right time because of the metamathqa format.

In [23]:
pretty_print(df, split='test')

Index not passed, picking a random row.
Showing test example at index 145
Prompt:
Q: Bob is tilling a plot of his garden. The plot is 110 feet wide by 120 feet long. His tiller digs a swath two feet wide, and he can till 1 foot of ground in about 2 seconds. How long will it take him to till this plot of land, in minutes?
A: If Bob goes along the side that's 120 feet long, he will till 110 / 2 = 55 rows.
Each of these rows are 120 feet long, so he will push the tiller a total of 120 * 55 = <<120*55=6600>>6,600 feet.
He tills 1 linear foot of ground every 2 seconds, so it will take him 2 * 6,600 = 13,200 seconds to till this plot
13,200 seconds is 13,2000 / 60 = <<13200/60=220>>220 minutes
#### 220
Q: Earl has $90; Fred has $48; Greg has $36. Earl owes Fred $28. Fred owes Greg $32. Greg owes Earl $40. When all debts are paid, how much will Greg and Earl have together in dollars?
A: Earl will have $90 - $28 = $<<90-28=62>>62 after paying off his debt to Fred.
Fred will have $48 + $28 = $<

True

#### I'm fairly convinced that the metamathqa model is the best one, but it doesn't know when to shut up.

In [26]:
def get_accuracy_stats(df, ramble_threshold=10):
    """Print how often the generations contain the reference final answer.

    Each row falls into exactly one of three buckets:

    * correct: the generation contains ``#### <answer>`` with the answer as
      a complete number (``#### 72`` does not match ``#### 720`` or
      ``#### 72.5``).
    * somewhat correct: the answer text occurs somewhere in the generation.
      This is a loose substring check and may be a false positive.
    * wrong: neither of the above.

    A correct row additionally counts as a "ramble" when more than
    ``ramble_threshold`` characters follow the last ``#### <answer>`` marker.

    Args:
        df: DataFrame with 'answer' and 'model_generation' columns.
        ramble_threshold: Number of trailing characters tolerated after the
            final answer marker before the row is counted as rambling.

    Returns:
        None. The four summary lines are printed.
    """
    import re  # local import so this cell does not depend on the import cell

    num_correct = num_maybe_correct = num_rambles = num_wrong = 0

    for answer, generation in zip(df['answer'], df['model_generation']):
        # Empty CSV cells come back from pandas as NaN (a float); normalise
        # to strings so the text checks below cannot raise.
        answer = '' if pd.isna(answer) else str(answer)
        generation = '' if pd.isna(generation) else str(generation)

        actual_answer = answer.split('####')[-1].strip()
        if not actual_answer:
            # Nothing to compare against; an empty string would match anything.
            num_wrong += 1
            continue

        # The lookahead rejects a longer number that merely starts with the
        # answer (e.g. "720", "72,000", "72.5" for the answer "72").
        strict_marker = re.compile(r"#### " + re.escape(actual_answer) + r"(?![.,]?\d)")
        hits = list(strict_marker.finditer(generation))

        if hits:
            num_correct += 1
            trailing_text = generation[hits[-1].end():]
            if len(trailing_text) > ramble_threshold:
                num_rambles += 1
        elif actual_answer in generation:
            # looser check. might be a false positive
            num_maybe_correct += 1
        else:
            num_wrong += 1

    total = len(df)
    print(f"Model gets {num_correct} correct out of {total}.")
    print(f"Model rambles {num_rambles} times after getting the right answer.")
    print(f"Model gets {num_maybe_correct} somewhat correct out of {total}.")
    print(f"Model gets {num_wrong} wrong out of {total}.")

print("Evaluating mpt-7b-base-metamathqa-ft on train set")
get_accuracy_stats(df[df['split'] == 'train'])

Evaluating mpt-7b-base-metamathqa-ft on train set
Model gets 71 correct out of 100.
Model rambles 70 times after getting the right answer.
Model gets 10 somewhat correct out of 100.
Model gets 19 wrong out of 100.


In [27]:
print("Evaluating mpt-7b-base-metamathqa-ft on test set")
get_accuracy_stats(df[df['split'] == 'test'])

Evaluating mpt-7b-base-metamathqa-ft on test set
Model gets 44 correct out of 100.
Model rambles 43 times after getting the right answer.
Model gets 14 somewhat correct out of 100.
Model gets 42 wrong out of 100.


In [30]:
df = get_model_generations(model_idx=0, num_few_shot=8)

Inspecting Model mpt-7b-base-gsm8k-ft with 8 shot generations.


In [31]:
print(f"Evaluating {model_list[0]} on train set")
get_accuracy_stats(df[df['split'] == 'train'])

Evaluating mpt-7b-base-gsm8k-ft on train set
Model gets 14 correct out of 100.
Model rambles 0 times after getting the right answer.
Model gets 17 somewhat correct out of 100.
Model gets 69 wrong out of 100.


In [32]:
print(f"Evaluating {model_list[0]} on test set")
get_accuracy_stats(df[df['split'] == 'test'])

Evaluating mpt-7b-base-gsm8k-ft on test set
Model gets 18 correct out of 100.
Model rambles 0 times after getting the right answer.
Model gets 16 somewhat correct out of 100.
Model gets 66 wrong out of 100.


In [34]:
df = get_model_generations(model_idx=1, num_few_shot=8)
print(f"Evaluating {model_list[1]} on train set")
get_accuracy_stats(df[df['split'] == 'train'])

Inspecting Model mpt-7b-chat-gsm8k-ft-hf with 8 shot generations.
Evaluating mpt-7b-chat-gsm8k-ft-hf on train set
Model gets 15 correct out of 100.
Model rambles 0 times after getting the right answer.
Model gets 15 somewhat correct out of 100.
Model gets 70 wrong out of 100.


In [35]:
df = get_model_generations(model_idx=1, num_few_shot=8)
print(f"Evaluating {model_list[1]} on test set")
get_accuracy_stats(df[df['split'] == 'test'])

Inspecting Model mpt-7b-chat-gsm8k-ft-hf with 8 shot generations.
Evaluating mpt-7b-chat-gsm8k-ft-hf on test set
Model gets 14 correct out of 100.
Model rambles 0 times after getting the right answer.
Model gets 18 somewhat correct out of 100.
Model gets 68 wrong out of 100.


## Conclusion: The metamathQA model actually is the best model, despite the formatting. It just needs to learn how to stop talking after spitting out the answer. I think I can test this by asking eval to log generations.