In [40]:
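# Plan for this notebook:
# - make algebra problems (random integer arithmetic expressions)
# - solve them with Llama 3 8B Instruct
# - check chain-of-thought (CoT) performance
# - check no-CoT performance
# - tweak the task prompt
# - tweak the structure of the algebra problems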

import pandas as pd
import random
import transformers
import torch
import dotenv
import os

# Load environment variables via python-dotenv before anything reads them
# (presumably this is where HF_TOKEN comes from — confirm against the local .env).
dotenv.load_dotenv()

True

In [41]:
# Hugging Face model identifier used for every generation in this notebook.
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# Text-generation pipeline for `model_id`. The model is requested in bfloat16
# and device placement is delegated to `device_map="auto"`.
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
    token=os.getenv('HF_TOKEN')  # access token from the environment; None if HF_TOKEN is unset
)

# Token ids later passed as `eos_token_id` when generating: the tokenizer's
# EOS token plus the id of "<|eot_id|>" (presumably Llama 3's end-of-turn
# marker — confirm against the model card).
terminators = [
    pipeline.tokenizer.eos_token_id,
    pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

Loading checkpoint shards: 100%|██████████| 4/4 [00:02<00:00,  1.58it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [42]:
def make_problem(digits, format=None):
    """Build a random arithmetic expression together with its exact value.

    Args:
        digits: Number of decimal digits of every leaf operand; must be >= 1.
        format: Shape of the expression tree. ``None`` produces a single
            random ``digits``-digit integer. Otherwise it is indexed as
            ``(op, (left, right))`` where ``op`` is ``'+'``, ``'-'`` or ``'*'``
            and ``left`` / ``right`` are themselves formats.

    Returns:
        A ``(text, value)`` tuple: ``text`` is the expression as a string
        (every binary operation is wrapped in parentheses) and ``value`` is
        the integer the expression evaluates to.

    Raises:
        ValueError: If ``digits`` is smaller than 1 or ``op`` is not one of
            the supported operators. Previously an unknown operator silently
            returned ``None``.
    """
    if digits < 1:
        raise ValueError(f"digits must be at least 1, got {digits!r}")

    if format is None:
        # Leaf: a uniformly random integer with exactly `digits` digits.
        n = random.randint(10**(digits-1), 10**digits-1)
        return str(n), n

    op = format[0]
    # Validate before recursing so a bad format fails without consuming
    # random numbers.
    if op not in ('+', '-', '*'):
        raise ValueError(f"unsupported operator {op!r}; expected '+', '-' or '*'")

    # Left operand is generated before the right one, as before, so a seeded
    # `random` yields the same problems.
    left_text, left_value = make_problem(digits, format[1][0])
    right_text, right_value = make_problem(digits, format[1][1])

    if op == '+':
        value = left_value + right_value
    elif op == '-':
        value = left_value - right_value
    else:
        value = left_value * right_value
    return f"({left_text}{op}{right_text})", value
    
def make_problem_set(digits, format, num_problems):
    """Generate `num_problems` random problems as a DataFrame.

    Args:
        digits: Digits per leaf operand, forwarded to `make_problem`.
        format: Expression format, forwarded to `make_problem`.
        num_problems: Number of rows to generate.

    Returns:
        A DataFrame with columns ``problem`` (expression text) and
        ``correct_solution`` (its value), one row per generated problem.
    """
    rows = []
    for _ in range(num_problems):
        rows.append(make_problem(digits, format))
    return pd.DataFrame(rows, columns=['problem', 'correct_solution'])

In [43]:
def solve_problem(problem, sys_prompt="What is the value of this expression?"):
    """Send one problem to the chat pipeline and return the reply text.

    Args:
        problem: Expression text, sent as the user message.
        sys_prompt: Instruction text, sent as the system message.

    Returns:
        The ``content`` field of the last message in the first generated
        conversation returned by the module-level `pipeline`.
    """
    chat = [
        dict(role="system", content=sys_prompt),
        dict(role="user", content=problem),
    ]

    # Sampled generation; generation stops at any id in `terminators`.
    generation_kwargs = dict(
        max_new_tokens=256,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
        pad_token_id=pipeline.tokenizer.eos_token_id,
    )

    conversation = pipeline(chat, **generation_kwargs)[0]['generated_text']
    reply = conversation[-1]
    return reply['content']

def solve_problem_set(problem_set, sys_prompt="What is the value of this expression?"):
    """Solve every problem in `problem_set` and attach the replies.

    Args:
        problem_set: DataFrame with a ``problem`` column of expression text.
        sys_prompt: System prompt forwarded to `solve_problem` for each row.

    Returns:
        A copy of `problem_set` with an added ``solution`` column holding the
        reply for each row, in row order. The input frame is not modified.
    """
    # Problems are solved one at a time, in row order.
    answers = [solve_problem(text, sys_prompt) for text in problem_set['problem']]
    solved = problem_set.copy()
    solved['solution'] = answers
    return solved

In [44]:
def _solution_states_number(expected, text):
    """Return True if `text` contains `expected` as a standalone number."""
    import re  # local import: this helper is the only user of `re`

    # Drop thousands separators so "8,446" is read as "8446". Only commas that
    # sit between digits and are followed by exactly three digits are removed.
    normalized = re.sub(r'(?<=\d),(?=\d{3}(?!\d))', '', text)
    # Require non-digit boundaries so 8446 does not match inside "18446",
    # "84460" or "0.8446". For negative values the leading '-' must not follow
    # a digit, so the binary minus in "5-15" is not read as "-15".
    pattern = r'(?<![\d.])' + re.escape(str(expected)) + r'(?!\d)'
    return re.search(pattern, normalized) is not None


def correct_rate(solution_set):
    """Grade each solution and return the frame with a ``correct`` column.

    Despite its name, this returns the graded DataFrame rather than a single
    rate; aggregate the ``correct`` column to obtain one.

    A row counts as correct when the reply text contains the expected value
    as a whole number (thousands separators are tolerated). The previous
    plain substring test also accepted longer numbers that merely contained
    the expected digits, and rejected comma-formatted answers.

    Args:
        solution_set: DataFrame with ``correct_solution`` (expected integer)
            and ``solution`` (reply text) columns.

    Returns:
        A copy of `solution_set` with an added boolean ``correct`` column.
        The input frame is not modified.
    """
    graded = solution_set.copy()
    # A list comprehension (rather than apply(axis=1)) also handles an empty
    # frame cleanly.
    graded['correct'] = [
        _solution_states_number(expected, text)
        for expected, text in zip(graded['correct_solution'], graded['solution'])
    ]
    return graded

In [45]:
# 100 four-digit addition problems: generate, solve with the model, then grade.
addition_4 = correct_rate(
    solve_problem_set(
        make_problem_set(4, ('+', (None, None)), 100)
    )
)
addition_4

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Unnamed: 0,problem,correct_solution,solution,correct
0,(6166+2280),8446,"To evaluate this expression, we need to follow...",True
1,(4495+7022),11517,"To evaluate this expression, we need to add 44...",True
2,(3221+3477),6698,"To evaluate this expression, we need to add 32...",True
3,(3331+3557),6888,"To evaluate this expression, we need to follow...",True
4,(7519+8978),16497,"To evaluate this expression, I'll follow the o...",True
...,...,...,...,...
95,(1974+5822),7796,"To evaluate this expression, we need to add 19...",True
96,(8036+5950),13986,"To evaluate this expression, we need to add 80...",True
97,(1714+1671),3385,"To evaluate this expression, we need to follow...",True
98,(8733+2807),11540,"To evaluate this expression, we need to add 87...",True


In [46]:
# Number of problems graded as correct.
int(addition_4['correct'].sum())

94