In [1]:
import torch;
from torch.nn import functional as F
from transformers import AutoModelForCausalLM, AutoTokenizer
import datasets

In [2]:
# Checkpoint location and target device used by every generation helper below.
# NOTE(review): both are hard-coded (absolute cluster path, CUDA only) -- adjust
# them here when running on another machine.
MODEL = "/scratch/bchk/aguha/models/llama3p1_8b_base"
DEVICE = "cuda"

# Left padding keeps every prompt flush against the generated tokens, which is
# what lets the batched helper slice completions off by prompt length.
tokenizer = AutoTokenizer.from_pretrained(MODEL, padding_side="left")
# Reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token

# Load the weights in bfloat16, then move the module to DEVICE.
model = AutoModelForCausalLM.from_pretrained(MODEL, torch_dtype=torch.bfloat16)
model = model.to(device=DEVICE)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [3]:
# Evaluation set: the "test" split of the "math_word_problems" configuration.
# Each record is read through its "question" and "answer" keys further down.
test_data = datasets.load_dataset(
    "nuprl/engineering-llm-systems",
    name="math_word_problems",
    split="test",
)

In [31]:
# Few-shot prompt shared by pal() and pal_batch(): one instruction line followed by
# worked examples, each pairing a word problem with a `def solve():` that returns the
# numeric answer. The callers append "\n\nQuestion: ...\nLet's solve this step by
# step!\nAnswer:\n" to this text, so the model is expected to continue with another
# `def solve():` body.
# NOTE(review): the literal contains no `{}` placeholders, so the `f` prefix has no
# effect. Some separator lines and the final line hold only four spaces rather than
# being empty; that whitespace is part of the text the model sees, so it is left
# untouched here -- confirm whether it is intentional before normalizing it.
pal_prompt = f"""Instruction: Solve each of the following math word problems by filling in function solve() with python code that solves the problem. Be sure to approach the problem step by step.

Examples:
Question: A train travels 120 miles in 3 hours. What is its average speed in miles per hour?
Let's solve this step by step!
Answer:
def solve():
    total_miles = 120
    total_hours = 3
    avg_speed = total_miles / total_hours
    return avg_speed

Question: A rectangle has a length of 8 cm and a width of 5 cm. What is the sum of its perimeter and area?
Let's solve this step by step!
Answer:
def solve():
    length = 8
    width = 5
    perimeter = 2 * (length + width)
    area = length * width
    return perimeter + area

Question: Larry is buying water bottles for the next two weeks. For the first week, Larry drinks 2 bottles of water a day. For the second week, Larry drinks 2 times the amount of water he drank the first week. If a crate of water bottles has 21 water bottles, how many crates of water bottles does Larry have to buy for two weeks?
Let's solve this step by step!
Answer:
def solve():
    bottles_per_day_week_one = 2
    total_bottles_week_one = bottles_per_day_week_one * 7
    total_bottles_week_two = total_bottles_week_one * 2
    total_bottles = total_bottles_week_one + total_bottles_week_two
    bottles_per_crate = 21
    crates_needed = total_bottles / bottles_per_crate
    return crates_needed

Question: Sally has 3 bags of marbles. The first bag has 12 marbles, the second has 8 marbles, and she has 5 fewer marbles in the third bag than the first bag. How many marbles does Sally have in total?
Let's solve this step by step!
Answer:
def solve():
    marbles_in_first_bag = 12
    marbles_in_second_bag = 8
    marbles_in_third_bag = marbles_in_first_bag - 5
    total_marbles = marbles_in_first_bag + marbles_in_second_bag + marbles_in_third_bag
    return total_marbles

Question: A recipe calls for 2 cups of flour to make 12 cookies. How many cups of flour are needed to make 30 cookies?
Let's solve this step by step!
Answer:
def solve():
    flour_per_cookie = 2 / 12
    total_flour = flour_per_cookie * 30
    return total_flour

Question: It takes 3 workers 6 hours to paint a house. If 9 workers paint at the same rate, how long will it take them to paint the same house?
Let's solve this step by step!
Answer:
def solve():
    total_worker_hours = 3 * 6
    num_workers = 9
    time_needed = total_worker_hours / num_workers
    return time_needed

Question: A restaurant has 45 tables. If 28 tables are occupied and each table seats 4 people, how many empty seats are there?
Let's solve this step by step!
Answer:
def solve():
    total_tables = 45
    seats_per_table = 4
    total_seats = total_tables * seats_per_table
    occupied_tables = 28
    occupied_seats = occupied_tables * seats_per_table
    empty_seats = total_seats - occupied_seats
    return empty_seats
    
Question: Alex runs 4 laps during each workout session, and he works out 4 times a week. Each lap is 70 meters long. How many total meters does Alex run in a week?
Let's solve this step by step!
Answer:
def solve():
    laps_per_session = 4
    sessions_per_week = 4
    laps_per_week = laps_per_session * sessions_per_week
    meters_per_lap = 70
    total_meters = meters_per_lap * laps_per_week
    return total_meters

Question: A password consists of 3 letters followed by 2 digits. If letters can be A-Z and digits can be 0-9, how many different passwords can be formed?
Let's solve this step by step!
Answer:
def solve():
    num_letters = 26 ** 3
    num_digits = 10 ** 2
    total_passwords = num_letters * num_digits
    return total_passwords

Question: A store is having a 25% off sale. If a shirt originally costs $80, and there is also a 8% sales tax, how much will the shirt cost in total?
Let's solve this step by step!
Answer:
def solve():
    original_price = 80
    discount_amount = 80 * 0.25
    discount_price = original_price - discount_amount
    tax_amount = discount_price * 0.08
    final_price = discount_price + tax_amount
    return final_price
    
Question: Ben eats one sandwich a day and buys packs that contain 10 sandwiches each at a cost of $5 per pack. How much will he spend on sandwiches in 50 days?
Let's solve this step by step!
Answer:
def solve():
    sandwiches_per_day = 1
    num_days = 50
    total_sandwiches = sandwiches_per_day * num_days
    sandwiches_per_pack = 10
    num_packs_needed = total_sandwiches / sandwiches_per_pack
    cost_per_pack = 5
    total_cost = num_packs_needed * cost_per_pack
    return total_cost

Question: Marie ordered one chicken meal that costs $12, 5 packs of milk that cost $3 each, 4 apples that cost $1.50 each, and some boxes of pizza. Marie paid a total of $50. How many boxes of pizza did Marie order if each box costs $8.50?
Let's solve this step by step!
Answer:
def solve():
    chicken_meal_cost = 12
    milk_cost = 5 * 3
    apples_cost = 4 * 1.50
    subtotal = chicken_meal_cost + milk_cost + apples_cost
    remaining = 50 - subtotal
    pizza_boxes = remaining / 8.50
    return pizza_boxes
    """

In [32]:
def generate(prompt, tokenizer, model):
    """Sample a continuation of ``prompt`` and return the decoded new text.

    Args:
        prompt: Text to condition on.
        tokenizer: Callable tokenizer exposing ``eos_token_id`` and ``decode``.
        model: Object exposing ``generate(**kwargs)``.

    Returns:
        A list with one decoded string per returned sequence (one, given
        ``num_return_sequences=1``). Only tokens after the prompt are decoded,
        and special tokens are not stripped.
    """
    encoded = tokenizer(prompt, return_tensors="pt").to(DEVICE)
    sampling_kwargs = dict(
        pad_token_id=tokenizer.eos_token_id,
        max_new_tokens=200,
        do_sample=True,
        temperature=0.3,
        top_p=0.9,
        top_k=50,
        num_return_sequences=1,
    )
    # Inference only: no gradients are needed.
    with torch.no_grad():
        sequences = model.generate(**encoded, **sampling_kwargs)

    # Each returned sequence starts with the prompt tokens; drop them.
    prompt_length = encoded["input_ids"].shape[1]
    texts = []
    for sequence in sequences:
        texts.append(tokenizer.decode(sequence[prompt_length:]))
    return texts

from collections import namedtuple

# Per-problem result record returned by pal(): the question and reference answer,
# the cleaned completions, the answers obtained by running them, the number of
# matching answers, and an error description ("" when nothing failed).
PALAnswer = namedtuple(
    "PALAnswer",
    "question answer model_completions model_answers score error",
)
def pal(problem):
    """Solve one word problem by generating and running a ``solve()`` function.

    The few-shot prompt is extended with the problem's question, a completion is
    sampled, and the text is cut at the first "Question" marker and at the
    tokenizer's EOS token (if any) so only the generated function remains.
    Each completion is executed in its own fresh namespace, so imports and
    helper functions defined next to ``solve`` are visible to it, and the code
    cannot read or overwrite notebook globals.

    Args:
        problem: Mapping with "question" (str) and "answer" (compared with
            ``==`` against the rounded return value of ``solve()``).

    Returns:
        PALAnswer. On success ``model_answers`` is the list of rounded answers
        and ``error`` is "". If any completion fails to run, ``model_answers``
        is -1 and ``error`` holds ``repr`` of the exception; ``score`` keeps
        whatever was counted before the failure.
    """
    question = problem["question"]
    answer = problem["answer"]
    prompt = f"{pal_prompt}\n\nQuestion: {question}\nLet's solve this step by step!\nAnswer:\n"

    eos = tokenizer.eos_token
    completions = []
    for raw in generate(prompt, tokenizer, model):
        code = raw.split("Question")[0]
        # generate() decodes special tokens too; a trailing EOS marker would
        # otherwise make the code a SyntaxError.
        if eos:
            code = code.split(eos)[0]
        completions.append(code.strip())

    answers = []
    score = 0
    try:
        for completion in completions:
            # NOTE(review): this runs model-generated code in-process with no
            # sandbox or timeout.
            namespace = {}
            exec(completion, namespace)
            solve = namespace.get("solve")
            if solve is None:
                raise NameError("name 'solve' is not defined")
            model_answer = round(solve())
            answers.append(model_answer)
            if model_answer == answer:
                score += 1
        return PALAnswer(question, answer, completions, answers, score, "")
    except Exception as e:
        return PALAnswer(question, answer, completions, -1, score, repr(e))

In [14]:
from tqdm.auto import tqdm
# Run pal() over every test problem, tally the score, and echo each miss
# (score of 0) as it happens; the missed questions are listed again at the end.
correct = 0
missed_questions = []
for problem in tqdm(test_data):
    out = pal(problem)
    correct += out.score
    if out.score != 0:
        continue
    print(out)
    print("\n\n")
    missed_questions.append(out.question + "\n")
wrong_problems = "".join(missed_questions)
print(f"total correct: {correct}\n")
print(wrong_problems)

  0%|          | 0/50 [00:00<?, ?it/s]

PALAnswer(question="Every day, Wendi feeds each of her chickens three cups of mixed chicken feed, containing seeds, mealworms and vegetables to help keep them healthy.  She gives the chickens their feed in three separate meals. In the morning, she gives her flock of chickens 15 cups of feed.  In the afternoon, she gives her chickens another 25 cups of feed.  How many cups of feed does she need to give her chickens in the final meal of the day if the size of Wendi's flock is 20 chickens?", answer=20, model_completions=['def solve():\n    morning_feed = 15\n    afternoon_feed = 25\n    total_feed = morning_feed + afternoon_feed\n    feed_per_chicken = total_feed / 20\n    return feed_per_chicken'], model_answers=[2], score=0, error='')



PALAnswer(question='In a dance class of 20 students, 20% enrolled in contemporary dance, 25% of the remaining enrolled in jazz dance, and the rest enrolled in hip-hop dance. What percentage of the entire students enrolled in hip-hop dance?', answer=60, 

In [33]:
def generate_batch(prompts, tokenizer, model):
    """Sample one continuation for each prompt in a single padded batch.

    Args:
        prompts: List of prompt strings; tokenized together with padding and
            truncation enabled.
        tokenizer: Callable tokenizer exposing ``eos_token_id`` and ``decode``.
        model: Object exposing ``generate(**kwargs)``.

    Returns:
        One decoded string per returned sequence. Decoding starts at the
        padded prompt width, and special tokens are not stripped.
    """
    batch = tokenizer(
        prompts,
        return_tensors="pt",
        padding=True,
        truncation=True,
    ).to(DEVICE)
    sampling_kwargs = {
        "pad_token_id": tokenizer.eos_token_id,
        "max_new_tokens": 200,
        "do_sample": True,
        "temperature": 0.3,
        "top_p": 0.9,
        "top_k": 50,
        "num_return_sequences": 1,
    }
    # Inference only: no gradients are needed.
    with torch.no_grad():
        sequences = model.generate(**batch, **sampling_kwargs)

    # All rows share the same padded prompt width, so a single offset
    # separates prompt tokens from generated tokens.
    prompt_width = batch["input_ids"].shape[1]
    return [tokenizer.decode(row[prompt_width:]) for row in sequences]

def execute_completion(completion):
    """Run generated code and return the rounded result of its ``solve()``.

    The code is executed in a fresh, explicit namespace. Imports and helper
    functions defined at the top level of the completion are therefore visible
    to ``solve()``, the lookup of ``solve`` does not depend on how ``exec`` and
    ``eval`` share a function's implicit locals, and the generated code cannot
    read or overwrite notebook globals.

    Args:
        completion: Python source expected to define a zero-argument
            ``solve`` function returning a number.

    Returns:
        ``(answer, None)`` on success, where ``answer`` is ``round(solve())``;
        ``(None, exception)`` if compiling or running the code raises, if
        ``solve`` is missing (NameError), or if the result cannot be rounded.
    """
    # NOTE(review): this runs model-generated code in-process with no sandbox
    # or timeout.
    namespace = {}
    try:
        exec(completion, namespace)
        solve = namespace.get("solve")
        if solve is None:
            raise NameError("name 'solve' is not defined")
        return round(solve()), None
    except Exception as e:
        return None, e

def process_completion(completion):
    """Trim a raw completion down to the text before the first stop marker.

    The stop markers are the literal "Question" and the tokenizer's EOS token
    string. The text is cut at whichever occurs first; if neither occurs the
    whole completion is kept. The result is stripped of surrounding whitespace.
    """
    markers = ("Question", tokenizer.eos_token)
    cut_points = [completion.find(marker) for marker in markers if marker in completion]
    if not cut_points:
        return completion.strip()
    return completion[:min(cut_points)].strip()

from collections import namedtuple
from pprint import pprint

# Per-problem result record built by pal_batch(). This rebinds the PALAnswer
# name defined in an earlier cell, with singular completion/answer fields.
# NOTE(review): "model_competion" is misspelled ("completion"); the name is kept
# unchanged because it is a runtime field name that appears in printed output.
PALAnswer = namedtuple(
    "PALAnswer",
    "question answer model_competion model_answer score error",
)
def pal_batch(problems):
    """Solve a batch of word problems and return one PALAnswer per problem.

    Each question is appended to the few-shot prompt, all prompts are sampled
    in one batched call, and every cleaned completion is executed. A problem
    is classified exactly once: execution error, wrong answer, or correct.
    Records with a score of 0 are pretty-printed as they are produced.

    Args:
        problems: List of mappings with "question" and "answer" keys.

    Returns:
        List of PALAnswer in the same order as ``problems``, one per problem.
        ``score`` is 1 for a matching answer and 0 otherwise; ``error`` holds
        ``repr`` of the exception for failed executions and None otherwise.
    """
    questions = [problem["question"] for problem in problems]
    prompts = [f"{pal_prompt}\n\nQuestion: {question}\nLet's solve this step by step!\nAnswer:\n" for question in questions]
    completions = [
        process_completion(raw)
        for raw in generate_batch(prompts, tokenizer, model)
    ]

    pal_batch_out = []
    for problem, completion in zip(problems, completions):
        model_answer, err = execute_completion(completion)
        # The branches are mutually exclusive so a failed execution is not
        # also recorded (and printed) a second time as a wrong answer.
        if err is not None:
            pal_out = PALAnswer(problem["question"], problem["answer"], completion, None, 0, repr(err))
        elif model_answer != problem["answer"]:
            pal_out = PALAnswer(problem["question"], problem["answer"], completion, model_answer, 0, None)
        else:
            pal_out = PALAnswer(problem["question"], problem["answer"], completion, model_answer, 1, None)
        pal_batch_out.append(pal_out)
        if pal_out.score == 0:
            pprint(pal_out._asdict(), indent=2)
    return pal_batch_out

In [34]:
from tqdm.auto import tqdm

def split_list(dict_list, x):
    """Split ``dict_list`` into consecutive chunks of at most ``x`` items.

    The final chunk holds the remainder and may be shorter than ``x``.
    """
    chunks = []
    for start in range(0, len(dict_list), x):
        chunks.append(dict_list[start:start + x])
    return chunks

# Evaluate the test set in batches of 10 problems and total the per-problem scores.
batches = split_list(test_data.to_list(), 10)
correct = 0
for batch in tqdm(batches):
    correct += sum(pal_out.score for pal_out in pal_batch(batch))
print(f"total correct: {correct}\n")

  0%|          | 0/5 [00:00<?, ?it/s]

{ 'answer': 18,
  'error': None,
  'model_answer': 9,
  'model_competion': 'def solve():\n'
                     '    eggs_per_day = 16\n'
                     '    eggs_for_breakfast = 3\n'
                     '    eggs_for_muffins = 4\n'
                     '    eggs_for_market = eggs_per_day - eggs_for_breakfast '
                     '- eggs_for_muffins\n'
                     '    eggs_per_dollar = 1\n'
                     '    total_eggs = eggs_for_market * eggs_per_dollar\n'
                     '    return total_eggs',
  'question': 'Janet’s ducks lay 16 eggs per day. She eats three for breakfast '
              'every morning and bakes muffins for her friends every day with '
              "four. She sells the remainder at the farmers' market daily for "
              '$2 per fresh duck egg. How much in dollars does she make every '
              "day at the farmers' market?",
  'score': 0}
{ 'answer': 20,
  'error': None,
  'model_answer': 40,
  'model_competion': 'def sol