In [28]:
import os 
# Expose GPUs 0 and 1 to this process; this cell runs before the toolmode imports.
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(gpu_id) for gpu_id in (0, 1))

In [29]:
import math
from tqdm import tqdm
import toolmode.cfg_decoding as cfg_decoding
import toolmode.data.funcqa as funcqa
import toolmode.data.funcqa.runner as funcqa_runner

In [30]:
# Load the FuncQA data through the project loader.
dataset = funcqa.load()
# Bare expression so the notebook renders the Dataset summary (features / num_rows).
dataset

Saving the dataset (1/1 shards): 100%|██████████| 60/60 [00:00<00:00, 21833.96 examples/s]


Dataset({
    features: ['question', 'answer', 'func', 'operation', 'operation_num_params', 'prompt'],
    num_rows: 20
})

In [31]:
# Inspect the first example to see the available fields (question, answer, func, prompt, ...).
dataset[0]

{'question': 'A car travels 0.384 miles and then travels 1.63 more miles. How far did the car travel in total?',
 'answer': 2.014,
 'func': '<T>add(0.384, 1.63)=2.014',
 'operation': 'add',
 'operation_num_params': 2,
 'prompt': 'Answer the following question with the operator add:\n\nQ: A store sells 311 apples and 112 oranges. How many fruits are sold in total?\nA: The total number of fruits sold by the store is 311+112=<T>add(311, 112)=423.\n\nQ: A bakery made 201 loaves of bread in the morning and 31 in the afternoon. How many loaves of bread did they make in total?\nA: The total number of loaves of bread made is 201+31=<T>add(201, 31)=232.\n\nQ: A car traveled 118.01 miles in the morning and 451.96 miles in the afternoon. How many miles did the car travel in total?\nA: The car traveled 118.01+451.96=<T>add(118.01, 451.96)=569.97 miles in total.\n\nQ: A store sells 277 apples, 209 oranges, and 72 bananas. How many fruits are sold in total?\nA: The total number of fruits sold by the

In [32]:
# Print (rather than repr) the few-shot prompt so its newlines are rendered.
first_prompt = dataset[0]["prompt"]
print(first_prompt)

Answer the following question with the operator add:

Q: A store sells 311 apples and 112 oranges. How many fruits are sold in total?
A: The total number of fruits sold by the store is 311+112=<T>add(311, 112)=423.

Q: A bakery made 201 loaves of bread in the morning and 31 in the afternoon. How many loaves of bread did they make in total?
A: The total number of loaves of bread made is 201+31=<T>add(201, 31)=232.

Q: A car traveled 118.01 miles in the morning and 451.96 miles in the afternoon. How many miles did the car travel in total?
A: The car traveled 118.01+451.96=<T>add(118.01, 451.96)=569.97 miles in total.

Q: A store sells 277 apples, 209 oranges, and 72 bananas. How many fruits are sold in total?
A: The total number of fruits sold by the store is 277+209+72=<T>add(277, 209, 72)=558.

Q: A car travels 0.384 miles and then travels 1.63 more miles. How far did the car travel in total?
A:


In [33]:
# Hugging Face model id passed to the project's CfgDecoder wrapper.
MODEL_NAME = "meta-llama/Llama-2-13b-chat-hf"
# The decoder is placed on "cuda:0".
# NOTE(review): CUDA_VISIBLE_DEVICES exposes two GPUs ("0,1") but only cuda:0 is
# requested here — confirm whether the second device is actually needed.
cfg_decoder = cfg_decoding.CfgDecoder(MODEL_NAME, device_map="cuda:0")

Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.33s/it]


In [34]:
# Experiment 1: free-form generation followed by a constrained generation step
# that produces the tool call. Per-row outcomes are collected in `results`,
# keyed by the row index, with y_true / y_pred / is_correct.
results = {}

# total= keeps the progress bar sized without materialising the rows in a list.
for i, row in tqdm(enumerate(dataset), total=len(dataset)):
    prompt = row["prompt"]
    # Stage 1: unconstrained continuation of the few-shot prompt
    # (in the recorded run this is the reasoning text up to the "=").
    answer_p1 = cfg_decoder.generate(prompt, max_new_tokens=500)
    # Stage 2: constrained continuation
    # (in the recorded run this is the "<T>op(args)" tool call).
    answer_p2 = cfg_decoder.generate_constrained(prompt + answer_p1, max_new_tokens=100)

    # The text after the first "<T>" marker is handed to the evaluator.
    # A missing marker (IndexError) or an evaluator failure must not abort the
    # whole run: the row is scored as incorrect and the error is reported.
    try:
        operation = answer_p2.split("<T>")[1]
        result = funcqa_runner.evaluate_toolcall(operation)
    except Exception as exc:  # deliberately not a bare except: Ctrl-C still stops the run
        print(f"tool call could not be evaluated: {exc!r}")
        result = None

    # NaN never compares close to anything, so a failed call counts as wrong.
    y_pred = float(result) if result is not None else float("nan")
    y_true = float(row["answer"])
    # rel_tol=0.1 is a 10% relative tolerance (see math.isclose).
    # NOTE(review): confirm this is the intended benchmark tolerance.
    is_correct = math.isclose(y_pred, y_true, rel_tol=0.1)
    results[i] = {"y_true": y_true, "y_pred": y_pred, "is_correct": is_correct}

    # Per-row trace: question, both generations, evaluated value, gold answer, verdict.
    print(row["question"])
    print(answer_p1)
    print(answer_p2)
    print(result)
    print(row["answer"])
    print(is_correct)
    print()

  5%|▌         | 1/20 [00:08<02:43,  8.63s/it]

A car travels 0.384 miles and then travels 1.63 more miles. How far did the car travel in total?
The car traveled 0.384+1.63=
<T>add(0.384,1.63)
2.01
2.014
True



 10%|█         | 2/20 [00:17<02:41,  8.96s/it]

A marathon runner has completed 8.7832 miles so far. He has 5.3689 miles left to run. What is the total distance of the marathon?
The total distance of the marathon is 8.7832+5.3689=
<T>add(8.7832,5.3689)
14.15
14.1521
True



 15%|█▌        | 3/20 [00:27<02:34,  9.10s/it]

A company has 13,289 employees and plans to hire 7,899 more. How many employees will the company have in total?
The total number of employees the company will have is 13,289+7,899=
<T>add(13,289,7,899)
1208
21188.0
False



 20%|██        | 4/20 [00:38<02:37,  9.83s/it]

This week, Mary earns 17.83 dollars on Monday, 18.99 dollars on Tuesday, 23.45 dollars on Wednesday, 14.12 dollars on Thursday, and 8.22 dollars on Friday. How much money did Mary earn this week?
The total amount of money Mary earned this week is 17.83+18.99+23.45+14.12+8.22=
<T>add(17.83,18.99,23.45,14.12,8.22)
82.61
82.61
True



 25%|██▌       | 5/20 [00:48<02:31, 10.12s/it]

 The cost of an estate is 19,543,778.4 dollars. If the furnishings will cost at least 632,361 dollars, what is the minimum amount of money needed to buy the estate and furnish it?
The minimum amount of money needed to buy the estate and furnish it is 19,543,778.4+632,361=
<T>add(19,543,778.4,632,361.40)
2333.8
20176139.4
False



 30%|███       | 6/20 [00:57<02:15,  9.70s/it]

The gas tank of car can hold up to 11.28 gallons of gas. Currently, a car only has 1.89 gallons of gas in its tank. How many more gallons of gas can the car hold if the driver wants to fill the tank?
The car can hold 11.28-1.89=
<T>subtract(11.28,1.89)
9.39
9.39
True



 35%|███▌      | 7/20 [01:06<02:01,  9.35s/it]

If a recipe calls for 892 cups of flour and you only have 134 cups, how many cups of flour do you need to buy?
You need 892-134=
<T>subtract(892,134)
758
758.0
True



 40%|████      | 8/20 [01:15<01:51,  9.26s/it]

A runner has completed 8.9832 miles out of a 29.47-mile marathon. How many miles does the runner have left to run?
The runner has 29.47-8.9832=
<T>subtract(29.47,8.9832)
20.49
20.4868
True



 45%|████▌     | 9/20 [01:25<01:46,  9.68s/it]

A company had 18,778 employees last year. This year, the company has 13,289 employees. How many employees did the company lose?
The company had 18,778 employees last year and now has 13,289 employees, so the company lost 18,778-13,289=
<T>subtract(18,778,13,289)
-1062
5489.0
False



 50%|█████     | 10/20 [01:37<01:42, 10.21s/it]

A company has a budget of $1028137.23. They spent $79952.88 on office expenses. How much money is left?
The company has $1028137.23, and they spent $79952.88, so now they have $1028137.23-79952.88=
<T>subtract(1028137.23,79952.88)
948184.35
948184.35
True



 55%|█████▌    | 11/20 [01:46<01:28,  9.81s/it]

 A reader is reading a really long book. She reads 67 pages per day, and it takes her 253 days to finish the book. How many pages are in the book at most?
The reader reads 67 pages per day, so she reads 67*253=
<T>multiply(67,253)
16951
16951.0
True



 60%|██████    | 12/20 [01:55<01:18,  9.81s/it]

If there is a gemstone that costs only $0.37 a gram and Jack wants to buy 48,172.7 grams, how much would he need to spend?
The gemstone costs $0.37 per gram, so Jack would need to spend 48172.7*0.37=
<T>multiply(48172.7,0.37)
17823.9
17823.899
True



 65%|██████▌   | 13/20 [02:05<01:07,  9.62s/it]

 A swimming pool has a length of 0.848 kilometers, and a width of 0.3766 kilometers. What is the area of the swimming pool in square kilometers?
The area of the swimming pool is 0.848*0.3766=
<T>multiply(0.848,0.3766)
0.32
0.3193568
True



 70%|███████   | 14/20 [02:13<00:55,  9.31s/it]

If a car travels 0.384 miles in 1 minute, how many miles will it travel in 49 minutes?
The car travels 0.384*49=
<T>multiply(0.384,49)
18.82
18.816
True



 75%|███████▌  | 15/20 [02:24<00:48,  9.67s/it]

If a pizza has a diameter of 0.6273 inches, what is its circumference? (take pi to be 3.14159)
The circumference of a circle with diameter 0.6273 inches is given by C=2*pi*d=2*3.14159*0.6273=
<T>multiply(2,3.14159,0.6273)
3.94
1.970719407
False



 80%|████████  | 16/20 [02:33<00:37,  9.41s/it]

If 327.87 ounces of sugar are required to make 52 cookies, how many ounces of sugar are required to make each cookie?
Each cookie requires 327.87/52=
<T>divide(327.87,52)
6.31
6.3051
True



 85%|████████▌ | 17/20 [02:42<00:28,  9.50s/it]

A company has a total budget of 21412879.32 dollars and wants to evenly distribute the money to 8889 employees. How much money will each employee get?
Each employee will get 21412879.32/8889=
<T>divide(21412879.32,8889)
2408.92
2408.9188
True



 90%|█████████ | 18/20 [02:51<00:18,  9.35s/it]

A store sells a total of 898.75 ounces of cereal. If they sell 43 boxes of cereal, how many ounces of cereal are in each box?
Each box of cereal contains 898.75/43=
<T>divide(898.75,43)
20.9
20.9012
True



 95%|█████████▌| 19/20 [03:01<00:09,  9.36s/it]

A swimming pool has a volume of 19,338.74 gallons. If the water can evenly fill 532 buckets, how many gallons of water are in each bucket?
The water in each bucket is 19,338.74/532=
<T>divide(19,338.74,532)
0.00011
36.351
False



100%|██████████| 20/20 [03:10<00:00,  9.53s/it]

A marathon is 26.2823 miles long. If a runner finishes the marathon in 216.3 minutes, what is the runner's average speed per mile?
The runner's average speed per mile is 216.3/26.2823=
<T>divide(216.3,26.2823)
8.23
8.2299
True






In [35]:
# Fraction of rows judged correct; booleans sum as 0/1.
num_correct = sum(entry["is_correct"] for entry in results.values())
accuracy = num_correct / len(results)
# Shown as a percentage.
accuracy * 100

75.0

In [36]:
# Experiment 2: same pipeline, but the tool call is produced by ordinary
# (unconstrained) generation after forcing a "<T>" prefix. Per-row outcomes
# are collected in `results`, keyed by the row index.
results = {}

# total= keeps the progress bar sized without materialising the rows in a list.
for i, row in tqdm(enumerate(dataset), total=len(dataset)):
    prompt = row["prompt"]
    # Stage 1: unconstrained continuation of the few-shot prompt.
    answer_p1 = cfg_decoder.generate(prompt, max_new_tokens=500)
    # Stage 2: append "<T>" to open a tool call, let the model continue freely,
    # and keep only the text before the first "=" (the call without its result).
    continuation = cfg_decoder.generate(prompt + answer_p1 + "<T>", max_new_tokens=100)
    answer_p2 = "<T>" + continuation.split("=")[0]

    # A malformed call must not abort the run, but only ordinary errors are
    # swallowed: a bare `except:` would also eat KeyboardInterrupt/SystemExit
    # and turn a kernel interrupt into a silently "wrong" row.
    try:
        operation = answer_p2.split("<T>")[1]
        result = funcqa_runner.evaluate_toolcall(operation)
    except Exception as exc:
        print(f"tool call could not be evaluated: {exc!r}")
        result = None

    # NaN never compares close to anything, so a failed call counts as wrong.
    y_pred = float(result) if result is not None else float("nan")
    y_true = float(row["answer"])
    # rel_tol=0.1 is a 10% relative tolerance (see math.isclose).
    # NOTE(review): confirm this is the intended benchmark tolerance.
    is_correct = math.isclose(y_pred, y_true, rel_tol=0.1)
    results[i] = {"y_true": y_true, "y_pred": y_pred, "is_correct": is_correct}

    # Per-row trace: question, both generations, evaluated value, gold answer, verdict.
    print(row["question"])
    print(answer_p1)
    print(answer_p2)
    print(result)
    print(row["answer"])
    print(is_correct)
    print()

  5%|▌         | 1/20 [00:03<01:03,  3.32s/it]

A car travels 0.384 miles and then travels 1.63 more miles. How far did the car travel in total?
The car traveled 0.384+1.63=
<T>add(0.384, 1.63)
2.01
2.014
True



 10%|█         | 2/20 [00:07<01:05,  3.64s/it]

A marathon runner has completed 8.7832 miles so far. He has 5.3689 miles left to run. What is the total distance of the marathon?
The total distance of the marathon is 8.7832+5.3689=
<T>add(8.7832, 5.3689)
14.15
14.1521
True



 15%|█▌        | 3/20 [00:10<01:02,  3.66s/it]

A company has 13,289 employees and plans to hire 7,899 more. How many employees will the company have in total?
The total number of employees the company will have is 13,289+7,899=
<T>add(13,289, 7,899)
1208
21188.0
False



 20%|██        | 4/20 [00:16<01:09,  4.36s/it]

This week, Mary earns 17.83 dollars on Monday, 18.99 dollars on Tuesday, 23.45 dollars on Wednesday, 14.12 dollars on Thursday, and 8.22 dollars on Friday. How much money did Mary earn this week?
The total amount of money Mary earned this week is 17.83+18.99+23.45+14.12+8.22=
<T>add(17.83, 18.99, 23.45, 14.12, 8.22)
82.61
82.61
True



 25%|██▌       | 5/20 [00:21<01:08,  4.60s/it]

 The cost of an estate is 19,543,778.4 dollars. If the furnishings will cost at least 632,361 dollars, what is the minimum amount of money needed to buy the estate and furnish it?
The minimum amount of money needed to buy the estate and furnish it is 19,543,778.4+632,361=
<T>add(19,543,778.4, 632,361)
2333.4
20176139.4
False



 30%|███       | 6/20 [00:24<00:59,  4.25s/it]

The gas tank of car can hold up to 11.28 gallons of gas. Currently, a car only has 1.89 gallons of gas in its tank. How many more gallons of gas can the car hold if the driver wants to fill the tank?
The car can hold 11.28-1.89=
<T>subtract(11.28, 1.89)
9.39
9.39
True



 35%|███▌      | 7/20 [00:31<01:05,  5.04s/it]

If a recipe calls for 892 cups of flour and you only have 134 cups, how many cups of flour do you need to buy?
You need 892-134=
<T>subtract(892, 134)
758
758.0
True



 40%|████      | 8/20 [00:37<01:05,  5.47s/it]

A runner has completed 8.9832 miles out of a 29.47-mile marathon. How many miles does the runner have left to run?
The runner has 29.47-8.9832=
<T>subtract(29.47, 8.9832)
20.49
20.4868
True



 45%|████▌     | 9/20 [00:46<01:09,  6.34s/it]

A company had 18,778 employees last year. This year, the company has 13,289 employees. How many employees did the company lose?
The company had 18,778 employees last year and now has 13,289 employees, so the company lost 18,778-13,289=
<T>subtract(18,778, 13,289)
-1062
5489.0
False



 50%|█████     | 10/20 [00:54<01:10,  7.05s/it]

A company has a budget of $1028137.23. They spent $79952.88 on office expenses. How much money is left?
The company has $1028137.23, and they spent $79952.88, so now they have $1028137.23-79952.88=
<T>subtract(1028137.23, 79952.88)
948184.35
948184.35
True



 55%|█████▌    | 11/20 [00:58<00:53,  5.97s/it]

 A reader is reading a really long book. She reads 67 pages per day, and it takes her 253 days to finish the book. How many pages are in the book at most?
The reader reads 67 pages per day, so she reads 67*253=
<T>multiply(67, 253)
16951
16951.0
True



 60%|██████    | 12/20 [01:05<00:51,  6.42s/it]

If there is a gemstone that costs only $0.37 a gram and Jack wants to buy 48,172.7 grams, how much would he need to spend?
The gemstone costs $0.37 per gram, so Jack would need to spend 48172.7*0.37=
<T>multiply(48172.7, 0.37)
17823.9
17823.899
True



 65%|██████▌   | 13/20 [01:12<00:46,  6.64s/it]

 A swimming pool has a length of 0.848 kilometers, and a width of 0.3766 kilometers. What is the area of the swimming pool in square kilometers?
The area of the swimming pool is 0.848*0.3766=
<T>multiply(0.848, 0.3766)
0.32
0.3193568
True



 70%|███████   | 14/20 [01:16<00:34,  5.68s/it]

If a car travels 0.384 miles in 1 minute, how many miles will it travel in 49 minutes?
The car travels 0.384*49=
<T>multiply(0.384, 49)
18.82
18.816
True



 75%|███████▌  | 15/20 [01:23<00:31,  6.25s/it]

If a pizza has a diameter of 0.6273 inches, what is its circumference? (take pi to be 3.14159)
The circumference of a circle with diameter 0.6273 inches is given by C=2*pi*d=2*3.14159*0.6273=
<T>multiply(2, 3.14159, 0.6273)
3.94
1.970719407
False



 80%|████████  | 16/20 [01:27<00:21,  5.40s/it]

If 327.87 ounces of sugar are required to make 52 cookies, how many ounces of sugar are required to make each cookie?
Each cookie requires 327.87/52=
<T>divide(327.87, 52)
6.31
6.3051
True



 85%|████████▌ | 17/20 [01:31<00:14,  4.98s/it]

A company has a total budget of 21412879.32 dollars and wants to evenly distribute the money to 8889 employees. How much money will each employee get?
Each employee will get 21412879.32/8889=
<T>divide(21412879.32, 8889)
2408.92
2408.9188
True



 90%|█████████ | 18/20 [01:35<00:09,  4.63s/it]

A store sells a total of 898.75 ounces of cereal. If they sell 43 boxes of cereal, how many ounces of cereal are in each box?
Each box of cereal contains 898.75/43=
<T>divide(898.75, 43)
20.9
20.9012
True



 95%|█████████▌| 19/20 [01:42<00:05,  5.35s/it]

A swimming pool has a volume of 19,338.74 gallons. If the water can evenly fill 532 buckets, how many gallons of water are in each bucket?
The water in each bucket is 19,338.74/532=
<T>divide(19,338.74, 532)
0.00011
36.351
False



100%|██████████| 20/20 [01:46<00:00,  5.31s/it]

A marathon is 26.2823 miles long. If a runner finishes the marathon in 216.3 minutes, what is the runner's average speed per mile?
The runner's average speed per mile is 216.3/26.2823=
<T>divide(216.3, 26.2823)
8.23
8.2299
True






In [38]:
# Share of correct rows from the loop that most recently filled `results`.
num_correct = sum(entry["is_correct"] for entry in results.values())
accuracy = num_correct / len(results)
# Shown as a percentage.
accuracy * 100

75.0