# Making Difficulty-Calibrated Datasets for Different LLMs

## Setup

In [12]:
import pandas as pd
import numpy as np
import random
import transformers
import torch
import dotenv
import os
import matplotlib.pyplot as plt
import re
import string
from tqdm import tqdm
import pysat

dotenv.load_dotenv()

True

In [13]:
def make_dot_product_problem_set(vec_len, vec_mag, num_problems, avoid_collisions=True):
    """Generate a DataFrame of random integer dot-product problems.

    Every vector entry is drawn with ``np.random.randint(2, vec_mag, ...)``,
    i.e. from the half-open range ``[2, vec_mag)``.

    Args:
        vec_len: Number of entries in each vector (must be >= 2 so that the
            first and last element-wise products are distinct positions).
        vec_mag: Exclusive upper bound for the vector entries.
        num_problems: Number of rows to generate.
        avoid_collisions: When true, a problem is re-drawn if the decimal
            string of either intermediate product (first or last
            element-wise product) occurs anywhere in the problem text.

    Returns:
        pandas.DataFrame with columns ``problem`` (text such as
        ``"[2, 3] ⋅ [4, 5]"``), ``correct_solution`` (the dot product),
        ``intermediate_1`` (``a[0]*b[0]``) and ``intermediate_2``
        (``a[-1]*b[-1]``).

    Raises:
        ValueError: If ``vec_len < 2``, ``vec_mag < 2``, or collision
            avoidance is requested and ``(vec_mag-1)**vec_len`` is not
            larger than ``num_problems``.
    """
    if vec_len < 2:
        raise ValueError("Need vectors of length 2 or greater to have two intermediates")
    if vec_mag < 2:
        raise ValueError("We remove 0 and 1 from the vector magnitudes to avoid collisions")
    if avoid_collisions and not ((vec_mag-1)**vec_len > num_problems):
        raise ValueError("To avoid collisions, need a bigger space than the number of problems requested")

    def make_dot_product_problem(vec_len, avoid_collisions=True):
        # Rejection-sample with a loop instead of recursion so a long run of
        # rejected draws cannot exhaust the interpreter's recursion limit.
        while True:
            a = np.random.randint(2, vec_mag, vec_len)
            b = np.random.randint(2, vec_mag, vec_len)
            text = f"[{', '.join([str(x) for x in a])}] ⋅ [{', '.join([str(x) for x in b])}]"
            first_product = a[0]*b[0]
            last_product = a[-1]*b[-1]
            # Both intermediates are checked under the avoid_collisions flag.
            # The previous `flag and x or y` form parsed as `(flag and x) or y`,
            # so the second intermediate was filtered even with the flag off.
            collides = str(first_product) in text or str(last_product) in text
            if avoid_collisions and collides:
                continue
            return (text, np.dot(a, b), first_product, last_product)

    return pd.DataFrame(
        [make_dot_product_problem(vec_len, avoid_collisions) for _ in range(num_problems)],
        columns=['problem', 'correct_solution', 'intermediate_1', 'intermediate_2'])

In [14]:
def solve_problem_cot(pipeline, problem, sys_prompt, cot_prompt):
    """Query the chat pipeline for a worked (chain-of-thought) answer.

    The system message is ``sys_prompt`` and ``cot_prompt`` joined by a
    single space; ``problem`` is sent as the user message. Generation is
    sampled (temperature 0.6, top_p 0.9) and capped at 256 new tokens.

    Returns:
        The ``content`` of the last message in the first output's
        ``generated_text`` list.
    """
    tok = pipeline.tokenizer
    # Stop on either the tokenizer's EOS id or the "<|eot_id|>" token id.
    stop_ids = [tok.eos_token_id, tok.convert_tokens_to_ids("<|eot_id|>")]

    chat = [
        dict(role="system", content=sys_prompt + ' ' + cot_prompt),
        dict(role="user", content=problem),
    ]

    generation_kwargs = dict(
        max_new_tokens=256,
        eos_token_id=stop_ids,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
        pad_token_id=tok.eos_token_id,
    )
    first_output = pipeline(chat, **generation_kwargs)[0]
    final_message = first_output['generated_text'][-1]
    return final_message['content']

def solve_problem_memo(pipeline, problem, sys_prompt, memo_prompt, max_toks=10):
    """Query the chat pipeline for a short, answer-only reply.

    The system message is ``sys_prompt`` and ``memo_prompt`` joined by a
    single space; ``problem`` is sent as the user message. Generation is
    sampled (temperature 0.6, top_p 0.9) and limited to ``max_toks`` new
    tokens (default 10) so the model has little room to show working.

    Returns:
        The ``content`` of the last message in the first output's
        ``generated_text`` list.
    """
    tok = pipeline.tokenizer
    # Stop on either the tokenizer's EOS id or the "<|eot_id|>" token id.
    stop_ids = [tok.eos_token_id, tok.convert_tokens_to_ids("<|eot_id|>")]

    chat = [
        dict(role="system", content=sys_prompt + ' ' + memo_prompt),
        dict(role="user", content=problem),
    ]

    generation_kwargs = dict(
        max_new_tokens=max_toks,
        eos_token_id=stop_ids,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
        pad_token_id=tok.eos_token_id,
    )
    first_output = pipeline(chat, **generation_kwargs)[0]
    final_message = first_output['generated_text'][-1]
    return final_message['content']

In [15]:
# Base system prompt shared by both solving modes; the mode-specific prompt
# below is appended to it (separated by a space) in solve_problem_cot /
# solve_problem_memo.
problem_prompt = "What is the dot product of these two vectors?"
# Suffix for the chain-of-thought mode: the model is asked to show its working.
cot_prompt = "Show your work."
# Suffix for the answer-only ("memo") mode, with one worked example.
# The continuation lines keep their leading indentation inside the string,
# so that whitespace is part of the prompt sent to the model.
memo_prompt = """Answer with only a number. Do not do any calculations. For example:
                 Problem: [2, 9, -3] ⋅ [-6, -2, -9]
                 Solution: -3
                 """

In [16]:
def repeated_solve_problem_memo(pipeline, problem, sys_prompt, memo_prompt, max_toks=10, return_retries=False):
    """Call ``solve_problem_memo`` until the reply is short enough to be a bare number.

    A reply is accepted when it has at most 3 characters, or at most 4 when
    it starts with ``'-'``. Longer replies are re-sampled, up to 50 retries;
    after that the last reply is returned as-is.

    Args:
        pipeline, problem, sys_prompt, memo_prompt, max_toks: Forwarded
            unchanged to ``solve_problem_memo``.
        return_retries: When true, return ``(reply, retries)`` instead of
            just ``reply``.

    Returns:
        The final reply string, or a ``(reply, retries)`` tuple when
        ``return_retries`` is true.
    """
    max_retries = 50

    def too_long(answer):
        # startswith() is safe on an empty reply, whereas indexing answer[0]
        # raised IndexError when the model produced no text.
        allowed = 4 if answer.startswith('-') else 3
        return len(answer) > allowed

    soln = solve_problem_memo(pipeline, problem, sys_prompt, memo_prompt, max_toks=max_toks)
    retries = 0
    while too_long(soln) and retries < max_retries:
        retries += 1
        print('retrying', retries, soln)
        soln = solve_problem_memo(pipeline, problem, sys_prompt, memo_prompt, max_toks=max_toks)
    return (soln, retries) if return_retries else soln

In [17]:
def test_memo_prompt(pipeline, memo_prompt, problems, n):
    """Score an answer-only prompt on the first ``n`` problems and print a summary.

    Each problem is solved with ``repeated_solve_problem_memo`` (20 new
    tokens max), using the notebook-level ``problem_prompt`` as the system
    prompt. Prints the largest retry count seen and the number of replies
    that contain the correct solution.

    Args:
        pipeline: Chat pipeline passed through to the solver.
        memo_prompt: Answer-only prompt suffix under test.
        problems: DataFrame with ``problem`` and ``correct_solution`` columns.
        n: Maximum number of rows (from the top) to evaluate.
    """
    def contains_number(expected, text):
        # Match the expected value only as a standalone number. Plain
        # substring containment counted e.g. "18" as correct inside "118".
        pattern = rf"(?<![\d.\-]){re.escape(str(expected))}(?!\.?\d)"
        return re.search(pattern, text) is not None

    rows = list(problems.iterrows())[:n]
    memo_solutions = []
    memo_correct = 0
    max_retries = 0
    for _, row in tqdm(rows):
        sol, retries = repeated_solve_problem_memo(pipeline, row['problem'], problem_prompt, memo_prompt=memo_prompt, max_toks=20, return_retries=True)
        memo_solutions.append(sol)
        memo_correct += contains_number(row['correct_solution'], sol)
        max_retries = max(max_retries, retries)
    print(f"Max retries: {max_retries}")
    # Report against the number of rows actually evaluated, which is smaller
    # than n when the problem set has fewer than n rows.
    print(f"Correct memorization: {memo_correct}/{len(rows)}")

In [18]:
def dot_test(pipeline, dot_problems, n, problem_prompt=problem_prompt, cot_prompt=cot_prompt, memo_prompt=memo_prompt):
    """Compare chain-of-thought and answer-only accuracy on the first ``n`` problems.

    All problems are first solved with ``solve_problem_cot``, then all again
    with ``solve_problem_memo``; the two accuracy counts are printed.

    Args:
        pipeline: Chat pipeline passed through to the solvers.
        dot_problems: DataFrame with ``problem`` and ``correct_solution`` columns.
        n: Maximum number of rows (from the top) to evaluate.
        problem_prompt: System prompt shared by both modes.
        cot_prompt: Suffix used for the chain-of-thought mode.
        memo_prompt: Suffix used for the answer-only mode.

    Returns:
        ``(cot_solutions, memo_solutions, dot_problems)`` where the first two
        are lists of raw model replies in row order.
    """
    def contains_number(expected, text):
        # Match the expected value only as a standalone number. Plain
        # substring containment counted e.g. "18" as correct inside "118".
        pattern = rf"(?<![\d.\-]){re.escape(str(expected))}(?!\.?\d)"
        return re.search(pattern, text) is not None

    rows = list(dot_problems.iterrows())[:n]

    cot_solutions = []
    cot_correct = 0
    for _, row in tqdm(rows):
        cot_solutions.append(solve_problem_cot(pipeline, row['problem'], problem_prompt, cot_prompt))
        cot_correct += contains_number(row['correct_solution'], cot_solutions[-1])

    memo_solutions = []
    memo_correct = 0
    for _, row in tqdm(rows):
        memo_solutions.append(solve_problem_memo(pipeline, row['problem'], problem_prompt, memo_prompt))
        memo_correct += contains_number(row['correct_solution'], memo_solutions[-1])

    # Report against the number of rows actually evaluated, which is smaller
    # than n when the problem set has fewer than n rows.
    print(f"Correct COT: {cot_correct}/{len(rows)}")
    print(f"Correct memorization: {memo_correct}/{len(rows)}")
    return cot_solutions, memo_solutions, dot_problems

In [19]:
# Collected experiment outputs, keyed by (model name, vec_len, vec_mag);
# each value is the tuple returned by dot_test.
results = {}

# Llama 8b reproduction

In [20]:
# Text-generation pipeline for Llama 3 8B Instruct, loaded in bfloat16.
# The Hugging Face token is read from the HF_TOKEN environment variable
# (dotenv.load_dotenv() is called in the import cell).
llama8b_pipe = transformers.pipeline(
        "text-generation",
        model="meta-llama/Meta-Llama-3-8B-Instruct",
        model_kwargs={"torch_dtype": torch.bfloat16},
        device_map="auto",
        token=os.getenv('HF_TOKEN')
    )

Loading checkpoint shards: 100%|██████████| 4/4 [00:04<00:00,  1.07s/it]


In [21]:
# Number of problems evaluated per run; later cells reuse this variable.
n=10
# Check the answer-only prompt on a freshly generated 20-problem set
# (vec_len=4, vec_mag=10); only the first n rows are used.
test_memo_prompt(llama8b_pipe, memo_prompt=memo_prompt, problems=make_dot_product_problem_set(4, 10, 20), n=n)
# CoT vs answer-only comparison on a separately generated 100-problem set
# (again only the first n rows are used); stored under the
# (model, vec_len, vec_mag) key.
results[('llama8b', 4, 10)] = dot_test(llama8b_pipe, make_dot_product_problem_set(4, 10, 100), n=n, problem_prompt=problem_prompt, cot_prompt=cot_prompt, memo_prompt=memo_prompt)

100%|██████████| 10/10 [00:01<00:00,  9.87it/s]


Max retries: 0/10
Correct memorization: 0/10


100%|██████████| 10/10 [01:03<00:00,  6.32s/it]
100%|██████████| 10/10 [00:00<00:00, 11.35it/s]

Correct COT: 10/10
Correct memorization: 0/10





In [22]:
# Text-generation pipeline for Llama 3 70B Instruct, loaded in bfloat16.
# The Hugging Face token is read from the HF_TOKEN environment variable
# (dotenv.load_dotenv() is called in the import cell).
# NOTE(review): the captured output reports parameters offloaded to the CPU,
# which presumably explains the slow CoT run below — confirm on your hardware.
llama70b_pipe = transformers.pipeline(
        "text-generation",
        model="meta-llama/Meta-Llama-3-70B-Instruct",
        model_kwargs={"torch_dtype": torch.bfloat16},
        device_map="auto",
        token=os.getenv('HF_TOKEN')
    )

Loading checkpoint shards: 100%|██████████| 30/30 [00:33<00:00,  1.13s/it]
Some parameters are on the meta device device because they were offloaded to the cpu.


In [25]:
# Number of problems evaluated per run.
n=10
# Check the answer-only prompt with the 70B model on a freshly generated
# 20-problem set (vec_len=4, vec_mag=10); only the first n rows are used.
test_memo_prompt(llama70b_pipe, memo_prompt=memo_prompt, problems=make_dot_product_problem_set(4, 10, 20), n=n)

100%|██████████| 10/10 [00:25<00:00,  2.50s/it]

Max retries: 0/10
Correct memorization: 0/10





In [24]:
# CoT vs answer-only comparison for the 70B model on a freshly generated
# 100-problem set; only the first n rows are used.
# NOTE: this cell relies on `n` being defined by another cell, so it depends
# on execution order.
results[('llama70b', 4, 10)] = dot_test(llama70b_pipe, make_dot_product_problem_set(4, 10, 100), n=n, problem_prompt=problem_prompt, cot_prompt=cot_prompt, memo_prompt=memo_prompt)

100%|██████████| 10/10 [34:41<00:00, 208.10s/it]
100%|██████████| 10/10 [00:25<00:00,  2.51s/it]

Correct COT: 10/10
Correct memorization: 0/10





In [26]:
# Display the raw (cot_solutions, memo_solutions, problems) tuple stored for
# the 70B run.
results[('llama70b', 4, 10)]

(["To find the dot product of the two vectors, I'll multiply corresponding components and sum them up. Here's the step-by-step calculation:\n\n[9, 9, 2, 7] ⋅ [2, 4, 4, 8] =?\n\n1. Multiply the first components: 9 × 2 = 18\n2. Multiply the second components: 9 × 4 = 36\n3. Multiply the third components: 2 × 4 = 8\n4. Multiply the fourth components: 7 × 8 = 56\n5. Add up the products: 18 + 36 + 8 + 56 = 118\n\nSo, the dot product of the two vectors is:\n\n[9, 9, 2, 7] ⋅ [2, 4, 4, 8] = 118",
  "To find the dot product of the two vectors, I'll multiply corresponding components and sum them up. Here's the step-by-step calculation:\n\n[6, 5, 8, 5] ⋅ [9, 9, 9, 9] =?\n\n1. Multiply the first components: 6 × 9 = 54\n2. Multiply the second components: 5 × 9 = 45\n3. Multiply the third components: 8 × 9 = 72\n4. Multiply the fourth components: 5 × 9 = 45\n5. Add up the products: 54 + 45 + 72 + 45 = 216\n\nSo, the dot product of the two vectors is:\n\n[6, 5, 8, 5] ⋅ [9, 9, 9, 9] = 216",
  "To find