# Making Difficulty-Calibrated Datasets for Different LLMs

## Setup

In [1]:
import pandas as pd
import numpy as np
import random
import transformers
import torch
import dotenv
import os
import matplotlib.pyplot as plt
import re
import string
from tqdm import tqdm
import pysat

dotenv.load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:
def make_dot_product_problem_set(vec_len, vec_mag, num_problems, avoid_collisions=True):
    """Build a DataFrame of random integer dot-product problems.

    Each problem is a pair of vectors whose entries are drawn uniformly from
    ``[2, vec_mag)``. Alongside the problem text and its dot product, two
    intermediate values are recorded: the product of the first entries and the
    product of the last entries.

    Args:
        vec_len: Number of entries in each vector (at least 2, so the two
            intermediates come from different positions).
        vec_mag: Exclusive upper bound for vector entries (at least 3, because
            entries start at 2).
        num_problems: Number of problems (rows) to generate.
        avoid_collisions: When true, a problem is redrawn if the text of either
            intermediate value occurs as a substring of the problem text.

    Returns:
        A DataFrame with columns ``problem``, ``correct_solution``,
        ``intermediate_1`` and ``intermediate_2``. Duplicate problems are not
        filtered out.

    Raises:
        ValueError: If ``vec_len`` or ``vec_mag`` is too small, or if
            ``avoid_collisions`` is set and the size check against
            ``num_problems`` fails.
    """
    if vec_len < 2:
        raise ValueError("Need vectors of length 2 or greater to have two intermediates")
    # Entries are drawn from [2, vec_mag), so vec_mag == 2 would leave an empty range.
    if vec_mag < 3:
        raise ValueError(
            "We remove 0 and 1 from the vector magnitudes to avoid collisions, "
            "so vec_mag must be at least 3"
        )
    if avoid_collisions and not ((vec_mag-1)**vec_len > num_problems):
        raise ValueError("To avoid collisions, need a bigger space than the number of problems requested")

    def sample_problem():
        # Loop instead of recursing so a long run of rejected samples cannot
        # exhaust the recursion limit.
        while True:
            a = np.random.randint(2, vec_mag, vec_len)
            b = np.random.randint(2, vec_mag, vec_len)
            text = f"[{', '.join([str(x) for x in a])}] ⋅ [{', '.join([str(x) for x in b])}]"
            problem = (text, np.dot(a, b), a[0]*b[0], a[-1]*b[-1])
            collides = str(problem[2]) in text or str(problem[3]) in text
            # Both intermediates are only checked when collision avoidance is
            # requested; with it disabled the first sample is always accepted.
            if not (avoid_collisions and collides):
                return problem
            print(f"resampling {problem}")

    return pd.DataFrame(
        [sample_problem() for _ in range(num_problems)],
        columns=['problem', 'correct_solution', 'intermediate_1', 'intermediate_2'])

In [3]:
# Holds the most recently built pipeline, keyed by model id. At most one entry
# is kept: asking for a different model drops our reference to the previous one
# before the new one is loaded.
_PIPELINE_CACHE = {}


def _get_pipeline(model_id):
    """Return the text-generation pipeline for ``model_id``, building it only when the model changes."""
    if model_id not in _PIPELINE_CACHE:
        _PIPELINE_CACHE.clear()
        _PIPELINE_CACHE[model_id] = transformers.pipeline(
            "text-generation",
            model=model_id,
            model_kwargs={"torch_dtype": torch.bfloat16},
            device_map="auto",
            token=os.getenv('HF_TOKEN')
        )
    return _PIPELINE_CACHE[model_id]


def _generate_reply(model_id, problem, system_content, max_new_tokens):
    """Send one system + user chat to the model and return the text of the last generated message."""
    pipeline = _get_pipeline(model_id)

    # Stop on the tokenizer's EOS token or on "<|eot_id|>".
    # NOTE(review): "<|eot_id|>" matches the Llama 3 model this notebook runs;
    # other tokenizers may not define it - confirm before using other models.
    terminators = [
        pipeline.tokenizer.eos_token_id,
        pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]

    messages = [
        {"role": "system", "content": system_content},
        {"role": "user", "content": problem},
    ]

    outputs = pipeline(
        messages,
        max_new_tokens=max_new_tokens,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
        pad_token_id=pipeline.tokenizer.eos_token_id
    )

    return outputs[0]['generated_text'][-1]['content']


def solve_problem_cot(model_id, problem, sys_prompt, cot_prompt):
    """Ask the model to solve ``problem`` with up to 256 new tokens.

    Args:
        model_id: Model identifier passed to ``transformers.pipeline``.
        problem: Problem text, sent as the user message.
        sys_prompt: First part of the system message.
        cot_prompt: Second part of the system message, joined to ``sys_prompt``
            with a single space.

    Returns:
        The ``content`` of the last message in the generated conversation.
    """
    return _generate_reply(model_id, problem, sys_prompt + ' ' + cot_prompt, 256)


def solve_problem_memo(model_id, problem, sys_prompt, memo_prompt, max_toks=10):
    """Ask the model to solve ``problem`` with a small generation budget.

    Args:
        model_id: Model identifier passed to ``transformers.pipeline``.
        problem: Problem text, sent as the user message.
        sys_prompt: First part of the system message.
        memo_prompt: Second part of the system message, joined to ``sys_prompt``
            with a single space.
        max_toks: Maximum number of new tokens to generate (default 10; the
            expected answers are short numbers).

    Returns:
        The ``content`` of the last message in the generated conversation.
    """
    return _generate_reply(model_id, problem, sys_prompt + ' ' + memo_prompt, max_toks)

In [4]:
# System-prompt text; test_memo_prompt passes it as sys_prompt to the solver.
problem_prompt = "What is the dot product of these two vectors?"
# Presumably meant as the cot_prompt argument of solve_problem_cot; no call
# site is visible in this part of the notebook.
cot_prompt = "Show your work."
# Instruction plus one worked example, appended to the system prompt by
# solve_problem_memo. The indentation inside the literal is part of the text
# sent to the model.
memo_prompt = """Answer with only a number. Do not do any calculations. For example:
                 Problem: [2, 9, -3] ⋅ [-6, -2, -9]
                 Solution: -3
                 """
# Problem count read as a global by the two-argument test_memo_prompt.
n=100

In [5]:
def repeated_solve_problem_memo(model, problem, sys_prompt, memo_prompt, max_toks=10,
                                return_retries=False, max_retries=50, max_digits=3):
    """Query ``solve_problem_memo`` until the answer contains few enough digits.

    An answer is rejected while it contains more than ``max_digits`` numeric
    characters (per ``str.isnumeric``); each rejection triggers a fresh query.

    Args:
        model: Model identifier forwarded to ``solve_problem_memo``.
        problem: Problem text forwarded to ``solve_problem_memo``.
        sys_prompt: System prompt forwarded to ``solve_problem_memo``.
        memo_prompt: Prompt suffix forwarded to ``solve_problem_memo``.
        max_toks: Generation budget forwarded to ``solve_problem_memo``.
        return_retries: When true, return ``(answer, retries)`` instead of just
            the answer.
        max_retries: Number of retries after which the latest answer is
            returned even if it is still rejected.
        max_digits: Largest number of numeric characters an accepted answer
            may contain.

    Returns:
        The last answer, or ``(answer, retries)`` when ``return_retries`` is set.
    """
    def has_too_many_digits(text):
        return sum(ch.isnumeric() for ch in text) > max_digits

    soln = solve_problem_memo(model, problem, sys_prompt, memo_prompt, max_toks=max_toks)
    retries = 0
    while has_too_many_digits(soln) and retries < max_retries:
        retries += 1
        print('retrying', retries, soln)
        soln = solve_problem_memo(model, problem, sys_prompt, memo_prompt, max_toks=max_toks)
    return (soln, retries) if return_retries else soln

def test_memo_prompt(model, memo_prompt):
    """Run the memo prompt over a fresh problem set and print the raw answers.

    Generates ``n`` (module-level) problems with 3-entry vectors and entries
    below 10, queries the model for each, then prints how many answers contain
    at most two numeric characters, followed by every answer on its own line.

    Note: a three-argument ``test_memo_prompt`` defined later in this notebook
    replaces this definition once its cell has run.

    Args:
        model: Model identifier forwarded to ``repeated_solve_problem_memo``.
        memo_prompt: Prompt suffix forwarded to ``repeated_solve_problem_memo``.
    """
    dot_problems = make_dot_product_problem_set(3, 10, n)
    memo_solutions = []
    for _, row in tqdm(list(dot_problems.iterrows())[:n]):
        # `model` must be the first argument; without it every later argument
        # shifts by one position and the call fails.
        memo_solutions.append(
            repeated_solve_problem_memo(
                model, row['problem'], problem_prompt,
                memo_prompt=memo_prompt, max_toks=20))
    print(sum(sum(ch.isnumeric() for ch in answer) <= 2 for answer in memo_solutions))
    print('\n'.join(memo_solutions))

In [6]:
def test_memo_prompt(model, memo_prompt, n):
    """Score the memo prompt on ``n`` freshly generated dot-product problems.

    Problems use 3-entry vectors with entries below 10. For each problem the
    model is queried through ``repeated_solve_problem_memo`` with a 20-token
    budget. An answer counts as correct when the text of the true dot product
    occurs anywhere inside it. Prints the largest retry count seen and the
    number of correct answers.

    Args:
        model: Model identifier forwarded to ``repeated_solve_problem_memo``.
        memo_prompt: Prompt suffix forwarded to ``repeated_solve_problem_memo``.
        n: Number of problems to generate and evaluate.
    """
    problems = make_dot_product_problem_set(3, 10, n)
    n_correct = 0
    worst_retries = 0
    for _, row in tqdm(list(problems.iterrows())[:n]):
        answer, n_retries = repeated_solve_problem_memo(
            model,
            row['problem'],
            problem_prompt,
            memo_prompt=memo_prompt,
            max_toks=20,
            return_retries=True,
        )
        # Substring match: the expected value only has to appear in the answer.
        if str(row['correct_solution']) in answer:
            n_correct += 1
        if n_retries > worst_retries:
            worst_retries = n_retries
    print(f"Max retries: {worst_retries}")
    print(f"Correctness: {n_correct}")

# Llama 3 8B reproduction

In [7]:
# Evaluate the memo prompt on 10 problems with the Llama 3 8B instruct model.
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
test_memo_prompt(model=model_id, memo_prompt=memo_prompt, n=10)

  0%|          | 0/10 [00:00<?, ?it/s]


RuntimeError: Failed to import transformers.pipelines because of the following error (look up to see its traceback):
No module named 'optimum'