In [4]:
import pandas as pd
import numpy as np
import random
import transformers
import torch
import dotenv
import os
import matplotlib.pyplot as plt
import re
import string
from tqdm import tqdm

dotenv.load_dotenv()

True

In [5]:
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
    token=os.getenv('HF_TOKEN')
)

terminators = [
    pipeline.tokenizer.eos_token_id,
    pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

Loading checkpoint shards: 100%|██████████| 4/4 [00:38<00:00,  9.59s/it]


# Rule 110 Cellular Automaton

In [None]:
def make_cellular_problem_set(size, steps, num_problems, boundary='wrap'):
    if steps < 3:
        raise ValueError("Need 3 steps to have a solution and 2 intermediates")
    def int_to_binary_list(n, min_length=8):
        binary = bin(n)[2:]  # Convert to binary string and remove '0b' prefix
        binary_list = [int(b) for b in binary.zfill(min_length)]  # Pad with zeros if necessary
        return binary_list
    
    dict_110 = {
        (0, 0, 0): 0,
        (0, 0, 1): 1,
        (0, 1, 0): 1,
        (0, 1, 1): 1,
        (1, 0, 0): 0,
        (1, 0, 1): 1,
        (1, 1, 0): 1,
        (1, 1, 1): 0
    }

    def rule_110(prev):
        next_state = []
        for i in range(len(prev)):
            left = prev[(i-1) % len(prev)] if boundary == 'wrap' or (i > 0 and i < len(prev)-1) else boundary
            center = prev[i]
            right = prev[(i+1) % len(prev)] if boundary == 'wrap' or (i > 0 and i < len(prev)-1) else boundary
            pattern = (left, center, right)
            next_state.append(dict_110[pattern])
        return next_state
    
    def make_rule_110_problem(initial_state, steps):
        current_state = initial_state
        states = [current_state]
        for _ in range(steps):
            current_state = rule_110(current_state)
            states.append(current_state)
        return (''.join(str(x) for x in initial_state),
            ''.join(str(x) for x in states[-1]),
            ''.join(str(x) for x in states[1]),
            ''.join(str(x) for x in states[-2]))

    return pd.DataFrame(
        [make_rule_110_problem(int_to_binary_list(((i+1)*33581)%(2**(size))), steps) for i in range(num_problems)]
        , columns=['problem', 'correct_solution', 'intermediate_1', 'intermediate_2'])

In [None]:
make_cellular_problem_set(10, 3, 1000, boundary=0)

# SAT

In [None]:
from pysat.formula import CNF
from pysat.solvers import Glucose3

def solve_nsat(clauses):
    # Create a CNF formula
    cnf = CNF()
    for clause in clauses:
        cnf.append(clause)

    # Create a SAT solver
    with Glucose3(bootstrap_with=cnf) as solver:
        # Check if the formula is satisfiable
        if solver.solve():
            return solver.get_model()
        else:
            return None

In [None]:
def make_nsat_problem_set(vars_per_clause, num_clauses, num_problems):
    def make_nsat_problem(vars_per_clause, num_clauses):
        text_variables = [string.ascii_lowercase[i] for i in range(vars_per_clause)]
        text_problem = []
        pysat_problem = []
        for _ in range(num_clauses):
            clause = random.sample(range(vars_per_clause), 3)
            signs = [random.choice([-1, 1]) for _ in range(3)]
            pysat_clause = [signs[i]*(var+1) for i, var in enumerate(clause)]
            pysat_problem.append(pysat_clause)
            pysat_solution = solve_nsat(pysat_problem)
            if pysat_solution is None:
                text_solution = None
            else:
                text_solution_letters = [f"{'¬' if var <0 else ''}{text_variables[abs(var)-1]}" for i, var in enumerate(pysat_solution)]
                text_solution = f"{' ^ '.join(text_solution_letters)}"

            text_clause = [f"{'¬' if signs[i] == -1 else ''}{text_variables[var]}" for i, var in enumerate(clause)]
            text_problem.append(f"({' v '.join(text_clause)})")
        return ' ^ '.join(text_problem), text_solution, None, None

    return pd.DataFrame(
        [make_nsat_problem(vars_per_clause, num_clauses) for _ in range(num_problems)],
        columns=['problem', 'correct_solution', 'intermediate_1', 'intermediate_2'])

In [None]:
make_nsat_problem_set(3, 20, 1000)

# Dot Product

In [6]:
def make_dot_product_problem_set(vec_len, vec_mag, num_problems):
    if vec_len < 2:
        raise ValueError("Need vectors of length 2 or greater to have two intermediates")
    def make_dot_product_problem(vec_len):
        a = np.random.randint(0, vec_mag, vec_len)
        b = np.random.randint(0, vec_mag, vec_len)
        return (f"[{', '.join([str(x) for x in a])}] ⋅ [{', '.join([str(x) for x in b])}]"
        , np.dot(a, b)
        , a[0]*b[0],
        a[-1]*b[-1])

    return pd.DataFrame(
        [make_dot_product_problem(vec_len) for _ in range(num_problems)],
        columns=['problem', 'correct_solution', 'intermediate_1', 'intermediate_2'])

In [7]:
make_dot_product_problem_set(3, 10, 1000)

Unnamed: 0,problem,correct_solution,intermediate_1,intermediate_2
0,"[7, 7, 8] ⋅ [3, 7, 6]",118,21,48
1,"[7, 3, 4] ⋅ [9, 8, 3]",99,63,12
2,"[2, 9, 8] ⋅ [2, 9, 3]",109,4,24
3,"[3, 1, 4] ⋅ [2, 6, 6]",36,6,24
4,"[2, 4, 8] ⋅ [5, 8, 4]",74,10,32
...,...,...,...,...
995,"[2, 1, 2] ⋅ [4, 7, 8]",31,8,16
996,"[0, 6, 7] ⋅ [3, 3, 2]",32,0,14
997,"[8, 3, 3] ⋅ [1, 6, 2]",32,8,6
998,"[6, 9, 3] ⋅ [6, 0, 4]",48,36,12


In [8]:
def solve_problem_cot(problem, sys_prompt, cot_prompt):
    messages = [
    {"role": "system", "content": sys_prompt + ' ' + cot_prompt},
    {"role": "user", "content": problem},
    ]

    outputs = pipeline(
    messages,
    max_new_tokens=256,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
    pad_token_id=pipeline.tokenizer.eos_token_id
    )

    return outputs[0]['generated_text'][-1]['content']

def solve_problem_memo(problem, sys_prompt, memo_prompt, max_toks=10): # setting max_toks to 1 because we're doing small digit numbers
    messages = [
    {"role": "system", "content": sys_prompt + ' ' + memo_prompt},
    {"role": "user", "content": problem},
    ]

    outputs = pipeline(
    messages,
    max_new_tokens=max_toks,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
    pad_token_id=pipeline.tokenizer.eos_token_id
    )

    return outputs[0]['generated_text'][-1]['content']

In [9]:
problem_prompt = "What is the dot product of these two vectors?"
cot_prompt = "Show your work."
memo_prompt = "Answer with only a number."
n=100

In [10]:
def dot_test(dot_problems, problem_prompt=problem_prompt, cot_prompt=cot_prompt, memo_prompt=memo_prompt, n=n):
    cot_solutions = []
    cot_correct = 0
    for i, row in tqdm(list(dot_problems.iterrows())[:n]):
        cot_solutions.append(solve_problem_cot(row['problem'], problem_prompt, cot_prompt))
        cot_correct += str(row['correct_solution']) in cot_solutions[-1]

    memo_solutions = []
    memo_correct = 0
    for i, row in tqdm(list(dot_problems.iterrows())[:n]):
        memo_solutions.append(solve_problem_memo(row['problem'], problem_prompt, memo_prompt))
        memo_correct += str(row['correct_solution']) in memo_solutions[-1]

    print(cot_correct, memo_correct)
    return cot_solutions, memo_solutions

In [14]:
results = {}

In [None]:
results[(3, 10)] = dot_test(make_dot_product_problem_set(3, 10, n))

In [None]:
results[(2, 10)] = dot_test(make_dot_product_problem_set(2, 10, n))

In [None]:
results[(2, 20)] = dot_test(make_dot_product_problem_set(2, 20, n))

In [None]:
results[(5, 10)] = dot_test(make_dot_product_problem_set(5, 10, n))

In [11]:
def repeated_solve_problem_memo(problem, sys_prompt, memo_prompt, max_toks=10, return_retries=False):
    soln = solve_problem_memo(problem, sys_prompt, memo_prompt, max_toks=max_toks)
    i = 0
    while sum([len([n for n in soln if n.isnumeric()]) > 3]):
        if i >= 50: # tap out
            return soln if not return_retries else (soln, i)
        i += 1
        print('retrying', i, soln)
        soln = solve_problem_memo(problem, sys_prompt, memo_prompt, max_toks=max_toks)
    return soln if not return_retries else (soln, i)

def test_memo_prompt(memo_prompt):
    dot_problems = make_dot_product_problem_set(3, 10, n)
    memo_solutions = []
    memo_correct = 0
    for i, row in tqdm(list(dot_problems.iterrows())[:n]):
        memo_solutions.append(repeated_solve_problem_memo(row['problem'], problem_prompt, memo_prompt=memo_prompt, max_toks=20))
        memo_correct += str(row['correct_solution']) in memo_solutions[-1]
    print(sum([len([n for n in x if n.isnumeric()]) <= 2 for x in memo_solutions]))
    print('\n'.join(memo_solutions))

test_memo_prompt("""Answer with only a number. Do not do any calculations. For example:
                 Problem: [2, 9, -3] ⋅ [-6, -2, -9]
                 Solution: -3
                 """)

  9%|▉         | 9/100 [00:02<00:12,  7.27it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 100/100 [00:07<00:00, 12.79it/s]

100
34
56
22
70
48
28
0
21
16
30
32
42
36
54
35
56
63
18
54
35
18
27
70
30
34
56
35
14
54
36
21
21
42
21
32
63
9
23
48
54
64
60
20
6
32
56
24
24
72
14
30
24
30
20
60
30
70
42
33
5
35
70
48
56
20
34
12
8
0
32
20
26
30
14
72
20
63
42
49
0
35
56
18
21
18
56
32
21
56
33
31
27
24
14
30
6
30
20
14
21





In [12]:
one_shot_true_memo_prompt = """Answer with only a number. Do not do any calculations. For example:
                 Problem: [2, 9, -3] ⋅ [-6, -2, -9]
                 Solution: -3
                 """

one_shot_false_memo_prompt = """Answer with only a number. Do not do any calculations. For example:
                 Problem: [2, 9, -3] ⋅ [-6, -2, -9]
                 Solution: 6
                 """

def test_memo_prompt_correctness(memo_prompt, n):
    dot_problems = make_dot_product_problem_set(3, 10, n)
    memo_solutions = []
    memo_correct = 0
    max_retries = 0
    for i, row in tqdm(list(dot_problems.iterrows())[:n]):
        sol, retries = repeated_solve_problem_memo(row['problem'], problem_prompt, memo_prompt=memo_prompt, max_toks=20, return_retries=True)
        memo_solutions.append(sol)
        memo_correct += str(row['correct_solution']) in memo_solutions[-1]
        max_retries = max(max_retries, retries)
    print(f"Max retries: {max_retries}")
    print(f"Correctness: {memo_correct}")

In [13]:
test_memo_prompt_correctness(one_shot_true_memo_prompt, n=1000)
test_memo_prompt_correctness(one_shot_false_memo_prompt, n=1000)

  0%|          | 0/1000 [00:00<?, ?it/s]

 93%|█████████▎| 934/1000 [00:57<00:07,  9.12it/s]

retrying 1 7 + 3 + 0 = 10


100%|██████████| 1000/1000 [01:01<00:00, 16.30it/s]


Max retries: 1
Correctness: 59


  4%|▍         | 42/1000 [00:02<00:58, 16.40it/s]

retrying 1 9 + 6 + 4 = 19
retrying 2 9 + 6 + 4 = 19
retrying 3 9 + 6 + 4 = 19


  5%|▍         | 46/1000 [00:04<03:07,  5.07it/s]

retrying 4 9 + 6 + 4 = 19


 40%|███▉      | 396/1000 [00:25<01:06,  9.07it/s]

retrying 1 9 + 3 + 8 = 20


100%|██████████| 1000/1000 [01:03<00:00, 15.83it/s]

Max retries: 4
Correctness: 47





# Found a useable memo prompt
Correct example elicits more reliably
Correct example doesn't seem to significantly boost performance

In [15]:
results = {}

In [16]:
results[(3, 10)] = dot_test(make_dot_product_problem_set(3, 10, n), memo_prompt=one_shot_true_memo_prompt)

  0%|          | 0/100 [00:00<?, ?it/s]

100%|██████████| 100/100 [05:21<00:00,  3.22s/it]
100%|██████████| 100/100 [00:06<00:00, 16.56it/s]

100 6





In [19]:
results[(4, 10)] = dot_test(make_dot_product_problem_set(4, 10, n), memo_prompt=one_shot_true_memo_prompt)

  0%|          | 0/100 [00:00<?, ?it/s]

100%|██████████| 100/100 [06:53<00:00,  4.14s/it]
100%|██████████| 100/100 [00:06<00:00, 16.65it/s]

100 0





In [20]:
results[(4, 10)]

(["To find the dot product of two vectors, we multiply corresponding elements of the two vectors and add up the results. Here's the step-by-step calculation:\n\n[7, 9, 5, 2] ⋅ [9, 7, 1, 4] =?\n\n1. Multiply the first elements: 7 × 9 = 63\n2. Multiply the second elements: 9 × 7 = 63\n3. Multiply the third elements: 5 × 1 = 5\n4. Multiply the fourth elements: 2 × 4 = 8\n5. Add up the results: 63 + 63 + 5 + 8 = 139\n\nTherefore, the dot product of the two vectors is 139.",
  'To find the dot product of two vectors, we multiply corresponding elements of the two vectors and add them up.\n\nHere are the corresponding elements of the two vectors:\n\n* 9 (from the first vector) × 8 (from the second vector) = 72\n* 4 (from the first vector) × 6 (from the second vector) = 24\n* 6 (from the first vector) × 4 (from the second vector) = 24\n* 2 (from the first vector) × 6 (from the second vector) = 12\n\nNow, we add up these products:\n\n72 + 24 + 24 + 12 = 132\n\nSo, the dot product of the two vec

In [45]:
results[(2, 10)] = dot_test(make_dot_product_problem_set(2, 10, n), memo_prompt=one_shot_true_memo_prompt)

100%|██████████| 100/100 [04:06<00:00,  2.47s/it]
100%|██████████| 100/100 [00:07<00:00, 13.13it/s]

86 12





In [46]:
results[(3, 5)] = dot_test(make_dot_product_problem_set(3, 5, n), memo_prompt=one_shot_true_memo_prompt)

100%|██████████| 100/100 [05:19<00:00,  3.20s/it]
100%|██████████| 100/100 [00:07<00:00, 12.61it/s]

86 12





In [47]:
make_dot_product_problem_set(4, 10, 1000).to_csv('dot_product_problems_4_10.csv')