# Create balanced data with various numbers of samples (N)
This is done by sampling:
- all 100 1-digit additions
- 900 * N / 10000 2-digit additions
- N - (100 + 900 * N / 10000) 3-digit additions

In [1]:
import math
import random
import os

In [2]:
def numCarryOps(a, b):
    """Return the number of carry operations in the decimal addition a + b.

    Every carry lowers the digit sum of the result by exactly 9, so the count
    is (digit_sum(a) + digit_sum(b) - digit_sum(a + b)) / 9.

    Args:
        a: First operand; anything accepted by int() (ints or digit strings).
        b: Second operand, same form as a.

    Returns:
        The carry count as an int.  Intended for non-negative operands: a
        minus sign in str(n) cannot be converted to a digit.
    """
    first, second = int(a), int(b)
    # Digit sums of the two operands and of their sum, in that order.
    digit_totals = [
        sum(int(ch) for ch in str(value))
        for value in (first, second, first + second)
    ]
    lost = digit_totals[0] + digit_totals[1] - digit_totals[2]
    return int(lost / 9)

In [3]:
def create_balanced_data(filename, total_num_examples=10000, num_digits = 3):
    """Write an addition dataset whose carry counts are balanced to `filename`.

    The file starts with all 100 single-digit sums.  Then, for each digit level
    from 2 to `num_digits`, random operand pairs are drawn and kept only while
    the bucket for their carry count is still below the per-bucket target.

    Args:
        filename: Path of the text file to (over)write, one 'a+b=c' per line.
        total_num_examples: Total number of lines to aim for.
        num_digits: Largest digit level to sample.  The quota list below has
            only three entries, so values above 3 would index past its end.
    """
    two_digit_quota = int(900 * total_num_examples / 10000)
    # Per-level quotas: 1-digit, 2-digit, and whatever remains for 3-digit.
    num_digit_list = [100, two_digit_quota, total_num_examples - 100 - two_digit_quota]
    print(num_digit_list)

    # Each carry count (0..num_digits) should end up with the same number of examples.
    per_carry_cap = math.ceil(total_num_examples / (num_digits + 1))
    num_carry_list = [0] * (num_digits + 1)

    with open(filename, 'w') as out:
        written = 0

        # Exhaustive single-digit section, ordered by a and then b.
        for pair_index in range(100):
            a, b = divmod(pair_index, 10)
            out.write(f'{a}+{b}={a + b}\n')
            written += 1
            num_carry_list[numCarryOps(a, b)] += 1

        for num_digit in range(2, num_digits + 1):
            accepted = 0
            print(num_digit,  written, num_carry_list)
            # Operands are drawn from 0 .. 10**num_digit - 1, so they may have
            # fewer than num_digit digits.
            upper = 10 ** num_digit - 1
            while accepted < num_digit_list[num_digit - 1] and written < total_num_examples:
                a = random.randint(0, upper)
                b = random.randint(0, upper)

                carries = numCarryOps(a, b)
                if num_carry_list[carries] >= per_carry_cap:
                    # This carry bucket is already full; draw again.
                    continue

                out.write(f'{a}+{b}={a + b}\n')
                num_carry_list[carries] += 1
                accepted += 1
                written += 1

    print(num_carry_list)

In [43]:
# Build the 10,000-example balanced training file unless it is already on disk.
total_num_examples = 10000
num_digits = 3
input_file_path = f'train_{num_digits}digit_{total_num_examples}.txt'
if os.path.exists(input_file_path):
    print(f'File {input_file_path} already exists')
else:
    create_balanced_data(input_file_path, total_num_examples=total_num_examples, num_digits=num_digits)

File train_3digit_10000.txt already exists


In [21]:
# Build the 20,000-example balanced training file unless it is already on disk.
total_num_examples = 20000
num_digits = 3
input_file_path = f'train_{num_digits}digit_{total_num_examples}.txt'
if os.path.exists(input_file_path):
    print(f'File {input_file_path} already exists')
else:
    create_balanced_data(input_file_path, total_num_examples=total_num_examples, num_digits=num_digits)

[100, 1800, 18100]
2 100 [55, 45, 0, 0]
3 1900 [545, 893, 462, 0]
[5000, 5000, 5000, 5000]


In [22]:
# Build the 40,000-example balanced training file unless it is already on disk.
total_num_examples = 40000
num_digits = 3
input_file_path = f'train_{num_digits}digit_{total_num_examples}.txt'
if os.path.exists(input_file_path):
    print(f'File {input_file_path} already exists')
else:
    create_balanced_data(input_file_path, total_num_examples=total_num_examples, num_digits=num_digits)

[100, 3600, 36300]
2 100 [55, 45, 0, 0]
3 3700 [1120, 1668, 912, 0]
[10000, 10000, 10000, 10000]


In [5]:
# create dataset with smaller number of examples, sampled from the above dataset

def create_subset_dataset(input_file_path='train_3digit_10000.txt', num_samples=10000):
    """Sample a smaller training file from an existing addition dataset.

    Keeps the first 100 lines, then randomly samples from lines[100:1000] and
    from lines[1000:], prints the carry-count histogram of the selection, and
    writes it to 'train_3digit_{num_samples}.txt' unless that file exists.

    Assumes the input is laid out as 100 one-digit lines, then 900 two-digit
    lines, then three-digit lines -- confirm against the file being sampled.

    Args:
        input_file_path: Source dataset, one 'a+b=c' example per line.
        num_samples: Total number of lines in the subset.
    """
    with open(input_file_path, 'r') as src:
        lines = list(src)

    two_digit_count = int(900 * num_samples / 10000)
    three_digit_count = num_samples - 100 - two_digit_count

    selected_lines = lines[:100]  # the single-digit section is always kept whole
    selected_lines += random.sample(lines[100:1000], two_digit_count)
    selected_lines += random.sample(lines[1000:], three_digit_count)

    # Histogram of carry counts (0..3) over the selected examples.
    num_carry_list = [0] * 4
    for line in selected_lines:
        left, right = line.split('+')
        right = right.split('=')[0]
        num_carry_list[numCarryOps(left, right)] += 1
    print(num_carry_list)

    output_file_path = f'train_3digit_{num_samples}.txt'
    if os.path.exists(output_file_path):
        print(f'File {output_file_path} already exists')
        return

    with open(output_file_path, 'w') as out:
        out.writelines(selected_lines)
        print(f'created {num_samples} number of examples and saved to {output_file_path}')
        
    

In [18]:
num_samples = [500, 1000, 2000, 3000, 4000, 5000]
for num_sample in num_samples:
    create_subset_dataset(num_samples=num_sample)

[163, 141, 97, 99]
created 500 number of examples and saved to train_3digit_500.txt
[283, 268, 236, 213]
created 1000 number of examples and saved to train_3digit_1000.txt
[535, 502, 481, 482]
created 2000 number of examples and saved to train_3digit_2000.txt
[781, 782, 748, 689]
created 3000 number of examples and saved to train_3digit_3000.txt
[1020, 1016, 958, 1006]
created 4000 number of examples and saved to train_3digit_4000.txt
[1279, 1271, 1229, 1221]
created 5000 number of examples and saved to train_3digit_5000.txt


In [6]:
create_subset_dataset(num_samples=250)

[92, 82, 39, 37]
created 250 number of examples and saved to train_3digit_250.txt


In [3]:
# Every single-digit sum 'a+b=c' for a, b in 0..9, ordered by a and then b.
one_digit_lines = [f'{a}+{b}={a + b}\n' for a in range(10) for b in range(10)]
# Every sum of two operands in 0..99, in the same ordering.
two_digit_lines = [f'{a}+{b}={a + b}\n' for a in range(100) for b in range(100)]

# create 1 digit addition dataset (the 100 sums repeated 20 times)
filename = 'train_1digit_100.txt'
if not os.path.exists(filename):
    with open(filename, 'w') as f:
        f.writelines(one_digit_lines * 20)

# Create 2 digit addition dataset (each operand pair exactly once)
filename = 'train_2digit_10000.txt'
if not os.path.exists(filename):
    with open(filename, 'w') as f:
        f.writelines(two_digit_lines)


# Variant: 99 repetitions of the single-digit sums followed by every pair in 0..99.
filename = 'train_2digit_10000_v2.txt'
if not os.path.exists(filename):
    with open(filename, 'w') as f:
        f.writelines(one_digit_lines * 99)
        f.writelines(two_digit_lines)

In [5]:
def create_2digit_samples(num_sample=2000):
    """Write 'train_2digit_{num_sample}.txt' unless it already exists.

    The file holds all 100 single-digit sums followed by num_sample - 100
    random sums with both operands drawn uniformly from 0..99.

    Args:
        num_sample: Total number of lines to write.
    """
    filename = f'train_2digit_{num_sample}.txt'
    if os.path.exists(filename):
        return

    with open(filename, 'w') as out:
        # Exhaustive single-digit section, ordered by a and then b.
        for pair_index in range(100):
            a, b = divmod(pair_index, 10)
            out.write(f'{a}+{b}={a + b}\n')

        remaining = num_sample - 100
        for _ in range(remaining):
            a = random.randint(0, 99)
            b = random.randint(0, 99)
            out.write(f'{a}+{b}={a + b}\n')


create_2digit_samples(num_sample=2000)
create_2digit_samples(num_sample=5000)


# Create Non-overlapping test dataset


In [103]:
total_num_examples = 10000
num_digits = 3
input_file_path = f'train_{num_digits}digit_{total_num_examples}.txt'
output_file_path = f'train_{num_digits}digit_{total_num_examples}_nonoverlap.txt'

# Distinct training lines (newline included) that must not appear in the output.
with open(input_file_path, 'r') as f:
    lines_to_remove = set(f.readlines())

print(len(lines_to_remove))

# Enumerate every sum with operands in 0..999 and keep those not seen in training.
with open(output_file_path, 'w') as f:
    for x in range(1000):
        for y in range(1000):
            line_to_add = f'{x}+{y}={x+y}\n'
            if line_to_add not in lines_to_remove:
                f.write(line_to_add)
                continue
            lines_to_remove.remove(line_to_add)

9891


In [51]:
# Draw a random 10,000-line test set from the non-overlapping pool.
num_test_samples = 10000
input_file_path = f'train_{num_digits}digit_{total_num_examples}_nonoverlap.txt'

with open(input_file_path, 'r') as f:
    lines = f.readlines()

random.shuffle(lines)
with open(f'test_{num_digits}digit_{num_test_samples}.txt', 'w') as f2:
    f2.writelines(lines[:num_test_samples])

In [104]:
# Draw a random 1,000-line test set from the non-overlapping pool.
num_test_samples = 1000
input_file_path = f'train_{num_digits}digit_{total_num_examples}_nonoverlap.txt'

with open(input_file_path, 'r') as f:
    lines = f.readlines()

random.shuffle(lines)
with open(f'test_{num_digits}digit_{num_test_samples}.txt', 'w') as f2:
    f2.writelines(lines[:num_test_samples])

# Make 1-digit and 2-digit test data

In [49]:
# Exhaustive single-digit test file: every x+y with x, y in 0..9.
filename = 'test_1digit_100.txt'

if os.path.exists(filename):
    print(f'File {filename} already exists')
else:
    with open(filename, 'w') as f:
        f.writelines(f'{x}+{y}={x+y}\n' for x in range(10) for y in range(10))

In [50]:
# Exhaustive two-digit test file: every x+y with x, y in 0..99.
filename = 'test_2digit_10000.txt'

if os.path.exists(filename):
    print(f'File {filename} already exists')
else:
    with open(filename, 'w') as f:
        f.writelines(f'{x}+{y}={x+y}\n' for x in range(100) for y in range(100))

In [3]:
def create_subset_multiplication_dataset(input_file_path='train_multiplication_3000.txt', num_samples=10000):
    """Sample a smaller multiplication training file from an existing one.

    Keeps the first 100 lines of the input, adds num_samples - 100 lines drawn
    without replacement from the rest, and writes the result to
    'train_multiplication_{num_samples}.txt' unless that file already exists.

    Args:
        input_file_path: Source dataset, one example per line.
        num_samples: Total number of lines in the subset; random.sample raises
            ValueError if the input has fewer than num_samples lines.
    """
    with open(input_file_path, 'r') as src:
        all_lines = list(src)

    # The first 100 lines are always kept; the remainder is the sampling pool.
    head, pool = all_lines[:100], all_lines[100:]
    selected_lines = head + random.sample(pool, num_samples - 100)

    output_file_path = f'train_multiplication_{num_samples}.txt'
    if os.path.exists(output_file_path):
        print(f'File {output_file_path} already exists')
        return

    with open(output_file_path, 'w') as out:
        out.writelines(selected_lines)
        print(f'created {num_samples} number of examples and saved to {output_file_path}')
        
    

In [4]:
num_samples = [1000, 2000]
for num_sample in num_samples:
    create_subset_multiplication_dataset(num_samples=num_sample)

created 1000 number of examples and saved to train_multiplication_1000.txt
created 2000 number of examples and saved to train_multiplication_2000.txt


In [7]:
def make_multiplication_data(num_samples=5000):
    """Extend 'train_multiplication_3000.txt' with random products.

    Appends num_samples - 3000 random 'a*b=c' lines (operands in 0..99, never
    both single-digit) to the lines of the base file and writes everything to
    'train_multiplication_{num_samples}.txt'.  Does nothing but print a notice
    if that output file already exists.

    Args:
        num_samples: Size used in the output name; the number of appended
            lines is num_samples - 3000 regardless of the base file's length.
    """
    with open('train_multiplication_3000.txt', 'r') as src:
        lines = src.readlines()

    out_file_name = f'train_multiplication_{num_samples}.txt'
    if os.path.exists(out_file_name):
        print(f'{out_file_name} already exists')
        return

    print('making multiplication data file: ', out_file_name)

    extra_needed = num_samples - 3000
    extras = []
    while len(extras) < extra_needed:
        a = random.randint(0, 99)
        b = random.randint(0, 99)
        if max(a, b) < 10:
            # Both operands are single-digit; reject the pair and redraw.
            continue
        extras.append(f'{a}*{b}={a * b}\n')

    with open(out_file_name, 'w') as out:
        out.writelines(lines + extras)

In [8]:
for num_sample in [5000, 7000]:
    make_multiplication_data(num_samples=num_sample)

train_multiplication_5000.txt already exists
making multiplication data file:  train_multiplication_7000.txt


## Division

In [4]:
def make_division_data(num_samples=5000):
    """Extend 'train_division_3000.txt' with random quotients.

    Appends num_samples - 3000 random 'a/b=c' lines (a in 100..200, b in
    1..101, c the float result of a / b) to the lines of the base file and
    writes everything to 'train_division_{num_samples}.txt'.  Does nothing but
    print a notice if that output file already exists.

    Args:
        num_samples: Size used in the output name; the number of appended
            lines is num_samples - 3000 regardless of the base file's length.
    """
    with open('train_division_3000.txt', 'r') as src:
        lines = src.readlines()

    out_file_name = f'train_division_{num_samples}.txt'
    if os.path.exists(out_file_name):
        print(f'{out_file_name} already exists')
        return

    print('making division data file: ', out_file_name)

    extras = []
    for _ in range(num_samples - 3000):
        a = random.randint(100, 200)
        b = random.randint(1, 101)
        extras.append(f'{a}/{b}={a / b}\n')

    with open(out_file_name, 'w') as out:
        out.writelines(lines + extras)

In [5]:
for num_sample in [5000, 7000]:
    make_division_data(num_samples=num_sample)

making division data file:  train_division_5000.txt
making division data file:  train_division_7000.txt


In [6]:
out_file_name = f'train_division_7000.txt'

# read our train file and create a set of all the prompts
# we want to make sure we don't have any overlap between train and test
lines_to_remove = set()
with open(f'{out_file_name}', 'r') as f:
    for line in f.readlines():
        lines_to_remove.add(line.strip())

# Reduce each training line to its prompt ('a/b=', everything up to and
# including the first '=').  A training line starts with a candidate prompt
# exactly when its own prompt equals it, so one set lookup replaces a scan over
# every training line for each candidate, and the set is never mutated while
# it is being iterated.
train_prompts = {line.partition('=')[0] + '=' for line in lines_to_remove if '=' in line}

# now let's create our test file
out_file_name = 'test_division2.txt'

print('making non-overlapping test file: ', out_file_name)
with open(f'{out_file_name}', 'w') as f:
    for a in range(100, 200):
        for b in range(1, 100):
            line_sample = f'{a}/{b}='
            # Only prompts absent from training are written; the test file
            # holds bare prompts with no answer after the '='.
            if line_sample not in train_prompts:
                f.write(line_sample + '\n')

making non-overlapping test file:  test_division2.txt


In [10]:
out_file_name = f'train_multiplication_7000.txt'

# read our train file and create a set of all the prompts
# we want to make sure we don't have any overlap between train and test
lines_to_remove = set()
with open(f'{out_file_name}', 'r') as f:
    for line in f.readlines():
        lines_to_remove.add(line.strip())

# Reduce each training line to its prompt ('a*b=', everything up to and
# including the first '=').  A training line starts with a candidate prompt
# exactly when its own prompt equals it, so one set lookup replaces a scan over
# every training line for each candidate, and the set is never mutated while
# it is being iterated.
train_prompts = {line.partition('=')[0] + '=' for line in lines_to_remove if '=' in line}

# now let's create our test file
out_file_name = 'test_multiplication2.txt'

print('making non-overlapping test file: ', out_file_name)
with open(f'{out_file_name}', 'w') as f:
    for a in range(100):
        for b in range(100):
            line_sample = f'{a}*{b}='
            # Only prompts absent from training are written; the test file
            # holds bare prompts with no answer after the '='.
            if line_sample not in train_prompts:
                f.write(line_sample + '\n')

making non-overlapping test file:  test_multiplication2.txt


In [11]:
# Draw 1,000 random prompts from the non-overlapping multiplication test pool.
out_file_name = 'test_multiplication2.txt'
with open(out_file_name, 'r') as f:
    lines = f.readlines()

random.shuffle(lines)
with open(f'test_multiplication2_1000.txt', 'w') as f2:
    f2.writelines(lines[:1000])

In [12]:
# Draw 100 random prompts from the non-overlapping multiplication test pool.
out_file_name = 'test_multiplication2.txt'
with open(out_file_name, 'r') as f:
    lines = f.readlines()

random.shuffle(lines)
with open(f'test_multiplication2_100.txt', 'w') as f2:
    f2.writelines(lines[:100])

# Algorithmic Reasoning for Multiplication

In [3]:
x = 128
def list_to_string(a):
    """Return str(a) with every space character removed, e.g. '[1,2,8]'."""
    return ''.join(str(a).split(' '))

def num_to_list(num):
    """Return the decimal digits of `num` (an int or digit string) as a list of ints."""
    return list(map(int, str(num)))

list_x = num_to_list(x)
print(list_x)
print(list_to_string(list_x))



[1, 2, 8]
[1,2,8]


In [37]:
def get_input_string(x,y):
    """Return the prompt block for the product x*y: an 'Input:' line, then 'x*y'."""
    return 'Input:\n' + str(x) + '*' + str(y) + '\n'

def get_output_string(x,y):
    """Build the scratchpad target text for the long multiplication x * y.

    The digits of y are processed from least to most significant.  Each step
    records the partial product A = digit * x, the place value k = 10**i, the
    shifted product B = A * k and the running total C.  The text ends with an
    END marker, the closing scratch tag and the space-separated digits of the
    final total.

    Args:
        x: Multiplicand, an int or digit string.
        y: Multiplier, an int or digit string.

    Returns:
        The multi-line target string.
    """
    x, y = str(x), str(y)
    digits_x, digits_y = num_to_list(x), num_to_list(y)
    x_repr = list_to_string(digits_x)

    header = [
        'Target:',
        '<scratch>',
        f'{x_repr} has {len(x)} digits.',
        f'{list_to_string(digits_y)} has {len(y)} digits.',
    ]

    running = 0
    steps = []
    # Walk the multiplier from its last digit to its first.
    for i, b in enumerate(reversed(digits_y)):
        scale = 10 ** i
        partial = b * int(x)
        shifted = partial * scale
        previous, running = running, running + shifted
        steps.append(
            f'{x_repr} * {b} , A={list_to_string(num_to_list(partial))} , k={scale} , '
            f'B={list_to_string(num_to_list(shifted))} , C={previous}+{shifted}={running}'
        )

    # Steps are newline-separated; the END marker follows the last step on the same line.
    body = '\n'.join(header) + '\n' + '\n'.join(steps) + ' , END\n</scratch>\n'
    return body + ' '.join(str(running)) + '\n'

In [38]:
x,y = 128, 367
input_str = get_input_string(x,y)
output_str = get_output_string(x,y)

print(input_str+output_str)


Input:
128*367
Target:
<scratch>
[1,2,8] has 3 digits.
[3,6,7] has 3 digits.
[1,2,8] * 7 , A=[8,9,6] , k=1 , B=[8,9,6] , C=0+896=896
[1,2,8] * 6 , A=[7,6,8] , k=10 , B=[7,6,8,0] , C=896+7680=8576
[1,2,8] * 3 , A=[3,8,4] , k=100 , B=[3,8,4,0,0] , C=8576+38400=46976 , END
</scratch>
4 6 9 7 6



In [5]:
# Draw a random 1,000-line sample from 'test_multiplication_7000.txt'.
num_test_samples = 1000
input_file_path = 'test_multiplication_7000.txt'

with open(input_file_path, 'r') as f:
    lines = f.readlines()

random.shuffle(lines)
with open(f'test_multiplication_{num_test_samples}.txt', 'w') as f2:
    f2.writelines(lines[:num_test_samples])

# Algorithmic Reasoning for Subtraction

In [5]:
x = 128
def list_to_string(a):
    """Render `a` with str() and drop all space characters from the result."""
    rendered = str(a)
    return ''.join(ch for ch in rendered if ch != ' ')

def num_to_list(num):
    """Split `num` (an int or digit string) into a list of its decimal digits."""
    digits = []
    for ch in str(num):
        digits.append(int(ch))
    return digits

list_x = num_to_list(x)
print(list_x)
print(list_to_string(list_x))



[1, 2, 8]
[1,2,8]


In [96]:
def get_input_string(x,y):
    """Return the prompt block for the subtraction x-y: an 'Input:' line, then 'x-y'."""
    return 'Input:\n{}-{}\n'.format(str(x), str(y))

def get_output_string(x,y):
    """Build the scratchpad target text for the subtraction x - y.

    The operands are first ordered so that the larger one is on top; when a
    swap is needed it is recorded in the trace and the final answer is
    prefixed with '-'.  The digits are then subtracted from the least
    significant end with a borrow flag C (0 or -1), one trace line per digit,
    followed by an END line and the space-separated answer digits.

    Args:
        x: Minuend, an int or digit string.  Expected to be non-negative, as
            every character is converted to a digit by num_to_list.
        y: Subtrahend, same form as x.

    Returns:
        The multi-line target string.
    """
    operator = '-'
    output_str = f'Target:\n<scratch>\n'

    x, y = str(x), str(y)
    len_x, len_y = len(x), len(y)
    list_x, list_y = num_to_list(x), num_to_list(y)

    output_str += f'{list_to_string(list_x)} has {len_x} digits.\n'
    output_str += f'{list_to_string(list_y)} has {len_y} digits.\n'

    x_y = 1
    # x and y are strings at this point, so compare their integer values: a
    # plain string comparison is lexicographic and would claim '9' >= '10'.
    if int(x) >= int(y):
        output_str += f'{x}>={y}\n'
    else:
        # Swap so the larger operand is on top; the sign is restored at the end.
        x_y = 0
        output_str += f'{x}<{y} : {x}-{y}=-({y}-{x})\n'
        x, y = y, x
        len_x, len_y = len(x), len(y)
        list_x, list_y = list_y, list_x

    C=0
    A=[]
    total_len = max(len_x, len_y)
    for i in range(total_len):
        # Current least-significant digits; an exhausted operand contributes 0.
        a = list_x[-1] if i < len_x else 0
        b = list_y[-1] if i < len_y else 0

        borrowed = a - b - abs(C) < 0
        if borrowed:
            c = a - b + 10 - abs(C)
            update = f"{a}{operator}{b}-{abs(C)}+10={c}"
        else:
            c = a - b - abs(C)
            update = f"{a}{operator}{b}-{abs(C)}={c}"

        output_str += f'{list_to_string(list_x)} {operator} {list_to_string(list_y)} , A={list_to_string(A)} , C={C} , {update} , A->{c} , C->{-1 * borrowed}\n'

        A.insert(0, c)
        C = -1 * borrowed

        list_x = list_x[:-1]
        list_y = list_y[:-1]

    output_str += f'{list_to_string(list_x)} - {list_to_string(list_y)} , A={list_to_string(A)} , END\n</scratch>\n'

    if x_y == 0:
        output_str += '-'
    for a in A:
        output_str += f'{a} '

    # Drop the trailing space left by the digit loop.
    return output_str[:-1]+'\n'

In [97]:
x,y = 29, 510
input_str = get_input_string(x,y)
output_str = get_output_string(x,y)

print(input_str+output_str)
print(len(input_str+output_str))


Input:
29-510
Target:
<scratch>
[2,9] has 2 digits.
[5,1,0] has 3 digits.
29<510 : 29-510=-(510-29)
[5,1,0] - [2,9] , A=[] , C=0 , 0-9-0+10=1 , A->1 , C->-1
[5,1] - [2] , A=[1] , C=-1 , 1-2-1+10=8 , A->8 , C->-1
[5] - [] , A=[8,1] , C=-1 , 5-0-1=4 , A->4 , C->0
[] - [] , A=[4,8,1] , END
</scratch>
-4 8 1

306


In [100]:
x,y = 29, 570
input_str = get_input_string(x,y)
output_str = get_output_string(x,y)

print(input_str+output_str)
print(len(input_str+output_str))


Input:
29-570
Target:
<scratch>
[2,9] has 2 digits.
[5,7,0] has 3 digits.
29<570 : 29-570=-(570-29)
[5,7,0] - [2,9] , A=[] , C=0 , 0-9-0+10=1 , A->1 , C->-1
[5,7] - [2] , A=[1] , C=-1 , 7-2-1=4 , A->4 , C->0
[5] - [] , A=[4,1] , C=0 , 5-0-0=5 , A->5 , C->0
[] - [] , A=[5,4,1] , END
</scratch>
-5 4 1

301


In [98]:
x,y = 128, 367
input_str = get_input_string(x,y)
output_str = get_output_string(x,y)

print(input_str+output_str)
print(len(input_str+output_str))


Input:
128-367
Target:
<scratch>
[1,2,8] has 3 digits.
[3,6,7] has 3 digits.
128<367 : 128-367=-(367-128)
[3,6,7] - [1,2,8] , A=[] , C=0 , 7-8-0+10=9 , A->9 , C->-1
[3,6] - [1,2] , A=[9] , C=-1 , 6-2-1=3 , A->3 , C->0
[3] - [1] , A=[3,9] , C=0 , 3-1-0=2 , A->2 , C->0
[] - [] , A=[2,3,9] , END
</scratch>
-2 3 9

312


In [99]:
x,y = 367, 128
input_str = get_input_string(x,y)
output_str = get_output_string(x,y)

print(input_str+output_str)


Input:
367-128
Target:
<scratch>
[3,6,7] has 3 digits.
[1,2,8] has 3 digits.
367>=128
[3,6,7] - [1,2,8] , A=[] , C=0 , 7-8-0+10=9 , A->9 , C->-1
[3,6] - [1,2] , A=[9] , C=-1 , 6-2-1=3 , A->3 , C->0
[3] - [1] , A=[3,9] , C=0 , 3-1-0=2 , A->2 , C->0
[] - [] , A=[2,3,9] , END
</scratch>
2 3 9



# Algorithmic Reasoning for Subtraction  ver2.

In [3]:
x = 128
def list_to_string(a):
    """Return the str() form of `a` without any space characters."""
    return str(a).translate({ord(' '): None})

def num_to_list(num):
    """Return the digits of `num` (an int or digit string) as ints, most significant first."""
    text = str(num)
    return [int(text[pos]) for pos in range(len(text))]

list_x = num_to_list(x)
print(list_x)
print(list_to_string(list_x))



[1, 2, 8]
[1,2,8]


In [4]:
def get_input_string(x,y):
    """Return the prompt block for the subtraction x-y: an 'Input:' line, then 'x-y'."""
    expression = '-'.join((str(x), str(y)))
    return f'Input:\n{expression}\n'

def get_output_string(x,y):
    """Build the scratchpad target text for x - y without reordering operands.

    Digits are subtracted from the least significant end with a borrow flag C
    (0 or -1).  Every position except the most significant one adds 10 when
    the difference is negative; the most significant position is left signed.
    The signed leading digit and the remaining digits are then recombined
    (lead * 10**n + rest) to give the final, possibly negative, result.

    Args:
        x: Minuend, an int or digit string.
        y: Subtrahend, an int or digit string.

    Returns:
        The multi-line target string, ending with the space-separated digits
        of the result (prefixed with '-' when it is negative).
    """
    x, y = str(x), str(y)
    size_x, size_y = len(x), len(y)
    digits_x, digits_y = num_to_list(x), num_to_list(y)

    pieces = [
        'Target:\n<scratch>\n',
        f'{list_to_string(digits_x)} has {size_x} digits.\n',
        f'{list_to_string(digits_y)} has {size_y} digits.\n',
    ]

    carry = 0
    answer = []
    width = max(size_x, size_y)
    for pos in range(width):
        # Current least-significant digits; an exhausted operand contributes 0.
        top = digits_x[-1] if pos < size_x else 0
        bottom = digits_y[-1] if pos < size_y else 0

        raw = top - bottom - abs(carry)
        went_negative = raw < 0
        if went_negative and pos < width - 1:
            digit = raw + 10
            update = f"{top}-{bottom}-{abs(carry)}+10={digit}"
        else:
            # Last position (or no borrow needed): keep the signed value.
            digit = raw
            update = f"{top}-{bottom}-{abs(carry)}={digit}"

        next_carry = -1 * went_negative
        pieces.append(
            f'{list_to_string(digits_x)} - {list_to_string(digits_y)} , A={list_to_string(answer)} , '
            f'C={carry} , {update} , A->{digit} , C->{next_carry}\n'
        )

        answer.insert(0, digit)
        carry = next_carry
        digits_x = digits_x[:-1]
        digits_y = digits_y[:-1]

    pieces.append(f'{list_to_string(digits_x)} - {list_to_string(digits_y)} , A={list_to_string(answer)}\n')

    # Recombine the signed leading digit with the non-negative lower digits.
    lead = int(answer[0])
    n = len(answer) - 1
    rest = int(''.join([str(d) for d in answer[1:]])) if n > 0 else 0
    result = lead * (10 ** n) + rest
    pieces.append(f'{lead * (10 ** n)}+{rest}={result} , END\n</scratch>\n')

    sign = '-' if result < 0 else ''
    spaced_digits = ' '.join(str(d) for d in num_to_list(abs(result)))
    pieces.append(sign + spaced_digits + '\n')

    return ''.join(pieces)

In [8]:
x,y = 29, 510
input_str = get_input_string(x,y)
output_str = get_output_string(x,y)

print(input_str+output_str)
print(len(input_str+output_str))
print(x-y)


Input:
29-510
Target:
<scratch>
[2,9] has 2 digits.
[5,1,0] has 3 digits.
[2,9] - [5,1,0] , A=[] , C=0 , 9-0-0=9 , A->9 , C->0
[2] - [5,1] , A=[9] , C=0 , 2-1-0=1 , A->1 , C->0
[] - [5] , A=[1,9] , C=0 , 0-5-0=-5 , A->-5 , C->-1
[] - [] , A=[-5,1,9]
-500+19=-481 , END
</scratch>
-4 8 1

287
-481


In [42]:
x,y = 128, 367
input_str = get_input_string(x,y)
output_str = get_output_string(x,y)

print(input_str+output_str)
print(len(input_str+output_str))
print(x-y)



Input:
128-367
Target:
<scratch>
[1,2,8] has 3 digits.
[3,6,7] has 3 digits.
[1,2,8] - [3,6,7] , A=[] , C=0 , 8-7-0=1 , A->1 , C->0
[1,2] - [3,6] , A=[1] , C=0 , 2-6-0+10=6 , A->6 , C->-1
[1] - [3] , A=[6,1] , C=-1 , 1-3-1=-3 , A->-3 , C->-1
[] - [] , A=[-3,6,1]
-300+61=-239 , END
</scratch>
-2 3 9

300
-239


In [43]:
x,y = 367, 128
input_str = get_input_string(x,y)
output_str = get_output_string(x,y)

print(input_str+output_str)
print(x-y)


Input:
367-128
Target:
<scratch>
[3,6,7] has 3 digits.
[1,2,8] has 3 digits.
[3,6,7] - [1,2,8] , A=[] , C=0 , 7-8-0+10=9 , A->9 , C->-1
[3,6] - [1,2] , A=[9] , C=-1 , 6-2-1=3 , A->3 , C->0
[3] - [1] , A=[3,9] , C=0 , 3-1-0=2 , A->2 , C->0
[] - [] , A=[2,3,9]
200+39=239 , END
</scratch>
2 3 9

239


In [44]:
x,y = 3, 6
input_str = get_input_string(x,y)
output_str = get_output_string(x,y)

print(input_str+output_str)
print(x-y)


Input:
3-6
Target:
<scratch>
[3] has 1 digits.
[6] has 1 digits.
[3] - [6] , A=[] , C=0 , 3-6-0=-3 , A->-3 , C->-1
[] - [] , A=[-3]
-3+0=-3 , END
</scratch>
-3

-3


In [45]:
x,y = 6, 3
input_str = get_input_string(x,y)
output_str = get_output_string(x,y)

print(input_str+output_str)
print(x-y)


Input:
6-3
Target:
<scratch>
[6] has 1 digits.
[3] has 1 digits.
[6] - [3] , A=[] , C=0 , 6-3-0=3 , A->3 , C->0
[] - [] , A=[3]
3+0=3 , END
</scratch>
3

3


In [47]:
x,y = 941, 940
input_str = get_input_string(x,y)
output_str = get_output_string(x,y)

print(input_str+output_str)
print(x-y)


Input:
941-940
Target:
<scratch>
[9,4,1] has 3 digits.
[9,4,0] has 3 digits.
[9,4,1] - [9,4,0] , A=[] , C=0 , 1-0-0=1 , A->1 , C->0
[9,4] - [9,4] , A=[1] , C=0 , 4-4-0=0 , A->0 , C->0
[9] - [9] , A=[0,1] , C=0 , 9-9-0=0 , A->0 , C->0
[] - [] , A=[0,0,1]
0+1=1 , END
</scratch>
1

1


# Simplified AR for subtraction

In [9]:
def add_space(string):
    """Return str(string) with a single space between consecutive characters."""
    return ' '.join(str(string))

def list_to_string(a):
    """Return str(a) with every space character removed."""
    return ''.join(str(a).split(' '))

def num_to_list(num):
    """Return the decimal digits of `num` (an int or digit string) as a list of ints."""
    return list(map(int, str(num)))

def get_input_string(x,y):
    """Return the prompt block for the subtraction x-y: an 'Input:' line, then 'x-y'."""
    return 'Input:\n' + str(x) + '-' + str(y) + '\n'

def get_output_string(x,y,random_A=False, random_C=False):
    """Build the simplified target text for the subtraction x - y.

    One 'A->digit , C->borrow' line is produced per digit position, from the
    least significant end.  Every position except the most significant one
    adds 10 to a negative difference; the most significant one stays signed.
    The digits are then recombined as lead * 10**n + rest and the result is
    printed on the last line.

    Args:
        x: Minuend, an int or digit string.
        y: Subtrahend, an int or digit string.
        random_A: If true, each printed A value is a random digit 0..9; the
            recombination line and the final result still use the true digits.
        random_C: If true, each printed C value is randomly 0 or -1.

    Returns:
        The multi-line target string.
    """
    x, y = str(x), str(y)
    size_x, size_y = len(x), len(y)
    digits_x, digits_y = num_to_list(x), num_to_list(y)
    width = max(size_x, size_y)

    pieces = ['Target:\n']
    borrow = 0
    answer = []
    for pos in range(width):
        # Digit at this position counted from the right; 0 once an operand runs out.
        top = digits_x[-1 - pos] if pos < size_x else 0
        bottom = digits_y[-1 - pos] if pos < size_y else 0
        diff = top - bottom - abs(borrow)
        borrow = -1 * (diff < 0)

        if diff < 0 and pos < width - 1:
            true_digit = diff + 10
        else:
            true_digit = diff

        # Optional corruption of the printed trace; the random draws happen in
        # the order A, then C.
        shown_digit = random.randint(0, 9) if random_A else true_digit
        shown_borrow = -1 * random.randint(0, 1) if random_C else borrow

        pieces.append(f'A->{shown_digit} , C->{shown_borrow}\n')
        answer.insert(0, true_digit)

    # Recombine the signed leading digit with the non-negative lower digits.
    lead = int(answer[0])
    n = len(answer) - 1
    rest = int(''.join([str(d) for d in answer[1:]])) if n > 0 else 0
    result = lead * (10 ** n) + rest
    pieces.append(f'{lead * (10 ** n)}+{rest}={result}.\n')

    sign = '-' if result < 0 else ''
    pieces.append(sign + str(abs(result)) + '\n')

    return ''.join(pieces)

In [13]:
x,y = 128, 367
input_str = get_input_string(x,y)
output_str = get_output_string(x,y)

print(input_str+output_str)
print(len(input_str+output_str))
print(x-y)


Input:
128-367
Target:
A->1 , C->0
A->6 , C->-1
A->-3 , C->-1
-300+61=-239.
-239

81
-239


In [12]:
x,y = 788, 989
input_str = get_input_string(x,y)
output_str = get_output_string(x,y)

print(input_str+output_str)
print(len(input_str+output_str))
print(x-y)


Input:
788-989
Target:
A->9 , C->-1
A->9 , C->-1
A->-3 , C->-1
-300+99=-201.
-201

82
-201


# Create a much smaller dataset to test whether algorithmic-reasoning prompting can be trained with much less data

In [4]:
# create dataset with smaller number of examples, sampled from the above dataset

def create_subset_dataset(input_file_path='train_3digit_10000.txt', output_file_path=None, num_2digit=100, num_3digit=100):
    """Sample a small training file from an existing addition dataset.

    Keeps the first 100 lines, then draws `num_2digit` lines from
    lines[100:1000] and `num_3digit` lines from lines[1000:], prints the
    carry-count histogram of the selection and writes it out.  An existing
    output file is left untouched.

    Assumes the input is laid out as 100 one-digit lines, then 900 two-digit
    lines, then three-digit lines -- confirm against the file being sampled.

    Args:
        input_file_path: Source dataset, one 'a+b=c' example per line.
        output_file_path: Destination path; defaults to
            'train_3digit_small_{total}.txt' where total is the subset size.
        num_2digit: Number of lines sampled from lines[100:1000].
        num_3digit: Number of lines sampled from lines[1000:].
    """
    with open(input_file_path, 'r') as f:
        lines = f.readlines()

    num_samples = 100 + num_2digit + num_3digit

    selected_lines = lines[:100] # select the first 100 lines (1-digit examples)
    selected_lines += random.sample(lines[100:1000], num_2digit)
    selected_lines += random.sample(lines[1000:], num_3digit)

    # Histogram of carry counts (0..3) over the selected examples.
    num_carry_list = [0 for i in range(4)]
    for line in selected_lines:
        a, b = line.split('+')
        b = b.split('=')[0]
        num_carry_list[numCarryOps(a, b)] += 1
    print(num_carry_list)

    output_file_path = f'train_3digit_small_{num_samples}.txt' if output_file_path is None else output_file_path
    if os.path.exists(output_file_path):
        print(f'File {output_file_path} already exists')
        # Stop here so the existing file is not silently overwritten after
        # reporting that it exists.
        return

    with open(output_file_path, 'w') as f:
        f.writelines(selected_lines)
        print(f'created {num_samples} number of examples and saved to {output_file_path}')
        
    

In [None]:
# create dataset with smaller number of examples, sampled from the above dataset

def create_subset_dataset(input_file_path='train_3digit_10000.txt', output_file_path=None, num_2digit=100, num_3digit=100):
    """Sample a small training file from an existing addition dataset.

    Keeps the first 100 lines, then draws `num_2digit` lines from
    lines[100:1000] and `num_3digit` lines from lines[1000:], prints the
    carry-count histogram of the selection and writes it out.  An existing
    output file is left untouched.

    Assumes the input is laid out as 100 one-digit lines, then 900 two-digit
    lines, then three-digit lines -- confirm against the file being sampled.

    Args:
        input_file_path: Source dataset, one 'a+b=c' example per line.
        output_file_path: Destination path; defaults to
            'train_3digit_all_{total}.txt' where total is the subset size.
        num_2digit: Number of lines sampled from lines[100:1000].
        num_3digit: Number of lines sampled from lines[1000:].
    """
    with open(input_file_path, 'r') as f:
        lines = f.readlines()

    num_samples = 100 + num_2digit + num_3digit

    selected_lines = lines[:100] # select the first 100 lines (1-digit examples)
    selected_lines += random.sample(lines[100:1000], num_2digit)
    selected_lines += random.sample(lines[1000:], num_3digit)

    # Histogram of carry counts (0..3) over the selected examples.
    num_carry_list = [0 for i in range(4)]
    for line in selected_lines:
        a, b = line.split('+')
        b = b.split('=')[0]
        num_carry_list[numCarryOps(a, b)] += 1
    print(num_carry_list)

    output_file_path = f'train_3digit_all_{num_samples}.txt' if output_file_path is None else output_file_path
    if os.path.exists(output_file_path):
        print(f'File {output_file_path} already exists')
        # Stop here so the existing file is not silently overwritten after
        # reporting that it exists.
        return

    with open(output_file_path, 'w') as f:
        f.writelines(selected_lines)
        print(f'created {num_samples} number of examples and saved to {output_file_path}')
        
    

In [4]:
create_subset_dataset(num_2digit=100, num_3digit=100)

[104, 113, 57, 26]
created 300 number of examples and saved to train_3digit_small_300.txt


In [5]:
create_subset_dataset(num_2digit=50, num_3digit=50)

[85, 78, 24, 13]
created 200 number of examples and saved to train_3digit_small_200.txt


In [6]:
create_subset_dataset(num_2digit=10, num_3digit=10)

[62, 52, 6, 0]
created 120 number of examples and saved to train_3digit_small_120.txt


In [7]:
create_subset_dataset(num_2digit=200, num_3digit=200)

[168, 153, 114, 65]
created 500 number of examples and saved to train_3digit_small_500.txt


In [5]:
create_subset_dataset(num_2digit=150, num_3digit=50, output_file_path='train_3digit_small_100_150_50.txt')

[106, 125, 55, 14]
created 300 number of examples and saved to train_3digit_small_100_150_50.txt


In [6]:
create_subset_dataset(num_2digit=50, num_3digit=150, output_file_path='train_3digit_small_100_50_150.txt')

[102, 97, 50, 51]
created 300 number of examples and saved to train_3digit_small_100_50_150.txt
