In [2]:
import os
import pickle
import requests
import numpy as np
import random
import math

In [3]:
# Root directory for every generated dataset. `exist_ok=True` makes the cell
# safe to re-run and avoids the gap between checking for the directory and
# creating it.
os.makedirs('one-sided-subtraction', exist_ok=True)

In [4]:
def get_data(in_dir_name, file_name):
    """Return the lines of the text file ``in_dir_name/file_name``.

    Every element keeps its line terminator, exactly as the file object
    yields it.
    """
    source_path = f'{in_dir_name}/{file_name}'
    with open(source_path, 'r') as handle:
        return list(handle)


def reverse_string(a: str) -> str:
    """Return the characters of ``a`` in reverse order."""
    return ''.join(reversed(a))


def get_abc(line):
    """Extract the two operands from a line shaped like ``a+b=...``.

    The text before ``+`` is the first operand; when it starts with ``$`` that
    marker is dropped. The second operand is whatever sits between ``+`` and
    the first ``=``. Both operands are returned as strings. A line that does
    not contain exactly one ``+`` raises ``ValueError`` from the unpacking.
    """
    first, rest = line.split('+')
    if first.startswith('$'):
        # Keep only the text between the leading '$' and any following '$'.
        first = first[1:].partition('$')[0]
    second = rest.partition('=')[0]

    return first, second


def make_binary_file_shuffle(out_dir_name, filepath, input_file_path):
    """Shuffle a text dataset line-wise and store it as character-level ids.

    The lines of ``input_file_path`` are shuffled with the global ``random``
    state, each character is mapped to an integer id (ids follow the sorted
    order of the distinct characters), and the ids are written as raw
    ``uint16`` values to ``out_dir_name/filepath``.

    Args:
        out_dir_name: Directory receiving the binary file and ``meta.pkl``.
        filepath: File name of the binary file inside ``out_dir_name``.
        input_file_path: Path of the text file to encode.

    Returns:
        None. If ``out_dir_name/filepath`` already exists the function only
        prints a notice and returns.

    Side effects:
        Writes ``out_dir_name/filepath``. Writes ``out_dir_name/meta.pkl``
        (``vocab_size``, ``itos``, ``stoi``) only when no meta file is there
        yet; an existing meta file is left untouched even if it was built
        from different text.
    """
    out_path = f'{out_dir_name}/{filepath}'
    if os.path.exists(out_path):
        print(f'{out_path} already exists')
        return

    with open(input_file_path, 'r') as f:
        lines = f.readlines()

    # A last line without a terminator would fuse with whichever line ends up
    # after it once the order is shuffled, so terminate every line first.
    lines = [ln if ln.endswith('\n') else ln + '\n' for ln in lines]
    random.shuffle(lines)
    data = ''.join(lines)

    print(f"length of dataset in characters: {len(data):,}")

    # get all the unique characters that occur in this text
    chars = sorted(set(data))
    vocab_size = len(chars)
    print("all the unique characters:", ''.join(chars))
    print(f"vocab size: {vocab_size:,}")

    # create a mapping from characters to integers and back
    stoi = {ch: i for i, ch in enumerate(chars)}
    itos = {i: ch for i, ch in enumerate(chars)}

    # encode the whole text to integers
    train_ids = [stoi[c] for c in data]
    print(f"train has {len(train_ids):,} tokens")

    # Export to the same path the existence check above looks at; writing to
    # the bare file name would drop the file into the current directory.
    train_ids = np.array(train_ids, dtype=np.uint16)
    train_ids.tofile(out_path)

    # save the meta information as well, to help us encode/decode later
    meta = {
        'vocab_size': vocab_size,
        'itos': itos,
        'stoi': stoi,
    }

    if not os.path.exists(f'{out_dir_name}/meta.pkl'):
        print('saving meta file!')
        with open(f'{out_dir_name}/meta.pkl', 'wb') as f:
            pickle.dump(meta, f)


def make_binary_val_file(out_dir_name, filepath='val.bin', reverse=False,
                         num_examples=1000, max_operand=999):
    """Generate random subtraction examples and store them as character ids.

    Each example draws two integers uniformly from ``[0, max_operand]`` using
    the global ``random`` state, orders them so the larger comes first, and
    renders one line: ``x-y=z`` by default, or ``$x-y=z'$`` when ``reverse``
    is true, where ``z'`` is the decimal digits of ``z`` reversed. The text is
    encoded character by character (ids follow the sorted order of the
    distinct characters) and written as raw ``uint16`` values.

    Args:
        out_dir_name: Directory receiving the binary file and ``meta.pkl``.
        filepath: File name of the binary file inside ``out_dir_name``.
        reverse: Emit the ``$``-delimited, reversed-answer format.
        num_examples: Number of lines to generate.
        max_operand: Largest value either operand may take.

    Returns:
        None. If ``out_dir_name/filepath`` already exists the function only
        prints a notice and returns.

    Side effects:
        Writes ``out_dir_name/filepath``. Writes ``out_dir_name/meta.pkl``
        (``vocab_size``, ``itos``, ``stoi``) only when no meta file is there
        yet; an existing meta file is left untouched.
    """
    out_path = f'{out_dir_name}/{filepath}'
    if os.path.exists(out_path):
        print(f'{out_path} already exists')
        return

    # Collect the lines and join once instead of growing a string in the loop.
    examples = []
    for _ in range(num_examples):
        x, y = random.randint(0, max_operand), random.randint(0, max_operand)
        if x < y:
            x, y = y, x
        z = x - y
        if reverse:
            examples.append(f'${x}-{y}={reverse_string(str(z))}$\n')
        else:
            examples.append(f'{x}-{y}={z}\n')
    data = ''.join(examples)

    print(f"length of dataset in characters: {len(data):,}")

    # get all the unique characters that occur in this text
    chars = sorted(set(data))
    vocab_size = len(chars)
    print("all the unique characters:", ''.join(chars))
    print(f"vocab size: {vocab_size:,}")

    # create a mapping from characters to integers and back
    stoi = {ch: i for i, ch in enumerate(chars)}
    itos = {i: ch for i, ch in enumerate(chars)}

    # encode the whole text to integers
    val_ids = [stoi[c] for c in data]
    print(f"val has {len(val_ids):,} tokens")

    # export to bin file
    val_ids = np.array(val_ids, dtype=np.uint16)
    val_ids.tofile(out_path)

    # save the meta information as well, to help us encode/decode later
    meta = {
        'vocab_size': vocab_size,
        'itos': itos,
        'stoi': stoi,
    }

    if not os.path.exists(f'{out_dir_name}/meta.pkl'):
        print('saving meta file!')
        with open(f'{out_dir_name}/meta.pkl', 'wb') as f:
            pickle.dump(meta, f)


def make_subtraction_data(in_dir_name, out_dir_name, file_name):
    """Convert a file of ``a+b=...`` lines into subtraction examples.

    For every input line the operands are extracted with ``get_abc``, ordered
    so the numerically larger one comes first, and written to
    ``out_dir_name/file_name`` as ``a-b=c``. Lines that start with ``$`` are
    written as ``$a-b=c'$`` instead, where ``c'`` is the decimal digits of the
    difference reversed. Blank lines are skipped.

    Args:
        in_dir_name: Directory holding the source file.
        out_dir_name: Directory receiving the converted file.
        file_name: Name used for both the source and the output file.

    Returns:
        None. If the output file already exists the function only prints a
        notice and returns without reading the input.
    """
    out_file_name = f'{out_dir_name}/{file_name}'

    # Check first so an existing output costs no read of the source file.
    if os.path.exists(out_file_name):
        print(f'{out_file_name} already exists')
        return

    lines = get_data(in_dir_name, file_name)
    print('making subtraction data file: ', out_file_name)

    # Convert everything before opening the output: a malformed line then
    # raises without leaving a truncated file that the existence check above
    # would accept as complete on the next run.
    examples = []
    for line in lines:
        if not line.strip():
            continue
        a, b = get_abc(line)
        if int(a) < int(b):
            a, b = b, a
        c = int(a) - int(b)
        if line.startswith('$'):
            examples.append(f'${a}-{b}={reverse_string(str(c))}$\n')
        else:
            examples.append(f'{a}-{b}={c}\n')

    with open(out_file_name, 'w') as f:
        f.writelines(examples)


def make_subtraction_train_prompt(in_dir_name, out_dir_name, file_name):
    """Write answer-less subtraction prompts for a file of ``a+b=...`` lines.

    The output file name is ``file_name`` with ``.txt`` replaced by
    ``_trainprompt.txt``. For every input line the operands are extracted with
    ``get_abc``, ordered so the numerically larger one comes first, and written
    as ``a-b=`` (or ``$a-b=`` when the input line starts with ``$``). Blank
    lines are skipped.

    Args:
        in_dir_name: Directory holding the source file.
        out_dir_name: Directory receiving the prompt file.
        file_name: Name of the source file.

    Returns:
        None. If the prompt file already exists the function only prints a
        notice and returns without reading the input.
    """
    prompt_file_name = file_name.replace('.txt', '_trainprompt.txt')
    out_file_name = f'{out_dir_name}/{prompt_file_name}'

    # Check first so an existing output costs no read of the source file.
    if os.path.exists(out_file_name):
        print(f'{out_file_name} already exists')
        return

    lines = get_data(in_dir_name, file_name)
    print('making train prompt file: ', out_file_name)

    # Build all prompts before opening the output so a malformed line cannot
    # leave a truncated file behind. The difference itself is not needed here:
    # a prompt stops at '='.
    prompts = []
    for line in lines:
        if not line.strip():
            continue
        a, b = get_abc(line)
        if int(a) < int(b):
            a, b = b, a
        marker = '$' if line.startswith('$') else ''
        prompts.append(f'{marker}{a}-{b}=\n')

    with open(out_file_name, 'w') as f:
        f.writelines(prompts)


def make_subtraction_test(in_dir_name, out_dir_name, file_name):
    """Convert a file of ``a+b=`` test prompts into subtraction prompts.

    For every input line the operands are extracted with ``get_abc``, ordered
    so the numerically larger one comes first, and written to
    ``out_dir_name/file_name`` as ``a-b=`` (or ``$a-b=`` when the input line
    starts with ``$``). Blank lines are skipped.

    Args:
        in_dir_name: Directory holding the source prompt file.
        out_dir_name: Directory receiving the converted prompt file.
        file_name: Name used for both the source and the output file.

    Returns:
        None. If the output file already exists the function only prints a
        notice and returns without reading the input.
    """
    out_file_name = f'{out_dir_name}/{file_name}'

    # Check first so an existing output costs no read of the source file.
    if os.path.exists(out_file_name):
        print(f'{out_file_name} already exists')
        return

    lines = get_data(in_dir_name, file_name)

    # Build all prompts before opening the output so a malformed line cannot
    # leave a truncated file behind.
    prompts = []
    for line in lines:
        if not line.strip():
            continue
        a, b = get_abc(line)
        if int(a) < int(b):
            a, b = b, a
        marker = '$' if line.startswith('$') else ''
        prompts.append(f'{marker}{a}-{b}=\n')

    with open(out_file_name, 'w') as f:
        f.writelines(prompts)



In [5]:
# Build the "plain" one-sided subtraction dataset from the addition examples
# in `addition_bal`. Every step below skips itself when its output file is
# already present, so re-running this cell is cheap.
in_dir_name = 'addition_bal'
out_dir_name = 'one-sided-subtraction/plain'
binary_name = 'train_10000.bin'

# makedirs also creates the parent directory, so this cell does not rely on an
# earlier cell having created it, and exist_ok makes re-runs a no-op.
os.makedirs(out_dir_name, exist_ok=True)

file_name = 'add_examples_10000.txt'

# Text files: training examples, answer-less training prompts, test prompts.
make_subtraction_data(in_dir_name, out_dir_name, file_name)
make_subtraction_train_prompt(in_dir_name, out_dir_name, file_name)
make_subtraction_test(in_dir_name, out_dir_name, 'prompt_addition_test_0.001.txt')

# Binary files: shuffled training ids and freshly generated validation ids.
out_file_name = f'{out_dir_name}/{file_name}'
make_binary_file_shuffle(out_dir_name=out_dir_name, filepath=binary_name, input_file_path=out_file_name)
make_binary_val_file(out_dir_name=out_dir_name, filepath='val.bin', reverse=False)


one-sided-subtraction/plain/add_examples_10000.txt already exists
one-sided-subtraction/plain/add_examples_10000_trainprompt.txt already exists
one-sided-subtraction/plain/prompt_addition_test_0.001.txt already exists
one-sided-subtraction/plain/train_10000.bin already exists
one-sided-subtraction/plain/val.bin already exists


In [6]:
# Build the "dollar_reverse" one-sided subtraction dataset from the addition
# examples in `addition_dollar_reverse_curr_bal2`; the validation file uses the
# `$`-delimited format with the answer's digits reversed (reverse=True). Every
# step below skips itself when its output file is already present.
in_dir_name = 'addition_dollar_reverse_curr_bal2'
out_dir_name = 'one-sided-subtraction/dollar_reverse'
file_name = 'add_examples.txt'
binary_name = 'train_10000.bin'

# makedirs also creates the parent directory, so this cell does not rely on an
# earlier cell having created it, and exist_ok makes re-runs a no-op.
os.makedirs(out_dir_name, exist_ok=True)

# Text files: training examples, answer-less training prompts, test prompts.
make_subtraction_data(in_dir_name, out_dir_name, file_name)
make_subtraction_train_prompt(in_dir_name, out_dir_name, file_name)
make_subtraction_test(in_dir_name, out_dir_name, 'prompt_addition_test_0.001.txt')

# Binary files: shuffled training ids and freshly generated validation ids.
out_file_name = f'{out_dir_name}/{file_name}'
make_binary_file_shuffle(out_dir_name=out_dir_name, filepath=binary_name, input_file_path=out_file_name)
make_binary_val_file(out_dir_name=out_dir_name, filepath='val.bin', reverse=True)


one-sided-subtraction/dollar_reverse/add_examples.txt already exists
one-sided-subtraction/dollar_reverse/add_examples_trainprompt.txt already exists
one-sided-subtraction/dollar_reverse/prompt_addition_test_0.001.txt already exists
one-sided-subtraction/dollar_reverse/train_10000.bin already exists
length of dataset in characters: 13,576
all the unique characters: 
$-0123456789=
vocab size: 14
val has 13,576 tokens
saving meta file!
