In [2]:
import os
import pickle
import requests
import numpy as np
import random
import math

In [24]:
# Create the root output directory; exist_ok keeps the cell safe to re-run
# and avoids the gap between checking for the directory and creating it.
os.makedirs('multiplication', exist_ok=True)

In [27]:
def reverse_string(a: str) -> str:
    """Return ``a`` with its characters in reverse order (``'123'`` -> ``'321'``)."""
    return ''.join(reversed(a))


def get_abc(line):
    """Extract the two operands from a line shaped like ``a*b=...``.

    A leading ``$`` on the first operand is dropped. Despite the name, only
    the operands are returned, both as strings: ``(a, b)``.

    Raises:
        ValueError: if ``line`` does not contain exactly one ``*``.
    """
    left, right = line.split('*')
    # '$12' splits into ['', '12'], so index 1 is the operand without the marker
    first = left.split('$')[1] if left.startswith('$') else left
    # everything after the first '=' (the answer, if any) is ignored
    second = right.split('=')[0]
    return first, second


def make_binary_file_shuffle(out_dir_name, filepath, input_file_path):
    """Shuffle a line-based text dataset and export it as character-level tokens.

    The lines of ``input_file_path`` are shuffled with the global ``random``
    generator, joined back together and encoded one character per token. The
    token ids are written to ``{out_dir_name}/{filepath}`` as raw ``uint16``
    values. The vocabulary (``vocab_size``, ``itos``, ``stoi``) is pickled to
    ``{out_dir_name}/meta.pkl`` unless that file already exists.

    Nothing is done when the output file is already present.

    Args:
        out_dir_name: Directory that receives the binary file and ``meta.pkl``;
            it is not created here.
        filepath: Name of the binary file inside ``out_dir_name``.
        input_file_path: Path of the text file holding one example per line.
    """
    out_path = f'{out_dir_name}/{filepath}'
    if os.path.exists(out_path):
        print(f'{out_path} already exists')
        return

    with open(input_file_path, 'r') as f:
        lines = f.readlines()

    # Only the final line can lack a newline. Terminate it so that shuffling
    # cannot glue it onto whichever example ends up after it.
    if lines and not lines[-1].endswith('\n'):
        lines[-1] += '\n'
    random.shuffle(lines)
    data = ''.join(lines)

    print(f"length of dataset in characters: {len(data):,}")

    # get all the unique characters that occur in this text
    chars = sorted(set(data))
    vocab_size = len(chars)
    print("all the unique characters:", ''.join(chars))
    print(f"vocab size: {vocab_size:,}")

    # character <-> integer lookup tables; itos is stored in meta.pkl so the
    # ids can be decoded later
    stoi = {ch: i for i, ch in enumerate(chars)}
    itos = {i: ch for i, ch in enumerate(chars)}

    # one token per character, stored as uint16 vocabulary indices
    train_ids = np.array([stoi[ch] for ch in data], dtype=np.uint16)
    print(f"train has {len(train_ids):,} tokens")
    train_ids.tofile(out_path)

    # save the meta information as well, to help us encode/decode later
    meta = {
        'vocab_size': vocab_size,
        'itos': itos,
        'stoi': stoi,
    }

    # NOTE(review): an existing meta.pkl is never overwritten, so it may describe
    # a different vocabulary than the file written above.
    meta_path = f'{out_dir_name}/meta.pkl'
    if not os.path.exists(meta_path):
        print('saving meta file!')
        with open(meta_path, 'wb') as f:
            pickle.dump(meta, f)


def make_binary_val_file(out_dir_name, filepath='val.bin', reverse=False, num_samples=1000):
    """Generate random multiplication examples and export them as character-level tokens.

    Each example multiplies two integers drawn uniformly from 0..99 with the
    global ``random`` generator. Plain examples look like ``a*b=c``; with
    ``reverse=True`` the product's digits are reversed and the line is wrapped
    in ``$`` markers (``$a*b=c_reversed$``). The text is encoded one character
    per token and written to ``{out_dir_name}/{filepath}`` as raw ``uint16``
    values. The vocabulary is pickled to ``{out_dir_name}/meta.pkl`` unless
    that file already exists.

    Nothing is done when the output file is already present.

    NOTE(review): the samples are drawn without consulting any training file,
    so validation prompts can coincide with training prompts.

    Args:
        out_dir_name: Directory that receives the binary file and ``meta.pkl``;
            it is not created here.
        filepath: Name of the binary file inside ``out_dir_name``.
        reverse: Emit the ``$``-wrapped, reversed-product format.
        num_samples: Number of examples to generate.
    """
    out_path = f'{out_dir_name}/{filepath}'
    if os.path.exists(out_path):
        print(f'{out_path} already exists')
        return

    # collect the lines and join once instead of growing a string with +=
    examples = []
    for _ in range(num_samples):
        x, y = random.randint(0, 99), random.randint(0, 99)
        z = x * y
        if reverse:
            examples.append(f'${x}*{y}={reverse_string(str(z))}$\n')
        else:
            examples.append(f'{x}*{y}={z}\n')
    data = ''.join(examples)

    print(f"length of dataset in characters: {len(data):,}")

    # get all the unique characters that occur in this text
    chars = sorted(set(data))
    vocab_size = len(chars)
    print("all the unique characters:", ''.join(chars))
    print(f"vocab size: {vocab_size:,}")

    # character <-> integer lookup tables; itos is stored in meta.pkl so the
    # ids can be decoded later
    stoi = {ch: i for i, ch in enumerate(chars)}
    itos = {i: ch for i, ch in enumerate(chars)}

    # one token per character, stored as uint16 vocabulary indices
    val_ids = np.array([stoi[ch] for ch in data], dtype=np.uint16)
    print(f"val has {len(val_ids):,} tokens")
    val_ids.tofile(out_path)

    # save the meta information as well, to help us encode/decode later
    meta = {
        'vocab_size': vocab_size,
        'itos': itos,
        'stoi': stoi,
    }

    # NOTE(review): an existing meta.pkl is never overwritten, so it may describe
    # a different vocabulary than the file written above.
    meta_path = f'{out_dir_name}/meta.pkl'
    if not os.path.exists(meta_path):
        print('saving meta file!')
        with open(meta_path, 'wb') as f:
            pickle.dump(meta, f)


def get_data(in_dir_name, file_name):
    """Return the lines of ``{in_dir_name}/{file_name}`` as a list, line endings included."""
    path = f'{in_dir_name}/{file_name}'
    with open(path, 'r') as handle:
        return list(handle)


def make_multiplication_data(out_dir_name, file_name, reverse=False, num_random=2900):
    """Write a text file of multiplication training examples, one per line.

    The file starts with the full single-digit table (all 100 pairs from
    ``0*0`` to ``9*9``, in order), followed by ``num_random`` examples whose
    operands are drawn uniformly from 0..99 with the global ``random``
    generator. Draws are independent, so repeated problems are possible.

    Plain lines look like ``a*b=c``; with ``reverse=True`` the product's digits
    are reversed and the line is wrapped in ``$`` markers.

    Nothing is done when the output file is already present.

    Args:
        out_dir_name: Directory that receives the file; it is not created here.
        file_name: Name of the text file inside ``out_dir_name``.
        reverse: Emit the ``$``-wrapped, reversed-product format.
        num_random: Number of random examples appended after the table.
    """
    out_file_name = f'{out_dir_name}/{file_name}'

    if os.path.exists(out_file_name):
        print(f'{out_file_name} already exists')
        return

    def format_example(a, b):
        # single place that defines both line formats
        c = a * b
        if reverse:
            return f'${a}*{b}={reverse_string(str(c))}$\n'
        return f'{a}*{b}={c}\n'

    print('making multiplication data file: ', out_file_name)
    with open(out_file_name, 'w') as f:
        # every single-digit problem is always included
        for a in range(10):
            for b in range(10):
                f.write(format_example(a, b))
        # followed by random problems with operands in 0..99
        for _ in range(num_random):
            a, b = random.randint(0, 99), random.randint(0, 99)
            f.write(format_example(a, b))
    

def make_multiplication_train_prompt(out_dir_name, file_name):
    """Write the prompt-only version (``a*b=``) of a training examples file.

    Reads ``{out_dir_name}/{file_name}`` and writes one prompt per example to a
    sibling file whose name has ``.txt`` replaced by ``_trainprompt.txt``. A
    prompt keeps the leading ``$`` when its source line starts with one.
    Nothing is written when the prompt file already exists.
    """
    # the source file is read before the existence check, as callers may rely
    # on a missing source file raising here
    examples = get_data(out_dir_name, file_name)
    prompt_name = file_name.replace('.txt', '_trainprompt.txt')
    prompt_path = f'{out_dir_name}/{prompt_name}'

    if os.path.exists(prompt_path):
        print(f'{prompt_path} already exists')
        return

    print('making train prompt file: ', prompt_path)
    with open(prompt_path, 'w') as out:
        for example in examples:
            left, right = get_abc(example)
            # get_abc drops the '$' marker, so put it back for marked lines
            marker = '$' if example.startswith('$') else ''
            out.write(f'{marker}{left}*{right}=\n')


def make_multiplication_test(out_dir_name, train_prompt_file_name, reverse=False):
    """Write every 0..99 x 0..99 prompt that does not occur in the training prompts.

    Reads ``{out_dir_name}/{train_prompt_file_name}`` and writes the remaining
    prompts, in ``a``-major order, to the same path with ``_trainprompt.txt``
    replaced by ``_test.txt``. Nothing is written when that file already exists.

    A training line excludes a candidate when the line starts with the
    candidate prompt (``a*b=``, or ``$a*b=`` with ``reverse=True``). Because a
    candidate contains exactly one ``=``, at its end, this is the same as
    comparing the candidate with the line's text up to and including its first
    ``=``, which allows a constant-time set lookup per candidate.

    Args:
        out_dir_name: Directory holding the train prompt file and receiving the test file.
        train_prompt_file_name: Name of the train prompt file inside ``out_dir_name``.
        reverse: Generate ``$``-prefixed candidates to match the reversed format.
    """
    train_prompt_path = f'{out_dir_name}/{train_prompt_file_name}'

    # read our train file and create a set of all the prompts
    # we want to make sure we don't have any overlap between train and test
    train_prompts = set()
    with open(train_prompt_path, 'r') as f:
        for line in f:
            prompt, sep, _ = line.strip().partition('=')
            if sep:  # a line without '=' can never match a candidate
                train_prompts.add(prompt + sep)

    # now let's create our test file
    out_file_name = train_prompt_path.replace('_trainprompt.txt', '_test.txt')

    if os.path.exists(out_file_name):
        print(f'{out_file_name} already exists')
        return

    print('making non-overlapping test file: ', out_file_name)
    marker = '$' if reverse else ''
    with open(out_file_name, 'w') as f:
        for a in range(100):
            for b in range(100):
                line_sample = f'{marker}{a}*{b}='
                if line_sample not in train_prompts:
                    f.write(line_sample + '\n')

In [32]:
# Build the plain-format dataset ("a*b=c") under multiplication/plain.
out_dir_name = 'multiplication/plain'
binary_name = 'train_3000.bin'
file_name = 'train_examples_3000.txt'
# same renaming rule make_multiplication_train_prompt applies to its output
train_prompt_file_name = file_name.replace('.txt', '_trainprompt.txt')

# makedirs also creates the parent directory, so this cell does not depend on
# an earlier cell having been run, and it is safe to re-run
os.makedirs(out_dir_name, exist_ok=True)

# Fixed seed: on a fresh output directory the generated problems, shuffle
# order and validation samples are the same on every run. Files that already
# exist are left untouched by the functions below.
random.seed(0)

make_multiplication_data(out_dir_name, file_name, reverse=False)
make_multiplication_train_prompt(out_dir_name, file_name)
make_multiplication_test(out_dir_name, train_prompt_file_name, reverse=False)

out_file_name = f'{out_dir_name}/{file_name}'
make_binary_file_shuffle(out_dir_name=out_dir_name, filepath=binary_name, input_file_path=out_file_name)
make_binary_val_file(out_dir_name=out_dir_name, filepath='val.bin', reverse=False)


making multiplication data file:  multiplication/plain/train_examples_3000.txt
making train prompt file:  multiplication/plain/train_examples_3000_trainprompt.txt
making non-overlapping test file:  multiplication/plain/train_examples_3000_test.txt
length of dataset in characters: 30,740
all the unique characters: 
*0123456789=
vocab size: 13
train has 30,740 tokens
saving meta file!
length of dataset in characters: 10,368
all the unique characters: 
*0123456789=
vocab size: 13
val has 10,368 tokens


In [33]:
# TODO: make the training data the same
# Build the reversed-product dataset ("$a*b=c_reversed$") under multiplication/dollar_reverse.
out_dir_name = 'multiplication/dollar_reverse'
binary_name = 'train_3000.bin'
file_name = 'train_examples_3000.txt'
# same renaming rule make_multiplication_train_prompt applies to its output
train_prompt_file_name = file_name.replace('.txt', '_trainprompt.txt')

# makedirs also creates the parent directory, so this cell does not depend on
# an earlier cell having been run, and it is safe to re-run
os.makedirs(out_dir_name, exist_ok=True)

# Fixed seed: on a fresh output directory the generated problems, shuffle
# order and validation samples are the same on every run. The operand draws
# do not depend on `reverse`, so any cell that seeds with the same value
# before generating gets the same (a, b) problems, which is one way to
# address the TODO above.
random.seed(0)

make_multiplication_data(out_dir_name, file_name, reverse=True)
make_multiplication_train_prompt(out_dir_name, file_name)
make_multiplication_test(out_dir_name, train_prompt_file_name, reverse=True)

out_file_name = f'{out_dir_name}/{file_name}'
make_binary_file_shuffle(out_dir_name=out_dir_name, filepath=binary_name, input_file_path=out_file_name)
make_binary_val_file(out_dir_name=out_dir_name, filepath='val.bin', reverse=True)


making multiplication data file:  multiplication/dollar_reverse/train_examples_3000.txt
making train prompt file:  multiplication/dollar_reverse/train_examples_3000_trainprompt.txt
making non-overlapping test file:  multiplication/dollar_reverse/train_examples_3000_test.txt
length of dataset in characters: 36,860
all the unique characters: 
$*0123456789=
vocab size: 14
train has 36,860 tokens
saving meta file!
length of dataset in characters: 12,385
all the unique characters: 
$*0123456789=
vocab size: 14
val has 12,385 tokens
