In [2]:
import os
import pickle
import requests
import numpy as np
import random
import re

In [3]:
def reverse_string(a: str) -> str:
    return a[::-1]

In [4]:
def numCarryOps(a, b):
    a,b=int(a),int(b)
    def digitSum(n):
        return sum(map(int,str(n)))
    # assert(a >= 0); assert(b >= 0);
    return int((digitSum(a) + digitSum(b) - digitSum(a+b)) / 9)


def get_two_operands(line):
    x, y = line.split('+')
    y = y.split('=')[0]
    return x, y

In [9]:
def make_dataset(output_file_path, numbers_to_delete, input_file_path='add_examples_10000.txt'):
    with open(input_file_path, 'r') as f:
        lines = f.readlines()

    count_changes = 0
    if not os.path.exists(output_file_path):
        with open(output_file_path, 'w') as f:
            for line in lines:
                x, y = get_two_operands(line)
                if int(x) in numbers_to_delete:
                    print(f'previous line: {line}')
                    while int(x) in numbers_to_delete:
                        digit_to_change = random.randint(0, len(str(x))-1)
                        x = list(x)
                        x[digit_to_change] = str(random.randint(0,9)) if digit_to_change != 0 else str(random.randint(1,9))
                        x = ''.join(x)
                    print(f'new line: {x}+{y}={int(x)+int(y)}')
                    count_changes += 1
                if int(y) in numbers_to_delete:
                    while int(y) in numbers_to_delete:
                        print(f'previous line: {line}')
                        digit_to_change = random.randint(0, len(str(y))-1)
                        y = list(y)
                        y[digit_to_change] = str(random.randint(0,9)) if digit_to_change != 0 else str(random.randint(1,9))
                        y = ''.join(y)
                        print(f'new line: {x}+{y}={int(x)+int(y)}')
                    count_changes += 1

                z = int(x) + int(y)
                line = f'{x}+{y}={z}\n'
                f.write(line)

    print(f'count_changes: {count_changes}')

    # count number of carry operations
    num_carry_list = [0,0,0,0]
    with open(output_file_path, 'r') as f:
        lines = f.readlines()
        for line in lines:
            x, y = get_two_operands(line)
            num_carry = numCarryOps(x, y)
            num_carry_list[num_carry] += 1

    print(num_carry_list)


def make_binary_file(filepath, input_file_path):
    with open(input_file_path, 'r') as f:
        data = f.read()        

    print(f"length of dataset in characters: {len(data):,}")

    # get all the unique characters that occur in this text
    chars = sorted(list(set(data)))
    vocab_size = len(chars)
    print("all the unique characters:", ''.join(chars))
    print(f"vocab size: {vocab_size:,}")

    # create a mapping from characters to integers
    stoi = { ch:i for i,ch in enumerate(chars) }
    itos = { i:ch for i,ch in enumerate(chars) }
    def encode(s):  
        return [stoi[c] for c in s] # encoder: take a string, output a list of integers
    def decode(l):
        ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

    # create the train and test splits
    n = len(data) # 130,023
    # train_data = data[:int(n*0.9)]
    train_data = data
    # val_data = data[int(n*0.9):]

    # encode both to integers
    train_ids = encode(train_data)
    # val_ids = encode(val_data)
    print(f"train has {len(train_ids):,} tokens")
    # print(f"val has {len(val_ids):,} tokens")

    # export to bin files
    train_ids = np.array(train_ids, dtype=np.uint16)
    # val_ids = np.array(val_ids, dtype=np.uint16)
    train_ids.tofile(f'{filepath}')
    # val_ids.tofile(f'val_no527.bin')

    # save the meta information as well, to help us encode/decode later
    meta = {
        'vocab_size': vocab_size,
        'itos': itos,
        'stoi': stoi,
    }

    if not os.path.exists('meta.pkl'):
        print('saving meta file!')
        with open(f'meta.pkl', 'wb') as f:
            pickle.dump(meta, f)


def make_binary_file_shuffle(filepath, input_file_path):
    with open(input_file_path, 'r') as f:
        # data = f.read()        
        lines = f.readlines()
        random.shuffle(lines)
        data = ''.join(lines)

    print(f"length of dataset in characters: {len(data):,}")

    # get all the unique characters that occur in this text
    chars = sorted(list(set(data)))
    vocab_size = len(chars)
    print("all the unique characters:", ''.join(chars))
    print(f"vocab size: {vocab_size:,}")

    # create a mapping from characters to integers
    stoi = { ch:i for i,ch in enumerate(chars) }
    itos = { i:ch for i,ch in enumerate(chars) }
    def encode(s):  
        return [stoi[c] for c in s] # encoder: take a string, output a list of integers
    def decode(l):
        ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

    # create the train and test splits
    n = len(data) # 130,023
    # train_data = data[:int(n*0.9)]
    train_data = data
    # val_data = data[int(n*0.9):]

    # encode both to integers
    train_ids = encode(train_data)
    # val_ids = encode(val_data)
    print(f"train has {len(train_ids):,} tokens")
    # print(f"val has {len(val_ids):,} tokens")

    # export to bin files
    train_ids = np.array(train_ids, dtype=np.uint16)
    # val_ids = np.array(val_ids, dtype=np.uint16)
    train_ids.tofile(f'{filepath}')
    # val_ids.tofile(f'val_no527.bin')

    # save the meta information as well, to help us encode/decode later
    meta = {
        'vocab_size': vocab_size,
        'itos': itos,
        'stoi': stoi,
    }

    if not os.path.exists('meta.pkl'):
        print('saving meta file!')
        with open(f'meta.pkl', 'wb') as f:
            pickle.dump(meta, f)

In [5]:
# prepare meta file, .bin file

# for no 527, we manually changed some examples from the txt file

output_file_path = 'add_examples_10000_no527.txt'

with open(output_file_path, 'r') as f:
    data = f.read()        

print(f"length of dataset in characters: {len(data):,}")

# get all the unique characters that occur in this text
chars = sorted(list(set(data)))
vocab_size = len(chars)
print("all the unique characters:", ''.join(chars))
print(f"vocab size: {vocab_size:,}")

# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
def encode(s):  
    return [stoi[c] for c in s] # encoder: take a string, output a list of integers
def decode(l):
    ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

# create the train and test splits
n = len(data) # 130,023
# train_data = data[:int(n*0.9)]
train_data = data
# val_data = data[int(n*0.9):]

# encode both to integers
train_ids = encode(train_data)
# val_ids = encode(val_data)
print(f"train has {len(train_ids):,} tokens")
# print(f"val has {len(val_ids):,} tokens")

# export to bin files
train_ids = np.array(train_ids, dtype=np.uint16)
# val_ids = np.array(val_ids, dtype=np.uint16)
train_ids.tofile(f'train_no527.bin')
# val_ids.tofile(f'val_no527.bin')

# save the meta information as well, to help us encode/decode later
meta = {
    'vocab_size': vocab_size,
    'itos': itos,
    'stoi': stoi,
}

if not os.path.exists('meta.pkl'):
    print('saving meta file!')
    with open(f'meta.pkl', 'wb') as f:
        pickle.dump(meta, f)


length of dataset in characters: 120,027
all the unique characters: 
+0123456789=
vocab size: 13
train has 120,027 tokens


In [6]:
input_file_path = '527_test_dollar.txt'
output_file_path = '527_test.txt'

with open(input_file_path, 'r') as f:
    data = f.read()
    data = data.replace('$','')

with open(output_file_path, 'w') as f:
    f.write(data)

In [5]:
input_file_path = '527_500_test_dollar.txt'
output_file_path = '527_500_test.txt'

with open(input_file_path, 'r') as f:
    data = f.read()
    data = data.replace('$','')

with open(output_file_path, 'w') as f:
    f.write(data)

In [18]:
# now change 100 such examples in the original file appearing the fewest in both operands
numbers_to_delete = [527, 717, 384, 411, 374, 860, 881, 821, 830, 912]
input_file_path = 'add_examples_10000.txt'

with open(input_file_path, 'r') as f:
    lines = f.readlines()

output_file_path = 'add_examples_10000_no527_10.txt'

count_changes = 0
if not os.path.exists(output_file_path):
    with open(output_file_path, 'w') as f:
        for line in lines:
            x, y = get_two_operands(line)
            if int(x) in numbers_to_delete:
                print(f'previous line: {line}')
                while int(x) in numbers_to_delete:
                    digit_to_change = random.randint(0, len(str(x))-1)
                    x = list(x)
                    x[digit_to_change] = str(random.randint(0,9)) if digit_to_change != 0 else str(random.randint(1,9))
                    x = ''.join(x)
                print(f'new line: {x}+{y}={int(x)+int(y)}')
                count_changes += 1
            if int(y) in numbers_to_delete:
                while int(y) in numbers_to_delete:
                    print(f'previous line: {line}')
                    digit_to_change = random.randint(0, len(str(y))-1)
                    y = list(y)
                    y[digit_to_change] = str(random.randint(0,9)) if digit_to_change != 0 else str(random.randint(1,9))
                    y = ''.join(y)
                    print(f'new line: {x}+{y}={int(x)+int(y)}')
                count_changes += 1

            z = int(x) + int(y)
            line = f'{x}+{y}={z}\n'
            f.write(line)

print(f'count_changes: {count_changes}')

# count number of carry operations
num_carry_list = [0,0,0,0]
with open(output_file_path, 'r') as f:
    lines = f.readlines()
    for line in lines:
        x, y = get_two_operands(line)
        num_carry = numCarryOps(x, y)
        num_carry_list[num_carry] += 1

print(num_carry_list)

previous line: 312+527=839

new line: 312+520=832
previous line: 31+821=852

new line: 31+521=552
previous line: 860+24=884

new line: 880+24=904
previous line: 153+830=983

new line: 153+834=987
previous line: 411+548=959

new line: 431+548=979
previous line: 55+912=967

new line: 55+412=467
previous line: 821+163=984

new line: 811+163=974
previous line: 314+411=725

new line: 314+410=724
previous line: 112+384=496

new line: 112+884=996
previous line: 830+101=931

new line: 837+101=938
previous line: 350+527=877

new line: 350+567=917
previous line: 18+860=878

new line: 18+861=879
previous line: 408+411=819

new line: 408+416=824
previous line: 93+384=477

new line: 93+304=397
previous line: 347+821=1168

new line: 347+831=1178
previous line: 510+881=1391

new line: 510+889=1399
previous line: 830+731=1561

new line: 837+731=1568
previous line: 91+860=951

new line: 91+870=961
previous line: 830+860=1690

new line: 890+860=1750
previous line: 830+860=1690

new line: 890+880=1770
pr

In [19]:
# prepare meta file, .bin file


with open(output_file_path, 'r') as f:
    data = f.read()        

print(f"length of dataset in characters: {len(data):,}")

# get all the unique characters that occur in this text
chars = sorted(list(set(data)))
vocab_size = len(chars)
print("all the unique characters:", ''.join(chars))
print(f"vocab size: {vocab_size:,}")

# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
def encode(s):  
    return [stoi[c] for c in s] # encoder: take a string, output a list of integers
def decode(l):
    ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

# create the train and test splits
n = len(data) # 130,023
# train_data = data[:int(n*0.9)]
train_data = data
# val_data = data[int(n*0.9):]

# encode both to integers
train_ids = encode(train_data)
# val_ids = encode(val_data)
print(f"train has {len(train_ids):,} tokens")
# print(f"val has {len(val_ids):,} tokens")

# export to bin files
train_ids = np.array(train_ids, dtype=np.uint16)
# val_ids = np.array(val_ids, dtype=np.uint16)
train_ids.tofile(f'train_no527_10.bin')
# val_ids.tofile(f'val_no527.bin')

# save the meta information as well, to help us encode/decode later
meta = {
    'vocab_size': vocab_size,
    'itos': itos,
    'stoi': stoi,
}

if not os.path.exists('meta.pkl'):
    print('saving meta file!')
    with open(f'meta.pkl', 'wb') as f:
        pickle.dump(meta, f)


length of dataset in characters: 120,024
all the unique characters: 
+0123456789=
vocab size: 13
train has 120,024 tokens


In [20]:
# now change 100 such examples in the original file appearing the fewest in both operands
numbers_to_delete = [527, 717, 384, 411, 374, 860, 881, 821, 830, 912, 328, 990, 406,
       562, 602, 250, 192, 783, 960, 108, 870, 930, 580, 590, 290, 672,
       551, 455, 690, 908, 308, 390, 115, 831, 940, 277, 273, 382, 253,
       905, 932, 317, 581, 226, 219, 217, 709, 207, 729, 891, 306, 950,
       157, 154, 147, 243, 834, 921, 850, 805, 851, 762, 735, 852, 670,
       931, 623, 114, 552, 519, 507, 490, 456, 434, 614, 442, 480, 146,
       218, 606, 571, 802, 452, 721, 109, 631, 563, 491, 685, 166, 901,
       116, 189, 186, 895, 749, 982, 829, 470, 391]
input_file_path = 'add_examples_10000.txt'

with open(input_file_path, 'r') as f:
    lines = f.readlines()

output_file_path = 'add_examples_10000_no527_100.txt'

count_changes = 0
if not os.path.exists(output_file_path):
    with open(output_file_path, 'w') as f:
        for line in lines:
            x, y = get_two_operands(line)
            if int(x) in numbers_to_delete:
                print(f'previous line: {line}')
                while int(x) in numbers_to_delete:
                    digit_to_change = random.randint(0, len(str(x))-1)
                    x = list(x)
                    x[digit_to_change] = str(random.randint(0,9))
                    x = ''.join(x)
                print(f'new line: {x}+{y}={int(x)+int(y)}')
                count_changes += 1
            if int(y) in numbers_to_delete:
                while int(y) in numbers_to_delete:
                    print(f'previous line: {line}')
                    digit_to_change = random.randint(0, len(str(y))-1)
                    y = list(y)
                    y[digit_to_change] = str(random.randint(0,9))
                    y = ''.join(y)
                    print(f'new line: {x}+{y}={int(x)+int(y)}')
                count_changes += 1

            z = int(x) + int(y)
            line = f'{x}+{y}={z}\n'
            f.write(line)

print(f'count_changes: {count_changes}')

# count number of carry operations
num_carry_list = [0,0,0,0]
with open(output_file_path, 'r') as f:
    lines = f.readlines()
    for line in lines:
        x, y = get_two_operands(line)
        num_carry = numCarryOps(x, y)
        num_carry_list[num_carry] += 1

print(num_carry_list)

previous line: 115+814=929

new line: 105+814=919
previous line: 306+92=398

new line: 006+92=98
previous line: 470+518=988

new line: 370+518=888
previous line: 416+452=868

new line: 416+457=873
previous line: 219+100=319

new line: 917+100=1017
previous line: 210+614=824

new line: 210+604=814
previous line: 37+930=967

new line: 37+130=167
previous line: 211+442=653

new line: 211+142=353
previous line: 115+31=146

new line: 715+31=746
previous line: 306+502=808

new line: 326+502=828
previous line: 214+614=828

new line: 214+514=728
previous line: 805+30=835

new line: 705+30=735
previous line: 631+208=839

new line: 731+208=939
previous line: 421+108=529

new line: 421+106=527
previous line: 312+527=839

new line: 312+567=879
previous line: 745+154=899

new line: 745+156=901
previous line: 455+544=999

new line: 485+544=1029
previous line: 123+802=925

new line: 123+902=1025
previous line: 829+20=849

new line: 823+20=843
previous line: 602+252=854

new line: 604+252=856
previous

In [21]:
# prepare meta file, .bin file


with open(output_file_path, 'r') as f:
    data = f.read()        

print(f"length of dataset in characters: {len(data):,}")

# get all the unique characters that occur in this text
chars = sorted(list(set(data)))
vocab_size = len(chars)
print("all the unique characters:", ''.join(chars))
print(f"vocab size: {vocab_size:,}")

# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
def encode(s):  
    return [stoi[c] for c in s] # encoder: take a string, output a list of integers
def decode(l):
    ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

# create the train and test splits
n = len(data) # 130,023
# train_data = data[:int(n*0.9)]
train_data = data
# val_data = data[int(n*0.9):]

# encode both to integers
train_ids = encode(train_data)
# val_ids = encode(val_data)
print(f"train has {len(train_ids):,} tokens")
# print(f"val has {len(val_ids):,} tokens")

# export to bin files
train_ids = np.array(train_ids, dtype=np.uint16)
# val_ids = np.array(val_ids, dtype=np.uint16)
train_ids.tofile(f'train_no527_100.bin')
# val_ids.tofile(f'val_no527.bin')

# save the meta information as well, to help us encode/decode later
meta = {
    'vocab_size': vocab_size,
    'itos': itos,
    'stoi': stoi,
}

if not os.path.exists('meta.pkl'):
    print('saving meta file!')
    with open(f'meta.pkl', 'wb') as f:
        pickle.dump(meta, f)


length of dataset in characters: 120,020
all the unique characters: 
+0123456789=
vocab size: 13
train has 120,020 tokens


In [10]:
# now change 100 such examples in the original file appearing the fewest in both operands
numbers_to_delete = [527, 717, 384, 411, 374, 860, 881, 821, 830, 912, 328, 990, 406,
       562, 602, 250, 192, 783, 960, 108, 870, 930, 580, 590, 290, 672,
       551, 455, 690, 908, 308, 390, 115, 831, 940, 277, 273, 382, 253,
       905, 932, 317, 581, 226, 219, 217, 709, 207, 729, 891, 306, 950,
       157, 154, 147, 243, 834, 921, 850, 805, 851, 762, 735, 852, 670,
       931, 623, 114, 552, 519, 507, 490, 456, 434, 614, 442, 480, 146,
       218, 606, 571, 802, 452, 721, 109, 631, 563, 491, 685, 166, 901,
       116, 189, 186, 895, 749, 982, 829, 470, 391, 318, 534, 429, 953,
       630, 782, 754, 246, 271, 971, 450, 276, 393, 625, 151, 790, 138,
       638, 285, 415, 941, 419, 714, 171, 564, 409, 820, 389, 373, 259,
       268, 634, 240, 239, 284, 286, 232, 229, 842, 322, 333, 345, 357,
       743, 380, 892, 574, 858, 708, 935, 127, 136, 711, 547, 811, 440,
       980, 915, 661, 545, 198, 917, 833, 427, 592, 600, 652, 601, 325,
       800, 492, 362, 702, 861, 531, 942, 509, 624, 937, 646, 727, 503,
       292, 616, 227, 613, 607, 925, 504, 587, 890, 416, 910, 173, 133,
       170, 550, 680, 807, 414]
output_file_path = 'add_examples_10000_no527_200.txt'

make_dataset(output_file_path, numbers_to_delete)

make_binary_file('train_no527_200.bin', output_file_path)

previous line: 115+814=929

new line: 195+814=1009
previous line: 306+92=398

new line: 304+92=396
previous line: 652+233=885

new line: 752+233=985
previous line: 470+518=988

new line: 475+518=993
previous line: 170+808=978

new line: 177+808=985
previous line: 333+113=446

new line: 335+113=448
previous line: 330+229=559

new line: 330+299=629
previous line: 416+452=868

new line: 618+452=1070
previous line: 416+452=868

new line: 618+352=970
previous line: 219+100=319

new line: 211+100=311
previous line: 210+614=824

new line: 210+114=324
previous line: 210+614=824

new line: 210+914=1124
previous line: 37+930=967

new line: 37+960=997
previous line: 37+930=967

new line: 37+860=897
previous line: 37+930=967

new line: 37+869=906
previous line: 211+442=653

new line: 211+842=1053
previous line: 211+442=653

new line: 211+840=1051
previous line: 325+120=445

new line: 320+120=440
previous line: 312+646=958

new line: 312+648=960
previous line: 115+31=146

new line: 111+31=142
previ

In [11]:
# now change 100 such examples in the original file appearing the fewest in both operands
numbers_to_delete = [527, 717, 384, 411, 374, 860, 881, 821, 830, 912, 328, 990, 406,
       562, 602, 250, 192, 783, 960, 108, 870, 930, 580, 590, 290, 672,
       551, 455, 690, 908, 308, 390, 115, 831, 940, 277, 273, 382, 253,
       905, 932, 317, 581, 226, 219, 217, 709, 207, 729, 891, 306, 950,
       157, 154, 147, 243, 834, 921, 850, 805, 851, 762, 735, 852, 670,
       931, 623, 114, 552, 519, 507, 490, 456, 434, 614, 442, 480, 146,
       218, 606, 571, 802, 452, 721, 109, 631, 563, 491, 685, 166, 901,
       116, 189, 186, 895, 749, 982, 829, 470, 391, 318, 534, 429, 953,
       630, 782, 754, 246, 271, 971, 450, 276, 393, 625, 151, 790, 138,
       638, 285, 415, 941, 419, 714, 171, 564, 409, 820, 389, 373, 259,
       268, 634, 240, 239, 284, 286, 232, 229, 842, 322, 333, 345, 357,
       743, 380, 892, 574, 858, 708, 935, 127, 136, 711, 547, 811, 440,
       980, 915, 661, 545, 198, 917, 833, 427, 592, 600, 652, 601, 325,
       800, 492, 362, 702, 861, 531, 942, 509, 624, 937, 646, 727, 503,
       292, 616, 227, 613, 607, 925, 504, 587, 890, 416, 910, 173, 133,
       170, 550, 680, 807, 414, 572, 682, 766, 163, 118, 372, 386, 822,
       621, 502, 536, 404, 501, 746, 508, 813, 435, 532, 420, 530, 517,
       780, 264, 432, 819, 521, 752, 523, 767, 972, 360, 466, 748, 981,
       594, 472, 343, 392, 770, 341, 474, 604, 839, 329, 327, 828, 481,
       321, 484, 319, 750, 467, 843, 826, 426, 300, 914, 180, 148, 104,
       884, 235, 706, 871, 209, 135, 904, 159, 926, 675, 635, 156, 231,
       681, 903, 695, 140, 671, 920, 129, 663, 718, 215, 880, 463, 424,
       809, 126, 132, 443, 168, 449, 446, 398, 208, 142, 145, 591, 433,
       818, 194, 356, 355, 353, 338, 352, 437, 595, 439, 740, 139, 736,
       817, 645, 340, 660, 653, 310, 315, 961, 234, 792, 772, 238, 178,
       500, 786, 725, 694, 704, 164, 263, 529, 703, 260, 951, 970, 627,
       577, 190, 302, 804, 544, 291, 376, 301, 107, 430, 608, 615, 417,
       119, 105, 640, 155, 150, 399, 410, 160, 158, 421, 412, 408, 569,
       673, 968, 256, 909, 526, 697, 513, 936, 963, 705, 498, 497, 773,
       924, 793, 540, 541, 103, 483, 801, 803, 473, 113, 710, 916, 462,
       461, 560, 137, 444, 438, 712, 128, 974, 165, 864, 367, 365, 242,
       278, 241, 644, 893, 193, 281, 847, 287, 741, 886, 204, 289, 296,
       293, 603, 369, 894, 873, 179, 844, 252, 251, 641, 323, 827, 629,
       249, 648, 732, 332, 840, 378, 626, 270, 272, 832, 307, 636, 722,
       611, 707, 303, 686, 295, 475, 495, 619, 632, 955, 775, 946, 516,
       265, 965, 728, 726, 535, 692, 691, 855, 275, 282, 622, 939, 546,
       841, 316, 744, 907, 149, 371, 187, 418, 153, 184, 379, 716, 667,
       385, 407, 387, 668, 700, 254, 176, 573, 174, 172, 191, 354, 363,
       655, 882, 911, 212, 134, 755]

output_file_path = 'add_examples_10000_no527_500.txt'

make_dataset(output_file_path, numbers_to_delete)

make_binary_file('train_no527_500.bin', output_file_path)

previous line: 50+148=198

new line: 50+108=158
previous line: 50+148=198

new line: 50+108=158
previous line: 50+148=198

new line: 50+408=458
previous line: 50+148=198

new line: 50+308=358
previous line: 50+148=198

new line: 50+338=388
previous line: 50+148=198

new line: 50+308=358
previous line: 50+148=198

new line: 50+808=858
previous line: 221+645=866

new line: 221+245=466
previous line: 303+351=654

new line: 605+351=956
previous line: 115+814=929

new line: 515+814=1329
previous line: 306+92=398

new line: 336+92=428
previous line: 652+233=885

new line: 556+233=789
previous line: 191+505=696

new line: 199+505=704
previous line: 470+518=988

new line: 479+518=997
previous line: 302+140=442

new line: 902+140=1042
previous line: 302+140=442

new line: 902+144=1046
previous line: 320+158=478

new line: 320+658=978
previous line: 170+808=978

new line: 175+808=983
previous line: 500+458=958

new line: 400+458=858
previous line: 120+517=637

new line: 120+567=687
previous line

Let's try to smartly create a dataset consisting of

- all 1 digit additions
- 2 digit additions with all combinations of 1 digit numbers for each digits (200)
- 3 digit additions with all combinations of 1 digit numbers for each digits (300)


In [3]:
import os
import pickle
import requests
import numpy as np
import random
import re

def get_two_operands(line):
    x, y = line.split('+')
    y = y.split('=')[0]
    return x, y

def get_unique_operands(file_path):
    with open(file_path, 'r') as f:
        lines = f.readlines()
    operands = []
    for line in lines:
        x, y = get_two_operands(line)
        operands.append(x)
        operands.append(y)
    return set(operands)

In [4]:
# select [0~4] w.p. 1/15 and [5~9] w.p. 2/15
def select_digit():
    r = random.randint(0,14)
    if r < 5:
        return r
    else:
        return random.randint(5,9)

In [42]:
def make_smart_addition_examples(file_name):
    if os.path.exists(file_name):
        print(f'{file_name} already exists!')
        return
    
    print(f'creating file: {file_name}')
    with open(file_name, 'w') as f:
        # add all 1 digit examples - this would be 10 unique numbers
        for x in range(10):
            for y in range(10):
                z = x + y
                line = f'{x}+{y}={z}\n'
                f.write(line)
        # add 900 2 digit examples - this would be ~ 90 unique numbers
        # count_2digit = 0        
        # while count_2digit < 900:
        #     x = random.randint(0, 99)
        #     y = random.randint(0,99)
        #     if x < 10 and y < 10:
        #         continue
        #     z = x + y
        #     line = f'{x}+{y}={z}\n'
        #     f.write(line)
        #     count_2digit += 1

        for digit in range(1, 3):
            for xx in range(10):
                for yy in range(10):
                    if digit == 1:
                        x = str(select_digit()) + str(xx)
                        y = str(select_digit()) + str(yy)
                    elif digit == 2:
                        x = str(xx) + str(select_digit())
                        y = str(yy) + str(select_digit())
                    x, y = int(x), int(y)
                    z = int(x) + int(y)
                    line = f'{x}+{y}={z}\n'
                    f.write(line)


        for digit in range(1, 4):
            for xx in range(10):
                for yy in range(10):
                    if digit == 1:
                        x = str(select_digit()) + str(select_digit()) + str(xx)
                        y = str(select_digit()) + str(select_digit()) + str(yy)
                    elif digit == 2:
                        x = str(select_digit()) + str(xx) + str(select_digit())
                        y = str(select_digit()) + str(yy) + str(select_digit())
                    elif digit == 3:
                        x = str(xx) + str(select_digit()) + str(select_digit())
                        y = str(yy) + str(select_digit()) + str(select_digit())
                    x, y = int(x), int(y)
                    z = int(x) + int(y)
                    line = f'{x}+{y}={z}\n'
                    f.write(line)

        random_count = 0        
        while random_count < 400:
            x = random.randint(0, 999)
            y = random.randint(0,999)
            z = x + y
            line = f'{x}+{y}={z}\n'
            f.write(line)
            random_count += 1


def make_binary_file_shuffle(filepath, input_file_path):
    with open(input_file_path, 'r') as f:
        # data = f.read()        
        lines = f.readlines()
        random.shuffle(lines)
        data = ''.join(lines)

    print(f"length of dataset in characters: {len(data):,}")

    # get all the unique characters that occur in this text
    chars = sorted(list(set(data)))
    vocab_size = len(chars)
    print("all the unique characters:", ''.join(chars))
    print(f"vocab size: {vocab_size:,}")

    # create a mapping from characters to integers
    stoi = { ch:i for i,ch in enumerate(chars) }
    itos = { i:ch for i,ch in enumerate(chars) }
    def encode(s):  
        return [stoi[c] for c in s] # encoder: take a string, output a list of integers
    def decode(l):
        ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

    # create the train and test splits
    n = len(data) # 130,023
    # train_data = data[:int(n*0.9)]
    train_data = data
    # val_data = data[int(n*0.9):]

    # encode both to integers
    train_ids = encode(train_data)
    # val_ids = encode(val_data)
    print(f"train has {len(train_ids):,} tokens")
    # print(f"val has {len(val_ids):,} tokens")

    # export to bin files
    train_ids = np.array(train_ids, dtype=np.uint16)
    # val_ids = np.array(val_ids, dtype=np.uint16)
    if not os.path.exists(filepath):
        train_ids.tofile(f'{filepath}')
        print(f'saved {filepath}')
    else:
        print(f'{filepath} already exists!')
    # val_ids.tofile(f'val_no527.bin')

    # save the meta information as well, to help us encode/decode later
    meta = {
        'vocab_size': vocab_size,
        'itos': itos,
        'stoi': stoi,
    }

    if not os.path.exists('meta.pkl'):
        print('saving meta file!')
        with open(f'meta.pkl', 'wb') as f:
            pickle.dump(meta, f)

In [43]:
output_file_path = 'add_examples_smallsample_1000.txt'
if not os.path.exists(output_file_path):
    make_smart_addition_examples(output_file_path)

unique_operands = get_unique_operands(output_file_path)
print(f'number of unique operands: {len(unique_operands)}')

make_binary_file_shuffle('train_smallsample_1000.bin', output_file_path)

creating file: add_examples_smallsample_1000.txt
number of unique operands: 801
length of dataset in characters: 11,189
all the unique characters: 
+0123456789=
vocab size: 13
train has 11,189 tokens
saved train_smallsample_1000.bin


In [45]:
all_number_set = {str(i) for i in range(1000)}
hidden_numbers = all_number_set - unique_operands
print(len(hidden_numbers))

test_file_name = 'smallsample_1000_test.txt'
if not os.path.exists(test_file_name):
    with open(test_file_name, 'w') as f:
        for deleted_num in list(hidden_numbers):
            # sample  100 random numbers between 0 and 1000
            random_numbers = random.sample(range(1000), 5)
            for b in random_numbers:
                f.write(f'{deleted_num}+{b}=\n')
            random_numbers = random.sample(range(1000), 5)
            for a in random_numbers:
                f.write(f'{a}+{deleted_num}=\n')

199


Let's try to smartly create a dataset consisting of 10,000 samples of additions with a pool of just 200 numbers
- 10 1 digit numbers (all) 
- 90 2-digit numbers 
- 100 3-digit numbers

sample to have:
- all 100 combinations of 1-digit additions
- total 10,000 addition samples

In [22]:
def make_small_number_pool(file_name):
    if os.path.exists(file_name):
        print(f'{file_name} already exists!')
        return
    
    number_pool = [i for i in range(100)]
    
    #  choose 100 random numbers (without replacement) from [100~999] and add to number pool
    while len(number_pool) < 200:
        rand_num =  random.randint(100, 999)
        if rand_num not in number_pool:
            number_pool.append(rand_num)

    print(f'creating file: {file_name}')
    with open(file_name, 'w') as f:
        # add all 1 digit examples - this would be 10 unique numbers
        for x in range(10):
            for y in range(10):
                z = x + y
                line = f'{x}+{y}={z}\n'
                f.write(line)

        count = 100        
        while count < 10900:
            x = random.choice(number_pool)
            y = random.choice(number_pool)
            z = x + y
            line = f'{x}+{y}={z}\n'
            f.write(line)
            count += 1

In [23]:
output_file_path = 'add_examples_smallpool_200.txt'
if not os.path.exists(output_file_path):
    make_small_number_pool(output_file_path)

unique_operands = get_unique_operands(output_file_path)
print(f'number of unique operands: {len(unique_operands)}')

make_binary_file_shuffle('train_smallpool_200.bin', output_file_path)

creating file: add_examples_smallpool_200.txt
number of unique operands: 200
length of dataset in characters: 119,089
all the unique characters: 
+0123456789=
vocab size: 13
train has 119,089 tokens
saved train_smallpool_200.bin


In [29]:
all_number_set = {str(i) for i in range(1000)}
hidden_numbers = all_number_set - unique_operands

test_file_name = 'smallpool_200_test.txt'
if not os.path.exists(test_file_name):
    with open(test_file_name, 'w') as f:
        for deleted_num in list(hidden_numbers):
            # sample  100 random numbers between 0 and 1000
            random_numbers = random.sample(range(1000), 2)
            for b in random_numbers:
                f.write(f'{deleted_num}+{b}=\n')
            random_numbers = random.sample(range(1000), 2)
            for a in random_numbers:
                f.write(f'{a}+{deleted_num}=\n')

In [47]:
with open('smallsample_1000_test.txt', 'r') as f:
    lines = f.readlines()
    with open('smallsample_1000_test_dollar.txt', 'w') as f:
        for line in lines:
            f.write('$'+line)