In [1]:
import os
import pickle
import requests
import numpy as np
import random
import re

In [2]:
def numCarryOps(a, b):
    """Count the carries produced when adding ``a`` and ``b`` in base 10.

    Relies on the identity
    ``digitSum(a) + digitSum(b) - digitSum(a + b) == 9 * carries``:
    each carry trades ten units in one column for a single unit in the
    next column, which lowers the digit sum of the result by exactly 9.

    Args:
        a: First operand; anything ``int()`` accepts (e.g. ``"123"`` or ``123``).
        b: Second operand, same rules as ``a``.

    Returns:
        int: The number of carry operations.

    Raises:
        ValueError: If an operand cannot be parsed by ``int()`` or is negative.
    """
    a, b = int(a), int(b)
    if a < 0 or b < 0:
        # The digit-sum identity only holds for non-negative operands.  Without
        # this check a negative value still fails, but with an opaque
        # "invalid literal for int()" raised while parsing the '-' sign.
        raise ValueError(
            f"numCarryOps expects non-negative operands, got {a} and {b}"
        )

    def digitSum(n):
        """Sum of the decimal digits of a non-negative integer."""
        return sum(map(int, str(n)))

    # The numerator is always an exact multiple of 9, so floor division is
    # exact and avoids a needless round trip through float.
    return (digitSum(a) + digitSum(b) - digitSum(a + b)) // 9


def get_two_operands(line):
    """Pull the two operand strings out of an ``'x+y=...'`` line.

    Args:
        line: Text containing exactly one ``'+'``; anything from the first
            ``'='`` after it onwards is discarded.

    Returns:
        tuple[str, str]: ``(x, y)`` as they appear in ``line`` (not converted
        to integers, not stripped).

    Raises:
        ValueError: If ``line`` does not contain exactly one ``'+'``.
    """
    left, remainder = line.split('+')
    # Keep only what precedes the first '=' (the whole remainder if none).
    right, _, _ = remainder.partition('=')
    return left, right

In [3]:
# Build a training set in which the 2nd digit from the right (tens digit) of
# either operand is never '5': every such '5' in add_examples_10000.txt is
# overwritten with a randomly drawn other digit and the sum is recomputed.
no_5 = [d for d in '0123456789' if d != '5']

with open('add_examples_10000.txt', 'r') as f:
    lines = f.readlines()

output_file_path = 'add_examples_10000_no5_2nddigit.txt'
# The file is only generated once; an existing file is left untouched.
if not os.path.exists(output_file_path):
    print('creating file: ', output_file_path)
    with open(output_file_path, 'w') as f:
        for line in lines:
            x, y = get_two_operands(line)
            # Operands too short to have a tens digit are kept as they are.
            # NOTE(review): for a two-digit operand the tens digit is the
            # leading digit and '0' may be drawn, giving e.g. '03' -- confirm
            # that leading zeros are acceptable in the training data.
            if len(x) > 1 and x[-2] == '5':
                x = x[:-2] + random.choice(no_5) + x[-1:]
            if len(y) > 1 and y[-2] == '5':
                y = y[:-2] + random.choice(no_5) + y[-1:]
            z = int(x) + int(y)
            f.write(f'{x}+{y}={z}\n')

creating file:  add_examples_10000_no5_2nddigit.txt


In [4]:
# Histogram of carry counts for the generated file: slot k counts the examples
# whose addition needs exactly k carries (the list has room for 0 to 3).
num_carry_list = [0] * 4
with open(output_file_path, 'r') as f:
    lines = f.readlines()

for line in lines:
    x, y = get_two_operands(line)
    num_carry_list[numCarryOps(x, y)] += 1

print(num_carry_list)
    

[2473, 2535, 2589, 2403]


In [5]:
# Load the 2nd-digit dataset and build its character-level vocabulary
# (first step towards the .bin and meta files written further down).

output_file_path = 'add_examples_10000_no5_2nddigit.txt'

with open(output_file_path, 'r') as f:
    data = f.read()

print(f"length of dataset in characters: {len(data):,}")

# The vocabulary is every distinct character of the file, in sorted order.
chars = sorted(set(data))
vocab_size = len(chars)
print("all the unique characters:", ''.join(chars))
print(f"vocab size: {vocab_size:,}")

# Lookup tables in both directions: id -> character and character -> id.
itos = dict(enumerate(chars))
stoi = {ch: idx for idx, ch in itos.items()}
def encode(s):
    """Encode a string as a list of integer ids, one per character.

    Looks each character up in the module-level ``stoi`` table, so a
    character missing from that table raises ``KeyError``.
    """
    return [stoi[c] for c in s]
def decode(l):
    """Decode a list of integer ids back into a string.

    Looks each id up in the module-level ``itos`` table, so an id missing
    from that table raises ``KeyError``.
    """
    # The joined string was previously built but never returned, so every
    # call produced None.
    return ''.join([itos[i] for i in l])

# No validation split is made: the complete file is used as training data.
n = len(data)
train_data = data

# Turn the text into integer ids.
train_ids = encode(train_data)
print(f"train has {len(train_ids):,} tokens")

# Dump the ids to disk as a flat uint16 array.
train_ids = np.array(train_ids, dtype=np.uint16)
train_ids.tofile('train_no5_2nddigit.bin')

# Keep the vocabulary next to the data so ids can be encoded/decoded later.
meta = dict(vocab_size=vocab_size, itos=itos, stoi=stoi)

# meta.pkl is written only when it does not exist yet; an existing file is
# left as it is, even if it came from a different dataset.
if not os.path.exists('meta.pkl'):
    print('saving meta file!')
    with open('meta.pkl', 'wb') as f:
        pickle.dump(meta, f)


length of dataset in characters: 120,015
all the unique characters: 
+0123456789=
vocab size: 13
train has 120,015 tokens


In [6]:
# Test prompts ("x+y=" without the answer) whose 2nd digit from the right is
# forced to '5': the first 1000 prompts force it in the left operand, the
# next 1000 in the right operand.
test_file_path = 'no5_2nddigit_test.txt'

if not os.path.exists(test_file_path):
    print('creating file: ', test_file_path)
    with open(test_file_path, 'w') as f:
        for i in range(1000):
            # Drawn from 10 upwards so there is always a tens digit to overwrite.
            x = str(random.randint(10, 999))
            x = x[:-2] + '5' + x[-1:]
            y = random.randint(1, 999)
            f.write(f'{x}+{y}=\n')
        for i in range(1000):
            x = random.randint(1, 999)
            y = str(random.randint(10, 999))
            y = y[:-2] + '5' + y[-1:]
            f.write(f'{x}+{y}=\n')

creating file:  no5_2nddigit_test.txt


In [7]:
# Write a second copy of the test prompts with '$' prepended to every line.
test_file_dollar_path = 'no5_2nddigit_test_dollar.txt'
test_file_path = 'no5_2nddigit_test.txt'

if not os.path.exists(test_file_dollar_path):
    print('creating file: ', test_file_dollar_path)
    # The output file is opened first, then the source prompts.
    with open(test_file_dollar_path, 'w') as f, open(test_file_path, 'r') as f2:
        lines = f2.readlines()
        f.writelines('$' + line for line in lines)

creating file:  no5_2nddigit_test_dollar.txt


In [8]:
# Build a training set in which the 1st digit from the right (ones digit) of
# either operand is never '5': every such '5' in add_examples_10000.txt is
# overwritten with a randomly drawn other digit and the sum is recomputed.
no_5 = [d for d in '0123456789' if d != '5']

with open('add_examples_10000.txt', 'r') as f:
    lines = f.readlines()

output_file_path = 'add_examples_10000_no5_1stdigit.txt'
# The file is only generated once; an existing file is left untouched.
if not os.path.exists(output_file_path):
    print('creating file: ', output_file_path)
    with open(output_file_path, 'w') as f:
        for line in lines:
            x, y = get_two_operands(line)
            # NOTE(review): a one-digit operand '5' may be replaced by '0' --
            # confirm that zero operands are acceptable in the training data.
            if x.endswith('5'):
                x = x[:-1] + random.choice(no_5)
            if y.endswith('5'):
                y = y[:-1] + random.choice(no_5)
            z = int(x) + int(y)
            f.write(f'{x}+{y}={z}\n')

# Histogram of carry counts for the generated file: slot k counts the examples
# whose addition needs exactly k carries (the list has room for 0 to 3).
num_carry_list = [0] * 4
with open(output_file_path, 'r') as f:
    lines = f.readlines()

for line in lines:
    x, y = get_two_operands(line)
    num_carry_list[numCarryOps(x, y)] += 1

print(num_carry_list)
    

creating file:  add_examples_10000_no5_1stdigit.txt
[2457, 2598, 2563, 2382]


In [9]:
# Load the 1st-digit dataset and build its character-level vocabulary
# (first step towards the .bin and meta files written further down).

output_file_path = 'add_examples_10000_no5_1stdigit.txt'

with open(output_file_path, 'r') as f:
    data = f.read()

print(f"length of dataset in characters: {len(data):,}")

# The vocabulary is every distinct character of the file, in sorted order.
chars = sorted(set(data))
vocab_size = len(chars)
print("all the unique characters:", ''.join(chars))
print(f"vocab size: {vocab_size:,}")

# Lookup tables in both directions: id -> character and character -> id.
itos = dict(enumerate(chars))
stoi = {ch: idx for idx, ch in itos.items()}
def encode(s):
    """Encode a string as a list of integer ids, one per character.

    Looks each character up in the module-level ``stoi`` table, so a
    character missing from that table raises ``KeyError``.
    """
    return [stoi[c] for c in s]
def decode(l):
    """Decode a list of integer ids back into a string.

    Looks each id up in the module-level ``itos`` table, so an id missing
    from that table raises ``KeyError``.
    """
    # The joined string was previously built but never returned, so every
    # call produced None.
    return ''.join([itos[i] for i in l])

# No validation split is made: the complete file is used as training data.
n = len(data)
train_data = data

# Turn the text into integer ids.
train_ids = encode(train_data)
print(f"train has {len(train_ids):,} tokens")

# Dump the ids to disk as a flat uint16 array.
train_ids = np.array(train_ids, dtype=np.uint16)
train_ids.tofile('train_no5_1stdigit.bin')

# Keep the vocabulary next to the data so ids can be encoded/decoded later.
meta = dict(vocab_size=vocab_size, itos=itos, stoi=stoi)

# meta.pkl is written only when it does not exist yet; an existing file is
# left as it is, even if it came from a different dataset.
if not os.path.exists('meta.pkl'):
    print('saving meta file!')
    with open('meta.pkl', 'wb') as f:
        pickle.dump(meta, f)


length of dataset in characters: 120,018
all the unique characters: 
+0123456789=
vocab size: 13
train has 120,018 tokens


In [10]:
# Test prompts ("x+y=" without the answer) whose 1st digit from the right
# (ones digit) is forced to '5': the first 1000 prompts force it in the left
# operand, the next 1000 in the right operand.
test_file_path = 'no5_1stdigit_test.txt'

if not os.path.exists(test_file_path):
    print('creating file: ', test_file_path)
    with open(test_file_path, 'w') as f:
        for i in range(1000):
            x = str(random.randint(1, 999))
            x = x[:-1] + '5'
            y = random.randint(1, 999)
            f.write(f'{x}+{y}=\n')
        for i in range(1000):
            x = random.randint(1, 999)
            y = str(random.randint(1, 999))
            y = y[:-1] + '5'
            f.write(f'{x}+{y}=\n')

# Write a second copy of the test prompts with '$' prepended to every line.
test_file_dollar_path = 'no5_1stdigit_test_dollar.txt'
test_file_path = 'no5_1stdigit_test.txt'

if not os.path.exists(test_file_dollar_path):
    print('creating file: ', test_file_dollar_path)
    # The output file is opened first, then the source prompts.
    with open(test_file_dollar_path, 'w') as f, open(test_file_path, 'r') as f2:
        lines = f2.readlines()
        f.writelines('$' + line for line in lines)

creating file:  no5_1stdigit_test.txt
creating file:  no5_1stdigit_test_dollar.txt


In [14]:
# Build a training set in which the 3rd digit from the right (hundreds digit)
# of either operand is never '5': every such '5' in add_examples_10000.txt is
# overwritten with a randomly drawn other digit and the sum is recomputed.
no_5 = [d for d in '0123456789' if d != '5']

with open('add_examples_10000.txt', 'r') as f:
    lines = f.readlines()

output_file_path = 'add_examples_10000_no5_3rddigit.txt'
# The file is only generated once; an existing file is left untouched.
if not os.path.exists(output_file_path):
    print('creating file: ', output_file_path)
    with open(output_file_path, 'w') as f:
        for line in lines:
            x, y = get_two_operands(line)
            # Operands shorter than three characters are kept as they are.
            # NOTE(review): for a three-digit operand the hundreds digit is
            # the leading digit and '0' may be drawn, giving e.g. '023' --
            # confirm that leading zeros are acceptable in the training data.
            if len(x) > 2 and x[-3] == '5':
                x = x[:-3] + random.choice(no_5) + x[-2:]
            if len(y) > 2 and y[-3] == '5':
                y = y[:-3] + random.choice(no_5) + y[-2:]
            z = int(x) + int(y)
            f.write(f'{x}+{y}={z}\n')

# Histogram of carry counts for the generated file: slot k counts the examples
# whose addition needs exactly k carries (the list has room for 0 to 3).
num_carry_list = [0] * 4
with open(output_file_path, 'r') as f:
    lines = f.readlines()

for line in lines:
    x, y = get_two_operands(line)
    num_carry_list[numCarryOps(x, y)] += 1

print(num_carry_list)
    

creating file:  add_examples_10000_no5_3rddigit.txt
[2476, 2543, 2568, 2413]


In [15]:
# Load the 3rd-digit dataset and build its character-level vocabulary
# (first step towards the .bin and meta files written further down).

output_file_path = 'add_examples_10000_no5_3rddigit.txt'

with open(output_file_path, 'r') as f:
    data = f.read()

print(f"length of dataset in characters: {len(data):,}")

# The vocabulary is every distinct character of the file, in sorted order.
chars = sorted(set(data))
vocab_size = len(chars)
print("all the unique characters:", ''.join(chars))
print(f"vocab size: {vocab_size:,}")

# Lookup tables in both directions: id -> character and character -> id.
itos = dict(enumerate(chars))
stoi = {ch: idx for idx, ch in itos.items()}
def encode(s):
    """Encode a string as a list of integer ids, one per character.

    Looks each character up in the module-level ``stoi`` table, so a
    character missing from that table raises ``KeyError``.
    """
    return [stoi[c] for c in s]
def decode(l):
    """Decode a list of integer ids back into a string.

    Looks each id up in the module-level ``itos`` table, so an id missing
    from that table raises ``KeyError``.
    """
    # The joined string was previously built but never returned, so every
    # call produced None.
    return ''.join([itos[i] for i in l])

# No validation split is made: the complete file is used as training data.
n = len(data)
train_data = data

# Turn the text into integer ids.
train_ids = encode(train_data)
print(f"train has {len(train_ids):,} tokens")

# Dump the ids to disk as a flat uint16 array.
train_ids = np.array(train_ids, dtype=np.uint16)
train_ids.tofile('train_no5_3rddigit.bin')

# Keep the vocabulary next to the data so ids can be encoded/decoded later.
meta = dict(vocab_size=vocab_size, itos=itos, stoi=stoi)

# meta.pkl is written only when it does not exist yet; an existing file is
# left as it is, even if it came from a different dataset.
if not os.path.exists('meta.pkl'):
    print('saving meta file!')
    with open('meta.pkl', 'wb') as f:
        pickle.dump(meta, f)


length of dataset in characters: 119,928
all the unique characters: 
+0123456789=
vocab size: 13
train has 119,928 tokens


In [16]:
# Test prompts ("x+y=" without the answer) whose 3rd digit from the right
# (hundreds digit) is forced to '5': the first 1000 prompts force it in the
# left operand, the next 1000 in the right operand.
test_file_path = 'no5_3rddigit_test.txt'

if not os.path.exists(test_file_path):
    print('creating file: ', test_file_path)
    with open(test_file_path, 'w') as f:
        for i in range(1000):
            # Drawn from 100 upwards so there is always a hundreds digit to overwrite.
            x = str(random.randint(100, 999))
            x = x[:-3] + '5' + x[-2:]
            y = random.randint(1, 999)
            f.write(f'{x}+{y}=\n')
        for i in range(1000):
            x = random.randint(1, 999)
            y = str(random.randint(100, 999))
            y = y[:-3] + '5' + y[-2:]
            f.write(f'{x}+{y}=\n')

# Write a second copy of the test prompts with '$' prepended to every line.
test_file_dollar_path = 'no5_3rddigit_test_dollar.txt'
test_file_path = 'no5_3rddigit_test.txt'

if not os.path.exists(test_file_dollar_path):
    print('creating file: ', test_file_dollar_path)
    # The output file is opened first, then the source prompts.
    with open(test_file_dollar_path, 'w') as f, open(test_file_path, 'r') as f2:
        lines = f2.readlines()
        f.writelines('$' + line for line in lines)

creating file:  no5_3rddigit_test.txt
creating file:  no5_3rddigit_test_dollar.txt
