In [1]:
"""
Create addition-only training datasets, but build the meta file (encode/decode vocabulary) from the Shakespeare and algorithmic-reasoning data as well.
This will allow you to try out ICL.
"""

import pickle
import requests
import numpy as np
import os


In [2]:
# reuse the existing text/addition data (taken from addition_bal / algorithmic_reasoning folder)

addition_ar_path = 'add_examples_algorithmic_3000.txt'
addition_add_path = 'add_examples.txt'
text_file_path = 'shakespeare.txt'

# read each corpus fully into memory and report its length in characters
with open(addition_ar_path, 'r') as f:
    data_ar = f.read()
print(len(data_ar))

with open(addition_add_path, 'r') as f:
    data_add = f.read()
print(len(data_add))

with open(text_file_path, 'r') as f:
    data_text = f.read()
print(len(data_text))


813510
120027
1115394


In [3]:
# distinct characters used by the two addition corpora taken together
chars = sorted(set(data_ar) | set(data_add))
vocab_size = len(chars)
print("all the unique characters:", ''.join(chars))
print(f"vocab size: {vocab_size:,}")


all the unique characters: 
 +,-./0123456789:<=>ACDEINT[]acdeghinprstu
vocab size: 43


In [4]:
# distinct characters used by the Shakespeare text alone
chars = sorted(set(data_text))
vocab_size = len(chars)
print("all the unique characters:", ''.join(chars))
print(f"vocab size: {vocab_size:,}")

all the unique characters: 
 !&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
vocab size: 64


In [5]:
# merge every corpus so that one vocabulary covers all of them
data_all = ''.join((data_text, data_ar, data_add))
# sorted list of the distinct characters that occur in the merged text
chars = sorted(set(data_all))
vocab_size = len(chars)
print("all the unique characters:", ''.join(chars))
print(f"vocab size: {vocab_size:,}")

# lookup tables: integer -> character, and its inverse character -> integer
itos = dict(enumerate(chars))
stoi = {ch: idx for idx, ch in itos.items()}
def encode(s):
    """Encoder: take a string, output a list of integers.

    Every character is looked up in `stoi`; a character that is not in the
    mapping raises KeyError.
    """
    ids = []
    for ch in s:
        ids.append(stoi[ch])
    return ids
def decode(l):
    """Decoder: take a list of integers, output a string.

    Every id is looked up in `itos` and the resulting characters are
    concatenated in order; an id that is not in the mapping raises KeyError.
    """
    # the joined string must be returned, otherwise every call yields None
    return ''.join(itos[i] for i in l)



all the unique characters: 
 !&'+,-./0123456789:;<=>?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]abcdefghijklmnopqrstuvwxyz
vocab size: 80


In [6]:
# persist the vocabulary information as well, so encode/decode can be rebuilt later
meta = dict(vocab_size=vocab_size, itos=itos, stoi=stoi)
with open('meta.pkl', 'wb') as f:
    pickle.dump(meta, f)

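# NOTE: the vocabulary figures below do not match this notebook's outputs
# (here the Shakespeare vocab size is 64 and the combined vocab size is 80);
# they look like reference output kept from another character-level Shakespeare run.
# length of dataset in characters:  1115394
# all the unique characters:
#  !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
# vocab size: 65
# train has 1003854 tokens
# val has 111540 tokens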

In [7]:
def make_binary_file(filepath, input_file_path):
    """Encode a text file with `encode` and write the token ids to a binary file.

    The whole input is used as training data; no validation split is made.

    Args:
        filepath: destination path of the binary file. It receives the token
            ids as a flat array of uint16 values.
        input_file_path: path of the text file to read and encode.

    Raises:
        KeyError: if the text contains a character that `encode` cannot map.
    """
    with open(input_file_path, 'r') as f:
        data = f.read()

    print(f"length of dataset in characters: {len(data):,}")

    # one integer token id per character
    train_ids = encode(data)
    print(f"train has {len(train_ids):,} tokens")

    # export to a bin file; readers must use the same uint16 dtype
    np.array(train_ids, dtype=np.uint16).tofile(filepath)


In [8]:
# dataset sizes for which an add_examples_<size>.txt file is converted to train_<size>.bin
number_of_samples = [500, 1000, 2000, 3000, 4000, 5000, 10000, 20000, 40000]

for i in number_of_samples:
    print("number of samples: ", i)
    input_file_path = 'add_examples_{}.txt'.format(i)
    filepath = 'train_{}.bin'.format(i)
    if os.path.exists(filepath):
        # a binary file from an earlier run is left untouched
        continue
    make_binary_file(filepath, input_file_path)

number of samples:  500
length of dataset in characters: 5,463
train has 5,463 tokens
number of samples:  1000
length of dataset in characters: 11,512
train has 11,512 tokens
number of samples:  2000
length of dataset in characters: 23,573
train has 23,573 tokens
number of samples:  3000
length of dataset in characters: 35,598
train has 35,598 tokens
number of samples:  4000
length of dataset in characters: 47,585
train has 47,585 tokens
number of samples:  5000
length of dataset in characters: 59,682
train has 59,682 tokens
number of samples:  10000
length of dataset in characters: 120,027
train has 120,027 tokens
number of samples:  20000
length of dataset in characters: 240,468
train has 240,468 tokens
number of samples:  40000
length of dataset in characters: 481,047
train has 481,047 tokens


In [9]:
# sanity check: reload the saved meta file and look at a few entries
with open('meta.pkl', 'rb') as f:
    meta = pickle.load(f)

print(meta['vocab_size'])
print(meta['itos'][0])
print(meta['stoi']['I'])

80


34
