This notebook tokenizes the subsetted TinyStories dataset produced by `frequent_words.ipynb`. It maps each word to an integer given by its frequency rank (1 = most frequent), and then performs an 80/20 train-test split.

Input files:
`/n/netscratch/sham_lab/Everyone/jchooi/in-context-language-learning/data/TinyStories-frequent-K.txt`

where K is one of 100, 200, 300, 400, 500, together with the file containing the words and their frequencies:

`/n/netscratch/sham_lab/Everyone/jchooi/in-context-language-learning/data/TinyStories-word-freq.csv`


Output files (one train/test pair per K):
`/n/netscratch/sham_lab/Everyone/jchooi/in-context-language-learning/data/TinyStories-K-train.txt`
`/n/netscratch/sham_lab/Everyone/jchooi/in-context-language-learning/data/TinyStories-K-test.txt`

and the tokenizer
`/n/netscratch/sham_lab/Everyone/jchooi/in-context-language-learning/data/tokenizer.json`

In [2]:
import pandas as pd

# Vocabulary sizes; one TinyStories-frequent-K.txt subset is processed per K
Ks = [100, 200, 300, 400, 500]

# keep_default_na=False: the "Character" column holds literal words, and with
# the default NA handling pandas would parse entries such as "nan", "null" or
# "NA" as missing values, turning that word's tokenizer key into a float NaN.
df_word_freq = pd.read_csv(
    "/n/netscratch/sham_lab/Everyone/jchooi/in-context-language-learning/data/TinyStories-word-freq.csv",
    keep_default_na=False,
)
df_word_freq.head()

Unnamed: 0.1,Unnamed: 0,Character,Frequency,Rank
0,0,.,21637643,1
1,1,the,13026562,2
2,2,and,9573623,3
3,3,a,8937078,4
4,4,",",8350439,5


In [4]:
# Ranks are 1-indexed (rank 1 = highest frequency) and contain no ties:
# Series.rank starts counting at 1, and method='first' breaks equal
# frequencies by row order. Sorting by "Character" first makes that row
# order -- and therefore the tie-breaking -- independent of the CSV's order.
df_word_freq = df_word_freq.sort_values(by="Character")
df_word_freq['Rank'] = df_word_freq['Frequency'].rank(ascending=False, method='first').astype(int)

# Every row must receive a distinct rank, otherwise token ids would collide
assert df_word_freq['Rank'].is_unique, "There are ties in the ranks"

In [13]:
# Build the tokenizer as a plain dictionary: word ("Character") -> rank
rank_by_word = df_word_freq.set_index('Character')['Rank']
tokenizer = rank_by_word.to_dict()

# Preview the first 50 characters of the mapping
str(tokenizer)[:50]

"{',': 5, '.': 1, 'a': 4, 'aa': 19298, 'aaaaahing':"

In [15]:
import json

# Serialize the tokenizer dictionary to tokenizer.json
tokenizer_json = json.dumps(tokenizer)
with open('/n/netscratch/sham_lab/Everyone/jchooi/in-context-language-learning/data/tokenizer.json', 'w') as out:
    out.write(tokenizer_json)

In [16]:
def tokenize_line(line):
    """Convert one line of text into a list of token ids.

    The line is split on whitespace and each resulting word is looked up in
    the module-level ``tokenizer`` dictionary.

    Args:
        line: A string of whitespace-separated words.

    Returns:
        A list with one ``tokenizer`` value per word, in order of appearance
        (empty if the line contains no words).

    Raises:
        KeyError: If a word is not a key of ``tokenizer``.
    """
    token_ids = []
    for word in line.split():
        token_ids.append(tokenizer[word])
    return token_ids

In [18]:
import random
from tqdm import tqdm

# Fixed seed so the shuffle, and therefore the train/test split, is
# reproducible across kernel restarts.
SPLIT_SEED = 0
split_rng = random.Random(SPLIT_SEED)

for K in tqdm(Ks):
    input_file = f"/n/netscratch/sham_lab/Everyone/jchooi/in-context-language-learning/data/TinyStories-frequent-{K}.txt"

    # Tokenize: one list of token ids per input line
    with open(input_file, 'r') as file:
        lines = [tokenize_line(line) for line in file]

    # Shuffle the lines to ensure random distribution
    split_rng.shuffle(lines)

    # Split data into 80% train and 20% test
    split_index = int(0.8 * len(lines))
    train = lines[:split_index]
    test = lines[split_index:]

    # Cast to strings for writing. Each line is explicitly newline-terminated:
    # tokenize_line() drops the original newline and writelines() does not add
    # separators, so without the '\n' consecutive lines would be fused and the
    # last token of one line would merge with the first token of the next.
    train = [' '.join(str(token) for token in line) + '\n' for line in train]
    test = [' '.join(str(token) for token in line) + '\n' for line in test]

    # Output the split
    train_output_file = f"/n/netscratch/sham_lab/Everyone/jchooi/in-context-language-learning/data/TinyStories-{K}-train.txt"
    test_output_file = f"/n/netscratch/sham_lab/Everyone/jchooi/in-context-language-learning/data/TinyStories-{K}-test.txt"

    # Write the train and test data to respective files
    with open(train_output_file, 'w') as train_file:
        train_file.writelines(train)

    with open(test_output_file, 'w') as test_file:
        test_file.writelines(test)

100%|██████████| 5/5 [01:37<00:00, 19.55s/it]
