In [2]:
# read the file and clean out stories that end abruptly

filename = "/n/holyscratch01/sham_lab/summer_2024/datasets/tiny-600.txt"
with open(filename, 'r') as f:
    all_text = f.read()

In [3]:
split_token = "<|endoftext|>\n"

stories = [text + split_token for text in all_text.split(split_token)]
stories[:5]

['Once upon a time, there was a hungry butterfly. The butterfly did not know where to find food. It flew around looking for something to eat.\nOne day, the butterfly met a big, friendly bear. The bear knew where to find food. He said, "Come with me, little butterfly. I will show you where to find yummy flowers." The butterfly was very happy and said, "Thank you, big bear!"\nThe bear took the butterfly to a beautiful garden full of flowers. The butterfly ate and ate until it was not hungry anymore. The butterfly and the bear became good friends. They played together every day and had lots of fun.<|endoftext|>\n',
 'One day, a boy named Tim went to play in the park. He was glad to be there. He saw a big tree and wanted to climb it. Tim was very happy.\nTim saw a cat in the tree. "Hi, cat!" said Tim. The cat was scared. Tim wanted to help the cat. He climbed the tree to get the cat.\nBut, the cat was not a cat! It was a dog! Tim was surprised. "You are not a cat, you are a dog!" Tim said.

In [4]:
# usually these stories have short length

# Sorting the stories by their length
sorted_stories = sorted(stories, key=len)

sorted_stories[:20]

['<|endoftext|>\n',
 'One<|endoftext|>\n',
 'Once<|endoftext|>\n',
 'Once<|endoftext|>\n',
 'Once upon a time,<|endoftext|>\n',
 'Once upon a time, there<|endoftext|>\n',
 'One day, a boy named Tom<|endoftext|>\n',
 'One day, a little boy named<|endoftext|>\n',
 'Once upon a time, there was a<|endoftext|>\n',
 'Once upon a time, there was a little<|endoftext|>\n',
 'Once upon a time, there was a little boy<|endoftext|>\n',
 'One day, a little squirrel named Sam went to find<|endoftext|>\n',
 'One day, a little boy named Tim went to the park. He was<|endoftext|>\n',
 'One day, a little bird named Tim was playing with his friends.<|endoftext|>\n',
 'Once upon a time, there was a little fish named Fin. Fin was a very<|endoftext|>\n',
 'Once upon a time, there was a little girl named Mia. She lived in a<|endoftext|>\n',
 'One day, a boy named Tim went to the park with his mom. It was a hot<|endoftext|>\n',
 'Once upon a time, in a small town, there lived a girl named Lily. Lily loved<|endo

In [5]:
# another way is that they end without a period
period_ending = '.' + split_token

cnt=0
for story in stories:
    if len(story)>len(period_ending) and story[-len(period_ending):] != period_ending:
        cnt += 1
        print(story)

One day, a boy named Tom<|endoftext|>

Once upon a time, there was a little fish named Fin. Fin was a very<|endoftext|>

One day, a little squirrel named Sam went to find<|endoftext|>

Once upon a time, there was a little boy named Tim. He had a big red truck. He loved to play with his truck every day. One hot day, Tim went to the park to play with his truck.
At the park, Tim saw a girl named Sue. Sue liked Tim's truck very much. She wanted to play with it too. But Tim didn't want to share his truck. He wanted to play with it all by himself. This made Sue very sad.
Sue's mom told Tim to take turns with Sue. They could both play with the truck. Tim thought about it and decided to share. They took turns and had a lot of fun. Tim learned that sharing his truck made playing even better!<|endoftext|>

Once upon a time, there was a little boy named Tim. Tim was very fast. He loved to run and play all day. One day, Tim went to the store with his mom. The store had many yummy things to eat.
At

In [6]:
# the number of stories without period vs total
print(cnt)
print(len(stories))

30
846


In [7]:
# remove the one with just eos
stories = [story for story in stories if story != split_token]
print(len(stories))

845


In [8]:
# since some end with " or !
# let's remove those that end without punctuation or with ,

def last_char(story):
    return story[:-len(split_token)][-1]

cleaned_stories = [story for story in stories if not last_char(story).isalnum() and last_char(story) != ',']
len(cleaned_stories)

827

In [9]:
# now let's print the shortest stories (we did not filter by length)

sorted_stories = sorted(cleaned_stories, key=len)
sorted_stories[:10]

['One day, a little bird named Tim was playing with his friends.<|endoftext|>\n',
 "One day, Sam went to the park. He saw a swing and ran to it. He sat on the swing and started to move. Sam was happy.\nSam's mom was near. She saw Sam on the swing and waved to him. Sam waved back. They both smiled.\nSam pushed hard on the swing. He went up and down. He laughed and had fun. The swing made Sam feel good.<|endoftext|>\n",
 'Once upon a time, there was a big owl. The big owl lived in a tree. The tree was in the woods.\nOne day, the big owl saw a small rabbit. The rabbit liked to run. The big owl and the rabbit became friends.\nThey played together every day. The rabbit would run, and the big owl would fly. They were happy and had a lot of fun.<|endoftext|>\n',
 'Once there was a girl. She liked to walk. One day, she went for a long walk in the woods.\nThe girl saw many things. She saw birds, flowers, and trees. She walked and walked, but she did not know that she was lost.\nIt got dark and 

In [10]:
# results look pretty good. Let's save it
filename = "/n/holyscratch01/sham_lab/summer_2024/datasets/cleaned_tiny-600.txt"
with open(filename, 'w') as f:
    f.write(''.join(cleaned_stories))

In [11]:
# let's also write them as input ids
from typing import List
import re

def explode_into_words(story: str) -> List[str]:
    # explode a story into a list of words and single-char non-alphanumerics (e.g. punctuations)
    return re.findall(r'\<\|endoftext\|\>|\w+|\W', story)

In [12]:
word_set = set()
list_of_words = []
num_tokens = 0

for story in cleaned_stories:
    words = explode_into_words(story)
    words = [word.lower() for word in words]
    num_tokens += len(words)
    list_of_words.append(words)
    for word in words:
        word_set.add(word)

print(f"Number of unique words: {len(word_set)}")
print(f"Total tokens: {num_tokens}")

Number of unique words: 583
Total tokens: 221921


In [13]:
word_to_id = dict()
for id, word in enumerate(word_set):
    word_to_id[word] = id

input_ids = []
for words in list_of_words:
    ids = [word_to_id[word] for word in words]
    input_ids.append(ids)

In [14]:
# we need to make sure each input_ids line has the same length
# to save into .npy
# let the padding token be EOS token
eos_id = word_to_id['<|endoftext|>']
eos_id

248

In [15]:
import numpy as np

# Determine the maximum length of the stories
max_length = max(len(story_ids) for story_ids in input_ids)

print(f"maxlength: {max_length}")

maxlength: 438


In [16]:

# Pad each sequence with the custom token
padded_sequences = np.array([story_ids + [eos_id] * (max_length - len(story_ids)) for story_ids in input_ids])

padded_sequences


array([[254, 414, 539, ..., 248, 248, 248],
       [406, 414, 397, ..., 248, 248, 248],
       [406, 414, 397, ..., 248, 248, 248],
       ...,
       [254, 414, 539, ..., 248, 248, 248],
       [254, 414, 539, ..., 248, 248, 248],
       [406, 414, 397, ..., 248, 248, 248]])

In [17]:
# Save the list as a .npy file
filename = "/n/holyscratch01/sham_lab/summer_2024/datasets/cleaned_tiny-600.npy"
np.save(filename, padded_sequences)

In [18]:
# Save the tokenizer
import pickle

with open('tokenizers/simple-600.pkl', 'wb') as file:
    pickle.dump(word_to_id, file)

In [19]:
import random

random.sample(sorted_stories, 5)

['Once upon a time, there was a brown dog. The dog liked to run and play. One day, the dog went far away from home. He did not know how to go back.\nThe dog saw a big tree. He thought, "Maybe if I go around the tree, I will find my home." He went around the tree, but he did not find his home. The dog was sad.\nThen, the dog saw a little girl. The girl knew the dog. She said, "I know where your home is!" The girl took the dog back to his home. The dog was so happy. He did a big jump and gave the girl a hug. From that day, the dog and the girl were best friends.<|endoftext|>\n',
 'Once upon a time, there was a tall tree. A little cat lived in the tree. The cat was happy and liked to play all day.\nOne day, a big man came to the tree. He wanted to cut the tree down. The cat was scared and said, "Please don\'t cut my tree!" The big man stopped and looked at the cat. He did not know the cat could talk.\nThe cat and the big man became friends. They played together and had fun. The big man de