This notebook processes the TinyStories dataset so that it only has lower case alphabets, space, commas and periods.

Input:
`/n/netscratch/sham_lab/Everyone/jchooi/in-context-language-learning/data/TinyStoriesV2-GPT4-train.txt`

Output:
`/n/netscratch/sham_lab/Everyone/jchooi/in-context-language-learning/data/TinyStories-processed.txt`

In [2]:
from collections import Counter
import pandas as pd
from tqdm import tqdm
import os

In [None]:
input_file = '/n/netscratch/sham_lab/Everyone/jchooi/in-context-language-learning/data/TinyStoriesV2-GPT4-train.txt'

# Initialize a counter
char_counter = Counter()

# Get the file size
file_size = os.path.getsize(input_file)

# Read the file in chunks to handle large size with a progress bar
with open(input_file, 'r', encoding='utf-8') as file:
    for chunk in tqdm(iter(lambda: file.read(1024 * 1024), ''), total=file_size // (1024 * 1024), unit='MB'):
        char_counter.update(chunk)

# Convert the counter to a DataFrame for better visualization
char_count_df = pd.DataFrame(char_counter.items(), columns=['Character', 'Count'])
print(char_count_df)

100%|██████████| 2124/2124 [01:12<00:00, 29.47MB/s]

    Character      Count
0          \n   15600056
1           O    4637239
2           n  102941792
3           c   30276870
4           e  209712524
..        ...        ...
223         œ          5
224         ê          2
225                    2
226         «          5
227         »          5

[228 rows x 2 columns]





In [65]:
char_count_df = char_count_df.sort_values(by="Count",ascending=False)

In [66]:
char_count_df.to_csv("temp/char_count_df.csv", index=False)

In [67]:
# symbols related to eos
# |,5435403
# <,2717731
# >,2717728

In [68]:
# Define the output file path
output_file_1 = '/n/netscratch/sham_lab/Everyone/jchooi/in-context-language-learning/data/TinyStories-processed-temp-1.txt'

eos_token_sentence = "<|endoftext|>\n"

with open(input_file, 'r', encoding='utf-8') as infile, \
     open(output_file_1, 'w', encoding='utf-8') as outfile, \
     tqdm(total=file_size, unit='B', unit_scale=True, unit_divisor=1024, desc='Processing') as pbar:
    for line in infile:
        if line == eos_token_sentence:
            continue
        outfile.write(line)
        pbar.update(len(line.encode('utf-8')))

Processing:  98%|█████████▊| 2.04G/2.07G [00:08<00:00, 246MB/s]


In [69]:
# Initialize a counter
char_counter2 = Counter()

output_file_size = os.path.getsize(output_file_1)

# Read the file in chunks to handle large size with a progress bar
with open(output_file_1, 'r', encoding='utf-8') as file:
    for chunk in tqdm(iter(lambda: file.read(1024 * 1024), ''), total=output_file_size // (1024 * 1024), unit='MB'):
        char_counter2.update(chunk)

# Convert the counter to a DataFrame for better visualization
char_count2_df = pd.DataFrame(char_counter2.items(), columns=['Character', 'Count'])
char_count2_df = char_count2_df.sort_values(by="Count",ascending=False)

100%|██████████| 2088/2088 [01:11<00:00, 29.13MB/s]


In [70]:
char_count2_df.to_csv("temp/char_count_df-2.csv", index=False)

In [71]:
# Function to replace fancy quotes with normal quotes
def replace_fancy_quotes(text):
    text = text.replace('“', '"')
    text = text.replace('”', '"')
    text = text.replace('’', '\'')
    return text

In [4]:
import re

# Define a function to check if a sentence contains only alphabets or punctuation
def is_valid_line(line):
    return re.match(r'^[a-zA-Z .,]*\n$', line) is not None

output_file_2 = '/n/netscratch/sham_lab/Everyone/jchooi/in-context-language-learning/data/TinyStories-processed-temp-2.txt'


In [None]:
file_size = os.path.getsize(output_file_1)

char_counter3 = Counter()

# Read the processed file and filter sentences
with open(output_file_1, 'r', encoding='utf-8') as infile, \
     open(output_file_2, 'w', encoding='utf-8') as outfile, \
        tqdm(total=file_size, unit='B', unit_scale=True, unit_divisor=1024, desc='Processing') as pbar:
    for line in infile:
        line = replace_fancy_quotes(line)
        if is_valid_line(line):
            outfile.write(line)
            char_counter3.update(line)
        pbar.update(len(line.encode('utf-8')))

Processing: 100%|█████████▉| 2.04G/2.04G [00:55<00:00, 39.5MB/s]


In [73]:
char_count3_df = pd.DataFrame(char_counter3.items(), columns=['Character', 'Count'])
char_count3_df = char_count3_df.sort_values(by="Count",ascending=False)
char_count3_df.to_csv("temp/char_count_df-3.csv", index=False)

In [5]:
# now we remove all newlines
# and treat each sentence as its own
# so each sentense is on its own line
# and make it lowercase
# make commas and periods as their own word
# so we space them out

output_file_3 = '/n/netscratch/sham_lab/Everyone/jchooi/in-context-language-learning/data/TinyStories-processed.txt'

file_size = os.path.getsize(output_file_2)

char_counter3 = Counter()

def space_out_commas(sentence):
    sentence = ' , '.join([chunk.strip() for chunk in sentence.split(",")])
    return sentence

# Read the processed file and filter sentences
with open(output_file_2, 'r', encoding='utf-8') as infile, \
     open(output_file_3, 'w', encoding='utf-8') as outfile, \
        tqdm(total=file_size, unit='B', unit_scale=True, unit_divisor=1024, desc='Processing') as pbar:
    for line in infile:
        line = line.lower()
        lines = ''.join([space_out_commas(sentence.strip()) + " .\n" for sentence in line.split(".") if sentence.strip() != ''])
        outfile.write(lines)
        pbar.update(len(line.encode('utf-8')))

Processing: 100%|██████████| 967M/967M [00:19<00:00, 52.0MB/s] 
