# data-1 analysis
* testing pretokenization_example code

In [2]:
import json
import os
from typing import BinaryIO

import pandas as pd

In [3]:
split_token = "<|endoftext|>".encode("utf-8")
split_token

b'<|endoftext|>'

In [4]:
mini_chunk_size = 4096

with open('data/owt_valid.txt', 'rb') as file:
    file.seek(0, os.SEEK_END)
    file_size = file.tell()
    print(file_size)
    file.seek(0)
    
    mini_chunk = file.read(mini_chunk_size) 
    print(type(mini_chunk))

289998753
<class 'bytes'>


In [5]:
def find_chunk_boundaries(
    file: BinaryIO, 
    desired_num_chunks: int, 
    split_special_token: bytes
) -> list[int]:
    """
    Chunk the file into parts that can be counted independently.
    May return fewer chunks if the boundaries end up overlapping.
    """
    assert isinstance(split_special_token, bytes), (
        "Must represent special token as a bytestring"
    )

    # Get total file size in bytes
    file.seek(0, os.SEEK_END)
    file_size = file.tell()
    file.seek(0)

    chunk_size = file_size // desired_num_chunks

    # Initial guesses for chunk boundary locations, uniformly spaced
    # Chunks start on previous index, don't include last index
    chunk_boundaries = [i * chunk_size for i in range(desired_num_chunks + 1)]
    chunk_boundaries[-1] = file_size

    mini_chunk_size = 4096  # Read ahead by 4k bytes at a time

    for bi in range(1, len(chunk_boundaries) - 1):
        initial_position = chunk_boundaries[bi]
        file.seek(initial_position)  # Start at boundary guess
        while True:
            mini_chunk = file.read(mini_chunk_size)  # Read a mini chunk

            # If EOF, this boundary should be at the end of the file
            if mini_chunk == b"":
                chunk_boundaries[bi] = file_size
                break

            # Find the special token in the mini chunk
            found_at = mini_chunk.find(split_special_token)
            if found_at != -1:
                chunk_boundaries[bi] = initial_position + found_at
                break
            initial_position += mini_chunk_size

    # Make sure all boundaries are unique, but might be fewer than desired_num_chunks
    return sorted(set(chunk_boundaries))

# OWT Valid

In [14]:
mini_chunk_size = 4096
num_processes = 8

with open('data/owt_valid.txt', 'rb') as file:
        boundaries = find_chunk_boundaries(
                file,
                num_processes,
                "<|endoftext|>".encode("utf-8")
        )

        for start, end in zip(boundaries[:-1], boundaries[1:]):
                file.seek(start)
                chunk = file.read(end - start).decode("utf-8", errors="ignore")
                print(f"CHUNK {start} Size:", len(chunk))
boundaries

CHUNK 0 Size: 36009830
CHUNK 36335216 Size: 35852690
CHUNK 72505172 Size: 35941460
CHUNK 108752143 Size: 35900753
CHUNK 145027268 Size: 35914750
CHUNK 181256470 Size: 35917461
CHUNK 217499287 Size: 35927603
CHUNK 253752435 Size: 35889585


[0,
 36335216,
 72505172,
 108752143,
 145027268,
 181256470,
 217499287,
 253752435,
 289998753]

In [19]:
print(chr(0), repr(chr(0)))

  '\x00'


In [20]:
"Hi I am" + chr(0) + "hello"

'Hi I am\x00hello'

In [21]:
print("Hi I am" + chr(0) + "hello")

Hi I am hello


# corpus.en

In [6]:
FIXTURES_PATH="assignment1-basics/tests/fixtures"
input_path =  os.path.join(FIXTURES_PATH, "corpus.en")

In [14]:
num_processes=8
split_special_token = "<|endoftext|>".encode('utf-8')

with open(input_path, 'rb') as file:
        boundaries = find_chunk_boundaries(
                file,
                num_processes,
                "<|endoftext|>".encode("utf-8")
        )

        for start, end in zip(boundaries[:-1], boundaries[1:]):
                file.seek(start)
                chunk = file.read(end - start).decode("utf-8", errors="ignore")
                print(f"CHUNK {start} Size:", len(chunk))
                print(chunk[:100])
                print(chunk[-100:])
                
        for b_i in range(1, len(boundaries)):
                start = boundaries[b_i-1]
                # every chunk except first contains split_special_token at start
                if b_i!=1:
                        start+=len(split_special_token)
                
                end = boundaries[b_i]
                # Last Chunk contains split_special_token at the end
                if b_i==len(boundaries)-1:
                        end-=len(split_special_token)
                
                file.seek(start)
                chunk = file.read(end - start).decode("utf-8", errors="ignore")
                print(chunk[:100])
                print(chunk[-100:])

CHUNK 0 Size: 132878
iron cement is a ready for use paste which is laid as a fillet by putty knife or finger in the mould
chäftsordnung .
Frau Präsidentin , zur Geschäftsordnung .
Frau Präsidentin , zur Geschäftsordnung .

iron cement is a ready for use paste which is laid as a fillet by putty knife or finger in the mould
tin , zur Geschäftsordnung .
Frau Präsidentin , zur Geschäftsordnung .
Frau Präsidentin , zur Geschä


# tinystories

In [15]:
FIXTURES_PATH="assignment1-basics/tests/fixtures"
input_path =  os.path.join(FIXTURES_PATH, "tinystories_sample_5M.txt")

In [16]:
num_processes=8
split_special_token = "<|endoftext|>".encode('utf-8')

with open(input_path, 'rb') as file:
        boundaries = find_chunk_boundaries(
                file,
                num_processes,
                "<|endoftext|>".encode("utf-8")
        )

        for start, end in zip(boundaries[:-1], boundaries[1:]):
                file.seek(start)
                chunk = file.read(end - start).decode("utf-8", errors="ignore")
                print(f"CHUNK {start} Size:", len(chunk))
                print(chunk[:100])
                print(chunk[-100:])
                
        for b_i in range(1, len(boundaries)):
                start = boundaries[b_i-1]
                # every chunk except first contains split_special_token at start
                if b_i!=1:
                        start+=len(split_special_token)
                
                end = boundaries[b_i]
                # Last Chunk contains split_special_token at the end
                if b_i==len(boundaries)-1:
                        end-=len(split_special_token)
                
                file.seek(start)
                chunk = file.read(end - start).decode("utf-8", errors="ignore")
                print(chunk[:100])
                print(chunk[-100:])

CHUNK 0 Size: 656379
u don't have to be scared of the loud dog, I'll protect you". The mole felt so safe with the little 
 kiss. They played with their ball in the backyard. They were not worried anymore. They were happy.

CHUNK 656657 Size: 654068
<|endoftext|>
One day, a sad cat named Tom was walking in the rain. The rain made a big flood near h
lived happily in the forest, always remembering to follow the signs and protect the important root.

CHUNK 1310951 Size: 655158
<|endoftext|>
Once upon a time, there was a pink cat named Kitty. Kitty loved to nap in the warm sun
nough spraying, let’s go home now.”
So, they both went back home with a happy smile on their faces.

CHUNK 1966391 Size: 655280
<|endoftext|>
Once upon a time, in a small town, there was a store. The store was incredible. It had
e big bird helped Tim get down safely. Tim learned to listen to his friend Sam and be more careful.

CHUNK 2621933 Size: 654814
<|endoftext|>
Once upon a time, there was a little boy named Tim