In [1]:
import os
import re
import torch
from torch.utils.data import Dataset, DataLoader, random_split

from data_loader_v1 import create_dataloader_v1
from clean_gutenberg_text import clean_gutenberg_text

In [2]:
def concatinate_text_data(directory="original_texts/"):
    """Clean every ``.txt`` file in ``directory`` and join them into one string.

    Each matching file's path is passed to ``clean_gutenberg_text`` and the
    returned text is followed by an ``<|endoftext|>`` marker line. Files are
    visited in sorted filename order so repeated runs produce the same corpus.

    Args:
        directory: Folder to scan (non-recursively) for ``.txt`` files.

    Returns:
        The concatenated text, or an empty string when the folder holds no
        ``.txt`` files.

    Raises:
        OSError: Propagated from ``os.listdir`` when ``directory`` cannot be
            listed.
    """
    separator = "\n<|endoftext|>\n"

    # os.listdir() gives no ordering guarantee; sort so the corpus (and any
    # positional split made on it afterwards) is reproducible.
    txt_names = sorted(
        name for name in os.listdir(directory) if name.endswith(".txt")
    )

    # Build once with join instead of growing a string inside the loop.
    all_text = "".join(
        clean_gutenberg_text(os.path.join(directory, name)) + separator
        for name in txt_names
    )

    # Replace multiple consecutive <|endoftext|> with a single one
    # (e.g. when a cleaned file turns out empty).
    all_text = re.sub(r"(?:<\|endoftext\|>\n*)+", "<|endoftext|>\n", all_text)

    return all_text

## Training & validation datasets

In [3]:
# Build the corpus used for the train/validation split from the function's
# default directory ("original_texts/").
text_data = concatinate_text_data()

In [4]:
# Train/validation ratio: the leading 90% of the characters become the
# training text, the remainder the validation text.
# NOTE: the cut is a plain character offset, so it can fall mid-word or
# mid-document.
train_ratio = 0.90
split_idx = int(train_ratio * len(text_data))
train_data = text_data[:split_idx]
val_data = text_data[split_idx:]

# Create the output folder if needed; open(..., 'w') does not create missing
# parent directories and would raise FileNotFoundError otherwise.
os.makedirs('./dataset', exist_ok=True)

# newline='' turns off newline translation, so '\n' is written unchanged on
# every platform.
with open('./dataset/train_text_data.txt', 'w', encoding='utf-8', newline='') as f:
    f.write(train_data)

with open('./dataset/val_text_data.txt', 'w', encoding='utf-8', newline='') as f:
    f.write(val_data)

In [5]:
import tiktoken

# Tokenizer for tiktoken's "gpt2" encoding.
tokenizer = tiktoken.get_encoding("gpt2")

# Encode the whole corpus; the end-of-text marker is explicitly listed as an
# allowed special token.
tokens = tokenizer.encode(text_data, allowed_special={'<|endoftext|>'})

# Size of the corpus measured four ways.
char_count = len(text_data)
word_count = len(text_data.split())  # whitespace-separated chunks
token_count = len(tokens)
unique_token_count = len(set(tokens))  # distinct token ids that occur

print(
    f"Words: {word_count}",
    f"Characters: {char_count}",
    f"Tokens: {token_count}",
    f"Unique Tokens Used: {unique_token_count}",
    sep="\n",
)

Words: 5789730
Characters: 32372094
Tokens: 7608098
Unique Tokens Used: 28960


## Evaluation dataset

In [6]:
# Build the evaluation text from the .txt files in the evaluation_set
# subdirectory.
eval_data = concatinate_text_data(directory="original_texts/evaluation_set/")

In [7]:
# Create the output folder if needed; open(..., 'w') does not create missing
# parent directories and would raise FileNotFoundError otherwise.
os.makedirs('./dataset', exist_ok=True)

# Write the evaluation text; newline='' turns off newline translation so '\n'
# is written unchanged on every platform.
with open('./dataset/eval_text_data.txt', 'w', encoding='utf-8', newline='') as f:
    f.write(eval_data)