In [1]:
import os
import re
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from data_loader_v1 import create_dataloader_v1

In [2]:
def clean_gutenberg_text(file_path):
    """Return the body of a Project Gutenberg book file without boilerplate.

    Processing steps:
      1. Keep only the lines strictly between the
         "*** START OF THE/THIS PROJECT GUTENBERG EBOOK" and
         "*** END OF THE/THIS PROJECT GUTENBERG EBOOK" marker lines. A missing
         marker leaves that side of the file untrimmed. If a marker occurs
         more than once, its last occurrence is used.
      2. If a "CHAPTER I" heading preceded by a blank line exists
         (case-insensitive), drop everything before its first occurrence.
      3. Collapse every run of blank / whitespace-only lines into one blank
         line and strip leading and trailing whitespace.

    Args:
        file_path: Path to a UTF-8 encoded text file.

    Returns:
        The cleaned text as a single string.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Accept both the "THE" wording and the "THIS" wording used by older
    # Project Gutenberg releases.
    start_marker = re.compile(r"\*\*\* START OF (?:THE|THIS) PROJECT GUTENBERG EBOOK")
    end_marker = re.compile(r"\*\*\* END OF (?:THE|THIS) PROJECT GUTENBERG EBOOK")

    # Locate the markers; None means "marker not present".
    start_idx, end_idx = None, None
    for i, line in enumerate(lines):
        if start_marker.search(line):
            start_idx = i + 1  # Skip the start line itself
        if end_marker.search(line):
            end_idx = i  # Stop before the end line

    # A None bound leaves that side of the slice open, so this one slice covers
    # all four cases (both markers, only start, only end, neither). In
    # particular the footer is dropped even when the start marker is missing.
    lines = lines[start_idx:end_idx]

    # Convert list back to text for easier regex processing
    text = "".join(lines)

    # First "CHAPTER I" that directly follows a blank line; \b keeps
    # "CHAPTER II", "CHAPTER IV", ... from matching.
    match = re.search(r"\n\s*\n(CHAPTER\s+I\b)", text, re.IGNORECASE)
    if match:
        # The kept slice starts at the blank line before the heading; the
        # final strip() removes that leading whitespace.
        text = text[match.start():]

    # Remove excessive newlines
    text = re.sub(r"\n\s*\n", "\n\n", text)

    return text.strip()

In [3]:
def concatinate_text_data(directory="original_texts/"):
    """Clean every ``.txt`` file in ``directory`` and join them into one string.

    Each file is passed through ``clean_gutenberg_text`` and followed by the
    separator ``"<|endoftext|>\\n"``. Files are processed in sorted filename
    order.

    Args:
        directory: Folder containing the book text files.

    Returns:
        The concatenated corpus; an empty string if the folder has no
        ``.txt`` files.
    """
    # os.listdir() returns entries in arbitrary order; sorting makes the corpus
    # (and any positional split of it) reproducible across runs and machines.
    book_names = sorted(
        name for name in os.listdir(directory) if name.endswith(".txt")
    )

    # Join once instead of growing a string with repeated "+=".
    return "".join(
        clean_gutenberg_text(os.path.join(directory, name)) + "<|endoftext|>\n"
        for name in book_names
    )

In [4]:
# Build the corpus from the default book folder (the function's default
# argument, spelled out here so the input location is visible in the cell).
text_data = concatinate_text_data(directory="original_texts/")

In [5]:
# Train/validation split: the first 90% of the corpus (by character count)
# becomes the training text, the remainder the validation text.
train_ratio = 0.90
split_idx = int(train_ratio * len(text_data))
train_data = text_data[:split_idx]
val_data = text_data[split_idx:]

# Write with an explicit UTF-8 encoding. The books are read as UTF-8, and the
# platform default encoding used by open() otherwise may be unable to encode
# some of their characters (or would produce files in a different encoding).
with open('train_text_data.txt', 'w', encoding='utf-8') as f:
    f.write(train_data)
with open('val_text_data.txt', 'w', encoding='utf-8') as f:
    f.write(val_data)