# Load books

In [2]:
import glob
import os
 
# Directory holding the source books as plain-text files. Only ``*.txt`` files
# located directly inside this directory are loaded; subdirectories are not
# searched.
books_directory = '/home/ubuntu/Downloads/twiker-experiments/process_analysis/CRIMEandPUNISHMENT/books'
books = []

# glob returns matches in arbitrary, filesystem-dependent order. Sort them so
# that ``books[i]`` refers to the same file on every run and on every machine.
book_files = sorted(glob.glob(os.path.join(books_directory, '*.txt')))

# Read each book fully into memory and report its name and length in characters.
for book_file in book_files:
    with open(book_file, 'r', encoding='UTF-8') as fs:
        books.append(fs.read())
    print(os.path.basename(book_file), len(books[-1]))
print()

CRIME AND PUNISHMENT.txt 1129664



In [3]:
import re
def clean_text(text):
    """Return ``text`` with ``<...>`` tags removed and whitespace normalised.

    The steps, in order:

    1. Newlines and carriage returns are turned into spaces.
    2. Every ``<...>`` span (``<``, one or more characters other than ``>``,
       then ``>``) is deleted. Note that this also removes ordinary text that
       happens to sit between a ``<`` and a later ``>``.
    3. Each run of whitespace is collapsed to a single space and the result
       is stripped of leading and trailing whitespace.

    Letters, digits and punctuation outside such spans are left untouched.
    """
    single_line = text.replace('\n', ' ').replace('\r', ' ')
    tag_free = re.sub('<[^>]+>', '', single_line)
    collapsed = re.sub(r'\s+', ' ', tag_free)
    return collapsed.strip()

# Clean every loaded book, keeping the same order as `books`.
cleaned_books = list(map(clean_text, books))

In [4]:
# Some long paragraphs would be truncated during training.
# To avoid losing data, we split them into shorter chunks.

import nltk
from transformers import GPT2Tokenizer

# Download the NLTK 'punkt' data package (NLTK only prints a status message
# when the package is already up to date). Presumably this is the data behind
# the sent_tokenize call made in split_text_at_sentences.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead --
# confirm against the installed NLTK version.
nltk.download('punkt')

# Load the pretrained 'gpt2' tokenizer into the module-level name `tokenizer`,
# which split_text_at_sentences reads to count the tokens of each sentence.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

def split_text_at_sentences(text, max_tokens=200):
    """Pack the sentences of ``text`` into chunks of about ``max_tokens`` tokens.

    The text is split into sentences with ``nltk.tokenize.sent_tokenize`` and
    each sentence is measured with the module-level ``tokenizer`` (special
    tokens excluded). Sentences are then packed greedily, in order: a new
    chunk is started whenever adding the next sentence would push the running
    token count of the current chunk above ``max_tokens``.

    Args:
        text: The text to split.
        max_tokens: Token budget per chunk, computed as the sum of the
            per-sentence token counts.

    Returns:
        A list of strings, one per chunk, each made of whole sentences joined
        by single spaces. The list is empty when no sentences are found.

    Notes:
        Sentences are never split. A single sentence that is longer than
        ``max_tokens`` therefore becomes a chunk of its own that exceeds the
        budget.
    """
    sentences = nltk.tokenize.sent_tokenize(text)

    chunks = []
    current_chunk = []
    # Only the running count is needed, not the token ids themselves.
    current_token_count = 0

    for sentence in sentences:
        sentence_token_count = len(
            tokenizer.encode(sentence, add_special_tokens=False)
        )

        # Flush only when there is something to flush. Without the
        # `current_chunk` guard, a first sentence that alone exceeds the
        # budget would emit an empty-string chunk.
        if current_chunk and current_token_count + sentence_token_count > max_tokens:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
            current_token_count = 0

        current_chunk.append(sentence)
        current_token_count += sentence_token_count

    # Emit whatever is left after the last sentence.
    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# Re-chunk every cleaned book. Each entry of `chucked_books` is one book as a
# single string with one chunk per line, every line newline-terminated
# (including the last one).
chucked_books = []
for book in cleaned_books:
    chunks = split_text_at_sentences(book.replace("\n", " "))
    # Build the string in one pass instead of repeated `+=` concatenation.
    chucked_books.append(''.join(chunk + "\n" for chunk in chunks))

# Merge to train and eval

In [6]:
# `chucked_books` holds one string per book, with one chunk ("paragraph") per
# newline-terminated line. Only the first book is used here.
book_content = chucked_books[0]

# One paragraph per line. Empty strings are dropped so that the trailing
# newline of `book_content` is not counted as an extra, empty paragraph.
paragraphs = [paragraph for paragraph in book_content.split('\n') if paragraph]

# Fraction of the paragraphs, taken from the start of the book, that goes to
# the training set; the remainder goes to the evaluation set.
# NOTE(review): this yields a 60/40 split -- confirm that is the intended ratio.
TRAIN_FRACTION = 0.6
split_index = int(TRAIN_FRACTION * len(paragraphs))

# Slicing the paragraph list keeps every paragraph whole in exactly one set.
train_paragraphs = paragraphs[:split_index]
eval_paragraphs = paragraphs[split_index:]

# Join the paragraphs back into one newline-separated string per set.
train = '\n'.join(train_paragraphs)
# NOTE: this name shadows the built-in eval() for the rest of the kernel
# session; it is kept unchanged in case other cells read it.
eval = '\n'.join(eval_paragraphs)

# Make sure the output directory exists before writing into it.
os.makedirs('data_files', exist_ok=True)

# Write the train, eval and full content to their files. The encoding is
# explicit so the output matches the UTF-8 input regardless of the platform's
# default encoding.
with open('data_files/train.txt', 'w', encoding='UTF-8') as fs:
    fs.write(train)

with open('data_files/eval.txt', 'w', encoding='UTF-8') as fs:
    fs.write(eval)

with open('data_files/full.txt', 'w', encoding='UTF-8') as fs:
    fs.write(book_content)