# Load books

In [90]:
import glob
import os
 
# Specify the top-level directory containing all subdirectories
books_directory = '/home/ubuntu/Downloads/twiker-experiments/process_analysis/poem/books'
 
# Initialize an empty list to store the content of the books
books = []
 
# Use glob to recursively find all .txt files in the directory and its subdirectories
book_files = glob.glob(os.path.join(books_directory, '**', '*.txt'), recursive=True)
 
# Iterate over each file found
for book_file in book_files:
    with open(book_file, 'r') as fs:
        books.append(fs.read())
    print(f'{os.path.basename(book_file)}', len(books[-1]))
 
#print()
#print(books[0][0:1000])

SnakePoemsEyesOfTheSnakePoembyRandyMcClave.txt 789
SnakePoemsASnakeInCagePoembyGajananMishra.txt 382
SnakePoemsWhatTheSnakeSawPoembyJamesStephens.txt 596
SnakePoemsTheSnakeGourdBlossomsPoembyMasaokaShiki.txt 79
SnakePoemsTheDoubleHeadedSnakeOfNewburyPoembyJohnGreenleafWhittier.txt 3785
SnakePoemsTheSnakePoembyTaMaRaHaNaRiNgPaLeSTiNeFreedOmFlottila.txt 2348
SnakePoemsSnakeCharmerPoembyGangadharannairPulingat.txt 1798
SnakePoemsSnakeSongPoembyOtteriSelvakumar.txt 78
SnakePoemsLeavingTheCampusThinkOfITakingToABeenAndOfBeingASnakeCharmerPoembyBijayKantDubey.txt 1185
SnakePoemsRainbowSnakePoembyRajArumugam.txt 1811
SnakePoemsTheEagleAndTheSnakePoembyEdwardKofiLouis.txt 83
SnakePoemsCobraSnakePoembyHasmukhAmathalal.txt 683
SnakePoemsSnakeDancePoembyKostasLagos.txt 48
SnakePoemsTheSnakeCharmerPoembySarojiniNaidu.txt 618
SnakePoemsATrystWithAnIndianSnakeCharmerTheCobrasDancingPoembyBijayKantDubey.txt 1173
SnakePoemsSnakeOfTimePoembyGajananMishra.txt 420
SnakePoemsBlackRedBandedSnakePoisonPerfu

In [91]:
import re
 
def book_is_valid(text):
    # Check if the text contains only allowed characters
    # Updated: Added apostrophe (') to the allowed characters
    return bool(re.match(r'^[a-zA-Z0-9.,;?"\s\']*$', text))
 
def capitalize_and_add_period(text):
    # Remove unwanted characters and patterns, except for apostrophes
    text = re.sub(r'[<>@]', ' ', text)  # Remove specific unwanted characters
    # Updated: Replace multiple periods with a single period, but preserve apostrophes
    text = re.sub(r'\.{3,}', '.', text)
    # Updated: Remove excessive spaces, ensuring apostrophes are preserved in contractions
    text = ' '.join(text.split())
    # Ensure the text ends with a single period
    if not text.endswith('.'):
        text = text.strip() + '.'
    # Updated: Reduce repeated characters to two, but preserve apostrophes
    text = re.sub(r'(.)\1{2,}', r'\1\1', text)
    # Replace double newlines with single newline
    text = text.replace('\n\n', '\n')
    # Split the text into lines
    lines = text.splitlines()
    # Process each line
    processed_lines = []
    for line in lines:
        # Remove all leading whitespace (ensures left alignment)
        line = line.lstrip()
        if line:  # Only process non-empty lines after stripping
            # Split line into words
            words = line.split()
            if words:
                # Updated: Capitalize the first word and lowercase the rest, preserving contractions
                line = words[0].capitalize() + ' ' + ' '.join(word.lower() for word in words[1:])
                # Check if the line ends with a punctuation mark
                if not re.search(r"[.,:;!?']$", line.strip()):
                    line = line.strip() + '.'  # Add a period if no punctuation at the end
            processed_lines.append(line)
        else:
            processed_lines.append(line)  # Preserve empty lines
    # Join the lines back into a single string
    return '\n'.join(processed_lines)
 
# Filter books: remove any book that contains invalid characters
filtered_books = []
for book in books:
    if book_is_valid(book):
        processed_book = capitalize_and_add_period(book)
        filtered_books.append(processed_book)

In [92]:
print(len(filtered_books))

6109


In [93]:
def combine_books(book):
    combined_books = []
    i = 0
    while i < len(book):
        current_book = book[i]
        word_count = len(current_book.split())
        # If the current book has less than 400 words, combine it with the next book
        if word_count < 40:
            if i < len(book) - 1:  # If there's a next book, combine with it
                current_book += ' ' + book[i + 1]
                i += 1  # Skip the next book since it's combined with the current one
            elif combined_books:  # If it's the last book and there's a previous combined book, combine with it
                combined_books[-1] += ' ' + current_book
            else:  # If it's the only book left, just add it as is
                combined_books.append(current_book)
        else:
            combined_books.append(current_book)
        i += 1
    return combined_books
combined_books = combine_books(filtered_books)

In [94]:
print(len(combined_books))

3889


In [95]:
# Some long paragraphs will be truncated during training. 
# To avoid lossing data, we split them into shorter ones.

import nltk
from transformers import GPT2Tokenizer

# Download the necessary NLTK data
nltk.download('punkt')

# Initialize the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

def split_text_at_sentences(text, max_tokens=400):
    # Tokenize the text into sentences
    sentences = nltk.tokenize.sent_tokenize(text)
    
    chunks = []
    current_chunk = []
    current_tokens = []
    
    
    for sentence in sentences:
        # Tokenize the sentence
        sentence_tokens = tokenizer.encode(sentence, add_special_tokens=False)          
        
        # Check if adding this sentence would exceed the max token limit
        if len(current_tokens) + len(sentence_tokens) > max_tokens:
            # If so, finalize the current chunk and start a new one
            chunks.append(' '.join(current_chunk))
            current_chunk = []
            current_tokens = []
        
        # Add the sentence to the current chunk
        current_chunk.append(sentence)
        current_tokens.extend(sentence_tokens)
    
    # Add the last chunk if there's any content left
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    
    return chunks

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [96]:
chucked_books = []
for i, book in enumerate(combined_books):
    chunked_book = ''
    #book  = re.sub(r'\.{3,}\s*', '.', book)
    #book = re.sub(r'(.)\1{2,}', r'\1\1', book)
    #book = book.replace("\n\n\n", "\n\n").replace("\n\n", "\n").replace("\n", " ")
    chunks = split_text_at_sentences(book.replace("\n", " "))
    for chunk in chunks:
        chunked_book += chunk + "\n"
    chucked_books.append(chunked_book)

# Merge to train and eval

In [97]:
num_train_books = 900
train = ''
eval =''
for i in range(num_train_books):
    train += chucked_books[i]
for j in range(len(chucked_books)-num_train_books):    
    eval += chucked_books[num_train_books+j]


with open(f'data_files/train.txt', 'w') as fs:
    fs.write(train)

with open(f'data_files/eval.txt', 'w') as fs:
    fs.write(eval)

with open(f'data_files/full.txt', 'w') as fs:
    fs.write(train + eval)

In [98]:
def count_lines_in_file(filename):
    with open(filename, 'r') as file:
        line_count = sum(1 for line in file)
    return line_count
 
# Example usage:
filename = 'your_text_file.txt'
train_number_of_lines = count_lines_in_file('data_files/train.txt')
full_number_of_lines = count_lines_in_file('data_files/full.txt')
eval_number_of_lines = count_lines_in_file('data_files/eval.txt')
print(f'The train file has {train_number_of_lines} lines.')
print(f'The full file has {full_number_of_lines} lines.')
print(f'The eval file has {eval_number_of_lines} lines.')

The train file has 911 lines.
The full file has 3946 lines.
The eval file has 3035 lines.
