In [6]:
# Import all required libraries for the program to run (nltk, random, pandas, requests and re)
import nltk
import random
import pandas as pd
import requests
import re

# First run only: uncomment the two lines below to download the 'punkt' and 'punkt_tab' resources
# nltk.download('punkt')
# nltk.download('punkt_tab')

def read_file(url, timeout=30):
    """Download a text file (e.g. a Gutenberg book) and return its contents.

    Args:
        url: Address of the text file to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests.get can block indefinitely on a stalled
            connection.

    Returns:
        The decoded body of the response as a string.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within `timeout`.
    """
    response = requests.get(url, timeout=timeout)  # Retrieves the specified book (or text) from the web
    response.raise_for_status()  # Fail loudly instead of returning an error page as "book text"
    return response.text

def clean_book(text):
    """Strip the Project Gutenberg header/footer and all punctuation from a book.

    Everything up to and including the "*** START OF ... PROJECT GUTENBERG
    EBOOK ... ***" line is dropped, as is everything from the matching
    "*** END OF ..." line onwards. If a marker is missing, that side of the
    text is left untouched. Finally every character that is not an ASCII
    letter, a digit or whitespace is removed.

    Args:
        text: Raw text of the book.

    Returns:
        The cleaned text.
    """
    # Gutenberg files appear with either "THIS" or "THE" in the marker line,
    # so both wordings are accepted.
    start_pattern = re.compile(
        r"\*\*\* ?START OF (?:THIS|THE) PROJECT GUTENBERG EBOOK.*\*\*\*", re.IGNORECASE)
    end_pattern = re.compile(
        r"\*\*\* ?END OF (?:THIS|THE) PROJECT GUTENBERG EBOOK.*\*\*\*", re.IGNORECASE)

    start_data = start_pattern.search(text)
    body_start = start_data.end() if start_data else 0

    # Look for the end marker only after the start marker, and compute both
    # offsets against the SAME string before slicing. Slicing the header off
    # first would shift the text and make the end offset point too far.
    end_data = end_pattern.search(text, body_start)
    body_end = end_data.start() if end_data else len(text)

    text = text[body_start:body_end]

    # Remove all non-alphanumeric characters (like punctuation) and substitute them with ''
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text
        
def create_partitions(texts, num_partitions, partition_size=100):
    """Sample random fixed-size word windows from each book and label them.

    Each text is cleaned with clean_book, tokenized into words with
    nltk.word_tokenize, and then `num_partitions` windows of
    `partition_size` consecutive words are drawn at random start positions
    (windows may overlap or repeat).

    Args:
        texts: Iterable of raw book texts. Books are labeled 'a', 'b', 'c', ...
            in order; because labels come from the 26 lowercase letters, any
            text beyond the 26th is ignored.
        num_partitions: Number of windows to draw from each book.
        partition_size: Number of words in each window (default 100).

    Returns:
        A list of (label, words) tuples, where `words` is a list of tokens.

    Raises:
        ValueError: If a book has fewer than `partition_size` words, since no
            full-size window can be drawn from it.
    """
    labeled_partitions = []

    for label, text in zip('abcdefghijklmnopqrstuvwxyz', texts):  # One letter per book so each label is unique
        words = nltk.word_tokenize(clean_book(text))  # Tokenize the cleaned text into words

        # Largest start index that still leaves room for a full window
        last_start = len(words) - partition_size
        if last_start < 0 and num_partitions > 0:
            raise ValueError(
                f"Text '{label}' has only {len(words)} words; "
                f"at least {partition_size} are needed to draw a partition."
            )

        for _ in range(num_partitions):
            start_index = random.randint(0, last_start)  # randint is inclusive on both ends
            labeled_partitions.append((label, words[start_index:start_index + partition_size]))

    return labeled_partitions

if __name__ == "__main__":
    # URLs for the 3 Gutenberg digital books in text format. These can be changed as needed, so the program works for any Gutenberg digital book
    urls = ['https://www.gutenberg.org/files/84/84-0.txt', 'https://www.gutenberg.org/files/1342/1342-0.txt', 'https://www.gutenberg.org/files/11/11-0.txt']
    texts = [read_file(url) for url in urls]  # Downloads each .txt url and stores the raw text in 'texts'

    partitions = create_partitions(texts, 200)  # 200 random 100-word partitions per book

    # Creates a dictionary holding the labels and the corresponding text partitions (words re-joined with spaces)
    all_data = {
        'Label': [label for label, _ in partitions],
        'Text': [' '.join(partition) for _, partition in partitions],
    }

    labeled_partitions_df = pd.DataFrame(all_data)  # Converts the dictionary into a dataframe using Pandas and displays aspects of the dataframe below
    print("Shape - Rows vs Columns: ", labeled_partitions_df.shape)
    print(labeled_partitions_df)
    print("Sample: ", labeled_partitions_df.sample(5))  # Prints a random sample of the dataframe

    # Saves the labels and corresponding partitions to a CSV file.
    # This stays inside the __main__ guard because the dataframe only exists
    # here; at module level it would raise NameError when the file is imported.
    labeled_partitions_df.to_csv("gutenberg_partitions.csv", index=False)
    print("Data preparation complete. Output saved to 'gutenberg_partitions.csv'.")

Shape - Rows vs Columns:  (600, 2)
    Label                                               Text
0       a  refined the groundwork of my character that I ...
1       a  the consolation of your father Elizabeth my lo...
2       a  and gratitude assisted the development of fili...
3       a  a prophetic feeling I felt my heart sink withi...
4       a  were now lighted up with indignation now subdu...
..    ...                                                ...
595     c  with sobs to sing this Beautiful Soup so rich ...
596     c  the Mock Turtle yet No said Alice I dont even ...
597     c  heavy sobbing of the Mock Turtle Alice was ver...
598     c  noticed had powdered hair that curled all over...
599     c  on the table Nothing can be clearer than that ...

[600 rows x 2 columns]
Sample:      Label                                               Text
395     b  life saw anyone so much altered as she is sinc...
15      a  with a palpable enemy one by one the various k...
324     b  hersel