<a href="https://colab.research.google.com/github/ignancyamichelleg/Data-Science-Applications/blob/main/TextDataPartitioning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import re                  # Regular expressions for text cleaning
import random              # For random sampling
import requests            # For downloading books from the internet
import pandas as pd        # For organizing data in tables
import pickle              # For saving Python objects
from pathlib import Path


In [10]:
def download_book(book_id, book_label):
    """
    Download a book from Project Gutenberg.

    Parameters:
    - book_id: The ID number of the book on Project Gutenberg
    - book_label: A single letter label ('a', 'b', or 'c')

    Returns:
    - The text of the book as a string, or None if download fails
    """

    print(f"\n--- Downloading Book {book_label.upper()} (ID: {book_id}) ---")

    # Try the first URL format
    url = f"https://www.gutenberg.org/files/{book_id}/{book_id}-0.txt"

    try:
        response = requests.get(url, timeout=30)

        # If first URL doesn't work, try second format
        if response.status_code != 200:
            url = f"https://www.gutenberg.org/files/{book_id}/{book_id}.txt"
            response = requests.get(url, timeout=30)

        # Check if download was successful
        if response.status_code == 200:
            print(f"Successfully downloaded! Size: {len(response.text)} characters")
            return response.text
        else:
            print(f"Failed to download (Status code: {response.status_code})")
            return None

    except Exception as e:
        print(f"Error during download: {e}")
        return None

In [11]:
def clean_text(text):
    """
    Clean the book text by removing headers, footers, and extra spaces.

    This uses Regular Expressions to:
    - Remove Project Gutenberg's legal header
    - Remove Project Gutenberg's legal footer
    - Remove extra whitespace

    Parameters:
    - text: The raw text from the book

    Returns:
    - Cleaned text ready for processing
    """

    print("  Cleaning text...")

    # REGEX 1: Remove everything before "*** START OF"
    # This removes the legal header that Project Gutenberg adds
    start_pattern = r'\*\*\*.*?START OF.*?\*\*\*'
    parts = re.split(start_pattern, text, flags=re.IGNORECASE)
    if len(parts) > 1:
        text = parts[-1]  # Keep everything after the start marker

    # REGEX 2: Remove everything after "*** END OF"
    # This removes the legal footer that Project Gutenberg adds
    end_pattern = r'\*\*\*.*?END OF.*?\*\*\*'
    parts = re.split(end_pattern, text, flags=re.IGNORECASE)
    text = parts[0]  # Keep everything before the end marker

    # REGEX 3: Replace multiple spaces, tabs, and newlines with single space
    text = re.sub(r'\s+', ' ', text)

    # REGEX 4: Remove any asterisks
    text = re.sub(r'\*+', '', text)

    # Remove leading and trailing whitespace
    text = text.strip()

    print(f"Text cleaned! Length: {len(text)} characters")
    return text


# ============================================================================
# STEP 3: EXTRACT WORDS USING REGULAR EXPRESSIONS
# ============================================================================

def extract_words(text):
    """
    Extract individual words from the text using Regular Expressions.

    A word is defined as letters and apostrophes (for contractions like "don't").

    Parameters:
    - text: Cleaned text

    Returns:
    - List of words
    """

    print("  Extracting words...")

    # REGEX 5: Match words (letters and apostrophes for contractions)
    # \b = word boundary
    # [A-Za-z]+ = one or more letters
    # (?:'[A-Za-z]+)? = optionally an apostrophe followed by more letters
    word_pattern = r"\b[A-Za-z]+(?:'[A-Za-z]+)?\b"

    words = re.findall(word_pattern, text)

    print(f"Extracted {len(words)} words")
    return words

In [12]:
def create_partitions(words, book_label, num_partitions=200, words_per_partition=100):
    """
    Create random 100-word partitions from the list of words.

    Parameters:
    - words: List of all words from the book
    - book_label: Label for this book ('a', 'b', or 'c')
    - num_partitions: How many partitions to create (default: 200)
    - words_per_partition: Words in each partition (default: 100)

    Returns:
    - List of dictionaries, each containing one partition
    """

    print(f"  Creating {num_partitions} random partitions of {words_per_partition} words each...")

    total_words = len(words)
    partitions = []

    # Make sure we have enough words
    if total_words < words_per_partition:
        print(f"Not enough words! Need {words_per_partition}, only have {total_words}")
        return partitions

    # Create the requested number of partitions
    for i in range(num_partitions):
        # Pick a random starting position
        # We need to leave room for 100 words, so max start is total_words - 100
        max_start = total_words - words_per_partition
        start_position = random.randint(0, max_start)

        # Extract 100 words starting from that position
        partition_words = words[start_position:start_position + words_per_partition]

        # Join the words back into a text string
        partition_text = ' '.join(partition_words)

        # Create a record for this partition
        partition_record = {
            'partition_id': f"{book_label}_{i+1}",  # Example: "a_1", "a_2", etc.
            'book_label': book_label,
            'partition_number': i + 1,
            'text': partition_text,
            'word_count': len(partition_words)
        }

        partitions.append(partition_record)

    print(f"Created {len(partitions)} partitions")
    return partitions


In [13]:
def process_book(book_id, book_label, num_partitions=200):
    """
    Process one book: download, clean, extract words, create partitions.

    Parameters:
    - book_id: Project Gutenberg book ID
    - book_label: Single letter label
    - num_partitions: Number of 100-word partitions to create

    Returns:
    - List of partition records for this book
    """

    # Step 1: Download
    text = download_book(book_id, book_label)
    if text is None:
        return []

    # Step 2: Clean
    clean = clean_text(text)

    # Step 3: Extract words
    words = extract_words(clean)

    # Step 4: Create partitions
    partitions = create_partitions(words, book_label, num_partitions)

    return partitions

In [14]:
def save_data(dataframe, output_folder):
    """
    Save the data in three different formats: CSV, JSON, and Pickle.

    Parameters:
    - dataframe: Pandas DataFrame containing all partitions
    - output_folder: Folder to save the files
    """

    print("\n--- Saving Data ---")

    # Create output folder if it doesn't exist
    Path(output_folder).mkdir(exist_ok=True)

    # Save as CSV (comma-separated values - can open in Excel)
    csv_path = f"{output_folder}/partitions.csv"
    dataframe.to_csv(csv_path, index=False)
    print(f"Saved CSV file: {csv_path}")

    # Save as JSON (JavaScript Object Notation - human readable)
    json_path = f"{output_folder}/partitions.json"
    dataframe.to_json(json_path, orient='records', indent=2)
    print(f"Saved JSON file: {json_path}")

    # Save as Pickle (Python binary format - preserves exact data types)
    pickle_path = f"{output_folder}/partitions.pkl"
    dataframe.to_pickle(pickle_path)
    print(f"Saved Pickle file: {pickle_path}")

    # Also save a summary statistics file
    summary_path = f"{output_folder}/summary_statistics.txt"
    with open(summary_path, 'w') as f:
        f.write("SUMMARY STATISTICS\n")
        f.write("=" * 50 + "\n\n")
        f.write(f"Total partitions: {len(dataframe)}\n")
        f.write(f"Books processed: {dataframe['book_label'].nunique()}\n\n")
        f.write("Partitions per book:\n")
        f.write(str(dataframe['book_label'].value_counts().sort_index()))
        f.write("\n\n")
        f.write("Sample of first 5 rows:\n")
        f.write(str(dataframe.head()))
    print(f"Saved summary: {summary_path}")


In [15]:
def main():
    """
    Main function that runs the entire program.
    """

    print("=" * 70)
    print("GUTENBERG BOOK PROCESSOR")
    print("=" * 70)

    # ========================================================================
    # CONFIGURATION - Change these settings as needed
    # ========================================================================

    # Three popular books from Project Gutenberg
    books = [
        {'id': 1342, 'label': 'a', 'title': 'Pride and Prejudice by Jane Austen'},
        {'id': 11, 'label': 'b', 'title': 'Alice\'s Adventures in Wonderland by Lewis Carroll'},
        {'id': 84, 'label': 'c', 'title': 'Frankenstein by Mary Shelley'}
    ]

    num_partitions = 200      # Number of partitions per book
    output_folder = "output"  # Where to save results

    # ========================================================================
    # PROCESS EACH BOOK
    # ========================================================================

    all_partitions = []  # This will hold all partitions from all books

    for book in books:
        print(f"\n{'=' * 70}")
        print(f"Processing: {book['title']}")
        print(f"{'=' * 70}")

        # Process this book
        partitions = process_book(book['id'], book['label'], num_partitions)

        # Add these partitions to our complete collection
        all_partitions.extend(partitions)

    # ========================================================================
    # CREATE PANDAS DATAFRAME
    # ========================================================================

    print(f"\n{'=' * 70}")
    print("Creating Pandas DataFrame")
    print(f"{'=' * 70}")

    # Convert list of dictionaries to Pandas DataFrame
    df = pd.DataFrame(all_partitions)

    print(f" Created DataFrame with {len(df)} rows and {len(df.columns)} columns")
    print(f"\nColumns: {list(df.columns)}")
    print(f"\nFirst few rows:")
    print(df.head())

    # Show statistics by book
    print(f"\nPartitions per book:")
    print(df['book_label'].value_counts().sort_index())

    # ========================================================================
    # SAVE RESULTS
    # ========================================================================

    save_data(df, output_folder)

    # ========================================================================
    # DONE!
    # ========================================================================

    print(f"\n{'=' * 70}")
    print(" PROCESSING COMPLETE!")
    print(f"{'=' * 70}")
    print(f"\nAll files saved to '{output_folder}/' folder")
    print(f"Total partitions created: {len(df)}")
    print("\nYou can now:")
    print("  - Open partitions.csv in Excel or any spreadsheet program")
    print("  - View partitions.json in any text editor")
    print("  - Load partitions.pkl in another Python program")

In [16]:
if __name__ == "__main__":
    # Set random seed for reproducibility (same random samples each time)
    random.seed(42)

    # Run the main program
    main()

GUTENBERG BOOK PROCESSOR

Processing: Pride and Prejudice by Jane Austen

--- Downloading Book A (ID: 1342) ---
Successfully downloaded! Size: 743383 characters
  Cleaning text...
Text cleaned! Length: 718447 characters
  Extracting words...
Extracted 127959 words
  Creating 200 random partitions of 100 words each...
Created 200 partitions

Processing: Alice's Adventures in Wonderland by Lewis Carroll

--- Downloading Book B (ID: 11) ---
Successfully downloaded! Size: 144696 characters
  Cleaning text...
Text cleaned! Length: 143067 characters
  Extracting words...
Extracted 27175 words
  Creating 200 random partitions of 100 words each...
Created 200 partitions

Processing: Frankenstein by Mary Shelley

--- Downloading Book C (ID: 84) ---
Successfully downloaded! Size: 419434 characters
  Cleaning text...
Text cleaned! Length: 418285 characters
  Extracting words...
Extracted 75194 words
  Creating 200 random partitions of 100 words each...
Created 200 partitions

Creating Pandas Data