# Convert PDF/EPUB documents to plain text

In [14]:
from data.data_preprocessing import DocumentExtractor

In [15]:
# Create the shared DocumentExtractor used by the extraction cells below.
# output_dir is handed to the extractor as-is (presumably where the .txt
# results are written — confirm against DocumentExtractor).
extractor = DocumentExtractor(
    output_dir="data/books",
)

In [4]:
# Sample PDF on the local Google Drive mount.
# NOTE(review): not referenced by any other cell visible in this notebook.
test_path = (
    "/Users/iulia/Library/CloudStorage/GoogleDrive-iulia.feofanova@gmail.com"
    "/My Drive/Books/prep/AlgorithmicCollusionbyLLM.pdf"
)

In [8]:
# Absolute path of the single book used for the one-off extraction below.
# The literal is split across lines for readability; the value is unchanged.
book1 = (
    "/Users/iulia/Library/CloudStorage/GoogleDrive-iulia.feofanova@gmail.com"
    "/My Drive/BrainStormingAgent/"
    "Breakthrough thinking a guide to creative thinking and idea generation"
    " (Vogel, Thomas) (Z-Library).pdf"
)

In [9]:
# Run the extractor on one document (book1) and keep its result object
# so it can be passed to the report cell.
result = extractor.extract_single_document(
    book1,
)

In [10]:
# Print and save the extraction report for the single result above.
# NOTE(review): the recorded output lists "PDF files: 2" / "PyPDF2 successes: 2"
# although only 1 file was processed — the extractor's counters may accumulate
# across calls on the same instance; confirm in DocumentExtractor.
extractor.generate_extraction_report(
    [result],
)


DOCUMENT EXTRACTION REPORT
Total files processed: 1
Successful extractions: 1 (100.0%)
Failed extractions: 0 (0.0%)

File Type Breakdown:
  PDF files: 2
  EPUB files: 0

Extraction Method Success:
  PyPDF2 successes: 2
  pdfplumber successes: 0
  ebooklib successes: 0
  epub2txt successes: 0

Quality Metrics:
  Total words extracted: 98,922
  Average words per file: 98922
  Total processing time: 6.39s

Report saved to: data/processed_texts/extraction_report.json


In [16]:
# Directory holding the source documents for the batch extraction below
# (absolute, machine-specific path; the trailing slash is kept as in the original).
doc_dir = "/Users/iulia/Documents/Documents/StudyMaterials/Brainstorming_agent_RAG_sources/"

In [17]:
# Extract every document found in doc_dir in one call; the recorded output
# shows that this also prints the extraction report.
# NOTE(review): the name `results` is rebound by the text-cleaning cell further
# down, so the extraction results are not kept past that point.
results = extractor.batch_extract(
    doc_dir,
)



DOCUMENT EXTRACTION REPORT
Total files processed: 8
Successful extractions: 8 (100.0%)
Failed extractions: 0 (0.0%)

File Type Breakdown:
  PDF files: 4
  EPUB files: 4

Extraction Method Success:
  PyPDF2 successes: 4
  pdfplumber successes: 0
  ebooklib successes: 4
  epub2txt successes: 0

Quality Metrics:
  Total words extracted: 542,742
  Average words per file: 67843
  Total processing time: 16.16s

Report saved to: data/processed_texts/extraction_report.json


# Clean text

In [1]:
from data.text_preprocessing.text_preprocessor_main import TextPreprocessor

In [2]:
# Text cleaner reading from the books directory, with DEBUG-level logging
# requested so the individual cleaning steps show up in the log output.
preprocessor = TextPreprocessor(
    input_dir='/Users/iulia/Documents/Documents/StudyMaterials/brainstorming_agent/data/books',
    log_level="DEBUG",
)

In [3]:
# Clean every file in the preprocessor's input directory.
# The summary cell iterates the return value with .items() as
# filename -> success flag.
results = (
    preprocessor.process_directory()
)


2025-05-30 19:06:40,822 - INFO - Found 8 files to process
2025-05-30 19:06:40,823 - INFO - Processing: Breakthrough thinking a guide to creative thinking and idea generation (Vogel, Thomas) (Z-Library).txt
2025-05-30 19:06:40,836 - DEBUG - Encoding for Breakthrough thinking a guide to creative thinking and idea generation (Vogel, Thomas) (Z-Library).txt: utf-8 (confidence: 0.99)
2025-05-30 19:06:40,838 - DEBUG - Successfully read Breakthrough thinking a guide to creative thinking and idea generation (Vogel, Thomas) (Z-Library).txt (599281 chars)
2025-05-30 19:06:40,839 - INFO - Original length: 599281 chars
2025-05-30 19:06:40,839 - DEBUG - Cleaning text...
2025-05-30 19:06:40,973 - DEBUG - Text cleaning complete
2025-05-30 19:06:40,973 - DEBUG - Standardizing format...
2025-05-30 19:06:40,974 - INFO - Removed 0 header/footer lines
2025-05-30 19:06:41,009 - DEBUG - Format standardization complete
2025-05-30 19:06:41,009 - INFO - Final length: 588518 chars (98.2%)
2025-05-30 19:06:41,01

In [6]:
# Per-file outcome of the cleaning step: a tick for success, a cross for failure.
banner = "=" * 40
print("\n" + banner)
print("RESULTS")
print(banner)
for filename, success in results.items():
    print(f"{'✓' if success else '✗'} {filename}")


RESULTS
✓ Breakthrough thinking a guide to creative thinking and idea generation (Vogel, Thomas) (Z-Library).txt
✓ Cracking Creativity The Secrets of Creative Genius (Michael Michalko [Michalko, Michael]) (Z-Library).txt
✓ Applied Imagination - Principles and Procedures of Creative Writing (Alex Osborn alex faickney osborn) (Z-Library).txt
✓ Where Good Ideas Come From (Johnson Steven) (Z-Library).txt
✓ Six Thinking Hats (Edward de Bono) (Z-Library).txt
✓ Sprint How to Solve Big Problems and Test New Ideas in Just Five Days (Jake Knapp  John Zeratsky  Braden Kowitz) (Z-Library).txt
✓ How to Get Ideas, Second Edition (Jack Foster) (Z-Library).txt
✓ The Idea Hunter (Andy Boynton  Fischer, Bill  Bole, William) (Z-Library).txt


# Chunk text

In [8]:
from data.text_chunker import IntelligentTextChunker

In [10]:
# Chunker configured with a 800–1300 size window and an overlap of 100.
# NOTE(review): the unit of these sizes (tokens vs. characters) is defined by
# IntelligentTextChunker and is not visible here — confirm.
chunker = IntelligentTextChunker(min_chunk_size=800, max_chunk_size=1300, overlap_size=100)

In [12]:
from pathlib import Path


# Locate the cleaned texts and prepare the output folder for the chunk files.
# NOTE: absolute, machine-specific path — adjust when running elsewhere.
input_dir = Path('/Users/iulia/Documents/Documents/StudyMaterials/brainstorming_agent/data/books/cleaned')
if not input_dir.is_dir():
    # Fail fast: mkdir(parents=True) below would otherwise silently create a
    # missing or mistyped input tree and the run would continue with 0 files.
    raise FileNotFoundError(f"Input directory not found: {input_dir}")

output_dir = input_dir / "chunks"
output_dir.mkdir(parents=True, exist_ok=True)

# The glob is non-recursive, so files inside chunks/ are not picked up.
# Sorting makes the processing order reproducible across runs and machines
# instead of depending on filesystem listing order.
text_files = sorted(input_dir.glob("*.txt"))

print(f"Found {len(text_files)} text files to chunk")



Found 8 text files to chunk


In [13]:
# Chunk each cleaned text and write every chunk to its own .txt file:
# a small metadata header, a separator line, then the chunk content.
all_chunks = []
for file_path in text_files:
    # Load the cleaned text of this book
    text = file_path.read_text(encoding='utf-8')

    # Split it into chunks, tagged with the source file name
    chunks = chunker.chunk_text(text, file_path.name)

    # One output file per chunk, named after the chunk id
    for chunk in chunks:
        chunk_file = output_dir / f"{chunk.chunk_id}.txt"
        with open(chunk_file, 'w', encoding='utf-8') as out:
            header_lines = [
                f"# {chunk.chunk_id}\n",
                f"Source: {chunk.source_file}\n",
                f"Type: {chunk.chunk_type}\n",
                f"Tokens: {chunk.token_count}\n",
            ]
            # The section line is only written when a hierarchy is present
            if chunk.section_hierarchy:
                header_lines.append(f"Section: {' > '.join(chunk.section_hierarchy)}\n")
            header_lines.append("\n" + "=" * 50 + "\n\n")
            out.write("".join(header_lines))
            out.write(chunk.content)

    all_chunks.extend(chunks)



2025-05-30 22:51:05,396 - INFO - Chunking text from cleaned_Breakthrough thinking a guide to creative thinking and idea generation (Vogel, Thomas) (Z-Library).txt
2025-05-30 22:51:05,510 - INFO - Created 113 chunks from cleaned_Breakthrough thinking a guide to creative thinking and idea generation (Vogel, Thomas) (Z-Library).txt
2025-05-30 22:51:05,637 - INFO - Chunking text from cleaned_Six Thinking Hats (Edward de Bono) (Z-Library).txt
2025-05-30 22:51:05,707 - INFO - Created 43 chunks from cleaned_Six Thinking Hats (Edward de Bono) (Z-Library).txt
2025-05-30 22:51:05,723 - INFO - Chunking text from cleaned_The Idea Hunter (Andy Boynton  Fischer, Bill  Bole, William) (Z-Library).txt
2025-05-30 22:51:05,769 - INFO - Created 55 chunks from cleaned_The Idea Hunter (Andy Boynton  Fischer, Bill  Bole, William) (Z-Library).txt
2025-05-30 22:51:05,786 - INFO - Chunking text from cleaned_How to Get Ideas, Second Edition (Jack Foster) (Z-Library).txt
2025-05-30 22:51:05,823 - INFO - Created 4

In [14]:
# Save metadata for all chunks next to the chunk files
chunker.save_chunks_metadata(all_chunks, output_dir / "chunks_metadata.json")

# Summary of the chunking run
chunk_total = len(all_chunks)
print("\nChunking complete!")
print(f"Created {chunk_total} chunks")
if chunk_total:
    mean_tokens = sum(c.token_count for c in all_chunks) / chunk_total
    print(f"Average chunk size: {mean_tokens:.0f} tokens")
else:
    # Guard: with no chunks the average would raise ZeroDivisionError
    print("Average chunk size: n/a (no chunks were created)")
print(f"Chunks saved to: {output_dir}")

2025-05-30 22:51:16,625 - INFO - Saved metadata for 626 chunks to /Users/iulia/Documents/Documents/StudyMaterials/brainstorming_agent/data/books/cleaned/chunks/chunks_metadata.json



Chunking complete!
Created 626 chunks
Average chunk size: 1291 tokens
Chunks saved to: /Users/iulia/Documents/Documents/StudyMaterials/brainstorming_agent/data/books/cleaned/chunks
