In [2]:
# Imports

!pip install python-docx
!pip install docx

from docx import Document
from io import BytesIO
import re
import os
from pathlib import Path

# from google.colab import files



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


# RAG Mini Project
## Milestone #1 : Create and store Chunks
This notebook shows how to create text chunks from MS Word Documents.  
- Chunks all the word documents in a directory
- Uses python-docs to extract paragraph text for chunking
- Paragraphs are merged depending on parameterizable  max chunk size
- Document cleaning recommended for best results
-- remove diagrams and unnecessary text
-- merge paragraphs that are semantically similar

## Deliverables:
- Selection of multiple documents for your RAG project
- Capture chunks in a pickle file for next step (Embeddings)

When you generate the chunks with the size and semantics you want, then store them into a List and use Python's pickle library to save any Python data structure in a pickle file (.pkl) for later use and then recreate the data structure by loading the pickle file.

Deliverables:

- List of documents for your RAG project
- Jupyter Notebook
- Short summary of your efforts (issues, successes...)
- Pickle file with chunks for next step (Milestone 2)




### How to use pickle (good for Python data)
<code>import pickle<br>
my_list = ['apple', 'banana', 42, [1, 2, 3]]</code>

### Save the list
<code>with open('mylist.pkl', 'wb') as f:<br>
&nbsp;&nbsp;&nbsp;&nbsp; pickle.dump(my_list, f)
</code>
### Load the list
<code>with open('mylist.pkl', 'rb') as f:<br>
&nbsp;&nbsp;&nbsp;&nbsp; loaded_list = pickle.load(f)
</code>

In [6]:
# Extract Chunks using document paragraphs
# Chunk size is controlled by parameter

def extract_fixed_chunks(file_path, chunk_size=1000):
    """
    Extract fixed-size chunks from a Word document.

    Args:
        file_path (str or bytes): Path to Word document or binary content
        chunk_size (int): Target size of each chunk in characters

    Returns:
        list: List of text chunks of approximately chunk_size characters
    """
    try:
        # Handle both file path and binary content
        if isinstance(file_path, bytes):
            doc = Document(BytesIO(file_path))
        else:
            doc = Document(file_path)

        # Extract and clean all text
        full_text = ""
        for para in doc.paragraphs:
            text = para.text.strip()
            if text:  # Skip empty paragraphs
                # Clean the text
                text = re.sub(r'\s+', ' ', text)  # Remove multiple spaces
                full_text += text + " "  # Add space between paragraphs

        # Split text into sentences
        sentences = re.split('(?<=[.!?]) +', full_text)

        chunks = []
        current_chunk = ""

        for sentence in sentences:
            # If adding this sentence would exceed chunk_size
            if len(current_chunk) + len(sentence) > chunk_size:
                # If current chunk is not empty, add it to chunks
                if current_chunk:
                    chunks.append(current_chunk.strip())
                    current_chunk = ""

                # Handle sentences longer than chunk_size
                if len(sentence) > chunk_size:
                    # Split long sentence into fixed-size chunks
                    words = sentence.split()
                    temp_chunk = ""

                    for word in words:
                        if len(temp_chunk) + len(word) + 1 <= chunk_size:
                            temp_chunk += (" " + word if temp_chunk else word)
                        else:
                            chunks.append(temp_chunk.strip())
                            temp_chunk = word

                    if temp_chunk:
                        current_chunk = temp_chunk
                else:
                    current_chunk = sentence
            else:
                # Add sentence to current chunk
                current_chunk += (" " + sentence if current_chunk else sentence)

        # Add the last chunk if not empty
        if current_chunk:
            chunks.append(current_chunk.strip())

        return chunks

    except Exception as e:
        raise Exception(f"Error processing document: {str(e)}")



In [11]:
# Main - Note that chunk size to use is set here in main and overrides default

def main():
    try:
        # Directory containing Word documents
        directory = "/home/javad/Downloads/INFO290/content"

        # Get all .docx files in the directory
        docx_files = list(Path(directory).glob("*.docx"))

        if not docx_files:
            print(f"No Word documents found in {directory}")
            return

        print(f"Found {len(docx_files)} Word documents")

        # Process each document
        for doc_path in docx_files:
            try:
                print(f"\nProcessing: {doc_path.name}")

                # Extract chunks of approximately 500 characters
                # MODIFY this as you see fit
                chunks = extract_fixed_chunks(str(doc_path), chunk_size=500)

                print(f"Created {len(chunks)} chunks")

                # Print first few chunks with their lengths
                for i, chunk in enumerate(chunks[:3], 1):
                    print(f"\nChunk {i} (length: {len(chunk)}):")
                    print(chunk)
                    print("-" * 50)

            except Exception as e:
                print(f"Error processing {doc_path.name}: {str(e)}")
                continue

    except Exception as e:
        print(f"Error accessing directory: {str(e)}")

# Call main and start the chunking
main()



Found 5 Word documents

Processing: 5.Global Approaches to Data Protection.docx
Created 4 chunks

Chunk 1 (length: 293):
Global Approaches to Data Protection: A Comparative Analysis Data protection approaches vary significantly across different regions of the world, reflecting diverse cultural, political, and economic priorities. This document examines how different regions approach privacy and data protection.
--------------------------------------------------

Chunk 2 (length: 495):
European Union Approach: Comprehensive Protection: - GDPR as global standard - Privacy as fundamental right - Strict consent requirements - Significant penalties - Data Protection Authorities Key Features: - Data minimization principles - Purpose limitation - Storage limitations - Individual rights emphasis - Cross-border transfer restrictions United States Approach: Sectoral Regulation: - Industry-specific laws - State-level legislation - FTC enforcement - Market-driven solutions - Limited
--------------

/content/sample_data/mydata