In [17]:
from pathlib import Path
from langchain_unstructured import UnstructuredLoader

# Descobrir diret√≥rios relevantes a partir do contexto atual
_CWD = Path.cwd()
_DATA_DIR_CANDIDATES = [
    _CWD / "data",
    _CWD / "src/notebooks/data"
]

for candidate in _DATA_DIR_CANDIDATES:
    if candidate.exists():
        DATA_DIR = candidate.resolve()
        break
else:
    searched = ", ".join(str(c.resolve()) for c in _DATA_DIR_CANDIDATES)
    raise FileNotFoundError(f"N√£o foi poss√≠vel localizar o diret√≥rio de PDFs. Caminhos testados: {searched}")

OUTPUT_ROOT = DATA_DIR.parent / "outputs"
ELEMENTS_OUTPUT_DIR = OUTPUT_ROOT / "elements"
PAGE_PLOTS_DIR = OUTPUT_ROOT / "page_plots"
CHUNK_PLOTS_DIR = OUTPUT_ROOT / "chunk_plots"

for directory in (OUTPUT_ROOT, ELEMENTS_OUTPUT_DIR, PAGE_PLOTS_DIR, CHUNK_PLOTS_DIR):
    directory.mkdir(parents=True, exist_ok=True)

PDF_PATHS = sorted({p.resolve() for pattern in ("*.pdf", "*.PDF") for p in DATA_DIR.glob(pattern)})

if not PDF_PATHS:
    raise FileNotFoundError(f"Nenhum PDF encontrado em {DATA_DIR}")

print(f"üìÅ Diret√≥rio de dados: {DATA_DIR}")
print(f"üñºÔ∏è Diret√≥rio de plots por p√°gina: {PAGE_PLOTS_DIR}")
print(f"üß© Diret√≥rio de plots de chunks: {CHUNK_PLOTS_DIR}")
print(f"üìÑ PDFs detectados: {len(PDF_PATHS)}")

üìÅ Diret√≥rio de dados: C:\Users\User\Workplace\not-a-rag-chat\src\notebooks\data
üñºÔ∏è Diret√≥rio de plots por p√°gina: C:\Users\User\Workplace\not-a-rag-chat\src\notebooks\outputs\page_plots
üß© Diret√≥rio de plots de chunks: C:\Users\User\Workplace\not-a-rag-chat\src\notebooks\outputs\chunk_plots
üìÑ PDFs detectados: 9


In [18]:
# Utilit√°rio para serializar elementos carregados
import json

def _element_to_serializable(el):
    # Prefer pydantic v2 model_dump, then v1 dict, then __dict__ fallback
    if hasattr(el, "model_dump"):
        data = el.model_dump()
    elif hasattr(el, "dict"):
        data = el.dict()
    elif hasattr(el, "__dict__"):
        data = el.__dict__
    else:
        return str(el)

    def _convert(obj):
        if isinstance(obj, dict):
            return {k: _convert(v) for k, v in obj.items()}
        if isinstance(obj, (list, tuple)):
            return [_convert(v) for v in obj]
        if isinstance(obj, (str, int, float, bool)) or obj is None:
            return obj
        try:
            return [_convert(v) for v in obj]
        except Exception:
            return str(obj)

    return _convert(data)

def dump_elements(elements, destination_path):
    destination_path.parent.mkdir(parents=True, exist_ok=True)
    with open(destination_path, "w", encoding="utf-8") as f:
        json.dump([_element_to_serializable(element) for element in elements], f, ensure_ascii=False, indent=4)

In [19]:
import fitz
from PIL import Image
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from collections import Counter
import matplotlib.colors as mcolors

def generate_category_colors(categories):
    """Gera cores distintas para cada categoria usando colormap."""
    if not categories:
        return {}
    n_categories = len(categories)
    cmap = plt.cm.get_cmap('tab20' if n_categories <= 20 else 'hsv')
    colors = {}
    for idx, cat in enumerate(categories):
        color_rgba = cmap(idx / max(n_categories - 1, 1))
        colors[cat] = mcolors.rgb2hex(color_rgba[:3])
    return colors

def highlight_elements(pdf_path, elements, output_dir, render_scale=2):
    pdf_output_dir = (output_dir / pdf_path.stem)
    pdf_output_dir.mkdir(parents=True, exist_ok=True)
    doc = fitz.open(pdf_path)
    categories = sorted(set(elem.metadata.get('category', 'Unknown') for elem in elements))
    category_colors = generate_category_colors(categories)
    page_stats = []

    try:
        for page_index in range(doc.page_count):
            page_num = page_index + 1
            page_elements = [elem for elem in elements if elem.metadata.get('page_number') == page_num]
            if not page_elements:
                continue

            page = doc[page_index]
            pix = page.get_pixmap(matrix=fitz.Matrix(render_scale, render_scale))
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

            fig, ax = plt.subplots(1, 1, figsize=(14, 18))
            ax.imshow(img)
            ax.axis('off')

            for elem in page_elements:
                coords = elem.metadata.get('coordinates')
                category = elem.metadata.get('category', 'Unknown')
                if coords and coords.get('points'):
                    try:
                        points = coords['points']
                        x1, y1 = points[0]
                        x2, y2 = points[2]
                        x = x1 * render_scale
                        y = y1 * render_scale
                        width = (x2 - x1) * render_scale
                        height = (y2 - y1) * render_scale
                        color = category_colors.get(category, '#CCCCCC')
                        rect = patches.Rectangle(
                    (x, y), width, height,
                    linewidth=0,
                    edgecolor='none',
                    facecolor=color,
                    alpha=0.4
                )
                        ax.add_patch(rect)
                    except Exception:
                        continue

            legend_elements = [patches.Patch(facecolor=color, label=cat, alpha=0.6) for cat, color in category_colors.items()]
            if legend_elements:
                ax.legend(handles=legend_elements, loc='upper right', fontsize=11, framealpha=0.9)

            page_categories = [elem.metadata.get('category', 'Unknown') for elem in page_elements]
            category_counts = Counter(page_categories)
            stats_text = " | ".join([f"{cat}: {count}" for cat, count in category_counts.most_common(3)])
            if stats_text:
                ax.set_title(f"P√°gina {page_num} | {stats_text}", fontsize=14, pad=10)
            else:
                ax.set_title(f"P√°gina {page_num}", fontsize=14, pad=10)

            fig.tight_layout()
            output_path = pdf_output_dir / f"page-{page_num:03d}.png"
            fig.savefig(output_path, dpi=200, bbox_inches='tight')
            plt.close(fig)
            page_stats.append((page_num, len(page_elements), output_path))
    finally:
        doc.close()

    print(f"‚úÖ Plots por p√°gina salvos em {pdf_output_dir}")
    return page_stats

In [20]:
print("üì¶ Persist√™ncia no Chroma desabilitada ‚Äî apenas gera√ß√£o de chunks para os plots.")

üì¶ Persist√™ncia no Chroma desabilitada ‚Äî apenas gera√ß√£o de chunks para os plots.


In [21]:
from collections import defaultdict
from langchain_core.documents import Document

def group_elements_by_page(elements):
    pages_dict = defaultdict(list)
    for elem in elements:
        page_num = elem.metadata.get('page_number')
        if page_num is not None:
            pages_dict[page_num].append(elem)
    return pages_dict

def build_parent_documents(elements, pdf_path):
    pages_dict = group_elements_by_page(elements)
    print(f"üìÑ Total de p√°ginas encontradas: {len(pages_dict)}")
    parent_documents = []
    for page_num in sorted(pages_dict.keys()):
        page_elements = pages_dict[page_num]
        page_text = '\n\n'.join(elem.page_content for elem in page_elements)
        first_elem = page_elements[0]
        parent_metadata = {
            'page_number': page_num,
            'source': first_elem.metadata.get('source', str(pdf_path)),
            'filename': first_elem.metadata.get('filename', pdf_path.name),
            'source_path': str(pdf_path),
            'total_elements': len(page_elements),
            'type': 'parent_page'
        }
        parent_documents.append(Document(page_content=page_text, metadata=parent_metadata))
    print(f"‚úÖ {len(parent_documents)} parent documents criados")
    return parent_documents, pages_dict

In [22]:
import uuid
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document

model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}

hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

CHUNKING_STRATEGY = "semantic"  # "semantic" ou "character"

SEMANTIC_CONFIG = {
    'breakpoint_threshold_type': 'percentile',
    'breakpoint_threshold_amount': 0.2,
    'buffer_size': 1
}

CHARACTER_CONFIG = {
    'chunk_size': 380,
    'chunk_overlap': 60,
    'separators': ["\n\n", "\n", " " ]
}

def build_text_splitter():
    if CHUNKING_STRATEGY == "semantic":
        splitter = SemanticChunker(
            embeddings=hf,
            breakpoint_threshold_type=SEMANTIC_CONFIG['breakpoint_threshold_type'],
            breakpoint_threshold_amount=SEMANTIC_CONFIG['breakpoint_threshold_amount'],
            buffer_size=SEMANTIC_CONFIG['buffer_size']
)
        print(
            "üîß SemanticChunker ativo | threshold_type=",
            SEMANTIC_CONFIG['breakpoint_threshold_type'],
            " | threshold_amount=",
            SEMANTIC_CONFIG['breakpoint_threshold_amount'],
            " | buffer_size=",
            SEMANTIC_CONFIG['buffer_size']
)
        return splitter
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHARACTER_CONFIG['chunk_size'],
        chunk_overlap=CHARACTER_CONFIG['chunk_overlap'],
        separators=CHARACTER_CONFIG['separators'],
        length_function=len
)
    print(
        "üîß RecursiveCharacterTextSplitter ativo | chunk_size=",
        CHARACTER_CONFIG['chunk_size'],
        " | chunk_overlap=",
        CHARACTER_CONFIG['chunk_overlap']
    )
    return splitter

text_splitter = build_text_splitter()

def create_element_map(page_elements):
    element_map = []
    current_pos = 0
    for elem in page_elements:
        text = elem.page_content or ""
        start_pos = current_pos
        end_pos = current_pos + len(text)
        element_map.append({
            'element': elem,
            'start_pos': start_pos,
            'end_pos': end_pos,
            'text': text
        })
        current_pos = end_pos + 2
    return element_map

def calculate_overlap(chunk_start, chunk_end, elem_start, elem_end):
    overlap_start = max(chunk_start, elem_start)
    overlap_end = min(chunk_end, elem_end)
    if overlap_start >= overlap_end:
        return 0.0
    elem_length = elem_end - elem_start
    if elem_length == 0:
        return 0.0
    overlap_length = overlap_end - overlap_start
    return overlap_length / elem_length

def create_semantic_chunks(page_elements):
    full_text = '\n\n'.join(elem.page_content or "" for elem in page_elements)
    element_map = create_element_map(page_elements)
    semantic_texts = text_splitter.split_text(full_text)
    semantic_chunks = []
    current_chunk_pos = 0
    for chunk_text in semantic_texts:
        chunk_start = full_text.find(chunk_text, current_chunk_pos)
        if chunk_start == -1:
            chunk_start = current_chunk_pos
        chunk_end = chunk_start + len(chunk_text)
        current_chunk_pos = chunk_end
        contributing_elements = []
        for elem_info in element_map:
            overlap_pct = calculate_overlap(
                chunk_start, chunk_end,
                elem_info['start_pos'], elem_info['end_pos']
            )
            if overlap_pct >= 0.10:
                contributing_elements.append({
                    'element': elem_info['element'],
                    'coordinates': elem_info['element'].metadata.get('coordinates'),
                    'category': elem_info['element'].metadata.get('category'),
                    'content': elem_info['element'].page_content,
                    'overlap_percentage': overlap_pct
                })
        if not contributing_elements:
            best_elem = None
            best_overlap = 0
            for elem_info in element_map:
                overlap_pct = calculate_overlap(
                    chunk_start, chunk_end,
                    elem_info['start_pos'], elem_info['end_pos']
                )
                if overlap_pct > best_overlap:
                    best_overlap = overlap_pct
                    best_elem = elem_info['element']
            if best_elem is not None:
                contributing_elements = [{
                    'element': best_elem,
                    'coordinates': best_elem.metadata.get('coordinates'),
                    'category': best_elem.metadata.get('category'),
                    'content': best_elem.page_content,
                    'overlap_percentage': best_overlap
                }]
        categories = [elem['category'] for elem in contributing_elements if elem['category']]
        predominant_category = max(set(categories), key=categories.count) if categories else 'Unknown'
        semantic_chunks.append({
            'text': chunk_text,
            'contributing_elements': contributing_elements,
            'category': predominant_category,
            'source_elements_count': len(contributing_elements),
            'chunk_position': (chunk_start, chunk_end)
        })
    return semantic_chunks

INFO: Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2


üîß SemanticChunker ativo | threshold_type= percentile  | threshold_amount= 0.2  | buffer_size= 1


In [23]:
import fitz
from PIL import Image
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import matplotlib.colors as mcolors

def highlight_chunks(pdf_path, elements_with_coords, output_dir, render_scale=2):
    if not elements_with_coords:
        print(f"‚ö†Ô∏è Nenhum chunk sem√¢ntico para renderizar em {pdf_path.name}")
        return None
    pdf_output_dir = (output_dir / pdf_path.stem)
    pdf_output_dir.mkdir(parents=True, exist_ok=True)
    doc = fitz.open(pdf_path)

    try:
        chunks_by_page = defaultdict(list)
        for child_id, chunk_info in elements_with_coords.items():
            page_num = chunk_info['page_number']
            chunks_by_page[page_num].append(chunk_info)

        print(f"Total de p√°ginas com chunks: {len(chunks_by_page)}")
        for page_num in sorted(chunks_by_page.keys()):
            page_chunks = chunks_by_page[page_num]
            page = doc[page_num - 1]
            pix = page.get_pixmap(matrix=fitz.Matrix(render_scale, render_scale))
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

            fig, ax = plt.subplots(1, 1, figsize=(14, 18))
            ax.imshow(img)
            ax.axis('off')

            chunk_colors = plt.cm.get_cmap('Set3')(range(len(page_chunks)))
            for chunk_idx, chunk_info in enumerate(page_chunks):
                color = mcolors.rgb2hex(chunk_colors[chunk_idx][:3])
                for elem_data in chunk_info['contributing_elements']:
                    coords = elem_data['coordinates']
                    if coords and coords.get('points'):
                        try:
                            points = coords['points']
                            x1, y1 = points[0]
                            x2, y2 = points[2]
                            x = x1 * render_scale
                            y = y1 * render_scale
                            width = (x2 - x1) * render_scale
                            height = (y2 - y1) * render_scale
                            rect = patches.Rectangle(
                                (x, y), width, height,
                                linewidth=1,
                                edgecolor=color,
                                facecolor=color,
                                alpha=0.3
)
                            ax.add_patch(rect)
                        except Exception:
                            continue

            ax.set_title(f"P√°gina {page_num} - {len(page_chunks)} chunks sem√¢nticos", fontsize=14, pad=10)
            fig.tight_layout()
            output_path = pdf_output_dir / f"page-{page_num:03d}.png"
            fig.savefig(output_path, dpi=200, bbox_inches='tight')
            plt.close(fig)

        return pdf_output_dir
    finally:
        doc.close()

In [26]:
import traceback
from datetime import datetime

processing_results = []
start_time = datetime.now()

for pdf_path in PDF_PATHS:
    print(f"\n{'='*80}")
    print(f"üìÑ Processando arquivo: {pdf_path.name}")
    print(f"{'='*80}")
    try:
        loader = UnstructuredLoader(str(pdf_path), partition_kwargs={"languages": ["por"], "strategy": "hi_res"})
        elements = loader.load()
        print(f"Number of documents loaded: {len(elements)}")

        for elem in elements:
            elem.metadata.setdefault('filename', pdf_path.name)
            elem.metadata.setdefault('source', str(pdf_path))
            elem.metadata['source_path'] = str(pdf_path)

        serialized_path = ELEMENTS_OUTPUT_DIR / f"{pdf_path.stem}.json"
        dump_elements(elements, serialized_path)
        page_stats = highlight_elements(pdf_path, elements, PAGE_PLOTS_DIR)
        parent_documents, pages_dict = build_parent_documents(elements, pdf_path)
        semantic_payload = build_semantic_children(parent_documents, pages_dict, pdf_path)

        chunks_output_dir = ELEMENTS_OUTPUT_DIR / "chunks"
        chunks_output_dir.mkdir(parents=True, exist_ok=True)
        chunks_json_path = chunks_output_dir / f"{pdf_path.stem}_chunks.json"

        chunk_records = []
        for child_id, chunk_info in semantic_payload['elements_with_coords'].items():
            chunk_records.append({
                "chunk_id": child_id,
                "text": chunk_info['content'],
                "category": chunk_info['category'],
                "page_number": chunk_info['page_number'],
                "source_elements_count": chunk_info['source_elements_count'],
                "chunk_position": chunk_info['chunk_position'],
                "contributing_elements": [
                    {
                        "coordinates": elem_data['coordinates'],
                        "category": elem_data['category'],
                        "content": elem_data['content'],
                        "overlap_percentage": elem_data['overlap_percentage']
                    }
                    for elem_data in chunk_info['contributing_elements']
                ]
            })

        with open(chunks_json_path, "w", encoding="utf-8") as fp:
            json.dump(chunk_records, fp, ensure_ascii=False, indent=2)
        print(f"üíæ Registro dos chunks salvo em {chunks_json_path}")

        chunk_plots_dir = highlight_chunks(pdf_path, semantic_payload['elements_with_coords'], CHUNK_PLOTS_DIR)

        chunk_pages = {info['page_number'] for info in semantic_payload['elements_with_coords'].values()}
        processing_results.append({
            'pdf': pdf_path,
            'elements_path': serialized_path,
            'chunks_path': chunks_json_path,
            'page_plots': page_stats,
            'chunk_plots_dir': chunk_plots_dir,
            'total_elements': len(elements),
            'parent_documents': len(parent_documents),
            'semantic_children': len(semantic_payload['semantic_children']),
            'chunk_pages': len(chunk_pages)
        })
    except Exception as exc:
        print(f"‚ö†Ô∏è Erro ao processar {pdf_path.name}: {exc}")
        traceback.print_exc()
        continue

elapsed = datetime.now() - start_time
print(f"\n‚è±Ô∏è Pipeline conclu√≠do em {elapsed} (hh:mm:ss)")

if processing_results:
    print("\nResumo por arquivo:")
    for result in processing_results:
        page_plot_count = len(result['page_plots']) if result['page_plots'] else 0
        chunk_plot_path = str(result['chunk_plots_dir']) if result['chunk_plots_dir'] else 'N/A'
        summary_line = "- {name}: {elements} elementos, {parents} parents, {chunks} chunks, plots de p√°ginas: {pages}, plots de chunks: {chunk_pages}.".format(
            name=result['pdf'].name,
            elements=result['total_elements'],
            parents=result['parent_documents'],
            chunks=result['semantic_children'],
            pages=page_plot_count,
            chunk_pages=result['chunk_pages']
        )
        print(summary_line)
        print(f"  JSON de elementos: {result['elements_path']}")
        print(f"  JSON de chunks: {result['chunks_path']}")
        if page_plot_count:
            first_page_plot = result['page_plots'][0][2]
            print(f"  Exemplos de plots de p√°gina: {first_page_plot.parent}")
        print(f"  Plots de chunks: {chunk_plot_path}")
else:
    print("Nenhum arquivo foi processado.")


üìÑ Processando arquivo: 6608-06e0e21ca08fc4373941c452c916f536.pdf


  cmap = plt.cm.get_cmap('tab20' if n_categories <= 20 else 'hsv')
  cmap = plt.cm.get_cmap('tab20' if n_categories <= 20 else 'hsv')


Number of documents loaded: 52
‚úÖ Plots por p√°gina salvos em C:\Users\User\Workplace\not-a-rag-chat\src\notebooks\outputs\page_plots\6608-06e0e21ca08fc4373941c452c916f536
üìÑ Total de p√°ginas encontradas: 6
‚úÖ 6 parent documents criados
üîß Criando chunks sem√¢nticos preservando elementos individuais...
‚úÖ Plots por p√°gina salvos em C:\Users\User\Workplace\not-a-rag-chat\src\notebooks\outputs\page_plots\6608-06e0e21ca08fc4373941c452c916f536
üìÑ Total de p√°ginas encontradas: 6
‚úÖ 6 parent documents criados
üîß Criando chunks sem√¢nticos preservando elementos individuais...
  P√°gina 1: 32 elementos ‚Üí 27 chunks sem√¢nticos
  P√°gina 1: 32 elementos ‚Üí 27 chunks sem√¢nticos
  P√°gina 2: 12 elementos ‚Üí 9 chunks sem√¢nticos
  P√°gina 3: 2 elementos ‚Üí 1 chunks sem√¢nticos
  P√°gina 4: 2 elementos ‚Üí 1 chunks sem√¢nticos
  P√°gina 5: 2 elementos ‚Üí 1 chunks sem√¢nticos
  P√°gina 6: 2 elementos ‚Üí 1 chunks sem√¢nticos

‚úÖ 6 parents armazenados
‚úÖ 40 chunks sem√¢nticos g

  chunk_colors = plt.cm.get_cmap('Set3')(range(len(page_chunks)))



üìÑ Processando arquivo: 6608-0a1d0058940a3f53d22f922124b0e884.pdf
Number of documents loaded: 57
Number of documents loaded: 57
‚úÖ Plots por p√°gina salvos em C:\Users\User\Workplace\not-a-rag-chat\src\notebooks\outputs\page_plots\6608-0a1d0058940a3f53d22f922124b0e884
üìÑ Total de p√°ginas encontradas: 7
‚úÖ 7 parent documents criados
üîß Criando chunks sem√¢nticos preservando elementos individuais...
‚úÖ Plots por p√°gina salvos em C:\Users\User\Workplace\not-a-rag-chat\src\notebooks\outputs\page_plots\6608-0a1d0058940a3f53d22f922124b0e884
üìÑ Total de p√°ginas encontradas: 7
‚úÖ 7 parent documents criados
üîß Criando chunks sem√¢nticos preservando elementos individuais...
  P√°gina 1: 5 elementos ‚Üí 1 chunks sem√¢nticos
  P√°gina 1: 5 elementos ‚Üí 1 chunks sem√¢nticos
  P√°gina 2: 12 elementos ‚Üí 16 chunks sem√¢nticos
  P√°gina 2: 12 elementos ‚Üí 16 chunks sem√¢nticos


KeyboardInterrupt: 

In [None]:
# # P√≥s processamento: 

# from langchain_litellm import ChatLiteLLM
# from langchain_openai import ChatOpenAI

# chat = ChatOpenAI(
#     openai_api_base="http://localhost:4000",  # Your proxy URL
#     model="gpt-4o",
#     temperature=0.7,
#     extra_body={
#         "metadata": {
#             "tags": ["proxy", "team-alpha", "feature-flagged"],
#             "generation_name": "customer-onboarding",
#             "trace_user_id": "user-12345"
#         }
#     }
# )



SyntaxError: expression expected after dictionary key and ':' (3434448406.py, line 7)