In [None]:
%pip install -qqqU PyPDF2 langchain langchain-community matplotlib pypdf

In [None]:
from PyPDF2 import PdfReader

# Open the regulation PDF; the cell output is its number of pages.
reader = PdfReader(
    "../data/regulamento-dos-cursos-de-graduacao-da-UFRN-2024.pdf"
)
len(reader.pages)

In [None]:
# Concatenate the extracted text of every page, in page order, into one string.
#
# The previous loop called `text.strip()` and threw the result away (strings
# are immutable), so pages were always joined unstripped. That dead statement
# is dropped and the pages are still joined without any separator, so every
# character offset computed from `raw_text` downstream is unchanged.
# `or ""` guards against a page for which no text is returned.
raw_text = "".join((page.extract_text() or "") for page in reader.pages)

print(len(raw_text))

In [None]:
def _find_all(text, marker):
    """Return the index of every occurrence of ``marker`` in ``text``, ascending."""
    positions = []
    pos = text.find(marker)
    while pos != -1:
        positions.append(pos)
        # Resume one character later so no start position is skipped.
        pos = text.find(marker, pos + 1)
    return positions


# Offsets at which each structural marker of the regulation appears.
titulo_starts = _find_all(raw_text, 'TÍTULO')
capitulo_starts = _find_all(raw_text, 'CAPÍTULO')
secoes_starts = _find_all(raw_text, 'Seção')
subsecao_starts = _find_all(raw_text, 'Subseção')
artigo_starts = _find_all(raw_text, 'Art.')
# Paragraph markers ('Parágrafo único.', '§') are deliberately not split
# points: paragraphs stay inside the article they belong to.

section_starts = sorted(
    titulo_starts + secoes_starts + artigo_starts + capitulo_starts + subsecao_starts
)

# Each fragment runs from one marker to the next. The end of the text is added
# as a final boundary so the LAST fragment (from the last marker to the end of
# the document) is kept; pairing only consecutive markers silently dropped it.
boundaries = section_starts + [len(raw_text)]
splits = [raw_text[start:end] for start, end in zip(boundaries, boundaries[1:])]

splits

In [None]:
import json
def create_dictionary(strings_list):
    """Build a nested outline of the document from its ordered fragments.

    Each fragment is stripped and has its newlines removed, then classified by
    its prefix: 'TÍTULO' > 'CAPÍTULO' > 'Seção' > 'Subseção' are headings (in
    that nesting order) and 'Art' marks an article.

    Args:
        strings_list: Iterable of text fragments in document order.

    Returns:
        A dict in which every heading maps to a dict of its children and every
        article maps to ``None``. An article is stored under the innermost
        heading currently open, or at the top level when no title is open.
        A heading whose parent level is not open is ignored. Fragments with
        any other prefix are ignored.
    """
    # Heading prefixes ordered from the outermost to the innermost level.
    heading_prefixes = ('TÍTULO', 'CAPÍTULO', 'Seção', 'Subseção')

    document_dict = {}
    # open_levels[0] is the root; open_levels[d + 1] is the dict of the heading
    # currently open at depth d. The last entry is where articles are stored.
    open_levels = [document_dict]

    for string in strings_list:
        string = string.strip().replace('\n', '')

        depth = next(
            (d for d, prefix in enumerate(heading_prefixes) if string.startswith(prefix)),
            None,
        )

        if depth is not None:
            if depth >= len(open_levels):
                # The parent level is not open (e.g. a section with no
                # chapter): the heading is ignored.
                continue
            # Opening a heading closes every level at or below its depth.
            del open_levels[depth + 1:]
            children = {}
            open_levels[depth][string] = children
            open_levels.append(children)

        elif string.startswith('Art'):
            # Articles are leaves of the innermost open level. Previously an
            # article inside a subsection REPLACED the subsection's dict with
            # the article text, so only the last article survived there.
            open_levels[-1][string] = None

    return document_dict

# Build the outline from the split fragments and pretty-print it as JSON,
# keeping accented characters as-is instead of \u-escaping them.
document_dict = create_dictionary(splits)
json_like = json.dumps(
    document_dict,
    ensure_ascii=False,
    indent=4,
)
print(json_like)


In [None]:
def _collect_articles(node):
    """Return every article found in the nested outline ``node``, in order.

    A key whose value is ``None`` is an article. A dict value is a heading and
    is searched recursively, whatever its depth. Any other value is collected
    as-is, so a leaf that stores the article text as the value is kept too.
    """
    found = []
    for key, value in node.items():
        if value is None:
            found.append(key)
        elif isinstance(value, dict):
            found.extend(_collect_articles(value))
        else:
            found.append(value)
    return found


# Flat list of the articles in document order. The earlier two-level loop
# stopped at chapter depth and appended whole chapter dicts instead of the
# articles inside them.
articles = _collect_articles(document_dict)

articles

In [None]:
# Display the nested outline returned by create_dictionary for inspection.
document_dict

In [None]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import matplotlib.pyplot as plt

# Location of the regulation PDF
pdf_path = "../data/regulamento-dos-cursos-de-graduacao-da-UFRN-2024.pdf"

# Load the PDF through LangChain
loader = PyPDFLoader(pdf_path)
documents = loader.load()

# Chunk sizes being compared; the overlap is the same for all of them
chunk_sizes = [200, 500, 1000, 2000]
chunk_overlap = 50

# Maps each chunk size to the list of lengths of the chunks it produced
chunk_distributions = {}

for size in chunk_sizes:
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=size,
        chunk_overlap=chunk_overlap,
    )
    chunks = text_splitter.split_documents(documents)
    chunk_lengths = []
    for chunk in chunks:
        chunk_lengths.append(len(chunk.page_content))
    chunk_distributions[size] = chunk_lengths

# One line per chunk size: chunk index on X, chunk length on Y
fig, ax = plt.subplots(figsize=(12, 8))
for size, sizes_list in chunk_distributions.items():
    ax.plot(range(len(sizes_list)), sizes_list, label=f"Tamanho de Chunk = {size}")

ax.set_title("Comparação de Distribuição de Tamanhos de Chunks")
ax.set_xlabel("Índice do Chunk")
ax.set_ylabel("Tamanho do Chunk")
ax.legend()
ax.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()

In [None]:
import numpy as np


# Load the PDF again for this comparison
loader = PyPDFLoader(pdf_path)
documents = loader.load()

# Chunk sizes being compared; the overlap is the same for all of them
chunk_sizes = [200, 500, 1000, 2000]
chunk_overlap = 50

# Total number of chunks produced for each entry of chunk_sizes (same order)
chunk_counts = []

for size in chunk_sizes:
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=size,
        chunk_overlap=chunk_overlap,
    )
    chunks = text_splitter.split_documents(documents)
    chunk_counts.append(len(chunks))

# Bar chart: one bar per chunk size
x = np.arange(len(chunk_sizes))  # bar positions on the X axis
bar_width = 0.6

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(x, chunk_counts, width=bar_width, color='skyblue', edgecolor='black')

# Tick labels, title and axis labels
ax.set_xticks(x)
ax.set_xticklabels([f"{size}" for size in chunk_sizes])
ax.set_title("Comparação do Número de Chunks por Tamanho", fontsize=14)
ax.set_xlabel("Tamanho do Chunk", fontsize=12)
ax.set_ylabel("Quantidade de Chunks", fontsize=12)
ax.grid(axis="y", linestyle="--", alpha=0.7)

# Write each count just above its bar
for position, count in zip(x, chunk_counts):
    ax.text(position, count + 1, str(count), ha='center', va='bottom', fontsize=10)

fig.tight_layout()
plt.show()

In [None]:
# Display the text fragments that the next cell turns into Documents.
splits

In [None]:
from langchain.schema import Document
import re

# One Document per article, tagged with the headings it sits under.
documents = []
# Hierarchy context of the fragment being processed. It is updated as headings
# are met and snapshotted into each article's Document.
metadata = {}

# (text prefix, metadata key) for each heading level, outermost first.
heading_levels = [
    ('TÍTULO', 'Título'),
    ('CAPÍTULO', 'Capítulo'),
    ('Seção', 'Seção'),
    ('Subseção', 'Subseção'),
]

for text in splits:
    level = next(
        (i for i, (prefix, _) in enumerate(heading_levels) if text.startswith(prefix)),
        None,
    )

    if level is not None:
        prefix, key = heading_levels[level]
        # "<prefix> <roman numeral> <description on the same line>"
        match = re.match(prefix + r"\s+([IVXLCDM]+)\s+(.*)", text)
        if match:
            metadata[key] = match.group(1).strip() + " - " + match.group(2).strip()
        else:
            # Heading that does not follow the "<numeral> <description>" shape:
            # keep whatever follows the keyword instead of failing on a None
            # match.
            metadata[key] = " ".join(text[len(prefix):].split())

        # A new heading closes every deeper level that was open. A subsection
        # is now recorded under its own 'Subseção' key; it used to overwrite
        # the enclosing 'Seção' entry.
        for _, deeper_key in heading_levels[level + 1:]:
            metadata.pop(deeper_key, None)

    elif text.startswith('Art.'):
        match = re.match(r"Art\.\s*(\d+)", text)
        if match is None:
            # "Art." not followed by a number carries no article number to
            # index; skip the fragment rather than crash.
            continue

        metadata["Artigo"] = match.group(1)

        # Article body: everything after the number, without ordinal signs and
        # newlines, and without the leading " ." left over from the heading.
        art_content = text[match.end():]
        art_content = art_content.replace('º', '').replace('\n', '').lstrip(" .").strip()

        # Pass a copy: `metadata` keeps changing as the loop advances, and a
        # shared dict could leave every Document showing the final state.
        documents.append(
            Document(page_content=art_content, metadata=dict(metadata))
        )

documents

In [None]:
# Number of article Documents built in the previous cell.
len(documents)

In [None]:
import os
from langchain_aws import BedrockEmbeddings
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

# Name of the Qdrant collection that holds the regulation's articles.
collection_name = "regulamento_dos_cursos_de_graduacao_da_UFRN"

embeddings = BedrockEmbeddings(
    region_name='us-east-1',
)

# Connection settings come from the environment; a missing variable raises
# KeyError here rather than failing later with a less obvious error.
client = QdrantClient(
    location=os.environ["VECTOR_STORE_URL"],
    api_key=os.environ["VECTOR_STORE_API_KEY"],
)

# Create the collection only when it does not exist yet, so the cell can be
# re-run safely and also works against an empty Qdrant instance. The vector
# size is taken from an actual embedding so it always matches the model in use.
if not client.collection_exists(collection_name):
    client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(
            size=len(embeddings.embed_query("dimension probe")),
            distance=Distance.COSINE,
        ),
    )

vector_store = QdrantVectorStore(
    client=client,
    collection_name=collection_name,
    embedding=embeddings,
)

In [None]:
# Add the article Documents to the vector store.
# NOTE(review): presumably re-running this cell inserts the same documents
# again — confirm whether duplicates matter for this collection.
vector_store.add_documents(documents)