<a href="https://colab.research.google.com/github/joaowinderfeldbussolotto/assistente-ppc-ciencia-da-computacao/blob/main/preprocessing_pdfs_chunk_metadata_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -qU openparse[ml] html2text langchain langchain_groq langchain_huggingface faiss-cpu langchain-pinecone pinecone-notebooks langchain_mistralai
!mkdir -p data
!wget https://www.uffs.edu.br/atos-normativos/ppc/ccccch/2017-0002/@@download/documento_historico -O "data/ppc_2018.pdf"

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/56.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.5/56.5 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.8/50.8 kB[0m [31m965.7 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m40.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m40.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.5/106.5 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m3.4 MB/s[0m eta [36m0:00:0

In [None]:
from google.colab import userdata

class Settings:
  # Central place for the API credentials this notebook needs.
  # Every value is fetched through google.colab's `userdata.get(...)` while the
  # class body executes, so the lookups happen once, when this cell runs.
  HF_TOKEN          = userdata.get('HF_TOKEN')
  # Passed to the Pinecone client constructor further down in the notebook.
  PINECONE_API_KEY  = userdata.get('PINECONE_API_KEY')
  # One of two Groq keys; get_metadata picks between them at random.
  GROQ_API_KEY      = userdata.get('GROQ_API_KEY')
  # Used by get_metadata when a Mistral-family model id is requested.
  MISTRAL_AI_KEY    = userdata.get('MISTRAL_AI_KEY')
  # Second Groq key (see GROQ_API_KEY).
  GROQ_API_KEY2     = userdata.get('GROQ_API_KEY2')


settings = Settings()

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!cp /content/drive/My\ Drive/tcc/PPC_2024.pdf data/ppc_2024.pdf

## Utils

In [None]:
from langchain_core.load import dumpd, dumps, load, loads
import json
import time
import hashlib
import os


def save_document_locally(documents, version):
    """Serialize *documents* to ``ppc{version}_documents.json``.

    The objects are turned into a pretty-printed JSON string with
    ``langchain_core.load.dumps`` (non-ASCII characters kept as-is) and the
    string is written, UTF-8 encoded, to a file in the current working
    directory.

    Args:
        documents: Serializable LangChain object(s) to persist.
        version: Value interpolated into the output file name.
    """
    target_path = f'ppc{version}_documents.json'
    serialized = dumps(documents, pretty=True, ensure_ascii=False)
    with open(target_path, "w", encoding="utf-8") as out:
        out.write(serialized)

def load_document_locally(version):
    """Load the documents stored in ``ppc{version}_documents.json``.

    The file is parsed as JSON and the resulting structure is revived into
    objects with ``langchain_core.load.load``.

    The file is opened explicitly as UTF-8: ``save_document_locally`` writes
    UTF-8 with ``ensure_ascii=False``, so relying on the platform's default
    encoding could corrupt or fail on accented characters.

    Args:
        version: Value interpolated into the input file name.

    Returns:
        Whatever ``load`` reconstructs from the stored JSON.
    """
    input_file_name = f'ppc{version}_documents.json'
    with open(input_file_name, "r", encoding="utf-8") as fp:
        doc = json.load(fp)
    return load(doc)


def save_dict_to_json_on_drive(documents, drive_folder_path, drive_file_name):
    """Dump *documents* as JSON into *drive_folder_path* under a prefixed name.

    The target folder is created when missing. The file name is prefixed with
    the first six hex digits of the MD5 of the current Unix timestamp (in
    seconds), then reduced to its base name before being joined to the folder.
    The documents are converted with ``langchain_core.load.dumpd``, echoed to
    stdout, and written as UTF-8 JSON with non-ASCII characters preserved.

    Args:
        documents: Serializable LangChain object(s) to persist.
        drive_folder_path: Destination directory.
        drive_file_name: Base file name; the hash prefix is prepended to it.
    """
    os.makedirs(drive_folder_path, exist_ok=True)

    # Short (6 character) hash derived from the current time, used as prefix.
    epoch_seconds = str(int(time.time()))
    prefix = hashlib.md5(epoch_seconds.encode()).hexdigest()[:6]

    prefixed_name = os.path.basename(f'{prefix}_{drive_file_name}')
    drive_path = f'{drive_folder_path}/{prefixed_name}'

    payload = dumpd(documents)
    print(payload)
    with open(drive_path, 'w', encoding='utf-8') as handle:
        json.dump(payload, handle, ensure_ascii=False)
    print(f'File saved to: {drive_path}')

# PDF Extraction and chunking

### Semantic chunking

In [None]:
from typing import List, Union
import numpy as np
from openparse.schemas import Node
from openparse.processing.basic_transforms import ProcessingStep
from langchain_huggingface import HuggingFaceEmbeddings

def cosine_similarity(
    a: Union[np.ndarray, List[float]], b: Union[np.ndarray, List[float]]
) -> float:
    """Return the cosine similarity between vectors *a* and *b*.

    Args:
        a: First vector.
        b: Second vector, same length as *a*.

    Returns:
        The dot product divided by the product of the Euclidean norms, as a
        plain ``float``. When either vector has zero norm the similarity is
        undefined; ``0.0`` is returned instead of dividing by zero (which
        would yield NaN and a RuntimeWarning).
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        return 0.0
    return float(np.dot(a, b) / norm_product)

class HFEmbeddings:
    """Embedding client backed by ``HuggingFaceEmbeddings``."""

    def __init__(
        self,
        model_name: str = "sentence-transformers/all-mpnet-base-v2",
        batch_size: int = 256,
        device: str = "cuda",
    ):
        """
        Used to generate embeddings for Nodes using HuggingFace models.

        Args:
            model_name (str): The HuggingFace model to use.
            batch_size (int): The number of texts to process in each batch.
            device (str): The device to run the model on ('cpu' or 'cuda').
                A CUDA device is silently replaced by 'cpu' (with a warning)
                when torch reports that CUDA is unavailable, instead of
                failing while the model is being loaded.
        """
        self.model_name = model_name
        self.batch_size = batch_size
        self.device = self._resolve_device(device)
        self.client = self._create_client(self.device, batch_size)

    def embed_many(self, texts: List[str]) -> List[List[float]]:
        """
        Generate embeddings for a list of texts.

        Delegates to the client's ``embed_documents``; batching is governed by
        the ``batch_size`` handed to the client at construction time.

        Args:
            texts (list[str]): The list of texts to embed.

        Returns:
            List[List[float]]: A list of embeddings.
        """
        return self.client.embed_documents(texts)

    @staticmethod
    def _resolve_device(device: str) -> str:
        """Return *device*, or ``'cpu'`` when a CUDA device was requested but is unavailable."""
        if not device.startswith("cuda"):
            return device
        try:
            import torch
        except ImportError:
            # Without torch we cannot probe; let the client report any problem.
            return device
        if torch.cuda.is_available():
            return device

        import warnings

        warnings.warn(
            f"Device '{device}' requested but CUDA is not available; "
            "falling back to 'cpu'.",
            RuntimeWarning,
            stacklevel=3,
        )
        return "cpu"

    def _create_client(self, device: str, batch_size: int):
        """Build the ``HuggingFaceEmbeddings`` client with normalized output vectors.

        Raises:
            ImportError: Re-raised with an installation hint when constructing
                the client fails on a missing import.
        """
        try:
            model_kwargs = {'device': device}
            encode_kwargs = {
                'batch_size': batch_size,
                'normalize_embeddings': True
            }
            return HuggingFaceEmbeddings(
                model_name=self.model_name,
                model_kwargs=model_kwargs,
                encode_kwargs=encode_kwargs
            )
        except ImportError as err:
            raise ImportError(
                "You need to install the sentence-transformers package to use this feature. "
                "Run: pip install sentence-transformers"
            ) from err


class CustomCombineNodesSemantically(ProcessingStep):
    """
    Processing step that merges neighbouring nodes whose embeddings are similar.
    """

    def __init__(
        self,
        embedding_client: HFEmbeddings,
        min_similarity: float,
        max_tokens: int,
    ):
        """Store the embedding client and the two merge thresholds.

        Args:
            embedding_client: Object exposing ``embed_many(texts)``.
            min_similarity: Minimum cosine similarity for two neighbours to merge.
            max_tokens: Upper bound on the combined token count of a merged pair.
        """
        self.embedding_client = embedding_client
        self.min_similarity = min_similarity
        self.max_tokens = max_tokens

    def process(self, nodes: List[Node]) -> List[Node]:
        """Merge adjacent nodes until a full pass produces no merge.

        Each pass sorts the nodes, embeds every node text once, and walks the
        list comparing each node with its successor. A pair is merged when its
        cosine similarity reaches ``min_similarity`` and its combined token
        count does not exceed ``max_tokens``. Within a pass a merged node keeps
        the embedding of its first constituent; fresh embeddings are only
        computed at the start of the next pass.
        """
        merged_any = True
        while merged_any:
            merged_any = False
            nodes = sorted(nodes)
            vectors = self.embedding_client.embed_many([n.text for n in nodes])

            idx = 0
            while idx < len(nodes) - 1:
                score = cosine_similarity(vectors[idx], vectors[idx + 1])
                fits_budget = (
                    nodes[idx].tokens + nodes[idx + 1].tokens <= self.max_tokens
                )
                should_merge = score >= self.min_similarity and fits_budget

                if not should_merge:
                    idx += 1
                    continue

                # Fold the successor into the current node and stay on the
                # same index so the merged node is compared with its new
                # neighbour.
                nodes[idx] = nodes[idx] + nodes[idx + 1]
                del nodes[idx + 1]
                del vectors[idx + 1]
                merged_any = True

        return nodes

    def _get_node_similarities(self, nodes: List[Node]) -> List[float]:
        """
        Return, for every node, its similarity to the preceding node
        (the first node gets 0).
        """
        vectors = self.embedding_client.embed_many([n.text for n in nodes])
        pairwise = [
            cosine_similarity(previous, current)
            for previous, current in zip(vectors, vectors[1:])
        ]
        return [0] + pairwise

In [None]:
from openparse import processing

from openparse.processing.basic_transforms import (
    CombineBullets,
    CombineHeadingsWithClosestText,
    CombineNodesSpatially,
    ProcessingStep,
    RemoveFullPageStubs,
    RemoveMetadataElements,
    RemoveNodesBelowNTokens,
    RemoveRepeatedElements,
    RemoveTextInsideTables,
)

from openparse.processing.semantic_transforms import (
    # CombineNodesSemantically,
    EmbeddingModel,
    OpenAIEmbeddings,
)

class CustomSemanticIngestionPipeline(processing.IngestionPipeline):
    """
    Ingestion pipeline that cleans up parsed nodes and then merges them
    semantically using HuggingFace embeddings.
    """

    def __init__(
        self,
        # openai_api_key: str,
        model = "sentence-transformers/all-mpnet-base-v2",
        min_tokens: int = 64,
        max_tokens: int = 1024,
    ) -> None:
        """Assemble the ordered list of transformations.

        Args:
            model: HuggingFace model name handed to ``HFEmbeddings``.
            min_tokens: Accepted but currently unused; the
                ``RemoveNodesBelowNTokens`` steps are disabled.
            max_tokens: Token budget for the semantic merge; the first merge
                pass uses half of it, the second the full value.
        """
        # The OpenAI embedding client is intentionally not used here.
        embedder = HFEmbeddings(model_name=model)

        structural_steps = [
            RemoveTextInsideTables(),
            RemoveFullPageStubs(max_area_pct=0.35),
            # mostly aimed at combining bullets and weird formatting
            CombineNodesSpatially(
                x_error_margin=10,
                y_error_margin=2,
                criteria="both_small",
            ),
            CombineHeadingsWithClosestText(),
            CombineBullets(),
            RemoveMetadataElements(),
            RemoveRepeatedElements(threshold=2),
            CombineBullets(),
        ]

        # Two semantic passes sharing one client: half budget, then full budget.
        semantic_steps = [
            CustomCombineNodesSemantically(
                embedding_client=embedder,
                min_similarity=0.4,
                max_tokens=token_budget,
            )
            for token_budget in (max_tokens // 2, max_tokens)
        ]

        self.transformations = structural_steps + semantic_steps


In [None]:
semantic_pipeline = CustomSemanticIngestionPipeline(
model = "mixedbread-ai/mxbai-embed-large-v1"
)

  from tqdm.autonotebook import tqdm, trange


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/266 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/114k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/677 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/670M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx

In [None]:
import openparse
import html2text
from langchain_core.documents import Document
from uuid import uuid4




def parse_document_to_documents(file_path, source_name):
    """Parse a PDF with OpenParse and wrap each non-empty node in a ``Document``.

    Tables are extracted with the ``pymupdf`` algorithm and rendered as
    markdown. The custom semantic pipeline is not passed to the parser, so
    the parser's default processing applies.

    Every produced chunk gets one UUID which is stored both in the returned
    ``ids`` list and in the chunk's ``metadata['id']``, so the two always line
    up (previously two different UUIDs were generated per chunk).

    Args:
        file_path: Path of the PDF to parse.
        source_name: Label stored under ``metadata['source']`` of every chunk.

    Returns:
        A tuple ``(documents, ids, parsed)``: the list of ``Document`` chunks,
        the list of their ids in the same order, and the raw object returned
        by ``parser.parse``.
    """
    parser = openparse.DocumentParser(
        table_args={"parsing_algorithm": "pymupdf", "table_output_format": "markdown"}
    )
    parsed = parser.parse(file_path)

    documents = []
    ids = []

    for node in parsed.nodes:
        text = node.text
        if not text:
            # Nodes without text would produce empty chunks; skip them.
            continue

        chunk_id = str(uuid4())
        ids.append(chunk_id)
        documents.append(
            Document(
                metadata={
                    'source': source_name,
                    'id': chunk_id,
                    # Page of the node's first bounding box, shifted by one
                    # (presumably from a 0-based index to a 1-based page
                    # number; confirm against OpenParse).
                    'page': node.bbox[0].page + 1,
                },
                page_content=text,
            )
        )

    return documents, ids, parsed


In [None]:
ppc_2018_path = 'data/ppc_2018.pdf'
ppc_2018_documents, ppc_2018_ids, ppc_2018_parsed_nodes = parse_document_to_documents(ppc_2018_path, 'ppc2018')

In [None]:
ppc_2024_chunks, ppc_2024_ids, ppc_2024_parsed_nodes = parse_document_to_documents('data/ppc_2024.pdf', 'ppc2024')

In [None]:
pdf = openparse.Pdf(ppc_2018_path)
pdf.export_with_bboxes(
    ppc_2018_parsed_nodes.nodes,
    output_pdf="./data/ppc_2018-markedup.pdf"
)

In [None]:
pdf = openparse.Pdf('data/ppc_2024.pdf')
pdf.export_with_bboxes(
    ppc_2024_parsed_nodes.nodes,
    output_pdf="./data/ppc_2024-marked-up.pdf"
)

## Chunk metadata extraction

In [None]:
from langchain_core.pydantic_v1 import BaseModel, Field
from typing import List

class MetadadosChunk(BaseModel):
    # Structured output schema for the metadata an LLM extracts from one chunk.
    # All three fields are plain strings; the Field descriptions are the
    # (Portuguese) instructions attached to each field.
    # NOTE(review): documented with `#` comments on purpose: a class docstring
    # would presumably end up in the schema description sent to the model;
    # confirm before converting these comments to a docstring.

    # What the chunk is about (e.g. a course syllabus, a regulation, ...).
    topico: str = Field(description="Identifique do que se trata o texto: ementa de XXX, grade curricular , regulamento XXX, regras, laboratório, objetivos, infraestrutura, corpo docente, etc. Exemplos: ementa de Grafos, Instrução de validação de disciplinas, etc")
    # Comma-separated keywords for the chunk's central concepts.
    palavras_chave: str = Field(description=" Palavras-chave principais que representam os conceitos centrais do chunk, separadas por vírgula. Caso seja uma estrutura tabular, mencione absolutamente todos os tópicos que a tabela aborda")
    # Comma-separated questions a student might ask that this chunk answers.
    possiveis_perguntas: str = Field(description="Lista de possíveis perguntas relevantes que alunos fariam e que podem ser respondidas com base no conteúdo deste chunk, separadas por vírgula. Tente 4 a 6 perguntas")


In [None]:
from langchain_groq import ChatGroq
from langchain_mistralai import ChatMistralAI
from time import sleep
import random

def get_metadata(chunk, model_id='llama3-groq-70b-8192-tool-use-preview', count=0):
    """Ask an LLM for structured metadata (``MetadadosChunk``) about *chunk*.

    Model ids starting with ``open-mixtral``, ``mistral`` or ``open-mistral``
    are served by ``ChatMistralAI``; every other id goes to ``ChatGroq`` using
    one of the two configured Groq keys chosen at random. The API key is never
    printed, so the secret does not end up in the notebook output.

    On any exception the call is retried after a 40 second pause, up to three
    retries; once those are exhausted a placeholder ``MetadadosChunk`` is
    returned instead of raising.

    Args:
        chunk: Text of the chunk to describe.
        model_id: Identifier of the chat model to use.
        count: Number of retries already performed (used by the recursion).

    Returns:
        The ``MetadadosChunk`` produced by the model, or the placeholder.
    """
    try:
        if model_id.startswith(('open-mixtral', 'mistral', 'open-mistral')):
            model = ChatMistralAI(model = model_id, api_key = settings.MISTRAL_AI_KEY, temperature = 0.2, max_retries = 4)
        else:
            # Pick one of the two Groq keys at random (presumably to spread
            # the load across rate limits). Do not log the key itself.
            groq_api_key = random.choice([settings.GROQ_API_KEY, settings.GROQ_API_KEY2])
            model = ChatGroq(model=model_id, api_key=groq_api_key)

        prompt = f"""
        Você é um assistente especializado no Plano Pedagógico do Curso (PPC) de Ciência da Computação da UFFS.
        Sua função é fornecer metadados sobre trechos extraídos do PPC.
        Os tópicos comumente são sobre: grades curriculares, ementas, regulamentos, regras, objetivos, infraestrutura, corpo docente e assuntos relacionados.
        Analise o seguinte texto e forneça os seguintes metadados:
        - Tópico
        - Palavras-chave
        - Possiveis perguntas
        Texto :  {chunk}
        """

        structured_llm = model.with_structured_output(MetadadosChunk)
        return structured_llm.invoke(prompt)

    except Exception as e:
        if count < 3:
            print(f"Erro ao tentar invocar o modelo: {e}. Tentativa {count + 1} de 3.")
            sleep(40)
            return get_metadata(chunk, model_id, count + 1)

        print("Falha em todas as tentativas. Retornando metadados padrão.")
        return MetadadosChunk(
            topico="Tópico desconhecido",
            palavras_chave="Desconhecido",
            possiveis_perguntas="Desconhecida",
        )


In [None]:
def adapt_metadata(document: Document, metadata) -> Document:
    """Return a new ``Document`` with *metadata* embedded in its text and metadata.

    All entries of *metadata* except ``id``, ``source`` and ``page`` are
    rendered as ``key: value`` lines inside a ``<metadata>...</metadata>``
    header that is prepended to the page content. The new document's metadata
    is *metadata* overlaid with the original document's metadata (the
    document's values win on key clashes).

    The *metadata* argument is left untouched; a merged copy is built instead
    of updating the caller's dict in place.

    Args:
        document: Source document whose content and metadata are reused.
        metadata: Mapping of extra metadata extracted for the document.

    Returns:
        A new ``Document``; *document* itself is not modified.
    """
    reserved_keys = ('id', 'source', 'page')
    header_lines = [
        f"{key}: {value}"
        for key, value in metadata.items()
        if key not in reserved_keys
    ]
    # Trailing commas/newlines are stripped from the assembled header so it
    # ends cleanly before the closing tag.
    header = ("<metadata>\n" + ",\n".join(header_lines)).rstrip(",\n")
    new_page_content = header + "\n</metadata>\n\n" + document.page_content

    merged_metadata = {**metadata, **document.metadata}

    print(merged_metadata)

    return Document(page_content=new_page_content, metadata=merged_metadata)



In [None]:
from time import sleep

def get_metadata_for_documents(documents, model_id='open-mixtral-8x22b'):
    """Enrich every document with LLM-extracted metadata.

    For each document the function pauses five seconds, requests metadata for
    its page content via ``get_metadata``, and folds the result into a new
    document with ``adapt_metadata``.

    Args:
        documents: Iterable of documents exposing ``page_content``.
        model_id: Model identifier forwarded to ``get_metadata``.

    Returns:
        List of enriched documents, in input order.
    """
    enriched = []

    for doc in documents:
        sleep(5)
        chunk_metadata = get_metadata(doc.page_content, model_id).dict()
        enriched.append(adapt_metadata(doc, chunk_metadata))

    return enriched


In [None]:
# metadata_documents_2024 = get_metadata_for_documents(ppc_2024_chunks)
# save_dict_to_json_on_drive(metadata_documents_2024, '/content/drive/MyDrive/tcc/preprocessing', f'ppc{2024}_documents.json')

In [None]:
# metadata_documents_2018 = get_metadata_for_documents(ppc_2018_documents)
# save_dict_to_json_on_drive(metadata_documents_2018, '/content/drive/MyDrive/tcc/preprocessing', f'ppc{2018}_documents.json')

## Data ingestion into the vector store

In [None]:
import json
from langchain_core.load import dumpd, dumps, load, loads

def load_files(output_file_name):
    """Load serialized documents from the JSON file at *output_file_name*.

    The file is parsed as JSON and revived into objects with
    ``langchain_core.load.load``. It is read explicitly as UTF-8 because the
    stored text contains non-ASCII characters; the platform default encoding
    is not guaranteed to decode them correctly.

    Args:
        output_file_name: Path of the JSON file to read.

    Returns:
        Whatever ``load`` reconstructs from the stored JSON.
    """
    with open(output_file_name, "r", encoding="utf-8") as fp:
        doc = json.load(fp)
    return load(doc)

documents_2018 = load_files('/content/drive/MyDrive/tcc/preprocessing/semantic/4c1807_ppc2018_documents.json')
documents_2024 = load_files('/content/drive/MyDrive/tcc/preprocessing/semantic/7f0876_ppc2024_documents.json')


  return load(doc)


In [None]:
len(documents_2018), len(documents_2024)

(510, 413)

In [None]:
class EmbeddingModelSpecs:
    """Name and dimension of the base embedding model."""

    def __init__(self):
        self.name, self.dimension = (
            'sentence-transformers/multi-qa-mpnet-base-cos-v1',
            768,
        )



embeddings_model = EmbeddingModelSpecs()

In [None]:
class FineTunedEmbeddingModelSpecs:
    """Name and dimension of the fine-tuned embedding model.

    ``dimension`` is the value used as the vector-store index dimension.
    """

    def __init__(self):
        self.name, self.dimension = (
            'winderfeld/cc-uffs-ppc-ft-test-multiqa',
            768,
        )



finetune_embeddings_model = FineTunedEmbeddingModelSpecs()

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name=finetune_embeddings_model.name, model_kwargs={"trust_remote_code":True})

  from tqdm.autonotebook import tqdm, trange


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/202 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/622 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [None]:
!pip install -qU langchain-mistralai langchain_community

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m54.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m74.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.5/49.5 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from uuid import uuid4

from langchain_core.documents import Document

import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS

index = faiss.IndexFlatL2(len(embeddings.embed_query("hello world")))

In [None]:
import json
from langchain_core.load import dumpd, dumps, load, loads

def load_files(version):
    """Load the documents stored in ``ppc{version}_documents.json``.

    The file is parsed as JSON and revived into objects with
    ``langchain_core.load.load``. It is read explicitly as UTF-8 because the
    stored text contains non-ASCII characters; the platform default encoding
    is not guaranteed to decode them correctly.

    Args:
        version: Value interpolated into the input file name.

    Returns:
        Whatever ``load`` reconstructs from the stored JSON.
    """
    input_file_name = f'ppc{version}_documents.json'
    with open(input_file_name, "r", encoding="utf-8") as fp:
        doc = json.load(fp)
    return load(doc)

metadata_documents_2018 = load_files(2018)
metadata_documents_2024 = load_files(2024)


  return load(doc)


In [None]:
ppc_2018_ids = [document.metadata.get('id') for document in metadata_documents_2018]
ppc_2024_ids = [document.metadata.get('id') for document in metadata_documents_2024]

documents = metadata_documents_2018 + metadata_documents_2024
uuids = ppc_2018_ids + ppc_2024_ids

In [None]:
len(documents)

923

In [None]:
documents[800].page_content

'<metadata>\ntopico: Infraestrutura do Laboratório de Programação,\npalavras_chave: Laboratório de Programação I, II, III, IV, V, Docente responsável, Discentes por turma, Área, Computadores, Descrição, Interface de rede, Interface para vídeo, Interfaces USB, Interface serial,\npossiveis_perguntas: Quantos laboratórios de programação existem? Quem é o docente responsável pelos laboratórios? Quantos discentes podem participar de uma turma? Qual é a área do laboratório? Quais são as especificações dos computadores disponíveis? Qual é a interface de rede dos computadores? Quais são as interfaces para vídeo dos computadores? Quantas interfaces USB possuem os computadores? Qual é a interface serial dos computadores?\n</metadata>\n\n| programação. |  |  |\n|---|---|---|\n| LABORATÓRIO DE PROGRAMAÇÃO I, II, III, IV e V |   |   |\n| Docente responsável: Docente das disciplinas, conforme oferta do semestre: Algoritmos e Programação, Estruturas de Dados I, Estruturas de Dados II, Programação Ori

In [None]:
import getpass
import os
import time

from pinecone import Pinecone, ServerlessSpec
pc = Pinecone(api_key=settings.PINECONE_API_KEY)

In [None]:
import time

# Name of the Pinecone index that receives the PPC chunks.
index_name = "index-ppc-markdown-ids-metadata-in-content-v4"
# Names of the indexes that already exist for this Pinecone client.
existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

# Create the index only when it is missing, so re-running the cell is harmless.
if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        # Must match the size of the vectors produced by the embedding model.
        dimension=finetune_embeddings_model.dimension,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    # Poll once per second until the new index reports itself as ready.
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

# Handle to the index; note this rebinds the module-level name `index`.
index = pc.Index(index_name)

In [None]:
from langchain_pinecone import PineconeVectorStore

vector_store = PineconeVectorStore(index=index, embedding=embeddings)

In [None]:
def add_documents_to_vector_store(vector_store, documents, uuids):
    """Insert *documents* into *vector_store* under *uuids* and print the result.

    Args:
        vector_store: Object exposing ``add_documents(documents=..., ids=...)``.
        documents: Documents to insert.
        uuids: Ids to store the documents under, in the same order.
    """
    inserted = vector_store.add_documents(documents=documents, ids=uuids)
    print(inserted)

In [None]:
add_documents_to_vector_store(vector_store, metadata_documents_2018, ppc_2018_ids)

In [None]:
add_documents_to_vector_store(vector_store, metadata_documents_2024, ppc_2024_ids)

In [None]:
# Smoke test: retrieve the six chunks of the 2018 PPC most similar to a sample
# question and assemble them into one context string.
relevant_docs = vector_store.similarity_search("prova toefl conta como aacc?", k = 6, filter = {'source': 'ppc2018'})
print(len(relevant_docs))
# Keep everything after the first closing metadata tag. Splitting at most once
# and taking the last piece preserves the full body even if the tag text shows
# up again inside it, and falls back to the whole content (instead of raising
# IndexError) when a chunk carries no metadata header.
context = "\n".join([doc.page_content.split('</metadata>', 1)[-1] for doc in relevant_docs])
context

6


'\n\n|  |  |  |  | ção. |\n|---|---|---|---|---|\n|   |   |   |   | ção. |\n| 5.4 | Cursos de idiomas | 60 | ¼ do total de horas cursadas | Certificado. |\n| 5.5 | Prova TOEFL | 4 | 4 horas | Certificado |\n| 5.6 | Ouvinte defesas de TCC curso de Ciência da Com- putação | 30 | 1 hora por de- fesa | Declaração do coordenador de curso ou professor responsável pela disciplina de TCC |\n| 5.7 | Participação da comissão organizadora de eventos do curso | 45 | 15 horas por evento | Declaração do responsável pelo evento. |\n| 5.8 | Auxiliar em eventos da UFFS (ex. mesário do SEPE ou semana acadêmi- ca) | 20 | ¼ do total de horas realiza- das | Declaração da comissão organi- zadora |\n| 5.9 | Demais cursos extra-curri- culares (nos termos do arti- go 8º) | 30 | ¼ do total de horas cursadas | Certificado comprovando a rea- lização e aprovação no curso. |\n| 5.10 | Integrante do Centro Aca- dêmico da Computação. | 60 | total de horas de participação | Atestado da presidência do Centro Acadêmico 