<a href="https://colab.research.google.com/github/joaocabeca2/hybrid_search_pdfs/blob/main/hybrid_search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install langchain_community
!pip install langchain_text_splitters
!pip install lancedb
!pip install langchain_google_genai
!pip install pandas
!pip install unidecode
!pip install python-dotenv
!pip install tantivy
!pip install pypdf

In [2]:
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters.character import RecursiveCharacterTextSplitter
import lancedb
from lancedb.rerankers import LinearCombinationReranker
from lancedb.pydantic import LanceModel, Vector
from lancedb.embeddings import get_registry
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
import pandas as pd
import re
from unidecode import unidecode
from getpass import getpass
from dotenv import load_dotenv
from google.colab import userdata

**Configurações iniciais**

In [3]:
# Ask for the Gemini API key only when it is not already exported.
# The prompt text must never contain the key itself: the prompt is stored in the
# notebook source and echoed into the cell output.
if "GOOGLE_API_KEY" not in os.environ:
    os.environ["GOOGLE_API_KEY"] = getpass("Informe a sua GOOGLE_API_KEY: ")

# Key as stored under 'GOOGLE_API_KEY' in the Colab userdata store.
api_key = userdata.get('GOOGLE_API_KEY')

# Embedding function from the LanceDB registry; the table schema is built from it.
model = get_registry().get("gemini-text").create(name="models/embedding-001")

class Schema(LanceModel):
    """Row schema of the LanceDB table that stores the PDF chunks.

    ``text`` is declared as the source field of the embedding function
    ``model`` and ``vector`` as its vector field, sized with ``model.ndims()``.
    NOTE(review): presumably LanceDB fills ``vector`` by embedding ``text``
    when rows are added -- confirm against the LanceDB embedding API docs.
    """

    # Chunk text; source field of the embedding function.
    text: str = model.SourceField()
    # Embedding vector with model.ndims() dimensions.
    vector: Vector(model.ndims()) = model.VectorField()
    # Page value taken from the chunk metadata when the dataframe is built.
    page: int
    # Sequential position of the chunk (assigned by create_index_chunks).
    index: int

[REDACTED]··········


In [13]:
# Prompt for the key only if it is missing from the environment. The prompt is a
# neutral label: a real key must never be written into the notebook source.
if "GOOGLE_API_KEY" not in os.environ:
    os.environ["GOOGLE_API_KEY"] = getpass("Informe a sua GOOGLE_API_KEY: ")

**Funções**

In [4]:
def create_index_chunks(table):
    """Add a sequential ``index`` column (0, 1, 2, ...) to ``table``.

    The numbering follows the order of ``table['text']``. ``table`` is modified
    in place and the same object is returned.
    """
    total_rows = len(table['text'])
    table['index'] = list(range(total_rows))
    return table

def preprocessar_texto(texto):
    """Normalise raw chunk text extracted from the PDF.

    Steps, in order: newline runs become one space; runs of two or more
    whitespace characters become one space; form feeds are dropped; tabs become
    spaces; a hyphen followed by whitespace is removed (re-joining words split
    at a line break); whitespace before ``. , ; ? !`` is removed; the text is
    transliterated with ``unidecode``; underscores are removed; dotted
    numbering such as ``1.2`` or ``3.4.5`` (plus trailing whitespace) is
    removed. The result is stripped and lower-cased.
    """
    # Flatten line breaks first, then squeeze the whitespace they leave behind.
    cleaned = re.sub(r'\s{2,}', ' ', re.sub(r'\n+', ' ', texto))
    # Page breaks vanish; tabs turn into ordinary spaces.
    cleaned = cleaned.replace('\f', '').replace('\t', ' ')
    # Re-join hyphenated line breaks, then glue punctuation to the preceding word.
    for pattern, replacement in ((r'-\s+', ''), (r'\s+([.,;?!])', r'\1')):
        cleaned = re.sub(pattern, replacement, cleaned)
    # Strip accents by transliterating.
    cleaned = unidecode(cleaned)
    # Drop underscore rules and dotted section numbering.
    for pattern in (r'_+', r'\d+(\.\d+)+\s*'):
        cleaned = re.sub(pattern, '', cleaned)

    return cleaned.strip().lower()

def create_full_text_search_index(table):
    """Create (or replace) the full-text-search index on the ``text`` column.

    Returns ``table`` on success. Failures are reported with ``print`` and the
    function returns ``None`` instead of raising.
    """
    try:
        table.create_fts_index(['text'], replace=True)
    except ValueError as err:
        print(f'Não foi possível criar o indice fts: {err}')
    except Exception as err:
        print(f'Erro desconhecido: {err}')
    else:
        return table
    return None

def semantic_search(query, table, reranker, k=4):
    """Run a hybrid search on ``table`` and return the query builder.

    Args:
        query: Text to search for.
        table: Table object exposing ``search(...)``; the search is issued with
            ``query_type='hybrid'`` against the ``vector`` column.
        reranker: Reranker passed to ``.rerank(...)``.
        k: Maximum number of results requested via ``.limit(k)`` (default 4).

    Returns:
        The object returned by the ``search().rerank().limit()`` chain; the
        caller materialises it (e.g. with ``.to_pandas()``).

    Raises:
        Exception: wraps any error raised while building the search; the
            original error is kept as ``__cause__``.
    """
    try:
        # Honour ``k`` instead of a hard-coded limit.
        return (
            table.search(query, query_type='hybrid', vector_column_name='vector')
            .rerank(reranker=reranker)
            .limit(k)
        )
    except Exception as e:
        raise Exception(f'Não foi possível realizar a busca hibrida: {e}') from e

def read_file(path_file):
    """Load a PDF file with ``PyPDFLoader`` and return the loaded documents.

    Args:
        path_file: Path to the file; must end in ``.pdf`` (case-insensitive).

    Returns:
        Whatever ``PyPDFLoader(path_file).load()`` returns.

    Raises:
        ValueError: if ``path_file`` is not a ``.pdf`` path. Without this guard
            the function would fail on an unbound loader variable.
    """
    if not path_file.lower().endswith('.pdf'):
        raise ValueError(f'Formato de arquivo não suportado (apenas .pdf): {path_file}')
    return PyPDFLoader(path_file).load()

def create_chunks(docs):
    """Split ``docs`` into chunks of at most 400 characters with no overlap.

    Uses ``RecursiveCharacterTextSplitter.split_documents`` and returns its
    result unchanged.
    """
    splitter_options = {'chunk_size': 400, 'chunk_overlap': 0}
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(docs)

def create_lance_table(chunks, table_name, df, schema):
    """Create a LanceDB table in ``~/langchain`` and load ``df`` into it.

    Args:
        chunks: Not used by this function; kept for the existing call sites.
        table_name: Name of the table, created with ``mode="overwrite"``.
        df: Data added to the table after creation.
        schema: Schema passed to ``create_table``.

    Returns:
        The table object, after ``df`` has been added.
    """
    connection = lancedb.connect("~/langchain")
    lance_table = connection.create_table(table_name, schema=schema, mode="overwrite")
    lance_table.add(df)
    return lance_table

def create_genai_llm(api_key):
    """Build the ``gemini-1.5-flash`` chat model (temperature 0.2) for ``api_key``."""
    llm_settings = {
        'model': 'gemini-1.5-flash',
        'temperature': 0.2,
        'google_api_key': api_key,
    }
    return ChatGoogleGenerativeAI(**llm_settings)


**Transformando o arquivo em chunks e preparando os dados**

In [None]:
docs = read_file('./ia.pdf')

# Fail early with a clear message: everything below needs at least one document.
# (Previously an empty result skipped the dataframe creation and the cell died
# later with a NameError on `df`.)
if not docs:
    raise ValueError("Nenhum conteúdo foi carregado de './ia.pdf'")

# Split the documents into chunks.
chunks = create_chunks(docs)
texts = [chunk.page_content for chunk in chunks]
pages = [chunk.metadata['page'] for chunk in chunks]

# Dataframe with one row per chunk: cleaned text plus the page it came from.
df = pd.DataFrame({'text': texts, 'page': pages})
df['text'] = df['text'].apply(preprocessar_texto)

# create_index_chunks adds the 'index' column in place, so table_doc is df itself.
table_doc = create_index_chunks(df)
table_doc.head()


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.



In [19]:
# Copy the value stored under 'GOOGLE_API_KEY' in the Colab userdata store into
# the process environment, where it is later read back with os.getenv.
os.environ["GOOGLE_API_KEY"] = userdata.get('GOOGLE_API_KEY')

'[REDACTED]'

**Criando uma tabela LanceDB e estabelecendo um índice para o *full-text search***

In [20]:
# Create the 'lancetb' table (the helper uses mode="overwrite") with the Schema
# model and load the chunk dataframe into it.
table = create_lance_table(chunks, 'lancetb', df, schema=Schema)

# Build the full-text-search index on the 'text' column. On failure the helper
# prints the error and returns None instead of raising.
create_full_text_search_index(table)

  No API_KEY or ADC found. Please either:
    - Set the `GOOGLE_API_KEY` environment variable.
    - Manually pass the key with `genai.configure(api_key=my_api_key)`.
    - Or set up Application Default Credentials, see https://ai.google.dev/gemini-api/docs/oauth for more information. 
 Retrying in 3.4420236192280753 seconds (retry 1 of 7) 

  No API_KEY or ADC found. Please either:
    - Set the `GOOGLE_API_KEY` environment variable.
    - Manually pass the key with `genai.configure(api_key=my_api_key)`.
    - Or set up Application Default Credentials, see https://ai.google.dev/gemini-api/docs/oauth for more information. 
 Retrying in 10.782756982754334 seconds (retry 2 of 7) 

  No API_KEY or ADC found. Please either:
    - Set the `GOOGLE_API_KEY` environment variable.
    - Manually pass the key with `genai.configure(api_key=my_api_key)`.
    - Or set up Application Default Credentials, see https://ai.google.dev/gemini-api/docs/oauth for more information. 
 Retrying in 29.000599413

KeyboardInterrupt: 

**Realizando a busca híbrida**

In [None]:
query = 'é possível criar tecnologias éticas com a inteligência artificial?'
reranker = LinearCombinationReranker(weight=0.5)

# Run the hybrid search and materialise the hits as a dataframe.
result_4_df = semantic_search(query, table, reranker).to_pandas()

# Chunk texts keyed by the value of the 'index' column, so lookups do not depend
# on the dataframe's own row labels.
chunk_texts = table_doc.set_index('index')['text']
last_index = len(chunk_texts) - 1

# For each hit, join the previous chunk, the hit itself and the next chunk.
# The first and last chunks have no neighbour on one side.
semantic_texts = []
for index in result_4_df['index']:
    semantic_chunk_ant = chunk_texts.loc[index - 1] if index > 0 else ''
    semantic_chunk_pos = chunk_texts.loc[index + 1] if index < last_index else ''
    semantic_texts.append(semantic_chunk_ant + chunk_texts.loc[index] + semantic_chunk_pos)

# Ask Gemini whether each context window answers the question.
gemini_responses = []
model = create_genai_llm(os.getenv('GOOGLE_API_KEY'))
for semantic_text in semantic_texts:
    prompt = f"""
    Você é um especialista em análise semantica. Analise o texto abaixo e verifique se o mesmo responde a
    seguinte pergunta: {query}\n
    texto: {semantic_text}.
    Se o chunk não responder a pergunta apenas diga: 'O texto não responde a pergunta'
    """
    response = model.invoke(prompt)
    # `.content` holds the message body on LangChain message objects; `.text`
    # is not a plain string attribute in every langchain-core version.
    gemini_responses.append(response.content)

df = pd.DataFrame({'semantic_chunks': semantic_texts, 'gemini_responses': gemini_responses})