# Pre-Filtro
Bloque diseñado para reducir el número de tablas a analizar por el filtro.

### Librerías necesarias

In [629]:
import sys
from langchain.embeddings import LlamaCppEmbeddings
from langchain.embeddings import GPT4AllEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema import Document
from langchain.llms import LlamaCpp
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
import re

In [None]:
# Take the OpenAI key from the OPENAI_API_KEY environment variable so the secret
# does not have to be pasted into the notebook. The literal "Key" placeholder is
# kept as the fallback and can still be replaced by hand.
import os

from langchain.embeddings.openai import OpenAIEmbeddings

embeddings_OpenAI = OpenAIEmbeddings(
    openai_api_key=os.environ.get("OPENAI_API_KEY", "Key"),
    chunk_size=1500,
)

In [None]:
# Load the text file that contains the schema together with its descriptions.
loaderschema = TextLoader('schemaWithDescription.txt')
schema = loaderschema.load()

In [None]:
# Splitter used for both description files: cut on blank lines ("\n\n") with
# no overlap between chunks.
# NOTE(review): chunk_size=10 is tiny; presumably chosen so that each
# blank-line-separated entry ends up as its own chunk -- confirm.
text_splitter = CharacterTextSplitter(
    separator="\n\n",
    chunk_size=10,
    chunk_overlap=0,
    length_function=len,
    is_separator_regex=False,
)

# Table descriptions (produced by phase zero): load, then chunk.
loaderTable = TextLoader('TablewithDescription2.txt')
table_doc = loaderTable.load()
Table_split = text_splitter.split_documents(table_doc)

# Column descriptions (produced by phase zero): load, then chunk.
loaderColumn = TextLoader('ColumnsWithDescription3.txt')
column_doc = loaderColumn.load()
Column_split = text_splitter.split_documents(column_doc)

### Definición del text_splitter, utilizado para generar los chunks

In [None]:
# Same splitter settings as the one used for the table/column description files:
# split on blank lines ("\n\n") with no overlap between chunks.
# NOTE(review): chunk_size = 10 is far below the size of a typical schema entry;
# presumably intentional so that separate entries are not merged -- confirm.
text_splitter = CharacterTextSplitter(
    separator = "\n\n",
    chunk_size = 10,
    chunk_overlap  = 0,
    length_function = len,
    is_separator_regex = False,
)
# Chunk the schema document loaded earlier.
split_schema = text_splitter.split_documents(schema)

In [None]:
# Wrap every schema chunk (already split on blank lines) in a Document whose
# "Table" metadata is the list of names found between `CREATE TABLE "` and the
# next `"` in that chunk.
inicio_delimitador = 'CREATE TABLE "'
fin_delimitador = '"'
patron = re.escape(inicio_delimitador) + "(.*?)" + re.escape(fin_delimitador)

schema_document = [
    Document(
        page_content=chunk.page_content,
        metadata={"Table": re.findall(patron, chunk.page_content)},
    )
    for chunk in split_schema
]

In [None]:
# Build the documents for the table index and the column index. Each one gets
# the same metadata: table name, chunk position and the foreign-key list parsed
# from the table description.
table_document = []
column_document = []

# The pattern is the same for every chunk, so compile it once outside the loop.
patron = re.compile(re.escape('foreign_key=[') + "(.*?)" + re.escape(']'))

for i, table_chunk in enumerate(Table_split):
    # The table name is whatever precedes the first ":" in the chunk text.
    table = table_chunk.page_content.split(":")[0]

    # First `foreign_key=[...]` section of the chunk, split on commas. A chunk
    # without that section yields an empty list instead of raising IndexError.
    fk_match = patron.search(table_chunk.page_content)
    foregin_key = fk_match.group(1).split(",") if fk_match else []

    metadata = {"Table": table, "Indice": i, "foregin_key": foregin_key}
    table_document.append(
        Document(page_content=table_chunk.page_content, metadata=metadata)
    )
    # NOTE(review): assumes Column_split[i] describes the same table as
    # Table_split[i] (both files in the same order) -- confirm.
    column_document.append(
        Document(page_content=Column_split[i].page_content, metadata=dict(metadata))
    )

In [None]:
# Sanity check: print the metadata attached to every document of the table index.
for table in table_document:
    print(table.metadata)

In [None]:
# Sanity check: print the metadata attached to every document of the column index.
for column in column_document:
    print(column.metadata)

### Carga del embedding

In [None]:
# Use a local Llama-family model (through LlamaCppEmbeddings) for embedding.
# Path to the GGUF model file to load.
llama_model_path = 'dockerFolder/mixtral-8x7b-v0.1.Q5_K_M.gguf'

# n_ctx sets the context window size used for embedding (here 2048).
embeddings = LlamaCppEmbeddings(model_path=llama_model_path, n_ctx=2048)

In [None]:
# Free alternative: GPT4All embeddings with their default settings.
embeddings_GPT = GPT4AllEmbeddings()

In [None]:
# Build the FAISS vector stores (table index and column index) with the
# embedding model loaded through llama_cpp.
vector_db_table = FAISS.from_documents(table_document, embeddings)
vector_db_column = FAISS.from_documents(column_document, embeddings)

In [None]:
# Build the FAISS vector stores (table index and column index) with the
# GPT4All embeddings.
vector_db_table2 = FAISS.from_documents(table_document, embeddings_GPT)
vector_db_column2 = FAISS.from_documents(column_document, embeddings_GPT)

In [None]:
# Build the FAISS vector stores (table index and column index) with the
# OpenAI embeddings.
vector_db_table3 = FAISS.from_documents(table_document, embeddings_OpenAI)
vector_db_column3 = FAISS.from_documents(column_document, embeddings_OpenAI)

### Búsqueda por similitud 

In [None]:
# Example question; fetch the 6 closest hits (with their scores) from the table
# and column indexes built with the LlamaCpp embeddings.
query = "How many transfers made by user Martin in the last month"
docs_t = vector_db_table.similarity_search_with_score(query,6)
docs_c = vector_db_column.similarity_search_with_score(query,6)

In [None]:
# Same query against the GPT4All-based indexes, 6 hits each.
docs_t2 = vector_db_table2.similarity_search_with_score(query,6)
docs_c2 = vector_db_column2.similarity_search_with_score(query,6)

In [None]:
# Same query against the OpenAI-based indexes.
# NOTE(review): this search asks for 5 hits while the other two ask for 6 --
# confirm the difference is intended.
docs_t3 = vector_db_table3.similarity_search_with_score(query,5)
docs_c3 = vector_db_column3.similarity_search_with_score(query,5)

#### Funciones para el pre-filtro 

In [None]:
def interceptQuerry(doc_t, doc_c):
    """Collect the table names hit by two searches and intersect them.

    Args:
        doc_t: Sequence of search hits whose first element is a document
            exposing ``.metadata`` (a mapping); hits from the table index.
        doc_c: Same shape, hits from the column index.

    Returns:
        A three-item list ``[tables_t, tables_c, common]`` where ``tables_t``
        and ``tables_c`` are sets with the ``"Table"`` metadata value of each
        hit (``None`` when the key is absent) and ``common`` is a list with
        the values present in both sets, in no guaranteed order.
    """
    tables_t = {hit[0].metadata.get("Table") for hit in doc_t}
    tables_c = {hit[0].metadata.get("Table") for hit in doc_c}
    return [tables_t, tables_c, list(tables_t & tables_c)]

def unionSimilaritySearch(list_1,list_2):
    """Return the distinct elements of both inputs as one list (order not guaranteed)."""
    return list(set(list_1).union(set(list_2)))

In [None]:
def findForeignKey(doc_t,relevant_table,fk):
    """Add the foreign keys of the relevant hits to an existing list.

    Args:
        doc_t: Sequence of search hits whose first element is a document
            exposing ``.metadata`` (a mapping).
        relevant_table: Collection of table names; only hits whose ``"Table"``
            metadata value is in it contribute foreign keys.
        fk: Foreign keys gathered so far. It is never modified.

    Returns:
        A new list with the entries of ``fk`` followed by the
        ``"foregin_key"`` metadata entries of the matching hits, without
        duplicates and without the empty string, in first-seen order.
    """
    # A dict is used as an ordered set, so the result order is deterministic.
    merged = dict.fromkeys(fk)
    for item in doc_t:
        metadata = item[0].metadata
        if metadata.get("Table") in relevant_table:
            # Missing or None foreign-key metadata counts as "no foreign keys".
            merged.update(dict.fromkeys(metadata.get("foregin_key") or ()))
    # An empty `foreign_key=[]` section is parsed as [""]; drop that marker.
    merged.pop("", None)
    return list(merged)

def addForeignkey(list_fk, list_UE):
    """Return the distinct names found in either input as one list (order not guaranteed)."""
    return list(set(list_fk).union(set(list_UE)))

In [None]:
# Step-by-step run of the pre-filter on the hits computed above.
# Per embedding model: tables hit, columns hit and the tables present in both.
table1 = interceptQuerry(docs_t, docs_c)
table2 = interceptQuerry(docs_t2, docs_c2)

# Tables selected by either model.
union_table = unionSimilaritySearch(table1[2], table2[2])

# Collect the foreign keys of the selected tables from all four hit lists.
fk = []
for scored_docs in (docs_t, docs_c, docs_t2, docs_c2):
    fk = findForeignKey(scored_docs, union_table, fk)

# Final answer: the selected tables plus the tables they reference.
relevant_table = addForeignkey(fk, union_table)

print(f"Mixtral | Tablas relevantes: {table1[0]} | Columnas relevantes: {table1[1]} Union: {table1[2]}")
print(f"GPT4All | Tablas relevantes: {table2[0]} | Columnas relevantes: {table2[1]} Union: {table2[2]}")
print(f"Union de embedings {union_table}")
print(f"Foreign_key {fk}")
print(f"Tablas relevantes {relevant_table}")

## Función pre-filtro

In [None]:
def prefilter(vTable1,vColumn1,vTable2,vColumn2,uQuery,quantity):
    """Return the list of tables considered relevant for a user query.

    The resulting list is meant to be handed to langchain's
    ``db.get_table_info([...])`` to obtain the schema of just those tables.

    Args:
        vTable1: Vector store with the table documents (first embedding).
        vColumn1: Vector store with the column documents (first embedding).
        vTable2: Vector store with the table documents (second embedding).
        vColumn2: Vector store with the column documents (second embedding).
        uQuery: User question used for the similarity searches.
        quantity: Number of hits requested from every search.

    Returns:
        A list with the tables found by both the table and the column search
        of at least one embedding, plus the tables named in the foreign keys
        of those tables. Order is not guaranteed.
    """
    docs_t1 = vTable1.similarity_search_with_score(uQuery,quantity)
    docs_c1 = vColumn1.similarity_search_with_score(uQuery,quantity)
    docs_t2 = vTable2.similarity_search_with_score(uQuery,quantity)
    docs_c2 = vColumn2.similarity_search_with_score(uQuery,quantity)

    table1 = interceptQuerry(docs_t1,docs_c1)
    table2 = interceptQuerry(docs_t2,docs_c2)
    union_table = unionSimilaritySearch(table1[2],table2[2])

    # Use this call's own hits (docs_t1 / docs_c1), not the notebook-level
    # docs_t / docs_c left over from an earlier cell.
    fk = []
    for scored_docs in (docs_t1, docs_c1, docs_t2, docs_c2):
        fk = findForeignKey(scored_docs, union_table, fk)

    return addForeignkey(fk,union_table)

In [None]:
# Run the pre-filter end to end with the LlamaCpp and GPT4All indexes,
# requesting 6 hits from every search.
query = "How many transfers made by user Martin in the last month"
result = prefilter(
    vTable1=vector_db_table,
    vColumn1=vector_db_column,
    vTable2=vector_db_table2,
    vColumn2=vector_db_column2,
    uQuery=query,
    quantity=6,
)
print(result)