In [1]:
import os
from pathlib import Path
from typing import List
import pandas as pd
from tqdm.auto import tqdm
from openai import OpenAI
import minsearch as ms

In [2]:
# Path to the FAQ CSV; the DATA_PATH environment variable overrides the default.
DATA_PATH = os.getenv("DATA_PATH", "../DATASETS/faq_sacmex.csv")

In [3]:
def ingest_data(file_path: Path = Path(DATA_PATH), text_fields: List[str] = None):
    """Read a CSV file and return its rows as a list of dictionaries.

    Args:
        file_path: Location of the CSV file to read.
        text_fields: Optional column names to cast to ``str``. Names that are
            not columns of the CSV are ignored.

    Returns:
        One dict per CSV row, keyed by column name.
    """
    frame = pd.read_csv(file_path)

    # Cast only the requested columns that actually exist in the file.
    for column in text_fields or []:
        if column in frame.columns:
            frame[column] = frame[column].astype(str)

    return frame.to_dict(orient='records')


def load_index(
        documents,
        text_fields: List[str],
        keyword_fields: List[str],
        search_index=ms.Index
):
    """Stringify every document value in place, then build and fit an index.

    Args:
        documents: Iterable of dicts. Each non-string value is replaced by
            ``str(value)`` directly in the caller's dicts.
        text_fields: Forwarded to ``search_index`` as ``text_fields``.
        keyword_fields: Forwarded to ``search_index`` as ``keyword_fields``.
        search_index: Callable that builds the index object.

    Returns:
        The index object after ``fit(documents)`` has been called on it.
    """
    for record in documents:
        for field, value in record.items():
            # Only existing keys are reassigned, so the dict keeps its size
            # while it is being iterated.
            if not isinstance(value, str):
                record[field] = str(value)

    fitted = search_index(text_fields=text_fields, keyword_fields=keyword_fields)
    fitted.fit(documents)
    return fitted

In [4]:
def minsearch_search(query, boost=None):
    """Query the module-level ``index`` with an empty filter.

    Args:
        query: Passed to ``index.search`` as ``query``.
        boost: Optional dict passed as ``boost_dict``; ``None`` becomes ``{}``.

    Returns:
        Whatever ``index.search`` returns when called with ``num_results=10``.
    """
    return index.search(
        query=query,
        filter_dict={},
        boost_dict={} if boost is None else boost,
        num_results=10,
    )

In [5]:
def hit_rate(relevance_total):
    """Fraction of queries whose result list contains at least one relevant hit.

    Args:
        relevance_total: One list of booleans per query, where ``True`` marks
            a result that matches the expected document.

    Returns:
        A float in ``[0, 1]``. An empty ``relevance_total`` yields ``0.0``
        instead of raising ``ZeroDivisionError``.
    """
    if len(relevance_total) == 0:
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / len(relevance_total)


def mrr(relevance_total):
    """Mean reciprocal rank of the first relevant result per query.

    Args:
        relevance_total: One list of booleans per query, ordered by rank,
            where ``True`` marks a result that matches the expected document.

    Returns:
        The average over all queries of ``1 / rank`` of the first relevant
        result (rank starts at 1). A query with no relevant result contributes
        0. An empty ``relevance_total`` yields ``0.0``.
    """
    if len(relevance_total) == 0:
        return 0.0

    total_score = 0.0
    for line in relevance_total:
        for position, is_relevant in enumerate(line, start=1):
            # Equality rather than identity, so bool-like values equal to True count.
            if is_relevant == True:
                total_score += 1 / position
                # Only the first relevant hit counts; summing later hits
                # would let a single query score above 1.
                break

    return total_score / len(relevance_total)

In [6]:
def evaluate(ground_truth, search_function):
    """Run ``search_function`` on every ground-truth query and score the results.

    Args:
        ground_truth: Iterable of dicts; each must have an ``'id'`` key naming
            the document the query is expected to retrieve.
        search_function: Callable that takes one ground-truth dict and returns
            an iterable of result dicts, each with an ``'id'`` key.

    Returns:
        ``{'hit_rate': ..., 'mrr': ...}``. A query that raises is reported on
        stdout and left out of both averages. If no query could be scored,
        both metrics are ``0.0``.
    """
    relevance_total = []

    for q in tqdm(ground_truth):
        try:
            expected_id = str(q['id'])
            results = search_function(q)
            # Compare ids as strings so a numeric id still matches its
            # stringified form.
            relevance_total.append([str(d['id']) == expected_id for d in results])
        except Exception as e:
            print(f"Error procesando el query: {q} con la excepción: {e}")

    # Nothing was scored (empty input or every query failed): avoid dividing
    # by zero inside the metric functions.
    if not relevance_total:
        return {'hit_rate': 0.0, 'mrr': 0.0}

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

# Evaluando la recuperación usando MinSearch y los datos de referencia con llama3.2:3b

In [None]:
# Load the ground-truth retrieval questions (llama3.2:3b variant, per the file name)
# and preview the first rows.
df_question = pd.read_csv('../DATASETS/ground-truth-retrieval_llama3_2_3b.csv')
df_question.head()

In [None]:
# One dict per ground-truth row; evaluate() reads q['id'] from each of them.
ground_truth = df_question.to_dict(orient='records')
ground_truth[0]

In [None]:
# Load the FAQ documents from the configured DATA_PATH, so the environment
# override is honoured instead of repeating the literal path.
documents = ingest_data(file_path=Path(DATA_PATH))
documents[0]

In [None]:
# text_fields / keyword_fields are forwarded unchanged by load_index to the
# search_index constructor.
# NOTE(review): 'id' is listed both as a text field and as a keyword field —
# confirm that text-searching the id is intended.
text_fields = [
    'id',
    'pregunta',
    'respuesta',
    'document',
]
keyword_fields = ['id']
# `index` is the module-level object that minsearch_search reads.
index = load_index(
    documents=documents,
    text_fields=text_fields,
    keyword_fields=keyword_fields
)

In [None]:
# Baseline retrieval quality: no field boosts (minsearch_search defaults boost to {}).
evaluate(ground_truth, lambda q: minsearch_search(q['pregunta']))

# Ajuste de parámetros para los datos de referencia basado en llama3.2:3b

In [7]:
# Reload the ground-truth questions and the FAQ documents for the tuning section.
df_question = pd.read_csv('../DATASETS/ground-truth-retrieval_llama3_2_3b.csv')
ground_truth = df_question.to_dict(orient='records')
# Use the configured DATA_PATH so the environment override is honoured.
documents = ingest_data(file_path=Path(DATA_PATH))

In [8]:
# Split the ground truth by position: the first 10 rows form the validation set
# used for boost tuning, the remaining rows are kept as df_test.
# NOTE(review): df_test is not referenced again in the visible cells.
df_validation = df_question[:10]
df_test = df_question[10:]
df_validation

Unnamed: 0,id,pregunta
0,d7c4ce5eda85cd602edc71a3f29193e0,¿Dónde puedo reportar una fuga de agua?
1,c6b1388a0227cae6d6045a1dc7dd82c5,¿Qué debo hacer si veo una fuga de agua en la ...
2,b9e493259e0b9fb9f94e4f4d66ab8ded,¿Qué debo hacer si veo una fuga de agua en mi ...
3,5229ec6562607f2de830a7826a099964,¿Qué debo hacer si mi consumo de agua parece h...
4,9055d52c36234595f93430ca60610b82,¿Cuánto tiempo tarda en repararse una fuga de ...
5,34f49a09c09f373422893abead2bad4e,¿Dónde puedo reportar una fuga de agua?
6,221e2b757cee67a9babd822666eae6b1,¿Cómo reportar fugas de agua en CDMX?
7,deb8c6e4cf607cabc9fd72178b45daae,¿Qué otro de reportes puedo realizar?
8,977ff53e8650679587f99ee537c8b095,¿Qué debo tener en cuenta para levantar mi rep...
9,e118293c4450a72349aa684baa54f5eb,¿Puedo reportar fugas en unidades habitacionales?


In [9]:
# Rebuild the index from the reloaded documents; minsearch_search and
# minsearch_improved read the resulting module-level `index`.
# NOTE(review): 'id' is listed both as a text field and as a keyword field —
# confirm that text-searching the id is intended.
text_fields = [
    'id',
    'pregunta',
    'respuesta',
    'document',
]
keyword_fields = ['id']
index = load_index(
    documents=documents,
    text_fields=text_fields,
    keyword_fields=keyword_fields
)

In [16]:
import random


def simple_optimize(param_ranges, objective_function, n_iterations=10, seed=None):
    """Random search that maximizes ``objective_function``.

    Each iteration draws one value per parameter uniformly from its
    ``(min, max)`` range, scores the resulting dict, and keeps the best one.

    Args:
        param_ranges: Mapping of parameter name to a ``(min_val, max_val)``
            pair. Bounds may be ints or floats; sampled values are floats.
        objective_function: Callable taking the sampled ``{name: value}`` dict
            and returning a score where higher is better.
        n_iterations: Number of random samples to evaluate.
        seed: Optional seed for reproducible sampling. When ``None`` the
            module-level ``random`` generator is used.

    Returns:
        ``(best_params, best_score)``. If no iteration ran, this is
        ``(None, float('-inf'))``.
    """
    # A private generator keeps seeded runs reproducible without touching the
    # global ``random`` state.
    rng = random.Random(seed) if seed is not None else random

    best_params = None
    best_score = float('-inf')  # maximizing, so start below every real score

    for _ in range(n_iterations):
        # Always sample with uniform(): randint() rejects non-integer float
        # bounds, and integer bounds were already sampled continuously.
        current_params = {
            param: rng.uniform(min_val, max_val)
            for param, (min_val, max_val) in param_ranges.items()
        }

        current_score = objective_function(current_params)

        # Strict ">" keeps the earliest sample when scores tie.
        if current_score > best_score:
            best_score = current_score
            best_params = current_params

    return best_params, best_score

In [11]:
# Validation questions as a list of dicts, the form evaluate() iterates over.
gt_val = df_validation.to_dict(orient='records')

In [12]:
# (min, max) range sampled for each field's boost by simple_optimize; every
# sampled dict reaches minsearch_search as its boost through objective().
param_ranges = {
    'pregunta': (0, 3),
    'respuesta': (0, 3),
}


def objective(boost_params):
    """Score a boost configuration by its MRR on the validation questions.

    Args:
        boost_params: Dict passed to ``minsearch_search`` as the boost.

    Returns:
        The ``'mrr'`` entry of ``evaluate`` run over ``gt_val``.
    """
    metrics = evaluate(
        gt_val,
        lambda q: minsearch_search(q['pregunta'], boost_params),
    )
    return metrics['mrr']

In [17]:
# Random search with 20 samples; the cell displays (best_params, best_score).
simple_optimize(param_ranges, objective, n_iterations=20)

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

({'pregunta': 2.733889011355457, 'respuesta': 0.8747172772681867}, 0.95)

In [18]:
def minsearch_improved(
        query,
        boost=None
):
    """Search with the tuned field boosts unless the caller supplies its own.

    Args:
        query: Query passed on to ``minsearch_search``.
        boost: Optional boost dict. ``None`` selects the tuned defaults below.

    Returns:
        The results of ``minsearch_search(query, boost)``.
    """
    if boost is None:
        # Best boosts reported by the simple_optimize run on the validation split.
        boost = {'pregunta': 2.733889011355457, 'respuesta': 0.8747172772681867}

    # Delegate rather than repeat the index.search call; only the default
    # boost differs from minsearch_search.
    return minsearch_search(query, boost)

# Score the tuned boosts on the full ground truth.
# NOTE(review): ground_truth includes the validation rows used for tuning
# (df_validation is a slice of the same df_question) — consider scoring df_test.
evaluate(ground_truth, lambda q: minsearch_improved(q['pregunta']))

  0%|          | 0/11 [00:00<?, ?it/s]

{'hit_rate': 1.0, 'mrr': 0.9545454545454546}