In [2]:
import pandas as pd
from tqdm.auto import tqdm

from meal_mentor.ingest import ingest_data, load_index
from meal_mentor.ingest import load_index


In [10]:
def minsearch_search(query, boost=None):
    if boost is None:
        boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

In [11]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)


def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [12]:
def evaluate(ground_truth, search_function):
    relevance_total = []
    results_dict = {}
    for q in tqdm(ground_truth):
        try:
            doc_id = q['id']
            results = search_function(q)

            relevance = [str(d['id']) == str(doc_id) for d in results]

            results_dict[q['id']] = (q, results)
            relevance_total.append(relevance)
            # if all(not r for r in relevance):
            #     print("No relevant results found for query: ", q)
            # print("Results: ", results)
        except Exception as e:
            print(f"Error processing query: {q} with exeption: {e}")

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

## Evaluate retrieval using MinSearch and the ground truth data generated with gpt-4o-mini

In [6]:
df_question = pd.read_csv('../data/ground-truth-retrieval_4o_mini.csv')
df_question.head()

Unnamed: 0,id,question
0,0,Can you provide the nutritional breakdown for ...
1,0,What makes this Potato Latkes recipe suitable ...
2,0,Are there any specific kosher dietary guidelin...
3,0,How can I adjust the ingredients in the Potato...
4,0,What type of cuisine do the Potato Latkes belo...


In [7]:
ground_truth = df_question.to_dict(orient='records')
ground_truth[0]

{'id': 0,
 'question': 'Can you provide the nutritional breakdown for the Potato Latkes recipe, including protein, carbs, and fat content?'}

In [8]:
documents = ingest_data(file_path='../data/data.csv')
documents[0]

   id diet_type                                  recipe_name cuisine_type  \
0   0      dash  Potato Latkes Made Simple: A Twitter Recipe       kosher   
1   1      dash                             Avocado Dressing     american   
2   2      dash                                  Bread Salad     american   
3   3      dash                             Ultimat Sparkler        world   
4   4      dash                                  Yogurt Pops     american   

   protein(g)  carbs(g)  fat(g)  
0       31.55    110.84  118.28  
1        4.18     18.97  118.42  
2       44.09    153.84   86.03  
3        0.32     16.32    0.08  
4        9.07     34.05    7.90  


{'id': 0,
 'diet_type': 'dash',
 'recipe_name': 'Potato Latkes Made Simple: A Twitter Recipe',
 'cuisine_type': 'kosher',
 'protein(g)': 31.55,
 'carbs(g)': 110.84,
 'fat(g)': 118.28}

In [9]:
text_fields = [
    'id',
    'recipe_name',
    'cuisine_type',
    'diet_type',
    'protein(g)',
    'carbs(g)',
    'fat(g)',
]
keyword_fields = ['id']
index = load_index(
    documents=documents,
    text_fields=text_fields,
    keyword_fields=keyword_fields
)

In [10]:
evaluate(ground_truth, lambda q: minsearch_search(q['question']))

  0%|          | 0/1000 [00:00<?, ?it/s]

{'hit_rate': 0.898, 'mrr': 0.85794126984127}

Results: {'hit_rate': 0.898, 'mrr': 0.85794126984127}

## Evaluate retrieval using MinSearch and the ground truth data generated with gpt-4o

In [24]:
df_question = pd.read_csv('../data/ground-truth-retrieval_4o.csv')
df_question.head()

Unnamed: 0,id,question
0,0,What is the protein content in the Potato Latk...
1,0,Can you tell me how many grams of carbohydrate...
2,0,I need to know the fat content in the Potato L...
3,0,What type of cuisine is the Potato Latkes Made...
4,0,Is the Potato Latkes Made Simple recipe suitab...


In [16]:
ground_truth = df_question.to_dict(orient='records')
ground_truth[0]

{'id': 0,
 'question': 'What is the protein content in the Potato Latkes Made Simple recipe?'}

In [17]:
documents = ingest_data(file_path='../data/data.csv')
documents[0]

{'id': 0,
 'diet_type': 'dash',
 'recipe_name': 'Potato Latkes Made Simple: A Twitter Recipe',
 'cuisine_type': 'kosher',
 'protein(g)': 31.55,
 'carbs(g)': 110.84,
 'fat(g)': 118.28}

In [19]:
text_fields = [
    'id',
    'recipe_name',
    'cuisine_type',
    'diet_type',
    'protein(g)',
    'carbs(g)',
    'fat(g)',
]
keyword_fields = ['id']
index = load_index(
    documents=documents,
    text_fields=text_fields,
    keyword_fields=keyword_fields
)

In [19]:
evaluate(ground_truth, lambda q: minsearch_search(q['question']))

  0%|          | 0/1000 [00:00<?, ?it/s]

{'hit_rate': 0.947, 'mrr': 0.9229761904761906}

Results: {'hit_rate': 0.947, 'mrr': 0.9229761904761906}

## Parameter Tuning for Ground Truth based on gpt-4o-mini

In [13]:
df_question = pd.read_csv('../data/ground-truth-retrieval_4o_mini.csv')
ground_truth = df_question.to_dict(orient='records')
documents = ingest_data(file_path='../data/data.csv')

   id diet_type                                  recipe_name cuisine_type  \
0   0      dash  Potato Latkes Made Simple: A Twitter Recipe       kosher   
1   1      dash                             Avocado Dressing     american   
2   2      dash                                  Bread Salad     american   
3   3      dash                             Ultimat Sparkler        world   
4   4      dash                                  Yogurt Pops     american   

   protein(g)  carbs(g)  fat(g)  
0       31.55    110.84  118.28  
1        4.18     18.97  118.42  
2       44.09    153.84   86.03  
3        0.32     16.32    0.08  
4        9.07     34.05    7.90  


In [14]:
df_validation = df_question[:10]
df_test = df_question[10:]
df_validation

Unnamed: 0,id,question
0,0,Can you provide the nutritional breakdown for ...
1,0,What makes this Potato Latkes recipe suitable ...
2,0,Are there any specific kosher dietary guidelin...
3,0,How can I adjust the ingredients in the Potato...
4,0,What type of cuisine do the Potato Latkes belo...
5,1,What are the macronutrient values of the Avoca...
6,1,Is the Avocado Dressing recipe suitable for so...
7,1,Can you tell me what cuisine the Avocado Dress...
8,1,What is the total fat content in the Avocado D...
9,1,How much protein does the Avocado Dressing rec...


In [20]:
text_fields = [
    'id',
    'recipe_name',
    'cuisine_type',
    'diet_type',
    'protein(g)',
    'carbs(g)',
    'fat(g)',
]
keyword_fields = ['id']
index = load_index(
    documents=documents,
    text_fields=text_fields,
    keyword_fields=keyword_fields
)

In [21]:
import random


def simple_optimize(param_ranges, objective_function, n_iterations=10):
    best_params = None
    best_score = float('-inf')  # Assuming we're minimizing. Use float('-inf') if maximizing.

    for _ in range(n_iterations):
        # Generate random parameters
        current_params = {}
        for param, (min_val, max_val) in param_ranges.items():
            if isinstance(min_val, float) and isinstance(max_val, float):
                current_params[param] = random.randint(min_val, max_val)
            else:
                current_params[param] = random.uniform(min_val, max_val)

        # Evaluate the objective function
        current_score = objective_function(current_params)

        # Update best if current is better
        if current_score > best_score:  # Change to > if maximizing
            best_score = current_score
            best_params = current_params

    return best_params, best_score

In [22]:
gt_val = df_validation.to_dict(orient='records')


In [23]:
param_ranges = {
    'recipe_name': (0.0, 3.0),
    'cuisine_type': (0.0, 3.0),
    'diet_type': (0.0, 3.0),
    'protein(g)': (0.0, 3.0),
    'carbs(g)': (0.0, 3.0),
    'fat(g)': (0.0, 3.0),
}


def objective(boost_params):
    def search_function(q):
        return minsearch_search(q['question'], boost_params)

    results = evaluate(gt_val, search_function)

    return results['mrr']

In [24]:
simple_optimize(param_ranges, objective, n_iterations=20)

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

({'recipe_name': 1,
  'cuisine_type': 2,
  'diet_type': 0,
  'protein(g)': 1,
  'carbs(g)': 1,
  'fat(g)': 2},
 0.875)

In [25]:
def minsearch_improved(
        query,
        boost=None
):
    # Set default boost values if not provided
    if boost is None:
        boost = {
            'recipe_name': 1,
            'cuisine_type': 2,
            'diet_type': 0,
            'protein(g)': 1,
            'carbs(g)': 1,
            'fat(g)': 2
        }

    # Perform the search using the query and boost settings
    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

evaluate(ground_truth, lambda q: minsearch_improved(q['question']))


  0%|          | 0/1000 [00:00<?, ?it/s]

{'hit_rate': 0.898, 'mrr': 0.850663492063492}

Results: {'hit_rate': 0.898, 'mrr': 0.850663492063492}

## Parameter Tuning for Ground Truth based on gpt-4o

In [28]:
df_question = pd.read_csv('../data/ground-truth-retrieval_4o.csv')
ground_truth = df_question.to_dict(orient='records')
documents = ingest_data(file_path='../data/data.csv')

   id diet_type                                  recipe_name cuisine_type  \
0   0      dash  Potato Latkes Made Simple: A Twitter Recipe       kosher   
1   1      dash                             Avocado Dressing     american   
2   2      dash                                  Bread Salad     american   
3   3      dash                             Ultimat Sparkler        world   
4   4      dash                                  Yogurt Pops     american   

   protein(g)  carbs(g)  fat(g)  
0       31.55    110.84  118.28  
1        4.18     18.97  118.42  
2       44.09    153.84   86.03  
3        0.32     16.32    0.08  
4        9.07     34.05    7.90  


In [29]:
df_validation = df_question[:10]
df_test = df_question[10:]
df_validation
text_fields = [
    'id',
    'recipe_name',
    'cuisine_type',
    'diet_type',
    'protein(g)',
    'carbs(g)',
    'fat(g)',
]
keyword_fields = ['id']
index = load_index(
    documents=documents,
    text_fields=text_fields,
    keyword_fields=keyword_fields
)
import random


def simple_optimize(param_ranges, objective_function, n_iterations=10):
    best_params = None
    best_score = float('-inf')  # Assuming we're minimizing. Use float('-inf') if maximizing.

    for _ in range(n_iterations):
        # Generate random parameters
        current_params = {}
        for param, (min_val, max_val) in param_ranges.items():
            if isinstance(min_val, float) and isinstance(max_val, float):
                current_params[param] = random.randint(min_val, max_val)
            else:
                current_params[param] = random.uniform(min_val, max_val)

        # Evaluate the objective function
        current_score = objective_function(current_params)

        # Update best if current is better
        if current_score > best_score:  # Change to > if maximizing
            best_score = current_score
            best_params = current_params

    return best_params, best_score


gt_val = df_validation.to_dict(orient='records')

param_ranges = {
    'recipe_name': (0.0, 3.0),
    'cuisine_type': (0.0, 3.0),
    'diet_type': (0.0, 3.0),
    'protein(g)': (0.0, 3.0),
    'carbs(g)': (0.0, 3.0),
    'fat(g)': (0.0, 3.0),
}


def objective(boost_params):
    def search_function(q):
        return minsearch_search(q['question'], boost_params)

    results = evaluate(gt_val, search_function)

    return results['mrr']


simple_optimize(param_ranges, objective, n_iterations=20)


def minsearch_improved(
        query,
        boost=None
):
    # Set default boost values if not provided
    if boost is None:
        boost = {
            'recipe_name': 1,
            'cuisine_type': 2,
            'diet_type': 0,
            'protein(g)': 1,
            'carbs(g)': 1,
            'fat(g)': 2
        }

    # Perform the search using the query and boost settings
    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results


evaluate(ground_truth, lambda q: minsearch_improved(q['question']))


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

{'hit_rate': 0.947, 'mrr': 0.919023015873016}

Results: {'hit_rate': 0.947, 'mrr': 0.92}