In [14]:
import pandas as pd
from tqdm.auto import tqdm

from ingestion import get_index
from ingestion import ingest_data


In [11]:
def minsearch_search(query, boost=None):
    """Query the notebook-level ``index`` and return its search results.

    Args:
        query: Query string forwarded to ``index.search``.
        boost: Optional boost mapping forwarded as ``boost_dict``; an empty
            dict is used when omitted.

    Returns:
        Whatever ``index.search`` returns for up to 10 requested results,
        with no filters applied.
    """
    boost_dict = {} if boost is None else boost

    # `index` is a module-level name bound in a later cell (get_index(...)).
    return index.search(
        query=query,
        filter_dict={},
        boost_dict=boost_dict,
        num_results=10,
    )

In [9]:
def hit_rate(relevance_total):
    """Return the fraction of queries with at least one relevant result.

    Args:
        relevance_total: Sequence with one entry per query; each entry is a
            sequence of relevance flags for that query's ranked results.

    Returns:
        Number of queries whose flags contain ``True`` divided by the number
        of queries, or ``0.0`` when there are no queries at all.
    """
    # Guard the division: an empty evaluation set has no hits by definition.
    if len(relevance_total) == 0:
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / len(relevance_total)


def mrr(relevance_total):
    """Return the mean reciprocal rank over all queries.

    For each query only the first relevant result contributes, with score
    ``1 / rank`` (rank is 1-based). Queries with no relevant result
    contribute 0.

    Args:
        relevance_total: Sequence with one entry per query; each entry is a
            sequence of relevance flags ordered by result rank.

    Returns:
        The average reciprocal rank, or ``0.0`` when there are no queries.
    """
    # Guard the division: nothing to average over.
    if len(relevance_total) == 0:
        return 0.0

    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            # Equality test kept on purpose so the flag semantics match
            # hit_rate's `True in line` membership check.
            if is_relevant == True:  # noqa: E712
                total_score += 1 / rank
                # Only the first relevant hit counts; without this, several
                # matches in one result list would push the score above 1.
                break

    return total_score / len(relevance_total)

In [12]:
def evaluate(ground_truth, search_function):
    """Run ``search_function`` on every ground-truth query and score the results.

    A result is relevant when its ``'id'`` equals the query's ``'id'`` after
    both are converted to strings.

    Args:
        ground_truth: Iterable of query records; each must provide an ``'id'``
            key and is passed unchanged to ``search_function``.
        search_function: Callable taking one query record and returning an
            iterable of result records that each provide an ``'id'`` key.

    Returns:
        Dict with the keys ``'hit_rate'`` and ``'mrr'``.
    """
    relevance_total = []

    for q in tqdm(ground_truth):
        try:
            doc_id = q['id']
            results = search_function(q)
            relevance = [str(d['id']) == str(doc_id) for d in results]
        except Exception as e:
            print(f"Error processing query: {q} with exeption: {e}")
            # A query whose search failed retrieved nothing relevant. Record
            # it as a miss so the metrics are averaged over every query;
            # dropping it would shrink the denominator and inflate the scores.
            relevance = []

        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

## Evaluate retrieval using MinSearch and the ground truth data generated with gpt-4o-mini

In [3]:
# Load the gpt-4o-mini ground-truth questions from CSV and preview the first rows.
df_question = pd.read_csv('../data/ground-truth-retrieval_4o_mini.csv')
df_question.head()

Unnamed: 0,id,question
0,0,Can you provide the nutritional breakdown for ...
1,0,What makes this Potato Latkes recipe suitable ...
2,0,Are there any specific kosher dietary guidelin...
3,0,How can I adjust the ingredients in the Potato...
4,0,What type of cuisine do the Potato Latkes belo...


In [4]:
# Turn the DataFrame into a list of per-row dicts and show the first record.
ground_truth = df_question.to_dict(orient='records')
ground_truth[0]

{'id': 0,
 'question': 'Can you provide the nutritional breakdown for the Potato Latkes recipe, including protein, carbs, and fat content?'}

In [5]:
# Load the documents through the project's ingestion helper and show the first one.
documents = ingest_data()
documents[0]

{'id': 0,
 'diet_type': 'dash',
 'recipe_name': 'Potato Latkes Made Simple: A Twitter Recipe',
 'cuisine_type': 'kosher',
 'protein(g)': 31.55,
 'carbs(g)': 110.84,
 'fat(g)': 118.28}

In [6]:
# Document fields handed to get_index as text fields.
# NOTE(review): the nutrient fields hold numbers in the sample document shown
# above — confirm get_index/ingest_data handle them as text as intended.
text_fields = [
    'id',
    'recipe_name',
    'cuisine_type',
    'diet_type',
    'protein(g)',
    'carbs(g)',
    'fat(g)',
]
# NOTE(review): 'id' is listed both as a text field and as a keyword field — confirm this is deliberate.
keyword_fields = ['id']
# Binds the module-level `index` that minsearch_search() queries.
index = get_index(
    documents=documents,
    text_fields=text_fields,
    keyword_fields=keyword_fields
)

In [13]:
# Score MinSearch on the ground truth; each record's 'question' text is used as the query.
evaluate(ground_truth, lambda q: minsearch_search(q['question']))

  0%|          | 0/1000 [00:00<?, ?it/s]

{'hit_rate': 0.898, 'mrr': 0.85794126984127}

Results: {'hit_rate': 0.898, 'mrr': 0.85794126984127}

## Evaluate retrieval using MinSearch and the ground truth data generated with gpt-4o

In [15]:
# Load the gpt-4o ground-truth questions from CSV (rebinding df_question) and preview the first rows.
df_question = pd.read_csv('../data/ground-truth-retrieval_4o.csv')
df_question.head()

Unnamed: 0,id,question
0,0,What is the protein content in the Potato Latk...
1,0,Can you tell me how many grams of carbohydrate...
2,0,I need to know the fat content in the Potato L...
3,0,What type of cuisine is the Potato Latkes Made...
4,0,Is the Potato Latkes Made Simple recipe suitab...


In [16]:
# Turn the DataFrame into a list of per-row dicts (rebinding ground_truth) and show the first record.
ground_truth = df_question.to_dict(orient='records')
ground_truth[0]

{'id': 0,
 'question': 'What is the protein content in the Potato Latkes Made Simple recipe?'}

In [17]:
# Load the documents again through the ingestion helper and show the first one.
# NOTE(review): this repeats an earlier ingest_data() call in this notebook —
# confirm the reload is needed rather than reusing the existing `documents`.
documents = ingest_data()
documents[0]

{'id': 0,
 'diet_type': 'dash',
 'recipe_name': 'Potato Latkes Made Simple: A Twitter Recipe',
 'cuisine_type': 'kosher',
 'protein(g)': 31.55,
 'carbs(g)': 110.84,
 'fat(g)': 118.28}

In [18]:
# Rebuild the index with the same field configuration as the earlier MinSearch section.
# NOTE(review): this cell duplicates the earlier index-building cell — a shared
# helper would avoid the copy.
text_fields = [
    'id',
    'recipe_name',
    'cuisine_type',
    'diet_type',
    'protein(g)',
    'carbs(g)',
    'fat(g)',
]
keyword_fields = ['id']
# Rebinds the module-level `index` that minsearch_search() queries.
index = get_index(
    documents=documents,
    text_fields=text_fields,
    keyword_fields=keyword_fields
)

In [19]:
# Score MinSearch on the gpt-4o ground truth; each record's 'question' text is used as the query.
evaluate(ground_truth, lambda q: minsearch_search(q['question']))

  0%|          | 0/1000 [00:00<?, ?it/s]

{'hit_rate': 0.947, 'mrr': 0.9229761904761906}

## Evaluate retrieval using Elasticsearch and the ground truth data generated with gpt-4o-mini


In [21]:
# Reload the gpt-4o-mini ground-truth questions (rebinding df_question) and preview the first rows.
df_question = pd.read_csv('../data/ground-truth-retrieval_4o_mini.csv')
df_question.head()

Unnamed: 0,id,question
0,0,Can you provide the nutritional breakdown for ...
1,0,What makes this Potato Latkes recipe suitable ...
2,0,Are there any specific kosher dietary guidelin...
3,0,How can I adjust the ingredients in the Potato...
4,0,What type of cuisine do the Potato Latkes belo...


In [22]:
# Turn the DataFrame into a list of per-row dicts (rebinding ground_truth) and show the first record.
ground_truth = df_question.to_dict(orient='records')
ground_truth[0]

{'id': 0,
 'question': 'Can you provide the nutritional breakdown for the Potato Latkes recipe, including protein, carbs, and fat content?'}

In [23]:
# Load a fresh list of documents; a later cell adds an embedding to each of them in place.
documents = ingest_data()
documents[0]

{'id': 0,
 'diet_type': 'dash',
 'recipe_name': 'Potato Latkes Made Simple: A Twitter Recipe',
 'cuisine_type': 'kosher',
 'protein(g)': 31.55,
 'carbs(g)': 110.84,
 'fat(g)': 118.28}

#### Create Embeddings using Sentence Transformer

In [25]:
# Prerequisite: pip install sentence_transformers==2.7.0
from sentence_transformers import SentenceTransformer

# If the import above raises an error, try the following:
# 1. Uninstall numpy
# 2. Uninstall torch
# 3. pip install numpy==1.26.4
# 4. pip install torch
# Then re-run this cell; it should work.
# Load the pre-trained "all-mpnet-base-v2" model used to embed recipe names.
model = SentenceTransformer("all-mpnet-base-v2")

In [26]:
# Created the dense vector using the pre-trained model
operations = []
embeddings = []
for doc in documents:
    # Transforming the title into an embedding using the model
    doc["recipe_name_vector"] = model.encode(doc["recipe_name"]).tolist()
    operations.append(doc)
    embeddings.append(doc["recipe_name_vector"])

In [27]:
# Length of the first document's embedding; the index mapping's `dims` value must agree with it.
len(operations[0].get("recipe_name_vector"))

768

#### Step 3: Set up the Elasticsearch connection

In [None]:
from elasticsearch import Elasticsearch

# Create a client for the Elasticsearch node at http://localhost:9200.
# NOTE(review): the URL is hard-coded — consider a configurable constant.
es_client = Elasticsearch('http://localhost:9200')

# Display the server's info response; presumably used here as a connectivity check.
es_client.info()

#### Step 4: Create Mappings and Index  
* Mapping is the process of defining how a document, and the fields it contains, are stored and indexed.

* Each document is a collection of fields, which each have their own data type.

* We can compare mapping to a database schema in how it describes the fields and properties that documents hold, the data type of each field (e.g., string, integer, or date), and how those fields should be indexed and stored.

In [28]:
# Settings and field mappings for the recipes index: one shard, no replicas.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties":
            {
                'id': {"type": "keyword"},
                'diet_type': {"type": "text"},
                'recipe_name': {"type": "text"},
                'cuisine_type': {"type": "text"},
                # NOTE(review): the nutrient fields are mapped as text although the
                # sample document shows numeric values — confirm this is intended.
                'protein(g)': {"type": "text"},
                'carbs(g)': {"type": "text"},
                'fat(g)': {"type": "text"},
                # dims=768 matches the embedding length printed earlier in this notebook.
                'recipe_name_vector': {"type": "dense_vector", "dims": 768, "index": True, "similarity": "cosine"}
            }
    }
}

In [None]:
# Name of the Elasticsearch index that will hold the recipe documents.
index_name = "recipes"

# Delete the index first and then create it from index_settings, so this cell starts from a clean index.
# `ignore_unavailable=True` presumably keeps the delete from failing when the index does not exist yet — confirm.
# NOTE(review): newer elasticsearch-py releases deprecate `body=` in favour of
# `settings=` / `mappings=` — confirm against the installed client version.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)