In [4]:
import pandas as pd
from tqdm.auto import tqdm

from meal_mentor.ingest import ingest_data


In [1]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)


def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [2]:
def evaluate(ground_truth, search_function):
    relevance_total = []
    results_dict = {}
    for q in tqdm(ground_truth):
        try:
            doc_id = q['id']
            results = search_function(q)

            relevance = [str(d['id']) == str(doc_id) for d in results]

            results_dict[q['id']] = (q, results)
            relevance_total.append(relevance)
    
        except Exception as e:
            print(f"Error processing query: {q} with exeption: {e}")

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

## Evaluate retrieval using Elastic Search and the ground truth data generated with gpt-4o-mini


In [31]:
df_question = pd.read_csv('../data/ground-truth-retrieval_4o.csv')
df_question.head()

Unnamed: 0,id,question
0,0,What is the protein content in the Potato Latk...
1,0,Can you tell me how many grams of carbohydrate...
2,0,I need to know the fat content in the Potato L...
3,0,What type of cuisine is the Potato Latkes Made...
4,0,Is the Potato Latkes Made Simple recipe suitab...


In [32]:
ground_truth = df_question.to_dict(orient='records')
ground_truth[0]

{'id': 0,
 'question': 'What is the protein content in the Potato Latkes Made Simple recipe?'}

In [33]:
documents = ingest_data(file_path="../data/recipes.csv")
documents[0]

{'id': 0,
 'diet_type': 'dash',
 'recipe_name': 'Potato Latkes Made Simple: A Twitter Recipe',
 'cuisine_type': 'kosher',
 'protein(g)': 31.55,
 'carbs(g)': 110.84,
 'fat(g)': 118.28}

#### Create Embeddings using Sentence Transformer

In [34]:
# Please perform "pip install sentence_transformers==2.7.0"
from sentence_transformers import SentenceTransformer

# if you get an error do the following:
# 1. Uninstall numpy 
# 2. Uninstall torch
# 3. pip install numpy==1.26.4
# 4. pip install torch
# run the above cell, it should work
model = SentenceTransformer("all-mpnet-base-v2")

In [35]:
# Created the dense vector using the pre-trained model
operations = []
embeddings = []
for doc in documents:
    # Transforming the title into an embedding using the model
    doc["recipe_name_vector"] = model.encode(doc["recipe_name"]).tolist()
    operations.append(doc)
    embeddings.append(doc["recipe_name_vector"])

In [36]:
len(operations[0].get("recipe_name_vector"))

768

#### Step 3: Setup ElasticSearch connection

In [37]:
from elasticsearch import Elasticsearch

es_client = Elasticsearch('http://localhost:9200')

es_client.info()

ObjectApiResponse({'name': '03bdc84c18b3', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'kTLA9jVdR3KZAJ28zgLU7Q', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

#### Step 4: Create Mappings and Index  
* Mapping is the process of defining how a document, and the fields it contains, are stored and indexed.

* Each document is a collection of fields, which each have their own data type.

* We can compare mapping to a database schema in how it describes the fields and properties that documents hold, the datatype of each field (e.g., string, integer, or date), and how those fields should be indexed and stored 

In [38]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties":
            {
                'id': {"type": "keyword"},
                'diet_type': {"type": "text"},
                'recipe_name': {"type": "text"},
                'cuisine_type': {"type": "text"},
                'protein(g)': {"type": "text"},
                'carbs(g)': {"type": "text"},
                'fat(g)': {"type": "text"},
                'recipe_name_vector': {"type": "dense_vector", "dims": 768, "index": True, "similarity": "cosine"}
            }
    }
}

In [39]:
index_name = "recipes"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'recipes'})

#### Step 5: Add documents into index

In [40]:
for doc in operations:
    try:
        es_client.index(index=index_name, document=doc)
    except Exception as e:
        print(e)

#### Step 6: Create end user query


In [41]:
search_term = "Recommend a nice Italian low carb recipe to me."
vector_search_term = model.encode(search_term)

In [42]:
query = {
    "field": "recipe_name_vector",
    "query_vector": vector_search_term,
    "k": 5,
    "num_candidates": 10000,
}

In [43]:
res = es_client.search(index=index_name, knn=query,
                       source=["diet_type", "recipe_name", "cuisine_type", "protein(g)", "carbs(g)", "fat(g)"])
res["hits"]["hits"]

[{'_index': 'recipes',
  '_id': 'JkPX1ZIBp2E4dDQX0eMA',
  '_score': 0.8027644,
  '_source': {'protein(g)': 95.67,
   'carbs(g)': 97.25,
   'recipe_name': 'Low Carb Keto Asiago Alfredo Brussels Sprouts',
   'diet_type': 'keto',
   'fat(g)': 191.06,
   'cuisine_type': 'italian'}},
 {'_index': 'recipes',
  '_id': 'G0PX1ZIBp2E4dDQX0ONA',
  '_score': 0.7833885,
  '_source': {'protein(g)': 108.02,
   'carbs(g)': 12.96,
   'recipe_name': 'Keto Pork Chops al Pastor - Low Carb',
   'diet_type': 'keto',
   'fat(g)': 151.9,
   'cuisine_type': 'italian'}},
 {'_index': 'recipes',
  '_id': 'CUPX1ZIBp2E4dDQXz-Md',
  '_score': 0.7799237,
  '_source': {'protein(g)': 104.47,
   'carbs(g)': 48.84,
   'recipe_name': 'Broccoli & Chicken Stir Fry Recipe – Low Carb, Keto & Paleo recipes',
   'diet_type': 'keto',
   'fat(g)': 99.09,
   'cuisine_type': 'chinese'}},
 {'_index': 'recipes',
  '_id': 'CkPX1ZIBp2E4dDQXz-Ms',
  '_score': 0.77932715,
  '_source': {'protein(g)': 93.66,
   'carbs(g)': 25.74,
   'recipe

#### Step 7: Perform Keyword search with Semantic Search (Hybrid/Advanced Search)

In [44]:
knn_query = {
    "field": "recipe_name_vector",
    "query_vector": vector_search_term,
    "k": 5,
    "num_candidates": 10000
}

In [45]:
response = es_client.search(
    index=index_name,
    query={
        "match": {"cuisine_type": "Italian"},
    },
    knn=knn_query,
    size=5
)

In [46]:
response["hits"]["hits"]

[{'_index': 'recipes',
  '_id': 'JkPX1ZIBp2E4dDQX0eMA',
  '_score': 2.8540435,
  '_source': {'id': 81,
   'diet_type': 'keto',
   'recipe_name': 'Low Carb Keto Asiago Alfredo Brussels Sprouts',
   'cuisine_type': 'italian',
   'protein(g)': 95.67,
   'carbs(g)': 97.25,
   'fat(g)': 191.06,
   'recipe_name_vector': [-0.02249760925769806,
    0.009652617387473583,
    -0.018066462129354477,
    0.001840426935814321,
    0.04174616560339928,
    -0.01123574934899807,
    -0.043377868831157684,
    0.08697789162397385,
    -0.02272709459066391,
    0.07083239406347275,
    -0.029090359807014465,
    0.0016267204191535711,
    -0.024811001494526863,
    0.1129797101020813,
    -0.027997486293315887,
    0.019111577421426773,
    0.020239463075995445,
    0.03560256585478783,
    -0.027110178023576736,
    -0.030802562832832336,
    0.006963629275560379,
    -0.02233368530869484,
    0.017135178670287132,
    0.017739761620759964,
    -0.056063804775476456,
    -0.002268361859023571,
    0.0

## Evaluate retrieval using Elastic Search and the ground truth data generated with gpt-4o-mini

In [47]:
import numpy as np


class VectorSearchEngine():
    def __init__(self, documents, embeddings):
        self.documents = documents
        self.embeddings = embeddings

    def search(self, v_query, num_results=10):
        scores = self.embeddings.dot(v_query)
        idx = np.argsort(-scores)[:num_results]
        return [self.documents[i] for i in idx]

In [48]:
X = np.array(embeddings)
X.shape

(200, 768)

In [49]:
v = model.encode("A nice low carb recipe thats italian")


In [50]:
search_engine = VectorSearchEngine(documents=documents, embeddings=X)
search_engine.search(v, num_results=5)

[{'id': 81,
  'diet_type': 'keto',
  'recipe_name': 'Low Carb Keto Asiago Alfredo Brussels Sprouts',
  'cuisine_type': 'italian',
  'protein(g)': 95.67,
  'carbs(g)': 97.25,
  'fat(g)': 191.06,
  'recipe_name_vector': [-0.02249760925769806,
   0.009652617387473583,
   -0.018066462129354477,
   0.001840426935814321,
   0.04174616560339928,
   -0.01123574934899807,
   -0.043377868831157684,
   0.08697789162397385,
   -0.02272709459066391,
   0.07083239406347275,
   -0.029090359807014465,
   0.0016267204191535711,
   -0.024811001494526863,
   0.1129797101020813,
   -0.027997486293315887,
   0.019111577421426773,
   0.020239463075995445,
   0.03560256585478783,
   -0.027110178023576736,
   -0.030802562832832336,
   0.006963629275560379,
   -0.02233368530869484,
   0.017135178670287132,
   0.017739761620759964,
   -0.056063804775476456,
   -0.002268361859023571,
   0.0012041792506352067,
   0.04353763163089752,
   0.0017505692085251212,
   -0.045998115092515945,
   -0.0027209280524402857,
 

In [52]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)


def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)


def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['id']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [53]:
def numpy_cosine_search(q):
    question = q['question']

    v_q = model.encode(question)

    return search_engine.search(v_q, num_results=5)

In [54]:
from tqdm import tqdm
evaluate(ground_truth, numpy_cosine_search)

100%|██████████| 1000/1000 [01:23<00:00, 11.98it/s]


{'hit_rate': 0.974, 'mrr': 0.9506833333333334}

{'hit_rate': 0.929, 'mrr': 0.9034500000000001}

# Approach 2

In [55]:
from elasticsearch import Elasticsearch

es_client = Elasticsearch('http://localhost:9200')

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties":
            {
                'id': {"type": "keyword"},
                'diet_type': {"type": "text"},
                'recipe_name': {"type": "text"},
                'cuisine_type': {"type": "text"},
                'protein(g)': {"type": "text"},
                'carbs(g)': {"type": "text"},
                'fat(g)': {"type": "text"},
                'recipe_name_vector': {"type": "dense_vector", "dims": 768, "index": True, "similarity": "cosine"}
            }
    }
}

index_name = "recipes"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'recipes'})

In [56]:
for d, emb in zip(tqdm(documents), embeddings):
    d['recipe_name_vector'] = emb
    es_client.index(index=index_name, document=d)


100%|██████████| 200/200 [00:03<00:00, 60.51it/s]


In [57]:
documents[0]['recipe_name_vector']


[0.025358784943819046,
 -0.05348053574562073,
 -0.004305927082896233,
 -0.0452263168990612,
 -0.007009272929280996,
 -0.010723767802119255,
 -0.05304490774869919,
 0.03940995782613754,
 0.038392506539821625,
 0.04558773711323738,
 0.041243258863687515,
 -0.03479721024632454,
 -0.007507154252380133,
 0.05227464437484741,
 -0.014766673557460308,
 0.0008408072753809392,
 -0.011344579048454762,
 0.04044389724731445,
 -0.018833311274647713,
 -0.0016189852030947804,
 0.0016307939076796174,
 0.03665860369801521,
 0.02874969132244587,
 -0.005517846904695034,
 -0.01990666799247265,
 0.02916085347533226,
 -0.00019057909958064556,
 0.05754733458161354,
 -0.004746710881590843,
 -0.053623124957084656,
 -0.014328332617878914,
 0.01509995199739933,
 -0.02710817940533161,
 -0.021249152719974518,
 1.5286449297491345e-06,
 0.04052254557609558,
 -0.014001375064253807,
 -0.006686178036034107,
 -0.03702076897025108,
 0.06002744659781456,
 0.012844227254390717,
 -0.038929782807826996,
 -0.027645954862236977

In [58]:
def elastic_search_knn(field, vector, filter=None):
    knn = {
        "field": field,
        "query_vector": vector,
        "k": 5,
        "num_candidates": 10000,
        # "filter": {
        #     "term": {
        #         "course": course
        #     }
        # }
    }

    search_query = {
        "knn": knn,
        "_source": ["recipe_name", "cuisine_type", "protein(g)", "carbs(g)", "fat(g)", "id"]

    }

    es_results = es_client.search(
        index=index_name,
        body=search_query
    )

    result_docs = []

    for hit in es_results['hits']['hits']:
        result_docs.append(hit['_source'])

    return result_docs

In [59]:
elastic_search_knn('recipe_name_vector', v)


[{'protein(g)': 95.67,
  'carbs(g)': 97.25,
  'recipe_name': 'Low Carb Keto Asiago Alfredo Brussels Sprouts',
  'id': 81,
  'fat(g)': 191.06,
  'cuisine_type': 'italian'},
 {'protein(g)': 149.54,
  'carbs(g)': 21.9,
  'recipe_name': 'Easiest Keto Meatza You’ll Ever Make! recipes',
  'id': 59,
  'fat(g)': 110.64,
  'cuisine_type': 'american'},
 {'protein(g)': 93.66,
  'carbs(g)': 25.74,
  'recipe_name': 'Keto Pesto Chicken Pasta',
  'id': 53,
  'fat(g)': 125.36,
  'cuisine_type': 'italian'},
 {'protein(g)': 120.35,
  'carbs(g)': 97.18,
  'recipe_name': 'Italian Sausage & Vegetable Stew recipes',
  'id': 111,
  'fat(g)': 216.44,
  'cuisine_type': 'italian'},
 {'protein(g)': 39.46,
  'carbs(g)': 26.2,
  'recipe_name': 'Low Carb Keto "Cornbread" recipes',
  'id': 48,
  'fat(g)': 99.89,
  'cuisine_type': 'american'}]

In [60]:
def question_vector_knn(q):
    question = q['question']
    # course = 'machine-learning-zoomcamp'

    v_q = model.encode(question)

    return elastic_search_knn('recipe_name_vector', v_q)

In [61]:
question_vector_knn(ground_truth[10])


[{'protein(g)': 44.09,
  'carbs(g)': 153.84,
  'recipe_name': 'Bread Salad',
  'id': 2,
  'fat(g)': 86.03,
  'cuisine_type': 'american'},
 {'protein(g)': 32.18,
  'carbs(g)': 3.95,
  'recipe_name': 'Mustard Sardines Salad Recipe [Paleo, Keto]',
  'id': 51,
  'fat(g)': 14.99,
  'cuisine_type': 'american'},
 {'protein(g)': 1.91,
  'carbs(g)': 3.77,
  'recipe_name': 'Mediterranean Tuna Salad',
  'id': 107,
  'fat(g)': 28.61,
  'cuisine_type': 'mediterranean'},
 {'protein(g)': 43.48,
  'carbs(g)': 102.28,
  'recipe_name': 'Gluten Free, Keto Friendly, Low Carb, Pie crust',
  'id': 72,
  'fat(g)': 134.1,
  'cuisine_type': 'british'},
 {'protein(g)': 39.75,
  'carbs(g)': 149.57,
  'recipe_name': 'Crispy Fried Spinach with Tomato, Onion, Tamarind, and Yogurt Recipe',
  'id': 177,
  'fat(g)': 124.56,
  'cuisine_type': 'american'}]

In [62]:
evaluate(ground_truth, question_vector_knn)


100%|██████████| 1000/1000 [01:39<00:00, 10.03it/s]


{'hit_rate': 0.974, 'mrr': 0.9506833333333334}