In [4]:
from dotenv import load_dotenv
import os
from data_formats import *
import json
from opensearchpy import OpenSearch
from opensearchpy import helpers
from tqdm import tqdm
assert load_dotenv()
# os.environ["NOVA_SEARCH_PW"]


index_name = os.environ["NOVA_SEARCH_US"]

# Create the client with SSL/TLS enabled, but hostname verification disabled.
client = OpenSearch(
    hosts = [{'host': os.environ["NOVA_SEARCH_HOST"], 'port': os.environ["NOVA_SEARCH_PORT"]}],
    http_compress = True, # enables gzip compression for request bodies
    http_auth = (os.environ["NOVA_SEARCH_US"], os.environ["NOVA_SEARCH_PW"]),
    url_prefix = 'opensearch',
    use_ssl = True,
    verify_certs = False,
    ssl_assert_hostname = False,
    ssl_show_warn = False
)

client.indices.exists(index_name)

True

In [5]:
with open("data_with_embeddings.json", "r") as f:
    
    data = json.load(f)
    
data = [Recipe(**d) for d in data]

EMBEDDING_DIM = len(data[0].embedding)

In [6]:
index_body = {
    "settings": {
        "index": {
            "number_of_replicas": 0,
            "number_of_shards": 4,
            "refresh_interval": "1s",
            "knn": True,
        }
    },
    "mappings": {
        "dynamic": "strict",
        "properties": {
            "displayName": {
                "type": "text",
                "analyzer": "standard",
                "similarity": "BM25",
            },
            "description": {
                "type": "text",
                "analyzer": "standard",
                "similarity": "BM25",
            },
            "tools": {
                "type": "nested",
                "properties": {
                    "displayName": {"type": "text", "analyzer": "standard"},
                    "images": {
                        "type": "nested",
                        "properties": {"url": {"type": "keyword"}},
                    },
                    "embedding": {
                        "type": "knn_vector",
                        "dimension": EMBEDDING_DIM,
                        "method": {
                            "name": "hnsw",
                            "space_type": "innerproduct",
                            "engine": "faiss",
                            "parameters": {"ef_construction": 256, "m": 48},
                        },
                    },
                },
            },
            "ingredients": {
                "type": "nested",
                "properties": {
                    "displayText": {"type": "text", "analyzer": "standard"},
                    "ingredient": {"type": "keyword"},
                    "ingredientId": {"type": "keyword"},
                    "quantity": {"type": "float"},
                    "unit": {"type": "keyword"},
                    "images": {
                        "type": "nested",
                        "properties": {"url": {"type": "keyword"}},
                    },
                    "embedding": {
                        "type": "knn_vector",
                        "dimension": EMBEDDING_DIM,
                        "method": {
                            "name": "hnsw",
                            "space_type": "innerproduct",
                            "engine": "faiss",
                            "parameters": {"ef_construction": 256, "m": 48},
                        },
                    },
                },
            },
            "images": {"type": "nested", "properties": {"url": {"type": "keyword"}}},
            "instructions": {
                "type": "nested",
                "properties": {
                    "stepNumber": {"type": "integer"},
                    "stepTitle": {"type": "text", "analyzer": "standard"},
                    "stepText": {"type": "text", "analyzer": "standard"},
                    "stepImages": {
                        "type": "nested",
                        "properties": {"url": {"type": "keyword"}},
                    },
                    "embedding": {
                        "type": "knn_vector",
                        "dimension": EMBEDDING_DIM,
                        "method": {
                            "name": "hnsw",
                            "space_type": "innerproduct",
                            "engine": "faiss",
                            "parameters": {"ef_construction": 256, "m": 48},
                        },
                    },
                },
            },
            "totalTimeMinutes" : {"type": "integer"},
            "embedding": {
                "type": "knn_vector",
                "dimension": EMBEDDING_DIM,
                "method": {
                    "name": "hnsw",
                    "space_type": "innerproduct",
                    "engine": "faiss",
                    "parameters": {"ef_construction": 256, "m": 48},
                },
            },
        },
    },
}

In [7]:
response = client.indices.delete(
    index = index_name,
    timeout = 10
)
response

{'acknowledged': True}

In [8]:
client.indices.create(index=index_name, body=index_body)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'user205'}

In [9]:
for id, recipe in enumerate(tqdm(data)):
    response = client.index(index=index_name, body=recipe.model_dump(), id=id)

  0%|          | 0/994 [00:00<?, ?it/s]

100%|██████████| 994/994 [05:19<00:00,  3.11it/s]


In [37]:
client.get(index=index_name, id=0)

{'_index': 'user205',
 '_type': '_doc',
 '_id': '0',
 '_version': 3,
 '_seq_no': 2,
 '_primary_term': 1,
 'found': True,
 '_source': {'displayName': 'How To Make Chicken Parmesan',
  'description': 'Master the classic dish of chicken Parmesan by starting with the chicken, choosing a marinara sauce you love, and using a trio of cheese. ',
  'tools': [{'displayName': 'Meat mallet or small saucepan',
    'images': [],
    'embedding': [-0.004053195007145405,
     -0.024163026362657547,
     -0.06125914677977562,
     0.019552865996956825,
     -0.05888981372117996,
     -0.029235390946269035,
     0.08615607023239136,
     0.05018667131662369,
     0.0535789430141449,
     -0.015074439346790314,
     0.06425481289625168,
     -0.05213938653469086,
     -0.08691108971834183,
     0.04176289588212967,
     -4.947096840623999e-06,
     -0.09501690417528152,
     0.158568874001503,
     0.025248082354664803,
     0.04101670905947685,
     -0.03483079746365547,
     -0.06679116934537888,
     

In [11]:
client.indices.close(index=index_name)

{'acknowledged': True, 'shards_acknowledged': False, 'indices': {}}

In [40]:
client.indices.open(index=index_name)

{'acknowledged': True, 'shards_acknowledged': True}

In [58]:
def search_by_recipy_name(name: str, size=5):

    query = {
        "size": size,
        "query": {
            "multi_match": {"query": name, "fields": ["displayName", "description"]}
        },
    }

    return client.search(body=query, index=index_name)


search_by_recipy_name("chicken parmesan")

{'took': 16,
 'timed_out': False,
 '_shards': {'total': 4, 'successful': 4, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 87, 'relation': 'eq'},
  'max_score': 9.324757,
  'hits': [{'_index': 'user205',
    '_type': '_doc',
    '_id': '557',
    '_score': 9.324757,
    '_source': {'displayName': 'Chicken Parmesan',
     'description': None,
     'tools': [],
     'ingredients': [{'displayText': '4  skinless, boneless chicken breast halves',
       'ingredient': None,
       'ingredientId': '32900cb9e09a58a7cc2c3ec42af0fcda56cbb456934b28168d1d12e8968dd3a4',
       'quantity': 4.0,
       'unit': 'COUNT',
       'images': [],
       'embedding': [0.03762403130531311,
        0.014560741372406483,
        -0.021826310083270073,
        0.012402557767927647,
        -0.02465919964015484,
        -0.02514045499265194,
        0.03478061780333519,
        -0.00496554234996438,
        -0.007680173963308334,
        0.001759665901772678,
        -0.03354646638035774,
        -0.127

In [66]:
def search_by_recipy_name(name: str, size=5):
    query = {
        "size": size,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {"query": name, "fields": ["displayName", "description"]}
                },
                "filter": {
                    "nested": {
                        "path": "images",
                        "query": {
                            "exists": {"field": "images.url"}
                        }
                    }
                }
            }
        },
    }

    return client.search(body=query, index=index_name)
search_by_recipy_name("chicken parmesan")['hits']['hits'][0]['_source']['images']

[{'url': 'https://m.media-amazon.com/images/S/alexa-kitchen-msa-na-prod/recipes/allrecipes/a0fcc1142b78560e19f57145293f616b78260c20615afecd48ae73482af98a02.jpg'}]

In [55]:
def get_recipy_by_ingredients(ingredients: List[str], min_should=-1, size=5):

    query = {
        "size": size,
        "query": {
            "bool": {
                "should": [
                    {
                        "nested": {
                            "path": "ingredients",
                            "query": {"multi_match": {'query' : ingredient, 'fields' : ['ingredients.displayText', 'ingredients.ingredient']}},
                        }
                    }
                    for ingredient in ingredients
                ],
                "minimum_should_match": (
                    min_should if min_should > 0 else len(ingredients)
                ),
            }
        }
    }

    return client.search(body=query, index=index_name)

get_recipy_by_ingredients(["oregano", "chicken", "butter"])

{'took': 14,
 'timed_out': False,
 '_shards': {'total': 4, 'successful': 4, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 3, 'relation': 'eq'},
  'max_score': 15.012972,
  'hits': [{'_index': 'user205',
    '_type': '_doc',
    '_id': '415',
    '_score': 15.012972,
    '_source': {'displayName': "Mike's Polish Smothered Chicken",
     'description': None,
     'tools': [],
     'ingredients': [{'displayText': 'Italian dressing or marinade, to taste',
       'ingredient': None,
       'ingredientId': 'aba9a20a31840377347e22108ebc1ea54cebb8edf8e566e22ac3febd59fa21da',
       'quantity': 1.0,
       'unit': 'TO_TASTE',
       'images': [],
       'embedding': [-0.10482634603977203,
        -0.033175043761730194,
        0.006261628121137619,
        0.04040108248591423,
        -0.08248759806156158,
        0.008138930425047874,
        0.08492666482925415,
        -0.01186260487884283,
        -0.011555724777281284,
        -0.1001816987991333,
        0.06172015890479088,
  

In [59]:
def recipy_with_images(size=5):

    query = {
        "size": size,
        "query": {"exists": {"field": "images"}}
    }

    return client.search(body=query, index=index_name)

recipy_with_images()

{'took': 3,
 'timed_out': False,
 '_shards': {'total': 4, 'successful': 4, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 0, 'relation': 'eq'},
  'max_score': None,
  'hits': []}}