In [2]:
! pip install bs4 sentence-transformers tqdm langchain_community tiktoken langchain-openai langchainhub chromadb langchain



In [1]:
import os 
import requests
import bs4
import tiktoken
import numpy as np
import json
import re
import pandas as pd
from langchain.llms.base import LLM
from langchain import hub
from langchain_community.document_loaders import PyPDFLoader, WebBaseLoader, GoogleDriveLoader, UnstructuredWordDocumentLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_openai import ChatOpenAI  # Uses OpenAI-compatible API
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import ChatPromptTemplate
from langchain.chains import RetrievalQA

USER_AGENT environment variable not set, consider setting it to identify your requests.


**Helper functions**

In [2]:
# loading pdf contents
def load_pdf(path: str):
    loader = PyPDFLoader(path)
    return loader.load()

def load_word(path: str):
    return UnstructuredWordDocumentLoader(path).load()
    
# loading website contents
def load_web(path: str):
    loader = WebBaseLoader(
        web_path = (path,),
        bs_kwargs = dict(
          parse_only = bs4.SoupStrainer(
              class_ = ("post-content", "post-title", "post-header") # depending on CSS class
          )  
        ),
    )
    return loader

# loading google doc contents (see below)

def normalize(text):
    return re.sub(r'[^a-z]', '', str(text).lower())

def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2)

def extract_json_from_llm_output(output: str) -> dict:
    """
    Extracts and parses a JSON object from LLM output that may include Markdown formatting.
    Handles triple backticks, optional language labels, and excessive whitespace.
    """
    output = output.strip()

    # Match content between ```json ... ``` or just ``` ... ```
    match = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", output, re.DOTALL)
    
    if match:
        json_str = match.group(1)
    else:
        json_str = output

    return json.loads(json_str)

def get_doc_id(url):
    return url.split("/")[5]

OpenAI embeddings don't work in HK (even with VPN). Hence, HuggingFace embedding model was used. Also feel free to test out API calls from DeepSeep or replace it with your choice of LLM for comparison.

In [3]:
embd = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# change to bedrock llm
llm = ChatOpenAI(
    openai_api_base="https://api.deepseek.com/v1",
    openai_api_key="sk-ea1868b36aa34a36be9a223e75c1c63c", 
    model="deepseek-chat"
)

  embd = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import spacy

# load spacy models
nlp_textcat = spacy.load("spaCy/textcat_model")
nlp_ner = spacy.load("spaCy/ner_model")
nlp_ner_food = spacy.load("spaCy/ner_food_model")



**Intermediate Step: Preparing Google Cloud API**
- for loading contents of Google Docs
- you can also try loading content using **load_pdf()** and **load_web**
- the code below uses my Google Cloud credentials included in this folder

In [6]:
! pip install --upgrade google-auth google-auth-oauthlib google-api-python-client

Collecting google-auth-oauthlib
  Using cached google_auth_oauthlib-1.2.2-py3-none-any.whl.metadata (2.7 kB)
Collecting google-api-python-client
  Downloading google_api_python_client-2.174.0-py3-none-any.whl.metadata (7.0 kB)
Collecting httplib2<1.0.0,>=0.19.0 (from google-api-python-client)
  Using cached httplib2-0.22.0-py3-none-any.whl.metadata (2.6 kB)
Collecting google-auth-httplib2<1.0.0,>=0.2.0 (from google-api-python-client)
  Using cached google_auth_httplib2-0.2.0-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0,>=1.31.5 (from google-api-python-client)
  Using cached google_api_core-2.25.1-py3-none-any.whl.metadata (3.0 kB)
Collecting uritemplate<5,>=3.0.1 (from google-api-python-client)
  Using cached uritemplate-4.2.0-py3-none-any.whl.metadata (2.6 kB)
Collecting proto-plus<2.0.0,>=1.22.3 (from google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0,>=1.31.5->google-api-python-client)
  Using cached proto_plus-1.26.1-py3

In [31]:
from google_auth_oauthlib.flow import InstalledAppFlow

# Correct scope
SCOPES = ['https://www.googleapis.com/auth/drive.readonly']

def generate_token():
    flow = InstalledAppFlow.from_client_secrets_file('credentials_google.json', SCOPES)
    creds = flow.run_local_server(port=0)

    with open('token.json', 'w') as token:
        token.write(creds.to_json())

    print("✅ token.json generated successfully.")

generate_token()


Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=808603111555-e7db6vhbucu4hj1ovgmd01ikvsnqn8ul.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A52225%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.readonly&state=urGcMPuC2LXklegz4rE1qcXPpuFTuJ&access_type=offline
✅ token.json generated successfully.


doc_links contains the Google Docs from the Sommelier Knowledge Base

In [5]:
docs_links = [
    "https://docs.google.com/document/d/1MvX9CTrVcoWg7WLAscq2MmnhTIrR0hZIGkpJMqhgflo/edit?tab=t.0",
    "https://docs.google.com/document/d/1NcV9_JGjMfA4WlihW3vNTduy24NBWdy1RWXrA2W0BIk/edit?usp=sharing",
    "https://docs.google.com/document/d/193rx2Rh6u-Ud40k-rgnqSQs-94SvHdeXPrPxOWK59X0/edit?usp=sharing",
    "https://docs.google.com/document/d/1vRDsn5o5mdymOEJ_O0tS4wcOjsAjt_2mLZqFfvgDUOs/edit?usp=sharing",
    "https://docs.google.com/document/d/1JceLBII727AZzSrDFfdGthJ1G4PhCDsA8sEm_dQMVr0/edit?usp=sharing",
    "https://docs.google.com/document/d/1yonU4qcysNkgd0BvbFmeIW9NF2ARErRJVW8QZynJyvM/edit?usp=sharing",
    "https://docs.google.com/document/d/1bq2AE1Jy6cQFt1xgjqtkof12Lw6F6fujqTlN1nZnh0A/edit?usp=sharing",
    "https://docs.google.com/document/d/1i-OcQeo7XOG83gS2ay2u0SLMWs4f8FG0JE_7l87qJkw/edit?usp=sharing",
    "https://docs.google.com/document/d/1PyZE8v3S3aUY66lFn97q0vaXDHc60lyso2oUFP0htjY/edit?usp=sharing",
    "https://docs.google.com/document/d/1Dudd7-6yl_UQrxfGa3MJZlKOfzHQdqWg0fb8Z9RzBec/edit?usp=sharing",
    "https://docs.google.com/document/d/1cjhrhCccuwiIh0ujj8QeamJ2JHhI6CjPmO84t1DSRZ0/edit?usp=sharing",
    "https://docs.google.com/document/d/1ESlfU6v8jseFlllZb3eaUeCJt69EIMsZiyMrDac-wX8/edit?usp=sharing",
    "https://docs.google.com/document/d/1xQhAkC3oP2cb262EjaHCV6CxEgeEGUsKrC8pH2p6RiY/edit?usp=sharing"
    
]

g_docs = GoogleDriveLoader(
    document_ids = [get_doc_id(i) for i in docs_links],
    credentials_path = "credentials_google.json",
    token_path = "token.json"
).load()

  g_docs = GoogleDriveLoader(


Chroma seems to be a convenient alternative, as a chroma vectorstore can be converted directly into a retriever that returns k relevant documents. 

In [6]:
# loading sample pdf document
path = "wine_food_pairing_knowledge.pdf"
word = load_pdf(path)
g_docs = g_docs + word

In [7]:
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size = 300, 
    chunk_overlap = 50
)

splits = text_splitter.split_documents(g_docs)

vectorstore = Chroma.from_documents(
    documents = splits,
    embedding = embd
)

retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True
)


**Preparing Pydantic Schemas and LLM Modelfiles**

In [8]:
from typing import List, Optional
from pydantic import BaseModel, Field

# define simple schema as template, map doc.ent if present else None
class WineMetadata(BaseModel):
    """Schema for metadata filters"""
    wine_name: Optional[str] = Field(default = None, description = "Specific wine name mentioned in query")
    max_price: Optional[float] = Field(default = None, description = "Specific max price mentioned in the query")
    min_price: Optional[float] = Field(default = None, description = "Specific min price mentioned in the query")
    wine_type: Optional[str] = Field(default = None, description = "Specific wine type (e.g., white, red, sparkling, etc.)")
    
    

In [9]:
# for generating taste profiles

system = """You are a master-level sommelier and wine data expert.
Your task is to create a hypothetical, ideal wine profile in the form of structured metadata based on a user's preference, question, or context.

This wine does not have to exist — it should represent the best possible match for what the user is looking for.

You must output a plausible, detailed JSON object that aligns with the schema below.

Do not refer back to the user query. Do not explain. Only output the JSON.

{
  "wine_name": "str",              // Specific wine name mentioned
  "winemaker": "str",              // Name of the winemaker or producer
  "vintage": "int",                // Vintage year (e.g. 2015)
  "country": "str",                // Country of origin
  "region": "str",                 // Region or appellation
  "wine_type": "str",              // Type of wine (e.g. red, white, rosé, sparkling)
  "wine_grapes": "str",            // Grape variety or blend (e.g. Merlot, Syrah)
  "occasion": "str",               // Occasion suitability (e.g. "wedding", "gift", "everyday")
  "body": "str",                   // Body type (e.g. "light", "full-bodied")
  "acidity": "str",                // Acidity level (e.g. "crisp", "low")
  "alcohol": "float",              // Alcohol content (as percentage, e.g. 13.5)
  "fruitiness": "str",             // Level of fruitiness (e.g. "dry", "juicy")
  "minerality": "str",             // Presence of mineral notes (e.g. "chalky", "flinty")
  "sweetness": "str",              // Sweetness level (e.g. "dry", "semi-sweet", "sweet")
}

"""

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "{question}"),
    ]
)

In [10]:
FIELD_MAP = {
    "wine_name": "Product Name",
    "min_price": "WS Retail Price",
    "max_price": "WS Retail Price",
    "wine_type": "Wine Type",
}

**More helper functions**

In [11]:
# Stopwatch
import time
import random
from statistics import quantiles
from functools import wraps

def timed(log_times):
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            start = time.time()
            result = func(*args, **kwargs)
            end = time.time()
            log_times[func.__name__] = round(end - start, 4)
            return result
        return wrapper
    return decorator

log_times = {}

# identify if the intent is to recommend food or wine
def recommend_wine(query:str):
    category = nlp_textcat(query)
    recommend_wine = category.cats["recommend_wine"]
    recommend_food = category.cats["recommend_food"]
    if recommend_wine > recommend_food:
        return True
    else:
        print("recommend food")
        return False

def get_price(wine, price_field = "price"):
    try:
        return float(wine.get(price_field, 0))
    except (ValueError, TypeError):
        return 0.0

def sample_quartiles(wines, price_field="price", k_per_quartile=10):
    prices = [get_price(w, price_field) for w in wines]
    if not prices:
        return []

    q1, q2, q3 = quantiles(prices, n=4)

    buckets = {
        "Q1": [w for w in wines if get_price(w, price_field) <= q1],
        "Q2": [w for w in wines if q1 < get_price(w, price_field) <= q2],
        "Q3": [w for w in wines if q2 < get_price(w, price_field) <= q3],
        "Q4": [w for w in wines if q3 < get_price(w, price_field)]
    }

    sampled = []
    for group in buckets.values():
        if group:
            sampled.extend(random.sample(group, min(k_per_quartile, len(group))))
    return sampled                         
        

<br>

**Main functions***

In [12]:
@timed(log_times)
def query_analyzer(query: str):
    """ Extracts structured wine metadata from a natural language query
    using a spaCy NER model and returns it as a WineMetadata object."""
    
    if recommend_wine(query):
        doc = nlp_ner(query)
        all_labels = nlp_ner.get_pipe("ner").labels
        result = {label: None for label in all_labels}
        
        for ent in doc.ents:
            if (ent.label_ == "min_price") or (ent.label_ == "max_price"):
                result[ent.label_] = float(ent.text)
            elif (ent.label_ == "wine_type"):
                if ent.text in ["red", "white", "sparkling"]:
                    result[ent.label_] = ent.text
                else:
                    result[ent.label_] = None 
            else:
                result[ent.label_] = ent.text
            
        print(result)
        return WineMetadata(**result)

@timed(log_times)
def filter_wines(data, model_instance, field_map, max_results=20):
    """
    Given metadata filters constructed from user query using LLM,
    returns first n wine profiles that match. Also includes fallback logic
    """
    filters = model_instance.model_dump(exclude_none=True)
    results = []
    price_field = "price"

    # identify filter state
    has_filters = bool(filters)
    has_price = "min_price" in filters or "max_price" in filters
    has_type = "wine_type" in filters
    
    # case 1: have both filters
    if has_filters and ( len(filters) > 1 or has_price ):
        for wine in data:
            match = True
            for key, value in filters.items():
    
                if key not in field_map:
                    print(f"{key} not in field map")
                    continue
                    
                if key in ("min_price", "max_price"):
                    wine_price = get_price(wine, price_field)
                    if key == "min_price" and wine_price < value:
                        match = False
                        break
                    if key == "max_price" and wine_price > value:
                        match = False
                        break
    
                else:
                    field = field_map.get(key)
                    if field not in wine:
                        # print(f"{field} not in wine")
                        match = False
                        break
                    wine_val = normalize(wine[field])
                    query_val = normalize(value)
                    if query_val not in wine_val:
                        # print(f"{query_val} not in {wine_val}")
                        match = False
                        break
    
            if match:
                results.append(wine)
                if len(results)== max_results:
                    break
    
        return results

    # case 2: filter only has wine_type (no price)
    elif has_type and not has_price:
        wine_type = normalize(filters["wine_type"])
        type_filtered = [w for w in data if normalize(w.get(field_map["wine_type"], "")) == wine_type]
        return sample_quartiles(type_filtered, price_field=price_field, k_per_quartile=4)

    else:
        type_buckets = {
            "red": [],
            "white": [],
            "sparkling": []
        }
        for w in data:
            wt = normalize(w.get(field_map.get("wine_type", "wine_type"), ""))
            if wt in type_buckets:
                type_buckets[wt].append(w)

        sampled = []
        sampled += sample_quartiles(type_buckets["red"], price_field, k_per_quartile=10)
        sampled += sample_quartiles(type_buckets["white"], price_field, k_per_quartile=10)
        sampled += sample_quartiles(type_buckets["sparkling"], price_field, k_per_quartile=10)

        return sampled[:max_results]


@timed(log_times)
def create_taste_profile(query: str):
    """
    Generates a hypothetical document embedding (HyDE) of 
    the ideal taste profile based on user query.
    """
    messages = [
        SystemMessage(content=system),
        HumanMessage(content=query)
    ]
    response = llm.invoke(messages)
    print(response.content + "\n")
    
    return response.content         

    
@timed(log_times)
def generate_recommendations(filtered, profile, top_k=5):
    """
    Performs similarity search between embeddings of (a) each of the filtered wines
    and (b) enriched taste profile from query, if present otherwise randomly 
    sample 3 wines (cheapest -> middle -> most expensive)
    """
    if len(filtered) == 0:
        return None
        
    if len(filtered) < top_k:
        return filtered
        
    if profile:
        embedded_wines = []
        for wine in filtered:
            content = "\n".join(f"{key}: {value}" for key, value in wine.items() if value)
            wine_embed = embd.embed_query(content)
            embedded_wines.append((wine, wine_embed))

        profile_embed = embd.embed_query(profile)
        scored_embed = [(wine, cosine_similarity(profile_embed, wine_embed)) for wine, wine_embed in embedded_wines]
        top_wines = sorted(scored_embed, key=lambda x: x[1], reverse=True)[:top_k]
        return [wine for wine, _ in top_wines]

    else:
        sorted_wines = sorted(filtered, key=lambda x: float(x.WS_Retail_Price))
        n = len(sorted_wines)
        if n == 0:
            return []
        step = max(1, n // top_k)
        sampled = [random.choice(sorted_wines[i:i+step]) for i in range(0, n, step)][:top_k]

        return sampled 

def ask_ai(question, data, field_map):
    start_main = time.time()
    parsed_query = query_analyzer(question)
    filtered = filter_wines(data=data, model_instance=parsed_query, field_map=field_map)
    profile = create_taste_profile(question)
    recommendations = generate_recommendations(filtered, profile)

    # Print timing summary
    total_time = round(time.time() - start_main, 4)
    print("Timing summary:")

    for name, duration in log_times.items():
        print(f"  {name}: {duration} seconds")
        
    print(f"  Total time: {total_time} seconds")
    
    return recommendations

In [24]:
wine_data = "wine_data.csv"
food_data = "food_data.csv"

wine_df = pd.read_csv(wine_data)
food_df = pd.read_csv(food_data)

wine_json = wine_df.to_dict(orient="records")
food_json = food_df.to_dict(orient="records")

### Wine-Food Pairing Tests

Questions:
1. What wine goes well with spicy Thai green curry with coconut milk?
2. Recommend a red wine under 300 HKD that pairs well with grilled lamb and comes from Spain
3. Suggest a celebratory wine that works with oysters and has high acidity.
4. I'm cooking mushroom risotto and want something medium-bodied and earthy to go with it.
5. Pair a bold Napa Cabernet Sauvignon with sushi.

In [19]:
question = "Looking for the perfect red to pair with a smoky eggplant curry — any suggestions?"
recommendation = ask_ai(question=question, data=wine_json, field_map=FIELD_MAP)
if recommendation == None:
    print("No recommendations found")

for j in range(5):
    try:
        print(f"Wine {j+1}:", recommendation[j]['Product Name'])
    except:
        print("No more suggested wines")
        break
print("\n")

{'max_price': None, 'min_price': None, 'wine_name': None, 'wine_type': None}
```json
{
  "wine_name": "Domaine de la Janasse Côtes du Rhône Villages",
  "winemaker": "Christophe Sabon",
  "vintage": 2018,
  "country": "France",
  "region": "Rhône Valley",
  "wine_type": "red",
  "wine_grapes": "Grenache, Syrah, Mourvèdre",
  "occasion": "dinner pairing",
  "body": "medium-bodied",
  "acidity": "balanced",
  "alcohol": 14.0,
  "fruitiness": "ripe",
  "minerality": "subtle",
  "sweetness": "dry"
}
```

Timing summary:
  query_analyzer: 0.0071 seconds
  filter_wines: 0.0 seconds
  create_taste_profile: 8.4404 seconds
  generate_recommendations: 2.1776 seconds
  Total time: 10.6333 seconds
Wine 1: Domaine Derey Freres Marsannay Blanc 2022
Wine 2: Michel Bouzereau, Meursault 1er Cru, Genevrieres 2008
Wine 3: Leroy Beaujolais Villages Primeur 2023
Wine 4: Stephanie Ogier Blanc De Ogier 2016
Wine 5: Vinas Mora Kaamen Primorska Hrvatska 2021




In [20]:
questions = [
"What wine goes well with spicy Thai green curry with coconut milk?",
"Recommend a red wine under 300 HKD that pairs well with grilled lamb and comes from Spain",
"Suggest a celebratory wine that works with oysters and has high acidity.",
"I'm cooking mushroom risotto and want something medium-bodied and earthy to go with it.",
"Pair a bold Napa Cabernet Sauvignon with sushi.",
]

for i in range(len(questions)):
# for i in range(2, 5, 2):
    print(questions[i])  # Correct variable name
    recommendation = ask_ai(question=questions[i], data=wine_json, field_map=FIELD_MAP)
    if recommendation == None:
        print("No recommendations found")
        continue

    for j in range(5):
        try:
            print(f"Wine {j+1}:", recommendation[j]['Product Name'])
        except:
            print("No more suggested wines")
            continue
    print("\n")






What wine goes well with spicy Thai green curry with coconut milk?
{'max_price': None, 'min_price': None, 'wine_name': None, 'wine_type': None}
```json
{
  "wine_name": "Hypothetical Harmony Reserve",
  "winemaker": "Vinya Siam",
  "vintage": 2021,
  "country": "Germany",
  "region": "Mosel",
  "wine_type": "white",
  "wine_grapes": "Riesling",
  "occasion": "spicy cuisine pairing",
  "body": "light-bodied",
  "acidity": "high",
  "alcohol": 9.5,
  "fruitiness": "juicy",
  "minerality": "flinty",
  "sweetness": "off-dry"
}
```

Timing summary:
  query_analyzer: 0.0135 seconds
  filter_wines: 0.0 seconds
  create_taste_profile: 8.4637 seconds
  generate_recommendations: 1.8019 seconds
  Total time: 10.2791 seconds
Wine 1: Blank Canvas Anandale Farm Riesling 2023
Wine 2: F.X. Pichler Durnsteiner Riesling Smaragd Wachau 2018
Wine 3: Michel Niellon Chassagne Montrachet 1er Cru Clos de la Maltroie 2020
Wine 4: Grgich Hills Fume Blanc 2021
Wine 5: Henri Jouan Chambolle Musigny Vieilles Vigne

### Food-Wine Pairing Tests

Questions
1. What are the best dishes to serve with a 2020 Puligny-Montrachet Chardonnay?
2. I have a bottle of Amarone della Valpolicella — what foods would pair well with it?
3. What should I cook for dinner to go with a chilled bottle of Sancerre?
4. Can you suggest a full-course meal to go with a vintage Champagne?
5. What kind of food works well with a sweet Riesling from Mosel?
   

In [20]:
# Base Models for Reverse Pairings
class FoodMetadata(BaseModel):
    """Structured metadata filters for querying a food product database."""
    
    dish_name: Optional[str] = Field(
        default=None, description="Name of the dish (e.g., 'roast duck', 'brie cheese')."
    )
    min_price: Optional[float] = Field(
        default=None, description="Minimum price filter (inclusive)."
    )
    max_price: Optional[float] = Field(
        default=None, description="Maximum price filter (inclusive)."
    )
    food_type: Optional[str] = Field(
        default=None, description="Food type (e.g., 'fruit', 'pastry', 'vegetarian')."
    )
    course: Optional[str] = Field(
        default=None, description="Course (e.g., 'starter', 'main', 'dessert')."
    )

    


In [26]:
food_system = """You are a master-level chef and food data expert.
Your task is to create a hypothetical, ideal food profile in the form of structured metadata based on a user's preference, question, or context.

This food does not have to exist — it should represent the best possible match for what the user is looking for.

You must output a plausible, detailed JSON object that aligns with the schema below.

Do not refer back to the user query. Do not explain. Only output the JSON.
{
    "dish_name": "str",
    "food_type": "str",                  // e.g. 'meat', 'vegetarian', 'pastry'
    "course": "str",                // e.g. 'starter', 'main', 'dessert'
    "regional_pairing": "str",     // e.g. 'Provence', 'Piedmont'
    "sweetness": "str",            // 1–10
    "salitiness": "str",           // 1–10
    "acidity": "str",              // 1–10
    "sourness": "str",             // 1–10
    "umami": "str"                 // 1–10
}
"""

food_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", food_system),
        ("human", "{question}"),
    ]
)

In [27]:
FOOD_FIELD_MAP = {
    "dish_name": "Product Name",
    "min_price": "Price",
    "max_price": "Price",
    "food_type": "Food Type",
    "course": "Course"
}

In [13]:
import time
from functools import wraps

def timed(log_times):
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            start = time.time()
            result = func(*args, **kwargs)
            end = time.time()
            log_times[func.__name__] = round(end - start, 4)
            return result
        return wrapper
    return decorator

log_times = {}

In [37]:
@timed(log_times)
def food_query_analyzer(query: str):
    """ Extracts structured food metadata from a natural language query
    using a spaCy NER model and returns it as a FoodMetadata object."""
    
    if not recommend_wine(query):
        doc = nlp_ner_food(query)
        all_labels = nlp_ner_food.get_pipe("ner").labels
        result = {label: None for label in all_labels}
        
        for ent in doc.ents:
            if (ent.label_ == "min_price") or (ent.label_ == "max_price"):
                result[ent.label_] = float(ent.text)
            # below codes are for labels with discrete values (e.g., type = {starter, main, dessert, etc.} 
            # to prevent unwanted values
            
            # elif (ent.label_ == ""):
            #     if ent.text in ["", "", ""]:
            #         result[ent.label_] = ent.text
            #     else:
            #         result[ent.label_] = None 
            
            else:
                result[ent.label_] = ent.text
            
        print(result)
        return FoodMetadata(**result)

@timed(log_times)
def filter_food(data, model_instance, field_map, max_results=20):
    """
    Given metadata filters constructed from user query using LLM,
    returns first n food taste profiles that match. Also includes fallback logic
    """
    filters = model_instance.model_dump(exclude_none=True)
    results = []
    price_field = "price"

    # identify filter state
    has_filters = bool(filters)
    has_price = "min_price" in filters or "max_price" in filters
    has_type = "food_type" in filters
    has_course = "course" in filters  # implement course logic
    
    # case 1: have both price and type filters
    if has_filters and ( len(filters) > 1 or has_price ):
        for food in data:
            match = True
            for key, value in filters.items():
    
                if key not in field_map:
                    print(f"{key} not in field map")
                    continue
                    
                if key in ("min_price", "max_price"):
                    food_price = get_price(food, price_field)
                    if key == "min_price" and food_price < value:
                        match = False
                        break
                    if key == "max_price" and food_price > value:
                        match = False
                        break
    
                else:
                    field = field_map.get(key)
                    if field not in food:
                        # print(f"{field} not in food")
                        match = False
                        break
                    food_val = normalize(food[field])
                    query_val = normalize(value)
                    if query_val not in food_val:
                        # print(f"{query_val} not in {food_val}")
                        match = False
                        break
    
            if match:
                results.append(food)
                if len(results)== max_results:
                    break
    
        return results

    # case 2: filter only has food_type (no price)
    elif has_type and not has_price:
        food_type = normalize(filters["food_type"])
        type_filtered = [w for w in data if normalize(w.get(field_map["food_type"], "")) == food_type]
        return sample_quartiles(type_filtered, price_field=price_field, k_per_quartile=4)

    # temp fallback: choose randomly from beef, chicken, and pork dishes
    else:
        type_buckets = {
            "beef": [],
            "chicken": [],
            "pork": []
        }
        for w in data:
            wt = normalize(w.get(field_map.get("food_type", "food_type"), ""))
            if wt in type_buckets:
                type_buckets[wt].append(w)

        sampled = []
        sampled += sample_quartiles(type_buckets["beef"], price_field, k_per_quartile=10)
        sampled += sample_quartiles(type_buckets["chicken"], price_field, k_per_quartile=10)
        sampled += sample_quartiles(type_buckets["pork"], price_field, k_per_quartile=10)

        return sampled[:max_results]


@timed(log_times)
def create_food_taste_profile(query: str):
    """
    Generates a hypothetical document embedding (HyDE) of 
    the ideal taste profile based on user query.
    """
    messages = [
        SystemMessage(content=food_system),
        HumanMessage(content=query)
    ]
    response = llm.invoke(messages)
    print(response.content + "\n")
    
    return response.content         

    
@timed(log_times)
def generate_food_recommendations(filtered, profile, top_k=5):
    """
    Performs similarity search between embeddings of (a) each of the filtered dishes
    and (b) enriched taste profile from query, if present otherwise randomly 
    sample 3 dishes (cheapest -> middle -> most expensive)
    """
    if len(filtered) == 0:
        return None
        
    if len(filtered) < top_k:
        return filtered
        
    if profile:
        embedded_food = []
        for food in filtered:
            content = "\n".join(f"{key}: {value}" for key, value in food.items() if value)
            food_embed = embd.embed_query(content)
            embedded_food.append((food, food_embed))

        profile_embed = embd.embed_query(profile)
        scored_embed = [(food, cosine_similarity(profile_embed, food_embed)) for food, food_embed in embedded_food]
        top_dishes = sorted(scored_embed, key=lambda x: x[1], reverse=True)[:top_k]
        return [dish for dish, _ in top_dishes]

    else:
        sorted_food = sorted(filtered, key=lambda x: float(x.Price))
        n = len(sorted_food)
        if n == 0:
            return []
        step = max(1, n // top_k)
        sampled = [random.choice(sorted_food[i:i+step]) for i in range(0, n, step)][:top_k]

        return sampled 

def ask_food_ai(question, data, field_map):
    start_main = time.time()
    parsed_query = food_query_analyzer(question)
    filtered = filter_food(data=data, model_instance=parsed_query, field_map=field_map)
    profile = create_food_taste_profile(question)
    recommendations = generate_food_recommendations(filtered, profile)

    # Print timing summary
    total_time = round(time.time() - start_main, 4)
    print("Timing summary:")

    for name, duration in log_times.items():
        print(f"  {name}: {duration} seconds")
        
    print(f"  Total time: {total_time} seconds")
    
    return recommendations

In [40]:
questions = [
"What are the best dishes under 200 HKD in the main course to serve with a 2020 Puligny-Montrachet Chardonnay?"
]

for i in range(len(questions)):
# for i in range(2, 5, 2):
    print(questions[i])  # Correct variable name
    recommendation = ask_food_ai(question=questions[i], data = food_json, field_map = FOOD_FIELD_MAP)
    if recommendation == None:
        print("No recommendation found")
        continue

    for j in range(5):
        try:
            print(f"Food {j+1}:", recommendation[j]['Product Name'])
        except:
            print("No more suggested food")
            continue
    print("\n")


What are the best dishes under 200 HKD in the main course to serve with a 2020 Puligny-Montrachet Chardonnay?
recommend food
{'course': 'main course', 'dish_name': None, 'max_price': 200.0, 'min_price': None, 'type': None}
```json
{
    "dish_name": "Seared Scallops with Truffle Risotto",
    "food_type": "seafood",
    "course": "main",
    "regional_pairing": "Burgundy",
    "sweetness": "2",
    "salitiness": "5",
    "acidity": "6",
    "sourness": "3",
    "umami": "8"
}
```

Timing summary:
  food_query_analyzer: 0.0082 seconds
  filter_food: 0.0 seconds
  create_food_taste_profile: 8.3093 seconds
  generate_food_recommendations: 2.5446 seconds
  Total time: 10.8621 seconds
Food 1: Braised Beef Short Ribs with Truffle Risotto
Food 2: Braised Lamb Shanks with Garlic Mashed Potatoes
Food 3: Bouillabaisse (Traditional Provençal Fish Stew)
Food 4: Beef Bourguignon
Food 5: Bouillabaisse




In [35]:
question = "What are the starter dishes under 600 HKD to serve with a 2020 Puligny-Montrachet Chardonnay?"
start_main = time.time()
parsed_query = food_query_analyzer(question)
end_time = time.time()

print(f"time:", {round(end_time - start_main, 4)})

recommend food
{'course': 'starter', 'dish_name': None, 'max_price': 600.0, 'min_price': None, 'type': None}
time: {0.0102}


Current problems:
- faulty "course" parsing leads to no recommendation. Fix
- ner_food_model only accepts "600 HKD" and not "HKD 600". Train on more data.