In [2]:
! pip install bs4 sentence-transformers tqdm langchain_community tiktoken langchain-openai langchainhub chromadb langchain



In [1]:
import os 
import requests
import bs4
import tiktoken
import numpy as np
import faiss
import json
import re
import pandas as pd
from typing import Optional, List
from langchain.llms.base import LLM
from langchain import hub
from langchain_community.document_loaders import PyPDFLoader, WebBaseLoader, GoogleDriveLoader, UnstructuredWordDocumentLoader
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI  # Uses OpenAI-compatible API
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import ChatPromptTemplate
from langchain.chains import RetrievalQA

USER_AGENT environment variable not set, consider setting it to identify your requests.


**Helper functions**

In [2]:
# loading pdf contents
def load_pdf(path: str):
    loader = PyPDFLoader(path)
    return loader.load()

def load_word(path: str):
    return UnstructuredWordDocumentLoader(path).load()
    
# loading website contents
def load_web(path: str):
    loader = WebBaseLoader(
        web_path = (path,),
        bs_kwargs = dict(
          parse_only = bs4.SoupStrainer(
              class_ = ("post-content", "post-title", "post-header") # depending on CSS class
          )  
        ),
    )
    return loader

# loading google doc contents (see below)

def normalize(text):
    return re.sub(r'[^a-z]', '', str(text).lower())

def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2)

def extract_json_from_llm_output(output: str) -> dict:
    """
    Extracts and parses a JSON object from LLM output that may include Markdown formatting.
    Handles triple backticks, optional language labels, and excessive whitespace.
    """
    output = output.strip()

    # Match content between ```json ... ``` or just ``` ... ```
    match = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", output, re.DOTALL)
    
    if match:
        json_str = match.group(1)
    else:
        json_str = output

    return json.loads(json_str)

def get_doc_id(url):
    return url.split("/")[5]

OpenAI embeddings don't work in HK (even with VPN). Hence, HuggingFace embedding model was used. Also feel free to test out API calls from DeepSeep or replace it with your choice of LLM for comparison.

In [100]:
embd = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# change to bedrock llm
llm = ChatOpenAI(
    openai_api_base="https://api.deepseek.com/v1",
    openai_api_key="sk-ea1868b36aa34a36be9a223e75c1c63c", 
    model="deepseek-chat"
)

In [3]:
import spacy

# load spacy models
nlp_textcat = spacy.load("spaCy/textcat_model")
nlp_ner = spacy.load("spaCy/ner_model")



**Intermediate Step: Preparing Google Cloud API**
- for loading contents of Google Docs
- you can also try loading content using **load_pdf()** and **load_web**
- the code below uses my Google Cloud credentials included in this folder

In [6]:
! pip install --upgrade google-auth google-auth-oauthlib google-api-python-client

Collecting google-auth-oauthlib
  Using cached google_auth_oauthlib-1.2.2-py3-none-any.whl.metadata (2.7 kB)
Collecting google-api-python-client
  Downloading google_api_python_client-2.174.0-py3-none-any.whl.metadata (7.0 kB)
Collecting httplib2<1.0.0,>=0.19.0 (from google-api-python-client)
  Using cached httplib2-0.22.0-py3-none-any.whl.metadata (2.6 kB)
Collecting google-auth-httplib2<1.0.0,>=0.2.0 (from google-api-python-client)
  Using cached google_auth_httplib2-0.2.0-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0,>=1.31.5 (from google-api-python-client)
  Using cached google_api_core-2.25.1-py3-none-any.whl.metadata (3.0 kB)
Collecting uritemplate<5,>=3.0.1 (from google-api-python-client)
  Using cached uritemplate-4.2.0-py3-none-any.whl.metadata (2.6 kB)
Collecting proto-plus<2.0.0,>=1.22.3 (from google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0,>=1.31.5->google-api-python-client)
  Using cached proto_plus-1.26.1-py3

doc_links contains the Google Docs from the Sommelier Knowledge Base

In [4]:
docs_links = [
    "https://docs.google.com/document/d/1MvX9CTrVcoWg7WLAscq2MmnhTIrR0hZIGkpJMqhgflo/edit?tab=t.0",
    "https://docs.google.com/document/d/1NcV9_JGjMfA4WlihW3vNTduy24NBWdy1RWXrA2W0BIk/edit?usp=sharing",
    "https://docs.google.com/document/d/193rx2Rh6u-Ud40k-rgnqSQs-94SvHdeXPrPxOWK59X0/edit?usp=sharing",
    "https://docs.google.com/document/d/1vRDsn5o5mdymOEJ_O0tS4wcOjsAjt_2mLZqFfvgDUOs/edit?usp=sharing",
    "https://docs.google.com/document/d/1JceLBII727AZzSrDFfdGthJ1G4PhCDsA8sEm_dQMVr0/edit?usp=sharing",
    "https://docs.google.com/document/d/1yonU4qcysNkgd0BvbFmeIW9NF2ARErRJVW8QZynJyvM/edit?usp=sharing",
    "https://docs.google.com/document/d/1bq2AE1Jy6cQFt1xgjqtkof12Lw6F6fujqTlN1nZnh0A/edit?usp=sharing",
    "https://docs.google.com/document/d/1i-OcQeo7XOG83gS2ay2u0SLMWs4f8FG0JE_7l87qJkw/edit?usp=sharing",
    "https://docs.google.com/document/d/1PyZE8v3S3aUY66lFn97q0vaXDHc60lyso2oUFP0htjY/edit?usp=sharing",
    "https://docs.google.com/document/d/1Dudd7-6yl_UQrxfGa3MJZlKOfzHQdqWg0fb8Z9RzBec/edit?usp=sharing",
    "https://docs.google.com/document/d/1cjhrhCccuwiIh0ujj8QeamJ2JHhI6CjPmO84t1DSRZ0/edit?usp=sharing",
    "https://docs.google.com/document/d/1ESlfU6v8jseFlllZb3eaUeCJt69EIMsZiyMrDac-wX8/edit?usp=sharing",
    "https://docs.google.com/document/d/1xQhAkC3oP2cb262EjaHCV6CxEgeEGUsKrC8pH2p6RiY/edit?usp=sharing"
    
]

g_docs = GoogleDriveLoader(
    document_ids = [get_doc_id(i) for i in docs_links],
    credentials_path = "credentials_google.json",
    token_path = "token.json"
).load()

  g_docs = GoogleDriveLoader(


Chroma seems to be a convenient alternative, as a chroma vectorstore can be converted directly into a retriever that returns k relevant documents. 

In [5]:
path = "wine_food_pairing_knowledge.pdf"
word = load_pdf(path)
g_docs = g_docs + word

In [6]:
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size = 300, 
    chunk_overlap = 50
)

splits = text_splitter.split_documents(g_docs)

vectorstore = Chroma.from_documents(
    documents = splits,
    embedding = embd
)

retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True
)


**Preparing Pydantic Schemas and LLM Modelfiles**

In [7]:
import datetime
from typing import List, Optional
from pydantic import BaseModel, Field

class FoodPairing(BaseModel):
    """Information about a food item that pairs well with a wine."""
    
    dish_name: Optional[str] = Field(
        default=None, description="Name of the dish (e.g., 'roast duck', 'brie cheese')."
    )
    pairing_type: Optional[str] = Field(
        default=None, description="Type of pairing (e.g., 'complementary', 'contrast')."
    )
    course: Optional[str] = Field(
        default=None, description="Course type (e.g., 'starter', 'main', 'dessert')."
    )
    description: Optional[str] = Field(
        default=None, description="Description of the pairing experience or rationale."
    )
    suitability: Optional[int] = Field(
        default=None, description="Suitability score or rating for the pairing (e.g., 1–10)."
    )
    acidity: Optional[str] = Field(
        default=None, description="Acidity match or contrast for the dish (e.g., 'low', 'crisp')."
    )
    regional_pairing: Optional[str] = Field(
        default=None, description="Regional or traditional pairing origin (e.g., 'Provence')."
    )
    sweetness: Optional[str] = Field(
        default=None, description="Sweetness level or match (e.g., 'dry', 'sweet')."
    )
    

class WineMetadata(BaseModel):
    """Structured metadata filters for querying a wine product database."""

    wine_name: Optional[str] = Field(
        default=None, description="Specific wine name mentioned in the query."
    )
    variant_id: Optional[int] = Field(
        default=None, description="Specific product variant ID."
    )
    size: Optional[str] = Field(
        default=None, description="Bottle size (e.g., '750ml', '1.5L')."
    )
    volume_unit: Optional[str] = Field(
        default=None, description="Units for volume (e.g., 'ml', 'L')."
    )
    rating: Optional[str] = Field(
        default=None, description="Expert or user rating (e.g., '90+', '4.5 stars')."
    )
    stock: Optional[int] = Field(
        default=None, description="Stock availability if specified."
    )

    min_price: Optional[float] = Field(
        default=None, description="Minimum price filter (inclusive)."
    )
    max_price: Optional[float] = Field(
        default=None, description="Maximum price filter (inclusive)."
    )

    winemaker: Optional[str] = Field(
        default=None, description="Name of the winemaker or producer."
    )
    vintage: Optional[int] = Field(
        default=None, description="Vintage year of the wine (e.g., 2015)."
    )
    country: Optional[str] = Field(
        default=None, description="Country of origin."
    )
    region: Optional[str] = Field(
        default=None, description="Region or appellation."
    )
    wine_type: Optional[str] = Field(
        default=None, description="Type of wine (e.g., red, white, rosé, sparkling)."
    )
    wine_grapes: Optional[str] = Field(
        default=None, description="Grape variety or blend (e.g., Merlot, Syrah)."
    )

    level_to_drink: Optional[str] = Field(
        default=None, description="Drinkability status (e.g., 'drink now', 'ageing potential')."
    )
    vinification: Optional[str] = Field(
        default=None, description="Winemaking process (e.g., 'oak-aged', 'carbonic maceration')."
    )
    season: Optional[str] = Field(
        default=None, description="Season the wine is suited for (e.g., 'summer')."
    )
    soil_type: Optional[str] = Field(
        default=None, description="Soil characteristics (e.g., 'limestone', 'volcanic')."
    )

    description: Optional[str] = Field(
        default=None, description="Flavor notes or sensory descriptions."
    )
    occasion: Optional[str] = Field(
        default=None, description="Occasion suitability (e.g., 'wedding', 'gift')."
    )

    body: Optional[str] = Field(
        default=None, description="Body type (e.g., 'light', 'full-bodied')."
    )
    acidity: Optional[str] = Field(
        default=None, description="Acidity level (e.g., 'crisp', 'low')."
    )
    alcohol: Optional[float] = Field(
        default=None, description="Alcohol content as a percentage (e.g., 13.5)."
    )
    fruitiness: Optional[str] = Field(
        default=None, description="Level of fruitiness (e.g., 'dry', 'juicy')."
    )
    minerality: Optional[str] = Field(
        default=None, description="Presence of mineral notes (e.g., 'chalky')."
    )
    sweetness: Optional[str] = Field(
        default=None, description="Sweetness level (e.g., 'dry', 'semi-sweet')."
    )
    food_pairings: List[FoodPairing] = Field(
        default=None,
        description="List of recommended or matching food pairings."
    )

    def pretty_print(self) -> None:
        for field_name, field in self.model_fields.items():
            val = getattr(self, field_name)
            if val is not None and val != field.default:
                print(f"{field_name}: {val}")
    
    

In [8]:
# for generating taste profiles

system = """You are an expert at converting user questions into wine taste profiles
in the form of structured metadata

Your job is to convert a user's question into a JSON object that matches the following schema:

{
  "wine_name": "Optional[str]",              // Specific wine name mentioned
  "winemaker": "Optional[str]",              // Name of the winemaker or producer
  "vintage": "Optional[int]",                // Vintage year (e.g. 2015)
  "country": "Optional[str]",                // Country of origin
  "region": "Optional[str]",                 // Region or appellation
  "wine_type": "Optional[str]",              // Type of wine (e.g. red, white, rosé, sparkling)
  "wine_grapes": "Optional[str]",            // Grape variety or blend (e.g. Merlot, Syrah)
  "level_to_drink": "Optional[str]",         // Drinkability status (e.g. "drink now", "ageing potential")
  "vinification": "Optional[str]",           // Wine-making process (e.g. "oak-aged", "carbonic maceration")
  "season": "Optional[str]",                 // Season the wine is suited for (e.g. "summer", "winter")
  "soil_type": "Optional[str]",              // Soil characteristics (e.g. "limestone", "volcanic")
  "description": "Optional[str]",            // Flavor notes or sensory descriptions
  "occasion": "Optional[str]",               // Occasion suitability (e.g. "wedding", "gift", "everyday")
  "body": "Optional[str]",                   // Body type (e.g. "light", "full-bodied")
  "acidity": "Optional[str]",                // Acidity level (e.g. "crisp", "low")
  "alcohol": "Optional[float]",              // Alcohol content (as percentage, e.g. 13.5)
  "fruitiness": "Optional[str]",             // Level of fruitiness (e.g. "dry", "juicy")
  "minerality": "Optional[str]",             // Presence of mineral notes (e.g. "chalky", "flinty")
  "sweetness": "Optional[str]",              // Sweetness level (e.g. "dry", "semi-sweet", "sweet")
}

Only include optional fields if they are explicitly mentioned in the user's query. 
Return the result as **pure JSON only**, with no code block, no Markdown, and no explanation.
"""

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "{question}"),
    ]
)

In [9]:
FIELD_MAP = {
    "wine_name": "Product Name",
    "variant_id": "VariantID",
    "size": "Size",
    "volume_unit": "Volume Unit",
    "rating": "Expert Ratings",
    "stock": "Stock",
    "min_price": "WS Retail Price",
    "max_price": "WS Retail Price",
    "winemaker": "Winemaker Name",
    "vintage": "Vintage (Year)",
    "country": "Country",
    "region": "Region",
    "wine_type": "Wine Type",
    "wine_grapes": "Wine Grapes",
    "season": "Season",
    "soil_type": "Soil Type",
    "occasion": "Occasion",

}
    # "level_to_drink": "Level to Drink",
    # "description": "Description",
    # "vinification": "Vinification Process",
    # "body": "Body",
    # "acidity": "Acidity",
    # "alcohol": "Alcohol",
    # "fruitiness": "Fruitiness",
    # "minerality": "Minerality",
    # "sweetness": "Sweetness / Dry"



**More helper functions**

In [89]:
# Stopwatch
import time
from functools import wraps

def timed(log_times):
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            start = time.time()
            result = func(*args, **kwargs)
            end = time.time()
            log_times[func.__name__] = round(end - start, 4)
            return result
        return wrapper
    return decorator

log_times = {}
    

In [92]:
@timed(log_times)
def query_analyzer(question: str) -> WineMetadata:
    """
    Converts a natural language question into a structured query
    using a spaCy custom NLP model.
    """
    messages = [
        SystemMessage(content=system),
        HumanMessage(content=question)
    ]
    
    response = llm.invoke(messages)
    parsed_dict = extract_json_from_llm_output(response.content)
    return WineMetadata(**parsed_dict)

@timed(log_times)
def filter_wines(data, model_instance, field_map, max_results=20):
    """
    Given metadata filters constructed from user query using LLM,
    returns first n wine profiles that match. 
    """
    filters = model_instance.model_dump(exclude_none=True)
    results = []

    for wine in data:
        match = True
        for key, value in filters.items():
            if key in ("food_pairings"):
                continue

            if key not in field_map:
                continue
                
            elif key in ("min_price", "max_price"):
                price_field = field_map[key]
                try:
                    wine_price = float(wine.get(price_field,0))
                except ValueError:
                    match = False
                    break
                if key == "min_price" and wine_price < value:
                    match = False
                    break
                if key == "max_price" and wine_price > value:
                    match = False
                    break

            else:
                field = field_map.get(key)
                if field not in wine:
                    match = False
                    # print(f"[MISSING] Field '{field}' missing in wine")
                    break
                wine_val = normalize(wine[field])
                query_val = normalize(value)
                if query_val not in wine_val:
                    # print(f"[FAIL] {key}: '{query_val}' not in '{wine_val}' (wine: {wine.get('Product Name')})")
                    match = False
                    break

        if match:
            results.append(wine)
            if len(results)== max_results:
                break

    return results

@timed(log_times)
def create_taste_profile(parsed_query):
    """
    Enriches food taste profile from user query using LLM.
    """
    data = parsed_query.model_dump(exclude_none=True)

    # Check if food_pairing exists and is not empty
    food_pairings = data.get("food_pairings")
    pairing_descriptions = []
    if food_pairings:
        # Extract raw attributes
        for pairing in food_pairings:
            if isinstance(pairing, dict):
                entries = [f"{key}: {value}" for key, value in pairing.items() if value]
                block = "\n".join(entries)
                pairing_descriptions.append(block)
            else:
                try:
                    entries = [f"{key}: {getattr(pairing, key)}" for key in pairing.__fields__ if getattr(pairing, key)]
                    block = "\n".join(entries)
                    pairing_descriptions.append(block)
                except:
                    continue
        raw_input = "\n\n---\n\n".join(pairing_descriptions)

        # RAG
        response = qa_chain.invoke(f"Create a taste profile based on the following context: {raw_input}")
        return response["result"]
    return None               

@timed(log_times)
def generate_recommendations(filtered, profile, top_k=5):
    """
    Performs similarity search between embeddings of (a) each of the filtered wines
    and (b) enriched taste profile from query, if present otherwise randomly 
    sample 3 wines (cheapest -> middle -> most expensive)
    """
    if len(filtered) == 0:
        return None
        
    if len(filtered) < top_k:
        return filtered
        
    if profile:
        # print("yay")
        embedded_wines = []
        for wine in filtered:
            # print(wine)
            content = "\n".join(f"{key}: {value}" for key, value in wine.items() if value)
            wine_embed = embd.embed_query(content)
            embedded_wines.append((wine, wine_embed))

        profile_embed = embd.embed_query(profile)
        scored_embed = [(wine, cosine_similarity(profile_embed, wine_embed)) for wine, wine_embed in embedded_wines]
        top_wines = sorted(scored_embed, key=lambda x: x[1], reverse=True)[:top_k]
        return [wine for wine, _ in top_wines]

    else:
        # print("nooo")
        sorted_wines = sorted(filtered, key=lambda x: float(x.WS_Retail_Price))
        n = len(sorted_wines)
        if n == 0:
            return []
        step = max(1, n // top_k)
        sampled = [random.choice(sorted_wines[i:i+step]) for i in range(0, n, step)][:top_k]

        return sampled 

def ask_ai(question, data, field_map):
    start_main = time.time()
    parsed_query = query_analyzer(question)
    filtered = filter_wines(data=data, model_instance=parsed_query, field_map=field_map)
    profile = create_taste_profile(parsed_query)
    recommendations = generate_recommendations(filtered, profile)

    
    total_time = round(time.time() - start_main, 4)
    # Print timing summary
    print("Timing summary:")
    for name, duration in log_times.items():
        print(f"  {name}: {duration} seconds")
    print(f"  Total time: {total_time} seconds")

    
    return recommendations

In [11]:
wine_data = "wine_data.csv"
food_data = "food_data.csv"

wine_df = pd.read_csv(wine_data)
food_df = pd.read_csv(food_data)

wine_json = wine_df.to_dict(orient="records")
food_json = food_df.to_dict(orient="records")

### Wine-Food Pairing Tests

Questions:
1. What wine goes well with spicy Thai green curry with coconut milk?
2. Recommend a red wine under 300 HKD that pairs well with grilled lamb and comes from Spain
3. Suggest a celebratory wine that works with oysters and has high acidity.
4. I'm cooking mushroom risotto and want something medium-bodied and earthy to go with it.
5. Pair a bold Napa Cabernet Sauvignon with sushi.

In [93]:
questions = [
"What wine goes well with spicy Thai green curry with coconut milk?",
"Recommend a red wine under 300 HKD that pairs well with grilled lamb and comes from Spain",
"Suggest a celebratory wine that works with oysters and has high acidity.",
"I'm cooking mushroom risotto and want something medium-bodied and earthy to go with it.",
"Pair a bold Napa Cabernet Sauvignon with sushi.",
]

for i in range(1):
# for i in range(2, 5, 2):
    print(questions[i])  # Correct variable name
    recommendation = ask_ai(question=questions[i], data=wine_json, field_map=FIELD_MAP)
    if recommendation == None:
        print("No recommendations found")
        continue

    for j in range(5):
        try:
            print(f"Wine {j+1}:", recommendation[j]['Product Name'])
        except:
            print("No more suggested wines")
            continue
    print("\n")



What wine goes well with spicy Thai green curry with coconut milk?
Timing summary:
  query_analyzer: 11.4839 seconds
  filter_wines: 0.0 seconds
  create_taste_profile: 39.4799 seconds
  generate_recommendations: 2.3598 seconds
  Total time: 53.3235 seconds


NameError: name 'recommendations' is not defined

### Food-Wine Pairing Tests

Questions
1. What are the best dishes to serve with a 2020 Puligny-Montrachet Chardonnay?
2. I have a bottle of Amarone della Valpolicella — what foods would pair well with it?
3. What should I cook for dinner to go with a chilled bottle of Sancerre?
4. Can you suggest a full-course meal to go with a vintage Champagne?
5. What kind of food works well with a sweet Riesling from Mosel?
   

In [53]:
# Base Models for Reverse Pairings
class WinePairing(BaseModel):
    """Structured metadata filters for querying a wine product database."""

    wine_name: Optional[str] = Field(
        default=None, description="Specific wine name mentioned in the query."
    )
    wine_type: Optional[str] = Field(
        default=None, description="Type of wine (e.g., red, white, rosé, sparkling)."
    )
    wine_grapes: Optional[str] = Field(
        default=None, description="Grape variety or blend (e.g., Merlot, Syrah)."
    )
    level_to_drink: Optional[str] = Field(
        default=None, description="Drinkability status (e.g., 'drink now', 'ageing potential')."
    )
    season: Optional[str] = Field(
        default=None, description="Season the wine is suited for (e.g., 'summer')."
    )
    description: Optional[str] = Field(
        default=None, description="Flavor notes or sensory descriptions."
    )
    occasion: Optional[str] = Field(
        default=None, description="Occasion suitability (e.g., 'wedding', 'gift')."
    )
    body: Optional[str] = Field(
        default=None, description="Body type (e.g., 'light', 'full-bodied')."
    )
    acidity: Optional[str] = Field(
        default=None, description="Acidity level (e.g., 'crisp', 'low')."
    )
    alcohol: Optional[float] = Field(
        default=None, description="Alcohol content as a percentage (e.g., 13.5)."
    )
    fruitiness: Optional[str] = Field(
        default=None, description="Level of fruitiness (e.g., 'dry', 'juicy')."
    )
    minerality: Optional[str] = Field(
        default=None, description="Presence of mineral notes (e.g., 'chalky')."
    )
    sweetness: Optional[str] = Field(
        default=None, description="Sweetness level (e.g., 'dry', 'semi-sweet')."
    )


class FoodMetadata(BaseModel):
    """Structured metadata filters for querying a food product database."""
    
    dish_name: Optional[str] = Field(
        default=None, description="Name of the dish (e.g., 'roast duck', 'brie cheese')."
    )
    min_price: Optional[float] = Field(
        default=None, description="Minimum price filter (inclusive)."
    )
    max_price: Optional[float] = Field(
        default=None, description="Maximum price filter (inclusive)."
    )
    type: Optional[str] = Field(
        default=None, description="Food type (e.g., 'fruit', 'pastry', 'vegetarian')."
    )
    course: Optional[str] = Field(
        default=None, description="Course (e.g., 'starter', 'main', 'dessert')."
    )
    description: Optional[str] = Field(
        default=None, description="Description of the dish."
    )
    acidity: Optional[str] = Field(
        default=None, description="Acidity match or contrast for the dish (e.g., 'low', 'crisp')."
    )
    regional_pairing: Optional[str] = Field(
        default=None, description="Regional or traditional pairing origin (e.g., 'Provence')."
    )
    sweetness: Optional[str] = Field(
        default=None, description="Sweetness level or match (e.g., 'dry', 'sweet')."
    )
    wine_pairings: List[WinePairing] = Field(
        default=None,
        description="List of recommended or matching wine pairings."
    )

    def pretty_print(self) -> None:
        for field_name, field in self.model_fields.items():
            val = getattr(self, field_name)
            if val is not None and val != field.default:
                print(f"{field_name}: {val}")
    


In [54]:
food_system = """You are an expert at converting user questions into structured metadata filters 
to query a database of food products

Your job is to convert a user's question into a JSON object that matches the following schema:
            
{
    "dish_name": "Optional[str]",                 // Name of the dish (e.g., 'roast duck', 'brie cheese').
    "min_price": "Optional[float]" ,              // Minimum price filter (inclusive).
    "max_price": "Optional[float]" ,              // Maximum price filter (inclusive).
    "type": "Optional[str]" ,                     // Food type (e.g., 'fruit', 'pastry', 'vegetarian').
    "course": "Optional[str]" ,                   // Course (e.g., 'starter', 'main', 'dessert').
    "description": "Optional[str]" ,              // Description of the dish.
    "acidity": "Optional[str]" ,                  // Acidity match or contrast for the dish (e.g., 'low', 'crisp').
    "regional_pairing": "Optional[str]" ,         // Regional or traditional pairing origin (e.g., 'Provence').
    "sweetness": "Optional[str]" ,                // Sweetness level or match (e.g., 'dry', 'sweet').

    "wine_pairings": [                            // Optional: List of recommended food matches
    {
          "wine_name": "Optional[str]",              // Specific wine name mentioned
          "wine_type": "Optional[str]",              // Type of wine (e.g. red, white, rosé, sparkling)
          "wine_grapes": "Optional[str]",            // Grape variety or blend (e.g. Merlot, Syrah)
          "level_to_drink": "Optional[str]",         // Drinkability status (e.g. "drink now", "ageing potential")
          "season": "Optional[str]",                 // Season the wine is suited for (e.g. "summer", "winter")        
          "description": "Optional[str]",            // Flavor notes or sensory descriptions
          "occasion": "Optional[str]",               // Occasion suitability (e.g. "wedding", "gift", "everyday")
          "body": "Optional[str]",                   // Body type (e.g. "light", "full-bodied")
          "acidity": "Optional[str]",                // Acidity level (e.g. "crisp", "low")
          "alcohol": "Optional[float]",              // Alcohol content (as percentage, e.g. 13.5)
          "fruitiness": "Optional[str]",             // Level of fruitiness (e.g. "dry", "juicy")
          "minerality": "Optional[str]",             // Presence of mineral notes (e.g. "chalky", "flinty")
          "sweetness": "Optional[str]",              // Sweetness level (e.g. "dry", "semi-sweet", "sweet")
    }
  ]
}

Only include optional fields if they are explicitly mentioned in the user's query. 
Return the result as **pure JSON only**, with no code block, no Markdown, and no explanation.
"""

food_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", food_system),
        ("human", "{question}"),
    ]
)

In [55]:
FOOD_FIELD_MAP = {
    "dish_name": "Product Name",
    "price": "Price",
    "type": "Food Type",
    "course": "Course"
}

In [None]:
import time
from functools import wraps

def timed(log_times):
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            start = time.time()
            result = func(*args, **kwargs)
            end = time.time()
            log_times[func.__name__] = round(end - start, 4)
            return result
        return wrapper
    return decorator

log_times = {}

In [96]:
@timed(log_times)
def food_query_analyzer(question: str) -> FoodMetadata:
    """
    Converts a natural language question into a structured query
    using a DeepSeek LLM backend.
    """
    messages = [
        SystemMessage(content=food_system),
        HumanMessage(content=question)
    ]
    
    response = llm.invoke(messages)
    print(response.content)
    parsed_dict = extract_json_from_llm_output(response.content)
    return FoodMetadata(**parsed_dict)

@timed(log_times)
def filter_food(data, model_instance, field_map, max_results=20):
    """
    Given metadata filters constructed from user query using LLM,
    returns first n wine profiles that match. 
    """
    filters = model_instance.model_dump(exclude_none=True)
    results = []

    for food in data:
        match = True
        for key, value in filters.items():
            if key not in field_map:
                continue
                
            elif key in ("min_price", "max_price"):
                price_field = field_map[key]
                try:
                    food_price = float(food.get(price_field,0))
                except ValueError:
                    match = False
                    break
                if key == "min_price" and food_price < value:
                    match = False
                    break
                if key == "max_price" and food_price > value:
                    match = False
                    break

            else:
                field = field_map.get(key)
                if field not in food:
                    match = False
                    # print(f"[MISSING] Field '{field}' missing in wine")
                    break
                food_val = normalize(food[field])
                query_val = normalize(value)
                if query_val not in food_val:
                    # print(f"[FAIL] {key}: '{query_val}' not in '{wine_val}' (wine: {wine.get('Product Name')})")
                    match = False
                    break

        if match:
            results.append(food)
            if len(results)== max_results:
                break

    return results

@timed(log_times)
def create_taste_profile(parsed_query):
    """
    Enriches wine taste profile from user query using LLM.
    """
    data = parsed_query.model_dump(exclude_none=True)

    # Check if food_pairing exists and is not empty
    wine_pairings = data.get("wine_pairings")
    pairing_descriptions = []
    if wine_pairings:
        # Extract raw attributes
        for pairing in wine_pairings:
            if isinstance(pairing, dict):
                entries = [f"{key}: {value}" for key, value in pairing.items() if value]
                block = "\n".join(entries)
                pairing_descriptions.append(block)
            else:
                try:
                    entries = [f"{key}: {getattr(pairing, key)}" for key in pairing.__fields__ if getattr(pairing, key)]
                    block = "\n".join(entries)
                    pairing_descriptions.append(block)
                except:
                    continue
        raw_input = "\n\n---\n\n".join(pairing_descriptions)

        # RAG
        response = qa_chain.invoke(f"Create a taste profile based on the following context: {raw_input}")
        return response["result"]
    return None                       

@timed(log_times)
def generate_recommendations(filtered, profile, top_k=5):
    """
    Performs similarity search between embeddings of (a) each of the filtered wines
    and (b) enriched taste profile from query, if present otherwise randomly 
    sample 3 wines (cheapest -> middle -> most expensive)
    """
    if len(filtered) == 0:
        return None
        
    if len(filtered) < top_k:
        return filtered
        
    if profile:
        # print("yay")
        embedded_food = []
        for food in filtered:
            # print(wine)
            content = "\n".join(f"{key}: {value}" for key, value in food.items() if value)
            food_embed = embd.embed_query(content)
            embedded_food.append((food, food_embed))

        profile_embed = embd.embed_query(profile)
        scored_embed = [(food, cosine_similarity(profile_embed, food_embed)) for food, food_embed in embedded_food]
        top_food = sorted(scored_embed, key=lambda x: x[1], reverse=True)[:top_k]
        return [food for food, _ in top_food]

    else:
        # print("nooo")
        sorted_wines = sorted(filtered, key=lambda x: float(x.WS_Retail_Price))
        n = len(sorted_wines)
        if n == 0:
            return []
        step = max(1, n // top_k)
        sampled = [random.choice(sorted_wines[i:i+step]) for i in range(0, n, step)][:top_k]

        return sampled 

@timed(log_times)
def ask_food_ai(question, data, field_map):
    start_main = time.time()
    parsed_query = food_query_analyzer(question)
    # parsed_query.pretty_print()
    filtered = filter_wines(data=data, model_instance=parsed_query, field_map=field_map)
    # print("\n\nFiltered:", filtered)
    profile = create_taste_profile(parsed_query)
    # print("\n\nProfile:", profile)
    recommendations = generate_recommendations(filtered, profile)
    # print(recommendations)
    
    total_time = round(time.time() - start_main, 4)
    # Print timing summary
    print("Timing summary:")
    for name, duration in log_times.items():
        print(f"  {name}: {duration} seconds")
    print(f"  Total time: {total_time} seconds")
    
    return recommendations

In [95]:
questions = [
"What are the best dishes to serve with a 2020 Puligny-Montrachet Chardonnay?"
]

for i in range(len(questions)):
# for i in range(2, 5, 2):
    print(questions[i])  # Correct variable name
    recommendation = ask_food_ai(question=questions[i], data = food_json, field_map = FOOD_FIELD_MAP)
    if recommendation == None:
        print("No recommendation found")
        continue

    for j in range(5):
        try:
            print(f"Food {j+1}:", recommendation[j]['Product Name'])
        except:
            print("No more suggested food")
            continue
    print("\n")


What are the best dishes to serve with a 2020 Puligny-Montrachet Chardonnay?
Timing summary:
  query_analyzer: 11.4839 seconds
  filter_wines: 0.0 seconds
  create_taste_profile: 46.7189 seconds
  generate_recommendations: 1.7185 seconds
  food_query_analyzer: 7.6931 seconds
  Total time: 56.1305 seconds
Food 1: Alsatian Apple Tart with Caramelized Pecans
Food 2: Bacalhau à Brás
Food 3: Apple and Brie Tartlets with Caramelized Onions
Food 4: Alsatian Bacon and Onion Tart
Food 5: Baked Camembert with Fig Jam




In [101]:
question = "What are the best dishes to serve with a 2020 Puligny-Montrachet Chardonnay?"
start_main = time.time()
parsed_query = food_query_analyzer(question)
end_time = time.time()

print(f"time:", {round(end_time - start_main, 4)})

BadRequestError: Error code: 400 - {'error': {'message': 'Model Not Exist', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_request_error'}}