In [19]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import json
import os
from openai import OpenAI
from getpass import getpass
from dotenv import load_dotenv  

# Load variables from the local .env file into the process environment,
# then build the OpenAI client from the OPENAI_API_KEY entry.
load_dotenv(".env")
api_key = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

Step 1: Fetch raw HTML

In [92]:
# Source pages to scrape, grouped by licence category.
urls = [
    # Licences for import, export and transhipment
    "https://www.sfa.gov.sg/food-import-export/commercial-imports/what-you-need-to-know-for-import-of-animal-feed",
    "https://www.sfa.gov.sg/commercial-exports/what-you-need-to-know-for-export-tranship-of-animal-feed",
    "https://www.gobusiness.gov.sg/browse-all-licences/singapore-food-agency-(sfa)/licence-for-import-export-transhipment-of-meat-and-fish-products",
    "https://www.gobusiness.gov.sg/browse-all-licences/singapore-food-agency-(sfa)/licence-for-import-transhipment-of-fresh-fruits-and-vegetables",
    "https://www.gobusiness.gov.sg/browse-all-licences/singapore-food-agency-(sfa)/licence-to-import-poultry",
    "https://www.gobusiness.gov.sg/browse-all-licences/Singapore-Food-Agency-(SFA)/Licence-to-Import-Sheep-&-Goats",
    "https://www.gobusiness.gov.sg/browse-all-licences/Singapore-Food-Agency-(SFA)/Licence-to-Import-Table-Eggs",
    "https://www.gobusiness.gov.sg/browse-all-licences/Singapore-Food-Agency-(SFA)/licence-to-import-export-tranship-food-animals--birds--eggs-and-biologics",
    "https://www.gobusiness.gov.sg/browse-all-licences/Singapore-Food-Agency-(SFA)/Registration-to-Import-Animal-Feed",
    "https://www.gobusiness.gov.sg/browse-all-licences/Singapore-Food-Agency-(SFA)/Registration-to-Import-Feed-for-Food-Producing-Animals",
    "https://www.gobusiness.gov.sg/browse-all-licences/Singapore-Food-Agency-(SFA)/Registration-to-Import-Processed-Food-Products-and-Food-Appliances",
    "https://www.gobusiness.gov.sg/browse-all-licences/Singapore-Food-Agency-(SFA)/Registration-to-Import-Live-Frogs-for-Human-Consumption",
    "https://www.gobusiness.gov.sg/browse-all-licences/Singapore-Food-Agency-(SFA)/Rice-Licence",
    # Licences for food retail
    "https://www.sfa.gov.sg/food-retail/licence-permit/requirements-for-licence-permit-for-food-retail",
    "https://www.sfa.gov.sg/food-retail/businesses-that-currently-do-not-need-licence-permit/online-food-businesses/types-of-online-food-businesses",
    "https://www.sfa.gov.sg/food-retail/businesses-that-currently-do-not-need-licence-permit/home-based-food-businesses/about-hdb-ura-home-based-business-scheme",
    "https://www.sfa.gov.sg/food-retail/businesses-that-currently-do-not-need-licence-permit/food-delivery-businesses/requirements-for-food-delivery-businesses",
    "https://www.sfa.gov.sg/food-retail/businesses-that-currently-do-not-need-licence-permit/group-buy-activities/guidelines-for-group-buy-organisers",
    "https://www.gobusiness.gov.sg/browse-all-licences/singapore-food-agency-(sfa)/food-shop-licence",
    "https://www.gobusiness.gov.sg/browse-all-licences/singapore-food-agency-(sfa)/food-stall-licence",
    "https://www.gobusiness.gov.sg/browse-all-licences/Singapore-Food-Agency-(SFA)/Supermarket-Licence",
    "https://www.gobusiness.gov.sg/browse-all-licences/Singapore-Food-Agency-(SFA)/Permit-for-Temporary-Fair",
    "https://www.gobusiness.gov.sg/browse-all-licences/Singapore-Food-Agency-(SFA)/Temporary-Fair-Stall-Licence",
    # Licences for food non-retail (manufacturing and storage)
    "https://www.sfa.gov.sg/food-manufacturing-storage/licence-registration/businesses-that-need-licence-registration-for-food-manufacturing-storage",
    "https://www.sfa.gov.sg/food-manufacturing-storage/licence-registration/application-process-fees-for-licence-to-manufacture-and-process-animal-feed",
    "https://www.gobusiness.gov.sg/browse-all-licences/Singapore-Food-Agency-(SFA)/Licence-to-Manufacture-Animal-Feed-and-Process-of-Animal-Feed-for-Food-Producing-Animals",
    # Licences for farming
    "https://www.sfa.gov.sg/farming/licence-registration/businesses-that-need-licence-registration-for-farming",
    "https://www.sfa.gov.sg/farming/pesticides/requirements-for-pesticide-usage-in-farms",
    "https://www.sfa.gov.sg/wholesale-markets-fishery-ports/licence-registration/businesses-that-need-licence-registration-for-fishing-vessel-gear-personnel",
    # Export: health, veterinary and food-safety certificates
    "https://www.gobusiness.gov.sg/browse-all-licences/singapore-food-agency-(sfa)/health-certificate-for-export-of-live-foodfish-for-breeding",
    "https://www.gobusiness.gov.sg/browse-all-licences/singapore-food-agency-(sfa)/export-health-certificate",
    "https://www.gobusiness.gov.sg/browse-all-licences/singapore-food-agency-(sfa)/free-sale-certificate-(food)",
    "https://www.gobusiness.gov.sg/browse-all-licences/Singapore-Food-Agency-(SFA)/Free-Sale-Certificate-(Animal-Feed)-for-Food-Producing-Animals",
    "https://www.gobusiness.gov.sg/browse-all-licences/Singapore-Food-Agency-(SFA)/Veterinary-Certificate-for-Food-Producing-Animals",
    "https://www.gobusiness.gov.sg/browse-all-licences/Singapore-Food-Agency-(SFA)/Submission-Form-for-Food-Safety-Management-System",
    # Others (including pages from other agencies)
    "https://www.sfa.gov.sg/regulatory-standards-frameworks-guidelines/food-safety-management/understanding-food-safety-management-system-fsms",
    "https://www.sfa.gov.sg/faqs#food-retail-general",
    "https://www.police.gov.sg/E-Services/Apply-for-Liquor-Licence",
    "https://onemotoring.lta.gov.sg/content/onemotoring/home/buying/vehicle-types-and-registrations/commercial-vehicle/goods-vehicle-and-engineering-plant.html",
    "https://www.nea.gov.sg/our-services/hawker-management/becoming-a-hawker",
    "https://www.muis.gov.sg/halal/for-business/scheme-types---eligibility-criteria/",
    "https://www.hsa.gov.sg/tobacco-regulation/licences/retail/apply",
]


In [93]:
# Seconds to wait for each server before giving up on that page.
REQUEST_TIMEOUT_S = 30

# One {"url", "text"} record per page that was fetched successfully.
html_pages = []
# URLs that could not be fetched, kept so they can be retried or inspected.
failed_urls = []

for url in urls:
    try:
        resp = requests.get(url, timeout=REQUEST_TIMEOUT_S)
        # Treat 4xx/5xx responses as failures so error pages are not
        # passed on as if they were licence content.
        resp.raise_for_status()
    except requests.RequestException as exc:
        print(f"⚠️ Skipping {url}: {exc}")
        failed_urls.append(url)
        continue

    soup = BeautifulSoup(resp.text, "html.parser")

    # Extract meaningful text only (remove scripts, styles, etc.)
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()

    text_content = soup.get_text(separator="\n", strip=True)

    html_pages.append({"url": url, "text": text_content})

if failed_urls:
    print(f"{len(failed_urls)} of {len(urls)} pages could not be fetched.")

Step 2: Use LLM to parse structured content

In [94]:
def _strip_code_fences(raw):
    """Return `raw` without a surrounding Markdown code fence, if it has one."""
    cleaned = raw.strip()
    if cleaned.startswith("```"):
        # Drop the opening fence line, which may carry a language tag such as "json".
        first_newline = cleaned.find("\n")
        cleaned = cleaned[first_newline + 1:] if first_newline != -1 else ""
        cleaned = cleaned.rstrip()
        if cleaned.endswith("```"):
            cleaned = cleaned[:-3]
    return cleaned.strip()


def parse_text_with_llm(text, url):
    """Ask the chat model to extract licence records from one page's text.

    Parameters
    ----------
    text : str
        Plain text extracted from the page.
    url : str
        Source URL of the page; it is included in the prompt.

    Returns
    -------
    str
        The model's reply with any surrounding Markdown code fence removed.
        The prompt asks for a JSON array, but the reply is NOT validated here;
        the caller is expected to run ``json.loads`` on it. An empty string is
        returned when the model sends no content.
    """
    prompt = f"""
You are a careful information extraction assistant.

Return ONLY a valid JSON array of objects, no markdown, no comments, no explanations. 
Each object must have:
- "title": short descriptive title (from the text if possible)
- "subsection": subsection heading if available, else null
- "description": relevant text excerpt including URLs if found
- "licence": 
    - one exact matching licence name (from the official list below), OR
    - "No licence required" if the text states that no licence is needed
- "reason_for_licence": short explanation (e.g., “Home-based food businesses do not require an SFA licence to operate. ”)
- "requirements": all requirements or conditions for licence
- "app_guidance": all application or registration processes
- "other": other information
- "url": source page URL

STRICT RULES:
1. Only use the official licence names listed below.
2. If a section explicitly says that a licence is not required, set "licence": "No licence required" and include a short reason.
3. Do NOT invent or guess licence names (e.g., "Vending Machine Licence", "Pet Cafe Licence", "Food Catering Permit"). 
4. Please note that specific types of Food Shops, such as Pet Cafes, Mobile Food Wagons, Vending Machines, Private Markets, Herbal Tea and Mini Restaurants need Food Shop licence and have specific requirements, not Supermarket Licence.
5. If nothing relevant appears, return an empty array [].

VALID LICENCE NAMES:

-- Singapore Food Agency (SFA) --
"Licence for Import/Export/Transhipment of Meat and Fish Products",
"Licence for Import/Transhipment of Fresh Fruits and Vegetables",
"Licence to Import Poultry",
"Licence to Import Sheep & Goats",
"Licence to Import Table Eggs",
"Licence to Import/Export/Tranship Food Animals, Birds, Eggs and Biologics",
"Registration to Import Animal Feed",
"Registration to Import Feed for Food-Producing Animals",
"Registration to Import Processed Food Products and Food Appliances",
"Registration to Import Live Frogs for Human Consumption",
"Rice Licence",
"Food Shop Licence",
"Food Stall Licence",
"Supermarket Licence",
"Permit for Temporary Fair",
"Temporary Fair Stall Licence",
"Licence to Manufacture Animal Feed and Process of Animal Feed for Food-Producing Animals",
"Licence to Operate a Food Processing Establishment",
"Licence to Operate a Coldstore",
"Licence to Operate a Slaughterhouse",
"Export Health Certificate",
"Health Certificate for Export of Live Foodfish for Breeding",
"Free Sale Certificate (Food)",
"Free Sale Certificate (Animal Feed) for Food-Producing Animals",
"Veterinary Certificate for Food-Producing Animals",
"Submission Form for Food Safety Management System (FSMS)",

-- Other Agencies --
"Liquor Licence (Singapore Police Force)",
"Public Entertainment Licence (Singapore Police Force)",
"Hawker Stall Licence (National Environment Agency)",
"Vehicle Registration for Goods Vehicle (Land Transport Authority)",
"Halal Certification (Majlis Ugama Islam Singapura - MUIS)",
"Tobacco Retail Licence (Health Sciences Authority)"

TEXT EXTRACTED FROM HTML:
{text}

URL:
{url}
"""

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        temperature=0
    )
    # `content` can be None; fall back to "" so the caller's json.loads raises
    # JSONDecodeError (which it handles) rather than TypeError.
    reply = response.choices[0].message.content or ""
    # Models sometimes wrap JSON in ```json fences despite the instruction not to.
    return _strip_code_fences(reply)


In [95]:
# Run every fetched page through the LLM and keep the replies that decode as
# JSON. Each decoded reply is appended as one element, so the result holds one
# entry per successfully parsed page.
structured_data = []
for page in html_pages:
    page_url = page['url']
    raw_reply = parse_text_with_llm(page['text'], page_url)
    try:
        decoded = json.loads(raw_reply)
    except json.JSONDecodeError:
        # Show the offending reply so the failure can be diagnosed by hand.
        print(f"❌ Error parsing JSON for {page_url}")
        print(raw_reply)
        continue
    structured_data.append(decoded)
    print(f"✅ Successfully parsed: {page_url}")

✅ Successfully parsed: https://www.sfa.gov.sg/food-import-export/commercial-imports/what-you-need-to-know-for-import-of-animal-feed
✅ Successfully parsed: https://www.sfa.gov.sg/commercial-exports/what-you-need-to-know-for-export-tranship-of-animal-feed
✅ Successfully parsed: https://www.gobusiness.gov.sg/browse-all-licences/singapore-food-agency-(sfa)/licence-for-import-export-transhipment-of-meat-and-fish-products
✅ Successfully parsed: https://www.gobusiness.gov.sg/browse-all-licences/singapore-food-agency-(sfa)/licence-for-import-transhipment-of-fresh-fruits-and-vegetables
✅ Successfully parsed: https://www.gobusiness.gov.sg/browse-all-licences/singapore-food-agency-(sfa)/licence-to-import-poultry
✅ Successfully parsed: https://www.gobusiness.gov.sg/browse-all-licences/Singapore-Food-Agency-(SFA)/Licence-to-Import-Sheep-&-Goats
✅ Successfully parsed: https://www.gobusiness.gov.sg/browse-all-licences/Singapore-Food-Agency-(SFA)/Licence-to-Import-Table-Eggs
✅ Successfully parsed: htt

Save raw data

In [96]:
# Persist the parsed licence records as UTF-8 JSON.
raw_output_path = "data/alllicenceinfo_parsed.json"

# open(..., "w") does not create missing parent directories, so make sure
# the target folder exists before writing.
os.makedirs(os.path.dirname(raw_output_path), exist_ok=True)

with open(raw_output_path, "w", encoding="utf-8") as f:
    json.dump(structured_data, f, indent=2, ensure_ascii=False)

print("Raw Data JSON saved")


Raw Data JSON saved


Adding and Parsing content of additional URLs

In [97]:
# # New URLs only
# new_urls = [
#     "https://www.police.gov.sg/Business-E-Services/Apply-for-Public-Entertainment-Licence"
# ]

# # File path
# output_file = "data/alllicenceinfo_parsed.json"

# # Step 0: Load existing data if file exists
# if os.path.exists(output_file):
#     with open(output_file, "r", encoding="utf-8") as f:
#         structured_data = json.load(f)
# else:
#     structured_data = []

# # Step 1: Fetch & clean HTML for new URLs
# html_pages = []
# for url in new_urls:
#     resp = requests.get(url)
#     soup = BeautifulSoup(resp.text, "html.parser")

#     # Remove noise
#     for tag in soup(["script", "style", "noscript"]):
#         tag.decompose()

#     text_content = soup.get_text(separator="\n", strip=True)
#     html_pages.append({"url": url, "text": text_content})


# # Step 2: LLM parsing function
# def parse_text_with_llm(text, url):
#     prompt = f"""
# You are a careful information extraction assistant specialising in Singapore Food Agency (SFA) and other food-related regulatory licences. 
# Extract information only if it clearly relates to official licences, permits, registrations, or certificates — or if the text explicitly says that no licence is required.

# Return ONLY a valid JSON array of objects, no markdown, no comments, no explanations. 
# Each object must have:
# - "title": short descriptive title (from the text if possible)
# - "description": relevant text excerpt including URLs if found
# - "licence": 
#     - one exact matching licence name (from the official list below), OR
#     - "No licence required" if the text states that no licence is needed
# - "reason": short explanation (e.g., “Prepacked beverages are exempt from licensing”)
# - "subsection": subsection heading if available, else null
# - "url": source page URL

# STRICT RULES:
# 1. Only use the official licence names listed below.
# 2. If a section explicitly says that a licence is not required, set "licence": "No licence required" and include a short reason.
# 3. Do NOT invent or guess licence names (e.g., “Vending Machine Licence”, “Café Licence”, “Food Catering Permit”).
# 4. If nothing relevant appears, return an empty array [].

# VALID LICENCE NAMES:

# -- Singapore Food Agency (SFA) --
# "Licence for Import/Export/Transhipment of Meat and Fish Products",
# "Licence for Import/Transhipment of Fresh Fruits and Vegetables",
# "Licence to Import Poultry",
# "Licence to Import Sheep & Goats",
# "Licence to Import Table Eggs",
# "Licence to Import/Export/Tranship Food Animals, Birds, Eggs and Biologics",
# "Registration to Import Animal Feed",
# "Registration to Import Feed for Food-Producing Animals",
# "Registration to Import Processed Food Products and Food Appliances",
# "Registration to Import Live Frogs for Human Consumption",
# "Rice Licence",
# "Food Shop Licence",
# "Food Stall Licence",
# "Supermarket Licence",
# "Permit for Temporary Fair",
# "Temporary Fair Stall Licence",
# "Licence to Manufacture Animal Feed and Process of Animal Feed for Food-Producing Animals",
# "Licence to Operate a Food Processing Establishment",
# "Licence to Operate a Coldstore",
# "Licence to Operate a Slaughterhouse",
# "Export Health Certificate",
# "Health Certificate for Export of Live Foodfish for Breeding",
# "Free Sale Certificate (Food)",
# "Free Sale Certificate (Animal Feed) for Food-Producing Animals",
# "Veterinary Certificate for Food-Producing Animals",
# "Submission Form for Food Safety Management System (FSMS)",

# -- Other Agencies --
# "Liquor Licence (Singapore Police Force)",
# "https://www.police.gov.sg/Business-E-Services/Apply-for-Public-Entertainment-Licence",
# "Hawker Stall Licence (National Environment Agency)",
# "Vehicle Registration for Goods Vehicle (Land Transport Authority)",
# "Halal Certification (Majlis Ugama Islam Singapura - MUIS)",
# "Tobacco Retail Licence (Health Sciences Authority)"

# TEXT EXTRACTED FROM HTML:
# {text}

# URL:
# {url}
# """

#     response = client.chat.completions.create(
#         model="gpt-4o-mini",
#         messages=[{"role": "user", "content": prompt}],
#         temperature=0
#     )
#     return response.choices[0].message.content


# # Step 3: Process only new URLs and append results
# for page in html_pages:
#     json_text = parse_text_with_llm(page['text'], page['url'])
#     try:
#         data = json.loads(json_text)
#         structured_data.append(data)
#         print(f"✅ Successfully parsed: {page['url']}")
#     except json.JSONDecodeError:
#         print(f"❌ Error parsing JSON for {page['url']}")
#         print(json_text)


# # Step 4: Save updated data back
# with open(output_file, "w", encoding="utf-8") as f:
#     json.dump(structured_data, f, indent=2, ensure_ascii=False)

# print(f"📂 Updated JSON saved at {output_file}")


Preview final JSON

In [98]:
# How many top-level entries of the saved file to display.
preview_count = 3

with open("data/alllicenceinfo_parsed.json", "r", encoding="utf-8") as f:
    preview_data = json.load(f)

# Pretty-print only the leading entries to keep the cell output short.
preview_subset = preview_data[:preview_count]
print(json.dumps(preview_subset, indent=2, ensure_ascii=False))

[
  [
    {
      "title": "Import of Animal Feed for Food-Producing Animals",
      "subsection": null,
      "description": "Businesses that want to import animal feed for food-producing animals (e.g. live chickens and fish) must register with SFA and obtain a Cargo Clearance Permit (CCP) issued by Singapore Customs for each consignment.",
      "licence": "Registration to Import Animal Feed",
      "reason_for_licence": "Businesses must register to import animal feed for food-producing animals.",
      "requirements": "Must be registered with ACRA, activate UEN with Singapore Customs, set up a GIRO arrangement with SFA, and ensure animal feed complies with SFA’s requirements.",
      "app_guidance": "Log into the GoBusiness portal to submit your Registration to Import Animal Feed. Complete applications are processed within 1 working day.",
      "other": "Animal feed must be packaged and labelled in English with specific information.",
      "url": "https://www.sfa.gov.sg/food-impor

### Chunking and Vector Indexing

In [2]:
import json
import os

import faiss
import numpy as np
from langchain.docstore.document import Document
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings

1. Load and flatten JSON

In [3]:
with open("data/alllicenceinfo_parsed.json", "r", encoding="utf-8") as f:
    licence_data = json.load(f)

# The file holds a list whose elements may themselves be lists of records.
# Collapse one level of nesting so `licence_data` ends up as a flat list.
flat_licence_data = []
for entry in licence_data:
    flat_licence_data += entry if isinstance(entry, list) else [entry]
licence_data = flat_licence_data

2. Create Chunks

In [None]:
def _as_text(value):
    """Render one JSON field as plain text for a chunk.

    None (JSON null) becomes an empty string instead of the literal "None",
    and lists are joined with spaces instead of being shown as a Python repr.
    Any other value is converted with ``str`` and stripped.
    """
    if value is None:
        return ""
    if isinstance(value, (list, tuple)):
        parts = (_as_text(element) for element in value)
        return " ".join(part for part in parts if part)
    return str(value).strip()


# Parallel lists: all_chunks[k] is the text to embed, all_metadata[k] describes it.
all_chunks = []
all_metadata = []

for i, item in enumerate(licence_data):
    title = _as_text(item.get("title"))
    url = _as_text(item.get("url"))
    description = _as_text(item.get("description"))
    reason = _as_text(item.get("reason_for_licence"))
    requirements = _as_text(item.get("requirements"))
    app_guidance = _as_text(item.get("app_guidance"))
    other = _as_text(item.get("other"))
    # Fall back to the title when the record has no (or a null) licence name.
    licence_name = _as_text(item.get("licence")) or title

    # Include URL in the chunk text; empty fields are dropped by filter(None, ...).
    chunk_text = "\n".join(filter(None, [description, reason, requirements, app_guidance, other, f"URL: {url}" if url else ""]))
    all_chunks.append(chunk_text)

    # Build an identifier-safe id; the index suffix keeps ids unique even when
    # titles repeat.
    safe_title = "".join(c if c.isalnum() else "_" for c in title) or f"doc_{i}"
    chunk_id = f"{safe_title}_{i}"

    all_metadata.append({
        "chunk_id": chunk_id,
        "title": title,
        "licence_name": licence_name,
        "url": url,
        "original_index": i
    })

print(f"Total chunks created: {len(all_chunks)}")
if all_chunks:
    print("Example chunk:\n", all_chunks[0])


Total chunks created: 48
Example chunk:
 Businesses that want to import animal feed for food-producing animals (e.g. live chickens and fish) must register with SFA and obtain a Cargo Clearance Permit (CCP) issued by Singapore Customs for each consignment.
Businesses must register to import animal feed for food-producing animals.
Must be registered with ACRA, activate UEN with Singapore Customs, set up a GIRO arrangement with SFA, and ensure animal feed complies with SFA’s requirements.
Log into the GoBusiness portal to submit your Registration to Import Animal Feed. Complete applications are processed within 1 working day.
Animal feed must be packaged and labelled in English with specific information.


3. Generate embeddings for all chunks

In [11]:
# Embed every chunk text with the OpenAI embeddings model and keep the result
# as a float16 NumPy array.
embeddings_model = OpenAIEmbeddings()
chunk_embeddings = np.array(
    embeddings_model.embed_documents(all_chunks)
).astype("float16")

4. Create FAISS vector store

In [12]:
# FAISS works on 32-bit floats. Depending on the faiss version, other dtypes
# are either converted implicitly or rejected, so convert explicitly.
index_vectors = np.ascontiguousarray(chunk_embeddings, dtype=np.float32)

dimension = index_vectors.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(index_vectors)


In [13]:
# Single contiguous 2D float16 array (num_chunks, embedding_dim).
# `chunk_embeddings` is already a 2D array, so one conversion replaces the
# per-row reshape and vstack.
X = np.ascontiguousarray(chunk_embeddings, dtype=np.float16)

print("Shape:", X.shape)
print("Dtype:", X.dtype)
print("C_CONTIGUOUS:", X.flags['C_CONTIGUOUS'])

# Create FAISS index
dimension = X.shape[1]
index = faiss.IndexFlatL2(dimension)

# Add embeddings. FAISS works on 32-bit floats, so hand it an explicit float32
# copy instead of relying on version-dependent implicit conversion.
index.add(np.ascontiguousarray(X, dtype=np.float32))
print(f"FAISS index created with {index.ntotal} vectors")

Shape: (48, 1536)
Dtype: float16
C_CONTIGUOUS: True
FAISS index created with 48 vectors


In [14]:
# Keep track of metadata using chunk_id as key
metadata_dict = {meta["chunk_id"]: meta for meta in all_metadata}

# Example: inspect first 3 entries
for k, v in list(metadata_dict.items())[:3]:
    print(f"Chunk ID: {k}")
    print("Metadata:", v)
    print("---")

Chunk ID: Import_of_Animal_Feed_for_Food_Producing_Animals_0
Metadata: {'chunk_id': 'Import_of_Animal_Feed_for_Food_Producing_Animals_0', 'title': 'Import of Animal Feed for Food-Producing Animals', 'licence_name': 'Registration to Import Animal Feed', 'url': 'https://www.sfa.gov.sg/food-import-export/commercial-imports/what-you-need-to-know-for-import-of-animal-feed', 'original_index': 0}
---
Chunk ID: Export___Tranship_of_Animal_Feed_for_Food_Producing_Animals_1
Metadata: {'chunk_id': 'Export___Tranship_of_Animal_Feed_for_Food_Producing_Animals_1', 'title': 'Export / Tranship of Animal Feed for Food-Producing Animals', 'licence_name': 'Licence to Manufacture Animal Feed and Process of Animal Feed for Food-Producing Animals', 'url': 'https://www.sfa.gov.sg/commercial-exports/what-you-need-to-know-for-export-tranship-of-animal-feed', 'original_index': 1}
---
Chunk ID: Licence_for_Import_Export_Transhipment_of_Meat_and_Fish_Products_2
Metadata: {'chunk_id': 'Licence_for_Import_Export_Tr

Save raw chunks, metadata and embeddings

In [15]:
# Save raw chunks
with open("data/all_chunks.json", "w", encoding="utf-8") as f:
    json.dump(all_chunks, f, ensure_ascii=False, indent=2)

# Save metadata
with open("data/all_metadata.json", "w", encoding="utf-8") as f:
    json.dump(all_metadata, f, ensure_ascii=False, indent=2)

# Save embeddings as a contiguous float16 array
X = np.array(chunk_embeddings, dtype=np.float16)          # convert all at once
X = np.ascontiguousarray(X)                               # make contiguous in memory
np.save("data/chunk_embeddings.npy", X)

# Save FAISS index
dimension = X.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(X)                                              # add float16 vectors
faiss.write_index(index, "data/faiss_index.index")

print("Chunks, metadata, embeddings, and FAISS index saved successfully!")

Chunks, metadata, embeddings, and FAISS index saved successfully!


Sample query

In [23]:
# -------------------------------
# 1️⃣ Define the user query
# -------------------------------
business_type = "Vending Machine"
additional_details = "selling raw seafood"

# Combine business type + details into query text for embeddings
query_text = f"Business type: {business_type}. Details: {additional_details}"

# -------------------------------
# 2️⃣ Embed the query
# -------------------------------
# float32 keeps the precision of the returned embedding for the similarity maths.
query_vector = np.array(embeddings_model.embed_query(query_text), dtype=np.float32)

# -------------------------------
# 3️⃣ Compute similarity to ALL chunks
# -------------------------------
def cosine_similarity(a, b):
    """Return the cosine similarity of two 1-D vectors as a Python float.

    Inputs are promoted to float32 before the dot product and norms are taken.
    Returns 0.0 when either vector has zero norm instead of dividing by zero.
    """
    a = np.asarray(a, dtype=np.float32)
    b = np.asarray(b, dtype=np.float32)
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return float(np.dot(a, b) / denominator)

similarities = [cosine_similarity(query_vector, emb) for emb in X]

# -------------------------------
# 4️⃣ Rank chunks by similarity
# -------------------------------
top_n = 12  # get more candidates initially
sorted_indices = np.argsort(similarities)[::-1]
top_indices = sorted_indices[:top_n]

# -------------------------------
# 4a️⃣ Filter by business type keywords
# -------------------------------
business_keywords = ["vending machine", "food shop", "retail outlet"]
lowered_keywords = [kw.lower() for kw in business_keywords]
filtered_indices = []
for idx in top_indices:
    chunk_lower = all_chunks[idx].lower()
    if any(kw in chunk_lower for kw in lowered_keywords):
        filtered_indices.append(idx)

# If the keyword filter removes every candidate, fall back to the unfiltered
# top matches so the LLM is never asked to answer from an empty context.
if not filtered_indices:
    print("No chunk matched the business-type keywords; using the top-ranked chunks instead.")
    filtered_indices = list(top_indices)

filtered_chunks = [all_chunks[i] for i in filtered_indices]
filtered_metadata = [all_metadata[i] for i in filtered_indices]

print(f"Selected {len(filtered_chunks)} chunks after business-type filtering.\n")

# Debug: show scores and metadata
for idx in filtered_indices:
    meta = all_metadata[idx]
    print(f"Score: {similarities[idx]:.3f}")
    print("Title:", meta["title"])
    print("Licence:", meta.get("licence_name", meta["title"]))
    print("Preview:", all_chunks[idx][:200], "\n---")

# -------------------------------
# 5️⃣ Combine chunks into prompt
# -------------------------------
combined_context = "\n\n".join(filtered_chunks)

prompt = f"""
You are a knowledgeable assistant for Singapore food business regulations.

Given the following government-sourced information:

{combined_context}

The user wants to open a business with these details:
Business type: {business_type}
Product sold: {additional_details}

Instructions for you:
- Only list licences, permits, or approvals that are specifically relevant to this type of business and product.
- Ignore general licences that cover unrelated food types or retail formats.
- Provide plain-language explanation of why each licence is required.
- Include step-by-step guidance or application URLs if available.
"""

# -------------------------------
# 6️⃣ Ask LLM to summarise
# -------------------------------
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": prompt}],
    temperature=0
)

summary = response.choices[0].message.content
print("\n--- Summary for User ---\n")
print(summary)


Selected 2 chunks after business-type filtering.

Score: 0.787
Title: Requirements for Food Shop Licences
Licence: Food Shop Licence
Preview: Food Shops include bakeries, restaurants, food vending machines, canteens, coffee shops, eating houses, food courts, standalone kiosks, takeaway food carts, and private canteens/markets. The requireme 
---
Score: 0.765
Title: Food Shop Licence
Licence: Food Shop Licence
Preview: For the operation of a food shop where there is retail sale of food and/or beverage. You will need to apply for a Food Shop Licence if you intend to operate a retail food outlet where food and/or drin 
---

--- Summary for User ---

To operate a vending machine business selling raw seafood in Singapore, you will need to obtain specific licenses and approvals. Here’s a breakdown of the relevant requirements:

### 1. Food Shop Licence
**Why it's required:** The Food Shop Licence is necessary for any food retail outlet, including vending machines, to ensure that the business