In [29]:
# file: scripts/normalize_menu.py

import csv
import re
from html import unescape
from pathlib import Path

INPUT_PATH = Path("full_menu.txt")
OUTPUT_PATH = Path("menu_normalized_443.csv")

COFFEE_DRINKS = {"americano", "cappuccino", "affogato"}

def clean_name(raw: str) -> str:
    s = unescape(raw)
    s = s.replace("Pidge/porter", "Pidge Porter")
    s = s.replace("Piec", "Pie")
    s = s.replace("Vanila", "Vanilla")
    s = re.sub(r"\s+", " ", s)
    s = re.sub(r"\(.*?\)", "", s).strip()
    return s

def infer_type(name: str, raw: str) -> str:
    n = name.lower()
    r = raw.lower()

    if "employee dessert" in r:
        return "DESSERT"
    if "visit" in r:
        return "SERVICE"
    if n in COFFEE_DRINKS:
        return "DRINK"
    if "ice cream" in n:
        return "ICE_CREAM"
    if any(x in n for x in ["cake", "brownie", "cheesecake", "tiramisu", "lamington", "tres leches", "cookie"]):
        return "DESSERT"
    if any(x in n for x in ["cone", "bottle", "packaging", "charges"]):
        return "EXTRA"
    if any(x in n for x in ["duo", "combo", "pack"]):
        return "COMBO"
    return "DESSERT"

def infer_variant(raw: str) -> str:
    r = raw.lower()

    if "employee dessert" in r:
        return "1_PIECE"
    if "visit" in r:
        return "FACTORY_VISIT"
    if "2pcs" in r:
        return "2_PIECES"
    if "1pcs" in r or "1pc" in r:
        return "1_PIECE"

    m = re.search(r"(\d+)\s*(ml|gms|gm|kg)", r)
    if m:
        return f"{m.group(1)}_{m.group(2).upper()}"

    if "junior scoop" in r:
        return "JUNIOR_SCOOP"
    if "regular scoop" in r or r.endswith("(scoop)"):
        return "REGULAR_SCOOP"
    if "mini tub" in r or "mini indulgence" in r:
        return "MINI_TUB"
    if "family feast" in r:
        return "FAMILY_FEAST"
    if "family tub" in r:
        return "FAMILY_TUB"
    if "perfect plenty" in r:
        return "PERFECT_PLENTY"

    return "1_PIECE"

def main():
    with INPUT_PATH.open(encoding="utf-8") as f:
        raw_items = [line.strip(" ,{}'") for line in f if line.strip()]

    rows = []
    for raw in raw_items:
        name = clean_name(raw)
        item_type = infer_type(name, raw)
        variant = infer_variant(raw)
        rows.append((name, item_type, variant))

    with OUTPUT_PATH.open("w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["name", "type", "variant"])
        writer.writerows(rows)

    print(f"âœ… Rows written: {len(rows)}")
    print(f"ðŸ“„ Output file: {OUTPUT_PATH}")

In [30]:
main()

âœ… Rows written: 443
ðŸ“„ Output file: menu_normalized_443.csv


In [22]:
def extract_item_variant(item_name):
    weight_gms = None
    variant_name = None
    base_name = item_name.strip()

    # Extract weight
    weight_match = re.search(r'\((\d+)\s*gms?\)', item_name, re.IGNORECASE)
    if weight_match:
        weight_gms = int(weight_match.group(1))

    # Extract variant (anything inside first parentheses)
    variant_match = re.search(r'\(([^()]*)\)', item_name)
    if variant_match:
        variant_name = variant_match.group(1).strip()
        base_name = item_name[:variant_match.start()].strip()

    return base_name, variant_name, weight_gms


def extract_unique_pairs(order_payloads):
    unique_items = set()

    for order in order_payloads:
        for item in order.get("OrderItem", []):
            base, variant, weight = extract_item_variant(item["name"])
            unique_items.add((base, variant, weight, False))

            for addon in item.get("addon", []):
                base_a, variant_a, weight_a = extract_item_variant(addon["name"])
                unique_items.add((base_a, variant_a, weight_a, True))

    return sorted(
        unique_items,
        key=lambda x: (x[0] or "", x[1] or "", x[2] or 0, x[3])
    )


pairs = extract_unique_pairs(x)

for p in pairs:
    print(p)


('Banoffee Ice Cream', '160gm', 160, True)
('Banoffee Ice Cream (Family Tub', '500gms', 500, False)
('Banoffee Ice Cream (Junior Scoop', '60gm', 60, False)
('Banoffee Ice Cream (Mini Tub', '160gms', 160, False)
('Banoffee Ice Cream (Regular Scoop', '120gm', 120, False)
('Bean To Bar Dark Chocolate Ice Cream Small Scoop', None, None, True)
('Bean-to-bar 70% Dark Chocolate Ice Cream', '160gm', 160, True)
('Bean-to-bar 70% Dark Chocolate Ice Cream 200ml', None, None, True)
('Bean-to-bar Chocolate Dark Ice Cream (Regular Scoop', '120gm', 120, False)
('Bean-to-bar Dark Chocolate Ice Cream (Mini Tub', '160gms', 160, False)
('Bean-to-bar Dark Chocolate Ice Cream (Regular Tub', '220gms', 220, False)
('Boston Cream Pie', '1pcs', None, False)
('Boston Cream Pie Dessert', '2pcs', None, False)
('Boston Cream Piec', '2pcs', None, False)
('Brownie Cheesecake', None, None, False)
('Butter Waffle Cone', '2pcs', None, True)
('Butter Waffle Cones', '1pcs', None, False)
('Cakes & Cookies', '160gm', 160, 

In [10]:
rows

[{'canonical_name': 'Affogato Ice Cream',
  'item_type': 'ICE_CREAM',
  'contains_alcohol': False,
  'is_eggless': False,
  'is_defunct': False,
  'created_at': datetime.datetime(2026, 1, 8, 17, 28, 43, 225744),
  'updated_at': datetime.datetime(2026, 1, 8, 17, 28, 43, 225744)},
 {'canonical_name': 'Alphonso Mango Ice Cream',
  'item_type': 'ICE_CREAM',
  'contains_alcohol': False,
  'is_eggless': False,
  'is_defunct': False,
  'created_at': datetime.datetime(2026, 1, 8, 17, 28, 43, 225744),
  'updated_at': datetime.datetime(2026, 1, 8, 17, 28, 43, 225744)},
 {'canonical_name': 'Americano Ice Cream',
  'item_type': 'ICE_CREAM',
  'contains_alcohol': False,
  'is_eggless': False,
  'is_defunct': False,
  'created_at': datetime.datetime(2026, 1, 8, 17, 28, 43, 225744),
  'updated_at': datetime.datetime(2026, 1, 8, 17, 28, 43, 225744)},
 {'canonical_name': 'Banoffee Ice Cream',
  'item_type': 'ICE_CREAM',
  'contains_alcohol': False,
  'is_eggless': False,
  'is_defunct': False,
  'creat