In [2]:
import requests
import json
import html
import re
from typing import Optional, List, Dict, Any

BASE_URL = "https://webhooks.db1-prod-dachnona.store/analytics"
API_KEY = "f3e1753aa4c44159fa7218a31cd8db1e"

HEADERS = {
    "X-API-Key": API_KEY,
}

def _safe_json_load(value):
    if value is None:
        return None
    if isinstance(value, (dict, list)):
        return value
    try:
        return json.loads(value)
    except (TypeError, ValueError):
        return value

def fetch_stream_raw(
    endpoint: str,
    limit: int = 500,
    start_cursor: Optional[int] = 0,
) -> List[Dict[str, Any]]:
    results = []
    last_stream_id = start_cursor or 0

    while True:
        params = {
            "limit": limit,
            "cursor": last_stream_id,
        }

        resp = requests.get(
            f"{BASE_URL}/{endpoint}/",
            headers=HEADERS,
            params=params,
            timeout=60,
        )
        resp.raise_for_status()

        payload = resp.json()
        batch = payload.get("data", [])

        if not batch:
            break

        results.extend(batch)
        last_stream_id = batch[-1]["stream_id"]

        if len(batch) < limit:
            break

    return results

records = fetch_stream_raw("orders")

In [10]:
x = []
for r in records:
    for i in r['raw_event']['raw_payload']['properties']['OrderItem']:
        if "Employee" in i['name']:
            x.append(r)  

In [16]:
x[-1]['raw_event']['raw_payload']['properties']

{'Tax': [{'rate': 2.5, 'type': 'P', 'title': 'CGST@2.5', 'amount': 0},
  {'rate': 2.5, 'type': 'P', 'title': 'SGST@2.5', 'amount': 0}],
 'Order': {'total': 0,
  'biller': 'biller (biller2173)',
  'status': 'Success',
  'comment': '',
  'orderID': 5301,
  'assignee': '',
  'table_no': '2',
  'token_no': '30',
  'round_off': '0',
  'tax_total': 0,
  'core_total': 0,
  'created_on': '2026-01-02 22:48:41',
  'order_from': 'POS',
  'order_type': 'Dine In',
  'payment_type': 'Card',
  'no_of_persons': 0,
  'order_from_id': '',
  'discount_total': 0,
  'service_charge': 0,
  'sub_order_type': 'AC',
  'delivery_charges': 0,
  'packaging_charge': 0,
  'customer_invoice_id': '5301'},
 'Customer': {'name': 'Kritika Sharma',
  'gstin': '',
  'phone': '7045656234',
  'address': ''},
 'Discount': [],
 'OrderItem': [{'tax': 0,
   'name': 'Employee Dessert ( Any 1 )',
   'addon': [],
   'price': 0,
   'total': 0,
   'itemid': 1292464076,
   'discount': 0,
   'itemcode': 'employee dessert ( a',
   'qua

In [4]:
x = []
for i in records:
    addons = i['raw_event']['raw_payload']['properties']['OrderItem'][0].get('addon', [])

    if any(float(a.get('price', 0)) > 0 for a in addons):
        x.append((i['raw_event']['raw_payload']['properties']))


In [8]:
def extract_item_variant(item_name):
    weight_gms = None
    variant_name = None
    base_name = item_name.strip()

    # Extract weight
    weight_match = re.search(r'\((\d+)\s*gms?\)', item_name, re.IGNORECASE)
    if weight_match:
        weight_gms = int(weight_match.group(1))

    # Extract variant (anything inside first parentheses)
    variant_match = re.search(r'\(([^()]*)\)', item_name)
    if variant_match:
        variant_name = variant_match.group(1).strip()
        base_name = item_name[:variant_match.start()].strip()

    return base_name, variant_name, weight_gms


def extract_unique_pairs(order_payloads):
    unique_items = set()

    for order in order_payloads:
        for item in order.get("OrderItem", []):
            base, variant, weight = extract_item_variant(item["name"])
            unique_items.add((base, variant, weight, False))

            for addon in item.get("addon", []):
                base_a, variant_a, weight_a = extract_item_variant(addon["name"])
                unique_items.add((base_a, variant_a, weight_a, True))

    return sorted(
        unique_items,
        key=lambda x: (x[0] or "", x[1] or "", x[2] or 0, x[3])
    )


pairs = extract_unique_pairs(x)

for p in pairs:
    print(p)


('Banoffee Ice Cream', '160gm', 160, True)
('Banoffee Ice Cream (Family Tub', '500gms', 500, False)
('Banoffee Ice Cream (Junior Scoop', '60gm', 60, False)
('Banoffee Ice Cream (Mini Tub', '160gms', 160, False)
('Banoffee Ice Cream (Regular Scoop', '120gm', 120, False)
('Bean To Bar Dark Chocolate Ice Cream Small Scoop', None, None, True)
('Bean-to-bar 70% Dark Chocolate Ice Cream', '160gm', 160, True)
('Bean-to-bar 70% Dark Chocolate Ice Cream 200ml', None, None, True)
('Bean-to-bar Chocolate Dark Ice Cream (Regular Scoop', '120gm', 120, False)
('Bean-to-bar Dark Chocolate Ice Cream (Mini Tub', '160gms', 160, False)
('Bean-to-bar Dark Chocolate Ice Cream (Regular Tub', '220gms', 220, False)
('Boston Cream Pie', '1pcs', None, False)
('Boston Cream Pie Dessert', '2pcs', None, False)
('Boston Cream Piec', '2pcs', None, False)
('Brownie Cheesecake', None, None, False)
('Butter Waffle Cone', '2pcs', None, True)
('Butter Waffle Cones', '1pcs', None, False)
('Cakes & Cookies', '160gm', 160, 

In [60]:
records[0]['raw_event']['raw_payload']['properties']['OrderItem'][0]['name']

'Bean-to-bar 70% Dark Chocolate Ice Cream (Perfect Plenty (300ml))'

In [68]:
item_name = []
addon_name = []
for record in records:
    item_lst = record['raw_event']['raw_payload']['properties']['OrderItem']
    for item in item_lst:
        item_name.append(item['name'])
        for addon in item['addon']:
            addon_name.append(addon['name'])            

In [67]:
records[115]['raw_event']['raw_payload']['properties']['OrderItem'][0]

{'tax': 36.0,
 'name': 'Cakes & Cookies Ice Cream (Regular Scoop)',
 'addon': [{'name': 'Cup',
   'price': 0,
   'addonid': '53392899',
   'quantity': '1',
   'group_name': 'Cuporcone',
   'addon_sap_code': ''}],
 'price': 200.0,
 'total': 200.0,
 'itemid': 1284985096,
 'discount': 0.0,
 'itemcode': 'CCIceream',
 'quantity': 1,
 'sap_code': '',
 'specialnotes': '',
 'category_name': 'The Curiosity Cabinet',
 'vendoritemcode': ''}

In [74]:
set(addon_name)

{'Banoffee Ice Cream (160gm)',
 'Bean To Bar Dark Chocolate Ice Cream Small Scoop',
 'Bean-to-bar 70% Dark Chocolate Ice Cream (160gm)',
 'Bean-to-bar 70% Dark Chocolate Ice Cream 200ml',
 'Butter Waffle Cone (2pcs)',
 'Cakes & Cookies (160gm)',
 'Cakes & Cookies 200ml',
 'Cakes & Cookies Ice Cream (160gm)',
 'Cakes & Cookies Ice Cream 200ml',
 'Cakes & Cookies Ice Cream Small Scoop',
 'Cakes &amp; Cookies 200ml',
 'Cakes &amp; Cookies Ice Cream 200ml',
 'Cakes &amp; Cookies Ice Cream Small Scoop',
 'Cherry & Chocolate (160gm)',
 'Chocolate & Orange With Alcohol (160gm)',
 'Chocolate Overload (160gm)',
 'Coconut & Pineapple (160gm)',
 'Coconut &amp; Pineapple (160gm)',
 'Coffee Mascarpone Ice Cream (160gm)',
 'Coffee Mascarpone Ice Cream 200ml',
 'Coffee Mascarpone Small Scoop',
 'Cup',
 'Dates & Chocolate (160gm)',
 'Dates & Chocolate 200ml',
 'Dates & Chocolate 200ml (eggless)',
 'Dates &amp; Chocolate Small Scoop',
 'Dates With Fig & Orange (160gm)',
 'Dates With Fig & Orange 200ml'

In [9]:
VARIANT_MASTER = [
    # Scoops
    {"code": "JUNIOR_SCOOP", "patterns": ["junior scoop"], "unit_type": "GM", "unit_value": 60},
    {"code": "REGULAR_SCOOP", "patterns": ["regular scoop"], "unit_type": "GM", "unit_value": 120},
    {"code": "SMALL_SCOOP", "patterns": ["small scoop"], "unit_type": "GM", "unit_value": None},

    # Tubs
    {"code": "MINI_TUB", "patterns": ["mini tub"], "unit_type": "GM", "unit_value": 160},
    {"code": "REGULAR_TUB", "patterns": ["regular tub"], "unit_type": "GM", "unit_value": 220},
    {"code": "FAMILY_TUB", "patterns": ["family tub"], "unit_type": "GM", "unit_value": 500},

    # Volume
    {"code": "200_ML", "patterns": ["200ml"], "unit_type": "ML", "unit_value": 200},
    {"code": "300_ML", "patterns": ["300ml"], "unit_type": "ML", "unit_value": 300},

    # Pieces / Desserts
    {"code": "ONE_PC", "patterns": ["1pc", "1pcs"], "unit_type": "PCS", "unit_value": 1},
    {"code": "TWO_PCS", "patterns": ["2pcs"], "unit_type": "PCS", "unit_value": 2},
    {"code": "SINGLE_SERVE", "patterns": [], "unit_type": "PCS", "unit_value": 1},

    # Packaging
    {"code": "CUP", "patterns": ["cup"], "unit_type": "PCS", "unit_value": 1},
    {"code": "TAKEAWAY_CUP", "patterns": ["takeaway cup"], "unit_type": "PCS", "unit_value": 1},
    {"code": "WAFFLE_CONE", "patterns": ["waffle cone"], "unit_type": "PCS", "unit_value": 1},
]


In [10]:
def finalize_canonical_name(name):
    if not name:
        return None

    # Remove dangling brackets and punctuation
    name = re.sub(r"[\(\)\-]+$", "", name)
    name = re.sub(r"\s+", " ", name)

    return name.strip().title()


VARIANT_PATTERNS = [
    "junior scoop", "regular scoop", "small scoop",
    "mini tub", "regular tub", "family tub",
    "200ml", "300ml",
    "1pc", "1pcs", "2pcs"
]


def normalize_text(text):
    if not text:
        return None
    text = html.unescape(text)
    text = text.strip()
    text = re.sub(r"\s+", " ", text)
    return text


def extract_flags(name):
    name_l = name.lower()
    return {
        "contains_alcohol": "alcohol" in name_l,
        "is_eggless": "eggless" in name_l
    }


def strip_attributes(name):
    """
    Removes non-variant attributes like 'eggless', 'contains alcohol'
    """
    name = re.sub(r"\beggless\b", "", name, flags=re.IGNORECASE)
    name = re.sub(r"\bcontains alcohol\b", "", name, flags=re.IGNORECASE)
    return normalize_text(name)


def detect_variant(text):
    if not text:
        return None

    text_l = text.lower()
    for v in VARIANT_MASTER:
        for p in v["patterns"]:
            if p in text_l:
                return v
    return None


def strip_variant_from_name(name):
    """
    Removes variant words from canonical item name
    """
    name_l = name.lower()
    for p in VARIANT_PATTERNS:
        if p in name_l:
            name_l = name_l.replace(p, "")
    return normalize_text(name_l)

def normalize_stage2_2(extracted_rows):
    normalized = []
    failed = []

    for base, variant, weight, is_addon in extracted_rows:
        base_raw = normalize_text(base)
        variant_raw = normalize_text(variant)

        flags = extract_flags(base_raw)
        base_clean = strip_attributes(base_raw)

        # 1Ô∏è‚É£ Packaging short-circuit
        if base_clean.lower() in ["cup", "takeaway cup", "waffle cone", "butter waffle cone"]:
            normalized.append({
                "canonical_item_name": base_clean.title(),
                "variant_code": "SINGLE_SERVE",
                "unit_type": "PCS",
                "unit_value": 1,
                "is_addon": True,
                "contains_alcohol": False,
                "is_eggless": False
            })
            continue

        # 2Ô∏è‚É£ Detect variant (variant field ‚Üí base field)
        v = detect_variant(variant_raw) or detect_variant(base_clean)

        # 3Ô∏è‚É£ Infer by weight
        if not v and weight:
            weight_map = {
                60: "JUNIOR_SCOOP",
                120: "REGULAR_SCOOP",
                160: "MINI_TUB",
                220: "REGULAR_TUB",
                500: "FAMILY_TUB"
            }
            if weight in weight_map:
                v = next(x for x in VARIANT_MASTER if x["code"] == weight_map[weight])

        # 4Ô∏è‚É£ PCS fallback from variant text
        if not v and variant_raw:
            v = detect_variant(variant_raw)

        # 5Ô∏è‚É£ Dessert fallback (single serve)
        if not v and not is_addon:
            v = next(x for x in VARIANT_MASTER if x["code"] == "SINGLE_SERVE")

        # 6Ô∏è‚É£ True failure (combos, bundles)
        if not v:
            failed.append({
                "base_name": base,
                "variant": variant,
                "weight": weight,
                "is_addon": is_addon,
                "reason": "COMBO_OR_UNSUPPORTED"
            })
            continue

        canonical_name = finalize_canonical_name(strip_variant_from_name(base_clean))

        normalized.append({
            "canonical_item_name": canonical_name,
            "variant_code": v["code"],
            "unit_type": v["unit_type"],
            "unit_value": v["unit_value"],
            "is_addon": is_addon,
            "contains_alcohol": flags["contains_alcohol"],
            "is_eggless": flags["is_eggless"]
        })

    return normalized, failed

In [11]:
normalized_rows, failed_rows = normalize_stage2_2(pairs)

print("NORMALIZED:")
for r in normalized_rows[:10]:
    print(r)

print("\nFAILED (needs manual review):")
for f in failed_rows:
    print(f)


NORMALIZED:
{'canonical_item_name': 'Banoffee Ice Cream', 'variant_code': 'MINI_TUB', 'unit_type': 'GM', 'unit_value': 160, 'is_addon': True, 'contains_alcohol': False, 'is_eggless': False}
{'canonical_item_name': 'Banoffee Ice Cream', 'variant_code': 'FAMILY_TUB', 'unit_type': 'GM', 'unit_value': 500, 'is_addon': False, 'contains_alcohol': False, 'is_eggless': False}
{'canonical_item_name': 'Banoffee Ice Cream', 'variant_code': 'JUNIOR_SCOOP', 'unit_type': 'GM', 'unit_value': 60, 'is_addon': False, 'contains_alcohol': False, 'is_eggless': False}
{'canonical_item_name': 'Banoffee Ice Cream', 'variant_code': 'MINI_TUB', 'unit_type': 'GM', 'unit_value': 160, 'is_addon': False, 'contains_alcohol': False, 'is_eggless': False}
{'canonical_item_name': 'Banoffee Ice Cream', 'variant_code': 'REGULAR_SCOOP', 'unit_type': 'GM', 'unit_value': 120, 'is_addon': False, 'contains_alcohol': False, 'is_eggless': False}
{'canonical_item_name': 'Bean To Bar Dark Chocolate Ice Cream', 'variant_code': 'SMA

In [12]:
import pandas as pd
from collections import defaultdict


In [13]:
def infer_item_type(name: str) -> str:
    n = name.lower()

    # Packaging
    if any(x in n for x in ["cup", "cone"]):
        return "EXTRAS"

    # Desserts
    if any(x in n for x in [
        "cake", "brownie", "cheesecake", "tiramisu", "lamington", "pie"
    ]):
        return "DESSERT"

    # Combos / Meals
    if any(x in n for x in ["combo", "pack", "duo", "meal"]):
        return "COMBO"

    # Default
    return "ICE_CREAM"

import pandas as pd

def build_menu_item_table(normalized_rows):
    items = {}

    for r in normalized_rows:
        name = r["canonical_item_name"]

        if name not in items:
            items[name] = {
                "canonical_name": name,
                "item_type": infer_item_type(name),
                "contains_alcohol": r["contains_alcohol"],
                "is_eggless": r["is_eggless"],
                "is_defunct": False
            }
        else:
            # Merge flags safely across variants
            items[name]["contains_alcohol"] |= r["contains_alcohol"]
            items[name]["is_eggless"] |= r["is_eggless"]

    return (
        pd.DataFrame(items.values())
        .sort_values(["item_type", "canonical_name"])
        .reset_index(drop=True)
    )

menu_item_df = build_menu_item_table(normalized_rows)
menu_item_df


Unnamed: 0,canonical_name,item_type,contains_alcohol,is_eggless,is_defunct
0,Design Family Pack Of 3 Ice Creams,COMBO,False,False,False
1,Design Your Indulgence Duo Ice Creams,COMBO,False,True,False
2,Half In Half Combo,COMBO,False,False,False
3,Boston Cream Pie,DESSERT,False,False,False
4,Boston Cream Pie Dessert,DESSERT,False,False,False
5,Boston Cream Piec,DESSERT,False,False,False
6,Brownie Cheesecake,DESSERT,False,False,False
7,Cakes & Cookies,DESSERT,False,False,False
8,Cakes & Cookies Ice Cream,DESSERT,False,False,False
9,Classic Brownie & Ice Cream With Fudge Sauce,DESSERT,False,False,False


In [14]:
def build_menu_item_variant_table(normalized_rows):
    variants = {}
    id_counter = 1

    for r in normalized_rows:
        key = (r["canonical_item_name"], r["variant_code"])

        if key not in variants:
            variants[key] = {
                "menu_item_variant_id": id_counter,
                "canonical_item_name": r["canonical_item_name"],
                "variant_code": r["variant_code"],
                "unit_type": r["unit_type"],
                "unit_value": r["unit_value"],
                "is_addon": r["is_addon"],
                "is_platform_only": False,
                "is_defunct": False
            }
            id_counter += 1

    return pd.DataFrame(variants.values()) \
             .sort_values(["canonical_item_name", "variant_code"])


In [15]:
menu_item_variant_df = build_menu_item_variant_table(normalized_rows)
menu_item_variant_df

Unnamed: 0,menu_item_variant_id,canonical_item_name,variant_code,unit_type,unit_value,is_addon,is_platform_only,is_defunct
60,61,Banoffee Ice Cream,200_ML,ML,200.0,True,False,False
1,2,Banoffee Ice Cream,FAMILY_TUB,GM,500.0,False,False,False
2,3,Banoffee Ice Cream,JUNIOR_SCOOP,GM,60.0,False,False,False
0,1,Banoffee Ice Cream,MINI_TUB,GM,160.0,True,False,False
3,4,Banoffee Ice Cream,REGULAR_SCOOP,GM,120.0,False,False,False
...,...,...,...,...,...,...,...,...
125,126,Sunshine Limone Ice Cream,REGULAR_SCOOP,GM,120.0,False,False,False
126,127,Sunshine Limone Ice Cream,REGULAR_TUB,GM,220.0,False,False,False
128,129,Sunshine Limone Ice Cream,SMALL_SCOOP,GM,,True,False,False
129,130,Takeaway Cup,SINGLE_SERVE,PCS,1.0,True,False,False


In [16]:
def build_variant_master_table():
    return pd.DataFrame([
        {"variant_code": "JUNIOR_SCOOP", "display_name": "Junior Scoop", "unit_type": "GM", "unit_value": 60},
        {"variant_code": "REGULAR_SCOOP", "display_name": "Regular Scoop", "unit_type": "GM", "unit_value": 120},
        {"variant_code": "SMALL_SCOOP", "display_name": "Small Scoop", "unit_type": "GM", "unit_value": None},
        {"variant_code": "MINI_TUB", "display_name": "Mini Tub", "unit_type": "GM", "unit_value": 160},
        {"variant_code": "REGULAR_TUB", "display_name": "Regular Tub", "unit_type": "GM", "unit_value": 220},
        {"variant_code": "FAMILY_TUB", "display_name": "Family Tub", "unit_type": "GM", "unit_value": 500},
        {"variant_code": "200_ML", "display_name": "200 ml", "unit_type": "ML", "unit_value": 200},
        {"variant_code": "300_ML", "display_name": "300 ml", "unit_type": "ML", "unit_value": 300},
        {"variant_code": "ONE_PC", "display_name": "1 pc", "unit_type": "PCS", "unit_value": 1},
        {"variant_code": "TWO_PCS", "display_name": "2 pcs", "unit_type": "PCS", "unit_value": 2},
        {"variant_code": "SINGLE_SERVE", "display_name": "Single Serve", "unit_type": "PCS", "unit_value": 1},
    ])
variant_master_df = build_variant_master_table()
variant_master_df


Unnamed: 0,variant_code,display_name,unit_type,unit_value
0,JUNIOR_SCOOP,Junior Scoop,GM,60.0
1,REGULAR_SCOOP,Regular Scoop,GM,120.0
2,SMALL_SCOOP,Small Scoop,GM,
3,MINI_TUB,Mini Tub,GM,160.0
4,REGULAR_TUB,Regular Tub,GM,220.0
5,FAMILY_TUB,Family Tub,GM,500.0
6,200_ML,200 ml,ML,200.0
7,300_ML,300 ml,ML,300.0
8,ONE_PC,1 pc,PCS,1.0
9,TWO_PCS,2 pcs,PCS,2.0


In [17]:
menu_item_variant_platform_df = pd.DataFrame(columns=[
    "menu_item_variant_id",
    "platform",
    "price",
    "is_available",
    "platform_sku_code",
    "effective_from",
    "effective_to"
])

menu_item_variant_platform_df


Unnamed: 0,menu_item_variant_id,platform,price,is_available,platform_sku_code,effective_from,effective_to


In [18]:
menu_item_records = menu_item_df.to_dict(orient="records")
menu_item_variant_records = menu_item_variant_df.to_dict(orient="records")
variant_master_records = variant_master_df.to_dict(orient="records")


In [19]:
menu_item_variant_df[menu_item_variant_df['variant_code'] == 'SINGLE_SERVE']

Unnamed: 0,menu_item_variant_id,canonical_item_name,variant_code,unit_type,unit_value,is_addon,is_platform_only,is_defunct
13,14,Brownie Cheesecake,SINGLE_SERVE,PCS,1.0,False,False,False
14,15,Butter Waffle Cone,SINGLE_SERVE,PCS,1.0,True,False,False
33,34,Classic Brownie & Ice Cream With Fudge Sauce,SINGLE_SERVE,PCS,1.0,False,False,False
36,37,Classic Tiramisu,SINGLE_SERVE,PCS,1.0,False,False,False
45,46,Cup,SINGLE_SERVE,PCS,1.0,True,False,False
46,47,D&N Traditional Plum Cake,SINGLE_SERVE,PCS,1.0,False,False,False
58,59,Design Family Pack Of 3 Ice Creams,SINGLE_SERVE,PCS,1.0,False,False,False
103,104,New York Baked Cheesecake,SINGLE_SERVE,PCS,1.0,False,False,False
109,110,Orange & Chocolate Cheesecake,SINGLE_SERVE,PCS,1.0,False,False,False
129,130,Takeaway Cup,SINGLE_SERVE,PCS,1.0,True,False,False


In [20]:
import re
import html
from typing import Optional, Dict

# -------------------------
# Variant Master (LOCKED)
# -------------------------
VARIANT_PATTERNS = [
    ("JUNIOR_SCOOP", r"junior\s*scoop|60\s*gm"),
    ("REGULAR_SCOOP", r"regular\s*scoop|120\s*gm"),
    ("MINI_TUB", r"mini\s*tub|160\s*gm"),
    ("REGULAR_TUB", r"regular\s*tub|220\s*gm"),
    ("FAMILY_TUB", r"family\s*tub|500\s*gm"),
    ("200_ML", r"200\s*ml"),
    ("ONE_PC", r"1\s*pc|1\s*pcs"),
    ("TWO_PC", r"2\s*pc|2\s*pcs"),
]

PACKAGING_KEYWORDS = [
    "cup",
    "cone",
    "waffle cone",
]
def normalize_text(text: str) -> str:
    if not text:
        return ""

    text = html.unescape(text)
    text = text.replace("&", " & ")
    text = re.sub(r"\s+", " ", text)
    return text.strip()
def extract_attributes(name: str) -> Dict[str, bool]:
    name_l = name.lower()

    return {
        "is_eggless": "eggless" in name_l,
        "contains_alcohol": "alcohol" in name_l,
    }
def extract_variant(name: str) -> Optional[str]:
    name_l = name.lower()

    for variant_code, pattern in VARIANT_PATTERNS:
        if re.search(pattern, name_l):
            return variant_code

    return None
def extract_canonical_name(raw_name: str) -> str:
    name = normalize_text(raw_name)

    # Remove bracketed noise
    name = re.sub(r"\([^)]*\)", "", name)

    # Remove variant keywords
    name = re.sub(
        r"\b("
        r"junior scoop|regular scoop|mini tub|regular tub|family tub|"
        r"\d+\s*gm|\d+\s*gms|\d+\s*ml|small scoop|"
        r"contains alcohol|eggless"
        r")\b",
        "",
        name,
        flags=re.IGNORECASE,
    )

    # Normalize known suffix noise
    name = re.sub(r"\bice cream ice cream\b", "Ice Cream", name, flags=re.IGNORECASE)

    # Collapse multiple spaces
    name = re.sub(r"\s+", " ", name).strip()

    # Title case
    return name.title()
def infer_item_type(canonical_name: str) -> str:
    n = canonical_name.lower()

    if any(p in n for p in ["cup", "cone"]):
        return "EXTRAS"
    if any(k in n for k in ["cake", "brownie", "cheesecake", "tiramisu", "pie", "lamington"]):
        return "DESSERT"
    if any(k in n for k in ["combo", "pack", "duo", "meal"]):
        return "COMBO"
    return "ICE_CREAM"
def normalize_menu_entity(
    raw_name: str,
    is_addon: bool,
) -> Dict[str, Optional[str]]:
    raw_name = normalize_text(raw_name)

    attrs = extract_attributes(raw_name)
    variant_code = extract_variant(raw_name)
    canonical_name = extract_canonical_name(raw_name)
    item_type = infer_item_type(canonical_name)

    return {
        "canonical_item_name": canonical_name,
        "variant_code": variant_code,
        "item_type": item_type,
        "is_addon": is_addon,
        "contains_alcohol": attrs["contains_alcohol"],
        "is_eggless": attrs["is_eggless"],
    }
normalized_rows = []

for order in records:
    for item in order["raw_event"]["raw_payload"]["properties"].get("OrderItem", []):
        normalized_rows.append(
            normalize_menu_entity(item["name"], is_addon=False)
        )

        for addon in item.get("addon", []):
            normalized_rows.append(
                normalize_menu_entity(addon["name"], is_addon=True)
            )


In [32]:
# -------------------------
# Post-normalization fixes
# -------------------------

def strip_trailing_junk(name: str) -> str:
    return re.sub(r"[)\-‚Äì:!.]+$", "", name).strip()


CANONICAL_OVERRIDES = {
    "Old Fashion Vanila Ice Cream": "Old Fashion Vanilla Ice Cream",
    "Eggles Cherry & Chocolate": "Eggless Cherry & Chocolate",
    "Alphanso Mango Ice Cream": "Alphonso Mango Ice Cream",
}


def enforce_ice_cream_suffix(name: str, item_type: str) -> str:
    if item_type == "ICE_CREAM" and not name.lower().endswith("ice cream"):
        return f"{name} Ice Cream"
    return name


IGNORE_PATTERNS = [
    r"delivery charge",
    r"porter",
    r"pidge",
    r"factory visit",
    r"employee",
    r"school kids",
    r"water bottle",
]


def is_ignored_item(name: str) -> bool:
    return any(re.search(p, name.lower()) for p in IGNORE_PATTERNS)


def post_process_canonical(name: str, item_type: str):
    name = strip_trailing_junk(name)
    name = CANONICAL_OVERRIDES.get(name, name)
    name = enforce_ice_cream_suffix(name, item_type)

    if is_ignored_item(name):
        return None

    return name
def normalize_menu_entity(raw_name: str, is_addon: bool):
    raw_name = normalize_text(raw_name)

    attrs = extract_attributes(raw_name)
    variant_code = extract_variant(raw_name)
    canonical_name = extract_canonical_name(raw_name)
    item_type = infer_item_type(canonical_name)

    canonical_name = post_process_canonical(canonical_name, item_type)
    if canonical_name is None:
        return None   # üëà ignored junk items

    return {
        "canonical_item_name": canonical_name,
        "variant_code": variant_code,
        "item_type": item_type,
        "is_addon": is_addon,
        "contains_alcohol": attrs["contains_alcohol"],
        "is_eggless": attrs["is_eggless"],
    }
normalized_rows = []

for order in records:
    for item in order["raw_event"]["raw_payload"]["properties"].get("OrderItem", []):
        row = normalize_menu_entity(item["name"], is_addon=False)
        if row:
            normalized_rows.append(row)

        for addon in item.get("addon", []):
            row = normalize_menu_entity(addon["name"], is_addon=True)
            if row:
                normalized_rows.append(row)
menu_item_df = build_menu_item_table(normalized_rows)

menu_item_df

Unnamed: 0,canonical_name,item_type,contains_alcohol,is_eggless,is_defunct
0,Assorted Cookie Duo + Choco Chip,COMBO,False,False,False
1,Classic Night & Day Duo,COMBO,False,False,False
2,Classic Night & Day Duo Ice Creams,COMBO,False,False,False
3,Curious Creations Duo,COMBO,False,False,False
4,Curious Creations Duo Ice Creams,COMBO,False,False,False
...,...,...,...,...,...
81,Rose Cardamom Ice Cream,ICE_CREAM,False,False,False
82,Strawberry Cream Cheese Ice Cream,ICE_CREAM,False,True,False
83,Sunshine Limone Ice Cream,ICE_CREAM,False,False,False
84,Tres Leches Ice Cream,ICE_CREAM,False,False,False


In [33]:
menu_item_df.to_dict(orient="records")

[{'canonical_name': 'Assorted Cookie Duo + Choco Chip',
  'item_type': 'COMBO',
  'contains_alcohol': False,
  'is_eggless': False,
  'is_defunct': False},
 {'canonical_name': 'Classic Night & Day Duo',
  'item_type': 'COMBO',
  'contains_alcohol': False,
  'is_eggless': False,
  'is_defunct': False},
 {'canonical_name': 'Classic Night & Day Duo Ice Creams',
  'item_type': 'COMBO',
  'contains_alcohol': False,
  'is_eggless': False,
  'is_defunct': False},
 {'canonical_name': 'Curious Creations Duo',
  'item_type': 'COMBO',
  'contains_alcohol': False,
  'is_eggless': False,
  'is_defunct': False},
 {'canonical_name': 'Curious Creations Duo Ice Creams',
  'item_type': 'COMBO',
  'contains_alcohol': False,
  'is_eggless': False,
  'is_defunct': False},
 {'canonical_name': 'Design Family Pack Of 3 Ice Creams',
  'item_type': 'COMBO',
  'contains_alcohol': False,
  'is_eggless': False,
  'is_defunct': False},
 {'canonical_name': 'Design Your Indulgence Duo',
  'item_type': 'COMBO',
  'con

In [35]:
CANONICAL_SYNONYMS = {
    # Fig & Orange
    "Fig Orange Ice Cream": "Fig & Orange Ice Cream",

    # Bean-to-bar normalization
    "Bean To Bar Dark Chocolate Ice Cream": "Bean-To-Bar Dark Chocolate Ice Cream",
    "Bean-To-Bar Chocolate Dark Ice Cream": "Bean-To-Bar Dark Chocolate Ice Cream",
    "Bean-To-Bar Chocolate 70% Dark Ice Cream": "Bean-To-Bar Dark Chocolate Ice Cream",

    # Chocolate & Orange
    "Chocolate & Orange With Alcohol Ice Cream": "Chocolate & Orange Ice Cream",
}
DESSERT_OVERRIDES = {
    "Boston Cream Piec": "Boston Cream Pie",
    "Boston Cream Pie Dessert": "Boston Cream Pie",
}
def post_process_canonical(name: str, item_type: str):
    name = strip_trailing_junk(name)
    name = CANONICAL_OVERRIDES.get(name, name)

    name = enforce_ice_cream_suffix(name, item_type)

    # Apply synonym collapsing
    name = CANONICAL_SYNONYMS.get(name, name)

    # Dessert cleanup
    if item_type == "DESSERT":
        name = DESSERT_OVERRIDES.get(name, name)

    if is_ignored_item(name):
        return None

    return name


In [50]:
import html
import re
from collections import defaultdict

CANONICAL_NAME_MAP = {

    "Affogato Ice Cream": [
        "Affogato",
        "Affogato Ice Cream",
    ],

    "Alphonso Mango Ice Cream": [
        "Alphanso Mango Ice Cream",
        "Alphonso Mango Ice Cream",
        "Alphonso Mango Ice Cream )",
    ],

    "Americano Ice Cream": [
        "Americano",
        "Americano Ice Cream",
    ],

    "Banoffee Ice Cream": [
        "Banoffee Ice Cream",
        "Banoffee Ice Cream )",
        "Eggless Banoffee Ice Cream",
        "Eggless Banoffee Ice Cream 200ml",
        "Eggless Banoffee Ice Cream Small Scoop",
    ],

    "Bean-To-Bar Dark Chocolate Ice Cream": [
        "Bean To Bar Dark Chocolate Ice Cream",
        "Bean-To-Bar Dark Chocolate Ice Cream",
        "Bean-To-Bar Chocolate Dark Ice Cream",
        "Bean-to-bar Chocolate Dark Ice Cream",
        "Bean-to-bar Dark Chocolate Ice Cream",
        "Bean-to-bar 70% Dark Chocolate Ice Cream",
        "Bean-To-Bar Chocolate 70% Dark Ice Cream",
        "Bean-To-Bar 70% Dark Chocolate Ice Cream",
    ],

    "Cappuccino Ice Cream": [
        "Cappuccino",
        "Cappuccino Ice Cream",
    ],

    "Cakes & Cookies Ice Cream": [
        "Cakes & Cookies Ice Cream",
        "Cakes & Cookies Ice Cream 200ml",
        "Cakes & Cookies Ice Cream Small Scoop",
        "Cakes & Cookies Ice Cream (Mini tub",
        "Cakes & Cookies Ice Cream (Regular Scoop",
        "Cakes & Cookies Ice Cream (Regular Tub",
        "Cakes &amp; Cookies Ice Cream Small Scoop",
        "Cakes & Cookies 200ml",
        "Cakes & Cookies"
    ],
    
    "Cherry & Chocolate Ice Cream": [
        "Cherry & Chocolate",
        "Cherry & Chocolate Ice Cream",
        "Cherry & Chocolate Ice Cream )",
        "Eggless Cherry & Chocolate Ice Cream Small Scoop",
        "Eggles Cherry & Chocolate",
    ],

    "Cherry & Chocolate Fudge Ice Cream": [
        "Cherry & Chocolate Fudge Ice Cream",
        "Cherry & Chocolate Fudge Ice Cream )",
    ],

    "Choco Chunk Cookie Ice Cream": [
        "Choco Chunk Cookie",
        "Choco Chunk Cookie Ice Cream",
    ],

    "Chocolate Ice Cream": [
        "Chocolate Ice Cream",
        "Chocolate Ice Cream )",
        "Eggless Chocolate Ice Cream",
    ],

    "Chocolate Overload Ice Cream": [
        "Chocolate Overload",
        "Chocolate Overload )",
        "Chocolate Overload Ice Cream",
        "Chocolate Overload Ice Cream )",
        "Eggless Chocolate Overload",
    ],

    "Chocolate & Orange Ice Cream": [
        "Chocolate & Orange",
        "Chocolate & Orange )",
        "Chocolate & Orange Ice Cream )",
        "Chocolate & Orange With Alcohol",
        "Chocolate & Orange With Alcohol Ice Cream",
    ],

    "Coconut & Pineapple Ice Cream": [
        "Coconut & Pineapple",
        "Coconut & Pineapple Ice Cream",
        "Coconut & Pineapple Ice Cream )",
        "Eggless Coconut & Pineapple",
    ],

    "Coffee Mascarpone Ice Cream": [
        "Coffee Mascarpone",
        "Coffee Mascarpone Ice Cream",
        "Coffee Mascarpone Ice Cream )",
        "Eggless Coffee Mascarpone Ice Cream",
    ],

    "Dates & Chocolate Ice Cream": [
        "Dates & Chocolate",
        "Dates & Chocolate )",
        "Dates & Chocolate Ice Cream",
        "Dates & Chocolate 200ml",
    ],

    "Dates Rose & Nuts Ice Cream": [
        "Dates Rose & Nuts Ice Cream",
        "Dates Rose & Nuts Ice Cream )",
    ],

    "Dates With Fig & Orange Ice Cream": [
        "Dates With Fig & Orange",
        "Dates With Fig & Orange )",
        "Dates With Fig & Orange Ice Cream",
    ],

    "Fig & Orange Ice Cream": [
        "Fig & Orange",
        "Fig & Orange Ice Cream",
        "Fig & Orange Ice Cream )",
        "Fig Orange Ice Cream",
        "Fig Orange Ice Cream )",
    ],

    "Go Bananas Ice Cream": [
        "Go Bananas Ice Cream",
        "Go Bananas Ice Cream )",
    ],

    "Just Chocolate Ice Cream": [
        "Just Chocolate",
        "Just Chocolate Ice Cream",
        "Just Chocolate Ice Cream )",
    ],

    "Kulfi Inspired Ice Cream": [
        "Kulfi Inspired Ice Cream",
    ],

    "Masala Chai Ice Cream": [
        "Masala Chai Ice Cream",
        "Masala Chai Ice Cream )",
    ],

    "Milk Chocolate Ice Cream": [
        "Milk Chocolate",
        "Milk Chocolate Ice Cream",
    ],

    "Monkey Business Ice Cream": [
        "Monkey Business Ice Cream",
        "Monkey Business Ice Cream )",
    ],

    "Old Fashion Vanilla Ice Cream": [
        "Old Fashion Vanila Ice Cream",
        "Old Fashion Vanila Ice Cream )",
        "Old Fashion Vanilla",
        "Old Fashion Vanilla Ice Cream",
        "Old Fashion Vanilla Ice Cream )",
    ],

    "Orange & Biscuits Ice Cream": [
        "Orange & Biscuits",
        "Orange & Biscuits Ice Cream",
    ],

    "Orange Ice Cream": [
        "Orange Ice Cream",
        "Orange Ice Cream )",
    ],

    "Paan & Gulkand Ice Cream": [
        "Paan & Gulkand Ice Cream",
        "Paan & Gulkand Ice Cream )",
        "Egg Based Paan & Gulkand Ice Cream",
    ],

    "Pistachio Ice Cream": [
        "Pistachio Ice Cream",
        "Pistachio Ice Cream )",
    ],

    "Rose Cardamom Ice Cream": [
        "Rose Cardamom Ice Cream",
        "Rose Cardamom Ice Cream )",
    ],

    "Strawberry Cream Cheese Ice Cream": [
        "Strawberry Cream Cheese",
        "Strawberry Cream Cheese Ice Cream",
        "Strawberry Cream Cheese Ice Cream )",
        "Egg Strawberry Cream Cheese Ice Cream",
    ],

    "Sunshine Limone Ice Cream": [
        "Sunshine Limone Ice Cream",
        "Sunshine Limone Ice Cream )",
    ],

    "Triple Chocolate Ice Cream": [
        "Triple Chocolate Ice Cream",
        "Triple Chocolate Ice Cream )",
    ],
}


DESSERT_CANONICAL_MAP = {

    "Boston Cream Pie": [
        "Boston Cream Pie",
        "Boston Cream Pie Dessert",
        "Boston Cream Piec",
    ],

    "Classic Brownie & Ice Cream With Fudge Sauce": [
        "Classic Brownie & Ice Cream With Fudge Sauce",
        "Classic Brownie & Vanilla Ice Cream With Fudge Sauce",
        "Brownie & Vanilla Ice Cream With Fudge Sauce. It's Classic",
    ],

    "Fudgy Chocolate Brownie": [
        "Fudgy Chocolate Brownie",
        "Fudgy Chocolate Brownie - Round Shape",
    ],

    "Ice Cream Cake": [
        "Ice Cream Cake",
        "Ice Cream Cake 1Kg",
        "Customised Ice Cream Cake",
    ],

    "Tres Leches Ice Cream": [
        "Tres Leches",
        "Tres Leches Ice Cream",
    ],

    "Nona‚Äôs Traditional Plum Cake": [
        "D & N Traditional Plum Cake",
        "D&n Traditional Plum Cake",
        "Nona‚ÄôS Traditional Plum Cake",
    ],
    "Brownie Cheesecake": [
        "Brownie Cheesecake",
    ],

    "Classic Chocolate Lamington": [
        "Classic Chocolate Lamington",
        "Classic Chocolate Lamington 1pcs",
        "Classic Chocolate Lamington 2pcs",
    ],

    "Classic Tiramisu": [
        "Classic Tiramisu",
    ],

    "New York Baked Cheesecake": [
        "New York Baked Cheesecake Eggless",
        "New York Baked Cheesecake",
    ],

    "Orange & Chocolate Cheesecake": [
        "Orange & Chocolate Cheesecake",
    ],
}


EXTRAS_CANONICAL_MAP = {

    "Butter Waffle Cone": [
        "Butter Waffle Cone",
        "Butter Waffle Cones",
    ],

    "Cup": [
        "Cup",
        "Takeaway Cup",
    ],

    "Waffle Cone": [
        "Waffle Cone",
    ],
}

COMBO_CANONICAL_MAP = {

    "Design Family Pack Of 3 Ice Creams": [
        "Design Family Pack Of 3 Ice Creams",
        "Design Family Pack Of 3 Ice Creams 200+200+200 Ml",
    ],

    "Design Your Indulgence Duo Ice Creams": [
        "Design Your Indulgence Duo Ice Creams",
        "Design Your Indulgence Duo Ice Creams 200ml+200ml",
        "Eggless Design Your Indulgence Duo Ice Creams",
    ],

    "Half In Half Regular Scoop Combo": [
        "Half In Half Regular Scoop Combo",
    ],
}


# ---- 1. Build reverse lookup: raw_name -> canonical_name ----

def build_reverse_map(*canonical_maps):
    reverse = {}
    for cmap in canonical_maps:
        for canonical, raw_list in cmap.items():
            for raw in raw_list:
                reverse[normalize_raw_name(raw)] = canonical
    return reverse


# ---- 2. Safe normalization (mirrors your Stage 3 cleanup only) ----

def normalize_raw_name(name: str) -> str:
    if not name:
        return ""

    name = html.unescape(name)
    name = name.strip()

    # Remove everything from first "(" onwards
    name = re.sub(r"\(.*$", "", name).strip()

    # Remove known variant words
    name = re.sub(
        r"\b(small|junior|regular|family|mini)\b.*",
        "",
        name,
        flags=re.IGNORECASE,
    )

    # Remove units
    name = re.sub(
        r"\b\d+\s*(gm|gms|ml|pcs|pc)\b",
        "",
        name,
        flags=re.IGNORECASE,
    )

    # Remove attributes
    name = re.sub(
        r"\b(eggless|contains alcohol|with alcohol)\b",
        "",
        name,
        flags=re.IGNORECASE,
    )

    # Collapse duplicate Ice Cream
    name = re.sub(
        r"\bIce Cream\s+Ice Cream\b",
        "Ice Cream",
        name,
        flags=re.IGNORECASE,
    )

    # Enforce Ice Cream suffix if applicable
    if (
        "Ice Cream" not in name
        and any(k in name.lower() for k in ["chocolate", "vanilla", "banana", "coffee", "pistachio", "mango"])
    ):
        name = f"{name} Ice Cream"

    # Final whitespace cleanup
    name = re.sub(r"\s+", " ", name)

    return name.strip()



# ---- 3. Coverage audit ----

def coverage_audit(pairs, reverse_map):
    covered = []
    uncovered = []

    for base_name, _, _, is_addon in pairs:
        normalized = normalize_raw_name(base_name)

        if normalized in reverse_map:
            covered.append((base_name, normalized, reverse_map[normalized], is_addon))
        else:
            uncovered.append((base_name, normalized, is_addon))

    return covered, uncovered


# ---- 4. Run audit ----

ALL_CANONICAL_MAPS = (
    CANONICAL_NAME_MAP,
    DESSERT_CANONICAL_MAP,
    EXTRAS_CANONICAL_MAP,
    COMBO_CANONICAL_MAP,
)

reverse_map = build_reverse_map(*ALL_CANONICAL_MAPS)

covered, uncovered = coverage_audit(pairs, reverse_map)

print(f"Total unique raw items: {len(pairs)}")
print(f"Covered: {len(covered)}")
print(f"Uncovered: {len(uncovered)}")
print(f"Coverage %: {len(covered) / len(pairs) * 100:.2f}%")


Total unique raw items: 161
Covered: 161
Uncovered: 0
Coverage %: 100.00%


In [51]:
from pprint import pprint

print("\nUncovered items:")
pprint(sorted(uncovered, key=lambda x: x[0]))



Uncovered items:
[]


In [52]:
addon_uncovered = [u for u in uncovered if u[2]]
base_uncovered = [u for u in uncovered if not u[2]]

print(f"Uncovered base items: {len(base_uncovered)}")
print(f"Uncovered addon items: {len(addon_uncovered)}")


Uncovered base items: 0
Uncovered addon items: 0
