In [11]:
import json
import re
from pathlib import Path
from typing import List, Dict, Set, Tuple
import pandas as pd


In [12]:
def load_orders_from_jsons(json_paths: List[str]) -> List[Dict]:
    orders = []
    for path in json_paths:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
            if isinstance(data, list):
                orders.extend(data)
            else:
                orders.append(data)
    return orders


In [13]:
def extract_raw_menu_entries_from_orders(orders: List[Dict]) -> List[Dict]:
    extracted = []

    for order in orders:
        raw_event = order.get("raw_event", {})
        raw_payload = raw_event.get("raw_payload", {})
        properties = raw_payload.get("properties", {})

        # Order items
        items = properties.get("OrderItem", [])
        for item in items:
            extracted.append({
                "raw_name": item.get("name", "").strip(),
                "description": None,
                "weight": None
            })

            # Addons
            addons = item.get("addon", [])
            for addon in addons:
                extracted.append({
                    "raw_name": addon.get("name", "").strip(),
                    "description": None,
                    "weight": None
                })

    return extracted


In [53]:
def fix_html_entities(text):
    """Fix HTML entities like &amp; -> &"""
    return text.replace('&amp;', '&')

def fix_typos(name):
    """Fix common typos in item names"""
    # Fix "Eggles" -> "Eggless"
    name = re.sub(r'\bEggles\b', 'Eggless', name)
    
    # Fix Boston Cream Piec -> Boston Cream Pie
    name = re.sub(r'\bBoston Cream Piec\b', 'Boston Cream Pie', name)
    
    # Fix Bean-to-bar capitalization variations
    name = re.sub(r'\bBean[- ]to[- ]bar\b', 'Bean-to-Bar', name, flags=re.IGNORECASE)
    name = re.sub(r'\bBean To Bar\b', 'Bean-to-Bar', name, flags=re.IGNORECASE)
    
    # Fix "Chocolate Dark" -> "Dark Chocolate" (word order)
    name = re.sub(r'\bChocolate Dark\b', 'Dark Chocolate', name)
    
    # Fix double "Ice Cream Ice Cream"
    name = re.sub(r'\bIce Cream Ice Cream\b', 'Ice Cream', name)
    
    # Fix D&n -> D&N (D&N Traditional Plum Cake)
    name = re.sub(r'\bD&n\b', 'D&N', name)
    
    # Fix "Fig Orange" -> "Fig & Orange" (missing ampersand)
    name = re.sub(r'\bFig Orange\b', 'Fig & Orange', name)
    
    # Fix "Cherry & Chocolate" without "Fudge" -> add "Fudge" to match full name
    # Based on data, the full ice cream name is "Cherry & Chocolate Fudge Ice Cream"
    # This applies to both regular and Eggless versions
    if 'Cherry & Chocolate' in name and 'Fudge' not in name:
        name = name.replace('Cherry & Chocolate', 'Cherry & Chocolate Fudge')
    
    # Fix "Chocolate & Orange With Alcohol" -> standardize naming
    if 'Chocolate & Orange With Alcohol' in name:
        name = name.replace('Chocolate & Orange With Alcohol', 'Chocolate & Orange (Contains Alcohol) Ice Cream')
        # Remove duplicate "Ice Cream" if present
        name = name.replace('Ice Cream Ice Cream', 'Ice Cream')
    
    # Remove trailing parenthesis if incomplete (like "Mini Tub" without closing)
    name = re.sub(r'\s*\([^)]*$', '', name)
    
    return name.strip()
    

In [54]:
def normalize_name(raw_name, description, weight):
    """Extract and normalize the clean item name"""
    name = fix_html_entities(raw_name)
    name = fix_typos(name)
    
    # Remove variant info from name (things in parentheses at end)
    variant_patterns = [
        r'\s*\(Family Tub\)?',
        r'\s*\(Junior Scoop\)?',
        r'\s*\(Mini [Tt]ub\)?',
        r'\s*\(Regular Scoop\)?',
        r'\s*\(Regular Tub\)?',
        r'\s*\(Perfect Plenty[^)]*\)?',
        r'\s*\(160gm[s]?\)',  # Handle (160gm) or (160gms)
        r'\s*\(2\s*pc[s]?\)',  # Handle (2pcs)
        r'\s*\(1\s*pc[s]?\)',  # Handle (1pcs)
        r'\s*\(eggless\)',  # Handle (eggless) suffix
    ]
    for pattern in variant_patterns:
        name = re.sub(pattern, '', name, flags=re.IGNORECASE)
    
    # Remove size suffixes from name
    name = re.sub(r'\s+200ml$', '', name, flags=re.IGNORECASE)
    name = re.sub(r'\s+300ml$', '', name, flags=re.IGNORECASE)
    name = re.sub(r'\s+160gm$', '', name, flags=re.IGNORECASE)
    name = re.sub(r'\s+Small Scoop$', '', name, flags=re.IGNORECASE)
    
    # Handle "Chocolate & Orange" with alcohol
    if 'Chocolate & Orange' in name and description and 'alcohol' in description.lower():
        name = 'Chocolate & Orange (Contains Alcohol) Ice Cream'
    elif 'Chocolate & Orange With Alcohol' in name:
        name = 'Chocolate & Orange (Contains Alcohol) Ice Cream'
    
    # Add "Ice Cream" suffix for ice cream items that are missing it
    ice_cream_items_without_suffix = [
        'Cakes & Cookies',
        'Cherry & Chocolate Fudge',  # Note: "Cherry & Chocolate" gets converted to this in fix_typos
        'Chocolate Overload',
        'Coconut & Pineapple',
        'Coffee Mascarpone',
        'Dates & Chocolate',
        'Dates With Fig & Orange',
        'Fig & Orange',
        'Just Chocolate',
        'Old Fashion Vanilla',
        'Eggless Cherry & Chocolate Fudge',  # Note: converted from "Eggless Cherry & Chocolate"
        'Eggless Chocolate',
        'Eggless Chocolate Overload',
        'Eggless Coconut & Pineapple',
        'Eggless Coffee Mascarpone',
        'Eggless Fig & Orange',
        'Eggless Just Chocolate',
        'Eggless Milk Chocolate',
        'Eggless Paan & Gulkand',
        'Eggless Strawberry Cream Cheese',
    ]
    
    for item in ice_cream_items_without_suffix:
        if name == item or name.startswith(item + ' '):
            if 'Ice Cream' not in name:
                name = item + ' Ice Cream'
            break
    
    # Ensure "Cherry & Chocolate Fudge" stays as is (it's not "Cherry & Chocolate")
    # Already handled by exact match above
    
    return name.strip()

In [16]:
# Extras
EXTRAS = [
    'Cup',
    'Takeaway Cup',
    'Waffle Cone',
    'Butter Waffle Cone',
    'Butter Waffle Cones',
]

# Desserts
DESSERTS = [
    'Boston Cream Pie',
    'Boston Cream Pie Dessert',
    'Brownie Cheesecake',
    'Classic Brownie & Ice Cream With Fudge Sauce',
    'Classic Chocolate Lamington',
    'Classic Tiramisu',
    'D&N Traditional Plum Cake',
    'Fudgy Chocolate Brownie',
    'New York Baked Cheesecake Eggless',
    'Orange & Chocolate Cheesecake',
]

# Combos
COMBOS = [
    'Design Family Pack Of 3 Ice Creams',
    'Design Your Indulgence Duo Ice Creams',
    'Eggless Design Your Indulgence Duo Ice Creams',
    'Half In Half Regular Scoop Combo',
]

def determine_type(name, raw_name):
    """Determine the type of item: Ice Cream, Dessert, Drinks, Extra, Combo"""
    
    # Check extras
    for extra in EXTRAS:
        if extra.lower() in name.lower() or extra.lower() in raw_name.lower():
            return 'Extra'
    
    # Check desserts
    for dessert in DESSERTS:
        if dessert.lower() in name.lower() or dessert.lower() in raw_name.lower():
            return 'Dessert'
    
    # Check combos
    for combo in COMBOS:
        if combo.lower() in name.lower() or combo.lower() in raw_name.lower():
            return 'Combo'
    
    # Default to Ice Cream for most items
    return 'Ice Cream'


In [56]:
def determine_variant(raw_name, description, weight):
    """Determine the variant based on name, description, and weight"""
    
    raw_lower = raw_name.lower()
    desc_lower = (description or '').lower()
    
    # Check for explicit variant in name
    if '(family tub' in raw_lower or 'family tub' in desc_lower:
        return 'FAMILY_TUB_500GMS'
    
    if '(junior scoop' in raw_lower or desc_lower == 'junior scoop':
        return 'JUNIOR_SCOOP_60GMS'
    
    if '(mini tub' in raw_lower or 'mini tub' in desc_lower:
        return 'MINI_TUB_160GMS'
    
    if '(regular scoop' in raw_lower or desc_lower == 'regular scoop':
        return 'REGULAR_SCOOP_120GMS'
    
    if '(regular tub' in raw_lower or 'regular tub' in desc_lower:
        if '300ml' in desc_lower or '300ml' in raw_lower:
            return 'REGULAR_TUB_300ML'
        return 'REGULAR_TUB_220GMS'
    
    if 'small scoop' in raw_lower:
        return 'JUNIOR_SCOOP_60GMS'
    
    # Check for (160gm) pattern in name (common in addon data)
    if '(160gm' in raw_lower:
        return 'MINI_TUB_160GMS'
    
    # Check for piece counts in name (for extras like Butter Waffle Cone)
    if '(2pc' in raw_lower or '(2 pc' in raw_lower:
        return '2_PIECES'
    if '(1pc' in raw_lower or '(1 pc' in raw_lower:
        return '1_PIECE'
    
    # Check for size in description
    if description:
        if '200ml+200ml' in description.lower() or '200+200+200' in description.lower():
            # Combo variant
            if '200+200+200' in description:
                return 'FAMILY_PACK_3X200ML'
            return 'DUO_200ML_200ML'
        
        if '300ml' in description.lower():
            return 'REGULAR_TUB_300ML'
        
        if '200ml' in description.lower():
            return 'MINI_TUB_200ML'
    
    # Check for 200ml in name
    if '200ml' in raw_lower:
        return 'MINI_TUB_200ML'
    
    if '300ml' in raw_lower:
        return 'REGULAR_TUB_300ML'
    
    # Check weight-based inference for standalone entries
    if weight == 500:
        return 'FAMILY_TUB_500GMS'
    elif weight == 220:
        return 'REGULAR_TUB_220GMS'
    elif weight == 160:
        return 'MINI_TUB_160GMS'
    elif weight == 120:
        return 'REGULAR_SCOOP_120GMS'
    elif weight == 60:
        return 'JUNIOR_SCOOP_60GMS'
    
    # Check description for piece counts (desserts/extras)
    if description:
        if '2pc' in desc_lower or '2 pc' in desc_lower:
            return '2_PIECES'
        if '1pc' in desc_lower or '1 pc' in desc_lower:
            return '1_PIECE'
    
    # Default to 1_PIECE for desserts/extras with no other info
    return '1_PIECE'

def handle_special_cases(name, item_type, variant, raw_name, description, weight):
    """Handle special edge cases"""
    
    # Boston Cream Pie - all should be "Boston Cream Pie"
    if 'boston cream pie' in name.lower():
        name = 'Boston Cream Pie'
        item_type = 'Dessert'
    
    # Half In Half Combo
    if 'half in half' in name.lower():
        variant = 'HALF_IN_HALF_REGULAR_SCOOP'
    
    # Butter Waffle Cone/Cones
    if 'butter waffle' in name.lower():
        name = 'Butter Waffle Cone'
        item_type = 'Extra'
    
    # Waffle Cone standalone
    if name.lower() == 'waffle cone':
        item_type = 'Extra'
        variant = '1_PIECE'
    
    # Cup / Takeaway Cup
    if name.lower() in ['cup', 'takeaway cup']:
        item_type = 'Extra'
        variant = '1_PIECE'
    
    return name, item_type, variant


In [31]:
def build_clean_menu_from_orders(raw_entries: List[Dict]) -> pd.DataFrame:
    records = []

    for entry in raw_entries:
        raw_name = entry["raw_name"]
        if not raw_name:
            continue

        clean_name = normalize_name(raw_name, entry.get("description"), entry.get("weight"))
        item_type = determine_type(clean_name, raw_name)
        variant = determine_variant(raw_name, entry.get("description"), entry.get("weight"))
        name, item_type, variant = handle_special_cases(clean_name, item_type, variant, raw_name, description, weight)
        records.append({
            "name": clean_name,
            "type": item_type,
            "variant": variant,
            "source": "order_json"
        })

    df = pd.DataFrame(records)
    df.drop_duplicates(subset=["name", "type", "variant"], inplace=True)
    return df.sort_values(["name", "type", "variant"])


In [28]:
import requests
import json
import html
import re
from typing import Optional, List, Dict, Any

BASE_URL = "https://webhooks.db1-prod-dachnona.store/analytics"
API_KEY = "f3e1753aa4c44159fa7218a31cd8db1e"

HEADERS = {
    "X-API-Key": API_KEY,
}

def _safe_json_load(value):
    if value is None:
        return None
    if isinstance(value, (dict, list)):
        return value
    try:
        return json.loads(value)
    except (TypeError, ValueError):
        return value

def fetch_stream_raw(
    endpoint: str,
    limit: int = 500,
    start_cursor: Optional[int] = 0,
) -> List[Dict[str, Any]]:
    results = []
    last_stream_id = start_cursor or 0

    while True:
        params = {
            "limit": limit,
            "cursor": last_stream_id,
        }

        resp = requests.get(
            f"{BASE_URL}/{endpoint}/",
            headers=HEADERS,
            params=params,
            timeout=60,
        )
        resp.raise_for_status()

        payload = resp.json()
        batch = payload.get("data", [])

        if not batch:
            break

        results.extend(batch)
        last_stream_id = batch[-1]["stream_id"]

        if len(batch) < limit:
            break

    return results

records = fetch_stream_raw("orders")

In [29]:
menu_entries = extract_raw_menu_entries_from_orders(records)

In [32]:
df = build_clean_menu_from_orders(menu_entries)

In [52]:
df[df['type'] == 'Ice Cream']['name'].unique()

array(['400 Pidge/porter Delivery Charges',
       '500 Pidge/porter Delivery Charges', 'Affogato',
       'Alphonso Mango Ice Cream (Regular)', 'Alphonso Mango Ice Cream)',
       'Americano', 'Assorted Cookie Duo ( Dark Chocolate + Choco Chip )',
       'Banoffee Ice Cream', 'Banoffee Ice Cream (120gm))',
       'Banoffee Ice Cream (220gms))', 'Banoffee Ice Cream (500gms))',
       'Banoffee Ice Cream (60gm))', 'Banoffee Ice Cream)',
       'Bean-to-Bar 70% Dark Chocolate Ice Cream',
       'Bean-to-Bar 70% Dark Chocolate Ice Cream (Family Feast (725ml))',
       'Bean-to-Bar 70% Dark Chocolate Ice Cream)',
       'Bean-to-Bar Chocolate 70% Dark Ice Cream',
       'Bean-to-Bar Chocolate 70% Dark Ice Cream (200ml))',
       'Bean-to-Bar Chocolate 70% Dark Ice Cream)',
       'Bean-to-Bar Dark Chocolate Ice Cream',
       'Bean-to-Bar Dark Chocolate Ice Cream (120gm))',
       'Bean-to-Bar Dark Chocolate Ice Cream (200ml))',
       'Bean-to-Bar Dark Chocolate Ice Cream (220gms))',
    

In [50]:
df['type'].unique()

array(['Ice Cream', 'Dessert', 'Extra', 'Combo'], dtype=object)