# Data Pre processing


### **Data Preparation**
<p>Populating Necessary Data</p>

##### **Extracting NER Ingredients From Original Dataset**

In [None]:
#Open Excel File
import openpyxl
import os

# Load the workbook
workbook = openpyxl.load_workbook('dataset.xlsx')

# Select the active worksheet
worksheet = workbook["Sheet1"]

In [None]:
# Import pandas for data manipulation
import pandas as pd
import numpy as np

data = []
headers = []

# Get headers from the first row
for col in range(1, worksheet.max_column + 1):
    headers.append(worksheet.cell(row=1, column=col).value)

# Get data from remaining rows
for row in range(2, worksheet.max_row + 1):
    row_data = []
    for col in range(1, worksheet.max_column + 1):
        row_data.append(worksheet.cell(row=row, column=col).value)
    data.append(row_data)

# Create DataFrame
df = pd.DataFrame(data, columns=headers)

print(headers)

# Display first few rows to verify
print(f"Dataset shape: {df.shape}")
df.head()


In [None]:
# Display all column names to verify
print("Column names:", df.columns.tolist())

# Drop None value column if it exists
none_columns = [col for col in df.columns if col is None]
if none_columns:
    df = df.drop(columns=none_columns)
    print(f"Dropped {len(none_columns)} None column(s)")

# Display all column names to verify
print("Column post-clean:", df.columns.tolist())
print(f"Dataset shape: {df.shape}")


In [None]:
#renamed columns
df = df.rename(columns={
    'Food Name' : 'food_name',
    'Ingredients' : 'ingredient',
    'Instructions' : 'instructions',
    'Min Age Group ': 'min_age_group',
    'Max Age Group ': 'max_age_group',
    'NER Ingredient': 'ner_ingredient',
    'Texture': 'texture',
    'Prep Time': 'prep_time',
    'Cook Time': 'cook_time',
    'Serving': 'serving',
    'Difficulty': 'difficulty',
    'Origin': 'origin',
    'Region': 'region',
    'Description': 'description',
    'Image Link ': 'image_link',
    'Link ': 'recipe_link',
    'Credibility ': 'credibility',
    'Meal Type': 'meal_type',
    'Flavor_type': 'flavor_type',
    'Dietary Tags': 'dietary_tags',
    'Choking Hazards': 'choking_hazards',
    'Nutrion Value': 'nutrition_value',
    'tips': 'tips',
    'Allergen': 'allergen',
    'Hypoallergenic': 'hypoallergenic',
})

print("Column name post-clean:", df.columns.tolist())
df.head()

<p>Checking Any Null Value</p>

In [None]:
# check every column for null values
for col in df.columns:
    null_count = df[col].isnull().sum()
    if null_count > 0:
        print(f"Column '{col}' has {null_count} null values.")
    else:
        print(f"Column '{col}' has no null values.")


In [None]:
#drop data if imporant columns are empty
important_columns = ['food_name', 'ingredient', 'instructions', 'recipe_link']
for col in df.columns:
    if col in important_columns:
        null_count = df[col].isnull().sum()        
        if null_count >0:
            df = df.dropna(subset=[col])

# After dropping rows, you may want to reset the index if needed
df = df.reset_index(drop=True)
# Display the cleaned DataFrame
print("Cleaned DataFrame:")
print(df.shape)
df.head(10)

In [None]:
df['dietary_tags'] = ''
df['choking_hazards'] = ''
df['difficulty'] = ''
df['allergen'] = ''
df['hypoallergenic'] = ''

In [None]:
df_real = df.copy()
df.shape

<p>Formatting Ingredient and Instruction</p>

In [None]:
import re

def is_already_formatted(text):
    return '\n' in text or '\\n' in text

In [None]:
# --- Normalize Ingredients ---
def normalize_ingredients(text):
    if is_already_formatted(text):
        return text.replace('\n', '\\n') if '\n' in text else text

    lines = []
    text = re.sub(r'\s+', ' ', text).strip()

    # Try splitting by common delimiters
    if ' - ' in text:
        parts = text.split(' - ')
    elif '•' in text:
        parts = [p.strip() for p in text.split('•') if p.strip()]
    elif ',' in text:
        parts = [p.strip() for p in text.split(',') if p.strip()]
    elif re.search(r'\d+\. ', text):
        parts = re.split(r'\d+\. ', text)
    else:
        parts = [text]

    for part in parts:
        part = re.sub(r'^\d+\. ?', '', part).strip()
        if part:
            lines.append('- ' + part)

    return '\\n'.join(lines)



In [None]:
# --- Normalize Instructions ---
def normalize_instructions(text):
    if is_already_formatted(text):
        return text.replace('\n', '\\n') if '\n' in text else text

    lines = []
    text = re.sub(r'\r\n|\r', '\n', text).strip()

    # Match numbered or step-based patterns
    step_pattern = r'(?:Step\s*\d+:\s*|\d+\.\s*)([^:.]+?)(?=\s*(?:Step\s*\d+:\s*|\d+\.\s*|$))'
    matches = re.findall(step_pattern, text, re.IGNORECASE)

    if matches:
        parts = [m.strip() for m in matches if m.strip()]
    else:
        # Split by sentences
        parts = re.split(r'(?<=[.!?])\s+', text)

    for i, part in enumerate(parts):
        part = re.sub(r'\s+', ' ', part).strip()
        if part:
            lines.append(f"{i+1}. {part}")

    return '\\n'.join(lines)

In [None]:
# Normalize the 'ingredient' and 'instructions' columns
df['ingredient'] = df['ingredient'].apply(normalize_ingredients)
df['instructions'] = df['instructions'].apply(normalize_instructions)

df_real[['ingredient','instructions']].head()
print("Normalized DataFrame:")
df[['ingredient','instructions']].head()

<p>Extract Ingredient Name</p>


In [None]:
import re
import spacy
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords as nltk_stopwords

# Load spaCy model and lemmatizer
nlp = spacy.load("en_core_web_sm")
lemmatizer = WordNetLemmatizer()


In [None]:
custom_stopwords = {
    # Common general stopwords
    "a", "an", "the", "of", "to", "and", "in", "on", "for", "with", "by",
    "as", "at", "be", "is", "was", "are", "were", "it", "this", "that",

    # Recipe-specific words
    "finely", "stalk", "optional", "pinch", "dash", "sprinkle", "to taste", "fresh", "ripe",
    "whole", "cut", "slice", "diced", "chopped", "grated", "crushed",
    "halved", "quartered", "peeled", "mashed", "blended", "cooked",
    "steamed", "boiled", "baked", "roasted", "toasted", "minced", "pureed",
    "thinly", "soft", "ripe", "unsalted", "lowfat", "low-fat", "plain",
    "unsweetened", "organic", "natural", "canned", "frozen", "freshly",
    "ground", "powder", "sized", "pieces", "piece", "part", "parts",
    "measure", "measured", "add", "mix", "combine", "stir", "fold",
    "whisk", "blend", "mash", "chop", "dice", "grate", "steam", "boil",
    "bake", "roast", "toast", "mince", "puree", "soak", "rinse", "drain",
    "preheat", "oven", "medium", "high", "low", "heat", "temperature",
    "minutes", "minute", "hours", "hour", "time", "until", "softened",
    "cooled", "warm", "room temperature", "raw", "uncooked", "cooked",
    "leftover", "any kind", "preferably", "fresh", "store-bought",
    "homemade", "prepared", "ready-to-use", "instant", "cubed", "thin slices",
    "small", "medium", "large", "few", "bit", "bits", "drop", "drops"
}

# Combine with NLTK's English stopwords
STOPWORDS = set(nltk_stopwords.words('english')).union(custom_stopwords)

In [None]:
import nltk


def extract_clean_ingredient(line):

    nltk.download('wordnet')
    
    """
    Extracts and cleans a single ingredient line from a baby recipe.

    Args:
        line (str): Raw ingredient line like "- 1 ripe banana"

    Returns:
        str: Cleaned ingredient name like "banana"
    """

    # Step 1: Remove numbers and fractions
    line = re.sub(r'\d+\/?\d*', '', line)

    # Step 2: Remove units of measurement (expanded list)
    line = re.sub(
        r'\b(g|gm|gms|gram|grams|kg|kgs|kilogram|kilograms|'
        r'mg|mgs|milligram|milligrams|'
        r'ml|mls|milliliter|milliliters|l|ls|liter|liters|'
        r'tsp|tsps|teaspoon|teaspoons|tbsp|tb|tbsps|tablespoon|tablespoons|'
        r'cup|cups|c|cs|'
        r'oz|ounce|ounces|'
        r'lb|lbs|pound|pounds|'
        r'metric teaspoon|metric tablespoons|'
        r'pinch|drops|handful|sprinkle)\b',
        '',
        line,
        flags=re.IGNORECASE
    )

    # Step 3: Strip symbols and normalize spaces
    line = re.sub(r'[\-\*\(\),]', '', line).strip()
    line = re.sub(r'\s+', ' ', line).lower().strip()

    # Step 4: Use spaCy to process the text
    doc = nlp(line)

    # Step 5: Tokenize, remove verbs and stopwords, lemmatize
    tokens = []
    for token in doc:
        lemma = lemmatizer.lemmatize(token.text, 'n')  # Lemmatize as noun
        if (
            token.pos_ != "VERB" and
            lemma.lower() not in STOPWORDS and
            token.is_alpha
        ):
            tokens.append(lemma.lower())

    return " ".join(tokens).strip()

In [None]:
def extract_ingredients(text):
    """
    Converts escaped newlines (\\n) to real ones, then extracts ingredients.
    Returns: List of cleaned ingredient names
    """
    try:
        text = text.encode().decode('unicode_escape')
    except Exception:
        pass

    ingredients = []
    for line in text.split('\n'):
        stripped = line.strip()
        if stripped.startswith('-'):
            ingredient = stripped[1:].strip()
            if ingredient:
                ingredients.append(ingredient)

    return ingredients

In [None]:
def process_ingredients(text):
    raw = extract_ingredients(text)
    cleaned = [extract_clean_ingredient(ing) for ing in raw]
    return cleaned

# Apply function to create new column
df['ner_ingredient'] = df['ingredient'].apply(process_ingredients)

In [None]:
df[['food_name', 'ingredient', 'cleaned_check', 'ner_ingredient']].head(10)

In [None]:
def convert_escaped_newlines(df):
    """
    Convert escaped newlines (\\n) to real newlines for Excel display
    """
    text_columns = ['ingredient', 'instructions', 'tips']
    
    for col in text_columns:
        if col in df.columns:
            # Replace escaped newlines with actual newlines
            df[col] = df[col].apply(
                lambda x: x.replace('\\n', '\n') if isinstance(x, str) else x
            )
    
    return df

# Apply the conversion right before saving
df = convert_escaped_newlines(df)

# Then save
df.to_excel("1st_dataset.xlsx", index=False)
print("✅ DataFrame saved to '1st_dataset.xlsx'")

##### **New Dataset After Checking NER INgredients**

In [2]:
#Open Excel File
import openpyxl
import os

# Load the workbook
workbook = openpyxl.load_workbook('1st_dataset.xlsx')

# Select the active worksheet
worksheet = workbook["Sheet1"]
# Import pandas for data manipulation
import pandas as pd
import numpy as np

data = []
headers = []

# Get headers from the first row
for col in range(1, worksheet.max_column + 1):
    headers.append(worksheet.cell(row=1, column=col).value)

# Get data from remaining rows
for row in range(2, worksheet.max_row + 1):
    row_data = []
    for col in range(1, worksheet.max_column + 1):
        row_data.append(worksheet.cell(row=row, column=col).value)
    data.append(row_data)

# Create DataFrame
df = pd.DataFrame(data, columns=headers)

print(headers)

# Display first few rows to verify
print(f"Dataset shape: {df.shape}")
df.head()


['name', 'ingredients', 'ner_ingredient', 'instructions', 'min_age', 'max_age', 'texture', 'prep_time', 'cook_time', 'serving', 'origin', 'recipe_link', 'credibility', 'image_link', 'region', 'difficulty', 'meal_type', 'description', 'dietary_tags', 'choking_hazard', 'tips', 'allergen', 'hypoallergenic', 'nutrition_value', 'ID', 'Energy / Calorie', 'Carbohydrate (g)', 'Protein (g)', 'Fat (g)', 'List of Micros', None, None, None, None, None]
Dataset shape: (1322, 35)


Unnamed: 0,name,ingredients,ner_ingredient,instructions,min_age,max_age,texture,prep_time,cook_time,serving,...,Energy / Calorie,Carbohydrate (g),Protein (g),Fat (g),List of Micros,None,None.1,None.2,None.3,None.4
0,Cassava Porridge with Fish Sauce and Lemon (Bu...,"- 60 g cassava, boiled and blended\n- 20 g fis...","['cassava', 'fish', 'chicken', 'coconut oil', ...","Broth:\n1. Use chicken bones, chicken feet, fi...",6,8,,15 min,45 min,1,...,,,,,,,,,,
1,Bitterballs (Bitterballen),- 100 g beef mince \n- 30 g potato starch \n- ...,"['beef', 'potato starch', 'milk', 'egg', 'marg...",1. Stir-fry blended spices until fragrant. \n2...,9,11,,30 minutes,30 minutes,10 servings,...,,,,,,,,,,
2,Broccoli/Cauliflower Cheese,"- 175g cauliflower/broccoli, cut into pieces\n...","['cauliflower', 'broccoli', 'margarine', 'flou...","1. Steam, boil, or microwave cauliflower/brocc...",6,12,,~10 min,~20 min,,...,,,,,,,,,,
3,Vegetable Fingers,"- 1 carrot, potato, or sweet potato, peeled an...","['carrot', 'potato', 'sweet potato']",1. Steam or microwave vegetables until tender....,6,12,,~5 min,~10 min,,...,,,,,,,,,,
4,Beef Casserole,"- 1 onion, peeled and finely chopped\n- 1½ tab...","['onion', 'vegetable oil', 'beef', 'steak', 'c...",1. Preheat oven to 180°C.\n2. Heat oil in a me...,6,12,,~10 min,~2.5 hours,,...,,,,,,,,,,


In [3]:
#drop data if imporant columns are empty
important_columns = ['name', 'ingredients', 'ner_ingredient', 'instructions', 'recipe_link']
for col in df.columns:
    if col in important_columns:
        null_count = df[col].isnull().sum()        
        if null_count >0:
            df = df.dropna(subset=[col])

# After dropping rows, you may want to reset the index if needed
df = df.reset_index(drop=True)
# Display the cleaned DataFrame
print("Cleaned DataFrame:")
print(df.shape)
df.head(10)

Cleaned DataFrame:
(1322, 35)


Unnamed: 0,name,ingredients,ner_ingredient,instructions,min_age,max_age,texture,prep_time,cook_time,serving,...,Energy / Calorie,Carbohydrate (g),Protein (g),Fat (g),List of Micros,None,None.1,None.2,None.3,None.4
0,Cassava Porridge with Fish Sauce and Lemon (Bu...,"- 60 g cassava, boiled and blended\n- 20 g fis...","['cassava', 'fish', 'chicken', 'coconut oil', ...","Broth:\n1. Use chicken bones, chicken feet, fi...",6,8,,15 min,45 min,1,...,,,,,,,,,,
1,Bitterballs (Bitterballen),- 100 g beef mince \n- 30 g potato starch \n- ...,"['beef', 'potato starch', 'milk', 'egg', 'marg...",1. Stir-fry blended spices until fragrant. \n2...,9,11,,30 minutes,30 minutes,10 servings,...,,,,,,,,,,
2,Broccoli/Cauliflower Cheese,"- 175g cauliflower/broccoli, cut into pieces\n...","['cauliflower', 'broccoli', 'margarine', 'flou...","1. Steam, boil, or microwave cauliflower/brocc...",6,12,,~10 min,~20 min,,...,,,,,,,,,,
3,Vegetable Fingers,"- 1 carrot, potato, or sweet potato, peeled an...","['carrot', 'potato', 'sweet potato']",1. Steam or microwave vegetables until tender....,6,12,,~5 min,~10 min,,...,,,,,,,,,,
4,Beef Casserole,"- 1 onion, peeled and finely chopped\n- 1½ tab...","['onion', 'vegetable oil', 'beef', 'steak', 'c...",1. Preheat oven to 180°C.\n2. Heat oil in a me...,6,12,,~10 min,~2.5 hours,,...,,,,,,,,,,
5,Toast Fingers,- 1 slice thick wholemeal bread,['wholemeal bread'],1. Toast bread and allow it to cool.\n2. Cut i...,6,12,,~5 min,~10 min,,...,,,,,,,,,,
6,Pumpkin Polenta Fingers,"- 3 cups water\n- 1 cup polenta (a coarse, gra...","['water', 'polenta', 'pumpkin', 'parmesan chee...",1. Bring water to the boil in a large saucepan...,6,12,,~10 min,~40 min,8,...,,,,,,,,,,
7,Rice Pudding,- 1 cup cooked rice\n- 1 cup milk\n- ½ - 1 tab...,"['rice', 'milk', 'sugar', 'vanilla essence']","1. In a saucepan mix together rice, milk, and ...",6,12,,~5 min,~15 min,,...,,,,,,,,,,
8,Hummus,"- 400g canned chickpeas, drained\n- 1 clove ga...","['chickpeas', 'garlic', 'lemon juice', 'milk',...",1. Combine all ingredients.\n2. Mash or puree ...,12,12,,~5 min,~10 min,2 cups,...,,,,,,,,,,
9,Baked Bean Pie,- 820g canned baked beans\n- 1 medium zucchini...,"['baked beans', 'zucchini', 'potatoes', 'milk'...",1. Preheat oven to 180°C.\n2. Place baked bean...,12,12,,~10 min,~30 min,,...,,,,,,,,,,


##### Reformatting NER Ingredient

In [4]:
import ast
import re 


def format_ner_ingredient(df):
    # First ensure ner_ingredient is properly parsed as a list
    df['ner_ingredient_list'] = df['ner_ingredient'].apply(
        lambda x: ast.literal_eval(x) if isinstance(x, str) and x.strip().startswith('[') else 
                 x if isinstance(x, list) else 
                 [str(x)] if x else []
    )
    
    # Convert list to comma-separated string
    df['ner_ingredient_string'] = df['ner_ingredient_list'].apply(
        lambda x: ','.join(str(item) for item in x) if isinstance(x, list) else ''
    )
    
    return df

# Apply the function
df = format_ner_ingredient(df)

# Display the results
df[['name', 'ner_ingredient', 'ner_ingredient_string']].head()

Unnamed: 0,name,ner_ingredient,ner_ingredient_string
0,Cassava Porridge with Fish Sauce and Lemon (Bu...,"['cassava', 'fish', 'chicken', 'coconut oil', ...","cassava,fish,chicken,coconut oil,chicken broth..."
1,Bitterballs (Bitterballen),"['beef', 'potato starch', 'milk', 'egg', 'marg...","beef,potato starch,milk,egg,margarine,salt,che..."
2,Broccoli/Cauliflower Cheese,"['cauliflower', 'broccoli', 'margarine', 'flou...","cauliflower,broccoli,margarine,flour,milk,ched..."
3,Vegetable Fingers,"['carrot', 'potato', 'sweet potato']","carrot,potato,sweet potato"
4,Beef Casserole,"['onion', 'vegetable oil', 'beef', 'steak', 'c...","onion,vegetable oil,beef,steak,carrots,potatoe..."


In [5]:
import sys
import os

# Go into 'processing-technique' folder and add it to Python path
processing_dir = os.path.join(os.getcwd(), "preprocessing_techniques")
sys.path.append(processing_dir)

##### **Formatting Min and Max Age Group**

In [7]:
#ensure the min and max age only contain integers
import re 
from openpyxl.styles import PatternFill


def format_age(value):
    match = re.search(r'\d+', str(value))
    if match:
        return int(match.group())
    return None

df['min_age'] = df['min_age'].apply(format_age)
df['max_age'] = df['max_age'].apply(format_age)

print("====Results after formatting====")
df[['min_age','max_age']].head()


====Results after formatting====


Unnamed: 0,min_age,max_age
0,6.0,8.0
1,9.0,11.0
2,6.0,12.0
3,6.0,12.0
4,6.0,12.0


In [8]:
# Now you can import age_label from mapper
from mappers.age_label import age_rules
from pprint import pprint

pprint(age_rules)

{'age_keywords': {'chopped': {'max_age': 36,
                              'min_age': 10,
                              'reason': 'Small soft pieces for chewing '
                                        'practice'},
                  'finger food': {'max_age': 36,
                                  'min_age': 8,
                                  'reason': 'Self-feeding for older babies'},
                  'mash': {'max_age': 10,
                           'min_age': 6,
                           'reason': 'Lumpy texture for advancing skills'},
                  'porridge': {'max_age': 12,
                               'min_age': 4,
                               'reason': 'Easy-to-swallow grains'},
                  'puree': {'max_age': 8,
                            'min_age': 4,
                            'reason': 'Smooth texture for early weaning'},
                  'stew': {'max_age': 36,
                           'min_age': 6,
                           'reason': 'Soft, mixed

In [None]:
def infer_age(recipe_name, ingredients, instructions):
    # Initialize min_age and reasons
    min_age = 6   # Default baby age start (in months)
    max_age = 12  # Default upper limit (3 years)
    reasons = []

    # 1. Check recipe name keywords
    for keyword in age_rules["age_keywords"]:
        if keyword.lower() in recipe_name.lower():
            rule = age_rules["age_keywords"][keyword]
            if rule.get("min_age", 0) > min_age:
                min_age = rule["min_age"]
                reasons.append(f"Recipe name suggests '{keyword}' ({rule['reason']})")
            if "max_age" in rule:
                max_age = min(max_age, rule["max_age"])  # Tighten max if rule restricts it

    # 2. Check ingredient restrictions
    for ingredient in ingredients.split(','):
        ing_key = ingredient.strip().lower()
        if ing_key in age_rules["ingredient_rules"]:
            rule = age_rules["ingredient_rules"][ing_key]
            if rule.get("min_age", 0) > min_age:
                min_age = rule["min_age"]
                reasons.append(f"Ingredient '{ing_key}' requires {rule['reason']}")

    # 3. Check instructions for texture keywords
    for keyword in age_rules["instruction_keywords"]:
        if keyword.lower() in instructions.lower():
            rule = age_rules["instruction_keywords"][keyword]
            if rule.get("min_age", 0) > min_age:
                min_age = rule["min_age"]
                reasons.append(f"Instruction '{keyword}' suggests {rule['reason']}")
            if "max_age" in rule:
                max_age = min(max_age, rule["max_age"])

    return {
        "min_age_group": min_age,
        "max_age_group": max_age
    }

In [None]:
def apply_infer(row):
    if pd.isna(row["min_age_group"]) or pd.isna(row["max_age_group"]):
        result = infer_age(row["food_name"], row["ner_ingredient_csv"], row["instructions"])
        return pd.Series([result["min_age_group"], result["max_age_group"]])
    else:
        return pd.Series([row["min_age_group"], row["max_age_group"]])

# Apply and create new columns
df[["min_age_group", "max_age_group"]] = df.apply(apply_infer, axis=1)

print(df[["food_name", "min_age_group", "max_age_group"]])

##### **Format and Fill the Texture**

In [9]:
# Load the module
import importlib
from pprint import pprint
texture_label_path = os.path.join(os.getcwd(), "preprocessing_techniques", "mappers", "texture_label.py")

spec = importlib.util.spec_from_file_location("texture_label", texture_label_path)
texture_label = importlib.util.module_from_spec(spec)
spec.loader.exec_module(texture_label)

# Access the config
texture_config = texture_label.texture_config
pprint(texture_config)

{'family food': {'keywords': ['for adults',
                              'seasoned for adults',
                              'serve as is',
                              'family style',
                              'serve with family meal',
                              'adult portion',
                              'shared with family',
                              'serve at table',
                              'table food',
                              'family dinner',
                              'not modified',
                              'no blending',
                              'whole meal',
                              'non-baby portion',
                              'spaghetti',
                              'pizza',
                              'fried chicken',
                              'steak',
                              'burger',
                              'taco',
                              'sandwich',
                              'soup with chun

In [10]:
def parse_instructions(text):
    """
    Extracts numbered steps from a block of text and returns a list of instruction strings.
    """
    instructions = []
    num_instruction = 0
    for line in text.splitlines():
        stripped = line.strip()
        if stripped.startswith(f"{num_instruction + 1}."):
            # Remove the prefix like "1.", "2.", etc.
            instruction = stripped[len(str(num_instruction + 1)) + 1:].strip()
            instructions.append(instruction)
    instructions_str = ' '.join(instructions).lower()
    return instructions_str

In [11]:
def classify_recipe_texture(recipe_name, ner_ingredients, instructions_list, TEXTURE_KEYWORDS):
    # Step 1: Ensure ingredients are properly processed
    if not isinstance(ner_ingredients, list):
        ingredients_clean = []
    else:
        ingredients_clean = ner_ingredients

    # Step 2: Clean instructions
    instructions_str = parse_instructions(instructions_list)

    # Step 3: Combine all searchable text - ensure EVERYTHING is a string
    combined_text_parts = [str(recipe_name).lower()]
    
    # Safely add each ingredient as a string, skipping None values
    for i in ingredients_clean:
        if i is not None:
            combined_text_parts.append(str(i).lower())
    
    # Add the instructions
    combined_text_parts.append(instructions_str)
    
    # Join all parts with spaces
    combined_text = ' '.join(combined_text_parts)

    # Step 4: Match against each texture keyword set
    for texture_type, data in TEXTURE_KEYWORDS.items():
        for keyword in data["keywords"]:
            if keyword.lower() in combined_text:
                return texture_type

    # Default fallback
    return "NONE"

In [12]:
def determine_texture(row, TEXTURE_KEYWORDS):
    if pd.isna(row.get("texture")) or row["texture"] in ("", " ", None, "NONE"):
        return classify_recipe_texture(
            recipe_name=row["name"],
            ner_ingredients=row["ner_ingredient_string"],
            instructions_list=row["instructions"],
            TEXTURE_KEYWORDS=TEXTURE_KEYWORDS
        )
    return row["texture"]

df['texture'] = df.apply(determine_texture, axis=1, TEXTURE_KEYWORDS=texture_config)

df[['name', 'ner_ingredient', 'texture']].head(10)

Unnamed: 0,name,ner_ingredient,texture
0,Cassava Porridge with Fish Sauce and Lemon (Bu...,"['cassava', 'fish', 'chicken', 'coconut oil', ...",NONE
1,Bitterballs (Bitterballen),"['beef', 'potato starch', 'milk', 'egg', 'marg...",family food
2,Broccoli/Cauliflower Cheese,"['cauliflower', 'broccoli', 'margarine', 'flou...",NONE
3,Vegetable Fingers,"['carrot', 'potato', 'sweet potato']",NONE
4,Beef Casserole,"['onion', 'vegetable oil', 'beef', 'steak', 'c...",family food
5,Toast Fingers,['wholemeal bread'],NONE
6,Pumpkin Polenta Fingers,"['water', 'polenta', 'pumpkin', 'parmesan chee...",NONE
7,Rice Pudding,"['rice', 'milk', 'sugar', 'vanilla essence']",NONE
8,Hummus,"['chickpeas', 'garlic', 'lemon juice', 'milk',...",NONE
9,Baked Bean Pie,"['baked beans', 'zucchini', 'potatoes', 'milk'...",NONE


In [14]:
#total NONE texture group 
total_none_texture = df[df['texture'] == 'NONE']
print(f"Total recipes with 'NONE' texture: {len(total_none_texture)}")

#any missing value of texture 
missing_texture = df[df['texture'].isnull() | (df['texture'] == '')]
count = 0 
if not missing_texture.empty:
    count = len(missing_texture)
    print(f"Found {count} rows with missing or empty texture values.")
    print(missing_texture[['name', 'ner_ingredient', 'texture']])

print(f"Total rows with missing texture: {count}")

Total recipes with 'NONE' texture: 814
Total rows with missing texture: 0


<p>PS: With the current dictionary, 814 recipes aren't detected, therefore need manual checking before finalizing </p>

##### **Origin and Region Mapping**

Mapping origin values to origin_id using the countrymap sheet

In [16]:
# Load the countrymap sheet
countrymap_df = pd.read_excel('1st_dataset.xlsx', sheet_name='countrymap ')
print("Countrymap columns:", countrymap_df.columns.tolist())
print("Countrymap shape:", countrymap_df.shape)
print("\nFirst 5 rows of countrymap:")
display(countrymap_df.head())

Countrymap columns: ['pk', 'country', 'region', 'flag_code']
Countrymap shape: (76, 4)

First 5 rows of countrymap:


Unnamed: 0,pk,country,region,flag_code
0,1,China,Asian,cn
1,2,Japan,Asian,jp
2,3,South Korea,Asian,kr
3,4,Mongolia,Asian,mn
4,5,Nepal,Asian,np


In [17]:
print("UNIQUE ORIGIN VALUES IN MAIN DATASET:")
print("="*50)
unique_origins = df['origin'].dropna().unique()
print(f"Total unique origins: {len(unique_origins)}")
print("\nOrigin value counts:")
print(df['origin'].value_counts())

UNIQUE ORIGIN VALUES IN MAIN DATASET:
Total unique origins: 31

Origin value counts:
origin
Indonesia      758
french         141
uk             100
UK              56
New Zealand     52
Australia       30
America         30
Malaysia        24
Indonesian      22
China           20
Philippines     14
India           14
Afghanistan      8
Germany          7
Bhutan           5
Thailand         5
Japan            5
Singapore        5
Nepal            5
Mongolia         4
South Korea      3
Lao PDR          3
Bangladesh       2
Brunei           2
Laos             1
Myanmar          1
Maldives         1
Cambodia         1
Japan            1
China            1
Belgium          1
Name: count, dtype: int64


In [18]:
country_to_id = {}

# Step 1: Populate the dictionary using lowercase country names
for _, row in countrymap_df.iterrows():
    country = str(row['country']).strip().lower()  # normalize
    origin_id = row['pk']
    region = row['region']
    
    country_to_id[country] = {'id': origin_id, 'region': region}

# Step 2: Add alternative lowercase names (also normalized)
alternatives = {
    'french': 'france',
    'uk': 'united kingdom', 
    'america': 'united states',
    'usa': 'united states',
    'indonesian': 'indonesia',
    'lao pdr': 'laos',
    'japan ': 'japan',
}

for alt, standard in alternatives.items():
    alt = alt.strip().lower()
    standard = standard.strip().lower()
    if standard in country_to_id:
        country_to_id[alt] = country_to_id[standard]

print("✅ Mapping dictionary created with", len(country_to_id), "entries.")


✅ Mapping dictionary created with 80 entries.


In [19]:
# Function to map origin to origin_id and region
def map_origin(origin_value):
    if pd.isna(origin_value):
        return None, None
    
    origin_str = str(origin_value).strip().lower()  # Normalize for consistent lookup
    
    if origin_str in country_to_id:
        return country_to_id[origin_str]['id'], country_to_id[origin_str]['region']
    
    return None, None

In [20]:
# Sample test cases to check various formats
test_origins = ['USA', 'usa', 'UsA ', ' United States', 'french', 'UK', 'uk', 'Indonesian', 'Laos', 'Japan ', 'NotARealCountry']
test_df = pd.DataFrame({'origin': test_origins})
# Apply mapping
print("🛠️ Applying origin mapping...")
mapping_results = test_df['origin'].apply(map_origin)
test_df['origin_id'] = [result[0] for result in mapping_results]
test_df['mapped_region'] = [result[1] for result in mapping_results]

# Summary
print("✅ Origin mapping completed.")
print(f"✅ Mapped origins: {test_df['origin_id'].notna().sum()}")
print(f"❌ Unmapped origins: {test_df['origin_id'].isna().sum()}")
print("📋 Test results:")
print(test_df[['origin', 'origin_id', 'mapped_region']])

🛠️ Applying origin mapping...
✅ Origin mapping completed.
✅ Mapped origins: 10
❌ Unmapped origins: 1
📋 Test results:
             origin  origin_id       mapped_region
0               USA       46.0            American
1               usa       46.0            American
2              UsA        46.0            American
3     United States       46.0            American
4            french       38.0  Western / European
5                UK       39.0  Western / European
6                uk       39.0  Western / European
7        Indonesian        9.0     Southeast Asian
8              Laos       14.0     Southeast Asian
9            Japan         2.0               Asian
10  NotARealCountry        NaN                None


In [21]:
# Apply mapping
print("Applying origin mapping...")
mapping_results = df['origin'].apply(map_origin)
df['origin_id'] = [result[0] for result in mapping_results]
df['mapped_region'] = [result[1] for result in mapping_results]

print("✅ Origin mapping completed")
print(f"Mapped origins: {df['origin_id'].notna().sum()}")
print(f"Unmapped origins: {df['origin_id'].isna().sum()}")

display(df[['origin', 'origin_id', 'mapped_region']].head())

Applying origin mapping...
✅ Origin mapping completed
Mapped origins: 1322
Unmapped origins: 0


Unnamed: 0,origin,origin_id,mapped_region
0,Indonesia,9,Southeast Asian
1,Indonesia,9,Southeast Asian
2,Australia,68,Oceanic
3,Australia,68,Oceanic
4,Australia,68,Oceanic


##### **Formatting Difficulty**

In [25]:
import re

def remove_min(text):
    if pd.isna(text):
        return None
    match = re.search(r'\d+', str(text))
    if match:
        return int(match.group())
    return None

# Apply the function to prep_time and cook_time columns
df['prep_time'] = df['prep_time'].apply(remove_min)
df['cook_time'] = df['cook_time'].apply(remove_min)

df[['prep_time', 'cook_time']].head(10)

Unnamed: 0,prep_time,cook_time
0,15.0,45.0
1,30.0,30.0
2,10.0,20.0
3,5.0,10.0
4,10.0,2.0
5,5.0,10.0
6,10.0,40.0
7,5.0,15.0
8,5.0,10.0
9,10.0,30.0


In [26]:
import codecs

def parse_ingredients(text):
    """
    Extracts lines that start with '-' from a block of text and returns a list of ingredients.
    """
    ingredients = []
    num_ingredient=0
    # print(text)
    for line in text.split('\n'):
        stripped = line.strip()
        if stripped.startswith('-'):
            ingredients.append(stripped[1:].strip())
            num_ingredient+=1
    return num_ingredient


def parse_instructions(text):
    """
    Extracts numbered steps from a block of text and returns a list of instruction strings.
    """
    instructions = []
    num_instruction=0
    for line in text.splitlines():
        stripped = line.strip()
        if stripped.startswith(str(num_instruction+1)+'.'):
            instructions.append(stripped[len(str(num_instruction+1))+1:].strip())
            num_instruction+=1
    return num_instruction

In [27]:
def estimate_difficulty(ingredients, instructions):
    ingredients = codecs.decode(ingredients, 'unicode_escape')
    instructions = codecs.decode(instructions, 'unicode_escape')
    num_ingredients = parse_ingredients(ingredients)
    num_steps = parse_instructions(instructions)
    # print(f"Number of ingredients: {num_ingredients}")
    # print(f"Number of steps: {num_steps}")
    if num_ingredients <= 4 and num_steps <= 5:
        return "Easy"
    elif num_ingredients <= 8 and num_steps <= 8:
        return "Medium"
    else:
        return "Hard"

In [28]:
# Estimate difficulty
df['difficulty'] = df.apply(
    lambda row: estimate_difficulty(row['ingredients'], row['instructions']),
    axis=1
)

df[['name', 'difficulty']].head(10)


Unnamed: 0,name,difficulty
0,Cassava Porridge with Fish Sauce and Lemon (Bu...,Medium
1,Bitterballs (Bitterballen),Hard
2,Broccoli/Cauliflower Cheese,Medium
3,Vegetable Fingers,Easy
4,Beef Casserole,Medium
5,Toast Fingers,Easy
6,Pumpkin Polenta Fingers,Medium
7,Rice Pudding,Easy
8,Hummus,Medium
9,Baked Bean Pie,Medium


##### **Determine Choking Hazard**

In [339]:
# Load the module
import importlib

choking_hazard_path = os.path.join(os.getcwd(), "preprocessing_techniques", "mappers", "choking_hazard_classifiers.py")

spec = importlib.util.spec_from_file_location("choking_hazard", choking_hazard_path)
choking_hazard_file = importlib.util.module_from_spec(spec)
spec.loader.exec_module(choking_hazard_file)

# Access the config
choking_hazard  = choking_hazard_file.choking_hazard
pprint(choking_hazard)

{'all_choking_hazards': ['whole blueberries',
                         'grapes',
                         'whole cherry tomatoes',
                         'whole strawberries',
                         'raw apple chunks',
                         'raw pear chunks',
                         'whole grapes',
                         'whole raspberries',
                         'whole blackberries',
                         'whole peaches',
                         'melon balls',
                         'raw carrot sticks',
                         'raw bell pepper',
                         'raw cucumber',
                         'raw zucchini',
                         'raw broccoli florets',
                         'raw cauliflower',
                         'raw green beans',
                         'whole edamame',
                         'whole peas',
                         'whole lentils',
                         'whole chickpeas',
                         'whole kidney be

In [340]:
def has_choking_hazard(row):
    """
    Detects baby food choking hazards using NER ingredients + instructions.
    
    Parameters:
        row: DataFrame row with ner_ingredient, ingredient, and instructions
    
    Returns:
        str: "Yes" or "No"
    """
    # Step 1: Get & normalize ingredients
    ner_ingredients = row.get("ner_ingredient", [])
    original_ingredients = row.get("ingredient", "")
    instructions = row.get("instructions", "")
    
    # Convert list of ingredients to string for search
    ner_ingredients_text = ' '.join([str(i).lower() for i in ner_ingredients])
    
    # Step 2: Clean instructions
    cleaned_instructions = ""
    if instructions:
        if callable(getattr(instructions, 'lower', None)):
            cleaned_instructions = instructions.lower()
    
    # Step 3: Combine all searchable text
    combined_text = ' '.join([
        ner_ingredients_text,
        str(original_ingredients).lower(),
        cleaned_instructions
    ])
    
    # Step 4: Match against all_choking_hazards
    matched_categories = []
    for category, hazards in choking_hazard.items():
        if category == "all_choking_hazards":
            # Optional: Add a general safety fallback
            for hazard in hazards:
                if hazard in combined_text and "all_choking_hazards" not in matched_categories:
                    matched_categories.append("all_choking_hazards")
                    break  # No need to keep checking once match found
        else:
            # Regular categories like fruits, veggies, etc.
            for hazard in hazards:
                if hazard in combined_text and category not in matched_categories:
                    matched_categories.append(category)
                    break  # Break inner loop after first match in this category
    
    if matched_categories:
        return "Yes"
    else:
        return "No"

In [341]:
# Apply to every row in the DataFrame
df['choking_hazards'] = df.apply(has_choking_hazard, axis=1)
df[['name', 'ner_ingredient_string', 'choking_hazards']].head(10)

Unnamed: 0,name,ner_ingredient_string,choking_hazards
0,Cassava Porridge with Fish Sauce and Lemon (Bu...,"cassava,fish,chicken,coconut oil,chicken broth...",No
1,Bitterballs (Bitterballen),"beef,potato starch,milk,egg,margarine,salt,che...",No
2,Broccoli/Cauliflower Cheese,"cauliflower,broccoli,margarine,flour,milk,ched...",No
3,Vegetable Fingers,"carrot,potato,sweet potato",No
4,Beef Casserole,"onion,vegetable oil,beef,steak,carrots,potatoe...",No
5,Toast Fingers,wholemeal bread,No
6,Pumpkin Polenta Fingers,"water,polenta,pumpkin,parmesan cheese,oil",No
7,Rice Pudding,"rice,milk,sugar,vanilla essence",No
8,Hummus,"chickpeas,garlic,lemon juice,milk,tahini",No
9,Baked Bean Pie,"baked beans,zucchini,potatoes,milk,cheese,chives",No


##### Formatting Unique NER Ingredients To **Map Ingredient w/ Allergen Group**

In [35]:
df_ingredients = df.copy()

In [37]:
#cleaning NER ingredient after checking 
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Download required NLTK data (run once)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
nltk.download('stopwords', quiet=True)

# Initialize NLTK lemmatizer
lemmatizer = WordNetLemmatizer()

def clean_ingredient_symbols(ingredient_text):
    """
    Enhanced ingredient cleaning using NLTK for baby food ingredients
    """
    if not ingredient_text or pd.isna(ingredient_text):
        return ""
    
    # Convert to string and strip
    cleaned = str(ingredient_text).strip().lower()
    
    # Remove brackets and quotes
    cleaned = re.sub(r'^[\[\'\"\s]+', '', cleaned)
    cleaned = re.sub(r'[\]\'\"\s]+$', '', cleaned)
    cleaned = re.sub(r'[\[\]\'\"]', '', cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    
    if not cleaned or len(cleaned) <= 1:
        return ""
    
    # Split into words for processing
    words = cleaned.split()
    processed_words = []
    
    # Words to completely remove
    words_to_remove = {
        'baby', 'babies', 'infant', 'toddler', 'little', 'mini', 'small',
        'organic', 'fresh', 'frozen', 'canned', 'dried', 'raw', 'cooked',
        'steamed', 'boiled', 'mashed', 'pureed', 'chopped', 'diced',
        'sliced', 'grated', 'peeled', 'unsalted', 'natural', 'pure',
        'whole', 'half', 'quarter', 'piece', 'pieces'
    }
    
    for word in words:
        # Skip banned words
        if word.lower() in words_to_remove:
            continue
        
        # Use NLTK lemmatizer to handle plurals
        singular_word = lemmatizer.lemmatize(word, 'n')  # 'n' for noun
        
        # If NLTK didn't change it, try custom food-specific rules
        if singular_word == word:
            singular_word = handle_food_plurals(word)
        
        if singular_word and len(singular_word) > 1:
            processed_words.append(singular_word)
    
    result = ' '.join(processed_words).strip()
    return result if result else ""

def handle_food_plurals(word):
    """
    Handle food-specific plurals that NLTK might miss
    """
    # Special food cases
    food_plurals = {
        'potatoes': 'potato',
        'tomatoes': 'tomato',
        'mangoes': 'mango',
        'avocados': 'avocado',
        'bananas': 'banana',
        'strawberries': 'strawberry',
        'blueberries': 'blueberry',
        'raspberries': 'raspberry',
        'blackberries': 'blackberry',
        'cranberries': 'cranberry',
        'cherries': 'cherry',
        'berries': 'berry',
        'beans': 'bean',
        'peas': 'pea',
        'carrots': 'carrot',
        'onions': 'onion',
        'peppers': 'pepper',
        'eggs': 'egg',
        'nuts': 'nut',
        'oats': 'oat',
        'grains': 'grain',
    }
    
    word_lower = word.lower()
    
    # Check special cases first
    if word_lower in food_plurals:
        return food_plurals[word_lower]
    
    # Apply general plural rules
    # Words ending in 'ies' -> 'y'
    if word_lower.endswith('ies') and len(word_lower) > 4:
        return word_lower[:-3] + 'y'
    
    # Words ending in 'oes' -> 'o'
    if word_lower.endswith('oes') and len(word_lower) > 4:
        return word_lower[:-2]
    
    # Words ending in 's' -> remove 's'
    if word_lower.endswith('s') and len(word_lower) > 2:
        return word_lower[:-1]
    
    return word

In [38]:
# Test the enhanced function
print("=== TESTING NLTK-BASED CLEANING ===")
test_cases = [
    "baby spinach",
    "organic sweet potatoes", 
    "fresh strawberries",
    "mini carrots",
    "cooked black beans",
    "mashed bananas",
    "steamed broccoli florets",
    "pureed mangoes",
    "diced tomatoes",
    "baby peas"
]

print("Before → After cleaning:")
for ingredient in test_cases:
    cleaned = clean_ingredient_symbols(ingredient)
    print(f"'{ingredient}' → '{cleaned}'")

=== TESTING NLTK-BASED CLEANING ===
Before → After cleaning:
'baby spinach' → 'spinach'
'organic sweet potatoes' → 'sweet potato'
'fresh strawberries' → 'strawberry'
'mini carrots' → 'carrot'
'cooked black beans' → 'black bean'
'mashed bananas' → 'banana'
'steamed broccoli florets' → 'broccoli floret'
'pureed mangoes' → 'mango'
'diced tomatoes' → 'tomato'
'baby peas' → 'pea'


In [42]:
def clean_ingredient_string(ingredient_string):
        if not isinstance(ingredient_string, str) or not ingredient_string.strip():
            return ""
        
        ingredients = ingredient_string.split(',')
        cleaned_ingredients = []
        
        for ingredient in ingredients:
            cleaned_ingredient = clean_ingredient_symbols(ingredient)
            if cleaned_ingredient:  # Only add non-empty ingredients
                cleaned_ingredients.append(cleaned_ingredient)
        # print(f"Cleaned ingredients: {cleaned_ingredients}")
        return ','.join(cleaned_ingredients)


In [43]:

def clean_ner_ingredients(df_ingredients):
    print("=== CLEANING NER INGREDIENTS ===")
    
    # Apply cleaning to ner_ingredient_string column
    df_ingredients['ner_ingredient_string'] = df_ingredients['ner_ingredient_string'].apply(clean_ingredient_string)
    
    # BEFORE removing rows, identify which ones will be deleted
    initial_count = len(df_ingredients)
    
    # Find rows that will be deleted (empty or whitespace-only strings)
    rows_to_delete = df_ingredients[df_ingredients['ner_ingredient_string'].apply(lambda x: len(str(x).strip()) == 0)]
    
    if len(rows_to_delete) > 0:
        print(f"\n⚠️  ROWS TO BE DELETED ({len(rows_to_delete)} rows):")
        print("="*60)
        
        # Display information about deleted rows
        for idx, row in rows_to_delete.iterrows():
            print(f"\nRow Index: {idx}")
            print(f"Recipe Name: {row.get('name', 'N/A')}")
            print(f"Original ner_ingredient_string: '{row.get('ner_ingredient_string', 'N/A')}'")
            
            # Show the original ner_ingredient if available
            if 'ner_ingredient' in row:
                print(f"Original ner_ingredient: {row['ner_ingredient']}")
            
            # Show original ingredients if available
            if 'ingredient' in row:
                original_ingredients = str(row['ingredient'])[:100] + "..." if len(str(row['ingredient'])) > 100 else str(row['ingredient'])
                print(f"Original ingredients: {original_ingredients}")
            
            print("-" * 40)
        
        # Optional: Save deleted rows to a separate file for analysis
        rows_to_delete.to_excel('deleted_rows_analysis.xlsx', index=True)
        print(f"\n💾 Saved deleted rows to 'deleted_rows_analysis.xlsx' for detailed analysis")
    
    # Now remove the rows
    df_ingredients = df_ingredients[df_ingredients['ner_ingredient_string'].apply(lambda x: len(str(x).strip()) > 0)]
    removed_count = initial_count - len(df_ingredients)
    
    if removed_count > 0:
        print(f"\n✅ Removed {removed_count} rows with no valid ingredients after cleaning")
        print(f"Remaining rows: {len(df_ingredients)}")
    else:
        print("\n✅ No rows were removed - all recipes retained valid ingredients")
    
    return df_ingredients

# Apply the enhanced cleaning function
df_ingredients = clean_ner_ingredients(df_ingredients)

=== CLEANING NER INGREDIENTS ===

✅ No rows were removed - all recipes retained valid ingredients


In [44]:
df[['name', 'ner_ingredient_string']].head(10)

Unnamed: 0,name,ner_ingredient_string
0,Cassava Porridge with Fish Sauce and Lemon (Bu...,"cassava,fish,chicken,coconut oil,chicken broth..."
1,Bitterballs (Bitterballen),"beef,potato starch,milk,egg,margarine,salt,che..."
2,Broccoli/Cauliflower Cheese,"cauliflower,broccoli,margarine,flour,milk,ched..."
3,Vegetable Fingers,"carrot,potato,sweet potato"
4,Beef Casserole,"onion,vegetable oil,beef,steak,carrots,potatoe..."
5,Toast Fingers,wholemeal bread
6,Pumpkin Polenta Fingers,"water,polenta,pumpkin,parmesan cheese,oil"
7,Rice Pudding,"rice,milk,sugar,vanilla essence"
8,Hummus,"chickpeas,garlic,lemon juice,milk,tahini"
9,Baked Bean Pie,"baked beans,zucchini,potatoes,milk,cheese,chives"


In [45]:

all_ingredients = []

for ingredient_string in df_ingredients['ner_ingredient_string']:
    if isinstance(ingredient_string, str) and ingredient_string.strip():
        # Split by comma and clean each ingredient
        ingredients = [ing.strip().lower() for ing in ingredient_string.split(',') if ing.strip()]
        all_ingredients.extend(ingredients)

display(df[['name', 'ner_ingredient_string']].head(10))
print(f"Total unique ingredients found: {len(set(all_ingredients))}")


Unnamed: 0,name,ner_ingredient_string
0,Cassava Porridge with Fish Sauce and Lemon (Bu...,"cassava,fish,chicken,coconut oil,chicken broth..."
1,Bitterballs (Bitterballen),"beef,potato starch,milk,egg,margarine,salt,che..."
2,Broccoli/Cauliflower Cheese,"cauliflower,broccoli,margarine,flour,milk,ched..."
3,Vegetable Fingers,"carrot,potato,sweet potato"
4,Beef Casserole,"onion,vegetable oil,beef,steak,carrots,potatoe..."
5,Toast Fingers,wholemeal bread
6,Pumpkin Polenta Fingers,"water,polenta,pumpkin,parmesan cheese,oil"
7,Rice Pudding,"rice,milk,sugar,vanilla essence"
8,Hummus,"chickpeas,garlic,lemon juice,milk,tahini"
9,Baked Bean Pie,"baked beans,zucchini,potatoes,milk,cheese,chives"


Total unique ingredients found: 930


In [46]:
# Check for problematic ingredients again
print("\n=== CHECKING FOR REMAINING ISSUES ===")
problematic_ingredients = []

for ingredient in all_ingredients:
    # Check for symbols, empty strings, or unwanted characters
    if any(char in ingredient for char in ['[', ']', '_', '___', '{', '}', '(', ')', '@', '#', '$', "'", '"']):
        problematic_ingredients.append(ingredient)

if problematic_ingredients:
    print(f"Still found {len(problematic_ingredients)} ingredients with symbols:")
    for ing in set(problematic_ingredients):
        count = problematic_ingredients.count(ing)
        print(f"  '{ing}' - appears {count} times")
else:
    print("✅ All problematic symbols have been cleaned!")

# Show before and after examples
print("\n=== CLEANING RESULTS ===")
print("Examples of cleaned ingredients:")
unique_ingredients = list(set(all_ingredients))
for i, ing in enumerate(unique_ingredients[:10]):
    print(f"{i+1:2d}. '{ing}'")

print(f"\nTotal unique ingredients after cleaning: {len(unique_ingredients)}")
print(f"Total ingredient instances: {len(all_ingredients)}")


=== CHECKING FOR REMAINING ISSUES ===
✅ All problematic symbols have been cleaned!

=== CLEANING RESULTS ===
Examples of cleaned ingredients:
 1. 'mayonnaise'
 2. 'chicken seasoning'
 3. 'tapioca'
 4. 'petit suisse'
 5. 'chicken abon'
 6. 'rice milk'
 7. 'kamote'
 8. 'ground beef'
 9. 'gla water'
10. 'yellow sweet corn'

Total unique ingredients after cleaning: 930
Total ingredient instances: 8162


In [360]:
if 'df' in locals() and 'ner_ingredient_string' in df.columns:
    
    # Create recipe-ingredient structure
    recipe_ingredient_data = []
    
    for index, row in df.iterrows():
        recipe_id = index + 1  # Creating recipe_id starting from 1
        recipe_name = row.get('name', f'Recipe_{recipe_id}')  # Use actual name or fallback
        ner_ingredient_string = str(row['ner_ingredient_string']).strip()
        
        if pd.isna(ner_ingredient_string) or ner_ingredient_string == 'nan' or ner_ingredient_string == '':
            # Handle empty ingredient strings
            recipe_ingredient_data.append({
                'recipe_id': recipe_id,
                'recipe_name': recipe_name,
                'single_ingredient': None,
                'ingredient_position': 0,
                'original_ingredient_string': ner_ingredient_string
            })
        else:
            # Split by comma and clean each ingredient
            individual_ingredients = [ing.strip() for ing in ner_ingredient_string.split(',')]
            individual_ingredients = [ing for ing in individual_ingredients if ing]  # Remove empty strings
            
            if individual_ingredients:
                for position, ingredient in enumerate(individual_ingredients, 1):
                    recipe_ingredient_data.append({
                        'recipe_id': recipe_id,
                        'recipe_name': recipe_name,
                        'single_ingredient': ingredient,
                        'ingredient_position': position,
                        'original_ingredient_string': ner_ingredient_string
                    })
            else:
                # Fallback for cases where split results in empty list
                recipe_ingredient_data.append({
                    'recipe_id': recipe_id,
                    'recipe_name': recipe_name,
                    'single_ingredient': ner_ingredient_string,
                    'ingredient_position': 1,
                    'original_ingredient_string': ner_ingredient_string
                })
    
    # Create recipe-ingredient dataframe
    recipe_ingredient_df = pd.DataFrame(recipe_ingredient_data)
    
    print(f"✅ Created recipe-ingredient structure:")
    print(f"   Original recipes: {len(df)}")
    print(f"   Recipe-ingredient records: {len(recipe_ingredient_df)}")
    print(f"   Average ingredients per recipe: {len(recipe_ingredient_df) / len(df):.2f}")
    
    # Show sample structure
    print(f"\n📋 Sample Recipe-Ingredient Structure:")
    sample_recipes = recipe_ingredient_df[recipe_ingredient_df['recipe_id'].isin([1, 2, 3])]
    display(sample_recipes[['recipe_id', 'recipe_name', 'single_ingredient', 'ingredient_position']].head(10))
    
else:
    print("❌ Dataset not found or 'ner_ingredient_string' column missing")
    if 'df' in locals():
        print("Available columns:", [col for col in df.columns if 'ingredient' in str(col).lower()])

✅ Created recipe-ingredient structure:
   Original recipes: 1322
   Recipe-ingredient records: 8176
   Average ingredients per recipe: 6.18

📋 Sample Recipe-Ingredient Structure:


Unnamed: 0,recipe_id,recipe_name,single_ingredient,ingredient_position
0,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,cassava,1
1,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,fish,2
2,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,chicken,3
3,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,coconut oil,4
4,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,chicken broth,5
5,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,lime juice,6
6,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,spinach,7
7,2,Bitterballs (Bitterballen),beef,1
8,2,Bitterballs (Bitterballen),potato starch,2
9,2,Bitterballs (Bitterballen),milk,3


In [361]:
# Get unique ingredients
unique_ingredients_list = recipe_ingredient_df['single_ingredient'].dropna().unique()
print(f"\n📊 Unique Ingredients Analysis:")
print(f"   Total unique ingredients: {len(unique_ingredients_list)}")
print(f"   Sample unique ingredients: {list(unique_ingredients_list[:10])}")


📊 Unique Ingredients Analysis:
   Total unique ingredients: 1055
   Sample unique ingredients: ['cassava', 'fish', 'chicken', 'coconut oil', 'chicken broth', 'lime juice', 'spinach', 'beef', 'potato starch', 'milk']


In [362]:
# Check for null values in specific columns
columns_to_check = ['single_ingredient', 'ingredient_position', 'recipe_name', 'original_ingredient_string']

# Method 1: Check if ANY of these columns have null values in ANY row
null_mask = recipe_ingredient_df[columns_to_check].isnull().any(axis=1)
rows_with_nulls = recipe_ingredient_df[null_mask]

print("Rows with null values in 'single_ingredient', 'ingredient_position', or 'recipe_name':")
print(f"Number of rows with nulls: {len(rows_with_nulls)}")
if len(rows_with_nulls) > 0:
    print("Sample rows with null values:")
    display(rows_with_nulls[columns_to_check].head(10))
else:
    print("✅ No null values found in these columns!")

#drop rows with null values in any of the if original_ingredient_string
print("Dropping rows with null values in original_ingredient_string,...")
recipe_ingredient_df = recipe_ingredient_df.dropna(subset=['original_ingredient_string'])
print(f"Remaining rows after dropping nulls: {len(recipe_ingredient_df)}")



Rows with null values in 'single_ingredient', 'ingredient_position', or 'recipe_name':
Number of rows with nulls: 13
Sample rows with null values:


Unnamed: 0,single_ingredient,ingredient_position,recipe_name,original_ingredient_string
2918,,0,Fish congee,
7783,,0,Fruity Winter Oats Recipe,
7784,,0,Italian Meatloaf with Salsa,
7785,,0,Mini Meatballs with Pasta,
7786,,0,Tuna Salad,
7787,,0,Pasta Chicken Bake,
7788,,0,Vegetable Patties,
7789,,0,Easy Chicken Dinner,
7790,,0,Biscuits,
7791,,0,Minestrone Soup,


Dropping rows with null values in original_ingredient_string,...
Remaining rows after dropping nulls: 8176


Standarization of NER Ingredients

In [363]:
print("🔄 LOADING NER STANDARDIZATION MAPPING")
print("=" * 60)

standardization_file = 'ner_data_standarization.txt'

# Load standardization mapping
standardization_mapping = {}
compound_ingredients = {}

try:
    with open(standardization_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    
    for line in lines:
        line = line.strip()
        if ' --> ' in line:
            original, standardized = line.split(' --> ', 1)
            original = original.strip()
            standardized = standardized.strip()
            standardization_mapping[original] = standardized
    
    print(f"✅ Loaded {len(standardization_mapping)} standardization mappings")
    
    # Show sample mappings
    print("\n📋 Sample standardization mappings:")
    sample_items = list(standardization_mapping.items())[:10]
    for original, standardized in sample_items:
        print(f"   '{original}' → '{standardized}'")
    
except FileNotFoundError:
    print(f"❌ File not found: {standardization_file}")
except Exception as e:
    print(f"❌ Error loading file: {e}")

🔄 LOADING NER STANDARDIZATION MAPPING
✅ Loaded 930 standardization mappings

📋 Sample standardization mappings:
   'acai' → 'acai'
   'agar agar' → 'agar-agar powder'
   'agar-agar powder' → 'agar-agar powder'
   'all purpose flour' → 'all purpose flour'
   'all-purpose flour' → 'all purpose flour'
   'almond' → 'almond'
   'almond butter' → 'almond butter'
   'almond milk' → 'almond milk'
   'almond nut' → 'almond nut'
   'alphabet pasta' → 'pasta'


In [364]:
def detect_compound_ingredients(recipe_ingredient_df):
    """
    Step 1: Scan all ingredients and list compound sentences that need special handling.
    
    Args:
        recipe_ingredient_df: DataFrame with single_ingredient column
        
    Returns:
        dict: Dictionary of detected compound ingredients and their counts
    """
    print("DETECTING COMPOUND INGREDIENTS")
    
    compound_indicators = {
        'multiple_foods': [],      # Multiple food items in one string
        'long_sentences': [],      # More than 4 words
        'known_patterns': []       # Known compound patterns
    }
    
    # Criteria for compound detection
    def is_likely_compound(ingredient_text):
        if not ingredient_text or pd.isna(ingredient_text):
            return False, "null"
            
        words = str(ingredient_text).strip().split()
        word_count = len(words)
        
        # Check for multiple food indicators
        food_keywords = ['oil', 'sauce', 'salt', 'sugar', 'water', 'milk', 'juice', 'broth', 
                        'beef', 'chicken', 'pork', 'fish', 'tofu', 'rice', 'noodle', 'egg', 'seed',
                        'teri','sesame']
        
        food_count = sum(1 for word in words if word.lower() in food_keywords)
        
        # Classification logic
        if word_count > 6:
            return True, "long_sentence"
        elif food_count >= 3:
            return True, "multiple_foods"
        elif word_count > 4 and food_count >= 2:
            return True, "mixed_pattern"
        else:
            return False, "simple"
    
    # Scan all ingredients
    compound_findings = {}
    
    for index, row in recipe_ingredient_df.iterrows():
        ingredient = row['single_ingredient']
        is_compound, reason = is_likely_compound(ingredient)
        
        if is_compound:
            if ingredient not in compound_findings:
                compound_findings[ingredient] = {
                    'count': 0,
                    'reason': reason,
                    'recipe_ids': []
                }
            compound_findings[ingredient]['count'] += 1
            compound_findings[ingredient]['recipe_ids'].append(row['recipe_id'])
    
    print(f"✅ Found {len(compound_findings)} compound ingredients")
    
    # Sort by frequency
    sorted_compounds = sorted(compound_findings.items(), 
                            key=lambda x: x[1]['count'], 
                            reverse=True)
    
    print(f"\n📋 TOP COMPOUND INGREDIENTS (sorted by frequency):")
    print("-" * 80)
    
    for i, (ingredient, data) in enumerate(sorted_compounds[:15], 1):
        print(f"{i:2d}. '{ingredient}'")
        print(f"    Count: {data['count']} | Reason: {data['reason']}")
        print(f"    Recipe IDs: {data['recipe_ids'][:5]}{'...' if len(data['recipe_ids']) > 5 else ''}")
        print()
    
    return dict(sorted_compounds)

# Run compound detection
if 'recipe_ingredient_df' in locals():
    detected_compounds = detect_compound_ingredients(recipe_ingredient_df)
else:
    print("❌ Recipe ingredient DataFrame not found. Run previous steps first.")

DETECTING COMPOUND INGREDIENTS
✅ Found 8 compound ingredients

📋 TOP COMPOUND INGREDIENTS (sorted by frequency):
--------------------------------------------------------------------------------
 1. 'bumboo teri bubuk sesame seed'
    Count: 1 | Reason: multiple_foods
    Recipe IDs: [246]

 2. 'oil clove garlic beef sweet soy sauce salt sugar rice water tofu'
    Count: 1 | Reason: long_sentence
    Recipe IDs: [338]

 3. 'rice cooking oil clove garlic chicken broth salt shrimp carrot celery extra chicken broth'
    Count: 1 | Reason: long_sentence
    Recipe IDs: [607]

 4. 'rice red sweet potato chicken broth long bean â salt margarine'
    Count: 1 | Reason: long_sentence
    Recipe IDs: [712]

 5. 'seed baby lemon juice water formula milk'
    Count: 1 | Reason: long_sentence
    Recipe IDs: [783]

 6. 'beef chicken broth'
    Count: 1 | Reason: multiple_foods
    Recipe IDs: [929]

 7. 'low sodium vegetable chicken broth'
    Count: 1 | Reason: mixed_pattern
    Recipe IDs: [930]


In [365]:
PREDEFINED_COMPOUND_PATTERNS = {
    # Exact matches from your problematic cases
    'oil clove garlic beef sweet soy sauce salt sugar rice water tofu': [
        'oil', 'clove garlic', 'beef', 'sweet soy sauce', 'salt', 'sugar', 'rice', 'water', 'tofu'
    ],
    'rice red sweet potato chicken broth long bean salt margarine': [
        'rice', 'red sweet potato', 'chicken broth', 'long bean', 'salt', 'margarine'
    ],
    'seed baby lemon juice water formula milk': [
        'lemon juice', 'water', 'formula milk'
    ],
    'rice cooking oil clove garlic chicken broth salt shrimp carrot celery extra chicken broth': [
        'rice', 'cooking oil', 'clove garlic', 'chicken broth', 'salt', 'shrimp', 'carrot', 'celery', 'extra chicken broth'
    ],
    'cooking oil clove garlic beef sweet soy sauce salt sugar': [
        'cooking oil', 'clove garlic', 'beef', 'sweet soy sauce', 'salt', 'sugar'
    ],
    'bumboo teri bubuk sesame seed': [
        'anchovy powder', 'sesame seed'
    ],
    'roll egg noodle boiling water': [
        'noodle', 'water'
    ],
    'chicken upper lower thigh wing': [
        'chicken thigh', 'chicken wing'
    ]
}

print("📋 PREDEFINED COMPOUND PATTERNS")
print("=" * 60)
print(f"Total predefined patterns: {len(PREDEFINED_COMPOUND_PATTERNS)}")
print("\nPattern examples:")
for i, (compound, parts) in enumerate(list(PREDEFINED_COMPOUND_PATTERNS.items())[:5], 1):
    print(f"{i}. '{compound}'")
    print(f"   → {parts}")
    print()

📋 PREDEFINED COMPOUND PATTERNS
Total predefined patterns: 8

Pattern examples:
1. 'oil clove garlic beef sweet soy sauce salt sugar rice water tofu'
   → ['oil', 'clove garlic', 'beef', 'sweet soy sauce', 'salt', 'sugar', 'rice', 'water', 'tofu']

2. 'rice red sweet potato chicken broth long bean salt margarine'
   → ['rice', 'red sweet potato', 'chicken broth', 'long bean', 'salt', 'margarine']

3. 'seed baby lemon juice water formula milk'
   → ['lemon juice', 'water', 'formula milk']

4. 'rice cooking oil clove garlic chicken broth salt shrimp carrot celery extra chicken broth'
   → ['rice', 'cooking oil', 'clove garlic', 'chicken broth', 'salt', 'shrimp', 'carrot', 'celery', 'extra chicken broth']

5. 'cooking oil clove garlic beef sweet soy sauce salt sugar'
   → ['cooking oil', 'clove garlic', 'beef', 'sweet soy sauce', 'salt', 'sugar']



In [366]:
def standardize_simple_ingredient_only(ingredient, mapping_dict):
    """
    Args:
        ingredient (str): Single ingredient to standardize
        mapping_dict (dict): Standardization mapping rules
        
    Returns:
        str: Standardized ingredient name
    """
    if not ingredient or pd.isna(ingredient):
        return None
        
    ingredient_clean = str(ingredient).strip().lower()
    
    # Direct mapping lookup only
    if ingredient_clean in mapping_dict:
        return mapping_dict[ingredient_clean]
    
    # If no mapping found, return cleaned original
    return ingredient_clean

In [367]:
def is_compound_ingredient_only(ingredient, predefined_patterns):
    """
    PURE COMPOUND CHECKING FUNCTION - NO PROCESSING LOGIC
    
    Args:
        ingredient (str): Ingredient to check
        predefined_patterns (dict): Known compound patterns
        
    Returns:
        bool: True if compound, False if simple
    """
    if not ingredient or pd.isna(ingredient):
        return False
        
    ingredient_clean = str(ingredient).strip().lower()
    
    # Check if it's in our predefined compound patterns (exact match)
    if ingredient_clean in predefined_patterns:
        return True
    
    return False

In [368]:
# Test compound checking function
print("🧪 TESTING COMPOUND CHECKING FUNCTION")
print("=" * 50)

compound_test_cases = [
    'water',  # Simple
    'yoghurt',  # Simple
    'oil clove garlic beef sweet soy sauce salt sugar rice water tofu',  # Compound
    'rice red sweet potato chicken broth long bean salt margarine',  # Compound
    'chicken',  # Simple
    'cooking oil clove garlic chicken broth salt shrimp carrot celery'  # Compound
]

print("Compound detection tests:")
for ingredient in compound_test_cases:
    is_compound = is_compound_ingredient_only(ingredient, PREDEFINED_COMPOUND_PATTERNS)
    print(f"  '{ingredient}' → {'COMPOUND' if is_compound else 'SIMPLE'}")

🧪 TESTING COMPOUND CHECKING FUNCTION
Compound detection tests:
  'water' → SIMPLE
  'yoghurt' → SIMPLE
  'oil clove garlic beef sweet soy sauce salt sugar rice water tofu' → COMPOUND
  'rice red sweet potato chicken broth long bean salt margarine' → COMPOUND
  'chicken' → SIMPLE
  'cooking oil clove garlic chicken broth salt shrimp carrot celery' → SIMPLE


In [369]:
def process_compound_ingredient_only(ingredient, predefined_patterns, mapping_dict):
    """
    SIMPLIFIED COMPOUND PROCESSING - ONLY USES PREDEFINED PATTERNS
    
    Args:
        ingredient (str): Compound ingredient to split and standardize
        predefined_patterns (dict): Known compound splitting patterns
        mapping_dict (dict): Standardization mapping rules (for individual parts)
        
    Returns:
        list: List of standardized individual ingredients
    """
    if not ingredient or pd.isna(ingredient):
        return []
        
    ingredient_clean = str(ingredient).strip().lower()

    if ingredient_clean in predefined_patterns:
        # Use exact predefined pattern
        individual_parts = predefined_patterns[ingredient_clean]
        print(f"  🎯 Using predefined pattern: '{ingredient_clean}' → {individual_parts}")
    else:
        print(f"  ❌ ERROR: '{ingredient_clean}' not found in predefined patterns!")
        return [ingredient_clean]  # Return as single ingredient
    
    # Apply simple mapping to each individual part
    standardized_parts = []
    for part in individual_parts:
        standardized_part = standardize_simple_ingredient_only(part, mapping_dict)
        standardized_parts.append(standardized_part)
        print(f"    '{part}' → '{standardized_part}'")
    
    return standardized_parts

In [370]:
# Add compound column to existing recipe_ingredient_df
if 'recipe_ingredient_df' in locals() and 'PREDEFINED_COMPOUND_PATTERNS' in locals():
    print("🔄 ADDING COMPOUND COLUMN TO EXISTING DATAFRAME")
    print("=" * 50)
    
    # Apply compound detection to each ingredient
    recipe_ingredient_df['compound'] = recipe_ingredient_df['single_ingredient'].apply(
        lambda ingredient: is_compound_ingredient_only(ingredient, PREDEFINED_COMPOUND_PATTERNS)
    )

display(recipe_ingredient_df[['recipe_id', 'single_ingredient', 'compound']].head(10)   )

🔄 ADDING COMPOUND COLUMN TO EXISTING DATAFRAME


Unnamed: 0,recipe_id,single_ingredient,compound
0,1,cassava,False
1,1,fish,False
2,1,chicken,False
3,1,coconut oil,False
4,1,chicken broth,False
5,1,lime juice,False
6,1,spinach,False
7,2,beef,False
8,2,potato starch,False
9,2,milk,False


In [371]:
def standardized_ingredients_into_df(recipe_ingredient_df, predefined_patterns, mapping_dict):
    """
    Alternative function that directly embeds standardized_ingredient column into the existing DataFrame
    
    Args:
        recipe_ingredient_df: DataFrame with columns ['recipe_id', 'single_ingredient']
        predefined_patterns: Dictionary of known compound patterns
        mapping_dict: Dictionary for simple ingredient standardization
        
    Returns:
        DataFrame: Updated recipe_ingredient_df with standardized_ingredient column
    """
    
    def get_standardized_ingredient(ingredient):
        """Helper function to get standardized ingredient for each row"""
        if pd.isna(ingredient) or not ingredient:
            return None
            
        try:
            # Check if compound
            is_compound = is_compound_ingredient_only(ingredient, predefined_patterns)
            
            if is_compound:
                # For compound ingredients, get the first part (or join all parts)
                individual_parts = process_compound_ingredient_only(ingredient, predefined_patterns, mapping_dict)
                if individual_parts:
                    # Option 1: Return first part
                    return individual_parts[0]
                    # Option 2: Return all parts joined (uncomment below)
                    # return ', '.join(individual_parts)
                else:
                    return str(ingredient).strip().lower()
            else:
                # For simple ingredients, apply direct mapping
                return standardize_simple_ingredient_only(ingredient, mapping_dict)
                
        except Exception as e:
            print(f"❌ Error processing '{ingredient}': {e}")
            return str(ingredient).strip().lower()
    
    # Apply standardization to create new column
    recipe_ingredient_df['standardized_ingredient'] = recipe_ingredient_df['single_ingredient'].apply(get_standardized_ingredient)
    
    # Add metadata columns
    recipe_ingredient_df['is_compound'] = recipe_ingredient_df['single_ingredient'].apply(
        lambda x: is_compound_ingredient_only(x, predefined_patterns) if pd.notna(x) else False
    )
    
    print(f"✅ Successfully embedded standardized_ingredient column!")
    print(f"   Total rows processed: {len(recipe_ingredient_df)}")
    print(f"   Compound ingredients: {recipe_ingredient_df['is_compound'].sum()}")
    print(f"   Simple ingredients: {(~recipe_ingredient_df['is_compound']).sum()}")
    
    return recipe_ingredient_df

standardized_ingredients_into_df(recipe_ingredient_df, PREDEFINED_COMPOUND_PATTERNS, standardization_mapping)

  🎯 Using predefined pattern: 'bumboo teri bubuk sesame seed' → ['anchovy powder', 'sesame seed']
    'anchovy powder' → 'anchovy powder'
    'sesame seed' → 'sesame seed'
  🎯 Using predefined pattern: 'oil clove garlic beef sweet soy sauce salt sugar rice water tofu' → ['oil', 'clove garlic', 'beef', 'sweet soy sauce', 'salt', 'sugar', 'rice', 'water', 'tofu']
    'oil' → 'oil'
    'clove garlic' → 'garlic'
    'beef' → 'beef'
    'sweet soy sauce' → 'sweet soy sauce'
    'salt' → 'salt'
    'sugar' → 'sugar'
    'rice' → 'rice'
    'water' → 'water'
    'tofu' → 'tofu'
  🎯 Using predefined pattern: 'rice cooking oil clove garlic chicken broth salt shrimp carrot celery extra chicken broth' → ['rice', 'cooking oil', 'clove garlic', 'chicken broth', 'salt', 'shrimp', 'carrot', 'celery', 'extra chicken broth']
    'rice' → 'rice'
    'cooking oil' → 'cooking oil'
    'clove garlic' → 'garlic'
    'chicken broth' → 'chicken broth'
    'salt' → 'salt'
    'shrimp' → 'shrimp'
    'carrot' →

Unnamed: 0,recipe_id,recipe_name,single_ingredient,ingredient_position,original_ingredient_string,compound,standardized_ingredient,is_compound
0,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,cassava,1,"cassava,fish,chicken,coconut oil,chicken broth...",False,cassava,False
1,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,fish,2,"cassava,fish,chicken,coconut oil,chicken broth...",False,fish,False
2,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,chicken,3,"cassava,fish,chicken,coconut oil,chicken broth...",False,chicken,False
3,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,coconut oil,4,"cassava,fish,chicken,coconut oil,chicken broth...",False,coconut oil,False
4,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,chicken broth,5,"cassava,fish,chicken,coconut oil,chicken broth...",False,chicken broth,False
...,...,...,...,...,...,...,...,...
8171,1322,Apple Crumble,flour,2,"apples,flour,sugar,coconut,rolled oats,margarine",False,flour,False
8172,1322,Apple Crumble,sugar,3,"apples,flour,sugar,coconut,rolled oats,margarine",False,sugar,False
8173,1322,Apple Crumble,coconut,4,"apples,flour,sugar,coconut,rolled oats,margarine",False,coconut,False
8174,1322,Apple Crumble,rolled oats,5,"apples,flour,sugar,coconut,rolled oats,margarine",False,rolled oats,False


Map Out Ingredient Ids

In [372]:
import re

def clean_ingredient_name_symbols(ingredient_name):
    """
    Clean unwanted symbols, quotes, and brackets from ingredient names
    
    Args:
        ingredient_name: String containing the ingredient name
        
    Returns:
        str: Cleaned ingredient name
    """
    if pd.isna(ingredient_name) or ingredient_name == '':
        return ingredient_name
        
    # Convert to string if not already
    cleaned = str(ingredient_name)
    
    # Remove quotes (single and double)
    cleaned = cleaned.replace("'", "").replace('"', "")
    
    # Remove brackets and parentheses
    cleaned = cleaned.replace("[", "").replace("]", "")
    cleaned = cleaned.replace("(", "").replace(")", "")
    
    # Remove other unwanted symbols but keep hyphens and spaces
    cleaned = re.sub(r'[^\w\s\-]', '', cleaned)
    
    # Clean up extra whitespace
    cleaned = ' '.join(cleaned.split())
    
    # Remove leading/trailing whitespace
    cleaned = cleaned.strip()
    
    return cleaned if cleaned else ingredient_name


In [373]:
def create_ingredient_master_df_with_na_check(recipe_ingredient_df, use_standardized=True):
    """
    Create a master ingredient DataFrame with unique ingredient IDs and handle NA values
    
    Args:
        recipe_ingredient_df: DataFrame with recipe-ingredient relationships
        use_standardized: If True, use standardized_ingredient; if False, use single_ingredient
        
    Returns:
        tuple: (ingredient_df, ingredient_id_mapping, cleaned_df)
    """
    
    # Choose which ingredient column to use
    ingredient_column = 'standardized_ingredient' if use_standardized else 'single_ingredient'
    
    if ingredient_column not in recipe_ingredient_df.columns:
        print(f"❌ Column '{ingredient_column}' not found in recipe_ingredient_df")
        return None, None, None
    
    print(f"🔍 CHECKING FOR NA VALUES IN '{ingredient_column}'")
    print("=" * 60)
    
    # Check for NA values in standardized ingredient column
    na_mask = recipe_ingredient_df[ingredient_column].isna()
    na_count = na_mask.sum()
    total_count = len(recipe_ingredient_df)
    
    print(f"📊 NA Analysis:")
    print(f"   Total rows: {total_count}")
    print(f"   NA values in '{ingredient_column}': {na_count}")
    print(f"   Percentage NA: {(na_count/total_count)*100:.2f}%")
    
    if na_count > 0:
        print(f"\n🔍 Analyzing NA rows...")
        na_rows = recipe_ingredient_df[na_mask].copy()
        
        # Check if there's an 'original_ingredient_string' column to reference
        original_column = None
        possible_original_columns = ['original_ingredient_string', 'single_ingredient', 'ingredient']
        
        for col in possible_original_columns:
            if col in recipe_ingredient_df.columns and col != ingredient_column:
                original_column = col
                break
        
        if original_column:
            print(f"   Using '{original_column}' as reference for original data")
            
            # Check which NA rows have valid original data
            na_with_original = na_rows[na_rows[original_column].notna() & (na_rows[original_column] != '')]
            na_without_original = na_rows[na_rows[original_column].isna() | (na_rows[original_column] == '')]
            
            print(f"\n📋 NA Row Analysis:")
            print(f"   NA rows with valid original data: {len(na_with_original)}")
            print(f"   NA rows without original data: {len(na_without_original)}")
            
            # Show NA rows with valid original data (these need attention)
            if len(na_with_original) > 0:
                print(f"\n⚠️  HIGHLIGHT: NA rows with valid original data (need investigation):")
                display_cols = ['recipe_id', 'recipe_name', original_column, ingredient_column]
                available_cols = [col for col in display_cols if col in na_with_original.columns]
                
                for idx, row in na_with_original.head(10).iterrows():
                    print(f"   Row {idx}:")
                    print(f"     Recipe: {row.get('recipe_name', 'N/A')}")
                    print(f"     Original: '{row.get(original_column, 'N/A')}'")
                    print(f"     Standardized: {row.get(ingredient_column, 'N/A')}")
                    print()
                
                if len(na_with_original) > 10:
                    print(f"     ... and {len(na_with_original) - 10} more rows")
            
            # Drop rows without original data
            if len(na_without_original) > 0:
                print(f"\n🗑️  DROPPING: {len(na_without_original)} rows with no original data")
                # Show sample of rows being dropped
                print("   Sample rows being dropped:")
                for idx, row in na_without_original.head(5).iterrows():
                    print(f"     Row {idx}: Recipe '{row.get('recipe_name', 'N/A')}' - No original ingredient data")
        
        else:
            print(f"   ⚠️  No original ingredient column found for reference")
            print(f"   Available columns: {list(recipe_ingredient_df.columns)}")
    
    # Create cleaned DataFrame (drop rows with NA in standardized ingredient)
    cleaned_df = recipe_ingredient_df.dropna(subset=[ingredient_column]).copy()
    dropped_count = total_count - len(cleaned_df)
    
    if dropped_count > 0:
        print(f"\n✅ Dropped {dropped_count} rows with NA values")
        print(f"   Remaining rows: {len(cleaned_df)}")
    
    # Continue with ingredient master creation using cleaned data
    print(f"\n🔄 Creating ingredient master DataFrame from cleaned data...")
    
    # Get unique ingredients and clean them
    unique_ingredients = cleaned_df[ingredient_column].dropna().unique()
    
    # Clean all ingredient names to remove unwanted symbols
    cleaned_ingredients = [clean_ingredient_name_symbols(ingredient) for ingredient in unique_ingredients]
    
    # Remove duplicates after cleaning and sort
    unique_cleaned_ingredients = sorted(list(set(cleaned_ingredients)))
    
    # Create ingredient master DataFrame
    ingredient_data = []
    ingredient_id_mapping = {}
    
    for idx, ingredient in enumerate(unique_cleaned_ingredients, start=1):
        if ingredient and ingredient.strip():  # Only add non-empty ingredients
            ingredient_data.append({
                'ingredient_id': idx,
                'ingredient_name': ingredient
            })
            ingredient_id_mapping[ingredient] = idx
    
    ingredient_df = pd.DataFrame(ingredient_data)
    
    print(f"\n✅ Created ingredient master DataFrame:")
    print(f"   Total unique ingredients: {len(ingredient_df)}")
    print(f"   Using column: {ingredient_column}")
    print(f"   Cleaned unwanted symbols from ingredient names")
    
    return ingredient_df, ingredient_id_mapping, cleaned_df

# Alternative: Simple function to just check NA values first
def check_na_in_standardized_ingredients(recipe_ingredient_df):
    """
    Quick function to check NA values in standardized ingredients
    """
    ingredient_column = 'standardized_ingredient'
    
    if ingredient_column not in recipe_ingredient_df.columns:
        print(f"❌ Column '{ingredient_column}' not found")
        return
    
    # Check for NA values
    na_mask = recipe_ingredient_df[ingredient_column].isna()
    na_count = na_mask.sum()
    
    print(f"🔍 NA CHECK RESULTS:")
    print(f"   Total rows: {len(recipe_ingredient_df)}")
    print(f"   NA values in standardized_ingredient: {na_count}")
    print(f"   Percentage: {(na_count/len(recipe_ingredient_df))*100:.2f}%")
    
    if na_count > 0:
        na_rows = recipe_ingredient_df[na_mask]
        
        # Check original data
        original_cols = ['original_ingredient_string', 'single_ingredient']
        for col in original_cols:
            if col in recipe_ingredient_df.columns:
                has_original = na_rows[col].notna().sum()
                print(f"   NA rows with valid '{col}': {has_original}")
                
                # Show sample
                if has_original > 0:
                    print(f"\n📋 Sample NA rows with original data:")
                    sample = na_rows[na_rows[col].notna()].head(5)
                    for idx, row in sample.iterrows():
                        print(f"     Row {idx}: '{row[col]}'")
                break
    else:
        print("✅ No NA values found!")

# Usage examples:

# Quick check first
check_na_in_standardized_ingredients(recipe_ingredient_df)

# Then use the enhanced function
ingredient_df, ingredient_id_mapping, cleaned_recipe_df = create_ingredient_master_df_with_na_check(
    recipe_ingredient_df, 
    use_standardized=True
)

🔍 NA CHECK RESULTS:
   Total rows: 8176
   NA values in standardized_ingredient: 13
   Percentage: 0.16%
   NA rows with valid 'original_ingredient_string': 13

📋 Sample NA rows with original data:
     Row 2918: ''
     Row 7783: ''
     Row 7784: ''
     Row 7785: ''
     Row 7786: ''
🔍 CHECKING FOR NA VALUES IN 'standardized_ingredient'
📊 NA Analysis:
   Total rows: 8176
   NA values in 'standardized_ingredient': 13
   Percentage NA: 0.16%

🔍 Analyzing NA rows...
   Using 'original_ingredient_string' as reference for original data

📋 NA Row Analysis:
   NA rows with valid original data: 0
   NA rows without original data: 13

🗑️  DROPPING: 13 rows with no original data
   Sample rows being dropped:
     Row 2918: Recipe 'Fish congee' - No original ingredient data
     Row 7783: Recipe 'Fruity Winter Oats Recipe' - No original ingredient data
     Row 7784: Recipe 'Italian Meatloaf with Salsa' - No original ingredient data
     Row 7785: Recipe 'Mini Meatballs with Pasta' - No origin

In [374]:

if ingredient_df is not None:
    print(f"\n📋 Sample of cleaned ingredient_df:")
    display(ingredient_df.head(10))
    
    print(f"\n🔍 Checking for any remaining unwanted symbols...")
    # Check if any ingredient names still contain unwanted symbols
    has_quotes = ingredient_df['ingredient_name'].str.contains("'|\"", na=False).sum()
    has_brackets = ingredient_df['ingredient_name'].str.contains("\[|\]|\(|\)", na=False).sum()
    
    print(f"   Ingredients with quotes: {has_quotes}")
    print(f"   Ingredients with brackets: {has_brackets}")
    
    if has_quotes > 0 or has_brackets > 0:
        print("   ⚠️  Some unwanted symbols still found!")
        problematic = ingredient_df[
            ingredient_df['ingredient_name'].str.contains("'|\"|\[|\]|\(|\)", na=False)
        ]
        print("   Sample problematic ingredients:")
        display(problematic.head())
    else:
        print("   ✅ All ingredient names are clean!")




📋 Sample of cleaned ingredient_df:


Unnamed: 0,ingredient_id,ingredient_name
0,1,acai
1,2,agar-agar powder
2,3,all purpose flour
3,4,almond
4,5,almond butter
5,6,almond milk
6,7,almond nuts
7,8,aluminum foil
8,9,ambon banana
9,10,ambrosia fruit



🔍 Checking for any remaining unwanted symbols...
   Ingredients with quotes: 0
   Ingredients with brackets: 0
   ✅ All ingredient names are clean!


Map Out to the RecipeIngredient

In [375]:
def map_ingredient_ids_to_recipe_df(recipe_ingredient_df, ingredient_id_mapping, use_standardized=True):
    """
    Map ingredient IDs to the recipe ingredient DataFrame using cleaned ingredient names
    
    Args:
        recipe_ingredient_df: DataFrame with recipe-ingredient relationships
        ingredient_id_mapping: Dictionary mapping cleaned ingredient names to IDs
        use_standardized: If True, use standardized_ingredient; if False, use single_ingredient
        
    Returns:
        DataFrame: Updated recipe_ingredient_df with ingredient_id column
    """

    
    # Choose which ingredient column to use
    ingredient_column = 'standardized_ingredient' if use_standardized else 'single_ingredient'
    
    if ingredient_column not in recipe_ingredient_df.columns:
        print(f"❌ Column '{ingredient_column}' not found in recipe_ingredient_df")
        return recipe_ingredient_df
    
    # Create a copy to avoid modifying the original
    final_recipe_ingredient = recipe_ingredient_df.copy()
    
    # Clean the ingredient names in the DataFrame and map to IDs
    def get_ingredient_id(ingredient_name):
        if pd.isna(ingredient_name):
            return None
        cleaned_name = clean_ingredient_name_symbols(ingredient_name)
        return ingredient_id_mapping.get(cleaned_name, None)
    
    final_recipe_ingredient['ingredient_id'] = final_recipe_ingredient[ingredient_column].apply(get_ingredient_id)
    
    # Check mapping results
    total_rows = len(final_recipe_ingredient)
    mapped_rows = final_recipe_ingredient['ingredient_id'].notna().sum()
    unmapped_rows = total_rows - mapped_rows
    
    print(f"✅ Ingredient ID mapping complete:")
    print(f"   Total rows: {total_rows}")
    print(f"   Successfully mapped: {mapped_rows} ({(mapped_rows/total_rows)*100:.1f}%)")
    print(f"   Unmapped: {unmapped_rows} ({(unmapped_rows/total_rows)*100:.1f}%)")
    
    if unmapped_rows > 0:
        print(f"\n🔍 Sample unmapped ingredients:")
        unmapped_sample = final_recipe_ingredient[final_recipe_ingredient['ingredient_id'].isna()][ingredient_column].dropna().unique()[:5]
        for ingredient in unmapped_sample:
            cleaned = clean_ingredient_name_symbols(ingredient)
            print(f"   Original: '{ingredient}' → Cleaned: '{cleaned}'")
    
    return final_recipe_ingredient

In [376]:
recipe_ingredient_df.columns.to_list()

['recipe_id',
 'recipe_name',
 'single_ingredient',
 'ingredient_position',
 'original_ingredient_string',
 'compound',
 'standardized_ingredient',
 'is_compound']

In [377]:
# Step 2: Map ingredient IDs back to recipe DataFrame

recipe_ingredient_df_with_ids = map_ingredient_ids_to_recipe_df(
    recipe_ingredient_df,
    ingredient_id_mapping,
    use_standardized=True
)

print(f"\n📋 Sample of updated recipe_ingredient_df with IDs:")
sample_cols = ['recipe_id', 'single_ingredient', 'standardized_ingredient', 'ingredient_id']
available_cols = [col for col in sample_cols if col in recipe_ingredient_df_with_ids.columns]
display(recipe_ingredient_df_with_ids[available_cols].head(10))

print(f"\n📊 Final DataFrame Statistics:")
print(f"   Recipe DataFrame shape: {recipe_ingredient_df_with_ids.shape}")
print(f"   Unique recipes: {recipe_ingredient_df_with_ids['recipe_id'].nunique()}")
print(f"   Unique ingredients: {len(ingredient_df)}")
print(f"   Recipe-ingredient relationships: {len(recipe_ingredient_df_with_ids)}")


✅ Ingredient ID mapping complete:
   Total rows: 8176
   Successfully mapped: 8163 (99.8%)
   Unmapped: 13 (0.2%)

🔍 Sample unmapped ingredients:

📋 Sample of updated recipe_ingredient_df with IDs:


Unnamed: 0,recipe_id,single_ingredient,standardized_ingredient,ingredient_id
0,1,cassava,cassava,154.0
1,1,fish,fish,326.0
2,1,chicken,chicken,181.0
3,1,coconut oil,coconut oil,230.0
4,1,chicken broth,chicken broth,184.0
5,1,lime juice,lime juice,496.0
6,1,spinach,spinach,813.0
7,2,beef,beef,72.0
8,2,potato starch,potato starch,668.0
9,2,milk,milk,537.0



📊 Final DataFrame Statistics:
   Recipe DataFrame shape: (8176, 9)
   Unique recipes: 1322
   Unique ingredients: 990
   Recipe-ingredient relationships: 8176


In [378]:
def drop_unmapped_ingredients(recipe_ingredient_df_with_ids):
    """
    Drop rows where ingredient_id is null (unmapped ingredients)
    
    Args:
        recipe_ingredient_df_with_ids: DataFrame with ingredient_id column
        
    Returns:
        DataFrame: Cleaned DataFrame without unmapped ingredients
    """
    print("🗑️ DROPPING UNMAPPED INGREDIENTS")
    print("=" * 40)
    
    # Count before dropping
    total_before = len(recipe_ingredient_df_with_ids)
    unmapped_count = recipe_ingredient_df_with_ids['ingredient_id'].isna().sum()
    
    print(f"Before dropping:")
    print(f"   Total rows: {total_before}")
    print(f"   Unmapped ingredients: {unmapped_count}")
    
    if unmapped_count > 0:
        # Show sample of what will be dropped
        print(f"\n📋 Sample of rows to be dropped:")
        unmapped_sample = recipe_ingredient_df_with_ids[
            recipe_ingredient_df_with_ids['ingredient_id'].isna()
        ][['recipe_id', 'single_ingredient', 'standardized_ingredient']].head(5)
        
        for idx, row in unmapped_sample.iterrows():
            print(f"   Row {idx}: Recipe {row['recipe_id']} - '{row['single_ingredient']}'")
        
        # Drop unmapped ingredients
        cleaned_df = recipe_ingredient_df_with_ids.dropna(subset=['ingredient_id'])
        
        # Reset index
        cleaned_df = cleaned_df.reset_index(drop=True)
        
        # Show results
        total_after = len(cleaned_df)
        dropped_count = total_before - total_after
        
        print(f"\n✅ After dropping:")
        print(f"   Total rows: {total_after}")
        print(f"   Dropped rows: {dropped_count}")
        print(f"   Retention rate: {(total_after/total_before)*100:.1f}%")
        
        return cleaned_df
    
    else:
        print("✅ No unmapped ingredients found - nothing to drop!")
        return recipe_ingredient_df_with_ids

# Usage
cleaned_recipe_ingredient_df = drop_unmapped_ingredients(recipe_ingredient_df_with_ids)

🗑️ DROPPING UNMAPPED INGREDIENTS
Before dropping:
   Total rows: 8176
   Unmapped ingredients: 13

📋 Sample of rows to be dropped:
   Row 2918: Recipe 449 - 'None'
   Row 7783: Recipe 1246 - 'None'
   Row 7784: Recipe 1247 - 'None'
   Row 7785: Recipe 1248 - 'None'
   Row 7786: Recipe 1249 - 'None'

✅ After dropping:
   Total rows: 8163
   Dropped rows: 13
   Retention rate: 99.8%


In [379]:
recipe_ingredient = cleaned_recipe_ingredient_df.copy()

#take recipe_id and ingredient_id only
recipe_ingredient = recipe_ingredient[['recipe_id', 'ingredient_id']].drop_duplicates().reset_index(drop=True)

print(f"\n📋 Final Recipe-Ingredient DataFrame:")
print(f"   Shape: {recipe_ingredient.shape}")
print(f"   Unique recipe IDs: {recipe_ingredient['recipe_id'].nunique()}")
print(f"   Unique ingredient IDs: {recipe_ingredient['ingredient_id'].nunique()}")


📋 Final Recipe-Ingredient DataFrame:
   Shape: (8117, 2)
   Unique recipe IDs: 1309
   Unique ingredient IDs: 990


##### Determine Allergen (ingredient and recipe)

In [380]:
allergen_map = pd.read_excel('1st_dataset.xlsx', sheet_name='allergen')
print("Countrymap columns:", allergen_map.columns.tolist())
print("Countrymap shape:", allergen_map.shape)
print("\nFirst 5 rows of allergen:")
display(allergen_map.head())

Countrymap columns: ['pk', 'name', 'description']
Countrymap shape: (8, 3)

First 5 rows of allergen:


Unnamed: 0,pk,name,description
0,1,dairy,Common infant allergen. Found in dairy products.
1,2,egg,Can cause allergy even in small amounts.
2,3,soy,"Plant-based but common allergen, especially in..."
3,4,tree_nuts,Tree nuts are a top allergen. Avoid early intr...
4,5,peanuts,Peanuts are a major allergen. Introduce carefu...


In [381]:
# Load the module
import importlib

allergen_path = os.path.join(os.getcwd(), "preprocessing_techniques", "mappers", "allergen_mapper.py")

spec = importlib.util.spec_from_file_location("allergen", allergen_path)
allergen_file = importlib.util.module_from_spec(spec)
spec.loader.exec_module(allergen_file)

# Access the config
allergen_tags = allergen_file.ALLERGEN_TAGS
pprint(allergen_tags)

{'egg': {'allergen_group': 'egg',
         'keywords': ['egg',
                      'eggs',
                      'egg white',
                      'egg yolk',
                      'omelette',
                      'lecithin',
                      'chicken egg',
                      'duck egg',
                      'quail egg'],
         'notes': 'Can cause allergy even in small amounts.'},
 'fish': {'allergen_group': 'fish',
          'keywords': ['salmon',
                       'cod',
                       'tuna',
                       'white fish',
                       'trout',
                       'sardines',
                       'catfish',
                       'tilapia',
                       'codfish',
                       'sardine',
                       'anchovy',
                       'dorado fish',
                       'mackerel',
                       'tenggiri fish',
                       'snapper',
                       'basa fish',
             

In [382]:
ingredient_df

Unnamed: 0,ingredient_id,ingredient_name
0,1,acai
1,2,agar-agar powder
2,3,all purpose flour
3,4,almond
4,5,almond butter
...,...,...
985,986,young beans
986,987,young coconut
987,988,young corn
988,989,zucchini


In [383]:
def calculate_match_confidence(ingredient_name, keyword):
    """Calculate confidence score for partial matches."""
    confidence = 0.6
    if len(keyword) >= len(ingredient_name) * 0.5:
        confidence += 0.2
    if ingredient_name.startswith(keyword) or ingredient_name.endswith(keyword):
        confidence += 0.2
    if re.search(r'\b' + re.escape(keyword) + r'\b', ingredient_name):
        confidence += 0.1
    if len(keyword) <= 3 and len(ingredient_name) >= 10:
        confidence -= 0.2
    return min(0.95, max(0.0, confidence))

In [384]:
def detect_allergen(ingredient_name, ALLERGEN_TAGS):
    """
    Detect allergen in an ingredient name using a two-step strategy:
    1. Check for exclusion terms
    2. Exact then partial keyword matching
    
    Returns:
        dict: {
            'allergen_group': str or None,
            'match_type': 'exact'/'partial' or None,
            'matched_keyword': str or None,
            'confidence': float
        }
    """
    # Normalize input
    ingredient_lower = ingredient_name.lower().strip()

    # Exclusion Terms
    exclusion_terms = ["breast milk", "breastmilk", "formula milk", "formula", "breast"]
    if any(term in ingredient_lower for term in exclusion_terms):
        return {
            'allergen_group': None,
            'match_type': None,
            'matched_keyword': None,
            'confidence': 0.0
        }

    # Step 1: Exact match
    for allergen_name, data in ALLERGEN_TAGS.items():
        keywords = data.get("keywords", [])
        allergen_group = data.get("allergen_group", allergen_name)
        for keyword in keywords:
            if keyword.lower().strip() == ingredient_lower:
                return {
                    'allergen_group': allergen_group,
                    'match_type': 'exact',
                    'matched_keyword': keyword,
                    'confidence': 1.0
                }

    # Step 2: Partial match
    best_match = None
    for allergen_name, data in ALLERGEN_TAGS.items():
        keywords = data.get("keywords", [])
        allergen_group = data.get("allergen_group", allergen_name)
        for keyword in keywords:
            keyword_lower = keyword.lower().strip()
            if keyword_lower in ingredient_lower:
                confidence = calculate_match_confidence(ingredient_lower, keyword_lower)
                if not best_match or confidence > best_match['confidence']:
                    best_match = {
                        'allergen_group': allergen_group,
                        'match_type': 'partial',
                        'matched_keyword': keyword,
                        'confidence': confidence
                    }

    if best_match and best_match['confidence'] >= 0.5:
        return best_match

    return {
        'allergen_group': None,
        'match_type': None,
        'matched_keyword': None,
        'confidence': 0.0
    }

In [385]:
def get_allergen_group_id(allergen_group, allergen_mapping):
    """Map allergen group name to ID."""
    return allergen_mapping.get(allergen_group, None)

In [386]:
def process_ingredients_with_allergens(df, allergen_df):

    allergen_mapping = dict(zip(allergen_df['name'], allergen_df['pk']))

    results = []
    for _, row in df.iterrows():
        result = detect_allergen(row['ingredient_name'],allergen_tags)
        result['ingredient_id'] = row['ingredient_id']
        result['ingredient_name'] = row['ingredient_name']
        result['allergen_group_id'] = get_allergen_group_id(result['allergen_group'], allergen_mapping)
        result['isAllergen'] = result['allergen_group_id'] is not None
        results.append(result)

    result_df = pd.DataFrame(results)
    return result_df[result_df.columns.tolist()]

In [388]:
test_data = [
    {"ingredient_id": 1, "ingredient_name": "Whole Milk"},
    {"ingredient_id": 2, "ingredient_name": "Breast Milk"},
    {"ingredient_id": 3, "ingredient_name": "Almond Butter"},
    {"ingredient_id": 4, "ingredient_name": "Chicken Breast"},
    {"ingredient_id": 5, "ingredient_name": "Salmon Fillet"}
]
testing = pd.DataFrame(test_data)

allergen_df = pd.DataFrame(allergen_tags)

final_df = process_ingredients_with_allergens(testing, allergen_map)
print(final_df.to_string(index=False))
display(final_df)

allergen_group match_type matched_keyword  confidence  ingredient_id ingredient_name  allergen_group_id  isAllergen
         dairy      exact      whole milk         1.0              1      Whole Milk                1.0        True
          None       None            None         0.0              2     Breast Milk                NaN       False
     tree_nuts      exact   almond butter         1.0              3   Almond Butter                4.0        True
          None       None            None         0.0              4  Chicken Breast                NaN       False
          fish    partial          salmon         0.9              5   Salmon Fillet                6.0        True


Unnamed: 0,allergen_group,match_type,matched_keyword,confidence,ingredient_id,ingredient_name,allergen_group_id,isAllergen
0,dairy,exact,whole milk,1.0,1,Whole Milk,1.0,True
1,,,,0.0,2,Breast Milk,,False
2,tree_nuts,exact,almond butter,1.0,3,Almond Butter,4.0,True
3,,,,0.0,4,Chicken Breast,,False
4,fish,partial,salmon,0.9,5,Salmon Fillet,6.0,True


In [390]:
ingredient_df =process_ingredients_with_allergens(testing, allergen_map)

display(ingredient_df.head(10))

Unnamed: 0,allergen_group,match_type,matched_keyword,confidence,ingredient_id,ingredient_name,allergen_group_id,isAllergen
0,dairy,exact,whole milk,1.0,1,Whole Milk,1.0,True
1,,,,0.0,2,Breast Milk,,False
2,tree_nuts,exact,almond butter,1.0,3,Almond Butter,4.0,True
3,,,,0.0,4,Chicken Breast,,False
4,fish,partial,salmon,0.9,5,Salmon Fillet,6.0,True


In [393]:
final_ingredient_df = ingredient_df.copy()
final_ingredient_df = final_ingredient_df.rename(columns={
    'ingredient_id': 'pk',
    'ingredient_name': 'name',
    'allergen_group_id': 'allergen_group_id',
    'isAllergen': 'isAllergen'})

#final_ingredient_df drop column other than pk, name, allergen_group_id, isAllergen
final_ingredient_df = final_ingredient_df[['pk', 'name', 'allergen_group_id', 'isAllergen']]
display(final_ingredient_df.head(10))

Unnamed: 0,pk,name,allergen_group_id,isAllergen
0,1,Whole Milk,1.0,True
1,2,Breast Milk,,False
2,3,Almond Butter,4.0,True
3,4,Chicken Breast,,False
4,5,Salmon Fillet,6.0,True


recipe

In [394]:
def detect_allergens(recipe_data, ALLERGEN_TAGS):
    ner_ingredients = recipe_data.get("ner_ingredient_string", [])
    # Normalize ingredients (make sure they're lowercase and clean)
    cleaned_ingredients = [str(i).strip().lower() for i in ner_ingredients if i]

    # Combine into one searchable string
    combined_text = ' '.join(cleaned_ingredients)
    matched_allergens = []

    # Define exclusion terms for breast milk and formula
    exclusion_terms = ["breast milk", "breastmilk", "formula milk", "formula", "breast"]
    # First check if the combined text contains any exclusion terms
    has_exclusion_terms = any(exclusion in combined_text for exclusion in exclusion_terms)
    for allergen, data in ALLERGEN_TAGS.items():
        keywords = [kw.lower() for kw in data["keywords"]]
        
        # For milk allergen specifically, skip entirely if exclusion terms found
        if allergen == "milk" and has_exclusion_terms:
            continue
            
        for keyword in keywords:
            if keyword in combined_text:
                # Check if this keyword match is actually part of an exclusion term
                is_excluded = False
                
                # For example, if "milk" is found but it's part of "breast milk"
                for exclusion in exclusion_terms:
                    # Check all possible positions where keyword could be within exclusion term
                    if (exclusion.startswith(keyword + " ") or 
                        exclusion.endswith(" " + keyword) or 
                        " " + keyword + " " in exclusion or 
                        exclusion == keyword):
                        
                        # Only exclude if this exact exclusion term is in the text
                        if exclusion in combined_text:
                            is_excluded = True
                            break
                
                if not is_excluded:
                    matched_allergens.append(allergen)
                    break  # No need to check other keywords for this allergen

    return list(set(matched_allergens))  # Remove duplicates

In [395]:
df['allergen'] = df.apply(
    lambda row: detect_allergens(row, allergen_tags),
    axis=1
)

# Update hypoallergenic column based on allergen data
df['hypoallergenic'] = df['allergen'].apply(
    lambda allergens: "Yes" if not allergens or len(allergens) == 0 else "No"
)

# Display results to verify
df[['name', 'allergen', 'hypoallergenic', 'choking_hazards']].head(10)

Unnamed: 0,name,allergen,hypoallergenic,choking_hazards
0,Cassava Porridge with Fish Sauce and Lemon (Bu...,[],Yes,No
1,Bitterballs (Bitterballen),[],Yes,No
2,Broccoli/Cauliflower Cheese,[],Yes,No
3,Vegetable Fingers,[],Yes,No
4,Beef Casserole,[],Yes,No
5,Toast Fingers,[],Yes,No
6,Pumpkin Polenta Fingers,[],Yes,No
7,Rice Pudding,[],Yes,No
8,Hummus,[],Yes,No
9,Baked Bean Pie,[],Yes,No


##### Classify Category of The Recipe

In [314]:
# Load the module
import importlib

category_path = os.path.join(os.getcwd(), "preprocessing_techniques", "mappers", "category_dict.py")

spec = importlib.util.spec_from_file_location("category", category_path)
category_file = importlib.util.module_from_spec(spec)
spec.loader.exec_module(category_file)

# Access the config
dietary_tags  = category_file.dietary_tags
pprint(dietary_tags)

{'dairy_free': {'excluded_allergen_groups': ['milk'],
                'excluded_ingredients': ['cheese',
                                         'butter',
                                         'yogurt',
                                         'milk',
                                         'whey',
                                         'casein',
                                         'cream',
                                         'ice cream']},
 'egg_free': {'excluded_allergen_groups': ['egg'],
              'excluded_ingredients': ['eggs',
                                       'egg white',
                                       'egg yolk',
                                       'lecithin',
                                       'omelette']},
 'gluten_free': {'excluded_allergen_groups': ['gluten'],
                 'excluded_ingredients': ['wheat',
                                          'barley',
                                          'rye',
                        

In [396]:
category_map = pd.read_excel('1st_dataset.xlsx', sheet_name='category')
print("Category columns:", category_map.columns.tolist())
print("Category shape:", category_map.shape)
print("\nFirst 5 rows of Category:")
display(category_map.head())

Category columns: ['pk', 'name', 'description']
Category shape: (11, 3)

First 5 rows of Category:


Unnamed: 0,pk,name,description
0,1,vegan,Allowed categories: plant_based\nExcluded alle...
1,2,vegetarian,"Allowed categories: plant_based, dairy, egg\nE..."
2,3,pescetarian,"Allowed categories: plant_based, dairy, egg, s..."
3,4,dairy free,Excluded allergen groups: milk\nExcluded ingre...
4,5,egg free,Excluded allergen groups: egg\nExcluded ingred...


In [315]:
def classify_recipe_categories(recipe_data, dietary_tags):
    """
    Classifies a recipe into multiple dietary preference categories.
    
    Parameters:
        recipe_data (dict): Must contain:
            - 'ner_ingredients' (list or str)
            - 'allergens' (list of detected allergen groups like ['milk', 'soy'])
            - 'texture' (optional, for stage-based filtering)
        
        dietary_tags (dict): Dictionary mapping diet name → rules
    
    Returns:
        dict: {
            "matched_tags": list[str]  # e.g., ['vegan', 'soy', 'halal']
        }
    """
    ner_ingredient = recipe_data.get("ner_ingredient", [])
    # Normalize ingredients (make sure they're lowercase and clean)
    cleaned_ingredients = [str(i).strip().lower() for i in ner_ingredient if i]
    combined_text = ' '.join(cleaned_ingredients)

    detected_allergens = recipe_data.get("allergen", [])
    allergen = [str(i).strip().lower() for i in detected_allergens if i]
    allergen_text = ' '.join(allergen).lower()
    matched_tags = []

    for tag, rules in dietary_tags.items():
        exclude_found = False

        # Step 1: Check excluded ingredients
        if "excluded_ingredients" in rules:
            for word in rules["excluded_ingredients"]:
                if word.lower() in combined_text:
                    exclude_found = True
                    break

        # Step 2: Check excluded allergen groups
        if "excluded_allergen_groups" in rules:
            for group in rules["excluded_allergen_groups"]:
                if group in allergen_text:
                    exclude_found = True
                    break

        # Step 3: Required ingredients check (for non_halal etc.)
        if "required_ingredients" in rules:
            required_match = any(word in combined_text for word in rules["required_ingredients"])
            if not required_match:
                continue  # Skip unless match found

        # If no exclusion → match this tag
        if not exclude_found:
            matched_tags.append(tag)
            
    if not matched_tags and "non_veg" in dietary_tags:
        matched_tags.append("non_veg")    
    
    
    return matched_tags

In [316]:
def classify_recipe(recipe_data, dietary_tags):
    ner_ingredient = recipe_data.get("ner_ingredient", [])
    # Normalize ingredients (make sure they're lowercase and clean)
    cleaned_ingredients = [str(i).strip().lower() for i in ner_ingredient if i]
    combined_text = ' '.join(cleaned_ingredients)

    detected_allergens = recipe_data.get("allergen", [])
    allergen = [str(i).strip().lower() for i in detected_allergens if i]
    allergen_text = ' '.join(allergen).lower()

    matched_tags = []

    # Step 1: Match all possible tags
    for tag, rules in dietary_tags.items():
        exclude_found = False

        # Step 1a: Check excluded ingredients
        if "excluded_ingredients" in rules:
            for word in rules["excluded_ingredients"]:
                if word.lower() in combined_text:
                    exclude_found = True
                    break

        # Step 1b: Check excluded allergen groups
        if "excluded_allergen_groups" in rules:
            for group in rules["excluded_allergen_groups"]:
                if group in allergen_text:
                    exclude_found = True
                    break

        # Step 1c: Required ingredients check (e.g., for non_halal)
        if "required_ingredients" in rules:
            required_match = any(word in combined_text for word in rules["required_ingredients"])
            if not required_match:
                continue

        if not exclude_found:
            matched_tags.append(tag)

    # Define categories
    GENERAL_DIET_TAGS = ["vegan", "vegetarian", "pescetarian", "non_veg"]
    RELIGIOUS_TAGS = ["halal", "non_halal"]
    ALLERGEN_TAGS = ["dairy_free", "egg_free", "soy_free", "nut_free", "gluten_free"]

    result = {
        "general_diet": None,
        "religious_tag": None,
        "allergen_tags": [],
    }

    # Step 2: Select only one general diet (priority-based)
    for tag in GENERAL_DIET_TAGS:
        if tag in matched_tags:
            result["general_diet"] = tag
            break
    else:
        # Fallback (shouldn't happen unless none of the 4 are valid)
        result["general_diet"] = "non_veg"

    # Step 3: Select at most one religious tag (prefer halal over non_halal)
    for tag in RELIGIOUS_TAGS:
        if tag in matched_tags:
            result["religious_tag"] = tag
            break

    # Step 4: Select all applicable allergen-friendly tags
    result["allergen_tags"] = [tag for tag in matched_tags if tag in ALLERGEN_TAGS]

    values = []
    
    # Add general diet if present
    if result.get("general_diet"):
        values.append(result["general_diet"])
    
    # Add religious tag if present
    if result.get("religious_tag"):
        values.append(result["religious_tag"])
    
    # Add allergen tags if present
    allergen_tags = result.get("allergen_tags", [])
    if allergen_tags:
        values.extend(allergen_tags)
    
    # Join all values with commas
    return ", ".join(values)

    return result

In [317]:
sample_row = df.iloc[0].copy()  
classified_tags = classify_recipe(sample_row, dietary_tags)
print("Classified Tags:")
print(classified_tags)

Classified Tags:
vegan, halal, dairy_free, egg_free, soy_free, nut_free, gluten_free


In [318]:
df['dietary_tags'] = df.apply(
    lambda row: classify_recipe(row, dietary_tags),
    axis=1)

df[['ner_ingredient', 'dietary_tags', 'choking_hazard']].head(10)

Unnamed: 0,ner_ingredient,dietary_tags,choking_hazard
0,"['cassava', 'fish', 'chicken', 'coconut oil', ...","vegan, halal, dairy_free, egg_free, soy_free, ...",
1,"['beef', 'potato starch', 'milk', 'egg', 'marg...","vegan, halal, dairy_free, egg_free, soy_free, ...",
2,"['cauliflower', 'broccoli', 'margarine', 'flou...","vegan, halal, dairy_free, egg_free, soy_free, ...",
3,"['carrot', 'potato', 'sweet potato']","vegan, halal, dairy_free, egg_free, soy_free, ...",
4,"['onion', 'vegetable oil', 'beef', 'steak', 'c...","vegan, halal, dairy_free, egg_free, soy_free, ...",
5,['wholemeal bread'],"vegan, halal, dairy_free, egg_free, soy_free, ...",
6,"['water', 'polenta', 'pumpkin', 'parmesan chee...","vegan, halal, dairy_free, egg_free, soy_free, ...",
7,"['rice', 'milk', 'sugar', 'vanilla essence']","vegan, halal, dairy_free, egg_free, soy_free, ...",
8,"['chickpeas', 'garlic', 'lemon juice', 'milk',...","vegan, halal, dairy_free, egg_free, soy_free, ...",
9,"['baked beans', 'zucchini', 'potatoes', 'milk'...","vegan, halal, dairy_free, egg_free, soy_free, ...",


Populating Nutrition Value

In [319]:
# Check count of texture None values
none_count = df[df['texture'].isna()].shape[0]  # For actual None/NaN values
print(f"Number of None values in texture column: {none_count}")

# Check count of "None" string values
none_string_count = df[df['texture'] == "None"].shape[0]
print(f"Number of 'None' string values in texture column: {none_string_count}")

# Check count of empty strings
empty_string_count = df[df['texture'] == ""].shape[0]  
print(f"Number of empty strings in texture column: {empty_string_count}")

# Check count of "NONE" uppercase string values (as seen in your code elsewhere)
none_upper_count = df[df['texture'] == "NONE"].shape[0]
print(f"Number of 'NONE' values in texture column: {none_upper_count}")

Number of None values in texture column: 0
Number of 'None' string values in texture column: 0
Number of empty strings in texture column: 0
Number of 'NONE' values in texture column: 814


In [320]:
# Flatten the list of lists to get all unique ingredients
all_ingredients = set()
for ingredient_list in df['ner_ingredient']:
    if isinstance(ingredient_list, list):
        all_ingredients.update(ingredient_list)

print(f"Total unique ingredients found: {len(all_ingredients)}")
print("Unique ingredients:")
for ingredient in sorted(all_ingredients):
    print(f"- {ingredient}")

Total unique ingredients found: 0
Unique ingredients:


In [321]:
# Count occurrences of each unique ingredient
ingredient_counts = {}
for ingredient_list in df['ner_ingredient']:
    if isinstance(ingredient_list, list):
        for ingredient in ingredient_list:
            if ingredient in ingredient_counts:
                ingredient_counts[ingredient] += 1
            else:
                ingredient_counts[ingredient] = 1

# Sort ingredients by frequency (most common first)
sorted_ingredients = sorted(ingredient_counts.items(), key=lambda x: x[1], reverse=True)

# Print the results
print(f"Total unique ingredients found: {len(ingredient_counts)}")
print("\nIngredient frequency (sorted by most common):")
for ingredient, count in sorted_ingredients:
    print(f"- {ingredient}: {count} recipes")

# Alternatively, print alphabetically with counts
print("\nIngredient frequency (sorted alphabetically):")
for ingredient in sorted(ingredient_counts.keys()):
    count = ingredient_counts[ingredient]
    print(f"- {ingredient}: {count} recipes")

Total unique ingredients found: 0

Ingredient frequency (sorted by most common):

Ingredient frequency (sorted alphabetically):


In [322]:
# Step 1: Create unique ingredients dataframe with ingredient IDs
unique_ingredients_list = sorted(list(all_ingredients))

# Create ingredients dataframe with IDs
ingredients_df = pd.DataFrame({
    'ingredient_id': range(1, len(unique_ingredients_list) + 1),
    'ingredient_name': unique_ingredients_list
})

print(f"Created ingredients dataframe with {len(ingredients_df)} unique ingredients")
print("\nFirst 10 ingredients:")
print(ingredients_df.head(10))
print("\nLast 5 ingredients:")
print(ingredients_df.tail())

Created ingredients dataframe with 0 unique ingredients

First 10 ingredients:
Empty DataFrame
Columns: [ingredient_id, ingredient_name]
Index: []

Last 5 ingredients:
Empty DataFrame
Columns: [ingredient_id, ingredient_name]
Index: []


In [323]:
# Step 2: Function to detect allergens for a single ingredient
def detect_ingredient_allergens(ingredient_name, allergen_tags):
    """
    Detect which allergens an ingredient contains based on allergen tags.
    
    Args:
        ingredient_name (str): Name of the ingredient
        allergen_tags (dict): Dictionary of allergen tags with keywords
    
    Returns:
        list: List of allergen names that match the ingredient
    """
    ingredient_lower = ingredient_name.lower().strip()
    detected_allergens = []
    
    for allergen_name, allergen_data in allergen_tags.items():
        keywords = allergen_data.get('keywords', [])
        allergen_group = allergen_data.get('allergen_group', allergen_name)  # Get allergen_group, fallback to allergen_name
                
        # Check if any keyword matches the ingredient
        for keyword in keywords:
            # print(f"Checking ingredient '{ingredient_name}' against allergen '{allergen_name}' with keyword '{keyword}'")
            if keyword.lower().strip() == ingredient_lower:
                # print(f"Detected allergen '{allergen_name}' for ingredient '{ingredient_name}'")
                if allergen_name not in detected_allergens:
                    detected_allergens.append(allergen_group)
                    # detected_allergens.append(allergen_name)
                break  # No need to check other keywords for this allergen
    
    return detected_allergens

# Test the function with a few examples
test_ingredients = ['milk', 'egg', 'tofu', 'rice', 'peanut butter']
print("Testing allergen detection:")
for ingredient in test_ingredients:
    allergens = detect_ingredient_allergens(ingredient, allergen_tags)
    print(f"- {ingredient}: {allergens}")

Testing allergen detection:
- milk: ['dairy']
- egg: ['egg']
- tofu: ['soy']
- rice: []
- peanut butter: ['peanuts']


In [324]:
print("\nDetecting allergens for all ingredients...")
ingredients_df['detected_allergens'] = ingredients_df['ingredient_name'].apply(
    lambda x: detect_ingredient_allergens(x, allergen_tags)
)

# Filter ingredients with no allergens (empty list)
ingredients_with_no_allergens = ingredients_df[ingredients_df['detected_allergens'].apply(len) == 0]

# Store them in a separate dataframe
null_allergen_ingredients_df = ingredients_with_no_allergens[['ingredient_id', 'ingredient_name']].copy()


Detecting allergens for all ingredients...


In [325]:

print(f"\nFound {len(null_allergen_ingredients_df)} ingredients with NO allergens detected:")
print("\nIngredients with no allergens:")
for _, row in null_allergen_ingredients_df.iterrows():
    print(f"- ID: {row['ingredient_id']}, Name: '{row['ingredient_name']}'")

# Show summary statistics
print(f"\n📊 Allergen Detection Summary:")
print(f"- Total ingredients: {len(ingredients_df)}")
print(f"- Ingredients with allergens: {len(ingredients_df) - len(null_allergen_ingredients_df)}")
print(f"- Ingredients with NO allergens: {len(null_allergen_ingredients_df)}")
print(f"- Percentage with no allergens: {len(null_allergen_ingredients_df)/len(ingredients_df)*100:.1f}%")

# Save the null allergen ingredients to a separate Excel file if needed
null_allergen_ingredients_df.to_excel('ingredients_with_no_allergens.xlsx', index=False)
print(f"\n✅ Saved {len(null_allergen_ingredients_df)} ingredients with no allergens to 'ingredients_with_no_allergens.xlsx'")


Found 0 ingredients with NO allergens detected:

Ingredients with no allergens:

📊 Allergen Detection Summary:
- Total ingredients: 0
- Ingredients with allergens: 0
- Ingredients with NO allergens: 0


ZeroDivisionError: division by zero

## 🔗 Connect Allergen Group IDs with Ingredient IDs

Based on allergen tags mapping to ensure 1 ingredient maps to 1 primary allergen only.

In [None]:
def detect_allergens(recipe_data, ALLERGEN_TAGS):
    ner_ingredients = recipe_data.get("ner_ingredient_string", [])
    # Normalize ingredients (make sure they're lowercase and clean)
    cleaned_ingredients = [str(i).strip().lower() for i in ner_ingredients if i]

    # Combine into one searchable string
    combined_text = ' '.join(cleaned_ingredients)
    matched_allergens = []

    # Define exclusion terms for breast milk and formula
    exclusion_terms = ["breast milk", "breastmilk", "formula milk", "formula", "breast"]
    # First check if the combined text contains any exclusion terms
    has_exclusion_terms = any(exclusion in combined_text for exclusion in exclusion_terms)
    for allergen, data in ALLERGEN_TAGS.items():
        keywords = [kw.lower() for kw in data["keywords"]]
        
        # For milk allergen specifically, skip entirely if exclusion terms found
        if allergen == "milk" and has_exclusion_terms:
            continue
            
        for keyword in keywords:
            if keyword in combined_text:
                # Check if this keyword match is actually part of an exclusion term
                is_excluded = False
                
                # For example, if "milk" is found but it's part of "breast milk"
                for exclusion in exclusion_terms:
                    # Check all possible positions where keyword could be within exclusion term
                    if (exclusion.startswith(keyword + " ") or 
                        exclusion.endswith(" " + keyword) or 
                        " " + keyword + " " in exclusion or 
                        exclusion == keyword):
                        
                        # Only exclude if this exact exclusion term is in the text
                        if exclusion in combined_text:
                            is_excluded = True
                            break
                
                if not is_excluded:
                    matched_allergens.append(allergen)
                    break  # No need to check other keywords for this allergen

    return list(set(matched_allergens))  # Remove duplicates

In [None]:

def detect_allergen_with_strategy(ingredient_name, allergen_tags):
    """
    Advanced allergen detection using two-step matching strategy:
    1. Exact matching - check if ingredient name exactly matches any keyword
    2. Partial keyword matching - check if any allergen keyword appears in ingredient name
    
    Args:
        ingredient_name (str): Name of the ingredient to check
        allergen_tags (dict): Dictionary of allergen information
    
    Returns:
        dict: {
            'allergen_group': str or None,
            'match_type': 'exact' or 'partial' or None,
            'matched_keyword': str or None,
            'confidence': float (0.0 to 1.0)
        }
    """
    ingredient_lower = ingredient_name.lower().strip()
    exclusion_terms = ["breast milk", "breastmilk", "formula milk", "formula", "breast"]
    # Step 1: Exact matching (highest priority)
    for allergen_name, allergen_data in allergen_tags.items():
        keywords = allergen_data.get('keywords', [])
        allergen_group = allergen_data.get('allergen_group', allergen_name)
        
        for keyword in keywords:
            keyword_lower = keyword.lower().strip()
            if keyword_lower == ingredient_lower:
                return {
                    'allergen_group': allergen_group,
                    'match_type': 'exact',
                    'matched_keyword': keyword,
                    'confidence': 1.0
                }
    
    # Step 2: Partial keyword matching (lower priority)
    partial_matches = []
    
    for allergen_name, allergen_data in allergen_tags.items():
        keywords = allergen_data.get('keywords', [])
        allergen_group = allergen_data.get('allergen_group', allergen_name)
        
        for keyword in keywords:
            keyword_lower = keyword.lower().strip()
            
            # Check if keyword appears as a word in ingredient name
            if keyword_lower in ingredient_lower:
                # Calculate confidence based on match quality
                confidence = calculate_match_confidence(ingredient_lower, keyword_lower)
                
                partial_matches.append({
                    'allergen_group': allergen_group,
                    'match_type': 'partial',
                    'matched_keyword': keyword,
                    'confidence': confidence,
                    'allergen_name': allergen_name
                })
    
    # If partial matches found, return the best one
    if partial_matches:
        # Sort by confidence (highest first)
        partial_matches.sort(key=lambda x: x['confidence'], reverse=True)
        best_match = partial_matches[0]
        
        # Only return if confidence is above threshold
        if best_match['confidence'] >= 0.5:
            return {
                'allergen_group': best_match['allergen_group'],
                'match_type': best_match['match_type'],
                'matched_keyword': best_match['matched_keyword'],
                'confidence': best_match['confidence']
            }
    
    # No match found
    return {
        'allergen_group': None,
        'match_type': None,
        'matched_keyword': None,
        'confidence': 0.0
    }

def calculate_match_confidence(ingredient_name, keyword):
    """
    Calculate confidence score for partial matches based on various factors.
    
    Args:
        ingredient_name (str): The ingredient name (lowercase)
        keyword (str): The allergen keyword (lowercase)
    
    Returns:
        float: Confidence score between 0.0 and 1.0
    """
    # Base confidence for any partial match
    confidence = 0.6
    
    # Boost confidence if keyword is a significant portion of ingredient name
    if len(keyword) >= len(ingredient_name) * 0.5:
        confidence += 0.2
    
    # Boost confidence if keyword appears at start or end
    if ingredient_name.startswith(keyword) or ingredient_name.endswith(keyword):
        confidence += 0.2
    
    # Boost confidence if keyword appears as a complete word (surrounded by spaces or boundaries)
    import re
    if re.search(r'\b' + re.escape(keyword) + r'\b', ingredient_name):
        confidence += 0.1
    
    # Reduce confidence for very short keywords in long ingredient names
    if len(keyword) <= 3 and len(ingredient_name) >= 10:
        confidence -= 0.2
    
    # Cap confidence at 0.95 for partial matches (exact matches get 1.0)
    return min(0.95, max(0.0, confidence))

def detect_allergen_simple(ingredient_name, allergen_tags):
    """
    Simplified version that returns just the allergen group (backward compatibility).
    
    Args:
        ingredient_name (str): Name of the ingredient
        allergen_tags (dict): Dictionary of allergen tags
    
    Returns:
        str or None: Allergen group name or None
    """
    result = detect_allergen_with_strategy(ingredient_name, allergen_tags)
    return result.get('allergen_group')

def test_allergen_detection():
    """
    Test function to demonstrate the matching strategy.
    """
    test_ingredients = [
        "milk",                    # Exact match
        "whole milk",             # Partial match  
        "milk chocolate",         # Partial match
        "almond butter",          # Exact match (tree nuts)
        "peanut oil",            # Exact match (peanuts)
        "wheat flour",           # Partial match (gluten)
        "salmon fillet",         # Partial match (fish)
        "chicken breast",        # No match
        "egg white powder",      # Partial match (egg)
        "soy sauce concentrate"  # Partial match (soy)
    ]
    
    print("🧪 TESTING ALLERGEN DETECTION STRATEGY")
    print("=" * 50)
    
    for ingredient in test_ingredients:
        result = detect_allergen_with_strategy(ingredient, ALLERGEN_TAGS)
        
        if result['allergen_group']:
            print(f"✅ '{ingredient}':")
            print(f"   → Allergen: {result['allergen_group']}")
            print(f"   → Match type: {result['match_type']}")
            print(f"   → Matched keyword: '{result['matched_keyword']}'")
            print(f"   → Confidence: {result['confidence']:.2f}")
        else:
            print(f"❌ '{ingredient}': No allergen detected")
        print()

# Uncomment to run test
# test_allergen_detection()

# Quick test examples
if __name__ == "__main__":
    # Test the function with a few examples
    test_cases = ["milk", "wheat flour", "almond butter", "chicken", "egg white"]
    
    print("Quick Test Results:")
    for ingredient in test_cases:
        result = detect_allergen_with_strategy(ingredient, ALLERGEN_TAGS)
        if result['allergen_group']:
            print(f"  {ingredient} → {result['allergen_group']} ({result['match_type']}, {result['confidence']:.2f})")
        else:
            print(f"  {ingredient} → No allergen detected")

In [None]:
def detect_primary_allergen_for_ingredient(ingredient_name, allergen_tags):
    """
    Detect the primary allergen for a single ingredient based on allergen tags.
    Ensures 1 ingredient maps to 1 primary allergen only.
    
    Args:
        ingredient_name (str): Name of the ingredient
        allergen_tags (dict): Dictionary of allergen tags with keywords
    
    Returns:
        str or None: Primary allergen group name, or None if no allergen detected
    """
    ingredient_lower = ingredient_name.lower().strip()
    detected_allergens = []
    
    # Priority order for allergens (most specific to least specific)
    allergen_priority = [
        'shellfish', 'fish', 'peanuts', 'tree_nuts', 'egg', 'dairy', 'soy', 'gluten'
    ]
    
    # First pass: detect all matching allergens
    for allergen_name, allergen_data in allergen_tags.items():
        keywords = allergen_data.get('keywords', [])
        allergen_group = allergen_data.get('allergen_group', allergen_name)
        
        # Check if any keyword matches the ingredient exactly
        for keyword in keywords:
            if keyword.lower().strip() == ingredient_lower:
                detected_allergens.append(allergen_group)
                break  # No need to check other keywords for this allergen
    
    # If no allergens detected, return None
    if not detected_allergens:
        return None
    
    # If only one allergen detected, return it
    if len(detected_allergens) == 1:
        return detected_allergens[0]
    
    # If multiple allergens detected, select based on priority
    for priority_allergen in allergen_priority:
        if priority_allergen in detected_allergens:
            return priority_allergen
    
    # Fallback: return the first detected allergen
    return detected_allergens[0]


def add_allergen_group_id_to_ingredients(ingredient_df, allergen_df, allergen_tags):
    """
    Add allergen_group_id column to ingredient_df based on allergen detection.
    
    Args:
        ingredient_df: DataFrame with ingredient_id and ingredient_name
        allergen_df: DataFrame with allergen group information (pk, name, description)
        allergen_tags: Dictionary of allergen tags for detection
    
    Returns:
        DataFrame: Updated ingredient_df with allergen_group_id column
    """
    print("🔗 CONNECTING ALLERGEN GROUP IDs WITH INGREDIENT IDs")
    print("=" * 60)
    
    # Create a mapping from allergen group name to allergen group ID
    allergen_name_to_id = dict(zip(allergen_df['name'], allergen_df['pk']))
    
    print(f"📊 Available allergen groups:")
    for name, pk in allergen_name_to_id.items():
        print(f"   - {name} (ID: {pk})")
    
    # Create a copy of ingredient_df to avoid modifying the original
    updated_ingredient_df = ingredient_df.copy()
    
    # Detect primary allergen for each ingredient
    print(f"\n🔍 Detecting primary allergens for {len(updated_ingredient_df)} ingredients...")
    
    primary_allergens = []
    allergen_group_ids = []
    
    for _, row in updated_ingredient_df.iterrows():
        ingredient_name = row['ingredient_name']
        primary_allergen = detect_primary_allergen_for_ingredient(ingredient_name, allergen_tags)
        primary_allergens.append(primary_allergen)
        
        # Map to allergen group ID
        if primary_allergen and primary_allergen in allergen_name_to_id:
            allergen_group_id = allergen_name_to_id[primary_allergen]
        else:
            allergen_group_id = None  # No allergen detected
        
        allergen_group_ids.append(allergen_group_id)
    
    # Add the new columns
    updated_ingredient_df['primary_allergen'] = primary_allergens
    updated_ingredient_df['allergen_group_id'] = allergen_group_ids
    
    # Calculate statistics
    total_ingredients = len(updated_ingredient_df)
    ingredients_with_allergens = updated_ingredient_df['allergen_group_id'].notna().sum()
    ingredients_without_allergens = total_ingredients - ingredients_with_allergens
    
    print(f"\n✅ ALLERGEN MAPPING RESULTS:")
    print(f"   Total ingredients: {total_ingredients}")
    print(f"   Ingredients with allergens: {ingredients_with_allergens} ({ingredients_with_allergens/total_ingredients*100:.1f}%)")
    print(f"   Ingredients without allergens: {ingredients_without_allergens} ({ingredients_without_allergens/total_ingredients*100:.1f}%)")
    
    # Show breakdown by allergen group
    allergen_counts = updated_ingredient_df['primary_allergen'].value_counts()
    print(f"\n📊 Breakdown by allergen group:")
    for allergen, count in allergen_counts.items():
        if allergen:  # Skip None values
            allergen_id = allergen_name_to_id.get(allergen, 'Unknown')
            print(f"   - {allergen} (ID: {allergen_id}): {count} ingredients")
    
    # Show sample of mapped ingredients
    print(f"\n📋 Sample mapped ingredients:")
    sample_with_allergens = updated_ingredient_df[updated_ingredient_df['allergen_group_id'].notna()].head(10)
    for _, row in sample_with_allergens.iterrows():
        print(f"   - ID: {row['ingredient_id']}, Name: '{row['ingredient_name']}', Allergen: {row['primary_allergen']} (Group ID: {row['allergen_group_id']})")
    
    # Show sample of ingredients without allergens
    sample_without_allergens = updated_ingredient_df[updated_ingredient_df['allergen_group_id'].isna()].head(5)
    if len(sample_without_allergens) > 0:
        print(f"\n📋 Sample ingredients without allergens:")
        for _, row in sample_without_allergens.iterrows():
            print(f"   - ID: {row['ingredient_id']}, Name: '{row['ingredient_name']}'")
    
    return updated_ingredient_df


# Apply the function to add allergen group IDs
ingredient_df_with_allergens = add_allergen_group_id_to_ingredients(
    ingredient_df, 
    allergen_df, 
    allergen_tags
)

🔗 CONNECTING ALLERGEN GROUP IDs WITH INGREDIENT IDs
📊 Available allergen groups:
   - dairy (ID: 1)
   - egg (ID: 2)
   - soy (ID: 3)
   - tree_nuts (ID: 4)
   - peanuts (ID: 5)
   - fish (ID: 6)
   - shellfish (ID: 7)
   - gluten (ID: 8)

🔍 Detecting primary allergens for 990 ingredients...

✅ ALLERGEN MAPPING RESULTS:
   Total ingredients: 990
   Ingredients with allergens: 67 (6.8%)
   Ingredients without allergens: 923 (93.2%)

📊 Breakdown by allergen group:
   - dairy (ID: 1): 17 ingredients
   - gluten (ID: 8): 11 ingredients
   - fish (ID: 6): 10 ingredients
   - soy (ID: 3): 8 ingredients
   - tree_nuts (ID: 4): 7 ingredients
   - egg (ID: 2): 7 ingredients
   - shellfish (ID: 7): 5 ingredients
   - peanuts (ID: 5): 2 ingredients

📋 Sample mapped ingredients:
   - ID: 5, Name: 'almond butter', Allergen: tree_nuts (Group ID: 4.0)
   - ID: 6, Name: 'almond milk', Allergen: tree_nuts (Group ID: 4.0)
   - ID: 12, Name: 'anchovy', Allergen: fish (Group ID: 6.0)
   - ID: 60, Name: 'b

In [333]:
# Validate and analyze the allergen mapping results
print("🔍 VALIDATION AND ANALYSIS OF ALLERGEN MAPPING")
print("=" * 50)

# Check the structure of the updated dataframe
print(f"📊 Updated ingredient_df structure:")
print(f"   Columns: {list(ingredient_df_with_allergens.columns)}")
print(f"   Shape: {ingredient_df_with_allergens.shape}")

# Show detailed breakdown
print(f"\n📈 Detailed allergen group mapping:")
allergen_mapping_summary = ingredient_df_with_allergens.groupby(['primary_allergen', 'allergen_group_id']).size().reset_index(name='count')
for _, row in allergen_mapping_summary.iterrows():
    if pd.notna(row['primary_allergen']):
        print(f"   {row['primary_allergen']} (Group ID: {row['allergen_group_id']}): {row['count']} ingredients")

# Show some specific examples
print(f"\n🔍 Example allergen mappings:")
example_allergens = ['dairy', 'egg', 'soy', 'gluten', 'tree_nuts']
for allergen in example_allergens:
    examples = ingredient_df_with_allergens[ingredient_df_with_allergens['primary_allergen'] == allergen].head(3)
    if len(examples) > 0:
        print(f"\n   {allergen.upper()} examples:")
        for _, row in examples.iterrows():
            print(f"     - '{row['ingredient_name']}' → Group ID: {row['allergen_group_id']}")

# Save the updated dataframe
output_filename = "ingredient_master_with_allergens.xlsx"
ingredient_df_with_allergens.to_excel(output_filename, index=False)
print(f"\n💾 Saved updated ingredient dataframe to '{output_filename}'")

# Display sample of final structure
print(f"\n📋 Final ingredient dataframe sample:")
display(ingredient_df_with_allergens[['ingredient_id', 'ingredient_name', 'primary_allergen', 'allergen_group_id']].head(10))

🔍 VALIDATION AND ANALYSIS OF ALLERGEN MAPPING
📊 Updated ingredient_df structure:
   Columns: ['ingredient_id', 'ingredient_name', 'primary_allergen', 'allergen_group_id']
   Shape: (990, 4)

📈 Detailed allergen group mapping:
   dairy (Group ID: 1.0): 17 ingredients
   egg (Group ID: 2.0): 7 ingredients
   fish (Group ID: 6.0): 10 ingredients
   gluten (Group ID: 8.0): 11 ingredients
   peanuts (Group ID: 5.0): 2 ingredients
   shellfish (Group ID: 7.0): 5 ingredients
   soy (Group ID: 3.0): 8 ingredients
   tree_nuts (Group ID: 4.0): 7 ingredients

🔍 Example allergen mappings:

   DAIRY examples:
     - 'butter' → Group ID: 1.0
     - 'cheese' → Group ID: 1.0
     - 'coconut milk' → Group ID: 1.0

   EGG examples:
     - 'chicken egg' → Group ID: 2.0
     - 'duck egg' → Group ID: 2.0
     - 'egg' → Group ID: 2.0

   SOY examples:
     - 'edamame' → Group ID: 3.0
     - 'miso' → Group ID: 3.0
     - 'soy sauce' → Group ID: 3.0

   GLUTEN examples:
     - 'barley' → Group ID: 8.0
     -

Unnamed: 0,ingredient_id,ingredient_name,primary_allergen,allergen_group_id
0,1,acai,,
1,2,agar-agar powder,,
2,3,all purpose flour,,
3,4,almond,,
4,5,almond butter,tree_nuts,4.0
5,6,almond milk,tree_nuts,4.0
6,7,almond nuts,,
7,8,aluminum foil,,
8,9,ambon banana,,
9,10,ambrosia fruit,,


In [334]:
# Enhanced allergen detection with exclusion terms and isAllergen column
def add_exclusion_terms_and_allergen_flag(ingredient_df_with_allergens):
    """
    Add exclusion terms logic and isAllergen column to ingredient dataframe.
    
    Exclusion terms like "breast milk", "formula milk" should NOT be flagged as allergens
    even if they contain allergen keywords like "milk".
    
    Args:
        ingredient_df_with_allergens: DataFrame with allergen mapping results
        
    Returns:
        DataFrame: Enhanced dataframe with exclusion logic and isAllergen column
    """
    print("🚫 APPLYING EXCLUSION TERMS AND ADDING isAllergen COLUMN")
    print("=" * 60)
    
    # Define exclusion terms - ingredients that should NOT be considered allergens
    exclusion_terms = [
        "breast milk", "breastmilk", "formula milk", "formula", "breast"
    ]
    
    print(f"📋 Exclusion terms: {exclusion_terms}")
    
    # Create a copy to work with
    enhanced_df = ingredient_df_with_allergens.copy()
    
    # Check for exclusions and nullify allergen data if needed
    exclusion_count = 0
    excluded_ingredients = []
    
    for idx, row in enhanced_df.iterrows():
        ingredient_name = row['ingredient_name'].lower().strip()
        
        # Check if ingredient matches any exclusion term
        is_excluded = False
        for exclusion_term in exclusion_terms:
            if exclusion_term.lower() in ingredient_name:
                is_excluded = True
                exclusion_count += 1
                excluded_ingredients.append({
                    'ingredient_name': row['ingredient_name'],
                    'original_allergen': row['primary_allergen'],
                    'original_allergen_id': row['allergen_group_id'],
                    'exclusion_term': exclusion_term
                })
                break
        
        # If excluded, nullify allergen information
        if is_excluded:
            enhanced_df.at[idx, 'primary_allergen'] = None
            enhanced_df.at[idx, 'allergen_group_id'] = None
    
    # Add isAllergen column based on allergen_group_id
    enhanced_df['isAllergen'] = enhanced_df['allergen_group_id'].notna()
    
    # Calculate statistics
    total_ingredients = len(enhanced_df)
    ingredients_with_allergens = enhanced_df['isAllergen'].sum()
    ingredients_without_allergens = total_ingredients - ingredients_with_allergens
    
    print(f"\n✅ EXCLUSION AND FLAG RESULTS:")
    print(f"   Total ingredients processed: {total_ingredients}")
    print(f"   Excluded due to exclusion terms: {exclusion_count}")
    print(f"   Final ingredients with allergens: {ingredients_with_allergens}")
    print(f"   Final ingredients without allergens: {ingredients_without_allergens}")
    print(f"   Allergen rate: {(ingredients_with_allergens/total_ingredients)*100:.1f}%")
    
    # Show excluded ingredients
    if excluded_ingredients:
        print(f"\n🚫 Excluded ingredients (due to exclusion terms):")
        for item in excluded_ingredients:
            print(f"   - '{item['ingredient_name']}' (was: {item['original_allergen']}) → excluded by '{item['exclusion_term']}'")
    
    # Show updated allergen breakdown
    print(f"\n📊 Updated allergen breakdown:")
    allergen_counts = enhanced_df[enhanced_df['isAllergen']]['primary_allergen'].value_counts()
    for allergen, count in allergen_counts.items():
        print(f"   - {allergen}: {count} ingredients")
    
    # Show sample of final structure
    print(f"\n📋 Sample of enhanced dataframe:")
    sample_cols = ['ingredient_id', 'ingredient_name', 'primary_allergen', 'allergen_group_id', 'isAllergen']
    print("With allergens:")
    sample_with = enhanced_df[enhanced_df['isAllergen']].head(5)
    for _, row in sample_with.iterrows():
        print(f"   ID: {row['ingredient_id']}, Name: '{row['ingredient_name']}', Allergen: {row['primary_allergen']}, isAllergen: {row['isAllergen']}")
    
    print("\nWithout allergens:")
    sample_without = enhanced_df[~enhanced_df['isAllergen']].head(5)
    for _, row in sample_without.iterrows():
        print(f"   ID: {row['ingredient_id']}, Name: '{row['ingredient_name']}', isAllergen: {row['isAllergen']}")
    
    return enhanced_df

# Apply the enhancement
ingredient_df_final = add_exclusion_terms_and_allergen_flag(ingredient_df_with_allergens)

🚫 APPLYING EXCLUSION TERMS AND ADDING isAllergen COLUMN
📋 Exclusion terms: ['breast milk', 'breastmilk', 'formula milk', 'formula', 'breast']

✅ EXCLUSION AND FLAG RESULTS:
   Total ingredients processed: 990
   Excluded due to exclusion terms: 9
   Final ingredients with allergens: 64
   Final ingredients without allergens: 926
   Allergen rate: 6.5%

🚫 Excluded ingredients (due to exclusion terms):
   - 'boneless skinless chicken breast' (was: None) → excluded by 'breast'
   - 'breast milk' (was: None) → excluded by 'breast milk'
   - 'chicken breast' (was: None) → excluded by 'breast'
   - 'formula' (was: None) → excluded by 'formula'
   - 'formula milk' (was: dairy) → excluded by 'formula milk'
   - 'formula milk powder' (was: dairy) → excluded by 'formula milk'
   - 'infant formula milk' (was: None) → excluded by 'formula milk'
   - 'milk formula' (was: dairy) → excluded by 'formula'
   - 'skinless boneless chicken breast' (was: None) → excluded by 'breast'

📊 Updated allergen bre

In [None]:
# Final validation and export of enhanced ingredient dataframe
print("📊 FINAL INGREDIENT DATAFRAME VALIDATION")
print("=" * 50)

# Check the final structure
print(f"Final dataframe columns: {list(ingredient_df_final.columns)}")
print(f"Final dataframe shape: {ingredient_df_final.shape}")

# Data type validation
print(f"\n📋 Column data types:")
for col in ingredient_df_final.columns:
    print(f"   {col}: {ingredient_df_final[col].dtype}")

# isAllergen column validation
allergen_true_count = ingredient_df_final['isAllergen'].sum()
allergen_false_count = (~ingredient_df_final['isAllergen']).sum()

print(f"\n✅ isAllergen column validation:")
print(f"   True (has allergen): {allergen_true_count}")
print(f"   False (no allergen): {allergen_false_count}")
print(f"   Total: {allergen_true_count + allergen_false_count}")

# Cross-validation: isAllergen should match allergen_group_id presence
validation_passed = True
mismatch_count = 0

for _, row in ingredient_df_final.iterrows():
    has_allergen_id = pd.notna(row['allergen_group_id'])
    is_allergen_flag = row['isAllergen']
    
    if has_allergen_id != is_allergen_flag:
        mismatch_count += 1
        if mismatch_count <= 5:  # Show first 5 mismatches
            print(f"   ⚠️ Mismatch: {row['ingredient_name']} - allergen_group_id: {row['allergen_group_id']}, isAllergen: {is_allergen_flag}")
        validation_passed = False

if validation_passed:
    print("   ✅ All isAllergen flags match allergen_group_id presence")
else:
    print(f"   ❌ Found {mismatch_count} mismatches between isAllergen and allergen_group_id")

# Show final breakdown by allergen type
print(f"\n📈 Final allergen distribution:")
allergen_breakdown = ingredient_df_final[ingredient_df_final['isAllergen']]['primary_allergen'].value_counts()
for allergen, count in allergen_breakdown.items():
    percentage = (count / allergen_true_count) * 100
    print(f"   {allergen}: {count} ingredients ({percentage:.1f}%)")

# Export final results
final_output_filename = "ingredient_master_final_with_allergens.xlsx"
ingredient_df_final.to_excel(final_output_filename, index=False)
print(f"\n💾 Exported final ingredient dataframe to '{final_output_filename}'")

# Display final structure sample
print(f"\n📋 Final dataframe structure sample:")
display(ingredient_df_final[['ingredient_id', 'ingredient_name', 'primary_allergen', 'allergen_group_id', 'isAllergen']].head(15))

Finish Populating Data

In [None]:
#save the final dataset
df.to_excel("cfirstversion_current_dataset.xlsx", index=False)
print("✅ DataFrame saved to 'cfirstversion_current_dataset.xlsx'")

Creating Category With Recipe Id

In [None]:
# Define categories and assign IDs
categories = [
    'vegan', 'vegetarian', 'pescetarian', 'dairy_free', 
    'egg_free', 'soy_free', 'nut_free', 'gluten_free',
    'halal', 'non_halal', 'non_veg'
]

category_id_map = {category: idx for idx, category in enumerate(categories, 1)}

# Create a reference DataFrame for categories
category_df = pd.DataFrame({
    'category_id': list(category_id_map.values()),
    'category_name': list(category_id_map.keys())
})

print("Category DataFrame:")
print(category_df)

In [None]:
df.shape
df.head(10)

## 🆔 Create Ingredient Master DataFrame with Unique IDs

##### Detail

In [192]:
def map_ingredient_ids_to_recipe_df(recipe_ingredient_df, ingredient_id_mapping, use_standardized=True):
    """
    Map ingredient IDs to the recipe ingredient DataFrame using cleaned ingredient names
    
    Args:
        recipe_ingredient_df: DataFrame with recipe-ingredient relationships
        ingredient_id_mapping: Dictionary mapping cleaned ingredient names to IDs
        use_standardized: If True, use standardized_ingredient; if False, use single_ingredient
        
    Returns:
        DataFrame: Updated recipe_ingredient_df with ingredient_id column
    """
    print("🔗 MAPPING INGREDIENT IDS TO RECIPE DATAFRAME")
    print("-" * 50)
    
    # Choose which ingredient column to use
    ingredient_column = 'standardized_ingredient' if use_standardized else 'single_ingredient'
    
    if ingredient_column not in recipe_ingredient_df.columns:
        print(f"❌ Column '{ingredient_column}' not found in recipe_ingredient_df")
        return recipe_ingredient_df
    
    # Create a copy to avoid modifying the original
    updated_df = recipe_ingredient_df.copy()
    
    # Clean the ingredient names in the DataFrame and map to IDs
    def get_ingredient_id(ingredient_name):
        if pd.isna(ingredient_name):
            return None
        cleaned_name = clean_ingredient_name_symbols(ingredient_name)
        return ingredient_id_mapping.get(cleaned_name, None)
    
    updated_df['ingredient_id'] = updated_df[ingredient_column].apply(get_ingredient_id)
    
    # Check mapping results
    total_rows = len(updated_df)
    mapped_rows = updated_df['ingredient_id'].notna().sum()
    unmapped_rows = total_rows - mapped_rows
    
    print(f"✅ Ingredient ID mapping complete:")
    print(f"   Total rows: {total_rows}")
    print(f"   Successfully mapped: {mapped_rows} ({(mapped_rows/total_rows)*100:.1f}%)")
    print(f"   Unmapped: {unmapped_rows} ({(unmapped_rows/total_rows)*100:.1f}%)")
    
    if unmapped_rows > 0:
        print(f"\n🔍 Sample unmapped ingredients:")
        unmapped_sample = updated_df[updated_df['ingredient_id'].isna()][ingredient_column].dropna().unique()[:5]
        for ingredient in unmapped_sample:
            cleaned = clean_ingredient_name_symbols(ingredient)
            print(f"   Original: '{ingredient}' → Cleaned: '{cleaned}'")
    
    return updated_df

In [None]:
# Step 2: Map ingredient IDs back to recipe DataFrame
if 'ingredient_id_mapping' in locals() and 'recipe_ingredient_df' in locals():
    print("\n📋 Step 2: Mapping ingredient IDs to recipe DataFrame...")
    
    # Update the recipe_ingredient_df with ingredient IDs
    recipe_ingredient_df_with_ids = map_ingredient_ids_to_recipe_df(
        recipe_ingredient_df,
        ingredient_id_mapping,
        use_standardized=True
    )
    
    print(f"\n📋 Sample of updated recipe_ingredient_df with IDs:")
    sample_cols = ['recipe_id', 'single_ingredient', 'standardized_ingredient', 'ingredient_id']
    available_cols = [col for col in sample_cols if col in recipe_ingredient_df_with_ids.columns]
    display(recipe_ingredient_df_with_ids[available_cols].head(10))
    
    print(f"\n📊 Final DataFrame Statistics:")
    print(f"   Recipe DataFrame shape: {recipe_ingredient_df_with_ids.shape}")
    print(f"   Unique recipes: {recipe_ingredient_df_with_ids['recipe_id'].nunique()}")
    print(f"   Unique ingredients: {len(ingredient_df)}")
    print(f"   Recipe-ingredient relationships: {len(recipe_ingredient_df_with_ids)}")
    
else:
    print("❌ Required variables not found for ID mapping")




📋 Step 2: Mapping ingredient IDs to recipe DataFrame...
🔗 MAPPING INGREDIENT IDS TO RECIPE DATAFRAME
--------------------------------------------------
✅ Ingredient ID mapping complete:
   Total rows: 8176
   Successfully mapped: 8163 (99.8%)
   Unmapped: 13 (0.2%)

🔍 Sample unmapped ingredients:

📋 Sample of updated recipe_ingredient_df with IDs:


Unnamed: 0,recipe_id,single_ingredient,standardized_ingredient,ingredient_id
0,1,cassava,cassava,155.0
1,1,fish,fish,327.0
2,1,chicken,chicken,182.0
3,1,coconut oil,coconut oil,231.0
4,1,chicken broth,chicken broth,185.0
5,1,lime juice,lime juice,497.0
6,1,spinach,spinach,818.0
7,2,beef,beef,72.0
8,2,potato starch,potato starch,670.0
9,2,milk,milk,538.0



📊 Final DataFrame Statistics:
   Recipe DataFrame shape: (8176, 8)
   Unique recipes: 1322
   Unique ingredients: 995
   Recipe-ingredient relationships: 8176


In [182]:
# Step 3: Final validation and export
print("📋 Step 3: Final Data Validation and Export")
print("-" * 50)

if 'ingredient_df' in locals() and 'recipe_ingredient_df_with_ids' in locals():
    
    # Validation checks
    print("🔍 FINAL VALIDATION CHECKS:")
    
    # Check 1: Ingredient DataFrame cleanliness
    total_ingredients = len(ingredient_df)
    clean_ingredients = ingredient_df['ingredient_name'].notna().sum()
    empty_ingredients = ingredient_df['ingredient_name'].isna().sum()
    
    print(f"✅ Ingredient Master DataFrame:")
    print(f"   Total unique ingredients: {total_ingredients}")
    print(f"   Clean ingredients: {clean_ingredients}")
    print(f"   Empty/null ingredients: {empty_ingredients}")
    
    # Check 2: Recipe-Ingredient mapping completeness
    total_relationships = len(recipe_ingredient_df_with_ids)
    mapped_relationships = recipe_ingredient_df_with_ids['ingredient_id'].notna().sum()
    unmapped_relationships = total_relationships - mapped_relationships
    
    print(f"\n✅ Recipe-Ingredient Relationships:")
    print(f"   Total relationships: {total_relationships}")
    print(f"   Successfully mapped: {mapped_relationships} ({(mapped_relationships/total_relationships)*100:.1f}%)")
    print(f"   Unmapped: {unmapped_relationships} ({(unmapped_relationships/total_relationships)*100:.1f}%)")
    
    # Check 3: Data consistency
    unique_ingredient_ids_in_recipes = recipe_ingredient_df_with_ids['ingredient_id'].dropna().nunique()
    total_ingredient_ids_in_master = ingredient_df['ingredient_id'].nunique()
    
    print(f"\n✅ Data Consistency:")
    print(f"   Unique ingredient IDs in recipes: {unique_ingredient_ids_in_recipes}")
    print(f"   Total ingredient IDs in master: {total_ingredient_ids_in_master}")
    print(f"   Coverage: {(unique_ingredient_ids_in_recipes/total_ingredient_ids_in_master)*100:.1f}%")
    
    # Export cleaned datasets
    print(f"\n💾 EXPORTING CLEANED DATASETS:")
    
    try:
        # Export ingredient master DataFrame
        ingredient_filename = "cleaned_ingredient_master.xlsx"
        ingredient_df.to_excel(ingredient_filename, index=False)
        print(f"✅ Exported ingredient master: {ingredient_filename}")
        
        # Export recipe-ingredient DataFrame with IDs
        recipe_ingredient_filename = "cleaned_recipe_ingredient_with_ids.xlsx"
        recipe_ingredient_df_with_ids.to_excel(recipe_ingredient_filename, index=False)
        print(f"✅ Exported recipe-ingredient data: {recipe_ingredient_filename}")
        
        print(f"\n🎉 DATA PROCESSING COMPLETE!")
        print(f"📁 Files ready for downstream modeling and analysis")
        
    except Exception as e:
        print(f"❌ Export error: {str(e)}")
        
else:
    print("❌ Required DataFrames not found for validation")

print("\n" + "=" * 60)

📋 Step 3: Final Data Validation and Export
--------------------------------------------------
🔍 FINAL VALIDATION CHECKS:
✅ Ingredient Master DataFrame:
   Total unique ingredients: 995
   Clean ingredients: 995
   Empty/null ingredients: 0

✅ Recipe-Ingredient Relationships:
   Total relationships: 8176
   Successfully mapped: 8163 (99.8%)
   Unmapped: 13 (0.2%)

✅ Data Consistency:
   Unique ingredient IDs in recipes: 995
   Total ingredient IDs in master: 995
   Coverage: 100.0%

💾 EXPORTING CLEANED DATASETS:
✅ Exported ingredient master: cleaned_ingredient_master.xlsx
✅ Exported recipe-ingredient data: cleaned_recipe_ingredient_with_ids.xlsx

🎉 DATA PROCESSING COMPLETE!
📁 Files ready for downstream modeling and analysis



In [178]:
# Debug: Check the format of standardized ingredients
print("🔍 DEBUGGING STANDARDIZED INGREDIENT FORMAT")
print("-" * 50)

if 'recipe_ingredient_df' in locals():
    # Sample some standardized ingredients to see their actual format
    sample_ingredients = recipe_ingredient_df['standardized_ingredient'].dropna().head(20).tolist()
    
    print("📋 Sample standardized ingredients (raw format):")
    for i, ingredient in enumerate(sample_ingredients[:10], 1):
        print(f"   {i}. {repr(ingredient)}")  # repr shows quotes and special characters
    
    # Test the cleaning function on these samples
    print(f"\n🧹 Testing cleaning function:")
    for i, ingredient in enumerate(sample_ingredients[:5], 1):
        cleaned = clean_ingredient_name_symbols(ingredient)
        print(f"   {i}. Original: {repr(ingredient)}")
        print(f"      Cleaned:  {repr(cleaned)}")
        print()

else:
    print("❌ recipe_ingredient_df not found")

🔍 DEBUGGING STANDARDIZED INGREDIENT FORMAT
--------------------------------------------------
📋 Sample standardized ingredients (raw format):
   1. 'cassava'
   2. 'fish'
   3. 'chicken'
   4. 'coconut oil'
   5. 'chicken broth'
   6. 'lime juice'
   7. 'spinach'
   8. 'beef'
   9. 'potato starch'
   10. 'milk'

🧹 Testing cleaning function:
   1. Original: 'cassava'
      Cleaned:  'cassava'

   2. Original: 'fish'
      Cleaned:  'fish'

   3. Original: 'chicken'
      Cleaned:  'chicken'

   4. Original: 'coconut oil'
      Cleaned:  'coconut oil'

   5. Original: 'chicken broth'
      Cleaned:  'chicken broth'

