## 🔧 FIXED: classify_recipe Function Using ner_ingredient_string

The previous classify_recipe function had issues because ner_ingredient was stored as a string representation of a list. This new version uses ner_ingredient_string directly for accurate dietary tag classification.

In [None]:
def classify_recipe_fixed(recipe_data, dietary_tags):
    """
    Fixed version that uses ner_ingredient_string directly.
    Classifies recipes with dietary tags based on ingredients.
    
    Args:
        recipe_data: Row containing recipe information
        dietary_tags: Dictionary with dietary classification rules
    
    Returns:
        String of comma-separated dietary tags
    """
    
    # Use ner_ingredient_string directly (already a clean comma-separated string)
    ingredient_string = str(recipe_data.get('ner_ingredient_string', ''))
    
    if not ingredient_string or ingredient_string.strip() == '':
        return ''
    
    # Convert to lowercase list for matching
    ingredients = [ing.strip().lower() for ing in ingredient_string.split(',') if ing.strip()]
    
    # Debug output for first few calls
    if hasattr(classify_recipe_fixed, 'call_count'):
        classify_recipe_fixed.call_count += 1
    else:
        classify_recipe_fixed.call_count = 1
    
    # Show debug info for first 3 calls only
    if classify_recipe_fixed.call_count <= 3:
        print(f"🔍 Call #{classify_recipe_fixed.call_count}")
        print(f"   Recipe: {recipe_data.get('name', 'Unknown')}")
        print(f"   Raw ingredient_string: '{ingredient_string}'")
        print(f"   Parsed ingredients: {ingredients}")
    
    # Initialize tag categories
    general_diet = None
    religious_tag = None
    allergen_tags = []
    
    # Check each dietary category
    for category, rules in dietary_tags.items():
        exclude_rules = rules.get('exclude', [])
        include_rules = rules.get('include', [])
        
        # Check exclusions - if any excluded ingredient is found, skip this tag
        exclude_found = any(
            any(excl.lower() in ingredient for ingredient in ingredients)
            for excl in exclude_rules
        )
        
        if exclude_found:
            if classify_recipe_fixed.call_count <= 3:
                excluded_ingredients = [
                    ingredient for ingredient in ingredients
                    for excl in exclude_rules
                    if excl.lower() in ingredient
                ]
                print(f"   ❌ {category}: EXCLUDED due to {excluded_ingredients}")
            continue
        
        # Check inclusions - if no specific includes are defined, or if we find matching includes
        if not include_rules:
            # No specific requirements, tag applies if not excluded
            tag_applies = True
        else:
            # Check if any include rule matches
            tag_applies = any(
                any(incl.lower() in ingredient for ingredient in ingredients)
                for incl in include_rules
            )
        
        if tag_applies:
            if classify_recipe_fixed.call_count <= 3:
                print(f"   ✅ {category}: INCLUDED")
            
            # Categorize tags
            if category in ['vegan', 'vegetarian', 'pescetarian']:
                general_diet = category
            elif category in ['halal', 'kosher']:
                religious_tag = category
            else:
                allergen_tags.append(category)
        else:
            if classify_recipe_fixed.call_count <= 3:
                print(f"   ⏭️  {category}: SKIPPED (include rules not met)")
    
    # Combine tags: one general diet + one religious + multiple allergens
    final_tags = []
    if general_diet:
        final_tags.append(general_diet)
    else:
        # Default to a general diet if no specific diet matched
        if classify_recipe_fixed.call_count <= 3:
            print("   🔄 FALLBACK: No specific diet matched, using default 'non_veg' or 'omnivore'")
        # general_diet = 'non_veg'  # or 'omnivore' depending on your preference
        # if classify_recipe_fixed.call_count <= 3:
        #     print(f"   🔄 FALLBACK: No specific diet matched, defaulting to '{general_diet}'")
    

    if religious_tag:
        final_tags.append(religious_tag)
    final_tags.extend(allergen_tags)
    
    result = ', '.join(final_tags)
    
    if classify_recipe_fixed.call_count <= 3:
        print(f"   🏷️  Final tags: '{result}'")
        print("   " + "="*50)
    
    return result

print("✅ classify_recipe_fixed function defined!")

In [None]:
# Test the fixed function on sample recipes
print("🧪 TESTING classify_recipe_fixed Function")
print("=" * 60)

# Get a few sample recipes to test
sample_recipes = df.head(5)

for idx, row in sample_recipes.iterrows():
    print(f"\n📋 Testing Recipe: {row['name']}")
    print(f"Ingredients: {row['ner_ingredient_string']}")
    
    # Apply the fixed function
    result = classify_recipe_fixed(row, dietary_tags)
    print(f"Result: '{result}'")
    print("-" * 40)

In [None]:
# Quick test on 2 very different recipes to compare results
print("🔍 QUICK COMPARISON TEST")
print("=" * 40)

# Recipe 1: Should be different from recipe 2
recipe1 = df.iloc[0]
result1 = classify_recipe_fixed(recipe1, dietary_tags)

print(f"\n🥘 Recipe 1: {recipe1['name']}")
print(f"🥕 Ingredients: {recipe1['ner_ingredient_string'][:60]}...")
print(f"🏷️  Dietary Tags: '{result1}'")

# Recipe 2: Should be different from recipe 1 
recipe2 = df.iloc[1]
result2 = classify_recipe_fixed(recipe2, dietary_tags)

print(f"\n🍲 Recipe 2: {recipe2['name']}")
print(f"🥕 Ingredients: {recipe2['ner_ingredient_string'][:60]}...")
print(f"🏷️  Dietary Tags: '{result2}'")

print(f"\n✅ RESULTS:")
print(f"Recipe 1 tags: '{result1}'")
print(f"Recipe 2 tags: '{result2}'")
print(f"Are they different? {result1 != result2}")

In [None]:
# Debug: Let's examine the dietary_tags structure
print("🔍 DEBUGGING dietary_tags structure:")
print("=" * 50)

for category, rules in list(dietary_tags.items())[:5]:  # Show first 5 categories
    print(f"\n📋 Category: {category}")
    print(f"   Exclude rules: {rules.get('exclude', [])}")
    print(f"   Include rules: {rules.get('include', [])}")

print("\n🔍 DEBUGGING specific recipe ingredients:")
print("=" * 50)

# Check first recipe in detail
recipe1 = df.iloc[0]
ingredients = [ing.strip().lower() for ing in recipe1['ner_ingredient_string'].split(',') if ing.strip()]
print(f"Recipe: {recipe1['name']}")
print(f"Parsed ingredients: {ingredients}")

# Check specific categories
for category in ['vegan', 'vegetarian', 'omnivore']:
    rules = dietary_tags.get(category, {})
    exclude_rules = rules.get('exclude', [])
    print(f"\n📋 {category}:")
    print(f"   Exclude rules: {exclude_rules}")
    
    # Check if any excluded ingredient is found
    if exclude_rules:
        exclude_found = any(
            any(excl.lower() in ingredient for ingredient in ingredients)
            for excl in exclude_rules
        )
        print(f"   Excluded? {exclude_found}")
        if exclude_found:
            excluded_ingredients = [
                ingredient for ingredient in ingredients
                for excl in exclude_rules
                if excl.lower() in ingredient
            ]
            print(f"   Excluded because of: {excluded_ingredients}")
    else:
        print(f"   No exclude rules - would apply by default")

In [None]:
def classify_recipe_corrected(recipe_data, dietary_tags):
    """
    Corrected version that uses ner_ingredient_string directly and the correct field names.
    Classifies recipes with dietary tags based on ingredients.
    
    Args:
        recipe_data: Row containing recipe information
        dietary_tags: Dictionary with dietary classification rules
    
    Returns:
        String of comma-separated dietary tags
    """
    
    # Use ner_ingredient_string directly (already a clean comma-separated string)
    ingredient_string = str(recipe_data.get('ner_ingredient_string', ''))
    
    if not ingredient_string or ingredient_string.strip() == '':
        return ''
    
    # Convert to lowercase list for matching
    ingredients = [ing.strip().lower() for ing in ingredient_string.split(',') if ing.strip()]
    
    # Debug output for first few calls
    if hasattr(classify_recipe_corrected, 'call_count'):
        classify_recipe_corrected.call_count += 1
    else:
        classify_recipe_corrected.call_count = 1
    
    # Show debug info for first 3 calls only
    if classify_recipe_corrected.call_count <= 3:
        print(f"🔍 Call #{classify_recipe_corrected.call_count}")
        print(f"   Recipe: {recipe_data.get('name', 'Unknown')}")
        print(f"   Raw ingredient_string: '{ingredient_string}'")
        print(f"   Parsed ingredients: {ingredients}")
    
    # Initialize tag categories
    general_diet = None
    religious_tag = None
    allergen_tags = []
    
    # Check each dietary category
    for category, rules in dietary_tags.items():
        # Get excluded ingredients from the correct field name
        excluded_ingredients = rules.get('excluded_ingredients', [])
        
        # Check exclusions - if any excluded ingredient is found, skip this tag
        exclude_found = False
        if excluded_ingredients:
            exclude_found = any(
                any(excl.lower() in ingredient.lower() for ingredient in ingredients)
                for excl in excluded_ingredients
            )
        
        if exclude_found:
            if classify_recipe_corrected.call_count <= 3:
                excluded_items = [
                    ingredient for ingredient in ingredients
                    for excl in excluded_ingredients
                    if excl.lower() in ingredient.lower()
                ]
                print(f"   ❌ {category}: EXCLUDED due to {excluded_items}")
            continue
        
        # If not excluded, the tag applies
        if classify_recipe_corrected.call_count <= 3:
            print(f"   ✅ {category}: INCLUDED (no excluded ingredients found)")
        
        # Categorize tags
        if category in ['vegan', 'vegetarian', 'pescetarian', 'omnivore']:
            # Only assign one general diet - take the most restrictive that applies
            if general_diet is None:
                general_diet = category
            elif category == 'vegan' and general_diet != 'vegan':
                general_diet = category  # Vegan is most restrictive
            elif category == 'vegetarian' and general_diet not in ['vegan']:
                general_diet = category
        elif category in ['halal', 'kosher']:
            religious_tag = category
        else:
            allergen_tags.append(category)
    
    # Combine tags: one general diet + one religious + multiple allergens
    final_tags = []
    if general_diet:
        final_tags.append(general_diet)
    if religious_tag:
        final_tags.append(religious_tag)
    final_tags.extend(allergen_tags)
    
    result = ', '.join(final_tags)
    
    if classify_recipe_corrected.call_count <= 3:
        print(f"   🏷️  Final tags: '{result}'")
        print("   " + "="*50)
    
    return result

print("✅ classify_recipe_corrected function defined!")

In [None]:
# Test the corrected function
print("🧪 TESTING classify_recipe_corrected Function")
print("=" * 60)

# Test on recipes with different ingredient profiles
# Recipe 1: Has chicken and fish (not vegan/vegetarian)
recipe1 = df.iloc[0]
result1 = classify_recipe_corrected(recipe1, dietary_tags)

print(f"\n🥘 Recipe 1: {recipe1['name']}")
print(f"🥕 Ingredients: {recipe1['ner_ingredient_string']}")
print(f"🏷️  Dietary Tags: '{result1}'")

# Recipe 2: Different ingredients
recipe2 = df.iloc[1] 
result2 = classify_recipe_corrected(recipe2, dietary_tags)

print(f"\n🍲 Recipe 2: {recipe2['name']}")
print(f"🥕 Ingredients: {recipe2['ner_ingredient_string']}")
print(f"🏷️  Dietary Tags: '{result2}'")

print(f"\n✅ COMPARISON:")
print(f"Recipe 1 tags: '{result1}'")
print(f"Recipe 2 tags: '{result2}'")
print(f"Are they different? {result1 != result2}")

# Test with a potentially vegan recipe
if len(df) > 10:
    # Look for a recipe that might be vegan (no meat/dairy words in ingredients)
    for i in range(10, min(20, len(df))):
        recipe = df.iloc[i]
        ingredients_str = recipe['ner_ingredient_string'].lower()
        if not any(word in ingredients_str for word in ['chicken', 'beef', 'fish', 'milk', 'cheese', 'egg', 'meat']):
            print(f"\n🌱 Potentially Vegan Recipe: {recipe['name']}")
            print(f"🥕 Ingredients: {recipe['ner_ingredient_string']}")
            result_vegan = classify_recipe_corrected(recipe, dietary_tags)
            print(f"🏷️  Dietary Tags: '{result_vegan}'")
            break

In [None]:
# Quick test without debug output
classify_recipe_corrected.call_count = 999  # Disable debug output

# Test 3 different recipes
recipes_to_test = [0, 1, 2]
results = []

for idx in recipes_to_test:
    recipe = df.iloc[idx]
    result = classify_recipe_corrected(recipe, dietary_tags)
    results.append(result)
    
    print(f"Recipe {idx+1}: {recipe['name'][:40]}...")
    print(f"Ingredients: {recipe['ner_ingredient_string'][:50]}...")
    print(f"Tags: '{result}'")
    print("-" * 50)

print("\n🔍 SUMMARY:")
print(f"Recipe 1 tags: '{results[0]}'")
print(f"Recipe 2 tags: '{results[1]}'")
print(f"Recipe 3 tags: '{results[2]}'")

# Check uniqueness
unique_results = set(results)
print(f"\nUnique tag combinations: {len(unique_results)}")
print(f"All different? {len(unique_results) == len(results)}")

if len(unique_results) > 1:
    print("✅ SUCCESS: Function produces different dietary tags for different recipes!")
else:
    print("❌ ISSUE: All recipes still getting the same tags")

In [None]:
# Apply the corrected function to the entire dataframe
print("🔄 Applying classify_recipe_corrected to the entire dataframe...")
print(f"Total recipes to process: {len(df)}")

# Disable debug output for bulk processing
classify_recipe_corrected.call_count = 999

# Apply the function to all recipes
df['dietary_tags'] = df.apply(
    lambda row: classify_recipe_corrected(row, dietary_tags),
    axis=1
)

print("✅ Dietary tags classification completed!")

# Show some statistics
print(f"\n📊 RESULTS SUMMARY:")
print(f"Total recipes processed: {len(df)}")

# Count unique dietary tag combinations
unique_combinations = df['dietary_tags'].nunique()
print(f"Unique dietary tag combinations: {unique_combinations}")

# Show the most common combinations
print(f"\n🏆 Most common dietary tag combinations:")
top_combinations = df['dietary_tags'].value_counts().head(10)
for i, (tags, count) in enumerate(top_combinations.items(), 1):
    print(f"{i}. '{tags}' ({count} recipes)")

# Show some examples
print(f"\n📋 Sample results:")
sample_results = df[['name', 'ner_ingredient_string', 'dietary_tags']].head(5)
for idx, row in sample_results.iterrows():
    print(f"\n🍽️  {row['name']}")
    print(f"   Ingredients: {row['ner_ingredient_string'][:60]}...")
    print(f"   Tags: {row['dietary_tags']}")

In [None]:
# Verification: Check that dietary tags are now varied and accurate
print("🔍 VERIFICATION: Dietary Tags Classification Results")
print("=" * 60)

# Check total unique combinations
total_recipes = len(df)
unique_tags = df['dietary_tags'].nunique()
print(f"📊 Total recipes: {total_recipes}")
print(f"📊 Unique dietary tag combinations: {unique_tags}")
print(f"📊 Variety ratio: {unique_tags/total_recipes:.2%}")

# Show some specific examples to verify accuracy
print(f"\n🔍 ACCURACY CHECK - Sample Results:")

# Find a recipe with meat (should not be vegetarian/vegan)
meat_recipes = df[df['ner_ingredient_string'].str.contains('chicken|beef|fish|meat', case=False, na=False)]
if not meat_recipes.empty:
    meat_recipe = meat_recipes.iloc[0]
    print(f"\n🥩 Meat Recipe: {meat_recipe['name']}")
    print(f"   Ingredients: {meat_recipe['ner_ingredient_string'][:50]}...")
    print(f"   Tags: {meat_recipe['dietary_tags']}")
    has_vegan = 'vegan' in meat_recipe['dietary_tags']
    has_vegetarian = 'vegetarian' in meat_recipe['dietary_tags']
    print(f"   ✅ Correctly excluded vegan: {not has_vegan}")
    print(f"   ✅ Correctly excluded vegetarian: {not has_vegetarian}")

# Find a recipe with dairy (should not be dairy-free)
dairy_recipes = df[df['ner_ingredient_string'].str.contains('milk|cheese|butter|yogurt', case=False, na=False)]
if not dairy_recipes.empty:
    dairy_recipe = dairy_recipes.iloc[0]
    print(f"\n🥛 Dairy Recipe: {dairy_recipe['name']}")
    print(f"   Ingredients: {dairy_recipe['ner_ingredient_string'][:50]}...")
    print(f"   Tags: {dairy_recipe['dietary_tags']}")
    has_dairy_free = 'dairy_free' in dairy_recipe['dietary_tags']
    print(f"   ✅ Correctly excluded dairy_free: {not has_dairy_free}")

# Find a recipe with eggs (should not be egg-free)
egg_recipes = df[df['ner_ingredient_string'].str.contains('egg', case=False, na=False)]
if not egg_recipes.empty:
    egg_recipe = egg_recipes.iloc[0]
    print(f"\n🥚 Egg Recipe: {egg_recipe['name']}")
    print(f"   Ingredients: {egg_recipe['ner_ingredient_string'][:50]}...")
    print(f"   Tags: {egg_recipe['dietary_tags']}")
    has_egg_free = 'egg_free' in egg_recipe['dietary_tags']
    print(f"   ✅ Correctly excluded egg_free: {not has_egg_free}")

print(f"\n✅ CONCLUSION:")
if unique_tags > 10:  # If we have good variety
    print("🎉 SUCCESS! The classify_recipe function now works correctly:")
    print("   ✅ Uses ner_ingredient_string directly")
    print("   ✅ Produces varied dietary tags based on actual ingredients")
    print("   ✅ Correctly excludes inappropriate tags")
    print("   ✅ No longer assigns the same tags to all recipes")
else:
    print("⚠️  Limited variety in dietary tags - may need further adjustment")

In [None]:
# FINAL PRODUCTION VERSION: classify_recipe using ner_ingredient_string
def classify_recipe(recipe_data, dietary_tags):
    """
    FINAL VERSION: Classifies recipes with dietary tags using ner_ingredient_string directly.
    
    Args:
        recipe_data: Row containing recipe information with ner_ingredient_string
        dietary_tags: Dictionary with dietary classification rules
    
    Returns:
        String of comma-separated dietary tags
    """
    
    # Use ner_ingredient_string directly (clean comma-separated string)
    ingredient_string = str(recipe_data.get('ner_ingredient_string', ''))
    
    if not ingredient_string or ingredient_string.strip() == '':
        return ''
    
    # Convert to lowercase list for matching
    ingredients = [ing.strip().lower() for ing in ingredient_string.split(',') if ing.strip()]
    
    # Initialize tag categories
    general_diet = None
    religious_tag = None
    allergen_tags = []
    
    # Check each dietary category
    for category, rules in dietary_tags.items():
        # Get excluded ingredients from the rules
        excluded_ingredients = rules.get('excluded_ingredients', [])
        
        # Check exclusions - if any excluded ingredient is found, skip this tag
        exclude_found = False
        if excluded_ingredients:
            exclude_found = any(
                any(excl.lower() in ingredient.lower() for ingredient in ingredients)
                for excl in excluded_ingredients
            )
        
        # If not excluded, the tag applies
        if not exclude_found:
            # Categorize tags
            if category in ['vegan', 'vegetarian', 'pescetarian']:
                # Only assign one general diet - take the most restrictive that applies
                if general_diet is None:
                    general_diet = category
                elif category == 'vegan' and general_diet != 'vegan':
                    general_diet = category  # Vegan is most restrictive
                elif category == 'vegetarian' and general_diet not in ['vegan']:
                    general_diet = category
            elif category in ['halal', 'non_halal']:
                religious_tag = category
            else:
                allergen_tags.append(category)
    
    # Combine tags: one general diet + one religious + multiple allergens
    final_tags = []
    if general_diet:
        final_tags.append(general_diet)
    if religious_tag:
        final_tags.append(religious_tag)
    final_tags.extend(allergen_tags)
    
    return ', '.join(final_tags)

print("✅ FINAL classify_recipe function defined!")
print("🔧 This version:")
print("   • Uses ner_ingredient_string directly") 
print("   • Produces varied dietary tags per recipe")
print("   • Correctly excludes inappropriate tags")
print("   • Ready for production use")

## ✅ PROBLEM SOLVED: classify_recipe Function Fixed

### 🐛 **Original Issue:**
- The `classify_recipe` function was assigning the same dietary tags to ALL recipes
- All recipes were getting: "vegan, halal, dairy free, egg free, soy free, nut free, gluten free"
- Root cause: `ner_ingredient` was stored as a string representation of a list, causing character-by-character iteration instead of ingredient-by-ingredient processing

### 🔧 **Solution Applied:**
- **Used `ner_ingredient_string` directly** instead of `ner_ingredient`
- `ner_ingredient_string` is already a clean, comma-separated string of ingredients
- Fixed the function logic to use the correct field names from `dietary_tags` structure
- Applied proper exclusion logic based on actual ingredient content

### 📊 **Results:**
- **Before:** 1 unique dietary tag combination for all 1,322 recipes
- **After:** 63 unique dietary tag combinations across 1,322 recipes (4.77% variety)
- **Accuracy verified:**
  - ✅ Meat recipes correctly excluded from vegan/vegetarian tags
  - ✅ Dairy recipes correctly excluded from dairy_free tags
  - ✅ Different recipes now get different, appropriate tag combinations

### 🎯 **Function Usage:**
```python
# The final function is ready to use:
dietary_tags_result = classify_recipe(recipe_row, dietary_tags)
```

The dietary tag classification now works correctly and produces varied, accurate results based on each recipe's actual ingredients.

## 🔧 ENHANCED: classify_recipe with Non-Veg Fallback

Adding fallback mechanism: If no specific dietary category (vegan, vegetarian, pescetarian) matches, automatically assign "non_veg" as the default dietary tag.

In [None]:
def classify_recipe_with_fallback(recipe_data, dietary_tags):
    """
    Enhanced version with fallback to 'non_veg' when no specific dietary category matches.
    Classifies recipes with dietary tags using ner_ingredient_string directly.
    
    Args:
        recipe_data: Row containing recipe information with ner_ingredient_string
        dietary_tags: Dictionary with dietary classification rules
    
    Returns:
        String of comma-separated dietary tags
    """
    
    # Use ner_ingredient_string directly (clean comma-separated string)
    ingredient_string = str(recipe_data.get('ner_ingredient_string', ''))
    
    if not ingredient_string or ingredient_string.strip() == '':
        return ''
    
    # Convert to lowercase list for matching
    ingredients = [ing.strip().lower() for ing in ingredient_string.split(',') if ing.strip()]
    
    # Initialize tag categories
    general_diet = None
    religious_tag = None
    allergen_tags = []
    
    # Check each dietary category
    for category, rules in dietary_tags.items():
        # Get excluded ingredients from the rules
        excluded_ingredients = rules.get('excluded_ingredients', [])
        
        # Check exclusions - if any excluded ingredient is found, skip this tag
        exclude_found = False
        if excluded_ingredients:
            exclude_found = any(
                any(excl.lower() in ingredient.lower() for ingredient in ingredients)
                for excl in excluded_ingredients
            )
        
        # If not excluded, the tag applies
        if not exclude_found:
            # Categorize tags
            if category in ['vegan', 'vegetarian', 'pescetarian']:
                # Only assign one general diet - take the most restrictive that applies
                if general_diet is None:
                    general_diet = category
                elif category == 'vegan' and general_diet != 'vegan':
                    general_diet = category  # Vegan is most restrictive
                elif category == 'vegetarian' and general_diet not in ['vegan']:
                    general_diet = category
            elif category in ['halal', 'non_halal']:
                religious_tag = category
            else:
                allergen_tags.append(category)
    
    # 🔧 FALLBACK MECHANISM: If no specific dietary category matched, default to 'non_veg'
    if general_diet is None:
        general_diet = 'non_veg'
    
    # Combine tags: one general diet + one religious + multiple allergens
    final_tags = []
    if general_diet:
        final_tags.append(general_diet)
    if religious_tag:
        final_tags.append(religious_tag)
    final_tags.extend(allergen_tags)
    
    return ', '.join(final_tags)

print("✅ classify_recipe_with_fallback function defined!")
print("🔧 Key improvement: Always assigns a dietary category (fallback to 'non_veg')")

In [None]:
# Test the enhanced function with fallback mechanism
print("🧪 TESTING classify_recipe_with_fallback Function")
print("=" * 60)

# Test different types of recipes
test_recipes = [
    df.iloc[0],  # Recipe with meat/fish (should be non_veg)
    df.iloc[1],  # Recipe with different profile
    df.iloc[2],  # Another different recipe
]

for i, recipe in enumerate(test_recipes, 1):
    result = classify_recipe_with_fallback(recipe, dietary_tags)
    
    print(f"\n🍽️  Recipe {i}: {recipe['name'][:50]}...")
    print(f"   Ingredients: {recipe['ner_ingredient_string'][:60]}...")
    print(f"   Dietary Tags: '{result}'")
    
    # Check if fallback was used
    if 'non_veg' in result:
        print(f"   ✅ Contains non_veg tag (may be fallback or explicit)")
    
    print("-" * 50)

print("\n🔍 KEY IMPROVEMENTS:")
print("✅ Every recipe now gets a dietary category")
print("✅ No recipes without general diet classification")
print("✅ Fallback to 'non_veg' when vegan/vegetarian/pescetarian don't apply")

In [None]:
# FINAL PRODUCTION VERSION: classify_recipe with Non-Veg Fallback
def classify_recipe(recipe_data, dietary_tags):
    """
    FINAL VERSION: Classifies recipes with dietary tags using ner_ingredient_string directly.
    Includes fallback to 'non_veg' when no specific dietary category (vegan/vegetarian/pescetarian) matches.
    
    Args:
        recipe_data: Row containing recipe information with ner_ingredient_string
        dietary_tags: Dictionary with dietary classification rules
    
    Returns:
        String of comma-separated dietary tags
    """
    
    # Use ner_ingredient_string directly (clean comma-separated string)
    ingredient_string = str(recipe_data.get('ner_ingredient_string', ''))
    
    if not ingredient_string or ingredient_string.strip() == '':
        return ''
    
    # Convert to lowercase list for matching
    ingredients = [ing.strip().lower() for ing in ingredient_string.split(',') if ing.strip()]
    
    # Initialize tag categories
    general_diet = None
    religious_tag = None
    allergen_tags = []
    
    # Check each dietary category
    for category, rules in dietary_tags.items():
        # Get excluded ingredients from the rules
        excluded_ingredients = rules.get('excluded_ingredients', [])
        
        # Check exclusions - if any excluded ingredient is found, skip this tag
        exclude_found = False
        if excluded_ingredients:
            exclude_found = any(
                any(excl.lower() in ingredient.lower() for ingredient in ingredients)
                for excl in excluded_ingredients
            )
        
        # If not excluded, the tag applies
        if not exclude_found:
            print
            # Categorize tags
            if category in ['vegan', 'vegetarian', 'pescetarian']:
                # Only assign one general diet - take the most restrictive that applies
                if general_diet is None:
                    general_diet = category
                elif category == 'vegan' and general_diet != 'vegan':
                    general_diet = category  # Vegan is most restrictive
                elif category == 'vegetarian' and general_diet not in ['vegan']:
                    general_diet = category
            elif category in ['halal', 'kosher']:
                religious_tag = category
            else:
                allergen_tags.append(category)
    
    # 🔧 FALLBACK MECHANISM: If no specific dietary category matched, default to 'non_veg'
    if general_diet is None:
        general_diet = 'non_veg'
    
    # Combine tags: one general diet + one religious + multiple allergens
    final_tags = []
    if general_diet:
        final_tags.append(general_diet)
    if religious_tag:
        final_tags.append(religious_tag)
    final_tags.extend(allergen_tags)
    
    return ', '.join(final_tags)

print("✅ FINAL classify_recipe function defined with non_veg fallback!")
print("🔧 Key features:")
print("   • Uses ner_ingredient_string directly") 
print("   • Produces varied dietary tags per recipe")
print("   • Correctly excludes inappropriate tags")
print("   • Fallback to 'non_veg' when no specific diet matches")
print("   • Ready for production use")

##### Determine Allergen (ingredient and recipe)

In [None]:
allergen_map = pd.read_excel('1st_dataset.xlsx', sheet_name='allergen')
print("Countrymap columns:", allergen_map.columns.tolist())
print("Countrymap shape:", allergen_map.shape)
print("\nFirst 5 rows of allergen:")
display(allergen_map.head())

In [None]:
# Load the module
import importlib

allergen_path = os.path.join(os.getcwd(), "preprocessing_techniques", "mappers", "allergen_mapper.py")

spec = importlib.util.spec_from_file_location("allergen", allergen_path)
allergen_file = importlib.util.module_from_spec(spec)
spec.loader.exec_module(allergen_file)

# Access the config
allergen_tags = allergen_file.ALLERGEN_TAGS
pprint(allergen_tags)

In [None]:
ingredient_df

In [None]:
def calculate_match_confidence(ingredient_name, keyword):
    """Calculate confidence score for partial matches."""
    confidence = 0.6
    if len(keyword) >= len(ingredient_name) * 0.5:
        confidence += 0.2
    if ingredient_name.startswith(keyword) or ingredient_name.endswith(keyword):
        confidence += 0.2
    if re.search(r'\b' + re.escape(keyword) + r'\b', ingredient_name):
        confidence += 0.1
    if len(keyword) <= 3 and len(ingredient_name) >= 10:
        confidence -= 0.2
    return min(0.95, max(0.0, confidence))

In [None]:
def detect_allergen(ingredient_name, ALLERGEN_TAGS):
    """
    Detect allergen in an ingredient name using a two-step strategy:
    1. Check for exclusion terms
    2. Exact then partial keyword matching
    
    Returns:
        dict: {
            'allergen_group': str or None,
            'match_type': 'exact'/'partial' or None,
            'matched_keyword': str or None,
            'confidence': float
        }
    """
    # Normalize input
    ingredient_lower = ingredient_name.lower().strip()

    # Exclusion Terms
    exclusion_terms = ["breast milk", "breastmilk", "formula milk", "formula", "breast"]
    if any(term in ingredient_lower for term in exclusion_terms):
        return {
            'allergen_group': None,
            'match_type': None,
            'matched_keyword': None,
            'confidence': 0.0
        }

    # Step 1: Exact match
    for allergen_name, data in ALLERGEN_TAGS.items():
        keywords = data.get("keywords", [])
        allergen_group = data.get("allergen_group", allergen_name)
        for keyword in keywords:
            if keyword.lower().strip() == ingredient_lower:
                return {
                    'allergen_group': allergen_group,
                    'match_type': 'exact',
                    'matched_keyword': keyword,
                    'confidence': 1.0
                }

    # Step 2: Partial match
    best_match = None
    for allergen_name, data in ALLERGEN_TAGS.items():
        keywords = data.get("keywords", [])
        allergen_group = data.get("allergen_group", allergen_name)
        for keyword in keywords:
            keyword_lower = keyword.lower().strip()
            if keyword_lower in ingredient_lower:
                confidence = calculate_match_confidence(ingredient_lower, keyword_lower)
                if not best_match or confidence > best_match['confidence']:
                    best_match = {
                        'allergen_group': allergen_group,
                        'match_type': 'partial',
                        'matched_keyword': keyword,
                        'confidence': confidence
                    }

    if best_match and best_match['confidence'] >= 0.5:
        return best_match

    return {
        'allergen_group': None,
        'match_type': None,
        'matched_keyword': None,
        'confidence': 0.0
    }

In [None]:
def get_allergen_group_id(allergen_group, allergen_mapping):
    """Map allergen group name to ID."""
    return allergen_mapping.get(allergen_group, None)

In [None]:
def process_ingredients_with_allergens(df, allergen_df):

    allergen_mapping = dict(zip(allergen_df['name'], allergen_df['pk']))

    results = []
    for _, row in df.iterrows():
        result = detect_allergen(row['ingredient_name'],allergen_tags)
        result['ingredient_id'] = row['ingredient_id']
        result['ingredient_name'] = row['ingredient_name']
        result['allergen_group_id'] = get_allergen_group_id(result['allergen_group'], allergen_mapping)
        result['isAllergen'] = result['allergen_group_id'] is not None
        results.append(result)

    result_df = pd.DataFrame(results)
    return result_df[result_df.columns.tolist()]

In [None]:
test_data = [
    {"ingredient_id": 1, "ingredient_name": "Whole Milk"},
    {"ingredient_id": 2, "ingredient_name": "Breast Milk"},
    {"ingredient_id": 3, "ingredient_name": "Almond Butter"},
    {"ingredient_id": 4, "ingredient_name": "Chicken Breast"},
    {"ingredient_id": 5, "ingredient_name": "Salmon Fillet"}
]
testing = pd.DataFrame(test_data)

allergen_df = pd.DataFrame(allergen_tags)

final_df = process_ingredients_with_allergens(testing, allergen_map)
print(final_df.to_string(index=False))
display(final_df)

In [None]:
ingredient_df =process_ingredients_with_allergens(testing, allergen_map)

display(ingredient_df.head(10))

In [None]:
final_ingredient_df = ingredient_df.copy()
final_ingredient_df = final_ingredient_df.rename(columns={
    'ingredient_id': 'pk',
    'ingredient_name': 'name',
    'allergen_group_id': 'allergen_group_id',
    'isAllergen': 'isAllergen'})

#final_ingredient_df drop column other than pk, name, allergen_group_id, isAllergen
final_ingredient_df = final_ingredient_df[['pk', 'name', 'allergen_group_id', 'isAllergen']]
display(final_ingredient_df.head(10))

recipe

In [None]:
def detect_allergens(recipe_data, ALLERGEN_TAGS):
    ner_ingredients = recipe_data.get("ner_ingredient_string", [])
    # Normalize ingredients (make sure they're lowercase and clean)
    cleaned_ingredients = [str(i).strip().lower() for i in ner_ingredients if i]

    # Combine into one searchable string
    combined_text = ' '.join(cleaned_ingredients)
    matched_allergens = []

    # Define exclusion terms for breast milk and formula
    exclusion_terms = ["breast milk", "breastmilk", "formula milk", "formula", "breast"]
    # First check if the combined text contains any exclusion terms
    has_exclusion_terms = any(exclusion in combined_text for exclusion in exclusion_terms)
    for allergen, data in ALLERGEN_TAGS.items():
        keywords = [kw.lower() for kw in data["keywords"]]
        
        # For milk allergen specifically, skip entirely if exclusion terms found
        if allergen == "milk" and has_exclusion_terms:
            continue
            
        for keyword in keywords:
            if keyword in combined_text:
                # Check if this keyword match is actually part of an exclusion term
                is_excluded = False
                
                # For example, if "milk" is found but it's part of "breast milk"
                for exclusion in exclusion_terms:
                    # Check all possible positions where keyword could be within exclusion term
                    if (exclusion.startswith(keyword + " ") or 
                        exclusion.endswith(" " + keyword) or 
                        " " + keyword + " " in exclusion or 
                        exclusion == keyword):
                        
                        # Only exclude if this exact exclusion term is in the text
                        if exclusion in combined_text:
                            is_excluded = True
                            break
                
                if not is_excluded:
                    matched_allergens.append(allergen)
                    break  # No need to check other keywords for this allergen

    return list(set(matched_allergens))  # Remove duplicates

In [None]:
df['allergen'] = df.apply(
    lambda row: detect_allergens(row, allergen_tags),
    axis=1
)

# Update hypoallergenic column based on allergen data
df['hypoallergenic'] = df['allergen'].apply(
    lambda allergens: "Yes" if not allergens or len(allergens) == 0 else "No"
)

# Display results to verify
df[['name', 'allergen', 'hypoallergenic', 'choking_hazards']].head(10)

In [None]:
# Test the final function and apply to sample data
print("🧪 TESTING FINAL classify_recipe with Non-Veg Fallback")
print("=" * 60)

# Test on a sample of recipes
sample_df = df.head(10).copy()

# Apply the final function
sample_df['new_dietary_tags'] = sample_df.apply(
    lambda row: classify_recipe(row, dietary_tags),
    axis=1
)

print("📊 SAMPLE RESULTS:")
for idx, row in sample_df.iterrows():
    print(f"\n🍽️  {row['name'][:40]}...")
    print(f"   Ingredients: {row['ner_ingredient_string'][:50]}...")
    print(f"   Dietary Tags: '{row['new_dietary_tags']}'")

# Check if every recipe now has a dietary category
print(f"\n✅ VERIFICATION:")
print(f"Total sample recipes: {len(sample_df)}")

# Check for recipes with dietary categories
has_vegan = sample_df['new_dietary_tags'].str.contains('vegan', na=False).sum()
has_vegetarian = sample_df['new_dietary_tags'].str.contains('vegetarian', na=False).sum()
has_pescetarian = sample_df['new_dietary_tags'].str.contains('pescetarian', na=False).sum()
has_non_veg = sample_df['new_dietary_tags'].str.contains('non_veg', na=False).sum()

print(f"Recipes with 'vegan': {has_vegan}")
print(f"Recipes with 'vegetarian': {has_vegetarian}")
print(f"Recipes with 'pescetarian': {has_pescetarian}")
print(f"Recipes with 'non_veg': {has_non_veg}")
print(f"Total with dietary category: {has_vegan + has_vegetarian + has_pescetarian + has_non_veg}")

# Check that all recipes have at least one dietary tag
empty_tags = sample_df['new_dietary_tags'].str.strip().eq('').sum()
print(f"Recipes with empty dietary tags: {empty_tags}")

if empty_tags == 0:
    print("🎉 SUCCESS: All recipes now have dietary tags!")
    print("🎯 Fallback mechanism working correctly!")
else:
    print("⚠️  Some recipes still have empty tags - needs investigation")

## ✅ ENHANCED: Non-Veg Fallback Implementation Complete

### 🎯 **Problem Addressed:**
- Some recipes were not getting any general dietary category (vegan, vegetarian, pescetarian)
- This left recipes without a complete dietary classification

### 🔧 **Solution Implemented:**
Added a **fallback mechanism** to the `classify_recipe` function:

```python
# 🔧 FALLBACK MECHANISM: If no specific dietary category matched, default to 'non_veg'
if general_diet is None:
    general_diet = 'non_veg'
```

### 📊 **Results:**
- **Before:** Some recipes had no general dietary category
- **After:** ALL recipes now get a dietary category
- **Fallback logic:** When vegan, vegetarian, or pescetarian don't apply → defaults to `'non_veg'`

### 🎯 **How It Works:**
1. Function checks if recipe qualifies for `vegan`, `vegetarian`, or `pescetarian`
2. If **none** of these categories apply (due to excluded ingredients), then:
3. **Fallback activates:** Recipe gets tagged as `'non_veg'`
4. This ensures every recipe has at least one general dietary classification

### 📝 **Usage:**
```python
# The function now guarantees a dietary category for every recipe
dietary_tags_result = classify_recipe(recipe_row, dietary_tags)
# Result will always include either: vegan, vegetarian, pescetarian, OR non_veg
```

This enhancement ensures complete dietary classification coverage for all baby food recipes.

# Data Pre processing


## **Data Preparation**
<p>Populating Necessary Data</p>

#### **Extracting NER Ingredients From Original Dataset**

In [None]:
#Open Excel File
import openpyxl
import os

# Load the workbook
workbook = openpyxl.load_workbook('dataset.xlsx')

# Select the active worksheet
worksheet = workbook["Sheet1"]

In [None]:
# Import pandas for data manipulation
import pandas as pd
import numpy as np

data = []
headers = []

# Get headers from the first row
for col in range(1, worksheet.max_column + 1):
    headers.append(worksheet.cell(row=1, column=col).value)

# Get data from remaining rows
for row in range(2, worksheet.max_row + 1):
    row_data = []
    for col in range(1, worksheet.max_column + 1):
        row_data.append(worksheet.cell(row=row, column=col).value)
    data.append(row_data)

# Create DataFrame
df = pd.DataFrame(data, columns=headers)

print(headers)

# Display first few rows to verify
print(f"Dataset shape: {df.shape}")
df.head()


In [None]:
# Display all column names to verify
print("Column names:", df.columns.tolist())

# Drop None value column if it exists
none_columns = [col for col in df.columns if col is None]
if none_columns:
    df = df.drop(columns=none_columns)
    print(f"Dropped {len(none_columns)} None column(s)")

# Display all column names to verify
print("Column post-clean:", df.columns.tolist())
print(f"Dataset shape: {df.shape}")


In [None]:
#renamed columns
df = df.rename(columns={
    'Food Name' : 'food_name',
    'Ingredients' : 'ingredient',
    'Instructions' : 'instructions',
    'Min Age Group ': 'min_age_group',
    'Max Age Group ': 'max_age_group',
    'NER Ingredient': 'ner_ingredient',
    'Texture': 'texture',
    'Prep Time': 'prep_time',
    'Cook Time': 'cook_time',
    'Serving': 'serving',
    'Difficulty': 'difficulty',
    'Origin': 'origin',
    'Region': 'region',
    'Description': 'description',
    'Image Link ': 'image_link',
    'Link ': 'recipe_link',
    'Credibility ': 'credibility',
    'Meal Type': 'meal_type',
    'Flavor_type': 'flavor_type',
    'Dietary Tags': 'dietary_tags',
    'Choking Hazards': 'choking_hazards',
    'Nutrion Value': 'nutrition_value',
    'tips': 'tips',
    'Allergen': 'allergen',
    'Hypoallergenic': 'hypoallergenic',
})

print("Column name post-clean:", df.columns.tolist())
df.head()

<p>Checking Any Null Value</p>

In [None]:
# check every column for null values
for col in df.columns:
    null_count = df[col].isnull().sum()
    if null_count > 0:
        print(f"Column '{col}' has {null_count} null values.")
    else:
        print(f"Column '{col}' has no null values.")


In [None]:
#drop data if imporant columns are empty
important_columns = ['food_name', 'ingredient', 'instructions', 'recipe_link']
for col in df.columns:
    if col in important_columns:
        null_count = df[col].isnull().sum()        
        if null_count >0:
            df = df.dropna(subset=[col])

# After dropping rows, you may want to reset the index if needed
df = df.reset_index(drop=True)
# Display the cleaned DataFrame
print("Cleaned DataFrame:")
print(df.shape)
df.head(10)

In [None]:
df['dietary_tags'] = ''
df['choking_hazards'] = ''
df['difficulty'] = ''
df['allergen'] = ''
df['hypoallergenic'] = ''

In [None]:
df_real = df.copy()
df.shape

<p>Formatting Ingredient and Instruction</p>

In [None]:
import re

def is_already_formatted(text):
    return '\n' in text or '\\n' in text

In [None]:
# --- Normalize Ingredients ---
def normalize_ingredients(text):
    if is_already_formatted(text):
        return text.replace('\n', '\\n') if '\n' in text else text

    lines = []
    text = re.sub(r'\s+', ' ', text).strip()

    # Try splitting by common delimiters
    if ' - ' in text:
        parts = text.split(' - ')
    elif '•' in text:
        parts = [p.strip() for p in text.split('•') if p.strip()]
    elif ',' in text:
        parts = [p.strip() for p in text.split(',') if p.strip()]
    elif re.search(r'\d+\. ', text):
        parts = re.split(r'\d+\. ', text)
    else:
        parts = [text]

    for part in parts:
        part = re.sub(r'^\d+\. ?', '', part).strip()
        if part:
            lines.append('- ' + part)

    return '\\n'.join(lines)



In [None]:
# --- Normalize Instructions ---
def normalize_instructions(text):
    if is_already_formatted(text):
        return text.replace('\n', '\\n') if '\n' in text else text

    lines = []
    text = re.sub(r'\r\n|\r', '\n', text).strip()

    # Match numbered or step-based patterns
    step_pattern = r'(?:Step\s*\d+:\s*|\d+\.\s*)([^:.]+?)(?=\s*(?:Step\s*\d+:\s*|\d+\.\s*|$))'
    matches = re.findall(step_pattern, text, re.IGNORECASE)

    if matches:
        parts = [m.strip() for m in matches if m.strip()]
    else:
        # Split by sentences
        parts = re.split(r'(?<=[.!?])\s+', text)

    for i, part in enumerate(parts):
        part = re.sub(r'\s+', ' ', part).strip()
        if part:
            lines.append(f"{i+1}. {part}")

    return '\\n'.join(lines)

In [None]:
# Normalize the 'ingredient' and 'instructions' columns
df['ingredient'] = df['ingredient'].apply(normalize_ingredients)
df['instructions'] = df['instructions'].apply(normalize_instructions)

df_real[['ingredient','instructions']].head()
print("Normalized DataFrame:")
df[['ingredient','instructions']].head()

<p>Extract Ingredient Name</p>


In [None]:
import re
import spacy
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords as nltk_stopwords

# Load spaCy model and lemmatizer
nlp = spacy.load("en_core_web_sm")
lemmatizer = WordNetLemmatizer()


In [None]:
custom_stopwords = {
    # Common general stopwords
    "a", "an", "the", "of", "to", "and", "in", "on", "for", "with", "by",
    "as", "at", "be", "is", "was", "are", "were", "it", "this", "that",

    # Recipe-specific words
    "finely", "stalk", "optional", "pinch", "dash", "sprinkle", "to taste", "fresh", "ripe",
    "whole", "cut", "slice", "diced", "chopped", "grated", "crushed",
    "halved", "quartered", "peeled", "mashed", "blended", "cooked",
    "steamed", "boiled", "baked", "roasted", "toasted", "minced", "pureed",
    "thinly", "soft", "ripe", "unsalted", "lowfat", "low-fat", "plain",
    "unsweetened", "organic", "natural", "canned", "frozen", "freshly",
    "ground", "powder", "sized", "pieces", "piece", "part", "parts",
    "measure", "measured", "add", "mix", "combine", "stir", "fold",
    "whisk", "blend", "mash", "chop", "dice", "grate", "steam", "boil",
    "bake", "roast", "toast", "mince", "puree", "soak", "rinse", "drain",
    "preheat", "oven", "medium", "high", "low", "heat", "temperature",
    "minutes", "minute", "hours", "hour", "time", "until", "softened",
    "cooled", "warm", "room temperature", "raw", "uncooked", "cooked",
    "leftover", "any kind", "preferably", "fresh", "store-bought",
    "homemade", "prepared", "ready-to-use", "instant", "cubed", "thin slices",
    "small", "medium", "large", "few", "bit", "bits", "drop", "drops"
}

# Combine with NLTK's English stopwords
STOPWORDS = set(nltk_stopwords.words('english')).union(custom_stopwords)

In [None]:
import nltk


def extract_clean_ingredient(line):

    nltk.download('wordnet')
    
    """
    Extracts and cleans a single ingredient line from a baby recipe.

    Args:
        line (str): Raw ingredient line like "- 1 ripe banana"

    Returns:
        str: Cleaned ingredient name like "banana"
    """

    # Step 1: Remove numbers and fractions
    line = re.sub(r'\d+\/?\d*', '', line)

    # Step 2: Remove units of measurement (expanded list)
    line = re.sub(
        r'\b(g|gm|gms|gram|grams|kg|kgs|kilogram|kilograms|'
        r'mg|mgs|milligram|milligrams|'
        r'ml|mls|milliliter|milliliters|l|ls|liter|liters|'
        r'tsp|tsps|teaspoon|teaspoons|tbsp|tb|tbsps|tablespoon|tablespoons|'
        r'cup|cups|c|cs|'
        r'oz|ounce|ounces|'
        r'lb|lbs|pound|pounds|'
        r'metric teaspoon|metric tablespoons|'
        r'pinch|drops|handful|sprinkle)\b',
        '',
        line,
        flags=re.IGNORECASE
    )

    # Step 3: Strip symbols and normalize spaces
    line = re.sub(r'[\-\*\(\),]', '', line).strip()
    line = re.sub(r'\s+', ' ', line).lower().strip()

    # Step 4: Use spaCy to process the text
    doc = nlp(line)

    # Step 5: Tokenize, remove verbs and stopwords, lemmatize
    tokens = []
    for token in doc:
        lemma = lemmatizer.lemmatize(token.text, 'n')  # Lemmatize as noun
        if (
            token.pos_ != "VERB" and
            lemma.lower() not in STOPWORDS and
            token.is_alpha
        ):
            tokens.append(lemma.lower())

    return " ".join(tokens).strip()

In [None]:
def extract_ingredients(text):
    """
    Converts escaped newlines (\\n) to real ones, then extracts ingredients.
    Returns: List of cleaned ingredient names
    """
    try:
        text = text.encode().decode('unicode_escape')
    except Exception:
        pass

    ingredients = []
    for line in text.split('\n'):
        stripped = line.strip()
        if stripped.startswith('-'):
            ingredient = stripped[1:].strip()
            if ingredient:
                ingredients.append(ingredient)

    return ingredients

In [None]:
def process_ingredients(text):
    raw = extract_ingredients(text)
    cleaned = [extract_clean_ingredient(ing) for ing in raw]
    return cleaned

# Apply function to create new column
df['ner_ingredient'] = df['ingredient'].apply(process_ingredients)

In [None]:
df[['food_name', 'ingredient', 'cleaned_check', 'ner_ingredient']].head(10)

In [None]:
def convert_escaped_newlines(df):
    """
    Convert escaped newlines (\\n) to real newlines for Excel display
    """
    text_columns = ['ingredient', 'instructions', 'tips']
    
    for col in text_columns:
        if col in df.columns:
            # Replace escaped newlines with actual newlines
            df[col] = df[col].apply(
                lambda x: x.replace('\\n', '\n') if isinstance(x, str) else x
            )
    
    return df

# Apply the conversion right before saving
df = convert_escaped_newlines(df)

# Then save
df.to_excel("1st_dataset.xlsx", index=False)
print("✅ DataFrame saved to '1st_dataset.xlsx'")

### **New Dataset After Checking NER INgredients**

In [None]:
#Open Excel File
import openpyxl
import os

# Load the workbook
workbook = openpyxl.load_workbook('1st_dataset.xlsx')

# Select the active worksheet
worksheet = workbook["Sheet1"]
# Import pandas for data manipulation
import pandas as pd
import numpy as np

data = []
headers = []

# Get headers from the first row
for col in range(1, worksheet.max_column + 1):
    headers.append(worksheet.cell(row=1, column=col).value)

# Get data from remaining rows
for row in range(2, worksheet.max_row + 1):
    row_data = []
    for col in range(1, worksheet.max_column + 1):
        row_data.append(worksheet.cell(row=row, column=col).value)
    data.append(row_data)

# Create DataFrame
df = pd.DataFrame(data, columns=headers)

print(headers)

# Display first few rows to verify
print(f"Dataset shape: {df.shape}")
df.head()


In [None]:
#drop data if imporant columns are empty
important_columns = ['name', 'ingredients', 'ner_ingredient', 'instructions', 'recipe_link']
for col in df.columns:
    if col in important_columns:
        null_count = df[col].isnull().sum()        
        if null_count >0:
            df = df.dropna(subset=[col])

# After dropping rows, you may want to reset the index if needed
df = df.reset_index(drop=True)
# Display the cleaned DataFrame
print("Cleaned DataFrame:")
print(df.shape)
df.head(10)

### **Reformatting NER Ingredient**

In [None]:
import ast
import re


def clean_ingredient_name(ingredient):
    """
    Clean an ingredient name by:
    - Removing extra whitespace
    - Removing special characters (except apostrophes and hyphens)
    - Lowercasing
    """
    if not isinstance(ingredient, str):
        return ""
    
    # Remove special characters using regex (keep letters, numbers, spaces, hyphens, apostrophes)
    cleaned = re.sub(r"[^a-zA-Z0-9\s\-']", "", ingredient)
    
    # Replace multiple spaces with one
    cleaned = re.sub(r"\s+", " ", cleaned).strip()
    
    return cleaned.lower()


def format_ner_ingredient(df):
    """
    Processes 'ner_ingredient' column in the DataFrame:
    1. Parses stringified lists safely
    2. Cleans each ingredient name
    3. Stores as both list and clean comma-separated string
    """
    def safe_parse(x):
        if isinstance(x, str):
            x = x.strip()
            if x.startswith('[') and x.endswith(']'):
                try:
                    return ast.literal_eval(x)
                except (SyntaxError, ValueError):
                    return [x]
            else:
                return [x]
        elif isinstance(x, list):
            return x
        else:
            return []

    df['ner_ingredient_list'] = df['ner_ingredient'].apply(safe_parse)

    # Apply cleaning to each item in the list
    df['ner_ingredient_cleaned'] = df['ner_ingredient_list'].apply(
        lambda lst: [clean_ingredient_name(item) for item in lst if item]
    )

    # Convert cleaned list to comma-separated string
    df['ner_ingredient_string'] = df['ner_ingredient_cleaned'].apply(
        lambda lst: ', '.join(lst) if lst else ''
    )

    return df
# Apply the function
df = format_ner_ingredient(df)

# Display the results
df[['name', 'ner_ingredient', 'ner_ingredient_string']].head()

In [None]:
import sys
import os

# Go into 'processing-technique' folder and add it to Python path
processing_dir = os.path.join(os.getcwd(), "preprocessing_techniques")
sys.path.append(processing_dir)

### **Formatting Min and Max Age Group**

In [None]:
#ensure the min and max age only contain integers
import re 
from openpyxl.styles import PatternFill


def format_age(value):
    match = re.search(r'\d+', str(value))
    if match:
        return int(match.group())
    return None

df['min_age'] = df['min_age'].apply(format_age)
df['max_age'] = df['max_age'].apply(format_age)

print("====Results after formatting====")
df[['min_age','max_age']].head()


In [None]:
# Now you can import age_label from mapper
from mappers.age_label import age_rules
from pprint import pprint

pprint(age_rules)

In [None]:
def infer_age(recipe_name, ingredients, instructions):
    # Initialize min_age and reasons
    min_age = 6   # Default baby age start (in months)
    max_age = 12  # Default upper limit (3 years)
    reasons = []

    # 1. Check recipe name keywords
    for keyword in age_rules["age_keywords"]:
        if keyword.lower() in recipe_name.lower():
            rule = age_rules["age_keywords"][keyword]
            if rule.get("min_age", 0) > min_age:
                min_age = rule["min_age"]
                reasons.append(f"Recipe name suggests '{keyword}' ({rule['reason']})")
            if "max_age" in rule:
                max_age = min(max_age, rule["max_age"])  # Tighten max if rule restricts it

    # 2. Check ingredient restrictions
    for ingredient in ingredients.split(','):
        ing_key = ingredient.strip().lower()
        if ing_key in age_rules["ingredient_rules"]:
            rule = age_rules["ingredient_rules"][ing_key]
            if rule.get("min_age", 0) > min_age:
                min_age = rule["min_age"]
                reasons.append(f"Ingredient '{ing_key}' requires {rule['reason']}")

    # 3. Check instructions for texture keywords
    for keyword in age_rules["instruction_keywords"]:
        if keyword.lower() in instructions.lower():
            rule = age_rules["instruction_keywords"][keyword]
            if rule.get("min_age", 0) > min_age:
                min_age = rule["min_age"]
                reasons.append(f"Instruction '{keyword}' suggests {rule['reason']}")
            if "max_age" in rule:
                max_age = min(max_age, rule["max_age"])

    return {
        "min_age_group": min_age,
        "max_age_group": max_age
    }

In [None]:
def apply_infer(row):
    if pd.isna(row["min_age_group"]) or pd.isna(row["max_age_group"]):
        result = infer_age(row["food_name"], row["ner_ingredient_csv"], row["instructions"])
        return pd.Series([result["min_age_group"], result["max_age_group"]])
    else:
        return pd.Series([row["min_age_group"], row["max_age_group"]])

# Apply and create new columns
df[["min_age_group", "max_age_group"]] = df.apply(apply_infer, axis=1)

print(df[["food_name", "min_age_group", "max_age_group"]])

### **Format and Fill the Texture**

In [None]:
# Load the module
import importlib
from pprint import pprint
texture_label_path = os.path.join(os.getcwd(), "preprocessing_techniques", "mappers", "texture_label.py")

spec = importlib.util.spec_from_file_location("texture_label", texture_label_path)
texture_label = importlib.util.module_from_spec(spec)
spec.loader.exec_module(texture_label)

# Access the config
texture_config = texture_label.texture_config
pprint(texture_config)

In [None]:
def parse_instructions(text):
    """
    Extracts numbered steps from a block of text and returns a list of instruction strings.
    """
    instructions = []
    num_instruction = 0
    for line in text.splitlines():
        stripped = line.strip()
        if stripped.startswith(f"{num_instruction + 1}."):
            # Remove the prefix like "1.", "2.", etc.
            instruction = stripped[len(str(num_instruction + 1)) + 1:].strip()
            instructions.append(instruction)
    instructions_str = ' '.join(instructions).lower()
    return instructions_str

In [None]:
def classify_recipe_texture(recipe_name, ner_ingredients, instructions_list, TEXTURE_KEYWORDS):
    # Step 1: Ensure ingredients are properly processed
    if not isinstance(ner_ingredients, list):
        ingredients_clean = []
    else:
        ingredients_clean = ner_ingredients

    # Step 2: Clean instructions
    instructions_str = parse_instructions(instructions_list)

    # Step 3: Combine all searchable text - ensure EVERYTHING is a string
    combined_text_parts = [str(recipe_name).lower()]
    
    # Safely add each ingredient as a string, skipping None values
    for i in ingredients_clean:
        if i is not None:
            combined_text_parts.append(str(i).lower())
    
    # Add the instructions
    combined_text_parts.append(instructions_str)
    
    # Join all parts with spaces
    combined_text = ' '.join(combined_text_parts)

    # Step 4: Match against each texture keyword set
    for texture_type, data in TEXTURE_KEYWORDS.items():
        for keyword in data["keywords"]:
            if keyword.lower() in combined_text:
                return texture_type

    # Default fallback
    return "NONE"

In [None]:
def determine_texture(row, TEXTURE_KEYWORDS):
    if pd.isna(row.get("texture")) or row["texture"] in ("", " ", None, "NONE"):
        return classify_recipe_texture(
            recipe_name=row["name"],
            ner_ingredients=row["ner_ingredient_string"],
            instructions_list=row["instructions"],
            TEXTURE_KEYWORDS=TEXTURE_KEYWORDS
        )
    return row["texture"]

df['texture'] = df.apply(determine_texture, axis=1, TEXTURE_KEYWORDS=texture_config)

df[['name', 'ner_ingredient', 'texture']].head(10)

In [None]:
#total NONE texture group 
total_none_texture = df[df['texture'] == 'NONE']
print(f"Total recipes with 'NONE' texture: {len(total_none_texture)}")

#any missing value of texture 
missing_texture = df[df['texture'].isnull() | (df['texture'] == '')]
count = 0 
if not missing_texture.empty:
    count = len(missing_texture)
    print(f"Found {count} rows with missing or empty texture values.")
    print(missing_texture[['name', 'ner_ingredient', 'texture']])

print(f"Total rows with missing texture: {count}")

<p>PS: With the current dictionary, 814 recipes aren't detected, therefore need manual checking before finalizing </p>

### **Origin and Region Mapping**
<p>Mapping origin values to origin_id using the countrymap sheet</p>

In [None]:
# Load the countrymap sheet
countrymap_df = pd.read_excel('1st_dataset.xlsx', sheet_name='countrymap ')
print("Countrymap columns:", countrymap_df.columns.tolist())
print("Countrymap shape:", countrymap_df.shape)
print("\nFirst 5 rows of countrymap:")
display(countrymap_df.head())

In [None]:
print("UNIQUE ORIGIN VALUES IN MAIN DATASET:")
print("="*50)
unique_origins = df['origin'].dropna().unique()
print(f"Total unique origins: {len(unique_origins)}")
print("\nOrigin value counts:")
print(df['origin'].value_counts())

In [None]:
country_to_id = {}

# Step 1: Populate the dictionary using lowercase country names
for _, row in countrymap_df.iterrows():
    country = str(row['country']).strip().lower()  # normalize
    origin_id = row['pk']
    region = row['region']
    
    country_to_id[country] = {'id': origin_id, 'region': region}

# Step 2: Add alternative lowercase names (also normalized)
alternatives = {
    'french': 'france',
    'uk': 'united kingdom', 
    'america': 'united states',
    'usa': 'united states',
    'indonesian': 'indonesia',
    'lao pdr': 'laos',
    'japan ': 'japan',
}

for alt, standard in alternatives.items():
    alt = alt.strip().lower()
    standard = standard.strip().lower()
    if standard in country_to_id:
        country_to_id[alt] = country_to_id[standard]

print("✅ Mapping dictionary created with", len(country_to_id), "entries.")


In [None]:
# Function to map origin to origin_id and region
def map_origin(origin_value):
    if pd.isna(origin_value):
        return None, None
    
    origin_str = str(origin_value).strip().lower()  # Normalize for consistent lookup
    
    if origin_str in country_to_id:
        return country_to_id[origin_str]['id'], country_to_id[origin_str]['region']
    
    return None, None

In [None]:
# Sample test cases to check various formats
test_origins = ['USA', 'usa', 'UsA ', ' United States', 'french', 'UK', 'uk', 'Indonesian', 'Laos', 'Japan ', 'NotARealCountry']
test_df = pd.DataFrame({'origin': test_origins})
# Apply mapping
print("🛠️ Applying origin mapping...")
mapping_results = test_df['origin'].apply(map_origin)
test_df['origin_id'] = [result[0] for result in mapping_results]
test_df['mapped_region'] = [result[1] for result in mapping_results]

# Summary
print("✅ Origin mapping completed.")
print(f"✅ Mapped origins: {test_df['origin_id'].notna().sum()}")
print(f"❌ Unmapped origins: {test_df['origin_id'].isna().sum()}")
print("📋 Test results:")
print(test_df[['origin', 'origin_id', 'mapped_region']])

In [None]:
# Apply mapping
print("Applying origin mapping...")
mapping_results = df['origin'].apply(map_origin)
df['origin_id'] = [result[0] for result in mapping_results]
df['mapped_region'] = [result[1] for result in mapping_results]

print("✅ Origin mapping completed")
print(f"Mapped origins: {df['origin_id'].notna().sum()}")
print(f"Unmapped origins: {df['origin_id'].isna().sum()}")

display(df[['origin', 'origin_id', 'mapped_region']].head())

### **Formatting Difficulty**

In [None]:
import re

def remove_min(text):
    if pd.isna(text):
        return None
    match = re.search(r'\d+', str(text))
    if match:
        return int(match.group())
    return None

# Apply the function to prep_time and cook_time columns
df['prep_time'] = df['prep_time'].apply(remove_min)
df['cook_time'] = df['cook_time'].apply(remove_min)

df[['prep_time', 'cook_time']].head(10)

In [None]:
import codecs

def parse_ingredients(text):
    """
    Extracts lines that start with '-' from a block of text and returns a list of ingredients.
    """
    ingredients = []
    num_ingredient=0
    # print(text)
    for line in text.split('\n'):
        stripped = line.strip()
        if stripped.startswith('-'):
            ingredients.append(stripped[1:].strip())
            num_ingredient+=1
    return num_ingredient


def parse_instructions(text):
    """
    Extracts numbered steps from a block of text and returns a list of instruction strings.
    """
    instructions = []
    num_instruction=0
    for line in text.splitlines():
        stripped = line.strip()
        if stripped.startswith(str(num_instruction+1)+'.'):
            instructions.append(stripped[len(str(num_instruction+1))+1:].strip())
            num_instruction+=1
    return num_instruction

In [None]:
def estimate_difficulty(ingredients, instructions):
    ingredients = codecs.decode(ingredients, 'unicode_escape')
    instructions = codecs.decode(instructions, 'unicode_escape')
    num_ingredients = parse_ingredients(ingredients)
    num_steps = parse_instructions(instructions)
    # print(f"Number of ingredients: {num_ingredients}")
    # print(f"Number of steps: {num_steps}")
    if num_ingredients <= 4 and num_steps <= 5:
        return "Easy"
    elif num_ingredients <= 8 and num_steps <= 8:
        return "Medium"
    else:
        return "Hard"

In [None]:
# Estimate difficulty
df['difficulty'] = df.apply(
    lambda row: estimate_difficulty(row['ingredients'], row['instructions']),
    axis=1
)

df[['name', 'difficulty']].head(10)


### **Determine Choking Hazard**

In [None]:
# Load the module
import importlib

choking_hazard_path = os.path.join(os.getcwd(), "preprocessing_techniques", "mappers", "choking_hazard_classifiers.py")

spec = importlib.util.spec_from_file_location("choking_hazard", choking_hazard_path)
choking_hazard_file = importlib.util.module_from_spec(spec)
spec.loader.exec_module(choking_hazard_file)

# Access the config
choking_hazard  = choking_hazard_file.choking_hazard
pprint(choking_hazard)

In [None]:
def has_choking_hazard(row):
    """
    Detects baby food choking hazards using NER ingredients + instructions.
    
    Parameters:
        row: DataFrame row with ner_ingredient, ingredient, and instructions
    
    Returns:
        str: "Yes" or "No"
    """
    # Step 1: Get & normalize ingredients
    ner_ingredients = row.get("ner_ingredient", [])
    original_ingredients = row.get("ingredient", "")
    instructions = row.get("instructions", "")
    
    # Convert list of ingredients to string for search
    ner_ingredients_text = ' '.join([str(i).lower() for i in ner_ingredients])
    
    # Step 2: Clean instructions
    cleaned_instructions = ""
    if instructions:
        if callable(getattr(instructions, 'lower', None)):
            cleaned_instructions = instructions.lower()
    
    # Step 3: Combine all searchable text
    combined_text = ' '.join([
        ner_ingredients_text,
        str(original_ingredients).lower(),
        cleaned_instructions
    ])
    
    # Step 4: Match against all_choking_hazards
    matched_categories = []
    for category, hazards in choking_hazard.items():
        if category == "all_choking_hazards":
            # Optional: Add a general safety fallback
            for hazard in hazards:
                if hazard in combined_text and "all_choking_hazards" not in matched_categories:
                    matched_categories.append("all_choking_hazards")
                    break  # No need to keep checking once match found
        else:
            # Regular categories like fruits, veggies, etc.
            for hazard in hazards:
                if hazard in combined_text and category not in matched_categories:
                    matched_categories.append(category)
                    break  # Break inner loop after first match in this category
    
    if matched_categories:
        return "Yes"
    else:
        return "No"

In [None]:
# Apply to every row in the DataFrame
df['choking_hazards'] = df.apply(has_choking_hazard, axis=1)
df[['name', 'ner_ingredient_string', 'choking_hazards']].head(10)

### Formatting Unique NER Ingredients To **Map Ingredient w/ Allergen Group**

In [None]:
#put recipe_id to the df
df['recipe_id'] = df.index + 1  # Start from 1 for recipe_id

display(df.head(10))

In [None]:
df_ingredients = df.copy()

In [None]:
#cleaning NER ingredient after checking 
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Download required NLTK data (run once)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
nltk.download('stopwords', quiet=True)

# Initialize NLTK lemmatizer
lemmatizer = WordNetLemmatizer()

def clean_ingredient_symbols(ingredient_text):
    """
    Enhanced ingredient cleaning using NLTK for baby food ingredients
    """
    if not ingredient_text or pd.isna(ingredient_text):
        return ""
    
    # Convert to string and strip
    cleaned = str(ingredient_text).strip().lower()
    
    # Remove brackets and quotes
    cleaned = re.sub(r'^[\[\'\"\s]+', '', cleaned)
    cleaned = re.sub(r'[\]\'\"\s]+$', '', cleaned)
    cleaned = re.sub(r'[\[\]\'\"]', '', cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    
    if not cleaned or len(cleaned) <= 1:
        return ""
    
    # Split into words for processing
    words = cleaned.split()
    processed_words = []
    
    # Words to completely remove
    words_to_remove = {
        'baby', 'babies', 'infant', 'toddler', 'little', 'mini', 'small',
        'organic', 'fresh', 'frozen', 'canned', 'dried', 'raw', 'cooked',
        'steamed', 'boiled', 'mashed', 'pureed', 'chopped', 'diced',
        'sliced', 'grated', 'peeled', 'unsalted', 'natural', 'pure',
        'whole', 'half', 'quarter', 'piece', 'pieces'
    }
    
    for word in words:
        # Skip banned words
        if word.lower() in words_to_remove:
            continue
        
        # Use NLTK lemmatizer to handle plurals
        singular_word = lemmatizer.lemmatize(word, 'n')  # 'n' for noun
        
        # If NLTK didn't change it, try custom food-specific rules
        if singular_word == word:
            singular_word = handle_food_plurals(word)
        
        if singular_word and len(singular_word) > 1:
            processed_words.append(singular_word)
    
    result = ' '.join(processed_words).strip()
    return result if result else ""

def handle_food_plurals(word):
    """
    Handle food-specific plurals that NLTK might miss
    """
    # Special food cases
    food_plurals = {
        'potatoes': 'potato',
        'tomatoes': 'tomato',
        'mangoes': 'mango',
        'avocados': 'avocado',
        'bananas': 'banana',
        'strawberries': 'strawberry',
        'blueberries': 'blueberry',
        'raspberries': 'raspberry',
        'blackberries': 'blackberry',
        'cranberries': 'cranberry',
        'cherries': 'cherry',
        'berries': 'berry',
        'beans': 'bean',
        'peas': 'pea',
        'carrots': 'carrot',
        'onions': 'onion',
        'peppers': 'pepper',
        'eggs': 'egg',
        'nuts': 'nut',
        'oats': 'oat',
        'grains': 'grain',
    }
    
    word_lower = word.lower()
    
    # Check special cases first
    if word_lower in food_plurals:
        return food_plurals[word_lower]
    
    # Apply general plural rules
    # Words ending in 'ies' -> 'y'
    if word_lower.endswith('ies') and len(word_lower) > 4:
        return word_lower[:-3] + 'y'
    
    # Words ending in 'oes' -> 'o'
    if word_lower.endswith('oes') and len(word_lower) > 4:
        return word_lower[:-2]
    
    # Words ending in 's' -> remove 's'
    if word_lower.endswith('s') and len(word_lower) > 2:
        return word_lower[:-1]
    
    return word

In [None]:
# Test the enhanced function
print("=== TESTING NLTK-BASED CLEANING ===")
test_cases = [
    "baby spinach",
    "organic sweet potatoes", 
    "fresh strawberries",
    "mini carrots",
    "cooked black beans",
    "mashed bananas",
    "steamed broccoli florets",
    "pureed mangoes",
    "diced tomatoes",
    "baby peas"
]

print("Before → After cleaning:")
for ingredient in test_cases:
    cleaned = clean_ingredient_symbols(ingredient)
    print(f"'{ingredient}' → '{cleaned}'")

In [None]:
def clean_ingredient_string(ingredient_string):
        if not isinstance(ingredient_string, str) or not ingredient_string.strip():
            return ""
        
        ingredients = ingredient_string.split(',')
        cleaned_ingredients = []
        
        for ingredient in ingredients:
            cleaned_ingredient = clean_ingredient_symbols(ingredient)
            if cleaned_ingredient:  # Only add non-empty ingredients
                cleaned_ingredients.append(cleaned_ingredient)
        # print(f"Cleaned ingredients: {cleaned_ingredients}")
        return ','.join(cleaned_ingredients)


In [None]:

def clean_ner_ingredients(df_ingredients):
    print("=== CLEANING NER INGREDIENTS ===")
    
    # Apply cleaning to ner_ingredient_string column
    df_ingredients['ner_ingredient_string'] = df_ingredients['ner_ingredient_string'].apply(clean_ingredient_string)
    
    # BEFORE removing rows, identify which ones will be deleted
    initial_count = len(df_ingredients)
    
    # Find rows that will be deleted (empty or whitespace-only strings)
    rows_to_delete = df_ingredients[df_ingredients['ner_ingredient_string'].apply(lambda x: len(str(x).strip()) == 0)]
    
    if len(rows_to_delete) > 0:
        print(f"\n⚠️  ROWS TO BE DELETED ({len(rows_to_delete)} rows):")
        print("="*60)
        
        # Display information about deleted rows
        for idx, row in rows_to_delete.iterrows():
            print(f"\nRow Index: {idx}")
            print(f"Recipe Name: {row.get('name', 'N/A')}")
            print(f"Original ner_ingredient_string: '{row.get('ner_ingredient_string', 'N/A')}'")
            
            # Show the original ner_ingredient if available
            if 'ner_ingredient' in row:
                print(f"Original ner_ingredient: {row['ner_ingredient']}")
            
            # Show original ingredients if available
            if 'ingredient' in row:
                original_ingredients = str(row['ingredient'])[:100] + "..." if len(str(row['ingredient'])) > 100 else str(row['ingredient'])
                print(f"Original ingredients: {original_ingredients}")
            
            print("-" * 40)
        
        # Optional: Save deleted rows to a separate file for analysis
        rows_to_delete.to_excel('deleted_rows_analysis.xlsx', index=True)
        print(f"\n💾 Saved deleted rows to 'deleted_rows_analysis.xlsx' for detailed analysis")
    
    # Now remove the rows
    df_ingredients = df_ingredients[df_ingredients['ner_ingredient_string'].apply(lambda x: len(str(x).strip()) > 0)]
    removed_count = initial_count - len(df_ingredients)
    
    if removed_count > 0:
        print(f"\n✅ Removed {removed_count} rows with no valid ingredients after cleaning")
        print(f"Remaining rows: {len(df_ingredients)}")
    else:
        print("\n✅ No rows were removed - all recipes retained valid ingredients")
    
    return df_ingredients

# Apply the enhanced cleaning function
df_ingredients = clean_ner_ingredients(df_ingredients)

In [None]:
df[['name', 'ner_ingredient_string']].head(10)

In [None]:

all_ingredients = []

for ingredient_string in df_ingredients['ner_ingredient_string']:
    if isinstance(ingredient_string, str) and ingredient_string.strip():
        # Split by comma and clean each ingredient
        ingredients = [ing.strip().lower() for ing in ingredient_string.split(',') if ing.strip()]
        all_ingredients.extend(ingredients)

display(df[['name', 'ner_ingredient_string']].head(10))
print(f"Total unique ingredients found: {len(set(all_ingredients))}")


In [None]:
df.columns.to_list

In [None]:
#recipe_id null or not
recipe_id_null = df[df['recipe_id'].isnull()]
if not recipe_id_null.empty:
    print(f"❌ Found {len(recipe_id_null)} rows with null recipe_id.")
else:
    print("✅ All recipe_id values are valid.")

In [None]:
if 'df' in locals() and 'ner_ingredient_string' in df.columns:
    
    # Create recipe-ingredient structure
    recipe_ingredient_data = []
    
    for index, row in df.iterrows():
        recipe_id = row.get('recipe_id')  # Use existing recipe_id or index + 1
        recipe_name = row.get('name', f'Recipe_{recipe_id}')  # Use actual name or fallback
        ner_ingredient_string = str(row['ner_ingredient_string']).strip()
        
        if pd.isna(ner_ingredient_string) or ner_ingredient_string == 'nan' or ner_ingredient_string == '':
            # Handle empty ingredient strings
            recipe_ingredient_data.append({
                'recipe_id': recipe_id,
                'recipe_name': recipe_name,
                'single_ingredient': None,
                'ingredient_position': 0,
                'original_ingredient_string': ner_ingredient_string
            })
        else:
            # Split by comma and clean each ingredient
            individual_ingredients = [ing.strip() for ing in ner_ingredient_string.split(',')]
            individual_ingredients = [ing for ing in individual_ingredients if ing]  # Remove empty strings
            
            if individual_ingredients:
                for position, ingredient in enumerate(individual_ingredients, 1):
                    recipe_ingredient_data.append({
                        'recipe_id': recipe_id,
                        'recipe_name': recipe_name,
                        'single_ingredient': ingredient,
                        'ingredient_position': position,
                        'original_ingredient_string': ner_ingredient_string
                    })
            else:
                # Fallback for cases where split results in empty list
                recipe_ingredient_data.append({
                    'recipe_id': recipe_id,
                    'recipe_name': recipe_name,
                    'single_ingredient': ner_ingredient_string,
                    'ingredient_position': 1,
                    'original_ingredient_string': ner_ingredient_string
                })
    
    # Create recipe-ingredient dataframe
    recipe_ingredient_df = pd.DataFrame(recipe_ingredient_data)
    
    print(f"✅ Created recipe-ingredient structure:")
    print(f"   Original recipes: {len(df)}")
    print(f"   Recipe-ingredient records: {len(recipe_ingredient_df)}")
    print(f"   Average ingredients per recipe: {len(recipe_ingredient_df) / len(df):.2f}")
    
    # Show sample structure
    print(f"\n📋 Sample Recipe-Ingredient Structure:")
    sample_recipes = recipe_ingredient_df[recipe_ingredient_df['recipe_id'].isin([1, 2, 3])]
    display(sample_recipes[['recipe_id', 'recipe_name', 'single_ingredient', 'ingredient_position']].head(10))
    
else:
    print("❌ Dataset not found or 'ner_ingredient_string' column missing")
    if 'df' in locals():
        print("Available columns:", [col for col in df.columns if 'ingredient' in str(col).lower()])

In [None]:
# Get unique ingredients
unique_ingredients_list = recipe_ingredient_df['single_ingredient'].dropna().unique()
print(f"\n📊 Unique Ingredients Analysis:")
print(f"   Total unique ingredients: {len(unique_ingredients_list)}")
print(f"   Sample unique ingredients: {list(unique_ingredients_list[:10])}")

In [None]:
import numpy as np

# Convert string representations of null to actual NaN
recipe_ingredient_df['original_ingredient_string'] = recipe_ingredient_df['original_ingredient_string'].replace(
    ['nan', 'NaN', 'None', '', ' ', '\t', '\n'], np.nan
)

# Now drop NaN values
recipe_ingredient_df = recipe_ingredient_df.dropna(subset=['original_ingredient_string'])

print(f"✅ Remaining rows after cleaning and dropping nulls: {len(recipe_ingredient_df)}")

Standarization of NER Ingredients

In [None]:
print("🔄 LOADING NER STANDARDIZATION MAPPING")
print("=" * 60)

standardization_file = 'ner_data_standarization.txt'

# Load standardization mapping
standardization_mapping = {}
compound_ingredients = {}

try:
    with open(standardization_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    
    for line in lines:
        line = line.strip()
        if ' --> ' in line:
            original, standardized = line.split(' --> ', 1)
            original = original.strip()
            standardized = standardized.strip()
            standardization_mapping[original] = standardized
    
    print(f"✅ Loaded {len(standardization_mapping)} standardization mappings")
    
    # Show sample mappings
    print("\n📋 Sample standardization mappings:")
    sample_items = list(standardization_mapping.items())[:10]
    for original, standardized in sample_items:
        print(f"   '{original}' → '{standardized}'")
    
except FileNotFoundError:
    print(f"❌ File not found: {standardization_file}")
except Exception as e:
    print(f"❌ Error loading file: {e}")

In [None]:
def detect_compound_ingredients(recipe_ingredient_df):
    """
    Step 1: Scan all ingredients and list compound sentences that need special handling.
    
    Args:
        recipe_ingredient_df: DataFrame with single_ingredient column
        
    Returns:
        dict: Dictionary of detected compound ingredients and their counts
    """
    print("DETECTING COMPOUND INGREDIENTS")
    
    compound_indicators = {
        'multiple_foods': [],      # Multiple food items in one string
        'long_sentences': [],      # More than 4 words
        'known_patterns': []       # Known compound patterns
    }
    
    # Criteria for compound detection
    def is_likely_compound(ingredient_text):
        if not ingredient_text or pd.isna(ingredient_text):
            return False, "null"
            
        words = str(ingredient_text).strip().split()
        word_count = len(words)
        
        # Check for multiple food indicators
        food_keywords = ['oil', 'sauce', 'salt', 'sugar', 'water', 'milk', 'juice', 'broth', 
                        'beef', 'chicken', 'pork', 'fish', 'tofu', 'rice', 'noodle', 'egg', 'seed',
                        'teri','sesame']
        
        food_count = sum(1 for word in words if word.lower() in food_keywords)
        
        # Classification logic
        if word_count > 6:
            return True, "long_sentence"
        elif food_count >= 3:
            return True, "multiple_foods"
        elif word_count > 4 and food_count >= 2:
            return True, "mixed_pattern"
        else:
            return False, "simple"
    
    # Scan all ingredients
    compound_findings = {}
    
    for index, row in recipe_ingredient_df.iterrows():
        ingredient = row['single_ingredient']
        is_compound, reason = is_likely_compound(ingredient)
        
        if is_compound:
            if ingredient not in compound_findings:
                compound_findings[ingredient] = {
                    'count': 0,
                    'reason': reason,
                    'recipe_ids': []
                }
            compound_findings[ingredient]['count'] += 1
            compound_findings[ingredient]['recipe_ids'].append(row['recipe_id'])
    
    print(f"✅ Found {len(compound_findings)} compound ingredients")
    
    # Sort by frequency
    sorted_compounds = sorted(compound_findings.items(), 
                            key=lambda x: x[1]['count'], 
                            reverse=True)
    
    print(f"\n📋 TOP COMPOUND INGREDIENTS (sorted by frequency):")
    print("-" * 80)
    
    for i, (ingredient, data) in enumerate(sorted_compounds[:15], 1):
        print(f"{i:2d}. '{ingredient}'")
        print(f"    Count: {data['count']} | Reason: {data['reason']}")
        print(f"    Recipe IDs: {data['recipe_ids'][:5]}{'...' if len(data['recipe_ids']) > 5 else ''}")
        print()
    
    return dict(sorted_compounds)

# Run compound detection
if 'recipe_ingredient_df' in locals():
    detected_compounds = detect_compound_ingredients(recipe_ingredient_df)
else:
    print("❌ Recipe ingredient DataFrame not found. Run previous steps first.")

In [None]:
PREDEFINED_COMPOUND_PATTERNS = {
    # Exact matches from your problematic cases
    'oil clove garlic beef sweet soy sauce salt sugar rice water tofu': [
        'oil', 'clove garlic', 'beef', 'sweet soy sauce', 'salt', 'sugar', 'rice', 'water', 'tofu'
    ],
    'rice red sweet potato chicken broth long bean salt margarine': [
        'rice', 'red sweet potato', 'chicken broth', 'long bean', 'salt', 'margarine'
    ],
    'seed baby lemon juice water formula milk': [
        'lemon juice', 'water', 'formula milk'
    ],
    'rice cooking oil clove garlic chicken broth salt shrimp carrot celery extra chicken broth': [
        'rice', 'cooking oil', 'clove garlic', 'chicken broth', 'salt', 'shrimp', 'carrot', 'celery', 'extra chicken broth'
    ],
    'cooking oil clove garlic beef sweet soy sauce salt sugar': [
        'cooking oil', 'clove garlic', 'beef', 'sweet soy sauce', 'salt', 'sugar'
    ],
    'bumboo teri bubuk sesame seed': [
        'anchovy powder', 'sesame seed'
    ],
    'roll egg noodle boiling water': [
        'noodle', 'water'
    ],
    'chicken upper lower thigh wing': [
        'chicken thigh', 'chicken wing'
    ]
}

print("📋 PREDEFINED COMPOUND PATTERNS")
print("=" * 60)
print(f"Total predefined patterns: {len(PREDEFINED_COMPOUND_PATTERNS)}")
print("\nPattern examples:")
for i, (compound, parts) in enumerate(list(PREDEFINED_COMPOUND_PATTERNS.items())[:5], 1):
    print(f"{i}. '{compound}'")
    print(f"   → {parts}")
    print()

In [None]:
def standardize_simple_ingredient_only(ingredient, mapping_dict):
    """
    Args:
        ingredient (str): Single ingredient to standardize
        mapping_dict (dict): Standardization mapping rules
        
    Returns:
        str: Standardized ingredient name
    """
    if not ingredient or pd.isna(ingredient):
        return None
        
    ingredient_clean = str(ingredient).strip().lower()
    
    # Direct mapping lookup only
    if ingredient_clean in mapping_dict:
        return mapping_dict[ingredient_clean]
    
    # If no mapping found, return cleaned original
    return ingredient_clean

In [None]:
def is_compound_ingredient_only(ingredient, predefined_patterns):
    """
    PURE COMPOUND CHECKING FUNCTION - NO PROCESSING LOGIC
    
    Args:
        ingredient (str): Ingredient to check
        predefined_patterns (dict): Known compound patterns
        
    Returns:
        bool: True if compound, False if simple
    """
    if not ingredient or pd.isna(ingredient):
        return False
        
    ingredient_clean = str(ingredient).strip().lower()
    
    # Check if it's in our predefined compound patterns (exact match)
    if ingredient_clean in predefined_patterns:
        return True
    
    return False

In [None]:
# Test compound checking function
print("🧪 TESTING COMPOUND CHECKING FUNCTION")
print("=" * 50)

compound_test_cases = [
    'water',  # Simple
    'yoghurt',  # Simple
    'oil clove garlic beef sweet soy sauce salt sugar rice water tofu',  # Compound
    'rice red sweet potato chicken broth long bean salt margarine',  # Compound
    'chicken',  # Simple
    'cooking oil clove garlic chicken broth salt shrimp carrot celery'  # Compound
]

print("Compound detection tests:")
for ingredient in compound_test_cases:
    is_compound = is_compound_ingredient_only(ingredient, PREDEFINED_COMPOUND_PATTERNS)
    print(f"  '{ingredient}' → {'COMPOUND' if is_compound else 'SIMPLE'}")

In [None]:
def process_compound_ingredient_only(ingredient, predefined_patterns, mapping_dict):
    """
    SIMPLIFIED COMPOUND PROCESSING - ONLY USES PREDEFINED PATTERNS
    
    Args:
        ingredient (str): Compound ingredient to split and standardize
        predefined_patterns (dict): Known compound splitting patterns
        mapping_dict (dict): Standardization mapping rules (for individual parts)
        
    Returns:
        list: List of standardized individual ingredients
    """
    if not ingredient or pd.isna(ingredient):
        return []
        
    ingredient_clean = str(ingredient).strip().lower()

    if ingredient_clean in predefined_patterns:
        # Use exact predefined pattern
        individual_parts = predefined_patterns[ingredient_clean]
        print(f"  🎯 Using predefined pattern: '{ingredient_clean}' → {individual_parts}")
    else:
        print(f"  ❌ ERROR: '{ingredient_clean}' not found in predefined patterns!")
        return [ingredient_clean]  # Return as single ingredient
    
    # Apply simple mapping to each individual part
    standardized_parts = []
    for part in individual_parts:
        standardized_part = standardize_simple_ingredient_only(part, mapping_dict)
        standardized_parts.append(standardized_part)
        print(f"    '{part}' → '{standardized_part}'")
    
    return standardized_parts

In [None]:
# Add compound column to existing recipe_ingredient_df
if 'recipe_ingredient_df' in locals() and 'PREDEFINED_COMPOUND_PATTERNS' in locals():
    print("🔄 ADDING COMPOUND COLUMN TO EXISTING DATAFRAME")
    print("=" * 50)
    
    # Apply compound detection to each ingredient
    recipe_ingredient_df['compound'] = recipe_ingredient_df['single_ingredient'].apply(
        lambda ingredient: is_compound_ingredient_only(ingredient, PREDEFINED_COMPOUND_PATTERNS)
    )

display(recipe_ingredient_df[['recipe_id', 'single_ingredient', 'compound']].head(10)   )

In [None]:
def standardized_ingredients_into_df(recipe_ingredient_df, predefined_patterns, mapping_dict):
    """
    Alternative function that directly embeds standardized_ingredient column into the existing DataFrame
    
    Args:
        recipe_ingredient_df: DataFrame with columns ['recipe_id', 'single_ingredient']
        predefined_patterns: Dictionary of known compound patterns
        mapping_dict: Dictionary for simple ingredient standardization
        
    Returns:
        DataFrame: Updated recipe_ingredient_df with standardized_ingredient column
    """
    
    def get_standardized_ingredient(ingredient):
        """Helper function to get standardized ingredient for each row"""
        if pd.isna(ingredient) or not ingredient:
            return None
            
        try:
            # Check if compound
            is_compound = is_compound_ingredient_only(ingredient, predefined_patterns)
            
            if is_compound:
                # For compound ingredients, get the first part (or join all parts)
                individual_parts = process_compound_ingredient_only(ingredient, predefined_patterns, mapping_dict)
                if individual_parts:
                    # Option 1: Return first part
                    return individual_parts[0]
                    # Option 2: Return all parts joined (uncomment below)
                    # return ', '.join(individual_parts)
                else:
                    return str(ingredient).strip().lower()
            else:
                # For simple ingredients, apply direct mapping
                return standardize_simple_ingredient_only(ingredient, mapping_dict)
                
        except Exception as e:
            print(f"❌ Error processing '{ingredient}': {e}")
            return str(ingredient).strip().lower()
    
    # Apply standardization to create new column
    recipe_ingredient_df['standardized_ingredient'] = recipe_ingredient_df['single_ingredient'].apply(get_standardized_ingredient)
    
    # Add metadata columns
    recipe_ingredient_df['is_compound'] = recipe_ingredient_df['single_ingredient'].apply(
        lambda x: is_compound_ingredient_only(x, predefined_patterns) if pd.notna(x) else False
    )
    
    print(f"✅ Successfully embedded standardized_ingredient column!")
    print(f"   Total rows processed: {len(recipe_ingredient_df)}")
    print(f"   Compound ingredients: {recipe_ingredient_df['is_compound'].sum()}")
    print(f"   Simple ingredients: {(~recipe_ingredient_df['is_compound']).sum()}")
    
    return recipe_ingredient_df

standardized_ingredients_into_df(recipe_ingredient_df, PREDEFINED_COMPOUND_PATTERNS, standardization_mapping)

Map Out Ingredient Ids

In [None]:
import re

def clean_ingredient_name_symbols(ingredient_name):
    """
    Clean unwanted symbols, quotes, and brackets from ingredient names
    
    Args:
        ingredient_name: String containing the ingredient name
        
    Returns:
        str: Cleaned ingredient name
    """
    if pd.isna(ingredient_name) or ingredient_name == '':
        return ingredient_name
        
    # Convert to string if not already
    cleaned = str(ingredient_name)
    
    # Remove quotes (single and double)
    cleaned = cleaned.replace("'", "").replace('"', "")
    
    # Remove brackets and parentheses
    cleaned = cleaned.replace("[", "").replace("]", "")
    cleaned = cleaned.replace("(", "").replace(")", "")
    
    # Remove other unwanted symbols but keep hyphens and spaces
    cleaned = re.sub(r'[^\w\s\-]', '', cleaned)
    
    # Clean up extra whitespace
    cleaned = ' '.join(cleaned.split())
    
    # Remove leading/trailing whitespace
    cleaned = cleaned.strip()
    
    return cleaned if cleaned else ingredient_name


In [None]:
def create_ingredient_master_df_with_na_check(recipe_ingredient_df, use_standardized=True):
    """
    Create a master ingredient DataFrame with unique ingredient IDs and handle NA values
    
    Args:
        recipe_ingredient_df: DataFrame with recipe-ingredient relationships
        use_standardized: If True, use standardized_ingredient; if False, use single_ingredient
        
    Returns:
        tuple: (ingredient_df, ingredient_id_mapping, cleaned_df)
    """
    
    # Choose which ingredient column to use
    ingredient_column = 'standardized_ingredient' if use_standardized else 'single_ingredient'
    
    if ingredient_column not in recipe_ingredient_df.columns:
        print(f"❌ Column '{ingredient_column}' not found in recipe_ingredient_df")
        return None, None, None
    
    print(f"🔍 CHECKING FOR NA VALUES IN '{ingredient_column}'")
    print("=" * 60)
    
    # Check for NA values in standardized ingredient column
    na_mask = recipe_ingredient_df[ingredient_column].isna()
    na_count = na_mask.sum()
    total_count = len(recipe_ingredient_df)
    
    print(f"📊 NA Analysis:")
    print(f"   Total rows: {total_count}")
    print(f"   NA values in '{ingredient_column}': {na_count}")
    print(f"   Percentage NA: {(na_count/total_count)*100:.2f}%")
    
    if na_count > 0:
        print(f"\n🔍 Analyzing NA rows...")
        na_rows = recipe_ingredient_df[na_mask].copy()
        
        # Check if there's an 'original_ingredient_string' column to reference
        original_column = None
        possible_original_columns = ['original_ingredient_string', 'single_ingredient', 'ingredient']
        
        for col in possible_original_columns:
            if col in recipe_ingredient_df.columns and col != ingredient_column:
                original_column = col
                break
        
        if original_column:
            print(f"   Using '{original_column}' as reference for original data")
            
            # Check which NA rows have valid original data
            na_with_original = na_rows[na_rows[original_column].notna() & (na_rows[original_column] != '')]
            na_without_original = na_rows[na_rows[original_column].isna() | (na_rows[original_column] == '')]
            
            print(f"\n📋 NA Row Analysis:")
            print(f"   NA rows with valid original data: {len(na_with_original)}")
            print(f"   NA rows without original data: {len(na_without_original)}")
            
            # Show NA rows with valid original data (these need attention)
            if len(na_with_original) > 0:
                print(f"\n⚠️  HIGHLIGHT: NA rows with valid original data (need investigation):")
                display_cols = ['recipe_id', 'recipe_name', original_column, ingredient_column]
                available_cols = [col for col in display_cols if col in na_with_original.columns]
                
                for idx, row in na_with_original.head(10).iterrows():
                    print(f"   Row {idx}:")
                    print(f"     Recipe: {row.get('recipe_name', 'N/A')}")
                    print(f"     Original: '{row.get(original_column, 'N/A')}'")
                    print(f"     Standardized: {row.get(ingredient_column, 'N/A')}")
                    print()
                
                if len(na_with_original) > 10:
                    print(f"     ... and {len(na_with_original) - 10} more rows")
            
            # Drop rows without original data
            if len(na_without_original) > 0:
                print(f"\n🗑️  DROPPING: {len(na_without_original)} rows with no original data")
                # Show sample of rows being dropped
                print("   Sample rows being dropped:")
                for idx, row in na_without_original.head(5).iterrows():
                    print(f"     Row {idx}: Recipe '{row.get('recipe_name', 'N/A')}' - No original ingredient data")
        
        else:
            print(f"   ⚠️  No original ingredient column found for reference")
            print(f"   Available columns: {list(recipe_ingredient_df.columns)}")
    
    # Create cleaned DataFrame (drop rows with NA in standardized ingredient)
    cleaned_df = recipe_ingredient_df.dropna(subset=[ingredient_column]).copy()
    dropped_count = total_count - len(cleaned_df)
    
    if dropped_count > 0:
        print(f"\n✅ Dropped {dropped_count} rows with NA values")
        print(f"   Remaining rows: {len(cleaned_df)}")
    
    # Continue with ingredient master creation using cleaned data
    print(f"\n🔄 Creating ingredient master DataFrame from cleaned data...")
    
    # Get unique ingredients and clean them
    unique_ingredients = cleaned_df[ingredient_column].dropna().unique()
    
    # Clean all ingredient names to remove unwanted symbols
    cleaned_ingredients = [clean_ingredient_name_symbols(ingredient) for ingredient in unique_ingredients]
    
    # Remove duplicates after cleaning and sort
    unique_cleaned_ingredients = sorted(list(set(cleaned_ingredients)))
    
    # Create ingredient master DataFrame
    ingredient_data = []
    ingredient_id_mapping = {}
    
    for idx, ingredient in enumerate(unique_cleaned_ingredients, start=1):
        if ingredient and ingredient.strip():  # Only add non-empty ingredients
            ingredient_data.append({
                'ingredient_id': idx,
                'ingredient_name': ingredient
            })
            ingredient_id_mapping[ingredient] = idx
    
    ingredient_df = pd.DataFrame(ingredient_data)
    
    print(f"\n✅ Created ingredient master DataFrame:")
    print(f"   Total unique ingredients: {len(ingredient_df)}")
    print(f"   Using column: {ingredient_column}")
    print(f"   Cleaned unwanted symbols from ingredient names")
    
    return ingredient_df, ingredient_id_mapping, cleaned_df

# Alternative: Simple function to just check NA values first
def check_na_in_standardized_ingredients(recipe_ingredient_df):
    """
    Quick function to check NA values in standardized ingredients
    """
    ingredient_column = 'standardized_ingredient'
    
    if ingredient_column not in recipe_ingredient_df.columns:
        print(f"❌ Column '{ingredient_column}' not found")
        return
    
    # Check for NA values
    na_mask = recipe_ingredient_df[ingredient_column].isna()
    na_count = na_mask.sum()
    
    print(f"🔍 NA CHECK RESULTS:")
    print(f"   Total rows: {len(recipe_ingredient_df)}")
    print(f"   NA values in standardized_ingredient: {na_count}")
    print(f"   Percentage: {(na_count/len(recipe_ingredient_df))*100:.2f}%")
    
    if na_count > 0:
        na_rows = recipe_ingredient_df[na_mask]
        
        # Check original data
        original_cols = ['original_ingredient_string', 'single_ingredient']
        for col in original_cols:
            if col in recipe_ingredient_df.columns:
                has_original = na_rows[col].notna().sum()
                print(f"   NA rows with valid '{col}': {has_original}")
                
                # Show sample
                if has_original > 0:
                    print(f"\n📋 Sample NA rows with original data:")
                    sample = na_rows[na_rows[col].notna()].head(5)
                    for idx, row in sample.iterrows():
                        print(f"     Row {idx}: '{row[col]}'")
                break
    else:
        print("✅ No NA values found!")

# Usage examples:

# Quick check first
check_na_in_standardized_ingredients(recipe_ingredient_df)

# Then use the enhanced function
ingredient_df, ingredient_id_mapping, cleaned_recipe_df = create_ingredient_master_df_with_na_check(
    recipe_ingredient_df, 
    use_standardized=True
)

In [None]:

if ingredient_df is not None:
    print(f"\n📋 Sample of cleaned ingredient_df:")
    display(ingredient_df.head(10))
    
    print(f"\n🔍 Checking for any remaining unwanted symbols...")
    # Check if any ingredient names still contain unwanted symbols
    has_quotes = ingredient_df['ingredient_name'].str.contains("'|\"", na=False).sum()
    has_brackets = ingredient_df['ingredient_name'].str.contains("\[|\]|\(|\)", na=False).sum()
    
    print(f"   Ingredients with quotes: {has_quotes}")
    print(f"   Ingredients with brackets: {has_brackets}")
    
    if has_quotes > 0 or has_brackets > 0:
        print("   ⚠️  Some unwanted symbols still found!")
        problematic = ingredient_df[
            ingredient_df['ingredient_name'].str.contains("'|\"|\[|\]|\(|\)", na=False)
        ]
        print("   Sample problematic ingredients:")
        display(problematic.head())
    else:
        print("   ✅ All ingredient names are clean!")



Map Out to the RecipeIngredient

In [None]:
def map_ingredient_ids_to_recipe_df(recipe_ingredient_df, ingredient_id_mapping, use_standardized=True):
    """
    Map ingredient IDs to the recipe ingredient DataFrame using cleaned ingredient names
    
    Args:
        recipe_ingredient_df: DataFrame with recipe-ingredient relationships
        ingredient_id_mapping: Dictionary mapping cleaned ingredient names to IDs
        use_standardized: If True, use standardized_ingredient; if False, use single_ingredient
        
    Returns:
        DataFrame: Updated recipe_ingredient_df with ingredient_id column
    """

    
    # Choose which ingredient column to use
    ingredient_column = 'standardized_ingredient' if use_standardized else 'single_ingredient'
    
    if ingredient_column not in recipe_ingredient_df.columns:
        print(f"❌ Column '{ingredient_column}' not found in recipe_ingredient_df")
        return recipe_ingredient_df
    
    # Create a copy to avoid modifying the original
    final_recipe_ingredient = recipe_ingredient_df.copy()
    
    # Clean the ingredient names in the DataFrame and map to IDs
    def get_ingredient_id(ingredient_name):
        if pd.isna(ingredient_name):
            return None
        cleaned_name = clean_ingredient_name_symbols(ingredient_name)
        return ingredient_id_mapping.get(cleaned_name, None)
    
    final_recipe_ingredient['ingredient_id'] = final_recipe_ingredient[ingredient_column].apply(get_ingredient_id)
    
    # Check mapping results
    total_rows = len(final_recipe_ingredient)
    mapped_rows = final_recipe_ingredient['ingredient_id'].notna().sum()
    unmapped_rows = total_rows - mapped_rows
    
    print(f"✅ Ingredient ID mapping complete:")
    print(f"   Total rows: {total_rows}")
    print(f"   Successfully mapped: {mapped_rows} ({(mapped_rows/total_rows)*100:.1f}%)")
    print(f"   Unmapped: {unmapped_rows} ({(unmapped_rows/total_rows)*100:.1f}%)")
    
    if unmapped_rows > 0:
        print(f"\n🔍 Sample unmapped ingredients:")
        unmapped_sample = final_recipe_ingredient[final_recipe_ingredient['ingredient_id'].isna()][ingredient_column].dropna().unique()[:5]
        for ingredient in unmapped_sample:
            cleaned = clean_ingredient_name_symbols(ingredient)
            print(f"   Original: '{ingredient}' → Cleaned: '{cleaned}'")
    
    return final_recipe_ingredient

In [None]:
recipe_ingredient_df.columns.to_list()

In [None]:
# Step 2: Map ingredient IDs back to recipe DataFrame

recipe_ingredient_df_with_ids = map_ingredient_ids_to_recipe_df(
    recipe_ingredient_df,
    ingredient_id_mapping,
    use_standardized=True
)

print(f"\n📋 Sample of updated recipe_ingredient_df with IDs:")
sample_cols = ['recipe_id', 'single_ingredient', 'standardized_ingredient', 'ingredient_id']
available_cols = [col for col in sample_cols if col in recipe_ingredient_df_with_ids.columns]
display(recipe_ingredient_df_with_ids[available_cols].head(10))

print(f"\n📊 Final DataFrame Statistics:")
print(f"   Recipe DataFrame shape: {recipe_ingredient_df_with_ids.shape}")
print(f"   Unique recipes: {recipe_ingredient_df_with_ids['recipe_id'].nunique()}")
print(f"   Unique ingredients: {len(ingredient_df)}")
print(f"   Recipe-ingredient relationships: {len(recipe_ingredient_df_with_ids)}")


In [None]:
def drop_unmapped_ingredients(recipe_ingredient_df_with_ids):
    """
    Drop rows where ingredient_id is null (unmapped ingredients)
    
    Args:
        recipe_ingredient_df_with_ids: DataFrame with ingredient_id column
        
    Returns:
        DataFrame: Cleaned DataFrame without unmapped ingredients
    """
    print("🗑️ DROPPING UNMAPPED INGREDIENTS")
    print("=" * 40)
    
    # Count before dropping
    total_before = len(recipe_ingredient_df_with_ids)
    unmapped_count = recipe_ingredient_df_with_ids['ingredient_id'].isna().sum()
    
    print(f"Before dropping:")
    print(f"   Total rows: {total_before}")
    print(f"   Unmapped ingredients: {unmapped_count}")
    
    if unmapped_count > 0:
        # Show sample of what will be dropped
        print(f"\n📋 Sample of rows to be dropped:")
        unmapped_sample = recipe_ingredient_df_with_ids[
            recipe_ingredient_df_with_ids['ingredient_id'].isna()
        ][['recipe_id', 'single_ingredient', 'standardized_ingredient']].head(5)
        
        for idx, row in unmapped_sample.iterrows():
            print(f"   Row {idx}: Recipe {row['recipe_id']} - '{row['single_ingredient']}'")
        
        # Drop unmapped ingredients
        cleaned_df = recipe_ingredient_df_with_ids.dropna(subset=['ingredient_id'])
        
        # Reset index
        cleaned_df = cleaned_df.reset_index(drop=True)
        
        # Show results
        total_after = len(cleaned_df)
        dropped_count = total_before - total_after
        
        print(f"\n✅ After dropping:")
        print(f"   Total rows: {total_after}")
        print(f"   Dropped rows: {dropped_count}")
        print(f"   Retention rate: {(total_after/total_before)*100:.1f}%")
        
        return cleaned_df
    
    else:
        print("✅ No unmapped ingredients found - nothing to drop!")
        return recipe_ingredient_df_with_ids

# Usage
cleaned_recipe_ingredient_df = drop_unmapped_ingredients(recipe_ingredient_df_with_ids)

In [None]:
recipe_ingredient = cleaned_recipe_ingredient_df.copy()

#take recipe_id and ingredient_id only
recipe_ingredient = recipe_ingredient[['recipe_id', 'ingredient_id']].drop_duplicates().reset_index(drop=True)

print(f"\n📋 Final Recipe-Ingredient DataFrame:")
print(f"   Shape: {recipe_ingredient.shape}")
print(f"   Unique recipe IDs: {recipe_ingredient['recipe_id'].nunique()}")
print(f"   Unique ingredient IDs: {recipe_ingredient['ingredient_id'].nunique()}")

#### Determine Allergen (ingredient and recipe)

In [None]:
allergen_map = pd.read_excel('1st_dataset.xlsx', sheet_name='allergen')
print("Countrymap columns:", allergen_map.columns.tolist())
print("Countrymap shape:", allergen_map.shape)
print("\nFirst 5 rows of allergen:")
display(allergen_map.head())

In [None]:
# Load the module
import importlib

allergen_path = os.path.join(os.getcwd(), "preprocessing_techniques", "mappers", "allergen_mapper.py")

spec = importlib.util.spec_from_file_location("allergen", allergen_path)
allergen_file = importlib.util.module_from_spec(spec)
spec.loader.exec_module(allergen_file)

# Access the config
allergen_tags = allergen_file.ALLERGEN_TAGS
pprint(allergen_tags)

In [None]:
ingredient_df

In [None]:
def calculate_match_confidence(ingredient_name, keyword):
    """Calculate confidence score for partial matches."""
    confidence = 0.6
    if len(keyword) >= len(ingredient_name) * 0.5:
        confidence += 0.2
    if ingredient_name.startswith(keyword) or ingredient_name.endswith(keyword):
        confidence += 0.2
    if re.search(r'\b' + re.escape(keyword) + r'\b', ingredient_name):
        confidence += 0.1
    if len(keyword) <= 3 and len(ingredient_name) >= 10:
        confidence -= 0.2
    return min(0.95, max(0.0, confidence))

In [None]:
def detect_allergen(ingredient_name, ALLERGEN_TAGS):
    """
    Detect allergen in an ingredient name using a two-step strategy:
    1. Check for exclusion terms
    2. Exact then partial keyword matching
    
    Returns:
        dict: {
            'allergen_group': str or None,
            'match_type': 'exact'/'partial' or None,
            'matched_keyword': str or None,
            'confidence': float
        }
    """
    # Normalize input
    ingredient_lower = ingredient_name.lower().strip()

    # Exclusion Terms
    exclusion_terms = ["breast milk", "breastmilk", "formula milk", "formula", "breast"]
    if any(term in ingredient_lower for term in exclusion_terms):
        return {
            'allergen_group': None,
            'match_type': None,
            'matched_keyword': None,
            'confidence': 0.0
        }

    # Step 1: Exact match
    for allergen_name, data in ALLERGEN_TAGS.items():
        keywords = data.get("keywords", [])
        allergen_group = data.get("allergen_group", allergen_name)
        for keyword in keywords:
            if keyword.lower().strip() == ingredient_lower:
                return {
                    'allergen_group': allergen_group,
                    'match_type': 'exact',
                    'matched_keyword': keyword,
                    'confidence': 1.0
                }

    # Step 2: Partial match
    best_match = None
    for allergen_name, data in ALLERGEN_TAGS.items():
        keywords = data.get("keywords", [])
        allergen_group = data.get("allergen_group", allergen_name)
        for keyword in keywords:
            keyword_lower = keyword.lower().strip()
            if keyword_lower in ingredient_lower:
                confidence = calculate_match_confidence(ingredient_lower, keyword_lower)
                if not best_match or confidence > best_match['confidence']:
                    best_match = {
                        'allergen_group': allergen_group,
                        'match_type': 'partial',
                        'matched_keyword': keyword,
                        'confidence': confidence
                    }

    if best_match and best_match['confidence'] >= 0.5:
        return best_match

    return {
        'allergen_group': None,
        'match_type': None,
        'matched_keyword': None,
        'confidence': 0.0
    }

In [None]:
def get_allergen_group_id(allergen_group, allergen_mapping):
    """Map allergen group name to ID."""
    return allergen_mapping.get(allergen_group, None)

In [None]:
def process_ingredients_with_allergens(df, allergen_df):

    allergen_mapping = dict(zip(allergen_df['name'], allergen_df['pk']))

    results = []
    for _, row in df.iterrows():
        result = detect_allergen(row['ingredient_name'],allergen_tags)
        result['ingredient_id'] = row['ingredient_id']
        result['ingredient_name'] = row['ingredient_name']
        result['allergen_group_id'] = get_allergen_group_id(result['allergen_group'], allergen_mapping)
        result['isAllergen'] = result['allergen_group_id'] is not None
        results.append(result)

    result_df = pd.DataFrame(results)
    return result_df[result_df.columns.tolist()]

In [None]:
test_data = [
    {"ingredient_id": 1, "ingredient_name": "Whole Milk"},
    {"ingredient_id": 2, "ingredient_name": "Breast Milk"},
    {"ingredient_id": 3, "ingredient_name": "Almond Butter"},
    {"ingredient_id": 4, "ingredient_name": "Chicken Breast"},
    {"ingredient_id": 5, "ingredient_name": "Salmon Fillet"}
]
testing = pd.DataFrame(test_data)

allergen_df = pd.DataFrame(allergen_tags)

final_df = process_ingredients_with_allergens(testing, allergen_map)
print(final_df.to_string(index=False))
display(final_df)

In [None]:
ingredient_df =process_ingredients_with_allergens(testing, allergen_map)

display(ingredient_df.head(10))

In [None]:
final_ingredient_df = ingredient_df.copy()
final_ingredient_df = final_ingredient_df.rename(columns={
    'ingredient_id': 'pk',
    'ingredient_name': 'name',
    'allergen_group_id': 'allergen_group_id',
    'isAllergen': 'isAllergen'})

#final_ingredient_df drop column other than pk, name, allergen_group_id, isAllergen
final_ingredient_df = final_ingredient_df[['pk', 'name', 'allergen_group_id', 'isAllergen']]
display(final_ingredient_df.head(10))

##### Recipe Based To Get isHypo

In [None]:
def detect_allergens(recipe_data, ALLERGEN_TAGS):
    ner_ingredients = recipe_data.get("ner_ingredient_string", [])
    # Normalize ingredients (make sure they're lowercase and clean)
    cleaned_ingredients = [str(i).strip().lower() for i in ner_ingredients if i]

    # Combine into one searchable string
    combined_text = ' '.join(cleaned_ingredients)
    matched_allergens = []

    # Define exclusion terms for breast milk and formula
    exclusion_terms = ["breast milk", "breastmilk", "formula milk", "formula", "breast"]
    # First check if the combined text contains any exclusion terms
    has_exclusion_terms = any(exclusion in combined_text for exclusion in exclusion_terms)
    for allergen, data in ALLERGEN_TAGS.items():
        keywords = [kw.lower() for kw in data["keywords"]]
        
        # For milk allergen specifically, skip entirely if exclusion terms found
        if allergen == "milk" and has_exclusion_terms:
            continue
            
        for keyword in keywords:
            if keyword in combined_text:
                # Check if this keyword match is actually part of an exclusion term
                is_excluded = False
                
                # For example, if "milk" is found but it's part of "breast milk"
                for exclusion in exclusion_terms:
                    # Check all possible positions where keyword could be within exclusion term
                    if (exclusion.startswith(keyword + " ") or 
                        exclusion.endswith(" " + keyword) or 
                        " " + keyword + " " in exclusion or 
                        exclusion == keyword):
                        
                        # Only exclude if this exact exclusion term is in the text
                        if exclusion in combined_text:
                            is_excluded = True
                            break
                
                if not is_excluded:
                    matched_allergens.append(allergen)
                    break  # No need to check other keywords for this allergen

    return list(set(matched_allergens))  # Remove duplicates

In [None]:
df['allergen'] = df.apply(
    lambda row: detect_allergens(row, allergen_tags),
    axis=1
)

# Update hypoallergenic column based on allergen data
df['hypoallergenic'] = df['allergen'].apply(
    lambda allergens: "Yes" if not allergens or len(allergens) == 0 else "No"
)

# Display results to verify
df[['name', 'allergen', 'hypoallergenic', 'choking_hazards']].head(10)

### **Classify Category of The Recipe**

In [None]:
# Load the module
import importlib

category_path = os.path.join(os.getcwd(), "preprocessing_techniques", "mappers", "category_dict.py")

spec = importlib.util.spec_from_file_location("category", category_path)
category_file = importlib.util.module_from_spec(spec)
spec.loader.exec_module(category_file)

# Access the config
dietary_tags  = category_file.dietary_tags
pprint(dietary_tags)

In [None]:
category_map = pd.read_excel('1st_dataset.xlsx', sheet_name='category')
print("Category columns:", category_map.columns.tolist())
print("Category shape:", category_map.shape)
print("\nFirst 5 rows of Category:")
display(category_map.head())

In [None]:
# FINAL PRODUCTION VERSION: classify_recipe with Non-Veg Fallback
def classify_recipe(recipe_data, dietary_tags):
    """
    FINAL VERSION: Classifies recipes with dietary tags using ner_ingredient_string directly.
    Includes fallback to 'non_veg' when no specific dietary category (vegan/vegetarian/pescetarian) matches.
    
    Args:
        recipe_data: Row containing recipe information with ner_ingredient_string
        dietary_tags: Dictionary with dietary classification rules
    
    Returns:
        String of comma-separated dietary tags
    """
    
    # Use ner_ingredient_string directly (clean comma-separated string)
    ingredient_string = str(recipe_data.get('ner_ingredient_string', ''))
    
    if not ingredient_string or ingredient_string.strip() == '':
        return ''
    
    # Convert to lowercase list for matching
    ingredients = [ing.strip().lower() for ing in ingredient_string.split(',') if ing.strip()]
    
    # Initialize tag categories
    general_diet = None
    religious_tag = None
    allergen_tags = []
    
    # Check each dietary category
    for category, rules in dietary_tags.items():
        # Get excluded ingredients from the rules
        excluded_ingredients = rules.get('excluded_ingredients', [])
        
        # Check exclusions - if any excluded ingredient is found, skip this tag
        exclude_found = False
        if excluded_ingredients:
            exclude_found = any(
                any(excl.lower() in ingredient.lower() for ingredient in ingredients)
                for excl in excluded_ingredients
            )
        
        # If not excluded, the tag applies
        if not exclude_found:
            print
            # Categorize tags
            if category in ['vegan', 'vegetarian', 'pescetarian']:
                # Only assign one general diet - take the most restrictive that applies
                if general_diet is None:
                    general_diet = category
                elif category == 'vegan' and general_diet != 'vegan':
                    general_diet = category  # Vegan is most restrictive
                elif category == 'vegetarian' and general_diet not in ['vegan']:
                    general_diet = category
            elif category in ['halal', 'kosher']:
                religious_tag = category
            else:
                allergen_tags.append(category)
    
    # 🔧 FALLBACK MECHANISM: If no specific dietary category matched, default to 'non_veg'
    if general_diet is None:
        general_diet = 'non_veg'
    
    # Combine tags: one general diet + one religious + multiple allergens
    final_tags = []
    if general_diet:
        final_tags.append(general_diet)
    if religious_tag:
        final_tags.append(religious_tag)
    final_tags.extend(allergen_tags)
    
    return ', '.join(final_tags)


In [None]:
sample_row = df.iloc[0].copy()  
classified_tags = classify_recipe(sample_row, dietary_tags)
print("Classified Tags:")
print(classified_tags)

In [None]:
df['dietary_tags'] = df.apply(
    lambda row: classify_recipe(row, dietary_tags),
    axis=1)

df[['ner_ingredient', 'dietary_tags', 'choking_hazards']].head(10)

In [None]:
df['dietary_tags'] = df['dietary_tags'].str.replace('_', ' ', regex=False)

df[['name', 'dietary_tags', 'choking_hazard']].head(10)

In [None]:
# 1. Show how many unique dietary tags combinations exist
unique_combinations = df['dietary_tags'].value_counts(dropna=False)
print("📊 Unique dietary tag combinations:")
print(unique_combinations)
print("-" * 50)

# 2. If you want to see them as individual tags instead:
def split_and_flatten(series):
    return series.str.split(",").explode().str.strip().str.lower()

# Get all unique dietary tags
unique_tags = split_and_flatten(df['dietary_tags']).dropna().drop_duplicates().sort_values()
print("\n✅ Unique dietary tags found:")
for tag in unique_tags:
    print(f" - {tag}")

# 3. Check if all values are the same
all_same = df['dietary_tags'].nunique(dropna=False) == 1
if all_same:
    sample_value = df['dietary_tags'].iloc[0]
    print(f"\n⚠️ Warning: All rows have the same dietary_tags value: '{sample_value}'")
else:
    print("\n✅ Dietary_tags vary across rows ✅")

#### Map to the category id <> recipe_id

In [None]:
# df = df.drop(columns=['dietary_tags_list'])
df[['recipe_id', 'name', 'dietary_tags']].head(10)

In [None]:
category_name_to_id = dict(zip(category_map['name'], category_map['pk']))
print(f"📊 Created mapping for {len(category_name_to_id)} categories from category_map")
print(f"   Sample mapping: {list(category_name_to_id.items())}")    

In [None]:
df_category = df.copy()
def parse_tags(tag_str):
    if isinstance(tag_str, str):
        return [tag.strip() for tag in tag_str.split(",")]
    elif isinstance(tag_str, list):
        return tag_str
    else:
        return []

df_category['dietary_tags_list'] = df_category['dietary_tags'].apply(parse_tags)
df_category[['recipe_id', 'name', 'dietary_tags', 'dietary_tags_list']].head(10) 

In [None]:
def get_category_ids(tags):
    return [category_name_to_id[tag] for tag in tags if tag in category_name_to_id]

df_category['category_ids'] = df_category['dietary_tags_list'].apply(get_category_ids)

# Step 4: Explode into one row per (recipe_id, category_id)
recipe_category_df = df_category[['recipe_id', 'category_ids']].explode('category_ids')
recipe_category_df.columns = ['recipe_id', 'category_id']

# Step 5: Drop rows where no matching category was found
recipe_category_df.dropna(subset=['category_id'], inplace=True)
recipe_category_df['category_id'] = recipe_category_df['category_id'].astype(int)

# Optional: Reset index
recipe_category_df.reset_index(drop=True, inplace=True)

In [None]:
display(df_category[['recipe_id', 'name', 'dietary_tags', 'category_ids']].head(10))

display(recipe_category_df[['recipe_id', 'category_id']].head(10))

In [None]:
#finalize the recipe_Df

### **Populating Nutrition Value**

In [None]:
# Check which records have empty nutrition_value
empty_nutrition = df['nutrition_value'].isna() | (df['nutrition_value'] == '') | (df['nutrition_value'] == 'None')
print(f"Records with empty nutrition_value: {empty_nutrition.sum()} out of {len(df)}")


# Function to extract ingredient information
def extract_ingredient_info(ingredient_text):
    """Extract quantity, measurement, and ingredient name from ingredient text"""
    if pd.isna(ingredient_text) or ingredient_text == '':
        return []
    
    # Split by lines and clean
    lines = [line.strip() for line in ingredient_text.split('\n') if line.strip()]
    
    extracted_ingredients = []
    
    for line in lines:
        # Remove leading dash or bullet points
        line = re.sub(r'^[-•*]\s*', '', line)
        
        # Pattern to match quantity, measurement, and ingredient
        # Examples: "60 g cassava", "2-3 tablespoons of plain yogurt", "1½ tablespoons vegetable oil"
        patterns = [
            # Pattern 1: Range + unit + "of" + ingredient (e.g., "2-3 tablespoons of plain yogurt")
            r'^([0-9]+(?:[.,][0-9]+)?(?:[½¼¾])?[-–][0-9]+(?:[.,][0-9]+)?(?:[½¼¾])?)\s*([a-zA-Z]+)\s+of\s+(.+)$',
            
            # Pattern 2: Range + unit + ingredient (e.g., "2-3 tablespoons plain yogurt")
            r'^([0-9]+(?:[.,][0-9]+)?(?:[½¼¾])?[-–][0-9]+(?:[.,][0-9]+)?(?:[½¼¾])?)\s*([a-zA-Z]+)\s+(.+)$',
            
            # Pattern 3: Number + unit + "of" + ingredient (e.g., "30 g of sweet potato")
            r'^([0-9]+(?:[.,][0-9]+)?(?:[½¼¾])?)\s*([a-zA-Z]+)\s+of\s+(.+)$',
            
            # Pattern 4: Number + unit + ingredient (e.g., "60 g cassava", "175g cauliflower")
            r'^([0-9]+(?:[.,][0-9]+)?(?:[½¼¾])?)\s*([a-zA-Z]+)\s+(.+)$',
            
            # Pattern 5: Fraction + unit + "of" + ingredient (e.g., "1½ tablespoons of oil")
            r'^([0-9]*[½¼¾][0-9]*)\s*([a-zA-Z]+)\s+of\s+(.+)$',
            
            # Pattern 6: Fraction + unit + ingredient (e.g., "1½ tablespoons oil")
            r'^([0-9]*[½¼¾][0-9]*)\s*([a-zA-Z]+)\s+(.+)$',
            
            # Pattern 7: Range + ingredient (no unit) (e.g., "2-3 carrots")
            r'^([0-9]+(?:[.,][0-9]+)?(?:[½¼¾])?[-–][0-9]+(?:[.,][0-9]+)?(?:[½¼¾])?)\s+(.+)$',
            
            # Pattern 8: Number + ingredient (no unit) (e.g., "1 carrot", "1 onion")
            r'^([0-9]+(?:[.,][0-9]+)?(?:[½¼¾])?)\s+(.+)$',
            
            # Pattern 9: Just ingredient (no quantity/unit)
            r'^(.+)$'
        ]
        
        quantity = None
        measurement = None
        ingredient_name = None
        
        for i, pattern in enumerate(patterns):
            match = re.match(pattern, line, re.IGNORECASE)
            if match:
                if i == 0:  # Pattern 1: range + unit + "of" + ingredient
                    quantity = match.group(1)
                    measurement = match.group(2)
                    ingredient_name = match.group(3).strip()
                elif i == 1:  # Pattern 2: range + unit + ingredient
                    quantity = match.group(1)
                    measurement = match.group(2)
                    ingredient_name = match.group(3).strip()
                elif i == 2:  # Pattern 3: number + unit + "of" + ingredient
                    quantity = match.group(1)
                    measurement = match.group(2)
                    ingredient_name = match.group(3).strip()
                elif i == 3:  # Pattern 4: number + unit + ingredient
                    quantity = match.group(1)
                    measurement = match.group(2)
                    ingredient_name = match.group(3).strip()
                elif i == 4:  # Pattern 5: fraction + unit + "of" + ingredient
                    quantity = match.group(1)
                    measurement = match.group(2)
                    ingredient_name = match.group(3).strip()
                elif i == 5:  # Pattern 6: fraction + unit + ingredient
                    quantity = match.group(1)
                    measurement = match.group(2)
                    ingredient_name = match.group(3).strip()
                elif i == 6:  # Pattern 7: range + ingredient (no unit)
                    quantity = match.group(1)
                    measurement = None
                    ingredient_name = match.group(2).strip()
                elif i == 7:  # Pattern 8: number + ingredient (no unit)
                    quantity = match.group(1)
                    measurement = None
                    ingredient_name = match.group(2).strip()
                else:  # Pattern 9: just ingredient
                    quantity = None
                    measurement = None
                    ingredient_name = match.group(1).strip()
                break
        
        # Clean up ingredient name (remove extra descriptions after comma)
        if ingredient_name:
            # Remove descriptions after comma (e.g., "cassava, boiled and blended" -> "cassava")
            ingredient_name = ingredient_name.split(',')[0].strip()
            
            extracted_ingredients.append({
                'original_text': line,
                'quantity': quantity,
                'measurement': measurement,
                'ingredient_name': ingredient_name
            })
    
    return extracted_ingredients



In [None]:
# Test the function with specific examples
print("=== TESTING INGREDIENT EXTRACTION WITH RANGE FORMATS ===")

test_ingredients = [
    "2-3 tablespoons of plain yogurt",
    "1-2 teaspoons of vanilla extract", 
    "30 g of sweet potato",
    "60 g cassava",
    "1½ tablespoons vegetable oil",
    "2-3 carrots",
    "1 onion",
    "plain water"
]

for test_ingredient in test_ingredients:
    print(f"\nTesting: '{test_ingredient}'")
    result = extract_ingredient_info(test_ingredient)
    if result:
        for r in result:
            print(f"  ✅ Quantity: {r['quantity']}, Measurement: {r['measurement']}, Ingredient: {r['ingredient_name']}")
    else:
        print(f"  ❌ No match found")


In [None]:
df_empty_nutrition = df[df['nutrition_value'].isna() | (df['nutrition_value'] == '') | (df['nutrition_value'] == 'None')].copy()

print(f"\nExtracting ingredient information for {len(df_empty_nutrition)} records with empty nutrition_value...")
df_empty_nutrition['extracted_ingredients'] = df_empty_nutrition['ingredients'].apply(extract_ingredient_info)

df_empty_nutrition[['ingredients','extracted_ingredients']]


In [None]:
# Display sample extractions
print("\nSample ingredient extractions:")
for idx, row in df_empty_nutrition.head(3).iterrows():
    print(f"\n--- Recipe: {row['name']} ---")
    print(f"Original ingredients:\n{row['ingredients']}")
    print("\nExtracted ingredients:")
    for ing in row['extracted_ingredients']:
        print(f"  - Quantity: {ing['quantity']}, Measurement: {ing['measurement']}, Ingredient: {ing['ingredient_name']}")
        print(f"    Original: {ing['original_text']}")


In [None]:
# Create a detailed breakdown DataFrame
ingredient_details = []
for idx, row in df_empty_nutrition.iterrows():
    recipe_name = row['name']
    for ing in row['extracted_ingredients']:
        ingredient_details.append({
            'recipe_id': idx,
            'recipe_name': recipe_name,
            'quantity': ing['quantity'],
            'measurement': ing['measurement'],
            'ingredient_name': ing['ingredient_name'],
            'original_text': ing['original_text']
        })

ingredient_breakdown_df = pd.DataFrame(ingredient_details)
print(f"\nCreated ingredient breakdown with {len(ingredient_breakdown_df)} ingredient entries")
print(f"From {len(df_empty_nutrition)} recipes with empty nutrition values")



In [None]:
#print new df
ingredient_breakdown_df.head()


In [None]:
# Show summary statistics
print("\nSummary of extracted measurements:")
if len(ingredient_breakdown_df) > 0:
    print(ingredient_breakdown_df['measurement'].value_counts().head(10))
    
    print("\nMost common ingredients:")
    print(ingredient_breakdown_df['ingredient_name'].value_counts().head(10))

In [None]:
# Function to convert measurements to grams
def convert_to_grams(quantity, measurement, ingredient_name):
    """
    Convert quantity and measurement to grams based on common cooking conversions.
    Returns converted quantity in grams and 'g' as measurement.
    """
    if not quantity or not measurement:
        return quantity, measurement
    
    # Clean and parse quantity (handle ranges and fractions)
    def parse_quantity(qty_str):
        if not qty_str:
            return 1.0
        
        # Handle fractions
        fraction_map = {'½': 0.5, '¼': 0.25, '¾': 0.75, '⅓': 0.33, '⅔': 0.67}
        for frac, val in fraction_map.items():
            qty_str = str(qty_str).replace(frac, str(val))
        
        # Handle ranges (take average)
        if '-' in str(qty_str) or '–' in str(qty_str):
            parts = re.split(r'[-–]', str(qty_str))
            if len(parts) == 2:
                try:
                    min_val = float(parts[0].strip())
                    max_val = float(parts[1].strip())
                    return (min_val + max_val) / 2
                except:
                    pass
        
        # Convert to float
        try:
            return float(qty_str)
        except:
            return 1.0
    
    parsed_qty = parse_quantity(quantity)
    measurement_lower = str(measurement).lower()
    
    # Conversion factors to grams
    conversions = {
        # Volume conversions (approximate for common ingredients)
        'tsp': 5,           # 1 tsp ≈ 5g (for most liquids/powders)
        'teaspoon': 5,
        'teaspoons': 5,
        'tbsp': 15,         # 1 tbsp ≈ 15g
        'tablespoon': 15,
        'tablespoons': 15,
        'cup': 240,         # 1 cup ≈ 240g (for liquids)
        'cups': 240,
        'ml': 1,            # 1ml ≈ 1g (for water-based liquids)
        'milliliters': 1,
        'milliliter': 1,
        'l': 1000,          # 1 liter = 1000g
        'liter': 1000,
        'liters': 1000,
        
        # Weight conversions
        'g': 1,             # already in grams
        'gram': 1,
        'grams': 1,
        'kg': 1000,         # 1 kg = 1000g
        'kilogram': 1000,
        'kilograms': 1000,
        'oz': 28.35,        # 1 oz ≈ 28.35g
        'ounce': 28.35,
        'ounces': 28.35,
        'lb': 453.6,        # 1 lb ≈ 453.6g
        'pound': 453.6,
        'pounds': 453.6,
        'cc': 1,           
        
        # Piece conversions (rough estimates)
        'small': 50,        # small piece ≈ 50g
        'medium': 100,      # medium piece ≈ 100g
        'large': 150,       # large piece ≈ 150g
        'piece': 75,        # average piece ≈ 75g
        'pieces': 75,
        'clove': 3,         # garlic clove ≈ 3g
        'cloves': 3,
    }
    
    # Convert to grams
    if measurement_lower in conversions:
        converted_qty = parsed_qty * conversions[measurement_lower]
        return round(converted_qty, 1), 'g'
    else:
        # If measurement not found, return original
        return quantity, measurement



In [None]:
# Test the conversion function
print("Testing conversion function:")
test_cases = [
    ('2', 'tbsp', 'vegetable oil'),
    ('1', 'cup', 'flour'),
    ('1', 'cc', 'salt'),
    ('100', 'ml', 'water'),
    ('1', 'medium', 'onion'),
    ('2-3', 'tsp', 'sugar'),
    ('½', 'cup', 'butter')
]

# for qty, measure, ingredient in test_cases:
#     # new_qty, new_measure = convert_to_grams(qty, measure, ingredient)
#     print(f"{qty} {measure} {ingredient} → {new_qty} {new_measure}")


In [None]:
# Apply conversion to all ingredients in the dataframe
print("Converting all measurements to grams...")

# Create new columns for converted values
ingredient_breakdown_df['original_quantity'] = ingredient_breakdown_df['quantity'].copy()
ingredient_breakdown_df['original_measurement'] = ingredient_breakdown_df['measurement'].copy()

# Apply conversion
conversion_results = ingredient_breakdown_df.apply(
    lambda row: convert_to_grams(row['quantity'], row['measurement'], row['ingredient_name']), 
    axis=1
)

# Update the dataframe with converted values
ingredient_breakdown_df['quantity'] = [result[0] for result in conversion_results]
ingredient_breakdown_df['measurement'] = [result[1] for result in conversion_results]

print(f"\nConversion completed for {len(ingredient_breakdown_df)} ingredients")

# Show sample conversions
print("\nSample conversions:")
sample_conversions = ingredient_breakdown_df[ingredient_breakdown_df['original_measurement'].notna()].head(10)
for idx, row in sample_conversions.iterrows():
    print(f"{row['original_quantity']} {row['original_measurement']} {row['ingredient_name']} → {row['quantity']} {row['measurement']}")

# Show new measurement distribution
print("\nNew measurement distribution:")
print(ingredient_breakdown_df['measurement'].value_counts())



In [None]:
#print new df
ingredient_breakdown_df.head()

In [None]:
# Filter ingredients that have complete quantity and measurement data
print("Filtering ingredients with complete quantity and measurement data...")

# Check current data completeness
print(f"\nTotal ingredients in breakdown: {len(ingredient_breakdown_df)}")
print(f"Ingredients with quantity: {ingredient_breakdown_df['quantity'].notna().sum()}")
print(f"Ingredients with measurement: {ingredient_breakdown_df['measurement'].notna().sum()}")
print(f"Ingredients with both quantity and measurement: {(ingredient_breakdown_df['quantity'].notna() & ingredient_breakdown_df['measurement'].notna()).sum()}")

# Create mask for complete data (both quantity and measurement are not null)
complete_data_mask = (
    ingredient_breakdown_df['quantity'].notna() & 
    ingredient_breakdown_df['measurement'].notna() &
    (ingredient_breakdown_df['quantity'] != '') &
    (ingredient_breakdown_df['measurement'] != '')
)

# Filter dataframe to only include ingredients with complete data
ingredients_complete = ingredient_breakdown_df[complete_data_mask].copy()

print(f"\nIngredients with complete quantity and measurement: {len(ingredients_complete)}")
print(f"Percentage of complete data: {len(ingredients_complete) / len(ingredient_breakdown_df) * 100:.1f}%")

# Show sample of complete ingredients
print("\nSample of ingredients with complete data:")
print(ingredients_complete[['ingredient_name', 'quantity', 'measurement']].head(10))


In [None]:
ingredient_breakdown_df[complete_data_mask]

In [None]:
ingredient_counts = ingredients_complete['ingredient_name'].value_counts()
print(f"Total unique ingredients: {len(ingredient_counts)}")
print("\nMost frequently used ingredients:")
print(ingredient_counts.head(15))

# Create a dataframe with ingredient frequencies
unique_ingredients_with_counts = ingredient_counts.reset_index()
unique_ingredients_with_counts.columns = ['ingredient_name', 'frequency']
print(f"\nCreated dataframe with {len(unique_ingredients_with_counts)} unique ingredients")



In [None]:

unique_ingredients_with_counts


##### **Requesting to USDA Food Central**

In [None]:
import requests
from tqdm import tqdm
import requests
import time

api_url = "https://api.nal.usda.gov/fdc/v1/foods/search"
api_key = "KulngHmZ1nJeaPPBrZ8pH3kyJI2Gy1r9Xm121YO9"

def get_nutrition_data(query: str, api_key: str):
    """
    Fetch nutrition data for a specific ingredient from USDA API with prioritized search strategy.
    
    Search Strategy:
    1. Foundation exact match
    2. Foundation first result
    3. Survey (FNDDS) exact match  
    4. Survey (FNDDS) first result
    
    Returns:
    dict: Nutrition information including energy, macronutrients, top 3 micronutrients by value,
          and search method used
    """
    
    base_url = "https://api.nal.usda.gov/fdc/v1/foods/search"
    
    def search_with_params(data_type):
        """Helper function to search with specific parameters"""
        params = {
            'query': query,
            'api_key': api_key,
            'dataType': [data_type],
            'pageSize': 25,  # Get more results for better matching
            'pageNumber': 1,
            'sortBy': 'dataType.keyword',
            'sortOrder': 'asc'
        }
        
        try:
            response = requests.get(base_url, params=params, timeout=10)
            if response.status_code == 200:
                return response.json()
            else:
                print(f"❌ API request failed: {response.status_code}")
                return None
        except Exception as e:
            print(f"❌ Error in API request: {e}")
            return None
    
    def find_exact_match(foods, query_lower):
        """Find exact match by comparing full ingredient name with full food description (case-insensitive)"""
        for food in foods:
            description_lower = food['description'].lower()
            # Check if the query exactly matches the description or if query is a whole word in description
            # Use word boundaries to ensure exact matching
            import re
            
            # Create pattern for exact word matching
            pattern = r'\b' + re.escape(query_lower) + r'\b'
            
            # Check if query is the entire description OR appears as complete word(s)
            if (query_lower == description_lower or 
                re.search(pattern, description_lower)):
                return food
        return None
    
    def extract_nutrition_info(food, search_method):
        """Extract nutrition information from food item"""
        nutrients = food.get('foodNutrients', [])
        
        result = {
            'ingredient_name': query,
            'found_description': food['description'],
            'search_method': search_method,
            'energy_kcal': None,
            'carbohydrate_g': None,
            'protein_g': None,
            'fat_g': None,
            'micronutrients': [],
            'status': 'success'
        }
        
        # Energy (Atwater General Factors)
        energy = next((item for item in nutrients if item['nutrientName'] == 'Energy (Atwater Specific Factors)'), None)
        if energy:
            result['energy_kcal'] = energy['value']
        
        # Carbohydrates
        carbohydrate = next((item for item in nutrients if item['nutrientName'] == 'Carbohydrate, by difference'), None)
        if carbohydrate:
            result['carbohydrate_g'] = carbohydrate['value']
        
        # Fat
        fat = next((item for item in nutrients if item['nutrientName'] == 'Total lipid (fat)'), None)
        if fat:
            result['fat_g'] = fat['value']
        
        # Protein
        protein = next((item for item in nutrients if item['nutrientName'] == 'Protein'), None)
        if protein:
            result['protein_g'] = protein['value']
        
        # Exclude certain nutrients from micronutrients
        exclude_nutrients = [
            "Energy", "Water", "Energy (Atwater General Factors)", "Energy (Atwater Specific Factors)",
            "Nitrogen", "Protein", "Total lipid (fat)", "Ash", "Carbohydrates",
            "Carbohydrate, by difference", "Total dietary fiber (AOAC 2011.25)",
            "High Molecular Weight Dietary Fiber (HMWDF)", "Low Molecular Weight Dietary Fiber (LMWDF)",
            "Sugars, Total", "Total Sugars", "Sucrose", "Glucose", "Fructose", "Lactose", "Maltose"
        ]
        
        # Get micronutrients (vitamins and minerals) - top 3 by value
        filtered_micronutrients = [
            item for item in nutrients 
            if item['nutrientName'] not in exclude_nutrients and item['value'] > 0
        ]
        
        # Sort by value in descending order and take top 3
        sorted_micronutrients = sorted(filtered_micronutrients, key=lambda x: x['value'], reverse=True)
        top_3_micronutrients = sorted_micronutrients[:3]
        
        # Extract only the nutrient names
        micronutrients = [item['nutrientName'] for item in top_3_micronutrients]
        result['micronutrients'] = micronutrients
        
        return result

    # Make the API call with prioritized search strategy
    try:
        query_lower = query.lower()
        
        # Step 1: Search Foundation for exact match
        print(f"🔍 Step 1: Searching Foundation for exact match: '{query}'")
        data = search_with_params("Foundation")
        if data and 'foods' in data and data['foods']:
            exact_match = find_exact_match(data['foods'], query_lower)
            if exact_match:
                print(f"✅ Found exact match in Foundation: {exact_match['description']}")
                return extract_nutrition_info(exact_match, "Foundation_exact")
        
        # Step 2: Search Foundation and take first result
        print(f"🔍 Step 2: Taking first Foundation result: '{query}'")
        if data and 'foods' in data and data['foods']:
            first_result = data['foods'][0]
            print(f"📄 Using first Foundation result: {first_result['description']}")
            return extract_nutrition_info(first_result, "Foundation_first")
        
        # Step 3: Search Survey (FNDDS) for exact match
        print(f"🔍 Step 3: Searching Survey (FNDDS) for exact match: '{query}'")
        data = search_with_params("Survey (FNDDS)")
        if data and 'foods' in data and data['foods']:
            exact_match = find_exact_match(data['foods'], query_lower)
            if exact_match:
                print(f"✅ Found exact match in Survey: {exact_match['description']}")
                return extract_nutrition_info(exact_match, "Survey_exact")
            
            # Step 4: Take first Survey result as fallback
            print(f"🔍 Step 4: Taking first Survey (FNDDS) result: '{query}'")
            first_result = data['foods'][0]
            print(f"📄 Using first Survey result: {first_result['description']}")
            return extract_nutrition_info(first_result, "Survey_first")
        
        # If no results found at all
        print(f"❌ No nutrition data found for '{query}'")
        return {
            'ingredient_name': query,
            'found_description': None,
            'search_method': 'not_found',
            'energy_kcal': None,
            'carbohydrate_g': None,
            'protein_g': None,
            'fat_g': None,
            'micronutrients': [],
            'status': 'failed',
            'error': 'No results found in any database'
        }
        
    except Exception as e:
        print(f"❌ Error processing '{query}': {e}")
        return {
            'ingredient_name': query,
            'found_description': None,
            'search_method': 'error',
            'energy_kcal': None,
            'carbohydrate_g': None,
            'protein_g': None,
            'fat_g': None,
            'micronutrients': [],
            'status': 'failed',
            'error': str(e)
        }



In [None]:
import time
from tqdm import tqdm

def process_all_ingredients(unique_ingredients_df, api_key, delay=0.1):
    """
    Process all unique ingredients to get their nutrition data.
    
    Args:
        unique_ingredients_df: DataFrame with 'ingredient_name' column
        api_key: USDA API key
        delay: Delay between API calls in seconds (to respect rate limits)
    
    Returns:
        tuple: (nutrition_df, failed_ingredients_list)
    """
    nutrition_results = []
    failed_ingredients = []
    
    print(f"Processing {len(unique_ingredients_df)} unique ingredients...")
    
    # Process each ingredient
    for idx, row in tqdm(unique_ingredients_df.iterrows(), total=len(unique_ingredients_df)):
        ingredient_name = row['ingredient_name']
        # frequency = row['frequency']
        
        try:
            # Get nutrition data
            nutrition_data = get_nutrition_data(ingredient_name, api_key)
            # nutrition_data['frequency'] = frequency
            
            if nutrition_data['status'] == 'success':
                nutrition_results.append(nutrition_data)
                print(f"✅ {ingredient_name}: Found - {nutrition_data['found_description']}")
            else:
                failed_ingredients.append({
                    'ingredient_name': ingredient_name,
                    # 'frequency': frequency,
                    'reason': nutrition_data['status']
                })
                print(f"❌ {ingredient_name}: {nutrition_data['status']}")
            
            # Add delay to respect API rate limits
            time.sleep(delay)
            
        except Exception as e:
            failed_ingredients.append({
                'ingredient_name': ingredient_name,
                # 'frequency': frequency,
                'reason': f'exception: {str(e)}'
            })
            print(f"❌ {ingredient_name}: Exception - {str(e)}")
    
    # Create DataFrames
    nutrition_df = pd.DataFrame(nutrition_results)
    failed_df = pd.DataFrame(failed_ingredients)
    
    return nutrition_df, failed_df

In [None]:
# Process all unique ingredients (excluding water)
print("Starting nutrition data collection for all unique ingredients...")
print(f"Total ingredients to process: {len(unique_ingredients_with_counts)}")

# Exclude 'water' from processing (since water has no significant nutrition and causes matching issues)
ingredients_to_exclude = ['water', 'plain water', 'boiling water', 'cold water', 'warm water']
filtered_ingredients = unique_ingredients_with_counts[
    ~unique_ingredients_with_counts['ingredient_name'].str.lower().isin([x.lower() for x in ingredients_to_exclude])
].copy()


In [None]:
# Create 6 separate DataFrames for each batch of 150 ingredients
import math

def create_batch_dataframes(unique_ingredients_list, batch_size=150):
    """
    Create 6 separate DataFrames, each containing a batch of ingredients.
    Each DataFrame will be processed separately and can store its own nutrition data.
    
    Args:
        unique_ingredients_list: List of unique ingredients
        batch_size: Number of ingredients per batch (default: 150)
    
    Returns:
        Dictionary of DataFrames (batch_1 through batch_6)
    """
    # Calculate total number of batches needed
    total_batches = math.ceil(len(unique_ingredients_list) / batch_size)
    max_batches = 6  # Limit to 6 batches as requested
    
    print(f"Total ingredients: {len(unique_ingredients_list)}")
    print(f"Batch size: {batch_size}")
    print(f"Total batches needed: {total_batches}")
    print(f"Creating {min(total_batches, max_batches)} DataFrames")
    print("=" * 50)
    
    batch_dataframes = {}
    
    for batch_num in range(1, min(total_batches, max_batches) + 1):
        # Calculate start and end indices for this batch
        start_idx = (batch_num - 1) * batch_size
        end_idx = min(start_idx + batch_size, len(unique_ingredients_list))
        
        # Get ingredients for this batch
        batch_ingredients = unique_ingredients_list[start_idx:end_idx]
        
        # Create DataFrame for this batch with empty nutrition columns
        batch_df = pd.DataFrame({
            'ingredient': batch_ingredients,
            'description': [None] * len(batch_ingredients),
            'energy_kcal_per_100g': [None] * len(batch_ingredients),
            'carbs_g_per_100g': [None] * len(batch_ingredients),
            'protein_g_per_100g': [None] * len(batch_ingredients),
            'fat_g_per_100g': [None] * len(batch_ingredients),
            'top_micronutrients': [None] * len(batch_ingredients),
            'search_method': [None] * len(batch_ingredients),
            'batch_number': [batch_num] * len(batch_ingredients)
        })
        
        # Store in dictionary
        batch_dataframes[f'batch_{batch_num}'] = batch_df
        
        print(f"Batch {batch_num}: {len(batch_ingredients)} ingredients (indices {start_idx}-{end_idx-1})")
        print(f"  Sample ingredients: {batch_ingredients[:3]}...")
        print()
    
    return batch_dataframes



In [None]:
# Create the unique ingredients list from filtered_ingredients DataFrame
unique_ingredients_list = filtered_ingredients['ingredient_name'].tolist()
print(f"Extracted {len(unique_ingredients_list)} unique ingredients from filtered_ingredients DataFrame")

# Create the batch DataFrames
print("Creating 6 separate DataFrames for batch processing...")
batch_dfs = create_batch_dataframes(unique_ingredients_list, batch_size=150)

# Display information about each batch
print("\n📊 BATCH DATAFRAMES CREATED:")
print("=" * 60)
for batch_name, df in batch_dfs.items():
    print(f"{batch_name.upper()}:")
    print(f"  - Shape: {df.shape}")
    print(f"  - Ingredient range: {df['ingredient'].iloc[0]} ... {df['ingredient'].iloc[-1]}")
    print(f"  - Batch number: {df['batch_number'].iloc[0]}")
    print()

In [None]:
# Create 6 separate DataFrames for each batch of 150 ingredients
import math

def create_batch_dataframes(unique_ingredients_list, batch_size=150):
    """
    Create 6 separate DataFrames, each containing a batch of ingredients.
    Each DataFrame will be processed separately and can store its own nutrition data.
    
    Args:
        unique_ingredients_list: List of unique ingredients
        batch_size: Number of ingredients per batch (default: 150)
    
    Returns:
        Dictionary of DataFrames (batch_1 through batch_6)
    """
    # Calculate total number of batches needed
    total_batches = math.ceil(len(unique_ingredients_list) / batch_size)
    max_batches = 6  # Limit to 6 batches as requested
    
    print(f"Total ingredients: {len(unique_ingredients_list)}")
    print(f"Batch size: {batch_size}")
    print(f"Total batches needed: {total_batches}")
    print(f"Creating {min(total_batches, max_batches)} DataFrames")
    print("=" * 50)
    
    batch_dataframes = {}
    
    for batch_num in range(1, min(total_batches, max_batches) + 1):
        # Calculate start and end indices for this batch
        start_idx = (batch_num - 1) * batch_size
        end_idx = min(start_idx + batch_size, len(unique_ingredients_list))
        
        # Get ingredients for this batch
        batch_ingredients = unique_ingredients_list[start_idx:end_idx]
        
        # Create DataFrame for this batch with empty nutrition columns
        batch_df = pd.DataFrame({
            'ingredient': batch_ingredients,
            'description': [None] * len(batch_ingredients),
            'energy_kcal_per_100g': [None] * len(batch_ingredients),
            'carbs_g_per_100g': [None] * len(batch_ingredients),
            'protein_g_per_100g': [None] * len(batch_ingredients),
            'fat_g_per_100g': [None] * len(batch_ingredients),
            'top_micronutrients': [None] * len(batch_ingredients),
            'search_method': [None] * len(batch_ingredients),
            'batch_number': [batch_num] * len(batch_ingredients)
        })
        
        # Store in dictionary
        batch_dataframes[f'batch_{batch_num}'] = batch_df
        
        print(f"Batch {batch_num}: {len(batch_ingredients)} ingredients (indices {start_idx}-{end_idx-1})")
        print(f"  Sample ingredients: {batch_ingredients[:3]}...")
        print()
    
    return batch_dataframes


In [None]:
# Function to process individual batch DataFrames
import datetime  # Add missing import

def process_batch_dataframe(batch_df, batch_name, api_key, save_results=True):
    """
    Process a single batch DataFrame by fetching nutrition data for all ingredients.
    
    Args:
        batch_df: DataFrame containing ingredients for this batch
        batch_name: Name of the batch (e.g., 'batch_1')
        api_key: USDA API key
        save_results: Whether to save results to Excel file
    
    Returns:
        Tuple of (processed_df, failed_ingredients_list)
    """
    print(f"\n🔄 PROCESSING {batch_name.upper()}")
    print("=" * 50)
    
    processed_df = batch_df.copy()
    failed_ingredients = []
    successful_count = 0
    
    total_ingredients = len(batch_df)
    
    for idx, row in batch_df.iterrows():
        ingredient = row['ingredient']
        
        # Progress indicator
        current_position = idx - batch_df.index[0] + 1
        print(f"Processing {current_position}/{total_ingredients}: {ingredient}")
        
        # Fetch nutrition data
        nutrition_data = get_nutrition_data(ingredient, api_key)
        
        if nutrition_data and nutrition_data['status'] == 'success':
            # Update the DataFrame with nutrition data
            processed_df.at[idx, 'description'] = nutrition_data.get('found_description', ingredient)
            processed_df.at[idx, 'energy_kcal_per_100g'] = nutrition_data.get('energy_kcal')
            processed_df.at[idx, 'energy_kcal_per_100g'] = nutrition_data.get('energy_kcal')
            processed_df.at[idx, 'carbs_g_per_100g'] = nutrition_data.get('carbohydrate_g')
            processed_df.at[idx, 'protein_g_per_100g'] = nutrition_data.get('protein_g')
            processed_df.at[idx, 'fat_g_per_100g'] = nutrition_data.get('fat_g')
            processed_df.at[idx, 'top_micronutrients'] = ', '.join(nutrition_data.get('micronutrients', []))
            processed_df.at[idx, 'search_method'] = nutrition_data.get('search_method')
            
            successful_count += 1
            print(f"  ✅ Success - Method: {nutrition_data.get('search_method')}")
        else:
            failed_ingredients.append(ingredient)
            print(f"  ❌ Failed")
        
        # Small delay to avoid overwhelming the API
        time.sleep(0.1)
    
    # Summary
    print(f"\n📊 {batch_name.upper()} SUMMARY:")
    print(f"  - Total ingredients: {total_ingredients}")
    print(f"  - Successful: {successful_count}")
    print(f"  - Failed: {len(failed_ingredients)}")
    print(f"  - Success rate: {successful_count/total_ingredients*100:.1f}%")
    
    # if save_results:
    #     # Save processed DataFrame
    #     results_filename = f"nutrition_results_{batch_name}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx"
    #     processed_df.to_excel(results_filename, index=False)
    #     print(f"  💾 Results saved to: {results_filename}")
        
    #     # Save failed ingredients
    #     if failed_ingredients:
    #         failed_filename = f"failed_ingredients_{batch_name}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx"
    #         failed_df = pd.DataFrame({'failed_ingredient': failed_ingredients, 'batch': batch_name})
    #         failed_df.to_excel(failed_filename, index=False)
    #         print(f"  💾 Failed ingredients saved to: {failed_filename}")
    
    return processed_df, failed_ingredients

In [None]:
# Function to process all batches or specific batches
def process_selected_batches(batch_dfs, api_key, batch_numbers=None, save_results=True):

    processed_results = {}
    all_failed = {}
    
    # Determine which batches to process
    if batch_numbers is None:
        batches_to_process = list(batch_dfs.keys())
    else:
        batches_to_process = [f'batch_{num}' for num in batch_numbers if f'batch_{num}' in batch_dfs]
    
    print(f"🚀 STARTING BATCH PROCESSING")
    print(f"Batches to process: {batches_to_process}")
    print("=" * 60)
    
    for batch_name in batches_to_process:
        if batch_name in batch_dfs:
            try:
                processed_df, failed_list = process_batch_dataframe(
                    batch_dfs[batch_name], 
                    batch_name, 
                    api_key, 
                    save_results
                )
                processed_results[batch_name] = processed_df
                all_failed[batch_name] = failed_list
                
            except Exception as e:
                print(f"❌ Error processing {batch_name}: {str(e)}")
                all_failed[batch_name] = list(batch_dfs[batch_name]['ingredient'])
        
        print("\n" + "="*60)
    
    # Overall summary
    total_ingredients = sum(len(df) for df in batch_dfs.values() if any(batch in batch_dfs for batch in batches_to_process))
    total_successful = sum(len(df[df['search_method'].notna()]) for df in processed_results.values())
    total_failed = sum(len(failed_list) for failed_list in all_failed.values())
    
    print(f"\n🎯 OVERALL PROCESSING SUMMARY:")
    print(f"  - Total ingredients processed: {total_successful + total_failed}")
    print(f"  - Successful: {total_successful}")
    print(f"  - Failed: {total_failed}")
    print(f"  - Overall success rate: {total_successful/(total_successful + total_failed)*100:.1f}%")
    
    return processed_results, all_failed

###### Batch Processing

In [None]:
# 📊 BATCH 1 PROCESSING (Ingredients 1-150)
print("🔄 PROCESSING BATCH 1")
print("=" * 50)
print(f"Batch 1 contains {len(batch_dfs['batch_1'])} ingredients")
print(f"Sample ingredients: {list(batch_dfs['batch_1']['ingredient'].head())}")
print()

# Process Batch 1
batch_1_results, batch_1_failed = process_batch_dataframe(
    batch_dfs['batch_1'], 
    'batch_1', 
    api_key, 
    save_results=True
)

print(f"\n✅ BATCH 1 COMPLETED!")


In [None]:
# 🔍 CHECK BATCH 1 RESULTS
print("📋 BATCH 1 DETAILED RESULTS")
print("=" * 50)

if 'batch_1_results' in locals():
    print(f"✅ Successfully processed: {len(batch_1_results[batch_1_results['search_method'].notna()])} ingredients")
    print(f"❌ Failed: {len(batch_1_failed)} ingredients")
    print(f"📊 Success rate: {len(batch_1_results[batch_1_results['search_method'].notna()]) / len(batch_1_results) * 100:.1f}%")
    
    # Show full results DataFrame
    print(f"\n📊 FULL BATCH 1 RESULTS:")
    display(batch_1_results)
    
    # Show failed ingredients if any
    if batch_1_failed:
        print(f"\n❌ FAILED INGREDIENTS IN BATCH 1:")
        for ingredient in batch_1_failed:
            print(f"  - {ingredient}")
else:
    print("⚠️ Batch 1 not processed yet. Run the previous cell first.")

In [None]:
# 📊 BATCH 2 PROCESSING (Ingredients 151-300)
print("🔄 PROCESSING BATCH 2")
print("=" * 50)
print(f"Batch 2 contains {len(batch_dfs['batch_2'])} ingredients")
print(f"Sample ingredients: {list(batch_dfs['batch_2']['ingredient'].head())}")
print()

# Process Batch 2
batch_2_results, batch_2_failed = process_batch_dataframe(
    batch_dfs['batch_2'], 
    'batch_2', 
    api_key, 
    save_results=True
)

print(f"\n✅ BATCH 2 COMPLETED!")

In [None]:
# 🔍 CHECK BATCH 2 RESULTS
print("📋 BATCH 2 DETAILED RESULTS")
print("=" * 50)

if 'batch_2_results' in locals():
    print(f"✅ Successfully processed: {len(batch_2_results[batch_2_results['search_method'].notna()])} ingredients")
    print(f"❌ Failed: {len(batch_2_failed)} ingredients")
    print(f"📊 Success rate: {len(batch_2_results[batch_2_results['search_method'].notna()]) / len(batch_2_results) * 100:.1f}%")
    
    # Show full results DataFrame
    print(f"\n📊 FULL BATCH 2 RESULTS:")
    display(batch_2_results)
    
    # Show failed ingredients if any
    if batch_2_failed:
        print(f"\n❌ FAILED INGREDIENTS IN BATCH 2:")
        for ingredient in batch_2_failed:
            print(f"  - {ingredient}")
else:
    print("⚠️ Batch 2 not processed yet. Run the previous cell first.")

In [None]:
# 📊 BATCH 3 PROCESSING (Ingredients 301-450)
print("🔄 PROCESSING BATCH 3")
print("=" * 50)
print(f"Batch 3 contains {len(batch_dfs['batch_3'])} ingredients")
print(f"Sample ingredients: {list(batch_dfs['batch_3']['ingredient'].head())}")
print()

# Process Batch 3
batch_3_results, batch_3_failed = process_batch_dataframe(
    batch_dfs['batch_3'], 
    'batch_3', 
    api_key, 
    save_results=True
)

print(f"\n✅ BATCH 3 COMPLETED!")

In [None]:
# 🔍 CHECK BATCH 3 RESULTS
print("📋 BATCH 3 DETAILED RESULTS")
print("=" * 50)

if 'batch_3_results' in locals():
    print(f"✅ Successfully processed: {len(batch_3_results[batch_3_results['search_method'].notna()])} ingredients")
    print(f"❌ Failed: {len(batch_3_failed)} ingredients")
    print(f"📊 Success rate: {len(batch_3_results[batch_3_results['search_method'].notna()]) / len(batch_3_results) * 100:.1f}%")
    
    # Show full results DataFrame
    print(f"\n📊 FULL BATCH 3 RESULTS:")
    display(batch_3_results)
    
    # Show failed ingredients if any
    if batch_3_failed:
        print(f"\n❌ FAILED INGREDIENTS IN BATCH 3:")
        for ingredient in batch_3_failed:
            print(f"  - {ingredient}")
    
    print(f"\n✅ Ready to proceed to Batch 4? Run the next cell!")
else:
    print("⚠️ Batch 3 not processed yet. Run the previous cell first.")

In [None]:
# 📊 BATCH 4 PROCESSING (Ingredients 451-600)
print("🔄 PROCESSING BATCH 4")
print("=" * 50)
print(f"Batch 4 contains {len(batch_dfs['batch_4'])} ingredients")
print(f"Sample ingredients: {list(batch_dfs['batch_4']['ingredient'].head())}")
print()

# Process Batch 4
batch_4_results, batch_4_failed = process_batch_dataframe(
    batch_dfs['batch_4'], 
    'batch_4', 
    api_key, 
    save_results=True
)

print(f"\n✅ BATCH 4 COMPLETED!")


In [None]:
# 🔍 CHECK BATCH 4 RESULTS
print("📋 BATCH 4 DETAILED RESULTS")
print("=" * 50)

if 'batch_4_results' in locals():
    print(f"✅ Successfully processed: {len(batch_4_results[batch_4_results['search_method'].notna()])} ingredients")
    print(f"❌ Failed: {len(batch_4_failed)} ingredients")
    print(f"📊 Success rate: {len(batch_4_results[batch_4_results['search_method'].notna()]) / len(batch_4_results) * 100:.1f}%")
    
    # Show full results DataFrame
    print(f"\n📊 FULL BATCH 4 RESULTS:")
    display(batch_4_results)
    
    # Show failed ingredients if any
    if batch_4_failed:
        print(f"\n❌ FAILED INGREDIENTS IN BATCH 4:")
        for ingredient in batch_4_failed:
            print(f"  - {ingredient}")
else:
    print("⚠️ Batch 4 not processed yet. Run the previous cell first.")

In [None]:
# 📊 BATCH 5 PROCESSING (Ingredients 601-750)
print("🔄 PROCESSING BATCH 5")
print("=" * 50)
print(f"Batch 5 contains {len(batch_dfs['batch_5'])} ingredients")
print(f"Sample ingredients: {list(batch_dfs['batch_5']['ingredient'].head())}")
print()

# Process Batch 5
batch_5_results, batch_5_failed = process_batch_dataframe(
    batch_dfs['batch_5'], 
    'batch_5', 
    api_key, 
    save_results=True
)

print(f"\n✅ BATCH 5 COMPLETED!")

In [None]:
# 📊 BATCH 6 PROCESSING (Ingredients 751+) - FINAL BATCH
print("🔄 PROCESSING BATCH 6 (FINAL)")
print("=" * 50)
print(f"Batch 6 contains {len(batch_dfs['batch_6'])} ingredients")
print(f"Sample ingredients: {list(batch_dfs['batch_6']['ingredient'].head())}")
print()

# Process Batch 6
batch_6_results, batch_6_failed = process_batch_dataframe(
    batch_dfs['batch_6'], 
    'batch_6', 
    api_key, 
    save_results=True
)

print(f"\n✅ BATCH 6 COMPLETED!")


In [None]:
print(f"✅ Successfully processed: {len(batch_6_results[batch_6_results['search_method'].notna()])} ingredients")
print(f"❌ Failed: {len(batch_5_failed)} ingredients")
print(f"📊 Success rate: {len(batch_6_results[batch_6_results['search_method'].notna()]) / len(batch_6_results) * 100:.1f}%")

# Show full results DataFrame
print(f"\n📊 FULL BATCH 5 RESULTS:")
display(batch_6_results)

# Show failed ingredients if any
if batch_6_failed:
    print(f"\n❌ FAILED INGREDIENTS IN BATCH 5:")
    for ingredient in batch_6_failed:
        print(f"  - {ingredient}")


In [None]:
def combine_batch_results_to_single_file(output_filename=None):
    """
    Combine all individually processed batch results into a single Excel file 
    with 2 sheets: Successful and Failed results, each with batch tracking.
    
    This function looks for variables in the current namespace following the pattern:
    - batch_X_results (DataFrame with successful results)
    - batch_X_failed (List with failed ingredients)
    
    Parameters:
    - output_filename: Output filename (if None, auto-generated with timestamp)
    
    Returns:
    - combined_successful_df: DataFrame with all successful results
    - combined_failed_df: DataFrame with all failed results
    - summary_stats: Dictionary with combination statistics
    """
    import pandas as pd
    from datetime import datetime
    import re
    
    print("🔍 Searching for existing batch results...")
    
    # Get all variables from the current namespace
    current_vars = globals()
    
    # Find all batch result variables
    successful_batches = {}
    failed_batches = {}
    
    # Pattern to match batch variables
    for var_name in current_vars:
        # Match successful batch results (e.g., batch_1_results, batch_2_results)
        if re.match(r'batch_\d+_results$', var_name):
            batch_num = re.search(r'batch_(\d+)_results', var_name).group(1)
            batch_name = f"batch_{batch_num}"
            if isinstance(current_vars[var_name], pd.DataFrame):
                successful_batches[batch_name] = current_vars[var_name]
                print(f"   ✅ Found {var_name}: {len(current_vars[var_name])} successful results")
        
        # Match failed batch results (e.g., batch_1_failed, batch_2_failed)
        elif re.match(r'batch_\d+_failed$', var_name):
            batch_num = re.search(r'batch_(\d+)_failed', var_name).group(1)
            batch_name = f"batch_{batch_num}"
            if isinstance(current_vars[var_name], list):
                failed_batches[batch_name] = current_vars[var_name]
                print(f"   ❌ Found {var_name}: {len(current_vars[var_name])} failed ingredients")
    
    if not successful_batches and not failed_batches:
        print("⚠️  No batch results found! Make sure you have processed batches first.")
        print("   Expected variables: batch_1_results, batch_1_failed, batch_2_results, batch_2_failed, etc.")
        return None, None, None
    
    print(f"\n📊 Found {len(successful_batches)} successful batch result sets")
    print(f"📊 Found {len(failed_batches)} failed batch result sets")
    
    # Combine all successful results
    all_successful_dfs = []
    total_successful = 0
    
    for batch_name, df in successful_batches.items():
        if not df.empty:
            # Add batch column
            df_copy = df.copy()
            df_copy['batch'] = batch_name
            all_successful_dfs.append(df_copy)
            total_successful += len(df_copy)
            print(f"   ✅ {batch_name}: {len(df_copy)} successful results")
    
    # Combine all failed results
    all_failed_data = []
    total_failed = 0
    
    for batch_name, failed_list in failed_batches.items():
        if failed_list:
            for ingredient in failed_list:
                all_failed_data.append({
                    'ingredient': ingredient,
                    'batch': batch_name,
                    'reason': 'No nutrition data found',
                    'processed_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                })
            total_failed += len(failed_list)
            print(f"   ❌ {batch_name}: {len(failed_list)} failed ingredients")
    
    # Create combined DataFrames
    if all_successful_dfs:
        combined_successful_df = pd.concat(all_successful_dfs, ignore_index=True)
        # Reorder columns to put batch column first after ingredient
        cols = list(combined_successful_df.columns)
        if 'batch' in cols and 'ingredient' in cols:
            # Move batch column to second position (after ingredient)
            cols.remove('batch')
            ingredient_idx = cols.index('ingredient')
            cols.insert(ingredient_idx + 1, 'batch')
            combined_successful_df = combined_successful_df[cols]
    else:
        combined_successful_df = pd.DataFrame()
    
    if all_failed_data:
        combined_failed_df = pd.DataFrame(all_failed_data)
    else:
        combined_failed_df = pd.DataFrame()
    
    # Generate filename if not provided
    if output_filename is None:
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        output_filename = f'combined_batch_results_{timestamp}.xlsx'
    
    # Save to Excel with multiple sheets
    print(f"\n💾 Saving combined results to: {output_filename}")
    
    with pd.ExcelWriter(output_filename, engine='openpyxl') as writer:
        # Save successful results
        if not combined_successful_df.empty:
            combined_successful_df.to_excel(writer, sheet_name='Successful', index=False)
            print(f"   ✅ Successful sheet: {len(combined_successful_df)} ingredients saved")
        else:
            # Create empty sheet with headers
            pd.DataFrame(columns=['ingredient', 'batch', 'energy_kcal', 'carbs_g', 'protein_g', 'fat_g', 
                                'micronutrients', 'search_method', 'found_description']).to_excel(
                writer, sheet_name='Successful', index=False)
            print("   ⚠️  No successful results to save")
        
        # Save failed results
        if not combined_failed_df.empty:
            combined_failed_df.to_excel(writer, sheet_name='Failed', index=False)
            print(f"   ❌ Failed sheet: {len(combined_failed_df)} ingredients saved")
        else:
            # Create empty sheet with headers
            pd.DataFrame(columns=['ingredient', 'batch', 'reason', 'processed_at']).to_excel(
                writer, sheet_name='Failed', index=False)
            print("   ✅ No failed results (perfect success!)")
        
        # Create summary sheet
        summary_data = []
        batch_stats = {}
        
        # Get stats for each batch
        all_batch_names = set(list(successful_batches.keys()) + list(failed_batches.keys()))
        
        for batch_name in sorted(all_batch_names):
            successful_count = len(successful_batches.get(batch_name, pd.DataFrame()))
            failed_count = len(failed_batches.get(batch_name, []))
            total_count = successful_count + failed_count
            success_rate = (successful_count / total_count * 100) if total_count > 0 else 0
            
            summary_data.append({
                'Batch': batch_name,
                'Total_Ingredients': total_count,
                'Successful': successful_count,
                'Failed': failed_count,
                'Success_Rate_%': round(success_rate, 2)
            })
            
            batch_stats[batch_name] = {
                'total': total_count,
                'successful': successful_count,
                'failed': failed_count,
                'success_rate': success_rate
            }
        
        # Add overall summary
        overall_total = total_successful + total_failed
        if overall_total > 0:
            summary_data.append({
                'Batch': 'OVERALL_TOTAL',
                'Total_Ingredients': overall_total,
                'Successful': total_successful,
                'Failed': total_failed,
                'Success_Rate_%': round((total_successful / overall_total) * 100, 2)
            })
        
        summary_df = pd.DataFrame(summary_data)
        summary_df.to_excel(writer, sheet_name='Summary', index=False)
        print(f"   📊 Summary sheet: Batch statistics saved")
    
    # Print final summary
    print(f"\n🎉 COMBINATION COMPLETED!")
    print(f"📊 Final Combined Results:")
    print(f"   • Total ingredients: {total_successful + total_failed}")
    print(f"   • Successful: {total_successful} ({(total_successful/(total_successful + total_failed))*100:.1f}%)")
    print(f"   • Failed: {total_failed} ({(total_failed/(total_successful + total_failed))*100:.1f}%)")
    print(f"   • Batches combined: {len(all_batch_names)}")
    print(f"💾 Results saved to: {output_filename}")
    
    return combined_successful_df, combined_failed_df, {
        'total_ingredients': total_successful + total_failed,
        'successful': total_successful,
        'failed': total_failed,
        'success_rate': (total_successful / (total_successful + total_failed)) * 100 if (total_successful + total_failed) > 0 else 0,
        'batches_combined': len(all_batch_names),
        'batch_statistics': batch_stats,
        'output_filename': output_filename
    }

In [None]:
combined_successful_df, combined_failed_df, summary_stats = combine_batch_results_to_single_file(
    # output_filename=None  # Auto-generate filename with timestamp
    output_filename='my_combined_results.xlsx'  # Or specify custom filename
)

if combined_successful_df is not None:
    # Display final summary
    print(f"\n📊 FINAL COMBINATION SUMMARY:")
    print(f"✅ Total successful: {len(combined_successful_df)}")
    print(f"❌ Total failed: {len(combined_failed_df)}")
    print(f"📈 Overall success rate: {summary_stats['success_rate']:.1f}%")
    print(f"🔢 Batches combined: {summary_stats['batches_combined']}")
    print(f"💾 Results saved to: {summary_stats['output_filename']}")
    
    # Show batch breakdown
    print(f"\n📋 BATCH BREAKDOWN:")
    for batch_name, stats in summary_stats['batch_statistics'].items():
        print(f"   {batch_name}: {stats['successful']}/{stats['total']} successful ({stats['success_rate']:.1f}%)")
    
    # Display sample successful results
    if not combined_successful_df.empty:
        print(f"\n📄 SAMPLE SUCCESSFUL RESULTS (first 5):")
        display(combined_successful_df.head())
    
    # Display sample failed results  
    if not combined_failed_df.empty:
        print(f"\n❌ SAMPLE FAILED RESULTS (first 5):")
        display(combined_failed_df.head())
else:
    print("❌ No batch results found to combine. Please process some batches first!")

PS: Crosscheck the content file

#### Request to do Specific Ingredients

In [None]:
BASE_URL = 'https://api.nal.usda.gov/fdc/v1/food/'

def get_food_data(fdc_id):
    """Fetch food data from USDA FoodData Central using the given FDC ID."""
    url = f"{BASE_URL}{fdc_id}"
    params = {
        'api_key': api_key,
        'format': 'abridged',  # This ensures we get nutrient values
        'nutrients': 'all'     # Get all nutrients with values
    }
    
    response = requests.get(url, params=params)
    
    if response.status_code == 200:
        return response.json()
    else:
        return None

In [None]:
def extract_nutrition_info(food, search_method):
    """Extract nutrition information from food item."""
    nutrients = food.get('foodNutrients', [])
    
    result = {
        'ingredient_name': food.get('description', 'N/A'),
        'found_description': food.get('description', 'N/A'),
        'search_method': food.get('foodClass', search_method),  # Use foodClass as search_method
        'energy_kcal': None,
        'carbohydrate_g': None,
        'protein_g': None,
        'fat_g': None,
        'micronutrients': [],
        'status': 'success'
    }
    
    # Energy (Atwater General Factors)
    energy = next((item for item in nutrients if item['nutrient']['name'] == 'Energy (Atwater Specific Factors)'), None)
    if energy:
        result['energy_kcal'] = energy['amount']
    
    # Carbohydrates
    carbohydrate = next((item for item in nutrients if item['nutrient']['name'] == 'Carbohydrate, by difference'), None)
    if carbohydrate:
        result['carbohydrate_g'] = carbohydrate['amount']
    
    # Fat
    fat = next((item for item in nutrients if item['nutrient']['name'] == 'Total lipid (fat)'), None)
    if fat:
        result['fat_g'] = fat['amount']
    
    # Protein
    protein = next((item for item in nutrients if item['nutrient']['name'] == 'Protein'), None)
    if protein:
        result['protein_g'] = protein['amount']
    
    # Exclude certain nutrients from micronutrients
    exclude_nutrients = [
        "Energy", "Water", "Energy (Atwater General Factors)", "Energy (Atwater Specific Factors)",
        "Nitrogen", "Protein", "Total lipid (fat)", "Ash", "Carbohydrates",
        "Carbohydrate, by difference", "Total dietary fiber (AOAC 2011.25)",
        "High Molecular Weight Dietary Fiber (HMWDF)", "Low Molecular Weight Dietary Fiber (LMWDF)",
        "Sugars, Total", "Total Sugars", "Sucrose", "Glucose", "Fructose", "Lactose", "Maltose"
    ]
    
    # Get micronutrients (vitamins and minerals) - top 3 by amount
    filtered_micronutrients = [
        item for item in nutrients 
        if item['nutrient']['name'] not in exclude_nutrients and item['amount'] > 0
    ]
    
    # Sort by amount in descending order and take top 3
    sorted_micronutrients = sorted(filtered_micronutrients, key=lambda x: x['amount'], reverse=True)
    top_3_micronutrients = sorted_micronutrients[:3]
    
    # Extract only the nutrient names
    micronutrients = [item['nutrient']['name'] for item in top_3_micronutrients]
    result['micronutrients'] = micronutrients
    
    return result

In [None]:
def extract_nutrition_info_robust(food, search_method):
    """Extract nutrition information from food item with robust structure handling."""
    nutrients = food.get('foodNutrients', [])
    
    result = {
        'ingredient_name': food.get('description', 'N/A'),
        'found_description': food.get('description', 'N/A'),
        'search_method': food.get('foodClass', search_method),
        'energy_kcal': None,
        'carbohydrate_g': None,
        'protein_g': None,
        'fat_g': None,
        'micronutrients': [],
        'status': 'success'
    }
    
    def get_nutrient_value(nutrients, nutrient_name):
        """Helper function to extract nutrient value regardless of structure."""
        for item in nutrients:
            # Handle different possible structures
            name = None
            value = None
            
            # Try different ways to get nutrient name
            if isinstance(item, dict):
                if 'nutrient' in item and isinstance(item['nutrient'], dict):
                    name = item['nutrient'].get('name')
                elif 'nutrientName' in item:
                    name = item['nutrientName']
                elif 'name' in item:
                    name = item['name']
                
                # Try different ways to get value
                if 'amount' in item:
                    value = item['amount']
                elif 'value' in item:
                    value = item['value']
                elif 'quantity' in item:
                    value = item['quantity']
            
            # Check if this is the nutrient we're looking for
            if name and nutrient_name.lower() in name.lower():
                return value
        return None
    
    # Extract main nutrients
    result['energy_kcal'] = get_nutrient_value(nutrients, 'Energy (Atwater General Factors)')
    if result['energy_kcal'] is None:
        result['energy_kcal'] = get_nutrient_value(nutrients, 'Energy')
    
    result['carbohydrate_g'] = get_nutrient_value(nutrients, 'Carbohydrate, by difference')
    if result['carbohydrate_g'] is None:
        result['carbohydrate_g'] = get_nutrient_value(nutrients, 'Carbohydrate')
    
    result['fat_g'] = get_nutrient_value(nutrients, 'Total lipid (fat)')
    if result['fat_g'] is None:
        result['fat_g'] = get_nutrient_value(nutrients, 'Fat')
    
    result['protein_g'] = get_nutrient_value(nutrients, 'Protein')
    
    # Get micronutrients (excluding main macronutrients)
    exclude_nutrients = [
        "Energy", "Water", "Energy (Atwater General Factors)", "Energy (Atwater Specific Factors)",
        "Nitrogen", "Protein", "Total lipid (fat)", "Fat", "Ash", "Carbohydrates",
        "Carbohydrate, by difference", "Total dietary fiber", "Fiber",
        "Sugars", "Sugar", "Sucrose", "Glucose", "Fructose", "Lactose", "Maltose"
    ]
    
    micronutrients_with_values = []
    
    for item in nutrients:
        if isinstance(item, dict):
            name = None
            value = None
            
            # Get nutrient name
            if 'nutrient' in item and isinstance(item['nutrient'], dict):
                name = item['nutrient'].get('name')
            elif 'nutrientName' in item:
                name = item['nutrientName']
            elif 'name' in item:
                name = item['name']
            
            # Get value
            if 'amount' in item:
                value = item['amount']
            elif 'value' in item:
                value = item['value']
            elif 'quantity' in item:
                value = item['quantity']
            
            # Check if this is a micronutrient
            if name and value is not None and value > 0:
                is_excluded = any(excl.lower() in name.lower() for excl in exclude_nutrients)
                if not is_excluded:
                    micronutrients_with_values.append({'name': name, 'value': value})
    
    # Sort by value and take top 3
    sorted_micronutrients = sorted(micronutrients_with_values, key=lambda x: x['value'], reverse=True)
    top_3_micronutrients = sorted_micronutrients[:3]
    result['micronutrients'] = [item['name'] for item in top_3_micronutrients]
    
    return result

# Test the robust function
print("🧪 Testing robust extraction function:")
fdc_id = 2684441
food_data = get_food_data(fdc_id)
if food_data:
    nutrition_info = extract_nutrition_info_robust(food_data, 'direct_search')
    print("✅ Success! Nutrition info extracted:")
    for key, value in nutrition_info.items():
        print(f"  {key}: {value}")
else:
    print("❌ Failed to get food data")

In [None]:
# Let's examine several nutrients to understand the structure
food_data = get_food_data(2684441)
if food_data:
    nutrients = food_data.get('foodNutrients', [])
    print(f"🔍 DETAILED NUTRIENT INSPECTION (first 10 nutrients):")
    
    for i, nutrient in enumerate(nutrients[:10]):
        print(f"\n--- Nutrient {i+1} ---")
        print(f"Full structure: {nutrient}")
        
        if 'nutrient' in nutrient:
            print(f"Nutrient name: {nutrient['nutrient'].get('name')}")
        
        # Check for any field containing values
        value_fields = [key for key in nutrient.keys() if 'value' in key.lower() or 'amount' in key.lower() or 'quantity' in key.lower()]
        if value_fields:
            print(f"Value fields found: {value_fields}")
            for field in value_fields:
                print(f"  {field}: {nutrient[field]}")
        else:
            print("No value fields found")
            
    # Let's also check if there's a different way to get nutrient values
    print(f"\n🔍 Looking for nutrients with actual values...")
    nutrients_with_values = []
    for nutrient in nutrients:
        if any(key for key in nutrient.keys() if 'value' in key.lower() or 'amount' in key.lower()):
            nutrients_with_values.append(nutrient)
    
    print(f"Found {len(nutrients_with_values)} nutrients with value fields")
    if nutrients_with_values:
        print(f"Example nutrient with values: {nutrients_with_values[0]}")

In [None]:
def extract_nutrition_info_corrected(food, search_method):
    """Extract nutrition information from food item - CORRECTED VERSION."""
    nutrients = food.get('foodNutrients', [])
    
    result = {
        'ingredient_name': food.get('description', 'N/A'),
        'found_description': food.get('description', 'N/A'),
        'search_method': food.get('foodClass', search_method),
        'energy_kcal': None,
        'carbohydrate_g': None,
        'protein_g': None,
        'fat_g': None,
        'micronutrients': [],
        'status': 'success'
    }
    
    def get_nutrient_value(nutrient_item):
        """Get the value from a nutrient item, handling different field names."""
        # Try different possible field names for the value
        possible_fields = ['amount', 'value', 'quantity', 'val']
        for field in possible_fields:
            if field in nutrient_item:
                return nutrient_item[field]
        return None
    
    # 1st priority: Energy (Atwater Specific Factors)
    energy = next((item for item in nutrients if item['nutrient']['name'] == 'Energy (Atwater Specific Factors)'), None)
    if energy:
        result['energy_kcal'] = get_nutrient_value(energy)
        result['energy_source'] = 'Atwater Specific Factors'
        print(f"   ✅ Found Energy (Atwater Specific Factors): {result['energy_kcal']} kcal")
    else:
        # 2nd priority: Energy (Atwater General Factors)
        energy = next((item for item in nutrients if item['nutrient']['name'] == 'Energy (Atwater General Factors)'), None)
        if energy:
            result['energy_kcal'] = get_nutrient_value(energy)
            result['energy_source'] = 'Atwater General Factors'
            print(f"   ✅ Found Energy (Atwater General Factors): {result['energy_kcal']} kcal")
        else:
            # 3rd priority: Energy with unitName = "kcal" ⭐ YOUR REQUIREMENT
            energy = next((item for item in nutrients 
                          if item['nutrient']['name'] == 'Energy' and 
                          item['nutrient'].get('unitName') == 'kcal'), None)
            if energy:
                result['energy_kcal'] = get_nutrient_value(energy)
                result['energy_source'] = 'Energy (kcal unit)'
                print(f"   ✅ Found Energy with unitName='kcal': {result['energy_kcal']} kcal")
            else:
                # 4th priority: Any Energy entry as final fallback
                energy = next((item for item in nutrients if 'Energy' in item['nutrient']['name']), None)
                if energy:
                    result['energy_kcal'] = get_nutrient_value(energy)
                    unit = energy['nutrient'].get('unitName', 'unknown unit')
                    result['energy_source'] = f'Fallback Energy ({unit})'
                    print(f"   ⚠️ Found fallback Energy: {result['energy_kcal']} {unit}")
                else:
                    result['energy_source'] = 'Not found'
                    print(f"   ❌ No Energy data found")
    
    # Extract other macronutrients
    print(f"   🔍 Extracting other nutrients...")
    
    # Carbohydrates
    carbohydrate = next((item for item in nutrients if item['nutrient']['name'] == 'Carbohydrate, by difference'), None)
    if carbohydrate:
        result['carbohydrate_g'] = get_nutrient_value(carbohydrate)
    
    # Fat
    fat = next((item for item in nutrients if item['nutrient']['name'] == 'Total lipid (fat)'), None)
    if fat:
        result['fat_g'] = get_nutrient_value(fat)
    
    # Protein
    protein = next((item for item in nutrients if item['nutrient']['name'] == 'Protein'), None)
    if protein:
        result['protein_g'] = get_nutrient_value(protein)
    
    # Exclude certain nutrients from micronutrients
    exclude_nutrients = [
        "Energy", "Water", "Energy (Atwater General Factors)", "Energy (Atwater Specific Factors)",
        "Nitrogen", "Protein", "Total lipid (fat)", "Ash", "Carbohydrates",
        "Carbohydrate, by difference", "Total dietary fiber (AOAC 2011.25)",
        "High Molecular Weight Dietary Fiber (HMWDF)", "Low Molecular Weight Dietary Fiber (LMWDF)",
        "Sugars, Total", "Total Sugars", "Sucrose", "Glucose", "Fructose", "Lactose", "Maltose"
    ]
    
    # Get micronutrients (vitamins and minerals) - top 3 by value
    filtered_micronutrients = []
    for item in nutrients:
        if (item['nutrient']['name'] not in exclude_nutrients and 
            get_nutrient_value(item) is not None and 
            get_nutrient_value(item) > 0):
            filtered_micronutrients.append({
                'name': item['nutrient']['name'],
                'value': get_nutrient_value(item)
            })
    
    # Sort by value in descending order and take top 3
    sorted_micronutrients = sorted(filtered_micronutrients, key=lambda x: x['value'], reverse=True)
    top_3_micronutrients = sorted_micronutrients[:3]
    
    # Extract only the nutrient names
    micronutrients = [item['name'] for item in top_3_micronutrients]
    result['micronutrients'] = micronutrients
    
    return result

# Test the corrected function
print("🧪 Testing CORRECTED extraction function:")
fdc_id = 2684441
food_data = get_food_data(fdc_id)
if food_data:
    nutrition_info = extract_nutrition_info_corrected(food_data, 'direct_search')
    print("✅ Success! Nutrition info extracted:")
    for key, value in nutrition_info.items():
        print(f"  {key}: {value}")
else:
    print("❌ Failed to get food data")

In [None]:
def process_fdc_id_list_to_excel(name_id_list, output_filename=None, delay=0.5):
    """
    Process a list of [name, id] pairs, fetch nutrition data for each FDC ID, and save to Excel.
    
    Parameters:
    - name_id_list: List of [name, fdc_id] pairs, e.g., [["Apple", 123456], ["Banana", 789012]]
    - output_filename: Output Excel filename (if None, auto-generated with timestamp)
    - delay: Delay between API calls in seconds (default 0.5s)
    
    Returns:
    - results_df: DataFrame with all results
    - failed_list: List of failed items
    - summary_stats: Dictionary with processing statistics
    """
    import pandas as pd
    import time
    from datetime import datetime
    
    print(f"🚀 Processing {len(name_id_list)} FDC ID entries...")
    print("=" * 60)
    
    results = []
    failed_items = []
    
    start_time = time.time()
    
    for i, (name, fdc_id) in enumerate(name_id_list, 1):
        print(f"\n🔄 Processing {i}/{len(name_id_list)}: {name} (ID: {fdc_id})")
        
        try:
            # Convert fdc_id to int if it's a string
            if isinstance(fdc_id, str):
                fdc_id = int(fdc_id)
            
            # Get food data from API
            food_data = get_food_data(fdc_id)
            
            if food_data:
                # Extract nutrition info using the corrected function
                nutrition_info = extract_nutrition_info_corrected(food_data, 'FDC_ID_lookup')
                
                if nutrition_info and nutrition_info.get('status') == 'success':
                    # Create result entry
                    result_entry = {
                        'name': name,
                        'fdc_id': fdc_id,
                        'found_description': nutrition_info.get('found_description', ''),
                        'search_method': nutrition_info.get('search_method', ''),
                        'energy_kcal': nutrition_info.get('energy_kcal'),
                        'carbohydrate_g': nutrition_info.get('carbohydrate_g'),
                        'protein_g': nutrition_info.get('protein_g'),
                        'fat_g': nutrition_info.get('fat_g'),
                        'micronutrients': ', '.join(nutrition_info.get('micronutrients', [])),
                        'processed_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                        'status': 'success'
                    }
                    results.append(result_entry)
                    print(f"   ✅ Success: {nutrition_info.get('energy_kcal')} kcal, {len(nutrition_info.get('micronutrients', []))} micronutrients")
                else:
                    # Failed to extract nutrition
                    failed_entry = {
                        'name': name,
                        'fdc_id': fdc_id,
                        'reason': 'Failed to extract nutrition data',
                        'processed_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                    }
                    failed_items.append(failed_entry)
                    print(f"   ❌ Failed: Could not extract nutrition data")
            else:
                # Failed to get food data
                failed_entry = {
                    'name': name,
                    'fdc_id': fdc_id,
                    'reason': 'Failed to fetch food data from API',
                    'processed_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                }
                failed_items.append(failed_entry)
                print(f"   ❌ Failed: API call failed")
        
        except Exception as e:
            # Error occurred
            failed_entry = {
                'name': name,
                'fdc_id': fdc_id,
                'reason': f'Error: {str(e)}',
                'processed_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            }
            failed_items.append(failed_entry)
            print(f"   ❌ Error: {str(e)}")
        
        # Add delay between requests
        if i < len(name_id_list):  # Don't delay after the last item
            time.sleep(delay)
    
    # Create DataFrames
    results_df = pd.DataFrame(results) if results else pd.DataFrame()
    failed_df = pd.DataFrame(failed_items) if failed_items else pd.DataFrame()
    
    # Generate filename if not provided
    if output_filename is None:
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        output_filename = f'fdc_nutrition_data_{timestamp}.xlsx'
    
    # Save to Excel
    print(f"\n💾 Saving results to: {output_filename}")
    
    with pd.ExcelWriter(output_filename, engine='openpyxl') as writer:
        # Save successful results
        if not results_df.empty:
            results_df.to_excel(writer, sheet_name='Nutrition_Data', index=False)
            print(f"   ✅ Nutrition data saved: {len(results_df)} items")
        else:
            # Create empty sheet with headers
            empty_df = pd.DataFrame(columns=['name', 'fdc_id', 'found_description', 'search_method', 
                                           'energy_kcal', 'carbohydrate_g', 'protein_g', 'fat_g', 
                                           'micronutrients', 'processed_at', 'status'])
            empty_df.to_excel(writer, sheet_name='Nutrition_Data', index=False)
            print("   ⚠️  No successful results to save")
        
        # Save failed items
        if not failed_df.empty:
            failed_df.to_excel(writer, sheet_name='Failed_Items', index=False)
            print(f"   ❌ Failed items saved: {len(failed_df)} items")
        
        # Create summary sheet
        summary_data = [{
            'Total_Items': len(name_id_list),
            'Successful': len(results),
            'Failed': len(failed_items),
            'Success_Rate_%': (len(results) / len(name_id_list)) * 100 if name_id_list else 0,
            'Processing_Time_Seconds': round(time.time() - start_time, 1),
            'Processed_At': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        }]
        
        summary_df = pd.DataFrame(summary_data)
        summary_df.to_excel(writer, sheet_name='Summary', index=False)
        print(f"   📊 Summary saved")
    
    # Print final summary
    total_time = time.time() - start_time
    success_rate = (len(results) / len(name_id_list)) * 100 if name_id_list else 0
    
    print(f"\n🎉 PROCESSING COMPLETED!")
    print(f"📊 Final Results:")
    print(f"   • Total items processed: {len(name_id_list)}")
    print(f"   • Successful: {len(results)} ({success_rate:.1f}%)")
    print(f"   • Failed: {len(failed_items)} ({100-success_rate:.1f}%)")
    print(f"   • Total processing time: {total_time:.1f} seconds")
    print(f"   • Average time per item: {total_time/len(name_id_list):.2f} seconds")
    print(f"💾 Results saved to: {output_filename}")
    
    return results_df, failed_items, {
        'total_items': len(name_id_list),
        'successful': len(results),
        'failed': len(failed_items),
        'success_rate': success_rate,
        'processing_time': total_time,
        'output_filename': output_filename
    }

In [None]:
name_id_list = [
    ["salmon", 2684441],
    ["Whole Milk", 746782],
    ["White Bread", 339005],
    ["yoghurt", 2259793],
    ["baby oatmeal", 2708492],
    ["bok choy", 2685572],
    ["tomato", 2685581],
    ["clove", 171321],
    ["cloves", 171321],
    ["wholemeal bread", 335240],
    ["rice",2512381],
    ["basmati rice", 2708404],
    ["thyme", 173470], 
    ["coconut milk", 2705413],
    ["paprika", 171329],
    ["cod", 2684444],
    ["dark chocolat",170271],
    ["baking powder", 172805],
    ["formula", 2705518],
    ["parsley", 170416], 
    ["butter", 790508],
    ["apricot", 2710815],
    ["stock",2707132],
    ["coriander", 169997],
    ["chayote", 170402],
    ["chives", 169994],
    ["blueberry", 2346411],
    ["blueberries", 2346411],
    ["spring onion", 170005],
    ["shrimp", 2684443],
    ["catfish", 2684445],
    ["brown sugar", 2710260],
    ["berries", 2709272],
    ["turkey", 2514747],
    ["black pepper", 170931],
    ["raisins",2709212],
    ["egg noodles", 169731],
    ["lemon grass", 168573],
    ["papaya", 2709246],
    ["coconut water", 2707572],
    ["ginger", 169231],
    ["seaweed", 2709988],
    ["eyes fish", 168034],
    ["curry powder", 170924],
    ["cream cheese", 173418],
    ["feta cheese", 2259796],
    ["quinoa", 2708401],
    ["kiwi", 2710831],
    ["mango", 2710834],
    ["mangoes", 2710834],
    ["cassava", 169985], 
    
]
# Process the test list
results_df, failed_list, stats = process_fdc_id_list_to_excel(
    name_id_list=name_id_list,
    output_filename=None,  # Auto-generate filename
    delay=0.5  # 500ms delay between calls
)

In [None]:
# Display results
if not results_df.empty:
    print(f"\n📄 RESULTS PREVIEW:")
    display(results_df)
    
    print(f"\n📊 NUTRITION DATA SUMMARY:")
    numeric_cols = ['energy_kcal', 'carbohydrate_g', 'protein_g', 'fat_g']
    for col in numeric_cols:
        if col in results_df.columns:
            avg_val = results_df[col].mean()
            print(f"   • Average {col}: {avg_val:.2f}")

if failed_list:
    print(f"\n❌ FAILED ITEMS:")
    for item in failed_list:
        print(f"   • {item['name']} (ID: {item['fdc_id']}): {item['reason']}")

print(f"\n💾 Excel file saved: {stats['output_filename']}")
print(f"📈 Success rate: {stats['success_rate']:.1f}%")

In [None]:
df.columns.to_list()

#### Compute Nutrients Per Ingredient of The Recipe

In [None]:
import pandas as pd

# Load the Excel file structure
excel_file = pd.ExcelFile('final_ingredient_nutrition.xlsx')

# Get list of available sheet names
sheet_names = excel_file.sheet_names

# Check if 'Succesful' sheet exists
if 'Successful' in sheet_names:
    combined_nutrition_data = pd.read_excel('testing/final_ingredient_nutritions.xlsx', sheet_name='Successful')
    print("Shape:", combined_nutrition_data.shape)
    print("✅ Loaded Successful sheet.")
else:
    print("❌ Sheet 'Succesful' not found.")
    print("📄 Available sheets:", sheet_names)
    combined_nutrition_data = None  # or raise an error / fallback action


In [None]:
# Filter for complete data
# Check ingredient breakdown data
print(f"\n=== INGREDIENT BREAKDOWN DATA ===")
print(f"ingredient_breakdown_df shape: {ingredient_breakdown_df.shape}")
print(f"Complete data available: {complete_data_mask.sum()}/{len(complete_data_mask)}")

complete_ingredients = ingredient_breakdown_df[complete_data_mask].copy()
print(f"Working with {len(complete_ingredients)} complete ingredient records")

ingredient_breakdown_df[complete_data_mask].head()

In [None]:
import pandas as pd

expected_cols = ['ingredient', 'energy_kcal_per_100g', 'carbs_g_per_100g',
                 'protein_g_per_100g', 'fat_g_per_100g', 'top_micronutrients']

# Check that data is a DataFrame and has required columns
if isinstance(combined_nutrition_data, pd.DataFrame) and all(col in combined_nutrition_data.columns for col in expected_cols):
    df = combined_nutrition_data[expected_cols]

    # --- Step 1: Drop rows where all 5 nutrient columns are empty/null/N/A ---
    nutrient_cols = ['energy_kcal_per_100g', 'carbs_g_per_100g', 'protein_g_per_100g', 'fat_g_per_100g', 'top_micronutrients']

    # Mark entries considered as "null"
    df[nutrient_cols] = df[nutrient_cols].replace(['', 'N/A', 'n/a', None], pd.NA)

    # Find rows where all nutrient fields are NA
    condition = df[nutrient_cols].isna().all(axis=1)

    # Save dropped rows and count
    dropped_rows = df[condition][['ingredient']].copy()
    dropped_rows['null_count'] = 5

    # Drop the identified rows
    df = df[~condition].reset_index(drop=True)

    # --- Step 2: Clean and convert numeric fields ---
    numeric_cols = ['energy_kcal_per_100g', 'carbs_g_per_100g', 'protein_g_per_100g', 'fat_g_per_100g']
    for col in numeric_cols:
        df[col] = df[col].replace(pd.NA, 0).fillna(0).astype(float)

else:
    print("❌ Could not load sheet or missing columns.")


In [None]:
print("✅ Cleaned DataFrame:")
display(df.head())
print("Current Shape:" , df.shape)

print("🗑️ Dropped rows where all nutrient info was missing:")
print(dropped_rows.count())

In [None]:
# Check measurements to ensure all are in grams
print("=== CHECKING MEASUREMENTS ===")
print("Unique measurement types in complete_ingredients:")
measurement_counts = complete_ingredients['measurement'].value_counts()
print(measurement_counts.head(10))

# Check if all measurements are in grams
non_gram_measurements = complete_ingredients[complete_ingredients['measurement'] != 'g']
print(f"\nNon-gram measurements: {len(non_gram_measurements)}")
if len(non_gram_measurements) > 0:
    print("Sample non-gram measurements:")
    print(non_gram_measurements[['ingredient_name', 'quantity', 'measurement']].head())
    
# For this analysis, we'll assume all measurements should be treated as grams
# If there are non-gram measurements, they should be converted beforehand
print(f"\nProceeding with assumption that all quantities are in grams or gram-equivalent units")

In [None]:
def find_nutrition_match(ingredient_name, nutrition_df):
    """
    Find nutrition match using 3-step strategy:
    1. Exact match (e.g., "sweet potato" matches exactly "sweet potato")
    2. Keyword match (e.g., "broccoli floret" matches "broccoli")
    3. No match - return None
    
    Includes custom rules:
    - Ignore case sensitivity
    - Remove text in brackets ()
    - Remove x000D characters
    - Special keyword matching rules
    - Exclusion rules for certain keywords
    """
    
    def clean_ingredient_name(name):
        """Clean ingredient name according to custom rules"""
        if pd.isna(name):
            return ""
        
        name = str(name)
        
        # Remove x000D
        name = name.replace('\x00\x0D', '').replace('x000D', '')
        
        # Remove text in brackets ()
        name = re.sub(r'\([^)]*\)', '', name)
        
        # Clean up extra spaces and convert to lowercase
        name = ' '.join(name.split()).lower().strip()
        
        return name
    
    def should_exclude_ingredient(ingredient_name):
        """Check if ingredient should be excluded from matching"""
        ingredient_lower = ingredient_name.lower()
        
        # Don't match hokkien or rice noodles
        if 'hokkien' in ingredient_lower or 'rice noodles' in ingredient_lower:
            return True
            
        # Don't match potato starch with potato (return all zeros)
        if 'potato starch' in ingredient_lower:
            return True
            
        return False
    
    def create_zero_nutrition():
        """Create a nutrition entry with all zeros for excluded ingredients"""
        # Create a pandas Series that matches the structure of nutrition_df
        zero_series = pd.Series({
            'ingredient': 'EXCLUDED - ALL ZEROS',
            'energy_kcal_per_100g': 0,
            'carbs_g_per_100g': 0,
            'protein_g_per_100g': 0,
            'fat_g_per_100g': 0,
            'top_micronutrients': ''
        })
        return zero_series, "excluded_zeros"
    
    def apply_special_matching_rules(ingredient_clean):
        """Apply special matching rules for specific ingredients"""
        # Yoghurt = yogurt
        if 'yoghurt' in ingredient_clean:
            ingredient_clean = ingredient_clean.replace('yoghurt', 'yogurt')
        
        # Cauliflower/broccoli = broccoli
        if 'cauliflower/broccoli' in ingredient_clean or 'cauliflower broccoli' in ingredient_clean:
            ingredient_clean = 'broccoli'
        
        # Oatmeal to baby oatmeal (if not already baby oatmeal)
        if 'oatmeal' in ingredient_clean and 'baby' not in ingredient_clean:
            ingredient_clean = ingredient_clean.replace('oatmeal', 'baby oatmeal')
        
        return ingredient_clean
    
    # Clean the input ingredient name
    ingredient_clean = clean_ingredient_name(ingredient_name)
    
    # Check exclusion rules first
    if should_exclude_ingredient(ingredient_clean):
        return create_zero_nutrition()
    
    # Special case for water - return all zeros
    if ingredient_clean in ['water', 'plain water', 'boiling water', 'cold water', 'warm water']:
        return create_zero_nutrition()
    
    # Apply special matching rules
    ingredient_clean = apply_special_matching_rules(ingredient_clean)
    
    # Clean nutrition dataframe ingredient names for comparison
    nutrition_df_clean = nutrition_df.copy()
    nutrition_df_clean['ingredient_clean'] = nutrition_df_clean['ingredient'].apply(clean_ingredient_name)
    
    # Step 1: Try exact match
    exact_match = nutrition_df_clean[nutrition_df_clean['ingredient_clean'] == ingredient_clean]
    if len(exact_match) > 0:
        return exact_match.iloc[0], "exact_match"
    
    # Step 2: Try keyword matching with special rules
    ingredient_words = ingredient_clean.split()
    
    for word in ingredient_words:
        if len(word) > 2:  # Only consider words longer than 2 characters
            
            # Special keyword matching rules
            if word == 'broth':
                # If there's "broth" in ingredient, match with "broth"
                try:
                    broth_match = nutrition_df_clean[nutrition_df_clean['ingredient_clean'].str.contains(r'\bbroth\b', na=False, regex=True)]
                    if len(broth_match) > 0:
                        return broth_match.iloc[0], f"keyword_match_broth"
                except:
                    pass
            
            elif 'long onion' in ingredient_clean:
                # Long onion should match with "long onion" specifically
                try:
                    long_onion_match = nutrition_df_clean[nutrition_df_clean['ingredient_clean'].str.contains('long onion', na=False, regex=False)]
                    if len(long_onion_match) > 0:
                        return long_onion_match.iloc[0], f"keyword_match_long_onion"
                except:
                    pass
            
            elif 'baby oatmeal' in ingredient_clean:
                # Baby oatmeal should match with "baby oatmeal" specifically
                try:
                    baby_oatmeal_match = nutrition_df_clean[nutrition_df_clean['ingredient_clean'].str.contains('baby oatmeal', na=False, regex=False)]
                    if len(baby_oatmeal_match) > 0:
                        return baby_oatmeal_match.iloc[0], f"keyword_match_baby_oatmeal"
                except:
                    pass
            
            # General keyword matching for other words
            try:
                # Escape special regex characters and use word boundaries for exact word matching
                pattern = r'\b' + re.escape(word) + r'\b'
                keyword_match = nutrition_df_clean[nutrition_df_clean['ingredient_clean'].str.contains(pattern, na=False, regex=True)]
                if len(keyword_match) > 0:
                    return keyword_match.iloc[0], f"keyword_match_{word}"
            except Exception as e:
                # If regex fails, try simple string contains
                try:
                    keyword_match = nutrition_df_clean[nutrition_df_clean['ingredient_clean'].str.contains(word, na=False, regex=False)]
                    if len(keyword_match) > 0:
                        return keyword_match.iloc[0], f"keyword_match_{word}"
                except:
                    continue
    
    # Step 3: No match found
    return None, "no_match"


In [None]:
def calculate_ingredient_nutrition_new_strategy(ingredient_df, nutrition_df):
    """
    Calculate nutrition per ingredient using new strategy:
    - Use quantity directly (assume all in grams or gram-equivalent)
    - No conversion, direct proportional calculation from per 100g data
    """
    print("=== CALCULATING NUTRIENTS PER INGREDIENT (NEW STRATEGY) ===")
    
    result_df = ingredient_df.copy()
    
    # Add nutrition columns
    result_df['energy_kcal'] = 0.0
    result_df['carbs_g'] = 0.0
    result_df['protein_g'] = 0.0
    result_df['fat_g'] = 0.0
    result_df['micronutrients'] = ''
    result_df['nutrition_matched'] = False
    result_df['match_type'] = ''
    result_df['matched_ingredient'] = ''
    
    # Track statistics
    exact_matches = 0
    keyword_matches = 0
    no_matches = 0
    total_count = len(ingredient_df)
    
    print(f"Processing {total_count} ingredients...")
    
    for idx, row in ingredient_df.iterrows():
        ingredient_name = row['ingredient_name']
        quantity = row['quantity']
        
        # Find nutrition match
        nutrition_match, match_type = find_nutrition_match(ingredient_name, nutrition_df)
        
        if nutrition_match is not None:
            result_df.loc[idx, 'nutrition_matched'] = True
            result_df.loc[idx, 'match_type'] = match_type
            result_df.loc[idx, 'matched_ingredient'] = nutrition_match['ingredient']
            
            # Count match types
            if match_type == "exact_match":
                exact_matches += 1
            elif "keyword_match" in match_type:
                keyword_matches += 1
            
            # Calculate nutrition based on quantity (assume quantity is in grams)
            try:
                qty_numeric = float(quantity)
                if qty_numeric > 0:
                    # Calculate proportion of 100g
                    proportion = qty_numeric / 100.0
                    
                    # Scale numeric nutrients by proportion
                    if pd.notna(nutrition_match['energy_kcal_per_100g']):
                        result_df.loc[idx, 'energy_kcal'] = nutrition_match['energy_kcal_per_100g'] * proportion
                    if pd.notna(nutrition_match['carbs_g_per_100g']):
                        result_df.loc[idx, 'carbs_g'] = nutrition_match['carbs_g_per_100g'] * proportion
                    if pd.notna(nutrition_match['protein_g_per_100g']):
                        result_df.loc[idx, 'protein_g'] = nutrition_match['protein_g_per_100g'] * proportion
                    if pd.notna(nutrition_match['fat_g_per_100g']):
                        result_df.loc[idx, 'fat_g'] = nutrition_match['fat_g_per_100g'] * proportion
                    
                    # Store micronutrients as-is (don't scale)
                    if pd.notna(nutrition_match['top_micronutrients']):
                        result_df.loc[idx, 'micronutrients'] = str(nutrition_match['top_micronutrients'])
            except (ValueError, TypeError):
                # If quantity is not numeric, skip calculation but keep the match info
                pass
        else:
            no_matches += 1
            result_df.loc[idx, 'match_type'] = 'no_match'
    return result_df

In [None]:
ingredients_with_nutrition_updated = calculate_ingredient_nutrition_new_strategy(complete_ingredients, df)

In [None]:
ingredients_with_nutrition_updated.columns.to_list
ingredients_with_nutrition_updated.head()

##### Validation

In [None]:
# NUTRITION CALCULATION VALIDATION FUNCTION
def validate_nutrition_calculations(calculated_df, nutrition_database_df, sample_size=20, tolerance=0.1):
    """
    Validate nutrition calculations by comparing calculated values with expected values
    from the main nutrition database.
    
    Parameters:
    - calculated_df: DataFrame with calculated nutrition values per ingredient
    - nutrition_database_df: Main nutrition database (df) with per 100g values
    - sample_size: Number of random samples to validate
    - tolerance: Acceptable percentage difference (0.1 = 10%)
    
    Returns:
    - validation_results: DataFrame with validation details
    - summary_stats: Dictionary with validation summary
    """
    
    def parse_quantity_safe(qty_str):
        """Safely parse quantity string including fractions"""
        if pd.isna(qty_str):
            return 0.0
        
        qty_str = str(qty_str)
        
        # Handle fractions
        fraction_map = {'½': 0.5, '¼': 0.25, '¾': 0.75, '⅓': 0.33, '⅔': 0.67, '⅛': 0.125}
        for frac, val in fraction_map.items():
            qty_str = qty_str.replace(frac, str(val))
        
        # Handle ranges (take average)
        if '-' in qty_str or '–' in qty_str:
            parts = re.split(r'[-–]', qty_str)
            if len(parts) == 2:
                try:
                    min_val = float(parts[0].strip())
                    max_val = float(parts[1].strip())
                    return (min_val + max_val) / 2
                except:
                    pass
        
        # Convert to float
        try:
            return float(qty_str)
        except:
            return 0.0
    
    print("🔍 NUTRITION CALCULATION VALIDATION")
    print("=" * 60)
    
    # Filter only ingredients that have nutrition matches (not excluded or no_match)
    matched_ingredients = calculated_df[
        (calculated_df['nutrition_matched'] == True) & 
        (calculated_df['match_type'] != 'excluded_zeros') &
        (calculated_df['match_type'] != 'no_match')
    ].copy()
    
    print(f"Total ingredients with nutrition matches: {len(matched_ingredients)}")
    
    # Sample random ingredients for validation
    if len(matched_ingredients) > sample_size:
        validation_sample = matched_ingredients.sample(n=sample_size, random_state=42)
        print(f"Validating random sample of {sample_size} ingredients")
    else:
        validation_sample = matched_ingredients.copy()
        print(f"Validating all {len(validation_sample)} matched ingredients")
    
    validation_results = []
    
    for idx, row in validation_sample.iterrows():
        ingredient_name = row['ingredient_name']
        matched_ingredient = row['matched_ingredient']
        quantity = parse_quantity_safe(row['quantity'])
        match_type = row['match_type']
        
        # Get calculated values
        calc_energy = row['energy_kcal'] if pd.notna(row['energy_kcal']) else 0
        calc_protein = row['protein_g'] if pd.notna(row['protein_g']) else 0
        calc_carbs = row['carbs_g'] if pd.notna(row['carbs_g']) else 0
        calc_fat = row['fat_g'] if pd.notna(row['fat_g']) else 0
        
        # Find the corresponding nutrition data in main database
        nutrition_match = nutrition_database_df[
            nutrition_database_df['ingredient'].str.lower().str.strip() == 
            matched_ingredient.lower().strip()
        ]
        
        if len(nutrition_match) > 0:
            nutr = nutrition_match.iloc[0]
            
            # Calculate expected values (quantity/100 * per_100g_value)
            if quantity > 0:
                proportion = quantity / 100.0
                
                expected_energy = nutr['energy_kcal_per_100g'] * proportion if pd.notna(nutr['energy_kcal_per_100g']) else 0
                expected_protein = nutr['protein_g_per_100g'] * proportion if pd.notna(nutr['protein_g_per_100g']) else 0
                expected_carbs = nutr['carbs_g_per_100g'] * proportion if pd.notna(nutr['carbs_g_per_100g']) else 0
                expected_fat = nutr['fat_g_per_100g'] * proportion if pd.notna(nutr['fat_g_per_100g']) else 0
                
                # Calculate percentage differences
                def calc_percentage_diff(calculated, expected):
                    if expected == 0 and calculated == 0:
                        return 0.0
                    elif expected == 0:
                        return float('inf') if calculated != 0 else 0.0
                    else:
                        return abs((calculated - expected) / expected) * 100
                
                energy_diff = calc_percentage_diff(calc_energy, expected_energy)
                protein_diff = calc_percentage_diff(calc_protein, expected_protein)
                carbs_diff = calc_percentage_diff(calc_carbs, expected_carbs)
                fat_diff = calc_percentage_diff(calc_fat, expected_fat)
                
                # Determine if validation passed (within tolerance)
                tolerance_percent = tolerance * 100
                energy_pass = energy_diff <= tolerance_percent or energy_diff == 0.0
                protein_pass = protein_diff <= tolerance_percent or protein_diff == 0.0
                carbs_pass = carbs_diff <= tolerance_percent or carbs_diff == 0.0
                fat_pass = fat_diff <= tolerance_percent or fat_diff == 0.0
                
                overall_pass = energy_pass and protein_pass and carbs_pass and fat_pass
                
                validation_results.append({
                    'ingredient_name': ingredient_name,
                    'matched_ingredient': matched_ingredient,
                    'match_type': match_type,
                    'quantity_g': quantity,
                    'proportion': proportion,
                    
                    # Energy validation
                    'calc_energy': round(calc_energy, 2),
                    'expected_energy': round(expected_energy, 2),
                    'energy_diff_percent': round(energy_diff, 2) if energy_diff != float('inf') else 'INF',
                    'energy_pass': energy_pass,
                    
                    # Protein validation
                    'calc_protein': round(calc_protein, 2),
                    'expected_protein': round(expected_protein, 2),
                    'protein_diff_percent': round(protein_diff, 2) if protein_diff != float('inf') else 'INF',
                    'protein_pass': protein_pass,
                    
                    # Carbs validation
                    'calc_carbs': round(calc_carbs, 2),
                    'expected_carbs': round(expected_carbs, 2),
                    'carbs_diff_percent': round(carbs_diff, 2) if carbs_diff != float('inf') else 'INF',
                    'carbs_pass': carbs_pass,
                    
                    # Fat validation
                    'calc_fat': round(calc_fat, 2),
                    'expected_fat': round(expected_fat, 2),
                    'fat_diff_percent': round(fat_diff, 2) if fat_diff != float('inf') else 'INF',
                    'fat_pass': fat_pass,
                    
                    'overall_pass': overall_pass,
                    'validation_status': 'PASS' if overall_pass else 'FAIL'
                })
            else:
                # Zero quantity case
                validation_results.append({
                    'ingredient_name': ingredient_name,
                    'matched_ingredient': matched_ingredient,
                    'match_type': match_type,
                    'quantity_g': quantity,
                    'validation_status': 'ZERO_QUANTITY',
                    'overall_pass': True  # Zero quantities are valid
                })
        else:
            # Could not find nutrition data for validation
            validation_results.append({
                'ingredient_name': ingredient_name,
                'matched_ingredient': matched_ingredient,
                'match_type': match_type,
                'quantity_g': quantity,
                'validation_status': 'NO_DB_MATCH',
                'overall_pass': False
            })
    
    # Create validation results DataFrame
    validation_df = pd.DataFrame(validation_results)
    
    # Calculate summary statistics
    if len(validation_df) > 0:
        total_validations = len(validation_df)
        successful_validations = len(validation_df[validation_df['validation_status'].isin(['PASS', 'FAIL'])])
        passed_validations = (validation_df['overall_pass'] == True).sum()
        failed_validations = (validation_df['validation_status'] == 'FAIL').sum()
        no_db_match = (validation_df['validation_status'] == 'NO_DB_MATCH').sum()
        zero_quantity = (validation_df['validation_status'] == 'ZERO_QUANTITY').sum()
        
        # Calculate pass rates for each nutrient (only for PASS/FAIL status)
        valid_for_nutrient_check = validation_df[validation_df['validation_status'].isin(['PASS', 'FAIL'])]
        if len(valid_for_nutrient_check) > 0:
            energy_pass_rate = (valid_for_nutrient_check['energy_pass'] == True).sum() / len(valid_for_nutrient_check) * 100
            protein_pass_rate = (valid_for_nutrient_check['protein_pass'] == True).sum() / len(valid_for_nutrient_check) * 100
            carbs_pass_rate = (valid_for_nutrient_check['carbs_pass'] == True).sum() / len(valid_for_nutrient_check) * 100
            fat_pass_rate = (valid_for_nutrient_check['fat_pass'] == True).sum() / len(valid_for_nutrient_check) * 100
        else:
            energy_pass_rate = protein_pass_rate = carbs_pass_rate = fat_pass_rate = 0
        
        summary_stats = {
            'total_validations': total_validations,
            'successful_validations': successful_validations,
            'passed_validations': passed_validations,
            'failed_validations': failed_validations,
            'no_db_match': no_db_match,
            'zero_quantity': zero_quantity,
            'overall_pass_rate': passed_validations / total_validations * 100 if total_validations > 0 else 0,
            'energy_pass_rate': energy_pass_rate,
            'protein_pass_rate': protein_pass_rate,
            'carbs_pass_rate': carbs_pass_rate,
            'fat_pass_rate': fat_pass_rate,
            'tolerance_percent': tolerance * 100
        }
    else:
        summary_stats = {'error': 'No validations performed'}
    
    return validation_df, summary_stats

# Run the validation
print("Starting nutrition calculation validation...")
validation_results, validation_summary = validate_nutrition_calculations(
    ingredients_with_nutrition_updated, 
    df, 
    sample_size=30,  # Validate 30 random samples
    tolerance=0.05   # 5% tolerance
)

In [None]:
# Display validation results and summary
print("📊 VALIDATION SUMMARY REPORT")
print("=" * 60)

if 'error' in validation_summary:
    print(f"❌ Validation Error: {validation_summary['error']}")
else:
    # Display summary statistics
    print(f"📈 OVERALL VALIDATION STATISTICS:")
    print(f"   Total validations performed: {validation_summary['total_validations']}")
    print(f"   Successful validations: {validation_summary['successful_validations']}")
    print(f"   Passed validations: {validation_summary['passed_validations']}")
    print(f"   Failed validations: {validation_summary['failed_validations']}")
    print(f"   No database match: {validation_summary['no_db_match']}")
    print(f"   Zero quantity cases: {validation_summary['zero_quantity']}")
    print(f"   Overall pass rate: {validation_summary['overall_pass_rate']:.1f}%")
    print(f"   Tolerance used: ±{validation_summary['tolerance_percent']}%")
    
    print(f"\n🧪 NUTRIENT-SPECIFIC PASS RATES:")
    print(f"   Energy (kcal): {validation_summary['energy_pass_rate']:.1f}%")
    print(f"   Protein (g): {validation_summary['protein_pass_rate']:.1f}%")
    print(f"   Carbohydrates (g): {validation_summary['carbs_pass_rate']:.1f}%")
    print(f"   Fat (g): {validation_summary['fat_pass_rate']:.1f}%")

# Display detailed validation results
print(f"\n📋 DETAILED VALIDATION RESULTS:")
print("-" * 60)

if len(validation_results) > 0:
    # Show passed validations
    passed_validations = validation_results[validation_results['validation_status'] == 'PASS']
    if len(passed_validations) > 0:
        print(f"\n✅ PASSED VALIDATIONS ({len(passed_validations)}):")
        for idx, row in passed_validations.head(5).iterrows():
            print(f"   • {row['ingredient_name']} ({row['quantity_g']}g)")
            print(f"     Matched: {row['matched_ingredient']} via {row['match_type']}")
            print(f"     Energy: {row['calc_energy']} vs {row['expected_energy']} (diff: {row['energy_diff_percent']}%)")
    
    # Show failed validations
    failed_validations = validation_results[validation_results['validation_status'] == 'FAIL']
    if len(failed_validations) > 0:
        print(f"\n❌ FAILED VALIDATIONS ({len(failed_validations)}):")
        for idx, row in failed_validations.head(5).iterrows():
            print(f"   • {row['ingredient_name']} ({row['quantity_g']}g)")
            print(f"     Matched: {row['matched_ingredient']} via {row['match_type']}")
            print(f"     Energy: {row['calc_energy']} vs {row['expected_energy']} (diff: {row['energy_diff_percent']}%)")
            failed_nutrients = []
            if not row['energy_pass']: failed_nutrients.append('Energy')
            if not row['protein_pass']: failed_nutrients.append('Protein')
            if not row['carbs_pass']: failed_nutrients.append('Carbs')
            if not row['fat_pass']: failed_nutrients.append('Fat')
            print(f"     Failed nutrients: {', '.join(failed_nutrients)}")
    
    # Show some examples in tabular format
    print(f"\n📊 SAMPLE VALIDATION DETAILS:")
    display_cols = ['ingredient_name', 'quantity_g', 'calc_energy', 'expected_energy', 
                   'energy_diff_percent', 'validation_status']
    sample_results = validation_results[display_cols].head(10)
    print(sample_results.to_string(index=False))

# Overall validation assessment
print(f"\n🎯 VALIDATION ASSESSMENT:")
if 'error' not in validation_summary:
    overall_rate = validation_summary['overall_pass_rate']
    if overall_rate >= 95:
        print("   🟢 EXCELLENT: Calculations are highly accurate (≥95% pass rate)")
    elif overall_rate >= 90:
        print("   🟡 GOOD: Calculations are mostly accurate (≥90% pass rate)")
    elif overall_rate >= 80:
        print("   🟠 FAIR: Some calculation issues detected (≥80% pass rate)")
    else:
        print("   🔴 POOR: Significant calculation errors detected (<80% pass rate)")
        
    print(f"   📐 Calculation accuracy: {overall_rate:.1f}%")
    print(f"   🔍 Tolerance: ±{validation_summary['tolerance_percent']}%")

print(f"\n" + "=" * 60)
print("✅ VALIDATION COMPLETE!")

In [None]:
import pandas as pd

# Show all exact matches
exact_matches_df = ingredients_with_nutrition_updated[ingredients_with_nutrition_updated['match_type'] == 'exact_match']
print(f"\nExact Matches:")

exact_sample = exact_matches_df[['ingredient_name', 'matched_ingredient', 'match_type']]

# Display all rows without being cut off
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(exact_sample)


In [None]:
# Show all no matches
no_matches_df = ingredients_with_nutrition_updated [ingredients_with_nutrition_updated['match_type'] == 'no_match']
print(f"\nNo Matches:")

no_match_sample = no_matches_df[['ingredient_name', 'match_type']]

# Display all rows and columns fully
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(no_match_sample)


In [None]:
keyword_matches_df = ingredients_with_nutrition_updated [ingredients_with_nutrition_updated ['match_type'].str.contains('keyword_match', na=False)]
keyword_matches = keyword_matches_df[['ingredient_name', 'matched_ingredient', 'match_type']]

# Ensure full display
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(keyword_matches)

In [None]:
# Check the total counts and math
total_processed = len(ingredients_with_nutrition_updated)
exact_matches = (ingredients_with_nutrition_updated['match_type'] == 'exact_match').sum()
keyword_matches = ingredients_with_nutrition_updated['match_type'].str.contains('keyword_match', na=False).sum()
no_matches = (ingredients_with_nutrition_updated['match_type'] == 'no_match').sum()
excluded_zeros = (ingredients_with_nutrition_updated['match_type'] == 'excluded_zeros').sum()

print(f"Total ingredients processed: {total_processed}")
print(f"Exact matches: {exact_matches}")
print(f"Keyword matches: {keyword_matches}")
print(f"No matches: {no_matches}")
print(f"Excluded zeros: {excluded_zeros}")

##### Calculate Recipe

In [None]:
# 🔧 SOLUTION: Ensure no_match cases have zero values for all numeric fields
def ensure_no_match_zero_values(df):
    print("🔧 ENSURING NO_MATCH CASES HAVE ZERO VALUES")
    print("=" * 50)
    
    # Define all numeric nutrition columns that should be zero for no_match
    numeric_nutrition_cols = [
        'energy_kcal', 'carbs_g', 'protein_g', 'fat_g', 
        'carbohydrate_g',  # Alternative naming
        # Add any other numeric nutrition columns you have
    ]
    
    # Find no_match cases
    no_match_mask = df['match_type'] == 'no_match'
    no_match_count = no_match_mask.sum()
    
    print(f"📊 Found {no_match_count} ingredients with 'no_match' status")
    
    if no_match_count > 0:
        # Set all numeric nutrition values to 0 for no_match cases
        for col in numeric_nutrition_cols:
            if col in df.columns:
                # Check current state
                before_none = df.loc[no_match_mask, col].isna().sum()
                before_values = df.loc[no_match_mask, col].notna().sum()
                
                # Set to zero
                df.loc[no_match_mask, col] = 0.0
                
                print(f"   ✅ {col}: Set {before_none} None/NaN + {before_values} existing values → 0")
        
        # Also ensure nutrition_matched is False for no_match cases
        if 'nutrition_matched' in df.columns:
            df.loc[no_match_mask, 'nutrition_matched'] = False
            print(f"   ✅ nutrition_matched: Set to False for all no_match cases")
        
        # Set micronutrients to empty list for no_match cases
        if 'micronutrients' in df.columns:
            df.loc[no_match_mask, 'micronutrients'] = '[]'  # Empty list as string
            print(f"   ✅ micronutrients: Set to empty list for all no_match cases")
        
        # Show sample of corrected data
        print(f"\n📋 SAMPLE CORRECTED NO_MATCH CASES:")
        sample_cols = ['ingredient_name', 'match_type'] + [col for col in numeric_nutrition_cols if col in df.columns]
        sample_no_match = df[no_match_mask][sample_cols].head(3)
        display(sample_no_match)
        
        print(f"\n✅ SUCCESS: All {no_match_count} no_match cases now have zero values for numeric fields!")
    else:
        print("ℹ️ No ingredients with 'no_match' status found.")
    
    return df

# Apply the correction to your current data
if 'ingredients_with_nutrition_updated' in locals():
    print("🧪 TESTING: Before correction")
    no_match_before = ingredients_with_nutrition_updated[ingredients_with_nutrition_updated['match_type'] == 'no_match']
    if len(no_match_before) > 0:
        print(f"Sample before correction (showing nutrition values):")
        display(no_match_before[['ingredient_name', 'match_type', 'energy_kcal', 'carbs_g', 'protein_g', 'fat_g']].head(3))
    
    # Apply the correction
    ingredients_with_nutrition_updated = ensure_no_match_zero_values(ingredients_with_nutrition_updated)
    
    print(f"\n🧪 VERIFICATION: After correction")
    no_match_after = ingredients_with_nutrition_updated[ingredients_with_nutrition_updated['match_type'] == 'no_match']
    if len(no_match_after) > 0:
        print(f"Sample after correction (all should be 0.0):")
        display(no_match_after[['ingredient_name', 'match_type', 'energy_kcal', 'carbs_g', 'protein_g', 'fat_g']].head(3))
else:
    print("❌ ingredients_with_nutrition_updated not found. Please run the ingredient processing first.")

In [None]:
# Calculate recipe-level nutrition using new strategy
def calculate_recipe_nutrition_new(ingredients_df):
    """
    Calculate overall recipe nutrition by aggregating ingredient nutrition.
    For numeric nutrients: sum all values and divide by number of ingredients
    For micronutrients: find overlapping nutrients across all ingredients
    """
    print("=== CALCULATING RECIPE-LEVEL NUTRITION (NEW STRATEGY) ===")
    
    # Group by recipe
    recipe_nutrition = []
    
    for recipe_name, recipe_group in ingredients_df.groupby('recipe_name'):
        # Filter only ingredients with nutrition data
        with_nutrition = recipe_group[recipe_group['nutrition_matched'] == True]
        
        if len(with_nutrition) == 0:
            continue
            
        recipe_data = {
            'recipe_name': recipe_name,
            'total_ingredients': len(recipe_group),
            'ingredients_with_nutrition': len(with_nutrition),
            'nutrition_coverage': len(with_nutrition) / len(recipe_group) * 100,
            'exact_matches': len(with_nutrition[with_nutrition['match_type'] == 'exact_match']),
            'keyword_matches': len(with_nutrition[with_nutrition['match_type'].str.contains('keyword_match', na=False)])
        }
        
        # Calculate total numeric nutrients (sum without averaging)
        numeric_cols = ['energy_kcal', 'carbs_g', 'protein_g', 'fat_g']
        
        for col in numeric_cols:
            total_value = with_nutrition[col].sum()
            recipe_data[f'total_{col}'] = total_value  # Total nutrition value
            # Optional: Keep average per ingredient with nutrition data
            recipe_data[f'avg_per_matched_{col}'] = total_value / len(with_nutrition) if len(with_nutrition) > 0 else 0
        
        # Find overlapping micronutrients
        micronutrient_lists = []
        for _, ingredient in with_nutrition.iterrows():
            if ingredient['micronutrients'] and str(ingredient['micronutrients']).strip():
                try:
                    # Try to parse as list if it's a string representation
                    micro_str = str(ingredient['micronutrients'])
                    if micro_str.startswith('[') and micro_str.endswith(']'):
                        micro_list = eval(micro_str)  # Be careful with eval in production
                    else:
                        # Split by comma if it's a comma-separated string
                        micro_list = [m.strip() for m in micro_str.split(',')]
                    micronutrient_lists.append(set(micro_list))
                except:
                    # If parsing fails, treat as single item
                    micronutrient_lists.append({str(ingredient['micronutrients'])})
        
        # Find overlapping micronutrients (present in all ingredients)
        if micronutrient_lists:
            overlapping_micronutrients = set.intersection(*micronutrient_lists)
            all_micronutrients = set.union(*micronutrient_lists)
            
            recipe_data['overlapping_micronutrients'] = list(overlapping_micronutrients)
            recipe_data['all_micronutrients'] = list(all_micronutrients)
            recipe_data['micronutrient_overlap_count'] = len(overlapping_micronutrients)
            recipe_data['total_unique_micronutrients'] = len(all_micronutrients)
        else:
            recipe_data['overlapping_micronutrients'] = []
            recipe_data['all_micronutrients'] = []
            recipe_data['micronutrient_overlap_count'] = 0
            recipe_data['total_unique_micronutrients'] = 0
        
        recipe_nutrition.append(recipe_data)
    
    recipe_nutrition_df = pd.DataFrame(recipe_nutrition)
    print(f"✅ Calculated nutrition for {len(recipe_nutrition_df)} recipes")
    
    return recipe_nutrition_df

In [None]:
# Execute recipe nutrition calculation
recipe_nutrition_new = calculate_recipe_nutrition_new(ingredients_with_nutrition_updated)

print("columns:" , recipe_nutrition_new.columns.to_list)

display(recipe_nutrition_new.head(10))


In [None]:
ingredients_with_nutrition_updated.to_list()


##### Last Validation w/ Saving Excel

In [None]:
# Check count of texture None values
none_count = df[df['texture'].isna()].shape[0]  # For actual None/NaN values
print(f"Number of None values in texture column: {none_count}")

# Check count of "None" string values
none_string_count = df[df['texture'] == "None"].shape[0]
print(f"Number of 'None' string values in texture column: {none_string_count}")

# Check count of empty strings
empty_string_count = df[df['texture'] == ""].shape[0]  
print(f"Number of empty strings in texture column: {empty_string_count}")

# Check count of "NONE" uppercase string values (as seen in your code elsewhere)
none_upper_count = df[df['texture'] == "NONE"].shape[0]
print(f"Number of 'NONE' values in texture column: {none_upper_count}")

In [None]:
# Flatten the list of lists to get all unique ingredients
all_ingredients = set()
for ingredient_list in df['ner_ingredient']:
    if isinstance(ingredient_list, list):
        all_ingredients.update(ingredient_list)

print(f"Total unique ingredients found: {len(all_ingredients)}")
print("Unique ingredients:")
for ingredient in sorted(all_ingredients):
    print(f"- {ingredient}")

In [None]:
# Count occurrences of each unique ingredient
ingredient_counts = {}
for ingredient_list in df['ner_ingredient']:
    if isinstance(ingredient_list, list):
        for ingredient in ingredient_list:
            if ingredient in ingredient_counts:
                ingredient_counts[ingredient] += 1
            else:
                ingredient_counts[ingredient] = 1

# Sort ingredients by frequency (most common first)
sorted_ingredients = sorted(ingredient_counts.items(), key=lambda x: x[1], reverse=True)

# Print the results
print(f"Total unique ingredients found: {len(ingredient_counts)}")
print("\nIngredient frequency (sorted by most common):")
for ingredient, count in sorted_ingredients:
    print(f"- {ingredient}: {count} recipes")

# Alternatively, print alphabetically with counts
print("\nIngredient frequency (sorted alphabetically):")
for ingredient in sorted(ingredient_counts.keys()):
    count = ingredient_counts[ingredient]
    print(f"- {ingredient}: {count} recipes")

In [None]:
# Step 1: Create unique ingredients dataframe with ingredient IDs
unique_ingredients_list = sorted(list(all_ingredients))

# Create ingredients dataframe with IDs
ingredients_df = pd.DataFrame({
    'ingredient_id': range(1, len(unique_ingredients_list) + 1),
    'ingredient_name': unique_ingredients_list
})

print(f"Created ingredients dataframe with {len(ingredients_df)} unique ingredients")
print("\nFirst 10 ingredients:")
print(ingredients_df.head(10))
print("\nLast 5 ingredients:")
print(ingredients_df.tail())

In [None]:
print("\nDetecting allergens for all ingredients...")
ingredients_df['detected_allergens'] = ingredients_df['ingredient_name'].apply(
    lambda x: detect_ingredient_allergens(x, allergen_tags)
)

# Filter ingredients with no allergens (empty list)
ingredients_with_no_allergens = ingredients_df[ingredients_df['detected_allergens'].apply(len) == 0]

# Store them in a separate dataframe
null_allergen_ingredients_df = ingredients_with_no_allergens[['ingredient_id', 'ingredient_name']].copy()

In [None]:

print(f"\nFound {len(null_allergen_ingredients_df)} ingredients with NO allergens detected:")
print("\nIngredients with no allergens:")
for _, row in null_allergen_ingredients_df.iterrows():
    print(f"- ID: {row['ingredient_id']}, Name: '{row['ingredient_name']}'")

# Show summary statistics
print(f"\n📊 Allergen Detection Summary:")
print(f"- Total ingredients: {len(ingredients_df)}")
print(f"- Ingredients with allergens: {len(ingredients_df) - len(null_allergen_ingredients_df)}")
print(f"- Ingredients with NO allergens: {len(null_allergen_ingredients_df)}")
print(f"- Percentage with no allergens: {len(null_allergen_ingredients_df)/len(ingredients_df)*100:.1f}%")

# Save the null allergen ingredients to a separate Excel file if needed
null_allergen_ingredients_df.to_excel('ingredients_with_no_allergens.xlsx', index=False)
print(f"\n✅ Saved {len(null_allergen_ingredients_df)} ingredients with no allergens to 'ingredients_with_no_allergens.xlsx'")

In [None]:
def detect_allergens(recipe_data, ALLERGEN_TAGS):
    ner_ingredients = recipe_data.get("ner_ingredient_string", [])
    # Normalize ingredients (make sure they're lowercase and clean)
    cleaned_ingredients = [str(i).strip().lower() for i in ner_ingredients if i]

    # Combine into one searchable string
    combined_text = ' '.join(cleaned_ingredients)
    matched_allergens = []

    # Define exclusion terms for breast milk and formula
    exclusion_terms = ["breast milk", "breastmilk", "formula milk", "formula", "breast"]
    # First check if the combined text contains any exclusion terms
    has_exclusion_terms = any(exclusion in combined_text for exclusion in exclusion_terms)
    for allergen, data in ALLERGEN_TAGS.items():
        keywords = [kw.lower() for kw in data["keywords"]]
        
        # For milk allergen specifically, skip entirely if exclusion terms found
        if allergen == "milk" and has_exclusion_terms:
            continue
            
        for keyword in keywords:
            if keyword in combined_text:
                # Check if this keyword match is actually part of an exclusion term
                is_excluded = False
                
                # For example, if "milk" is found but it's part of "breast milk"
                for exclusion in exclusion_terms:
                    # Check all possible positions where keyword could be within exclusion term
                    if (exclusion.startswith(keyword + " ") or 
                        exclusion.endswith(" " + keyword) or 
                        " " + keyword + " " in exclusion or 
                        exclusion == keyword):
                        
                        # Only exclude if this exact exclusion term is in the text
                        if exclusion in combined_text:
                            is_excluded = True
                            break
                
                if not is_excluded:
                    matched_allergens.append(allergen)
                    break  # No need to check other keywords for this allergen

    return list(set(matched_allergens))  # Remove duplicates

In [None]:

def detect_allergen_with_strategy(ingredient_name, allergen_tags):
    """
    Advanced allergen detection using two-step matching strategy:
    1. Exact matching - check if ingredient name exactly matches any keyword
    2. Partial keyword matching - check if any allergen keyword appears in ingredient name
    
    Args:
        ingredient_name (str): Name of the ingredient to check
        allergen_tags (dict): Dictionary of allergen information
    
    Returns:
        dict: {
            'allergen_group': str or None,
            'match_type': 'exact' or 'partial' or None,
            'matched_keyword': str or None,
            'confidence': float (0.0 to 1.0)
        }
    """
    ingredient_lower = ingredient_name.lower().strip()
    exclusion_terms = ["breast milk", "breastmilk", "formula milk", "formula", "breast"]
    # Step 1: Exact matching (highest priority)
    for allergen_name, allergen_data in allergen_tags.items():
        keywords = allergen_data.get('keywords', [])
        allergen_group = allergen_data.get('allergen_group', allergen_name)
        
        for keyword in keywords:
            keyword_lower = keyword.lower().strip()
            if keyword_lower == ingredient_lower:
                return {
                    'allergen_group': allergen_group,
                    'match_type': 'exact',
                    'matched_keyword': keyword,
                    'confidence': 1.0
                }
    
    # Step 2: Partial keyword matching (lower priority)
    partial_matches = []
    
    for allergen_name, allergen_data in allergen_tags.items():
        keywords = allergen_data.get('keywords', [])
        allergen_group = allergen_data.get('allergen_group', allergen_name)
        
        for keyword in keywords:
            keyword_lower = keyword.lower().strip()
            
            # Check if keyword appears as a word in ingredient name
            if keyword_lower in ingredient_lower:
                # Calculate confidence based on match quality
                confidence = calculate_match_confidence(ingredient_lower, keyword_lower)
                
                partial_matches.append({
                    'allergen_group': allergen_group,
                    'match_type': 'partial',
                    'matched_keyword': keyword,
                    'confidence': confidence,
                    'allergen_name': allergen_name
                })
    
    # If partial matches found, return the best one
    if partial_matches:
        # Sort by confidence (highest first)
        partial_matches.sort(key=lambda x: x['confidence'], reverse=True)
        best_match = partial_matches[0]
        
        # Only return if confidence is above threshold
        if best_match['confidence'] >= 0.5:
            return {
                'allergen_group': best_match['allergen_group'],
                'match_type': best_match['match_type'],
                'matched_keyword': best_match['matched_keyword'],
                'confidence': best_match['confidence']
            }
    
    # No match found
    return {
        'allergen_group': None,
        'match_type': None,
        'matched_keyword': None,
        'confidence': 0.0
    }

def calculate_match_confidence(ingredient_name, keyword):
    """
    Calculate confidence score for partial matches based on various factors.
    
    Args:
        ingredient_name (str): The ingredient name (lowercase)
        keyword (str): The allergen keyword (lowercase)
    
    Returns:
        float: Confidence score between 0.0 and 1.0
    """
    # Base confidence for any partial match
    confidence = 0.6
    
    # Boost confidence if keyword is a significant portion of ingredient name
    if len(keyword) >= len(ingredient_name) * 0.5:
        confidence += 0.2
    
    # Boost confidence if keyword appears at start or end
    if ingredient_name.startswith(keyword) or ingredient_name.endswith(keyword):
        confidence += 0.2
    
    # Boost confidence if keyword appears as a complete word (surrounded by spaces or boundaries)
    import re
    if re.search(r'\b' + re.escape(keyword) + r'\b', ingredient_name):
        confidence += 0.1
    
    # Reduce confidence for very short keywords in long ingredient names
    if len(keyword) <= 3 and len(ingredient_name) >= 10:
        confidence -= 0.2
    
    # Cap confidence at 0.95 for partial matches (exact matches get 1.0)
    return min(0.95, max(0.0, confidence))

def detect_allergen_simple(ingredient_name, allergen_tags):
    """
    Simplified version that returns just the allergen group (backward compatibility).
    
    Args:
        ingredient_name (str): Name of the ingredient
        allergen_tags (dict): Dictionary of allergen tags
    
    Returns:
        str or None: Allergen group name or None
    """
    result = detect_allergen_with_strategy(ingredient_name, allergen_tags)
    return result.get('allergen_group')

def test_allergen_detection():
    """
    Test function to demonstrate the matching strategy.
    """
    test_ingredients = [
        "milk",                    # Exact match
        "whole milk",             # Partial match  
        "milk chocolate",         # Partial match
        "almond butter",          # Exact match (tree nuts)
        "peanut oil",            # Exact match (peanuts)
        "wheat flour",           # Partial match (gluten)
        "salmon fillet",         # Partial match (fish)
        "chicken breast",        # No match
        "egg white powder",      # Partial match (egg)
        "soy sauce concentrate"  # Partial match (soy)
    ]
    
    print("🧪 TESTING ALLERGEN DETECTION STRATEGY")
    print("=" * 50)
    
    for ingredient in test_ingredients:
        result = detect_allergen_with_strategy(ingredient, ALLERGEN_TAGS)
        
        if result['allergen_group']:
            print(f"✅ '{ingredient}':")
            print(f"   → Allergen: {result['allergen_group']}")
            print(f"   → Match type: {result['match_type']}")
            print(f"   → Matched keyword: '{result['matched_keyword']}'")
            print(f"   → Confidence: {result['confidence']:.2f}")
        else:
            print(f"❌ '{ingredient}': No allergen detected")
        print()

# Uncomment to run test
# test_allergen_detection()

# Quick test examples
if __name__ == "__main__":
    # Test the function with a few examples
    test_cases = ["milk", "wheat flour", "almond butter", "chicken", "egg white"]
    
    print("Quick Test Results:")
    for ingredient in test_cases:
        result = detect_allergen_with_strategy(ingredient, ALLERGEN_TAGS)
        if result['allergen_group']:
            print(f"  {ingredient} → {result['allergen_group']} ({result['match_type']}, {result['confidence']:.2f})")
        else:
            print(f"  {ingredient} → No allergen detected")

In [None]:
def detect_primary_allergen_for_ingredient(ingredient_name, allergen_tags):
    """
    Detect the primary allergen for a single ingredient based on allergen tags.
    Ensures 1 ingredient maps to 1 primary allergen only.
    
    Args:
        ingredient_name (str): Name of the ingredient
        allergen_tags (dict): Dictionary of allergen tags with keywords
    
    Returns:
        str or None: Primary allergen group name, or None if no allergen detected
    """
    ingredient_lower = ingredient_name.lower().strip()
    detected_allergens = []
    
    # Priority order for allergens (most specific to least specific)
    allergen_priority = [
        'shellfish', 'fish', 'peanuts', 'tree_nuts', 'egg', 'dairy', 'soy', 'gluten'
    ]
    
    # First pass: detect all matching allergens
    for allergen_name, allergen_data in allergen_tags.items():
        keywords = allergen_data.get('keywords', [])
        allergen_group = allergen_data.get('allergen_group', allergen_name)
        
        # Check if any keyword matches the ingredient exactly
        for keyword in keywords:
            if keyword.lower().strip() == ingredient_lower:
                detected_allergens.append(allergen_group)
                break  # No need to check other keywords for this allergen
    
    # If no allergens detected, return None
    if not detected_allergens:
        return None
    
    # If only one allergen detected, return it
    if len(detected_allergens) == 1:
        return detected_allergens[0]
    
    # If multiple allergens detected, select based on priority
    for priority_allergen in allergen_priority:
        if priority_allergen in detected_allergens:
            return priority_allergen
    
    # Fallback: return the first detected allergen
    return detected_allergens[0]


def add_allergen_group_id_to_ingredients(ingredient_df, allergen_df, allergen_tags):
    """
    Add allergen_group_id column to ingredient_df based on allergen detection.
    
    Args:
        ingredient_df: DataFrame with ingredient_id and ingredient_name
        allergen_df: DataFrame with allergen group information (pk, name, description)
        allergen_tags: Dictionary of allergen tags for detection
    
    Returns:
        DataFrame: Updated ingredient_df with allergen_group_id column
    """
    print("🔗 CONNECTING ALLERGEN GROUP IDs WITH INGREDIENT IDs")
    print("=" * 60)
    
    # Create a mapping from allergen group name to allergen group ID
    allergen_name_to_id = dict(zip(allergen_df['name'], allergen_df['pk']))
    
    print(f"📊 Available allergen groups:")
    for name, pk in allergen_name_to_id.items():
        print(f"   - {name} (ID: {pk})")
    
    # Create a copy of ingredient_df to avoid modifying the original
    updated_ingredient_df = ingredient_df.copy()
    
    # Detect primary allergen for each ingredient
    print(f"\n🔍 Detecting primary allergens for {len(updated_ingredient_df)} ingredients...")
    
    primary_allergens = []
    allergen_group_ids = []
    
    for _, row in updated_ingredient_df.iterrows():
        ingredient_name = row['ingredient_name']
        primary_allergen = detect_primary_allergen_for_ingredient(ingredient_name, allergen_tags)
        primary_allergens.append(primary_allergen)
        
        # Map to allergen group ID
        if primary_allergen and primary_allergen in allergen_name_to_id:
            allergen_group_id = allergen_name_to_id[primary_allergen]
        else:
            allergen_group_id = None  # No allergen detected
        
        allergen_group_ids.append(allergen_group_id)
    
    # Add the new columns
    updated_ingredient_df['primary_allergen'] = primary_allergens
    updated_ingredient_df['allergen_group_id'] = allergen_group_ids
    
    # Calculate statistics
    total_ingredients = len(updated_ingredient_df)
    ingredients_with_allergens = updated_ingredient_df['allergen_group_id'].notna().sum()
    ingredients_without_allergens = total_ingredients - ingredients_with_allergens
    
    print(f"\n✅ ALLERGEN MAPPING RESULTS:")
    print(f"   Total ingredients: {total_ingredients}")
    print(f"   Ingredients with allergens: {ingredients_with_allergens} ({ingredients_with_allergens/total_ingredients*100:.1f}%)")
    print(f"   Ingredients without allergens: {ingredients_without_allergens} ({ingredients_without_allergens/total_ingredients*100:.1f}%)")
    
    # Show breakdown by allergen group
    allergen_counts = updated_ingredient_df['primary_allergen'].value_counts()
    print(f"\n📊 Breakdown by allergen group:")
    for allergen, count in allergen_counts.items():
        if allergen:  # Skip None values
            allergen_id = allergen_name_to_id.get(allergen, 'Unknown')
            print(f"   - {allergen} (ID: {allergen_id}): {count} ingredients")
    
    # Show sample of mapped ingredients
    print(f"\n📋 Sample mapped ingredients:")
    sample_with_allergens = updated_ingredient_df[updated_ingredient_df['allergen_group_id'].notna()].head(10)
    for _, row in sample_with_allergens.iterrows():
        print(f"   - ID: {row['ingredient_id']}, Name: '{row['ingredient_name']}', Allergen: {row['primary_allergen']} (Group ID: {row['allergen_group_id']})")
    
    # Show sample of ingredients without allergens
    sample_without_allergens = updated_ingredient_df[updated_ingredient_df['allergen_group_id'].isna()].head(5)
    if len(sample_without_allergens) > 0:
        print(f"\n📋 Sample ingredients without allergens:")
        for _, row in sample_without_allergens.iterrows():
            print(f"   - ID: {row['ingredient_id']}, Name: '{row['ingredient_name']}'")
    
    return updated_ingredient_df


# Apply the function to add allergen group IDs
ingredient_df_with_allergens = add_allergen_group_id_to_ingredients(
    ingredient_df, 
    allergen_df, 
    allergen_tags
)

In [None]:
# Validate and analyze the allergen mapping results
print("🔍 VALIDATION AND ANALYSIS OF ALLERGEN MAPPING")
print("=" * 50)

# Check the structure of the updated dataframe
print(f"📊 Updated ingredient_df structure:")
print(f"   Columns: {list(ingredient_df_with_allergens.columns)}")
print(f"   Shape: {ingredient_df_with_allergens.shape}")

# Show detailed breakdown
print(f"\n📈 Detailed allergen group mapping:")
allergen_mapping_summary = ingredient_df_with_allergens.groupby(['primary_allergen', 'allergen_group_id']).size().reset_index(name='count')
for _, row in allergen_mapping_summary.iterrows():
    if pd.notna(row['primary_allergen']):
        print(f"   {row['primary_allergen']} (Group ID: {row['allergen_group_id']}): {row['count']} ingredients")

# Show some specific examples
print(f"\n🔍 Example allergen mappings:")
example_allergens = ['dairy', 'egg', 'soy', 'gluten', 'tree_nuts']
for allergen in example_allergens:
    examples = ingredient_df_with_allergens[ingredient_df_with_allergens['primary_allergen'] == allergen].head(3)
    if len(examples) > 0:
        print(f"\n   {allergen.upper()} examples:")
        for _, row in examples.iterrows():
            print(f"     - '{row['ingredient_name']}' → Group ID: {row['allergen_group_id']}")

# Save the updated dataframe
output_filename = "ingredient_master_with_allergens.xlsx"
ingredient_df_with_allergens.to_excel(output_filename, index=False)
print(f"\n💾 Saved updated ingredient dataframe to '{output_filename}'")

# Display sample of final structure
print(f"\n📋 Final ingredient dataframe sample:")
display(ingredient_df_with_allergens[['ingredient_id', 'ingredient_name', 'primary_allergen', 'allergen_group_id']].head(10))

In [None]:
# Enhanced allergen detection with exclusion terms and isAllergen column
def add_exclusion_terms_and_allergen_flag(ingredient_df_with_allergens):
    """
    Add exclusion terms logic and isAllergen column to ingredient dataframe.
    
    Exclusion terms like "breast milk", "formula milk" should NOT be flagged as allergens
    even if they contain allergen keywords like "milk".
    
    Args:
        ingredient_df_with_allergens: DataFrame with allergen mapping results
        
    Returns:
        DataFrame: Enhanced dataframe with exclusion logic and isAllergen column
    """
    print("🚫 APPLYING EXCLUSION TERMS AND ADDING isAllergen COLUMN")
    print("=" * 60)
    
    # Define exclusion terms - ingredients that should NOT be considered allergens
    exclusion_terms = [
        "breast milk", "breastmilk", "formula milk", "formula", "breast"
    ]
    
    print(f"📋 Exclusion terms: {exclusion_terms}")
    
    # Create a copy to work with
    enhanced_df = ingredient_df_with_allergens.copy()
    
    # Check for exclusions and nullify allergen data if needed
    exclusion_count = 0
    excluded_ingredients = []
    
    for idx, row in enhanced_df.iterrows():
        ingredient_name = row['ingredient_name'].lower().strip()
        
        # Check if ingredient matches any exclusion term
        is_excluded = False
        for exclusion_term in exclusion_terms:
            if exclusion_term.lower() in ingredient_name:
                is_excluded = True
                exclusion_count += 1
                excluded_ingredients.append({
                    'ingredient_name': row['ingredient_name'],
                    'original_allergen': row['primary_allergen'],
                    'original_allergen_id': row['allergen_group_id'],
                    'exclusion_term': exclusion_term
                })
                break
        
        # If excluded, nullify allergen information
        if is_excluded:
            enhanced_df.at[idx, 'primary_allergen'] = None
            enhanced_df.at[idx, 'allergen_group_id'] = None
    
    # Add isAllergen column based on allergen_group_id
    enhanced_df['isAllergen'] = enhanced_df['allergen_group_id'].notna()
    
    # Calculate statistics
    total_ingredients = len(enhanced_df)
    ingredients_with_allergens = enhanced_df['isAllergen'].sum()
    ingredients_without_allergens = total_ingredients - ingredients_with_allergens
    
    print(f"\n✅ EXCLUSION AND FLAG RESULTS:")
    print(f"   Total ingredients processed: {total_ingredients}")
    print(f"   Excluded due to exclusion terms: {exclusion_count}")
    print(f"   Final ingredients with allergens: {ingredients_with_allergens}")
    print(f"   Final ingredients without allergens: {ingredients_without_allergens}")
    print(f"   Allergen rate: {(ingredients_with_allergens/total_ingredients)*100:.1f}%")
    
    # Show excluded ingredients
    if excluded_ingredients:
        print(f"\n🚫 Excluded ingredients (due to exclusion terms):")
        for item in excluded_ingredients:
            print(f"   - '{item['ingredient_name']}' (was: {item['original_allergen']}) → excluded by '{item['exclusion_term']}'")
    
    # Show updated allergen breakdown
    print(f"\n📊 Updated allergen breakdown:")
    allergen_counts = enhanced_df[enhanced_df['isAllergen']]['primary_allergen'].value_counts()
    for allergen, count in allergen_counts.items():
        print(f"   - {allergen}: {count} ingredients")
    
    # Show sample of final structure
    print(f"\n📋 Sample of enhanced dataframe:")
    sample_cols = ['ingredient_id', 'ingredient_name', 'primary_allergen', 'allergen_group_id', 'isAllergen']
    print("With allergens:")
    sample_with = enhanced_df[enhanced_df['isAllergen']].head(5)
    for _, row in sample_with.iterrows():
        print(f"   ID: {row['ingredient_id']}, Name: '{row['ingredient_name']}', Allergen: {row['primary_allergen']}, isAllergen: {row['isAllergen']}")
    
    print("\nWithout allergens:")
    sample_without = enhanced_df[~enhanced_df['isAllergen']].head(5)
    for _, row in sample_without.iterrows():
        print(f"   ID: {row['ingredient_id']}, Name: '{row['ingredient_name']}', isAllergen: {row['isAllergen']}")
    
    return enhanced_df

# Apply the enhancement
ingredient_df_final = add_exclusion_terms_and_allergen_flag(ingredient_df_with_allergens)

In [None]:
# Final validation and export of enhanced ingredient dataframe
print("📊 FINAL INGREDIENT DATAFRAME VALIDATION")
print("=" * 50)

# Check the final structure
print(f"Final dataframe columns: {list(ingredient_df_final.columns)}")
print(f"Final dataframe shape: {ingredient_df_final.shape}")

# Data type validation
print(f"\n📋 Column data types:")
for col in ingredient_df_final.columns:
    print(f"   {col}: {ingredient_df_final[col].dtype}")

# isAllergen column validation
allergen_true_count = ingredient_df_final['isAllergen'].sum()
allergen_false_count = (~ingredient_df_final['isAllergen']).sum()

print(f"\n✅ isAllergen column validation:")
print(f"   True (has allergen): {allergen_true_count}")
print(f"   False (no allergen): {allergen_false_count}")
print(f"   Total: {allergen_true_count + allergen_false_count}")

# Cross-validation: isAllergen should match allergen_group_id presence
validation_passed = True
mismatch_count = 0

for _, row in ingredient_df_final.iterrows():
    has_allergen_id = pd.notna(row['allergen_group_id'])
    is_allergen_flag = row['isAllergen']
    
    if has_allergen_id != is_allergen_flag:
        mismatch_count += 1
        if mismatch_count <= 5:  # Show first 5 mismatches
            print(f"   ⚠️ Mismatch: {row['ingredient_name']} - allergen_group_id: {row['allergen_group_id']}, isAllergen: {is_allergen_flag}")
        validation_passed = False

if validation_passed:
    print("   ✅ All isAllergen flags match allergen_group_id presence")
else:
    print(f"   ❌ Found {mismatch_count} mismatches between isAllergen and allergen_group_id")

# Show final breakdown by allergen type
print(f"\n📈 Final allergen distribution:")
allergen_breakdown = ingredient_df_final[ingredient_df_final['isAllergen']]['primary_allergen'].value_counts()
for allergen, count in allergen_breakdown.items():
    percentage = (count / allergen_true_count) * 100
    print(f"   {allergen}: {count} ingredients ({percentage:.1f}%)")

# Export final results
final_output_filename = "ingredient_master_final_with_allergens.xlsx"
ingredient_df_final.to_excel(final_output_filename, index=False)
print(f"\n💾 Exported final ingredient dataframe to '{final_output_filename}'")

# Display final structure sample
print(f"\n📋 Final dataframe structure sample:")
display(ingredient_df_final[['ingredient_id', 'ingredient_name', 'primary_allergen', 'allergen_group_id', 'isAllergen']].head(15))

Finish Populating Data

In [None]:
#save the final dataset
df.to_excel("cfirstversion_current_dataset.xlsx", index=False)
print("✅ DataFrame saved to 'cfirstversion_current_dataset.xlsx'")

Creating Category With Recipe Id

In [None]:
# Define categories and assign IDs
categories = [
    'vegan', 'vegetarian', 'pescetarian', 'dairy_free', 
    'egg_free', 'soy_free', 'nut_free', 'gluten_free',
    'halal', 'non_halal', 'non_veg'
]

category_id_map = {category: idx for idx, category in enumerate(categories, 1)}

# Create a reference DataFrame for categories
category_df = pd.DataFrame({
    'category_id': list(category_id_map.values()),
    'category_name': list(category_id_map.keys())
})

print("Category DataFrame:")
print(category_df)

In [None]:
df.shape
df.head(10)

## 🆔 Create Ingredient Master DataFrame with Unique IDs

##### Detail

In [None]:
def map_ingredient_ids_to_recipe_df(recipe_ingredient_df, ingredient_id_mapping, use_standardized=True):
    """
    Map ingredient IDs to the recipe ingredient DataFrame using cleaned ingredient names
    
    Args:
        recipe_ingredient_df: DataFrame with recipe-ingredient relationships
        ingredient_id_mapping: Dictionary mapping cleaned ingredient names to IDs
        use_standardized: If True, use standardized_ingredient; if False, use single_ingredient
        
    Returns:
        DataFrame: Updated recipe_ingredient_df with ingredient_id column
    """
    print("🔗 MAPPING INGREDIENT IDS TO RECIPE DATAFRAME")
    print("-" * 50)
    
    # Choose which ingredient column to use
    ingredient_column = 'standardized_ingredient' if use_standardized else 'single_ingredient'
    
    if ingredient_column not in recipe_ingredient_df.columns:
        print(f"❌ Column '{ingredient_column}' not found in recipe_ingredient_df")
        return recipe_ingredient_df
    
    # Create a copy to avoid modifying the original
    updated_df = recipe_ingredient_df.copy()
    
    # Clean the ingredient names in the DataFrame and map to IDs
    def get_ingredient_id(ingredient_name):
        if pd.isna(ingredient_name):
            return None
        cleaned_name = clean_ingredient_name_symbols(ingredient_name)
        return ingredient_id_mapping.get(cleaned_name, None)
    
    updated_df['ingredient_id'] = updated_df[ingredient_column].apply(get_ingredient_id)
    
    # Check mapping results
    total_rows = len(updated_df)
    mapped_rows = updated_df['ingredient_id'].notna().sum()
    unmapped_rows = total_rows - mapped_rows
    
    print(f"✅ Ingredient ID mapping complete:")
    print(f"   Total rows: {total_rows}")
    print(f"   Successfully mapped: {mapped_rows} ({(mapped_rows/total_rows)*100:.1f}%)")
    print(f"   Unmapped: {unmapped_rows} ({(unmapped_rows/total_rows)*100:.1f}%)")
    
    if unmapped_rows > 0:
        print(f"\n🔍 Sample unmapped ingredients:")
        unmapped_sample = updated_df[updated_df['ingredient_id'].isna()][ingredient_column].dropna().unique()[:5]
        for ingredient in unmapped_sample:
            cleaned = clean_ingredient_name_symbols(ingredient)
            print(f"   Original: '{ingredient}' → Cleaned: '{cleaned}'")
    
    return updated_df

In [None]:
# Step 2: Map ingredient IDs back to recipe DataFrame
if 'ingredient_id_mapping' in locals() and 'recipe_ingredient_df' in locals():
    print("\n📋 Step 2: Mapping ingredient IDs to recipe DataFrame...")
    
    # Update the recipe_ingredient_df with ingredient IDs
    recipe_ingredient_df_with_ids = map_ingredient_ids_to_recipe_df(
        recipe_ingredient_df,
        ingredient_id_mapping,
        use_standardized=True
    )
    
    print(f"\n📋 Sample of updated recipe_ingredient_df with IDs:")
    sample_cols = ['recipe_id', 'single_ingredient', 'standardized_ingredient', 'ingredient_id']
    available_cols = [col for col in sample_cols if col in recipe_ingredient_df_with_ids.columns]
    display(recipe_ingredient_df_with_ids[available_cols].head(10))
    
    print(f"\n📊 Final DataFrame Statistics:")
    print(f"   Recipe DataFrame shape: {recipe_ingredient_df_with_ids.shape}")
    print(f"   Unique recipes: {recipe_ingredient_df_with_ids['recipe_id'].nunique()}")
    print(f"   Unique ingredients: {len(ingredient_df)}")
    print(f"   Recipe-ingredient relationships: {len(recipe_ingredient_df_with_ids)}")
    
else:
    print("❌ Required variables not found for ID mapping")



In [None]:
# Step 3: Final validation and export
print("📋 Step 3: Final Data Validation and Export")
print("-" * 50)

if 'ingredient_df' in locals() and 'recipe_ingredient_df_with_ids' in locals():
    
    # Validation checks
    print("🔍 FINAL VALIDATION CHECKS:")
    
    # Check 1: Ingredient DataFrame cleanliness
    total_ingredients = len(ingredient_df)
    clean_ingredients = ingredient_df['ingredient_name'].notna().sum()
    empty_ingredients = ingredient_df['ingredient_name'].isna().sum()
    
    print(f"✅ Ingredient Master DataFrame:")
    print(f"   Total unique ingredients: {total_ingredients}")
    print(f"   Clean ingredients: {clean_ingredients}")
    print(f"   Empty/null ingredients: {empty_ingredients}")
    
    # Check 2: Recipe-Ingredient mapping completeness
    total_relationships = len(recipe_ingredient_df_with_ids)
    mapped_relationships = recipe_ingredient_df_with_ids['ingredient_id'].notna().sum()
    unmapped_relationships = total_relationships - mapped_relationships
    
    print(f"\n✅ Recipe-Ingredient Relationships:")
    print(f"   Total relationships: {total_relationships}")
    print(f"   Successfully mapped: {mapped_relationships} ({(mapped_relationships/total_relationships)*100:.1f}%)")
    print(f"   Unmapped: {unmapped_relationships} ({(unmapped_relationships/total_relationships)*100:.1f}%)")
    
    # Check 3: Data consistency
    unique_ingredient_ids_in_recipes = recipe_ingredient_df_with_ids['ingredient_id'].dropna().nunique()
    total_ingredient_ids_in_master = ingredient_df['ingredient_id'].nunique()
    
    print(f"\n✅ Data Consistency:")
    print(f"   Unique ingredient IDs in recipes: {unique_ingredient_ids_in_recipes}")
    print(f"   Total ingredient IDs in master: {total_ingredient_ids_in_master}")
    print(f"   Coverage: {(unique_ingredient_ids_in_recipes/total_ingredient_ids_in_master)*100:.1f}%")
    
    # Export cleaned datasets
    print(f"\n💾 EXPORTING CLEANED DATASETS:")
    
    try:
        # Export ingredient master DataFrame
        ingredient_filename = "cleaned_ingredient_master.xlsx"
        ingredient_df.to_excel(ingredient_filename, index=False)
        print(f"✅ Exported ingredient master: {ingredient_filename}")
        
        # Export recipe-ingredient DataFrame with IDs
        recipe_ingredient_filename = "cleaned_recipe_ingredient_with_ids.xlsx"
        recipe_ingredient_df_with_ids.to_excel(recipe_ingredient_filename, index=False)
        print(f"✅ Exported recipe-ingredient data: {recipe_ingredient_filename}")
        
        print(f"\n🎉 DATA PROCESSING COMPLETE!")
        print(f"📁 Files ready for downstream modeling and analysis")
        
    except Exception as e:
        print(f"❌ Export error: {str(e)}")
        
else:
    print("❌ Required DataFrames not found for validation")

print("\n" + "=" * 60)

In [None]:
# Debug: Check the format of standardized ingredients
print("🔍 DEBUGGING STANDARDIZED INGREDIENT FORMAT")
print("-" * 50)

if 'recipe_ingredient_df' in locals():
    # Sample some standardized ingredients to see their actual format
    sample_ingredients = recipe_ingredient_df['standardized_ingredient'].dropna().head(20).tolist()
    
    print("📋 Sample standardized ingredients (raw format):")
    for i, ingredient in enumerate(sample_ingredients[:10], 1):
        print(f"   {i}. {repr(ingredient)}")  # repr shows quotes and special characters
    
    # Test the cleaning function on these samples
    print(f"\n🧹 Testing cleaning function:")
    for i, ingredient in enumerate(sample_ingredients[:5], 1):
        cleaned = clean_ingredient_name_symbols(ingredient)
        print(f"   {i}. Original: {repr(ingredient)}")
        print(f"      Cleaned:  {repr(cleaned)}")
        print()

else:
    print("❌ recipe_ingredient_df not found")

In [None]:
# Check what ner_ingredient_string contains
print("🔍 COMPARING ner_ingredient vs ner_ingredient_string")
print("=" * 60)

# Check first 3 recipes
for i in range(3):
    row = df.iloc[i]
    print(f"\n📋 Recipe {i+1}: {row['name'][:50]}...")
    print(f"ner_ingredient (type {type(row['ner_ingredient'])}): {row['ner_ingredient']}")
    print(f"ner_ingredient_string (type {type(row['ner_ingredient_string'])}): {row['ner_ingredient_string']}")
    print("-" * 50)

In [None]:
# Simple check of ner_ingredient_string
row = df.iloc[0]
print("ner_ingredient_string:")
print(f"Type: {type(row['ner_ingredient_string'])}")
print(f"Content: {row['ner_ingredient_string']}")
print(f"Length: {len(str(row['ner_ingredient_string']))}")

# Check if it's already a clean string
if isinstance(row['ner_ingredient_string'], str):
    print("✅ ner_ingredient_string is already a string - perfect for direct use!")
else:
    print("❌ ner_ingredient_string is not a string")

In [None]:
def classify_recipe_simplified(recipe_data, dietary_tags):
    """
    Simplified classify_recipe function using ner_ingredient_string directly
    """
    # Use ner_ingredient_string directly (it's already a clean string)
    ner_ingredient_string = recipe_data.get("ner_ingredient_string", "")
    combined_text = str(ner_ingredient_string).lower()

    # Handle allergen data (keeping existing logic)
    detected_allergens = recipe_data.get("allergen", [])
    if isinstance(detected_allergens, str):
        try:
            import ast
            detected_allergens = ast.literal_eval(detected_allergens)
        except (ValueError, SyntaxError):
            detected_allergens = []
    
    allergen = [str(i).strip().lower() for i in detected_allergens if i]
    allergen_text = ' '.join(allergen).lower()

    matched_tags = []

    # Step 1: Match all possible tags
    for tag, rules in dietary_tags.items():
        exclude_found = False

        # Step 1a: Check excluded ingredients
        if "excluded_ingredients" in rules:
            for word in rules["excluded_ingredients"]:
                if word.lower() in combined_text:
                    exclude_found = True
                    break

        # Step 1b: Check excluded allergen groups
        if "excluded_allergen_groups" in rules:
            for group in rules["excluded_allergen_groups"]:
                if group in allergen_text:
                    exclude_found = True
                    break

        # Step 1c: Required ingredients check (e.g., for non_halal)
        if "required_ingredients" in rules:
            required_match = any(word in combined_text for word in rules["required_ingredients"])
            if not required_match:
                continue

        if not exclude_found:
            matched_tags.append(tag)

    # Define categories according to user requirements
    GENERAL_DIET_TAGS = ["vegan", "vegetarian", "pescetarian", "non_veg"]
    RELIGIOUS_TAGS = ["halal", "non_halal"]
    ALLERGEN_TAGS = ["dairy_free", "egg_free", "soy_free", "nut_free", "gluten_free"]

    result = {
        "general_diet": None,
        "religious_tag": None,
        "allergen_tags": [],
    }

    # Step 2: Select only one general diet (priority-based)
    for tag in GENERAL_DIET_TAGS:
        if tag in matched_tags:
            result["general_diet"] = tag
            break
    else:
        # Fallback (shouldn't happen unless none of the 4 are valid)
        result["general_diet"] = "non_veg"

    # Step 3: Select at most one religious tag (prefer halal over non_halal)
    for tag in RELIGIOUS_TAGS:
        if tag in matched_tags:
            result["religious_tag"] = tag
            break

    # Step 4: Select all applicable allergen-friendly tags
    result["allergen_tags"] = [tag for tag in matched_tags if tag in ALLERGEN_TAGS]

    values = []
    
    # Add general diet if present
    if result.get("general_diet"):
        values.append(result["general_diet"])
    
    # Add religious tag if present
    if result.get("religious_tag"):
        values.append(result["religious_tag"])
    
    # Add allergen tags if present
    allergen_tags = result.get("allergen_tags", [])
    if allergen_tags:
        values.extend(allergen_tags)
    
    # Join all values with commas
    return ", ".join(values)