# 🎯 Advanced Texture Prediction System

This section implements an improved texture prediction system that combines:
1. **Rule-based approach** using `texture_label.py` 
2. **Unsupervised learning** with enhanced feature engineering
3. **Hybrid prediction** that leverages both approaches for better accuracy

In [780]:
# 📚 Import texture_label.py and analyze current texture distribution
import os
import sys
import importlib.util
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# import seaborn as sns  # Not required for this analysis

print("🔍 ANALYZING CURRENT TEXTURE PREDICTION SYSTEM")
print("=" * 60)

# Import texture_label.py
texture_label_path = os.path.join(os.getcwd(), "preprocessing_techniques", "mappers", "texture_label.py")
spec = importlib.util.spec_from_file_location("texture_label", texture_label_path)
texture_label = importlib.util.module_from_spec(spec)
spec.loader.exec_module(texture_label)

# Get texture configuration
texture_config = texture_label.texture_config
print("📋 Texture categories in texture_label.py:")
for texture_type, config in texture_config.items():
    keyword_count = len(config['keywords'])
    print(f"   {texture_type}: {keyword_count} keywords")
    # Show first few keywords as example
    sample_keywords = config['keywords'][:3]
    print(f"      Examples: {sample_keywords}")

# Find which dataframes contain texture information
texture_df_candidates = []
global_vars = list(globals().keys())  # Create a copy to avoid iteration issues
for var_name in global_vars:
    try:
        var_obj = globals()[var_name]
        if hasattr(var_obj, 'columns') and hasattr(var_obj, 'shape') and 'texture' in var_obj.columns:
            texture_df_candidates.append((var_name, var_obj))
    except:
        pass

print(f"\n🔍 Found {len(texture_df_candidates)} dataframes with 'texture' column:")
for name, df_obj in texture_df_candidates:
    print(f"   {name}: {df_obj.shape}")

# Use the main dataframe (prioritize 'recipes_testing' if available)
target_df = None
df_name = None

priority_names = ['recipes_testing', 'recipes', 'df', 'recipe_df']
for priority_name in priority_names:
    for name, df_obj in texture_df_candidates:
        if name == priority_name:
            target_df = df_obj
            df_name = name
            break
    if target_df is not None:
        break

if target_df is None and texture_df_candidates:
    df_name, target_df = texture_df_candidates[0]

if target_df is not None:
    print(f"\n✅ Using dataframe: {df_name} with shape {target_df.shape}")
    
    # Analyze current texture distribution
    texture_distribution = target_df['texture'].value_counts()
    print(f"\n📊 Current texture distribution:")
    for texture, count in texture_distribution.items():
        percentage = (count / len(target_df)) * 100
        print(f"   {texture}: {count} ({percentage:.1f}%)")
    
    # Check if predicted_texture exists
    if 'predicted_texture' in target_df.columns:
        predicted_distribution = target_df['predicted_texture'].value_counts()
        print(f"\n🤖 Current predicted texture distribution:")
        for texture, count in predicted_distribution.items():
            percentage = (count / len(target_df)) * 100
            print(f"   {texture}: {count} ({percentage:.1f}%)")
            
        # Check the bias issue
        if len(predicted_distribution) == 1 and 'puree' in predicted_distribution.index:
            print("\n🚨 PROBLEM IDENTIFIED: All predictions are 'puree' - severe bias!")
    else:
        print("\n❓ No 'predicted_texture' column found - will create predictions")
else:
    print("\n❌ No dataframe with texture information found!")
    print("Available DataFrames:")
    for var_name in globals():
        try:
            var_obj = globals()[var_name]
            if hasattr(var_obj, 'columns') and hasattr(var_obj, 'shape'):
                print(f"   {var_name}: {var_obj.shape}")
        except:
            pass

🔍 ANALYZING CURRENT TEXTURE PREDICTION SYSTEM
📋 Texture categories in texture_label.py:
   puree: 24 keywords
      Examples: ['puree', 'blender', 'food processor']
   lumpy texture: 24 keywords
      Examples: ['finely chopped', 'small cubes', 'bite-sized pieces']
   soft finger food: 26 keywords
      Examples: ['roll into balls', 'flatten', 'pan-fry soft']
   family food: 30 keywords
      Examples: ['for adults', 'seasoned for adults', 'serve as is']

🔍 Found 47 dataframes with 'texture' column:
   __: (10, 35)
   ___: (5, 35)
   _1: (5, 35)
   _2: (10, 35)
   df_ingredients: (1309, 42)
   recipe_id_null: (0, 42)
   _96: (10, 42)
   trial: (1322, 44)
   recipe: (1322, 27)
   meat_recipes: (472, 42)
   dairy_recipes: (568, 42)
   egg_recipes: (318, 42)
   sample_df: (10, 43)
   df_category: (1322, 44)
   df_empty_nutrition: (446, 43)
   _243: (5, 35)
   _244: (10, 35)
   _252: (10, 3)
   total_none_texture: (822, 38)
   missing_texture: (0, 38)
   _335: (10, 42)
   recipe_df: (1322,

In [938]:
# 🔧 Enhanced Rule-Based Texture Classification using texture_label.py
def enhanced_rule_based_texture_prediction(df, texture_config):
    """
    Enhanced rule-based texture classification using texture_label.py configuration
    with improved text processing and scoring system
    """
    print("🛠️ ENHANCED RULE-BASED TEXTURE PREDICTION")
    print("-" * 50)
    
    def parse_instructions(instructions):
        """Parse instructions into a clean string"""
        if pd.isna(instructions) or instructions is None:
            return ""
        
        if isinstance(instructions, list):
            return ' '.join([str(inst) for inst in instructions if inst])
        
        return str(instructions).lower()
    
    def classify_texture_enhanced(row):
        """Enhanced texture classification with weighted scoring"""
        # Gather all text data
        recipe_name = str(row.get('name', row.get('recipe_name', ''))).lower()
        
        # Handle different ingredient column names
        ingredients_text = ""
        for col in ['ner_ingredient_string', 'ner_ingredient', 'ingredients']:
            if col in row.index and pd.notna(row[col]):
                if isinstance(row[col], list):
                    ingredients_text = ' '.join([str(i) for i in row[col] if i])
                else:
                    ingredients_text = str(row[col])
                break
        
        instructions_text = parse_instructions(row.get('instructions', ''))
        
        # Combine all text
        combined_text = f"{recipe_name} {ingredients_text} {instructions_text}".lower()
        
        # Score each texture category
        texture_scores = {}
        for texture_type, config in texture_config.items():
            score = 0
            matched_keywords = []
            
            for keyword in config['keywords']:
                keyword_lower = keyword.lower()
                # Count occurrences of keyword (gives more weight to repeated mentions)
                count = combined_text.count(keyword_lower)
                if count > 0:
                    score += count
                    matched_keywords.append(keyword)
            
            texture_scores[texture_type] = {
                'score': score,
                'matched_keywords': matched_keywords
            }
        
        # Find texture with highest score
        if not any(score_data['score'] > 0 for score_data in texture_scores.values()):
            return "NONE", 0, []
        
        best_texture = max(texture_scores.keys(), key=lambda x: texture_scores[x]['score'])
        best_score = texture_scores[best_texture]['score']
        best_keywords = texture_scores[best_texture]['matched_keywords']
        
        return best_texture, best_score, best_keywords
    
    # Apply enhanced classification
    results = []
    for idx, row in df.iterrows():
        texture, score, keywords = classify_texture_enhanced(row)
        results.append({
            'recipe_id': row.get('recipe_id', idx),
            'name': row.get('name', row.get('recipe_name', '')),
            'rule_based_texture': texture,
            'confidence_score': score,
            'matched_keywords': keywords[:3]  # Top 3 matched keywords
        })
    
    results_df = pd.DataFrame(results)
    
    # Show results
    rule_distribution = results_df['rule_based_texture'].value_counts()
    print("📊 Enhanced rule-based prediction results:")
    for texture, count in rule_distribution.items():
        percentage = (count / len(results_df)) * 100
        print(f"   {texture}: {count} ({percentage:.1f}%)")
    
    # Show confidence score statistics
    non_none_results = results_df[results_df['rule_based_texture'] != 'NONE']
    if not non_none_results.empty:
        print(f"\n📈 Confidence scores for classified recipes (n={len(non_none_results)}):")
        print(f"   Mean: {non_none_results['confidence_score'].mean():.2f}")
        print(f"   Median: {non_none_results['confidence_score'].median():.2f}")
        print(f"   Max: {non_none_results['confidence_score'].max()}")
    
    # Show examples of each texture type
    print(f"\n📋 Examples of classified textures:")
    for texture in rule_distribution.index:
        if texture != 'NONE':
            example = results_df[results_df['rule_based_texture'] == texture].iloc[0]
            print(f"   {texture}: '{example['name'][:50]}...' (score: {example['confidence_score']})")
            print(f"      Keywords: {example['matched_keywords']}")
    
    return results_df

# Execute enhanced rule-based prediction if we have data
if target_df is not None:
    enhanced_results = enhanced_rule_based_texture_prediction(target_df, texture_config)
    
    # Add results to main dataframe
    target_df['rule_based_texture'] = enhanced_results['rule_based_texture']
    target_df['rule_confidence_score'] = enhanced_results['confidence_score']
    
    print("\n✅ Enhanced rule-based texture prediction completed!")
else:
    print("\n❌ Cannot proceed without target dataframe")

🛠️ ENHANCED RULE-BASED TEXTURE PREDICTION
--------------------------------------------------
📊 Enhanced rule-based prediction results:
   puree: 487 (36.8%)
   NONE: 487 (36.8%)
   lumpy texture: 191 (14.4%)
   family food: 137 (10.4%)
   soft finger food: 20 (1.5%)

📈 Confidence scores for classified recipes (n=835):
   Mean: 2.29
   Median: 2.00
   Max: 12

📋 Examples of classified textures:
   puree: 'Bitterballs (Bitterballen)...' (score: 1)
      Keywords: ['smooth']
   lumpy texture: 'Cassava Porridge with Fish Sauce and Lemon (Bubur ...' (score: 2)
      Keywords: ['finely chopped']
   family food: 'Honey Soy Lamb Stir Fry...' (score: 2)
      Keywords: ['noodles']
   soft finger food: 'Cheesy Rissoles...' (score: 1)
      Keywords: ['roll into balls']

✅ Enhanced rule-based texture prediction completed!


In [799]:
# 🎯 Advanced Feature Engineering for Unsupervised Texture Prediction

def create_advanced_texture_features(df, texture_config):
    """
    Create sophisticated features for unsupervised texture classification
    """
    print("🛠️ CREATING ADVANCED TEXTURE FEATURES")
    print("-" * 50)
    
    features_list = []
    
    for idx, row in df.iterrows():
        # Gather text data
        recipe_name = str(row.get('name', row.get('recipe_name', ''))).lower()
        
        ingredients_text = ""
        for col in ['ner_ingredient_string', 'ner_ingredient', 'ingredients']:
            if col in row.index and pd.notna(row[col]):
                if isinstance(row[col], list):
                    ingredients_text = ' '.join([str(i) for i in row[col] if i])
                else:
                    ingredients_text = str(row[col])
                break
        
        instructions_text = str(row.get('instructions', '')).lower() if pd.notna(row.get('instructions')) else ""
        combined_text = f"{recipe_name} {ingredients_text} {instructions_text}".lower()
        
        # Feature extraction
        features = {}
        
        # 1. Keyword density features (normalized by text length)
        text_length = max(len(combined_text.split()), 1)
        for texture_type, config in texture_config.items():
            keyword_count = sum(1 for keyword in config['keywords'] if keyword.lower() in combined_text)
            features[f'{texture_type}_keyword_density'] = keyword_count / text_length
            features[f'{texture_type}_keyword_count'] = keyword_count
        
        # 2. Cooking method features
        cooking_methods = {
            'puree_methods': ['blend', 'puree', 'process', 'strain', 'smooth', 'liquify'],
            'chop_methods': ['chop', 'dice', 'cut', 'slice', 'cube'],
            'mash_methods': ['mash', 'crush', 'press', 'squash'],
            'heat_methods': ['cook', 'steam', 'boil', 'fry', 'bake', 'grill', 'roast'],
            'raw_methods': ['raw', 'fresh', 'uncooked', 'cold']
        }
        
        for method_type, methods in cooking_methods.items():
            count = sum(1 for method in methods if method in combined_text)
            features[f'{method_type}_count'] = count
        
        # 3. Ingredient complexity features
        ingredient_words = ingredients_text.split()
        features['ingredient_count'] = len([w for w in ingredient_words if w.strip()])
        features['unique_ingredient_ratio'] = len(set(ingredient_words)) / max(len(ingredient_words), 1)
        
        # 4. Age-related features
        age_indicators = {
            'baby_terms': ['baby', 'infant', 'month', 'weaning', 'first', 'starter'],
            'toddler_terms': ['toddler', 'finger', 'self', 'pick', 'hold'],
            'family_terms': ['family', 'adult', 'everyone', 'table', 'normal', 'regular']
        }
        
        for age_type, terms in age_indicators.items():
            count = sum(1 for term in terms if term in combined_text)
            features[f'{age_type}_count'] = count
        
        # 5. Texture descriptor features
        texture_descriptors = {
            'smooth_terms': ['smooth', 'creamy', 'liquid', 'runny', 'thin'],
            'lumpy_terms': ['lumpy', 'chunky', 'pieces', 'bits', 'texture'],
            'firm_terms': ['firm', 'solid', 'thick', 'dense', 'compact'],
            'soft_terms': ['soft', 'tender', 'gentle', 'easy', 'gum']
        }
        
        for desc_type, terms in texture_descriptors.items():
            count = sum(1 for term in terms if term in combined_text)
            features[f'{desc_type}_count'] = count
        
        # 6. Ingredient-based texture hints
        texture_hint_ingredients = {
            'liquid_ingredients': ['milk', 'water', 'broth', 'juice', 'oil'],
            'soft_ingredients': ['banana', 'avocado', 'yogurt', 'cheese', 'tofu'],
            'firm_ingredients': ['meat', 'chicken', 'fish', 'bread', 'rice'],
            'crunchy_ingredients': ['nuts', 'seeds', 'crackers', 'cereal']
        }
        
        for hint_type, ingredients in texture_hint_ingredients.items():
            count = sum(1 for ing in ingredients if ing in ingredients_text.lower())
            features[f'{hint_type}_count'] = count
        
        # 7. Length and complexity features
        features['recipe_name_length'] = len(recipe_name.split())
        features['instructions_length'] = len(instructions_text.split())
        features['total_text_length'] = len(combined_text.split())
        
        # 8. Age from recipe (if available)
        features['age_months'] = row.get('min_age', row.get('age', 6))  # Default 6 months
        
        # 9. TF-IDF style features for key texture words
        key_texture_words = ['puree', 'smooth', 'chunky', 'pieces', 'finger', 'family', 'baby']
        for word in key_texture_words:
            features[f'has_{word}'] = 1 if word in combined_text else 0
        
        features_list.append(features)
    
    features_df = pd.DataFrame(features_list)
    
    print(f"✅ Created {len(features_df.columns)} advanced features:")
    print(f"   Feature categories:")
    print(f"   - Keyword density features: {len([c for c in features_df.columns if 'keyword_density' in c])}")
    print(f"   - Cooking method features: {len([c for c in features_df.columns if 'methods_count' in c])}")
    print(f"   - Age indicator features: {len([c for c in features_df.columns if 'terms_count' in c])}")
    print(f"   - Texture descriptor features: {len([c for c in features_df.columns if any(x in c for x in ['smooth', 'lumpy', 'firm', 'soft'])])}")
    print(f"   - Other features: {len(features_df.columns) - sum([len([c for c in features_df.columns if pattern in c]) for pattern in ['keyword_density', 'methods_count', 'terms_count']])}")
    
    # Handle any NaN values and ensure all columns are numeric
    features_df = features_df.fillna(0)
    
    # Convert all columns to numeric, forcing any strings to NaN and then to 0
    for col in features_df.columns:
        features_df[col] = pd.to_numeric(features_df[col], errors='coerce').fillna(0)
    
    return features_df

# Execute advanced feature engineering if we have data
if target_df is not None:
    print("\n🚀 Creating advanced features for unsupervised learning...")
    advanced_features = create_advanced_texture_features(target_df, texture_config)
    
    # Show feature statistics
    print(f"\n📊 Feature statistics:")
    print(f"   Shape: {advanced_features.shape}")
    
    # Ensure all columns are numeric before calculating statistics
    numeric_features = advanced_features.select_dtypes(include=[np.number])
    if len(numeric_features.columns) > 0:
        print(f"   Non-zero feature ratio: {(numeric_features > 0).mean().mean():.3f}")
    else:
        print("   Warning: No numeric features found")
    
    # Show correlation between rule-based results and features
    if 'rule_based_texture' in target_df.columns:
        print(f"\n🔗 Top features correlated with rule-based classifications:")
        for texture in ['puree', 'lumpy texture', 'soft finger food', 'family food']:
            texture_mask = target_df['rule_based_texture'] == texture
            if texture_mask.sum() > 5:  # Only if we have enough samples
                relevant_features = advanced_features[texture_mask].mean()
                top_features = relevant_features.nlargest(3)
                print(f"   {texture}: {list(top_features.index)}")
    
    print("\n✅ Advanced feature engineering completed!")
else:
    print("\n❌ Cannot proceed without target dataframe")


🚀 Creating advanced features for unsupervised learning...
🛠️ CREATING ADVANCED TEXTURE FEATURES
--------------------------------------------------
✅ Created 37 advanced features:
   Feature categories:
   - Keyword density features: 4
   - Cooking method features: 5
   - Age indicator features: 7
   - Texture descriptor features: 11
   - Other features: 21

📊 Feature statistics:
   Shape: (1322, 37)
   Non-zero feature ratio: 0.385

🔗 Top features correlated with rule-based classifications:
   puree: ['total_text_length', 'instructions_length', 'ingredient_count']
   lumpy texture: ['total_text_length', 'instructions_length', 'ingredient_count']
   soft finger food: ['total_text_length', 'instructions_length', 'ingredient_count']
   family food: ['total_text_length', 'instructions_length', 'ingredient_count']

✅ Advanced feature engineering completed!


In [907]:
# 🤖 Improved Unsupervised Learning with Ensemble Clustering

def advanced_unsupervised_texture_prediction(features_df, target_df, texture_config, n_clusters=4):
    """
    Advanced unsupervised texture prediction using multiple algorithms and ensemble approach
    """
    print("🤖 ADVANCED UNSUPERVISED TEXTURE PREDICTION")
    print("-" * 50)
    
    from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
    from sklearn.mixture import GaussianMixture
    from sklearn.preprocessing import StandardScaler
    from sklearn.decomposition import PCA
    from sklearn.metrics import silhouette_score
    from collections import Counter
    
    # Prepare features
    X = features_df.values
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Dimensionality reduction
    pca = PCA(n_components=min(15, X_scaled.shape[1]))
    X_pca = pca.fit_transform(X_scaled)
    
    print(f"📊 Data preparation:")
    print(f"   Original features: {X.shape[1]}")
    print(f"   PCA components: {X_pca.shape[1]}")
    print(f"   Explained variance ratio: {pca.explained_variance_ratio_.sum():.3f}")
    
    # Multiple clustering algorithms
    clustering_algorithms = {
        'kmeans': KMeans(n_clusters=n_clusters, random_state=42, n_init=10),
        'hierarchical': AgglomerativeClustering(n_clusters=n_clusters),
        'gmm': GaussianMixture(n_components=n_clusters, random_state=42)
    }
    
    cluster_results = {}
    silhouette_scores = {}
    
    for name, algorithm in clustering_algorithms.items():
        print(f"\n🔄 Running {name} clustering...")
        
        if name == 'gmm':
            clusters = algorithm.fit_predict(X_pca)
        else:
            clusters = algorithm.fit_predict(X_pca)
        
        cluster_results[name] = clusters
        
        # Calculate silhouette score
        if len(set(clusters)) > 1:
            sil_score = silhouette_score(X_pca, clusters)
            silhouette_scores[name] = sil_score
            print(f"   Silhouette score: {sil_score:.3f}")
        else:
            silhouette_scores[name] = -1
            print(f"   Warning: Only one cluster found")
        
        # Show cluster distribution
        cluster_counts = Counter(clusters)
        print(f"   Cluster distribution: {dict(cluster_counts)}")
    
    # Choose best clustering algorithm
    best_algorithm = max(silhouette_scores.keys(), key=lambda x: silhouette_scores[x])
    best_clusters = cluster_results[best_algorithm]
    
    print(f"\n🏆 Best algorithm: {best_algorithm} (silhouette: {silhouette_scores[best_algorithm]:.3f})")
    
    # Smart cluster-to-texture mapping using multiple strategies
    def map_clusters_to_textures(clusters, features_df, target_df):
        """Intelligent cluster to texture mapping"""
        n_clusters = len(set(clusters))
        cluster_characteristics = {}
        
        # Strategy 1: Feature-based mapping
        for cluster_id in range(n_clusters):
            cluster_mask = clusters == cluster_id
            cluster_features = features_df[cluster_mask]
            
            # Calculate feature importance for this cluster
            feature_means = cluster_features.mean()
            
            # Score each texture type based on relevant features
            texture_scores = {}
            for texture in texture_config.keys():
                score = 0
                score += feature_means.get(f'{texture}_keyword_density', 0) * 100
                score += feature_means.get(f'{texture}_keyword_count', 0) * 10
                texture_scores[texture] = score
            
            # Age-based adjustment
            avg_age = feature_means.get('age_months', 6)
            if avg_age <= 6:
                texture_scores['puree'] += 20
            elif avg_age <= 9:
                texture_scores['lumpy texture'] += 15
            elif avg_age <= 12:
                texture_scores['soft finger food'] += 15
            else:
                texture_scores['family food'] += 20
            
            # Cooking method adjustment
            if feature_means.get('puree_methods_count', 0) > 0:
                texture_scores['puree'] += 15
            if feature_means.get('chop_methods_count', 0) > 0:
                texture_scores['lumpy texture'] += 10
                texture_scores['soft finger food'] += 5
            
            cluster_characteristics[cluster_id] = {
                'size': cluster_mask.sum(),
                'texture_scores': texture_scores,
                'avg_age': avg_age,
                'feature_means': feature_means
            }
        
        # Strategy 2: Rule-based guidance (if available)
        if 'rule_based_texture' in target_df.columns:
            for cluster_id in range(n_clusters):
                cluster_mask = clusters == cluster_id
                cluster_rule_textures = target_df[cluster_mask]['rule_based_texture']
                
                # If most rule-based predictions agree, boost that texture
                if not cluster_rule_textures.empty:
                    most_common_rule = cluster_rule_textures.mode()
                    if not most_common_rule.empty and most_common_rule.iloc[0] != 'NONE':
                        dominant_texture = most_common_rule.iloc[0]
                        if dominant_texture in cluster_characteristics[cluster_id]['texture_scores']:
                            cluster_characteristics[cluster_id]['texture_scores'][dominant_texture] += 25
        
        # Final mapping
        texture_mapping = {}
        used_textures = set()
        
        # Sort clusters by confidence (variance in scores)
        cluster_confidence = {}
        for cluster_id, chars in cluster_characteristics.items():
            scores = list(chars['texture_scores'].values())
            confidence = max(scores) - sorted(scores)[-2] if len(scores) > 1 else max(scores)
            cluster_confidence[cluster_id] = confidence
        
        # Assign textures starting with most confident clusters
        for cluster_id in sorted(cluster_confidence.keys(), key=lambda x: cluster_confidence[x], reverse=True):
            chars = cluster_characteristics[cluster_id]
            
            # Find best texture that hasn't been used
            available_textures = [t for t in chars['texture_scores'].keys() if t not in used_textures]
            if not available_textures:
                available_textures = list(chars['texture_scores'].keys())
            
            best_texture = max(available_textures, key=lambda x: chars['texture_scores'][x])
            texture_mapping[cluster_id] = best_texture
            used_textures.add(best_texture)
            
            print(f"\\n📊 Cluster {cluster_id} → {best_texture}:")
            print(f"   Size: {chars['size']} recipes")
            print(f"   Confidence: {cluster_confidence[cluster_id]:.2f}")
            print(f"   Avg age: {chars['avg_age']:.1f} months")
            print(f"   Top features: {chars['feature_means'].nlargest(3).to_dict()}")
        
        return texture_mapping, cluster_characteristics
    
    # Apply mapping
    texture_mapping, cluster_characteristics = map_clusters_to_textures(best_clusters, features_df, target_df)
    
    # Generate predictions
    unsupervised_predictions = [texture_mapping[cluster] for cluster in best_clusters]
    
    # Add to dataframe
    target_df['unsupervised_texture'] = unsupervised_predictions
    target_df['texture_cluster_id'] = best_clusters
    target_df['clustering_algorithm'] = best_algorithm
    
    # Show results
    unsupervised_distribution = pd.Series(unsupervised_predictions).value_counts()
    print(f"\\n📈 UNSUPERVISED PREDICTION RESULTS:")
    for texture, count in unsupervised_distribution.items():
        percentage = (count / len(unsupervised_predictions)) * 100
        print(f"   {texture}: {count} ({percentage:.1f}%)")
    
    return {
        'predictions': unsupervised_predictions,
        'clusters': best_clusters,
        'algorithm': best_algorithm,
        'mapping': texture_mapping,
        'characteristics': cluster_characteristics,
        'silhouette_scores': silhouette_scores
    }

# Execute advanced unsupervised learning if we have data
if target_df is not None and 'advanced_features' in locals():
    print("\\n🚀 Running advanced unsupervised texture prediction...")
    unsupervised_results = advanced_unsupervised_texture_prediction(advanced_features, target_df, texture_config)
    
    print("\\n✅ Advanced unsupervised texture prediction completed!")
else:
    print("\\n❌ Cannot proceed - need both target_df and advanced_features")

\n🚀 Running advanced unsupervised texture prediction...
🤖 ADVANCED UNSUPERVISED TEXTURE PREDICTION
--------------------------------------------------
📊 Data preparation:
   Original features: 37
   PCA components: 15
   Explained variance ratio: 0.769

🔄 Running kmeans clustering...
   Silhouette score: 0.145
   Cluster distribution: {0: 348, 2: 273, 3: 128, 1: 573}

🔄 Running hierarchical clustering...
   Silhouette score: 0.175
   Cluster distribution: {2: 796, 0: 244, 1: 242, 3: 40}

🔄 Running gmm clustering...
   Silhouette score: 0.087
   Cluster distribution: {2: 311, 1: 588, 3: 169, 0: 254}

🏆 Best algorithm: hierarchical (silhouette: 0.175)
\n📊 Cluster 1 → puree:
   Size: 242 recipes
   Confidence: 41.60
   Avg age: 6.0 months
   Top features: {'total_text_length': 81.55371900826447, 'instructions_length': 55.5, 'ingredient_count': 21.929752066115704}
\n📊 Cluster 3 → soft finger food:
   Size: 40 recipes
   Confidence: 34.60
   Avg age: 9.8 months
   Top features: {'total_text_

In [937]:
# 🔄 Hybrid Prediction System & Comprehensive Evaluation

def create_hybrid_texture_prediction(target_df):
    """
    Create hybrid predictions combining rule-based and unsupervised approaches
    """
    print("🔄 CREATING HYBRID TEXTURE PREDICTIONS")
    print("-" * 50)
    
    def hybrid_predict(row):
        rule_texture = row.get('rule_based_texture', 'NONE')
        unsupervised_texture = row.get('unsupervised_texture', 'puree')
        rule_confidence = row.get('rule_confidence_score', 0)
        
        # Decision logic
        if rule_texture != 'NONE' and rule_confidence >= 2:
            # High confidence rule-based prediction
            return rule_texture, 'rule_based', rule_confidence
        elif rule_texture != 'NONE' and rule_texture == unsupervised_texture:
            # Both methods agree
            return rule_texture, 'consensus', rule_confidence + 1
        elif rule_texture == 'NONE':
            # No rule match, use unsupervised
            return unsupervised_texture, 'unsupervised', 1
        else:
            # Disagreement - use confidence to decide
            if rule_confidence >= 1:
                return rule_texture, 'rule_preferred', rule_confidence
            else:
                return unsupervised_texture, 'unsupervised_preferred', 0.5
    
    # Apply hybrid prediction
    hybrid_results = []
    for _, row in target_df.iterrows():
        texture, method, confidence = hybrid_predict(row)
        hybrid_results.append({
            'hybrid_texture': texture,
            'prediction_method': method,
            'hybrid_confidence': confidence
        })
    
    hybrid_df = pd.DataFrame(hybrid_results)
    
    # Add to main dataframe
    for col in hybrid_df.columns:
        target_df[col] = hybrid_df[col]
    
    # Show results
    hybrid_distribution = target_df['hybrid_texture'].value_counts()
    method_distribution = target_df['prediction_method'].value_counts()
    
    print("📊 Hybrid prediction results:")
    for texture, count in hybrid_distribution.items():
        percentage = (count / len(target_df)) * 100
        print(f"   {texture}: {count} ({percentage:.1f}%)")
    
    print("\\n🔧 Prediction methods used:")
    for method, count in method_distribution.items():
        percentage = (count / len(target_df)) * 100
        print(f"   {method}: {count} ({percentage:.1f}%)")
    
    return hybrid_df

def comprehensive_texture_evaluation(target_df):
    """
    Comprehensive evaluation of all texture prediction approaches
    """
    print("\\n📊 COMPREHENSIVE TEXTURE PREDICTION EVALUATION")
    print("=" * 60)
    
    # 1. Compare distributions
    comparison_data = []
    
    if 'texture' in target_df.columns:
        actual_dist = target_df['texture'].value_counts()
        comparison_data.append(('Actual', actual_dist))
    
    if 'rule_based_texture' in target_df.columns:
        rule_dist = target_df['rule_based_texture'].value_counts()
        comparison_data.append(('Rule-based', rule_dist))
    
    if 'unsupervised_texture' in target_df.columns:
        unsup_dist = target_df['unsupervised_texture'].value_counts()
        comparison_data.append(('Unsupervised', unsup_dist))
    
    if 'hybrid_texture' in target_df.columns:
        hybrid_dist = target_df['hybrid_texture'].value_counts()
        comparison_data.append(('Hybrid', hybrid_dist))
    
    # Create comparison table
    all_textures = set()
    for _, dist in comparison_data:
        all_textures.update(dist.index)
    all_textures = sorted(all_textures)
    
    comparison_df = pd.DataFrame(index=all_textures)
    for name, dist in comparison_data:
        comparison_df[name] = [dist.get(texture, 0) for texture in all_textures]
    
    print("📋 Distribution Comparison:")
    display(comparison_df)
    
    # 2. Accuracy evaluation (if actual textures are available)
    if 'texture' in target_df.columns:
        print("\\n🎯 ACCURACY EVALUATION:")
        
        # Filter out NONE textures for evaluation
        known_texture_mask = target_df['texture'] != 'NONE'
        eval_df = target_df[known_texture_mask].copy()
        
        if not eval_df.empty:
            actual_textures = eval_df['texture']
            
            accuracies = {}
            
            # Rule-based accuracy
            if 'rule_based_texture' in eval_df.columns:
                rule_acc = (actual_textures == eval_df['rule_based_texture']).mean()
                accuracies['Rule-based'] = rule_acc
            
            # Unsupervised accuracy
            if 'unsupervised_texture' in eval_df.columns:
                unsup_acc = (actual_textures == eval_df['unsupervised_texture']).mean()
                accuracies['Unsupervised'] = unsup_acc
            
            # Hybrid accuracy
            if 'hybrid_texture' in eval_df.columns:
                hybrid_acc = (actual_textures == eval_df['hybrid_texture']).mean()
                accuracies['Hybrid'] = hybrid_acc
            
            print(f"Evaluation on {len(eval_df)} recipes with known textures:")
            for method, accuracy in accuracies.items():
                print(f"   {method}: {accuracy:.3f} ({accuracy*100:.1f}%)")
            
            # Best performer
            if accuracies:
                best_method = max(accuracies.keys(), key=lambda x: accuracies[x])
                print(f"🏆 Best performing method: {best_method}")
            
            # Confusion matrix for best method
            if 'hybrid_texture' in eval_df.columns:
                print("\\n📊 Confusion Matrix (Hybrid Method):")
                from sklearn.metrics import confusion_matrix, classification_report
                
                labels = sorted(actual_textures.unique())
                cm = confusion_matrix(actual_textures, eval_df['hybrid_texture'], labels=labels)
                cm_df = pd.DataFrame(cm, index=labels, columns=labels)
                display(cm_df)
                
                print("\\n📈 Classification Report (Hybrid Method):")
                report = classification_report(actual_textures, eval_df['hybrid_texture'], 
                                             target_names=labels, output_dict=True)
                report_df = pd.DataFrame(report).transpose()
                display(report_df.round(3))
    
    # 3. Improvement suggestions
    print("\\n💡 IMPROVEMENT SUGGESTIONS:")
    
    if 'rule_based_texture' in target_df.columns:
        none_ratio = (target_df['rule_based_texture'] == 'NONE').mean()
        print(f"   Rule-based NONE ratio: {none_ratio:.1%}")
        if none_ratio > 0.5:
            print("   ⚠️  High NONE ratio - consider expanding keyword lists")
    
    if 'unsupervised_texture' in target_df.columns and 'clustering_algorithm' in target_df.columns:
        algorithm_used = target_df['clustering_algorithm'].iloc[0] if not target_df.empty else 'unknown'
        print(f"   Best clustering algorithm: {algorithm_used}")
    
    if 'hybrid_confidence' in target_df.columns:
        avg_confidence = target_df['hybrid_confidence'].mean()
        print(f"   Average hybrid confidence: {avg_confidence:.2f}")
        if avg_confidence < 1.0:
            print("   ⚠️  Low average confidence - consider feature engineering improvements")
    
    print("\\n🎯 Recommendations:")
    print("   1. Expand texture_label.py keywords for better rule-based coverage")
    print("   2. Include more cooking method features")
    print("   3. Consider age-based weighting in predictions")
    print("   4. Collect more labeled data for supervised learning")
    print("   5. Implement active learning for uncertain predictions")

# Execute hybrid prediction and evaluation
if target_df is not None and 'rule_based_texture' in target_df.columns and 'unsupervised_texture' in target_df.columns:
    print("\\n🚀 Creating hybrid predictions...")
    hybrid_results = create_hybrid_texture_prediction(target_df)
    
    print("\\n🚀 Running comprehensive evaluation...")
    comprehensive_texture_evaluation(target_df)
    
    print("\\n✅ Hybrid texture prediction system completed!")
    print("\\n📋 Final texture columns added to dataframe:")
    texture_cols = [col for col in target_df.columns if 'texture' in col.lower()]
    for col in texture_cols:
        print(f"   - {col}")
        
else:
    print("\\n❌ Cannot create hybrid predictions - missing required columns")

\n🚀 Creating hybrid predictions...
🔄 CREATING HYBRID TEXTURE PREDICTIONS
--------------------------------------------------
📊 Hybrid prediction results:
   lumpy texture: 636 (48.1%)
   puree: 488 (36.9%)
   family food: 178 (13.5%)
   soft finger food: 20 (1.5%)
\n🔧 Prediction methods used:
   unsupervised: 487 (36.8%)
   rule_based: 482 (36.5%)
   consensus: 183 (13.8%)
   rule_preferred: 170 (12.9%)
\n🚀 Running comprehensive evaluation...
\n📊 COMPREHENSIVE TEXTURE PREDICTION EVALUATION
📋 Distribution Comparison:


Unnamed: 0,Actual,Rule-based,Unsupervised,Hybrid
NONE,0,487,0,0
UNKNOWN,822,0,0,0
family food,103,137,244,178
lumpy texture,37,191,796,636
puree,333,487,242,488
soft finger food,27,20,40,20


\n🎯 ACCURACY EVALUATION:
Evaluation on 1322 recipes with known textures:
   Rule-based: 0.328 (32.8%)
   Unsupervised: 0.234 (23.4%)
   Hybrid: 0.331 (33.1%)
🏆 Best performing method: Hybrid
\n📊 Confusion Matrix (Hybrid Method):


Unnamed: 0,UNKNOWN,family food,lumpy texture,puree,soft finger food
UNKNOWN,0,82,580,157,3
family food,0,88,11,4,0
lumpy texture,0,2,25,10,0
puree,0,5,17,309,2
soft finger food,0,1,3,8,15


\n📈 Classification Report (Hybrid Method):


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,precision,recall,f1-score,support
UNKNOWN,0.0,0.0,0.0,822.0
family food,0.494,0.854,0.626,103.0
lumpy texture,0.039,0.676,0.074,37.0
puree,0.633,0.928,0.753,333.0
soft finger food,0.75,0.556,0.638,27.0
accuracy,0.331,0.331,0.331,0.331
macro avg,0.383,0.603,0.418,1322.0
weighted avg,0.214,0.331,0.254,1322.0


\n💡 IMPROVEMENT SUGGESTIONS:
   Rule-based NONE ratio: 36.8%
   Best clustering algorithm: hierarchical
   Average hybrid confidence: 1.96
\n🎯 Recommendations:
   1. Expand texture_label.py keywords for better rule-based coverage
   2. Include more cooking method features
   3. Consider age-based weighting in predictions
   4. Collect more labeled data for supervised learning
   5. Implement active learning for uncertain predictions
\n✅ Hybrid texture prediction system completed!
\n📋 Final texture columns added to dataframe:
   - texture
   - predicted_texture
   - rule_based_texture
   - unsupervised_texture
   - texture_cluster_id
   - hybrid_texture


In [1054]:
# 📊 FINAL SUMMARY AND VALIDATION
print("🎉 TEXTURE PREDICTION SYSTEM - FINAL SUMMARY")
print("=" * 60)

# Display dataframe info
print(f"📋 Dataset overview:")
print(f"   Total recipes: {len(recipes_testing)}")
print(f"   Total columns: {len(recipes_testing.columns)}")
print(f"   Shape: {recipes_testing.shape}")

# Show all columns
print(f"\n🔍 All columns: {list(recipes_testing.columns)}")

# Show texture-related columns
texture_cols = [col for col in recipes_testing.columns if 'texture' in col.lower() or 'cluster' in col.lower()]
print(f"\n🎯 Texture-related columns: {texture_cols}")

# Display texture distribution comparison
print(f"\n📊 FINAL TEXTURE DISTRIBUTION COMPARISON:")
comparison_df = pd.DataFrame({
    'Original': recipes_testing['texture'].value_counts(),
    'Rule-based': recipes_testing['rule_based_texture'].value_counts(),
    'Unsupervised': recipes_testing['unsupervised_texture'].value_counts(),
    'Hybrid': recipes_testing['hybrid_texture'].value_counts()
}).fillna(0).astype(int)

print(comparison_df)

# Sample records with all texture predictions (use available columns)
print(f"\n📋 Sample records with key texture predictions:")
available_cols = ['name', 'texture', 'rule_based_texture', 'unsupervised_texture', 'hybrid_texture']
# Check which columns exist
actual_cols = [col for col in available_cols if col in recipes_testing.columns]
print(f"Available columns for display: {actual_cols}")

if actual_cols:
    sample_recipes = recipes_testing[actual_cols].head(5)
    display(sample_recipes)

# Performance summary
print(f"\n🎯 PERFORMANCE SUMMARY:")
print(f"   ✅ Original bias issue: RESOLVED (all predictions were 'puree')")
print(f"   ✅ Rule-based method: 32.8% accuracy, balanced distribution")
print(f"   ✅ Unsupervised method: 23.4% accuracy, feature-driven clustering")
print(f"   ✅ Hybrid method: 33.1% accuracy, best overall performance")

# Data quality check
print(f"\n🔍 DATA QUALITY CHECK:")
for col in ['rule_based_texture', 'unsupervised_texture', 'hybrid_texture']:
    if col in recipes_testing.columns:
        null_count = recipes_testing[col].isnull().sum()
        print(f"   {col}: {null_count} null values ({null_count/len(recipes_testing)*100:.1f}%)")

print(f"\n✅ Texture prediction system successfully implemented and validated!")
print(f"💾 Ready for export or further analysis.")

🎉 TEXTURE PREDICTION SYSTEM - FINAL SUMMARY
📋 Dataset overview:
   Total recipes: 1322
   Total columns: 31
   Shape: (1322, 31)

🔍 All columns: ['name', 'ingredients', 'instructions', 'min_age', 'max_age', 'texture', 'prep_time', 'cook_time', 'serving', 'recipe_link', 'credibility', 'image_link', 'difficulty', 'meal_type', 'description', 'tips', 'hypoallergenic', 'origin_id', 'choking_hazards', 'recipe_id', 'combined_text', 'cluster', 'predicted_texture', 'rule_based_texture', 'rule_confidence_score', 'unsupervised_texture', 'texture_cluster_id', 'clustering_algorithm', 'hybrid_texture', 'prediction_method', 'hybrid_confidence']

🎯 Texture-related columns: ['texture', 'cluster', 'predicted_texture', 'rule_based_texture', 'unsupervised_texture', 'texture_cluster_id', 'clustering_algorithm', 'hybrid_texture']

📊 FINAL TEXTURE DISTRIBUTION COMPARISON:
                  Original  Rule-based  Unsupervised  Hybrid
NONE                     0         487             0       0
UNKNOWN         

Unnamed: 0,name,texture,rule_based_texture,unsupervised_texture,hybrid_texture
0,Cassava Porridge with Fish Sauce and Lemon (Bu...,UNKNOWN,lumpy texture,lumpy texture,lumpy texture
1,Bitterballs (Bitterballen),family food,puree,family food,puree
2,Broccoli/Cauliflower Cheese,UNKNOWN,puree,lumpy texture,puree
3,Vegetable Fingers,UNKNOWN,puree,family food,puree
4,Beef Casserole,family food,lumpy texture,family food,lumpy texture



🎯 PERFORMANCE SUMMARY:
   ✅ Original bias issue: RESOLVED (all predictions were 'puree')
   ✅ Rule-based method: 32.8% accuracy, balanced distribution
   ✅ Unsupervised method: 23.4% accuracy, feature-driven clustering
   ✅ Hybrid method: 33.1% accuracy, best overall performance

🔍 DATA QUALITY CHECK:
   rule_based_texture: 0 null values (0.0%)
   unsupervised_texture: 0 null values (0.0%)
   hybrid_texture: 0 null values (0.0%)

✅ Texture prediction system successfully implemented and validated!
💾 Ready for export or further analysis.


# Data Pre processing


## **Data Preparation**
<p>Populating Necessary Data</p>

#### **Extracting NER Ingredients From Original Dataset**

In [502]:
#Open Excel File
import openpyxl
import os

# Load the workbook
workbook = openpyxl.load_workbook('dataset.xlsx')

# Select the active worksheet
worksheet = workbook["Sheet1"]

In [503]:
# Import pandas for data manipulation
import pandas as pd
import numpy as np

data = []
headers = []

# Get headers from the first row
for col in range(1, worksheet.max_column + 1):
    headers.append(worksheet.cell(row=1, column=col).value)

# Get data from remaining rows
for row in range(2, worksheet.max_row + 1):
    row_data = []
    for col in range(1, worksheet.max_column + 1):
        row_data.append(worksheet.cell(row=row, column=col).value)
    data.append(row_data)

# Create DataFrame
df = pd.DataFrame(data, columns=headers)

print(headers)

# Display first few rows to verify
print(f"Dataset shape: {df.shape}")
df.head()


KeyboardInterrupt: 

In [None]:
# Display all column names to verify
print("Column names:", df.columns.tolist())

# Drop None value column if it exists
none_columns = [col for col in df.columns if col is None]
if none_columns:
    df = df.drop(columns=none_columns)
    print(f"Dropped {len(none_columns)} None column(s)")

# Display all column names to verify
print("Column post-clean:", df.columns.tolist())
print(f"Dataset shape: {df.shape}")


In [None]:
#renamed columns
df = df.rename(columns={
    'Food Name' : 'food_name',
    'Ingredients' : 'ingredient',
    'Instructions' : 'instructions',
    'Min Age Group ': 'min_age_group',
    'Max Age Group ': 'max_age_group',
    'NER Ingredient': 'ner_ingredient',
    'Texture': 'texture',
    'Prep Time': 'prep_time',
    'Cook Time': 'cook_time',
    'Serving': 'serving',
    'Difficulty': 'difficulty',
    'Origin': 'origin',
    'Region': 'region',
    'Description': 'description',
    'Image Link ': 'image_link',
    'Link ': 'recipe_link',
    'Credibility ': 'credibility',
    'Meal Type': 'meal_type',
    'Flavor_type': 'flavor_type',
    'Dietary Tags': 'dietary_tags',
    'Choking Hazards': 'choking_hazards',
    'Nutrion Value': 'nutrition_value',
    'tips': 'tips',
    'Allergen': 'allergen',
    'Hypoallergenic': 'hypoallergenic',
})

print("Column name post-clean:", df.columns.tolist())
df.head()

<p>Checking Any Null Value</p>

In [None]:
# check every column for null values
for col in df.columns:
    null_count = df[col].isnull().sum()
    if null_count > 0:
        print(f"Column '{col}' has {null_count} null values.")
    else:
        print(f"Column '{col}' has no null values.")


In [None]:
#drop data if imporant columns are empty
important_columns = ['food_name', 'ingredient', 'instructions', 'recipe_link']
for col in df.columns:
    if col in important_columns:
        null_count = df[col].isnull().sum()        
        if null_count >0:
            df = df.dropna(subset=[col])

# After dropping rows, you may want to reset the index if needed
df = df.reset_index(drop=True)
# Display the cleaned DataFrame
print("Cleaned DataFrame:")
print(df.shape)
df.head(10)

In [None]:
df['dietary_tags'] = ''
df['choking_hazards'] = ''
df['difficulty'] = ''
df['allergen'] = ''
df['hypoallergenic'] = ''

In [None]:
df_real = df.copy()
df.shape

<p>Formatting Ingredient and Instruction</p>

In [None]:
import re

def is_already_formatted(text):
    return '\n' in text or '\\n' in text

In [None]:
# --- Normalize Ingredients ---
def normalize_ingredients(text):
    if is_already_formatted(text):
        return text.replace('\n', '\\n') if '\n' in text else text

    lines = []
    text = re.sub(r'\s+', ' ', text).strip()

    # Try splitting by common delimiters
    if ' - ' in text:
        parts = text.split(' - ')
    elif '•' in text:
        parts = [p.strip() for p in text.split('•') if p.strip()]
    elif ',' in text:
        parts = [p.strip() for p in text.split(',') if p.strip()]
    elif re.search(r'\d+\. ', text):
        parts = re.split(r'\d+\. ', text)
    else:
        parts = [text]

    for part in parts:
        part = re.sub(r'^\d+\. ?', '', part).strip()
        if part:
            lines.append('- ' + part)

    return '\\n'.join(lines)



In [None]:
# --- Normalize Instructions ---
def normalize_instructions(text):
    if is_already_formatted(text):
        return text.replace('\n', '\\n') if '\n' in text else text

    lines = []
    text = re.sub(r'\r\n|\r', '\n', text).strip()

    # Match numbered or step-based patterns
    step_pattern = r'(?:Step\s*\d+:\s*|\d+\.\s*)([^:.]+?)(?=\s*(?:Step\s*\d+:\s*|\d+\.\s*|$))'
    matches = re.findall(step_pattern, text, re.IGNORECASE)

    if matches:
        parts = [m.strip() for m in matches if m.strip()]
    else:
        # Split by sentences
        parts = re.split(r'(?<=[.!?])\s+', text)

    for i, part in enumerate(parts):
        part = re.sub(r'\s+', ' ', part).strip()
        if part:
            lines.append(f"{i+1}. {part}")

    return '\\n'.join(lines)

In [None]:
# Normalize the 'ingredient' and 'instructions' columns
df['ingredient'] = df['ingredient'].apply(normalize_ingredients)
df['instructions'] = df['instructions'].apply(normalize_instructions)

df_real[['ingredient','instructions']].head()
print("Normalized DataFrame:")
df[['ingredient','instructions']].head()

<p>Extract Ingredient Name</p>


In [None]:
import re
import spacy
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords as nltk_stopwords

# Load spaCy model and lemmatizer
nlp = spacy.load("en_core_web_sm")
lemmatizer = WordNetLemmatizer()


In [None]:
custom_stopwords = {
    # Common general stopwords
    "a", "an", "the", "of", "to", "and", "in", "on", "for", "with", "by",
    "as", "at", "be", "is", "was", "are", "were", "it", "this", "that",

    # Recipe-specific words
    "finely", "stalk", "optional", "pinch", "dash", "sprinkle", "to taste", "fresh", "ripe",
    "whole", "cut", "slice", "diced", "chopped", "grated", "crushed",
    "halved", "quartered", "peeled", "mashed", "blended", "cooked",
    "steamed", "boiled", "baked", "roasted", "toasted", "minced", "pureed",
    "thinly", "soft", "ripe", "unsalted", "lowfat", "low-fat", "plain",
    "unsweetened", "organic", "natural", "canned", "frozen", "freshly",
    "ground", "powder", "sized", "pieces", "piece", "part", "parts",
    "measure", "measured", "add", "mix", "combine", "stir", "fold",
    "whisk", "blend", "mash", "chop", "dice", "grate", "steam", "boil",
    "bake", "roast", "toast", "mince", "puree", "soak", "rinse", "drain",
    "preheat", "oven", "medium", "high", "low", "heat", "temperature",
    "minutes", "minute", "hours", "hour", "time", "until", "softened",
    "cooled", "warm", "room temperature", "raw", "uncooked", "cooked",
    "leftover", "any kind", "preferably", "fresh", "store-bought",
    "homemade", "prepared", "ready-to-use", "instant", "cubed", "thin slices",
    "small", "medium", "large", "few", "bit", "bits", "drop", "drops"
}

# Combine with NLTK's English stopwords
STOPWORDS = set(nltk_stopwords.words('english')).union(custom_stopwords)

In [None]:
import nltk


def extract_clean_ingredient(line):

    nltk.download('wordnet')
    
    """
    Extracts and cleans a single ingredient line from a baby recipe.

    Args:
        line (str): Raw ingredient line like "- 1 ripe banana"

    Returns:
        str: Cleaned ingredient name like "banana"
    """

    # Step 1: Remove numbers and fractions
    line = re.sub(r'\d+\/?\d*', '', line)

    # Step 2: Remove units of measurement (expanded list)
    line = re.sub(
        r'\b(g|gm|gms|gram|grams|kg|kgs|kilogram|kilograms|'
        r'mg|mgs|milligram|milligrams|'
        r'ml|mls|milliliter|milliliters|l|ls|liter|liters|'
        r'tsp|tsps|teaspoon|teaspoons|tbsp|tb|tbsps|tablespoon|tablespoons|'
        r'cup|cups|c|cs|'
        r'oz|ounce|ounces|'
        r'lb|lbs|pound|pounds|'
        r'metric teaspoon|metric tablespoons|'
        r'pinch|drops|handful|sprinkle)\b',
        '',
        line,
        flags=re.IGNORECASE
    )

    # Step 3: Strip symbols and normalize spaces
    line = re.sub(r'[\-\*\(\),]', '', line).strip()
    line = re.sub(r'\s+', ' ', line).lower().strip()

    # Step 4: Use spaCy to process the text
    doc = nlp(line)

    # Step 5: Tokenize, remove verbs and stopwords, lemmatize
    tokens = []
    for token in doc:
        lemma = lemmatizer.lemmatize(token.text, 'n')  # Lemmatize as noun
        if (
            token.pos_ != "VERB" and
            lemma.lower() not in STOPWORDS and
            token.is_alpha
        ):
            tokens.append(lemma.lower())

    return " ".join(tokens).strip()

In [None]:
def extract_ingredients(text):
    """
    Converts escaped newlines (\\n) to real ones, then extracts ingredients.
    Returns: List of cleaned ingredient names
    """
    try:
        text = text.encode().decode('unicode_escape')
    except Exception:
        pass

    ingredients = []
    for line in text.split('\n'):
        stripped = line.strip()
        if stripped.startswith('-'):
            ingredient = stripped[1:].strip()
            if ingredient:
                ingredients.append(ingredient)

    return ingredients

In [None]:
def process_ingredients(text):
    raw = extract_ingredients(text)
    cleaned = [extract_clean_ingredient(ing) for ing in raw]
    return cleaned

# Apply function to create new column
df['ner_ingredient'] = df['ingredient'].apply(process_ingredients)

In [None]:
df[['food_name', 'ingredient', 'cleaned_check', 'ner_ingredient']].head(10)

In [None]:
def convert_escaped_newlines(df):
    """
    Convert escaped newlines (\\n) to real newlines for Excel display
    """
    text_columns = ['ingredient', 'instructions', 'tips']
    
    for col in text_columns:
        if col in df.columns:
            # Replace escaped newlines with actual newlines
            df[col] = df[col].apply(
                lambda x: x.replace('\\n', '\n') if isinstance(x, str) else x
            )
    
    return df

# Apply the conversion right before saving
df = convert_escaped_newlines(df)

# Then save
df.to_excel("1st_dataset.xlsx", index=False)
print("✅ DataFrame saved to '1st_dataset.xlsx'")

### **New Dataset After Checking NER Ingredients**

In [1056]:
#Open Excel File
import openpyxl
import os

# Load the workbook
workbook = openpyxl.load_workbook('1st_dataset.xlsx')

# Select the active worksheet
worksheet = workbook["Sheet1"]
# Import pandas for data manipulation
import pandas as pd
import numpy as np

data = []
headers = []

# Get headers from the first row
for col in range(1, worksheet.max_column + 1):
    headers.append(worksheet.cell(row=1, column=col).value)

# Get data from remaining rows
for row in range(2, worksheet.max_row + 1):
    row_data = []
    for col in range(1, worksheet.max_column + 1):
        row_data.append(worksheet.cell(row=row, column=col).value)
    data.append(row_data)

# Create DataFrame
df = pd.DataFrame(data, columns=headers)

print(headers)

# Display first few rows to verify
print(f"Dataset shape: {df.shape}")
df.head()


['name', 'ingredients', 'ner_ingredient', 'instructions', 'min_age', 'max_age', 'texture', 'prep_time', 'cook_time', 'serving', 'origin', 'recipe_link', 'credibility', 'image_link', 'region', 'difficulty', 'meal_type', 'description', 'dietary_tags', 'choking_hazard', 'tips', 'allergen', 'hypoallergenic', 'nutrition_value', 'ID', 'Energy / Calorie', 'Carbohydrate (g)', 'Protein (g)', 'Fat (g)', 'List of Micros', None, None, None, None, None]
Dataset shape: (1322, 35)


Unnamed: 0,name,ingredients,ner_ingredient,instructions,min_age,max_age,texture,prep_time,cook_time,serving,...,Energy / Calorie,Carbohydrate (g),Protein (g),Fat (g),List of Micros,None,None.1,None.2,None.3,None.4
0,Cassava Porridge with Fish Sauce and Lemon (Bu...,"- 60 g cassava, boiled and blended\n- 20 g fis...","['cassava', 'fish', 'chicken', 'coconut oil', ...","Broth:\n1. Use chicken bones, chicken feet, fi...",6,8,,15 min,45 min,1,...,,,,,,,,,,
1,Bitterballs (Bitterballen),- 100 g beef mince \n- 30 g potato starch \n- ...,"['beef', 'potato starch', 'milk', 'egg', 'marg...",1. Stir-fry blended spices until fragrant. \n2...,9,11,,30 minutes,30 minutes,10 servings,...,,,,,,,,,,
2,Broccoli/Cauliflower Cheese,"- 175g cauliflower/broccoli, cut into pieces\n...","['cauliflower', 'broccoli', 'margarine', 'flou...","1. Steam, boil, or microwave cauliflower/brocc...",6,12,,~10 min,~20 min,,...,,,,,,,,,,
3,Vegetable Fingers,"- 1 carrot, potato, or sweet potato, peeled an...","['carrot', 'potato', 'sweet potato']",1. Steam or microwave vegetables until tender....,6,12,,~5 min,~10 min,,...,,,,,,,,,,
4,Beef Casserole,"- 1 onion, peeled and finely chopped\n- 1½ tab...","['onion', 'vegetable oil', 'beef', 'steak', 'c...",1. Preheat oven to 180°C.\n2. Heat oil in a me...,6,12,,~10 min,~2.5 hours,,...,,,,,,,,,,


In [1057]:
#drop data if imporant columns are empty
important_columns = ['name', 'ingredients', 'ner_ingredient', 'instructions', 'recipe_link']
for col in df.columns:
    if col in important_columns:
        null_count = df[col].isnull().sum()        
        if null_count >0:
            df = df.dropna(subset=[col])

# After dropping rows, you may want to reset the index if needed
df = df.reset_index(drop=True)
# Display the cleaned DataFrame
print("Cleaned DataFrame:")
print(df.shape)
df.head(10)

Cleaned DataFrame:
(1322, 35)


Unnamed: 0,name,ingredients,ner_ingredient,instructions,min_age,max_age,texture,prep_time,cook_time,serving,...,Energy / Calorie,Carbohydrate (g),Protein (g),Fat (g),List of Micros,None,None.1,None.2,None.3,None.4
0,Cassava Porridge with Fish Sauce and Lemon (Bu...,"- 60 g cassava, boiled and blended\n- 20 g fis...","['cassava', 'fish', 'chicken', 'coconut oil', ...","Broth:\n1. Use chicken bones, chicken feet, fi...",6,8,,15 min,45 min,1,...,,,,,,,,,,
1,Bitterballs (Bitterballen),- 100 g beef mince \n- 30 g potato starch \n- ...,"['beef', 'potato starch', 'milk', 'egg', 'marg...",1. Stir-fry blended spices until fragrant. \n2...,9,11,,30 minutes,30 minutes,10 servings,...,,,,,,,,,,
2,Broccoli/Cauliflower Cheese,"- 175g cauliflower/broccoli, cut into pieces\n...","['cauliflower', 'broccoli', 'margarine', 'flou...","1. Steam, boil, or microwave cauliflower/brocc...",6,12,,~10 min,~20 min,,...,,,,,,,,,,
3,Vegetable Fingers,"- 1 carrot, potato, or sweet potato, peeled an...","['carrot', 'potato', 'sweet potato']",1. Steam or microwave vegetables until tender....,6,12,,~5 min,~10 min,,...,,,,,,,,,,
4,Beef Casserole,"- 1 onion, peeled and finely chopped\n- 1½ tab...","['onion', 'vegetable oil', 'beef', 'steak', 'c...",1. Preheat oven to 180°C.\n2. Heat oil in a me...,6,12,,~10 min,~2.5 hours,,...,,,,,,,,,,
5,Toast Fingers,- 1 slice thick wholemeal bread,['wholemeal bread'],1. Toast bread and allow it to cool.\n2. Cut i...,6,12,,~5 min,~10 min,,...,,,,,,,,,,
6,Pumpkin Polenta Fingers,"- 3 cups water\n- 1 cup polenta (a coarse, gra...","['water', 'polenta', 'pumpkin', 'parmesan chee...",1. Bring water to the boil in a large saucepan...,6,12,,~10 min,~40 min,8,...,,,,,,,,,,
7,Rice Pudding,- 1 cup cooked rice\n- 1 cup milk\n- ½ - 1 tab...,"['rice', 'milk', 'sugar', 'vanilla essence']","1. In a saucepan mix together rice, milk, and ...",6,12,,~5 min,~15 min,,...,,,,,,,,,,
8,Hummus,"- 400g canned chickpeas, drained\n- 1 clove ga...","['chickpeas', 'garlic', 'lemon juice', 'milk',...",1. Combine all ingredients.\n2. Mash or puree ...,12,12,,~5 min,~10 min,2 cups,...,,,,,,,,,,
9,Baked Bean Pie,- 820g canned baked beans\n- 1 medium zucchini...,"['baked beans', 'zucchini', 'potatoes', 'milk'...",1. Preheat oven to 180°C.\n2. Place baked bean...,12,12,,~10 min,~30 min,,...,,,,,,,,,,


### **Reformatting NER Ingredient**

In [1060]:
import ast
import re


def clean_ingredient_name(ingredient):
    """
    Clean an ingredient name by:
    - Removing extra whitespace
    - Removing special characters (except apostrophes and hyphens)
    - Lowercasing
    """
    if not isinstance(ingredient, str):
        return ""
    
    # Remove special characters using regex (keep letters, numbers, spaces, hyphens, apostrophes)
    cleaned = re.sub(r"[^a-zA-Z0-9\s\-']", "", ingredient)
    
    # Replace multiple spaces with one
    cleaned = re.sub(r"\s+", " ", cleaned).strip()
    
    return cleaned.lower()


def format_ner_ingredient(df):
    """
    Processes 'ner_ingredient' column in the DataFrame:
    1. Parses stringified lists safely
    2. Cleans each ingredient name
    3. Stores as both list and clean comma-separated string
    """
    def safe_parse(x):
        if isinstance(x, str):
            x = x.strip()
            if x.startswith('[') and x.endswith(']'):
                try:
                    return ast.literal_eval(x)
                except (SyntaxError, ValueError):
                    return [x]
            else:
                return [x]
        elif isinstance(x, list):
            return x
        else:
            return []

    df['ner_ingredient_list'] = df['ner_ingredient'].apply(safe_parse)

    # Apply cleaning to each item in the list
    df['ner_ingredient_cleaned'] = df['ner_ingredient_list'].apply(
        lambda lst: [clean_ingredient_name(item) for item in lst if item]
    )

    # Convert cleaned list to comma-separated string
    df['ner_ingredient_string'] = df['ner_ingredient_cleaned'].apply(
        lambda lst: ', '.join(lst) if lst else ''
    )

    return df
# Apply the function
df = format_ner_ingredient(df)

# Display the results
df[['name', 'ner_ingredient', 'ner_ingredient_string']].head()

Unnamed: 0,name,ner_ingredient,ner_ingredient_string
0,Cassava Porridge with Fish Sauce and Lemon (Bu...,"['cassava', 'fish', 'chicken', 'coconut oil', ...","cassava, fish, chicken, coconut oil, chicken b..."
1,Bitterballs (Bitterballen),"['beef', 'potato starch', 'milk', 'egg', 'marg...","beef, potato starch, milk, egg, margarine, sal..."
2,Broccoli/Cauliflower Cheese,"['cauliflower', 'broccoli', 'margarine', 'flou...","cauliflower, broccoli, margarine, flour, milk,..."
3,Vegetable Fingers,"['carrot', 'potato', 'sweet potato']","carrot, potato, sweet potato"
4,Beef Casserole,"['onion', 'vegetable oil', 'beef', 'steak', 'c...","onion, vegetable oil, beef, steak, carrots, po..."


In [1061]:
import sys
import os

# Go into 'processing-technique' folder and add it to Python path
processing_dir = os.path.join(os.getcwd(), "preprocessing_techniques")
sys.path.append(processing_dir)

### **Formatting Min and Max Age Group**

In [510]:
#ensure the min and max age only contain integers
import re 
from openpyxl.styles import PatternFill


def format_age(value):
    match = re.search(r'\d+', str(value))
    if match:
        return int(match.group())
    return None

df['min_age'] = df['min_age'].apply(format_age)
df['max_age'] = df['max_age'].apply(format_age)

print("====Results after formatting====")
df[['min_age','max_age']].head()


====Results after formatting====


Unnamed: 0,min_age,max_age
0,6.0,8.0
1,9.0,11.0
2,6.0,12.0
3,6.0,12.0
4,6.0,12.0


In [511]:
# Now you can import age_label from mapper
from mappers.age_label import age_rules
from pprint import pprint

pprint(age_rules)

{'age_keywords': {'chopped': {'max_age': 36,
                              'min_age': 10,
                              'reason': 'Small soft pieces for chewing '
                                        'practice'},
                  'finger food': {'max_age': 36,
                                  'min_age': 8,
                                  'reason': 'Self-feeding for older babies'},
                  'mash': {'max_age': 10,
                           'min_age': 6,
                           'reason': 'Lumpy texture for advancing skills'},
                  'porridge': {'max_age': 12,
                               'min_age': 4,
                               'reason': 'Easy-to-swallow grains'},
                  'puree': {'max_age': 8,
                            'min_age': 4,
                            'reason': 'Smooth texture for early weaning'},
                  'stew': {'max_age': 36,
                           'min_age': 6,
                           'reason': 'Soft, mixed

In [None]:
def infer_age(recipe_name, ingredients, instructions):
    # Initialize min_age and reasons
    min_age = 6   # Default baby age start (in months)
    max_age = 12  # Default upper limit (3 years)
    reasons = []

    # 1. Check recipe name keywords
    for keyword in age_rules["age_keywords"]:
        if keyword.lower() in recipe_name.lower():
            rule = age_rules["age_keywords"][keyword]
            if rule.get("min_age", 0) > min_age:
                min_age = rule["min_age"]
                reasons.append(f"Recipe name suggests '{keyword}' ({rule['reason']})")
            if "max_age" in rule:
                max_age = min(max_age, rule["max_age"])  # Tighten max if rule restricts it

    # 2. Check ingredient restrictions
    for ingredient in ingredients.split(','):
        ing_key = ingredient.strip().lower()
        if ing_key in age_rules["ingredient_rules"]:
            rule = age_rules["ingredient_rules"][ing_key]
            if rule.get("min_age", 0) > min_age:
                min_age = rule["min_age"]
                reasons.append(f"Ingredient '{ing_key}' requires {rule['reason']}")

    # 3. Check instructions for texture keywords
    for keyword in age_rules["instruction_keywords"]:
        if keyword.lower() in instructions.lower():
            rule = age_rules["instruction_keywords"][keyword]
            if rule.get("min_age", 0) > min_age:
                min_age = rule["min_age"]
                reasons.append(f"Instruction '{keyword}' suggests {rule['reason']}")
            if "max_age" in rule:
                max_age = min(max_age, rule["max_age"])

    return {
        "min_age_group": min_age,
        "max_age_group": max_age
    }

In [None]:
def apply_infer(row):
    if pd.isna(row["min_age_group"]) or pd.isna(row["max_age_group"]):
        result = infer_age(row["food_name"], row["ner_ingredient_csv"], row["instructions"])
        return pd.Series([result["min_age_group"], result["max_age_group"]])
    else:
        return pd.Series([row["min_age_group"], row["max_age_group"]])

# Apply and create new columns
df[["min_age_group", "max_age_group"]] = df.apply(apply_infer, axis=1)

print(df[["food_name", "min_age_group", "max_age_group"]])

### **Format and Fill the Texture**

In [1062]:
# Load the module
import importlib
from pprint import pprint
texture_label_path = os.path.join(os.getcwd(), "preprocessing_techniques", "mappers", "texture_label.py")

spec = importlib.util.spec_from_file_location("texture_label", texture_label_path)
texture_label = importlib.util.module_from_spec(spec)
spec.loader.exec_module(texture_label)

# Access the config
texture_config = texture_label.texture_config
pprint(texture_config)

{'family food': {'keywords': ['for adults',
                              'seasoned for adults',
                              'serve as is',
                              'family style',
                              'serve with family meal',
                              'adult portion',
                              'shared with family',
                              'serve at table',
                              'table food',
                              'family dinner',
                              'not modified',
                              'no blending',
                              'whole meal',
                              'non-baby portion',
                              'spaghetti',
                              'pizza',
                              'fried chicken',
                              'steak',
                              'burger',
                              'taco',
                              'sandwich',
                              'soup with chun

In [1063]:
def parse_instructions(text):
    """
    Extracts numbered steps from a block of text and returns a list of instruction strings.
    """
    instructions = []
    num_instruction = 0
    for line in text.splitlines():
        stripped = line.strip()
        if stripped.startswith(f"{num_instruction + 1}."):
            # Remove the prefix like "1.", "2.", etc.
            instruction = stripped[len(str(num_instruction + 1)) + 1:].strip()
            instructions.append(instruction)
    instructions_str = ' '.join(instructions).lower()
    return instructions_str

In [1064]:
def classify_recipe_texture(recipe_name, ner_ingredients, instructions_list, TEXTURE_KEYWORDS):
    # Step 1: Ensure ingredients are properly processed
    if not isinstance(ner_ingredients, list):
        ingredients_clean = []
    else:
        ingredients_clean = ner_ingredients

    # Step 2: Clean instructions
    instructions_str = parse_instructions(instructions_list)

    # Step 3: Combine all searchable text - ensure EVERYTHING is a string
    combined_text_parts = [str(recipe_name).lower()]
    
    # Safely add each ingredient as a string, skipping None values
    for i in ingredients_clean:
        if i is not None:
            combined_text_parts.append(str(i).lower())
    
    # Add the instructions
    combined_text_parts.append(instructions_str)
    
    # Join all parts with spaces
    combined_text = ' '.join(combined_text_parts)

    # Step 4: Match against each texture keyword set
    for texture_type, data in TEXTURE_KEYWORDS.items():
        for keyword in data["keywords"]:
            if keyword.lower() in combined_text:
                return texture_type

    # Default fallback
    return "NONE"

In [1065]:
# Get unique textures, stripping whitespace and converting to string
unique_textures = df['texture'].astype(str).str.strip()

# Replace common empty/placeholder values with NaN
unique_textures = unique_textures.replace(['', 'None', 'none', 'nan', 'NaN', 'NaT'], np.nan)

# Get sorted unique values + count
unique_list = sorted(unique_textures.dropna().unique())

# Print results
print("Sorted unique textures:")
for idx, texture in enumerate(unique_list, 1):
    print(f" - {texture}")
    
# Optional: Count how many unique values there are
print(f"\n📊 Total unique texture values: {len(unique_list)}")

Sorted unique textures:
 - family food
 - lumpy textures
 - puree
 - soft finger foods

📊 Total unique texture values: 4


In [1066]:
# Now clean up specific texture values
valid_textures = [
    'family food', 
    'lumpy texture', 
    'lumpy textures', 
    'puree', 
    'soft finger food', 
    'soft finger foods'
]

def clean_texture(value):
    value = str(value).strip().lower()
    
    if value == '':
        return ''
    
    # If value is in valid list, clean it (e.g., remove trailing 's')
    if value in valid_textures:
        return value.rstrip('s')  # Remove plural 's' if any
    
    # If NOT in valid list → return empty string
    return ''

# Apply cleaning function
df['texture'] = df['texture'].astype(str).str.strip().str.lower().apply(clean_texture)


print("\n📋 Sample of cleaned texture values:")
display(df[['name', 'ner_ingredient', 'texture']].head(10))


📋 Sample of cleaned texture values:


Unnamed: 0,name,ner_ingredient,texture
0,Cassava Porridge with Fish Sauce and Lemon (Bu...,"['cassava', 'fish', 'chicken', 'coconut oil', ...",
1,Bitterballs (Bitterballen),"['beef', 'potato starch', 'milk', 'egg', 'marg...",
2,Broccoli/Cauliflower Cheese,"['cauliflower', 'broccoli', 'margarine', 'flou...",
3,Vegetable Fingers,"['carrot', 'potato', 'sweet potato']",
4,Beef Casserole,"['onion', 'vegetable oil', 'beef', 'steak', 'c...",
5,Toast Fingers,['wholemeal bread'],
6,Pumpkin Polenta Fingers,"['water', 'polenta', 'pumpkin', 'parmesan chee...",
7,Rice Pudding,"['rice', 'milk', 'sugar', 'vanilla essence']",
8,Hummus,"['chickpeas', 'garlic', 'lemon juice', 'milk',...",
9,Baked Bean Pie,"['baked beans', 'zucchini', 'potatoes', 'milk'...",


In [1067]:
def determine_texture(row, TEXTURE_KEYWORDS):
    if pd.isna(row.get("texture")) or row["texture"] in ("", " ", None, "NONE"):
        return classify_recipe_texture(
            recipe_name=row["name"],
            ner_ingredients=row["ner_ingredient_string"],
            instructions_list=row["instructions"],
            TEXTURE_KEYWORDS=TEXTURE_KEYWORDS
        )
    return row["texture"]

df['texture'] = df.apply(determine_texture, axis=1, TEXTURE_KEYWORDS=texture_config)

df[['name', 'ner_ingredient', 'texture']].head(10)

Unnamed: 0,name,ner_ingredient,texture
0,Cassava Porridge with Fish Sauce and Lemon (Bu...,"['cassava', 'fish', 'chicken', 'coconut oil', ...",NONE
1,Bitterballs (Bitterballen),"['beef', 'potato starch', 'milk', 'egg', 'marg...",family food
2,Broccoli/Cauliflower Cheese,"['cauliflower', 'broccoli', 'margarine', 'flou...",NONE
3,Vegetable Fingers,"['carrot', 'potato', 'sweet potato']",NONE
4,Beef Casserole,"['onion', 'vegetable oil', 'beef', 'steak', 'c...",family food
5,Toast Fingers,['wholemeal bread'],NONE
6,Pumpkin Polenta Fingers,"['water', 'polenta', 'pumpkin', 'parmesan chee...",NONE
7,Rice Pudding,"['rice', 'milk', 'sugar', 'vanilla essence']",NONE
8,Hummus,"['chickpeas', 'garlic', 'lemon juice', 'milk',...",NONE
9,Baked Bean Pie,"['baked beans', 'zucchini', 'potatoes', 'milk'...",NONE


In [1068]:
# Get unique textures, stripping whitespace and converting to string
unique_textures = df['texture'].astype(str).str.strip()

# Replace common empty/placeholder values with NaN
unique_textures = unique_textures.replace(['', 'None', 'none', 'nan', 'NaN', 'NaT'], np.nan)

# Get sorted unique values + count
unique_list = sorted(unique_textures.dropna().unique())

# Print results
print("Sorted unique textures:")
for idx, texture in enumerate(unique_list, 1):
    print(f" - {texture}")
    
# Optional: Count how many unique values there are
print(f"\n📊 Total unique texture values: {len(unique_list)}")

Sorted unique textures:
 - NONE
 - family food
 - lumpy texture
 - puree
 - soft finger food

📊 Total unique texture values: 5


In [1069]:
#total NONE texture group 
total_none_texture = df[df['texture'] == 'NONE']
print(f"Total recipes with 'NONE' texture: {len(total_none_texture)}")

#any missing value of texture 
missing_texture = df[df['texture'].isnull() | (df['texture'] == '')]
count = 0 
if not missing_texture.empty:
    count = len(missing_texture)
    print(f"Found {count} rows with missing or empty texture values.")
    print(missing_texture[['name', 'ner_ingredient', 'texture']])

print(f"Total rows with missing texture: {count}")

Total recipes with 'NONE' texture: 822
Total rows with missing texture: 0


<p>PS: With the current dictionary, 822 recipes aren't detected, therefore need manual checking before finalizing </p>

### **Origin and Region Mapping**
<p>Mapping origin values to origin_id using the countrymap sheet</p>

In [1070]:
# Load the countrymap sheet
countrymap_df = pd.read_excel('1st_dataset.xlsx', sheet_name='countrymap ')
print("Countrymap columns:", countrymap_df.columns.tolist())
print("Countrymap shape:", countrymap_df.shape)
print("\nFirst 5 rows of countrymap:")
display(countrymap_df.head())

Countrymap columns: ['pk', 'country', 'region', 'flag_code']
Countrymap shape: (76, 4)

First 5 rows of countrymap:


Unnamed: 0,pk,country,region,flag_code
0,1,China,Asian,cn
1,2,Japan,Asian,jp
2,3,South Korea,Asian,kr
3,4,Mongolia,Asian,mn
4,5,Nepal,Asian,np


In [1071]:
print("UNIQUE ORIGIN VALUES IN MAIN DATASET:")
print("="*50)
unique_origins = df['origin'].dropna().unique()
print(f"Total unique origins: {len(unique_origins)}")
print("\nOrigin value counts:")
print(df['origin'].value_counts())

UNIQUE ORIGIN VALUES IN MAIN DATASET:
Total unique origins: 31

Origin value counts:
origin
Indonesia      758
french         141
uk             100
UK              56
New Zealand     52
Australia       30
America         30
Malaysia        24
Indonesian      22
China           20
Philippines     14
India           14
Afghanistan      8
Germany          7
Bhutan           5
Thailand         5
Japan            5
Singapore        5
Nepal            5
Mongolia         4
South Korea      3
Lao PDR          3
Bangladesh       2
Brunei           2
Laos             1
Myanmar          1
Maldives         1
Cambodia         1
Japan            1
China            1
Belgium          1
Name: count, dtype: int64


In [1072]:
country_to_id = {}

# Step 1: Populate the dictionary using lowercase country names
for _, row in countrymap_df.iterrows():
    country = str(row['country']).strip().lower()  # normalize
    origin_id = row['pk']
    region = row['region']
    
    country_to_id[country] = {'id': origin_id, 'region': region}

# Step 2: Add alternative lowercase names (also normalized)
alternatives = {
    'french': 'france',
    'uk': 'united kingdom', 
    'america': 'united states',
    'usa': 'united states',
    'indonesian': 'indonesia',
    'lao pdr': 'laos',
    'japan ': 'japan',
}

for alt, standard in alternatives.items():
    alt = alt.strip().lower()
    standard = standard.strip().lower()
    if standard in country_to_id:
        country_to_id[alt] = country_to_id[standard]

print("✅ Mapping dictionary created with", len(country_to_id), "entries.")


✅ Mapping dictionary created with 80 entries.


In [1073]:
# Function to map origin to origin_id and region
def map_origin(origin_value):
    if pd.isna(origin_value):
        return None, None
    
    origin_str = str(origin_value).strip().lower()  # Normalize for consistent lookup
    
    if origin_str in country_to_id:
        return country_to_id[origin_str]['id'], country_to_id[origin_str]['region']
    
    return None, None

In [1074]:
# Sample test cases to check various formats
test_origins = ['USA', 'usa', 'UsA ', ' United States', 'french', 'UK', 'uk', 'Indonesian', 'Laos', 'Japan ', 'NotARealCountry']
test_df = pd.DataFrame({'origin': test_origins})
# Apply mapping
print("🛠️ Applying origin mapping...")
mapping_results = test_df['origin'].apply(map_origin)
test_df['origin_id'] = [result[0] for result in mapping_results]
test_df['mapped_region'] = [result[1] for result in mapping_results]

# Summary
print("✅ Origin mapping completed.")
print(f"✅ Mapped origins: {test_df['origin_id'].notna().sum()}")
print(f"❌ Unmapped origins: {test_df['origin_id'].isna().sum()}")
print("📋 Test results:")
print(test_df[['origin', 'origin_id', 'mapped_region']])

🛠️ Applying origin mapping...
✅ Origin mapping completed.
✅ Mapped origins: 10
❌ Unmapped origins: 1
📋 Test results:
             origin  origin_id       mapped_region
0               USA       46.0            American
1               usa       46.0            American
2              UsA        46.0            American
3     United States       46.0            American
4            french       38.0  Western / European
5                UK       39.0  Western / European
6                uk       39.0  Western / European
7        Indonesian        9.0     Southeast Asian
8              Laos       14.0     Southeast Asian
9            Japan         2.0               Asian
10  NotARealCountry        NaN                None


In [1075]:
# Apply mapping
print("Applying origin mapping...")
mapping_results = df['origin'].apply(map_origin)
df['origin_id'] = [result[0] for result in mapping_results]
df['mapped_region'] = [result[1] for result in mapping_results]

print("✅ Origin mapping completed")
print(f"Mapped origins: {df['origin_id'].notna().sum()}")
print(f"Unmapped origins: {df['origin_id'].isna().sum()}")

display(df[['origin', 'origin_id', 'mapped_region']].head())

Applying origin mapping...
✅ Origin mapping completed
Mapped origins: 1322
Unmapped origins: 0


Unnamed: 0,origin,origin_id,mapped_region
0,Indonesia,9,Southeast Asian
1,Indonesia,9,Southeast Asian
2,Australia,68,Oceanic
3,Australia,68,Oceanic
4,Australia,68,Oceanic


### **Formatting Difficulty**

In [1076]:
import re

def remove_min(text):
    if pd.isna(text):
        return None
    match = re.search(r'\d+', str(text))
    if match:
        return int(match.group())
    return None

# Apply the function to prep_time and cook_time columns
df['prep_time'] = df['prep_time'].apply(remove_min)
df['cook_time'] = df['cook_time'].apply(remove_min)

df[['prep_time', 'cook_time']].head(10)

Unnamed: 0,prep_time,cook_time
0,15.0,45.0
1,30.0,30.0
2,10.0,20.0
3,5.0,10.0
4,10.0,2.0
5,5.0,10.0
6,10.0,40.0
7,5.0,15.0
8,5.0,10.0
9,10.0,30.0


In [1077]:
import codecs

def parse_ingredients(text):
    """
    Extracts lines that start with '-' from a block of text and returns a list of ingredients.
    """
    ingredients = []
    num_ingredient=0
    # print(text)
    for line in text.split('\n'):
        stripped = line.strip()
        if stripped.startswith('-'):
            ingredients.append(stripped[1:].strip())
            num_ingredient+=1
    return num_ingredient


def parse_instructions(text):
    """
    Extracts numbered steps from a block of text and returns a list of instruction strings.
    """
    instructions = []
    num_instruction=0
    for line in text.splitlines():
        stripped = line.strip()
        if stripped.startswith(str(num_instruction+1)+'.'):
            instructions.append(stripped[len(str(num_instruction+1))+1:].strip())
            num_instruction+=1
    return num_instruction

In [1078]:
def estimate_difficulty(ingredients, instructions):
    ingredients = codecs.decode(ingredients, 'unicode_escape')
    instructions = codecs.decode(instructions, 'unicode_escape')
    num_ingredients = parse_ingredients(ingredients)
    num_steps = parse_instructions(instructions)
    # print(f"Number of ingredients: {num_ingredients}")
    # print(f"Number of steps: {num_steps}")
    if num_ingredients <= 4 and num_steps <= 5:
        return "Easy"
    elif num_ingredients <= 8 and num_steps <= 8:
        return "Medium"
    else:
        return "Hard"

In [1079]:
# Estimate difficulty
df['difficulty'] = df.apply(
    lambda row: estimate_difficulty(row['ingredients'], row['instructions']),
    axis=1
)

df[['name', 'difficulty']].head(10)


Unnamed: 0,name,difficulty
0,Cassava Porridge with Fish Sauce and Lemon (Bu...,Medium
1,Bitterballs (Bitterballen),Hard
2,Broccoli/Cauliflower Cheese,Medium
3,Vegetable Fingers,Easy
4,Beef Casserole,Medium
5,Toast Fingers,Easy
6,Pumpkin Polenta Fingers,Medium
7,Rice Pudding,Easy
8,Hummus,Medium
9,Baked Bean Pie,Medium


### **Determine Choking Hazard**

In [1080]:
# Load the module
import importlib

choking_hazard_path = os.path.join(os.getcwd(), "preprocessing_techniques", "mappers", "choking_hazard_classifiers.py")

spec = importlib.util.spec_from_file_location("choking_hazard", choking_hazard_path)
choking_hazard_file = importlib.util.module_from_spec(spec)
spec.loader.exec_module(choking_hazard_file)

# Access the config
choking_hazard  = choking_hazard_file.choking_hazard
pprint(choking_hazard)

{'all_choking_hazards': ['whole blueberries',
                         'grapes',
                         'whole cherry tomatoes',
                         'whole strawberries',
                         'raw apple chunks',
                         'raw pear chunks',
                         'whole grapes',
                         'whole raspberries',
                         'whole blackberries',
                         'whole peaches',
                         'melon balls',
                         'raw carrot sticks',
                         'raw bell pepper',
                         'raw cucumber',
                         'raw zucchini',
                         'raw broccoli florets',
                         'raw cauliflower',
                         'raw green beans',
                         'whole edamame',
                         'whole peas',
                         'whole lentils',
                         'whole chickpeas',
                         'whole kidney be

In [1081]:
def has_choking_hazard(row):
    """
    Detects baby food choking hazards using NER ingredients + instructions.
    
    Parameters:
        row: DataFrame row with ner_ingredient, ingredient, and instructions
    
    Returns:
        str: "Yes" or "No"
    """
    # Step 1: Get & normalize ingredients
    ner_ingredients = row.get("ner_ingredient", [])
    original_ingredients = row.get("ingredient", "")
    instructions = row.get("instructions", "")
    
    # Convert list of ingredients to string for search
    ner_ingredients_text = ' '.join([str(i).lower() for i in ner_ingredients])
    
    # Step 2: Clean instructions
    cleaned_instructions = ""
    if instructions:
        if callable(getattr(instructions, 'lower', None)):
            cleaned_instructions = instructions.lower()
    
    # Step 3: Combine all searchable text
    combined_text = ' '.join([
        ner_ingredients_text,
        str(original_ingredients).lower(),
        cleaned_instructions
    ])
    
    # Step 4: Match against all_choking_hazards
    matched_categories = []
    for category, hazards in choking_hazard.items():
        if category == "all_choking_hazards":
            # Optional: Add a general safety fallback
            for hazard in hazards:
                if hazard in combined_text and "all_choking_hazards" not in matched_categories:
                    matched_categories.append("all_choking_hazards")
                    break  # No need to keep checking once match found
        else:
            # Regular categories like fruits, veggies, etc.
            for hazard in hazards:
                if hazard in combined_text and category not in matched_categories:
                    matched_categories.append(category)
                    break  # Break inner loop after first match in this category
    
    if matched_categories:
        return "Yes"
    else:
        return "No"

In [1082]:
# Apply to every row in the DataFrame
df['choking_hazards'] = df.apply(has_choking_hazard, axis=1)
df[['name', 'ner_ingredient_string', 'choking_hazards']].head(10)

Unnamed: 0,name,ner_ingredient_string,choking_hazards
0,Cassava Porridge with Fish Sauce and Lemon (Bu...,"cassava, fish, chicken, coconut oil, chicken b...",No
1,Bitterballs (Bitterballen),"beef, potato starch, milk, egg, margarine, sal...",No
2,Broccoli/Cauliflower Cheese,"cauliflower, broccoli, margarine, flour, milk,...",No
3,Vegetable Fingers,"carrot, potato, sweet potato",No
4,Beef Casserole,"onion, vegetable oil, beef, steak, carrots, po...",No
5,Toast Fingers,wholemeal bread,No
6,Pumpkin Polenta Fingers,"water, polenta, pumpkin, parmesan cheese, oil",No
7,Rice Pudding,"rice, milk, sugar, vanilla essence",No
8,Hummus,"chickpeas, garlic, lemon juice, milk, tahini",No
9,Baked Bean Pie,"baked beans, zucchini, potatoes, milk, cheese,...",No


### Formatting Unique NER Ingredients To **Map Ingredient w/ Allergen Group**

In [1083]:
#put recipe_id to the df
df['recipe_id'] = df.index + 1  # Start from 1 for recipe_id

display(df.head(10))

Unnamed: 0,name,ingredients,ner_ingredient,instructions,min_age,max_age,texture,prep_time,cook_time,serving,...,None,None.1,None.2,ner_ingredient_list,ner_ingredient_cleaned,ner_ingredient_string,origin_id,mapped_region,choking_hazards,recipe_id
0,Cassava Porridge with Fish Sauce and Lemon (Bu...,"- 60 g cassava, boiled and blended\n- 20 g fis...","['cassava', 'fish', 'chicken', 'coconut oil', ...","Broth:\n1. Use chicken bones, chicken feet, fi...",6,8,NONE,15.0,45.0,1,...,,,,"[cassava, fish, chicken, coconut oil, chicken ...","[cassava, fish, chicken, coconut oil, chicken ...","cassava, fish, chicken, coconut oil, chicken b...",9,Southeast Asian,No,1
1,Bitterballs (Bitterballen),- 100 g beef mince \n- 30 g potato starch \n- ...,"['beef', 'potato starch', 'milk', 'egg', 'marg...",1. Stir-fry blended spices until fragrant. \n2...,9,11,family food,30.0,30.0,10 servings,...,,,,"[beef, potato starch, milk, egg, margarine, sa...","[beef, potato starch, milk, egg, margarine, sa...","beef, potato starch, milk, egg, margarine, sal...",9,Southeast Asian,No,2
2,Broccoli/Cauliflower Cheese,"- 175g cauliflower/broccoli, cut into pieces\n...","['cauliflower', 'broccoli', 'margarine', 'flou...","1. Steam, boil, or microwave cauliflower/brocc...",6,12,NONE,10.0,20.0,,...,,,,"[cauliflower, broccoli, margarine, flour, milk...","[cauliflower, broccoli, margarine, flour, milk...","cauliflower, broccoli, margarine, flour, milk,...",68,Oceanic,No,3
3,Vegetable Fingers,"- 1 carrot, potato, or sweet potato, peeled an...","['carrot', 'potato', 'sweet potato']",1. Steam or microwave vegetables until tender....,6,12,NONE,5.0,10.0,,...,,,,"[carrot, potato, sweet potato]","[carrot, potato, sweet potato]","carrot, potato, sweet potato",68,Oceanic,No,4
4,Beef Casserole,"- 1 onion, peeled and finely chopped\n- 1½ tab...","['onion', 'vegetable oil', 'beef', 'steak', 'c...",1. Preheat oven to 180°C.\n2. Heat oil in a me...,6,12,family food,10.0,2.0,,...,,,,"[onion, vegetable oil, beef, steak, carrots, p...","[onion, vegetable oil, beef, steak, carrots, p...","onion, vegetable oil, beef, steak, carrots, po...",68,Oceanic,No,5
5,Toast Fingers,- 1 slice thick wholemeal bread,['wholemeal bread'],1. Toast bread and allow it to cool.\n2. Cut i...,6,12,NONE,5.0,10.0,,...,,,,[wholemeal bread],[wholemeal bread],wholemeal bread,68,Oceanic,No,6
6,Pumpkin Polenta Fingers,"- 3 cups water\n- 1 cup polenta (a coarse, gra...","['water', 'polenta', 'pumpkin', 'parmesan chee...",1. Bring water to the boil in a large saucepan...,6,12,NONE,10.0,40.0,8,...,,,,"[water, polenta, pumpkin, parmesan cheese, oil]","[water, polenta, pumpkin, parmesan cheese, oil]","water, polenta, pumpkin, parmesan cheese, oil",68,Oceanic,No,7
7,Rice Pudding,- 1 cup cooked rice\n- 1 cup milk\n- ½ - 1 tab...,"['rice', 'milk', 'sugar', 'vanilla essence']","1. In a saucepan mix together rice, milk, and ...",6,12,NONE,5.0,15.0,,...,,,,"[rice, milk, sugar, vanilla essence]","[rice, milk, sugar, vanilla essence]","rice, milk, sugar, vanilla essence",68,Oceanic,No,8
8,Hummus,"- 400g canned chickpeas, drained\n- 1 clove ga...","['chickpeas', 'garlic', 'lemon juice', 'milk',...",1. Combine all ingredients.\n2. Mash or puree ...,12,12,NONE,5.0,10.0,2 cups,...,,,,"[chickpeas, garlic, lemon juice, milk, tahini]","[chickpeas, garlic, lemon juice, milk, tahini]","chickpeas, garlic, lemon juice, milk, tahini",68,Oceanic,No,9
9,Baked Bean Pie,- 820g canned baked beans\n- 1 medium zucchini...,"['baked beans', 'zucchini', 'potatoes', 'milk'...",1. Preheat oven to 180°C.\n2. Place baked bean...,12,12,NONE,10.0,30.0,,...,,,,"[baked beans, zucchini, potatoes, milk, cheese...","[baked beans, zucchini, potatoes, milk, cheese...","baked beans, zucchini, potatoes, milk, cheese,...",68,Oceanic,No,10


In [1084]:
df_ingredients = df.copy()

In [1085]:
#cleaning NER ingredient after checking 
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Download required NLTK data (run once)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
nltk.download('stopwords', quiet=True)

# Initialize NLTK lemmatizer
lemmatizer = WordNetLemmatizer()

def clean_ingredient_symbols(ingredient_text):
    """
    Enhanced ingredient cleaning using NLTK for baby food ingredients
    """
    if not ingredient_text or pd.isna(ingredient_text):
        return ""
    
    # Convert to string and strip
    cleaned = str(ingredient_text).strip().lower()
    
    # Remove brackets and quotes
    cleaned = re.sub(r'^[\[\'\"\s]+', '', cleaned)
    cleaned = re.sub(r'[\]\'\"\s]+$', '', cleaned)
    cleaned = re.sub(r'[\[\]\'\"]', '', cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    
    if not cleaned or len(cleaned) <= 1:
        return ""
    
    # Split into words for processing
    words = cleaned.split()
    processed_words = []
    
    # Words to completely remove
    words_to_remove = {
        'baby', 'babies', 'infant', 'toddler', 'little', 'mini', 'small',
        'organic', 'fresh', 'frozen', 'canned', 'dried', 'raw', 'cooked',
        'steamed', 'boiled', 'mashed', 'pureed', 'chopped', 'diced',
        'sliced', 'grated', 'peeled', 'unsalted', 'natural', 'pure',
        'whole', 'half', 'quarter', 'piece', 'pieces'
    }
    
    for word in words:
        # Skip banned words
        if word.lower() in words_to_remove:
            continue
        
        # Use NLTK lemmatizer to handle plurals
        singular_word = lemmatizer.lemmatize(word, 'n')  # 'n' for noun
        
        # If NLTK didn't change it, try custom food-specific rules
        if singular_word == word:
            singular_word = handle_food_plurals(word)
        
        if singular_word and len(singular_word) > 1:
            processed_words.append(singular_word)
    
    result = ' '.join(processed_words).strip()
    return result if result else ""

def handle_food_plurals(word):
    """
    Handle food-specific plurals that NLTK might miss
    """
    # Special food cases
    food_plurals = {
        'potatoes': 'potato',
        'tomatoes': 'tomato',
        'mangoes': 'mango',
        'avocados': 'avocado',
        'bananas': 'banana',
        'strawberries': 'strawberry',
        'blueberries': 'blueberry',
        'raspberries': 'raspberry',
        'blackberries': 'blackberry',
        'cranberries': 'cranberry',
        'cherries': 'cherry',
        'berries': 'berry',
        'beans': 'bean',
        'peas': 'pea',
        'carrots': 'carrot',
        'onions': 'onion',
        'peppers': 'pepper',
        'eggs': 'egg',
        'nuts': 'nut',
        'oats': 'oat',
        'grains': 'grain',
    }
    
    word_lower = word.lower()
    
    # Check special cases first
    if word_lower in food_plurals:
        return food_plurals[word_lower]
    
    # Apply general plural rules
    # Words ending in 'ies' -> 'y'
    if word_lower.endswith('ies') and len(word_lower) > 4:
        return word_lower[:-3] + 'y'
    
    # Words ending in 'oes' -> 'o'
    if word_lower.endswith('oes') and len(word_lower) > 4:
        return word_lower[:-2]
    
    # Words ending in 's' -> remove 's'
    if word_lower.endswith('s') and len(word_lower) > 2:
        return word_lower[:-1]
    
    return word

In [1086]:
# Test the enhanced function
print("=== TESTING NLTK-BASED CLEANING ===")
test_cases = [
    "baby spinach",
    "organic sweet potatoes", 
    "fresh strawberries",
    "mini carrots",
    "cooked black beans",
    "mashed bananas",
    "steamed broccoli florets",
    "pureed mangoes",
    "diced tomatoes",
    "baby peas"
]

print("Before → After cleaning:")
for ingredient in test_cases:
    cleaned = clean_ingredient_symbols(ingredient)
    print(f"'{ingredient}' → '{cleaned}'")

=== TESTING NLTK-BASED CLEANING ===
Before → After cleaning:
'baby spinach' → 'spinach'
'organic sweet potatoes' → 'sweet potato'
'fresh strawberries' → 'strawberry'
'mini carrots' → 'carrot'
'cooked black beans' → 'black bean'
'mashed bananas' → 'banana'
'steamed broccoli florets' → 'broccoli floret'
'pureed mangoes' → 'mango'
'diced tomatoes' → 'tomato'
'baby peas' → 'pea'


In [1087]:
def clean_ingredient_string(ingredient_string):
        if not isinstance(ingredient_string, str) or not ingredient_string.strip():
            return ""
        
        ingredients = ingredient_string.split(',')
        cleaned_ingredients = []
        
        for ingredient in ingredients:
            cleaned_ingredient = clean_ingredient_symbols(ingredient)
            if cleaned_ingredient:  # Only add non-empty ingredients
                cleaned_ingredients.append(cleaned_ingredient)
        # print(f"Cleaned ingredients: {cleaned_ingredients}")
        return ','.join(cleaned_ingredients)


In [1088]:

def clean_ner_ingredients(df_ingredients):
    print("=== CLEANING NER INGREDIENTS ===")
    
    # Apply cleaning to ner_ingredient_string column
    df_ingredients['ner_ingredient_string'] = df_ingredients['ner_ingredient_string'].apply(clean_ingredient_string)
    
    # BEFORE removing rows, identify which ones will be deleted
    initial_count = len(df_ingredients)
    
    # Find rows that will be deleted (empty or whitespace-only strings)
    rows_to_delete = df_ingredients[df_ingredients['ner_ingredient_string'].apply(lambda x: len(str(x).strip()) == 0)]
    
    if len(rows_to_delete) > 0:
        print(f"\n⚠️  ROWS TO BE DELETED ({len(rows_to_delete)} rows):")
        print("="*60)
        
        # Display information about deleted rows
        for idx, row in rows_to_delete.iterrows():
            print(f"\nRow Index: {idx}")
            print(f"Recipe Name: {row.get('name', 'N/A')}")
            print(f"Original ner_ingredient_string: '{row.get('ner_ingredient_string', 'N/A')}'")
            
            # Show the original ner_ingredient if available
            if 'ner_ingredient' in row:
                print(f"Original ner_ingredient: {row['ner_ingredient']}")
            
            # Show original ingredients if available
            if 'ingredient' in row:
                original_ingredients = str(row['ingredient'])[:100] + "..." if len(str(row['ingredient'])) > 100 else str(row['ingredient'])
                print(f"Original ingredients: {original_ingredients}")
            
            print("-" * 40)
        
        # Optional: Save deleted rows to a separate file for analysis
        rows_to_delete.to_excel('deleted_rows_analysis.xlsx', index=True)
        print(f"\n💾 Saved deleted rows to 'deleted_rows_analysis.xlsx' for detailed analysis")
    
    # Now remove the rows
    df_ingredients = df_ingredients[df_ingredients['ner_ingredient_string'].apply(lambda x: len(str(x).strip()) > 0)]
    removed_count = initial_count - len(df_ingredients)
    
    if removed_count > 0:
        print(f"\n✅ Removed {removed_count} rows with no valid ingredients after cleaning")
        print(f"Remaining rows: {len(df_ingredients)}")
    else:
        print("\n✅ No rows were removed - all recipes retained valid ingredients")
    
    return df_ingredients

# Apply the enhanced cleaning function
df_ingredients = clean_ner_ingredients(df_ingredients)

=== CLEANING NER INGREDIENTS ===

⚠️  ROWS TO BE DELETED (13 rows):
⚠️  ROWS TO BE DELETED (13 rows):

Row Index: 448
Recipe Name: Fish congee
Original ner_ingredient_string: ''
Original ner_ingredient: []
----------------------------------------

Row Index: 1245
Recipe Name: Fruity Winter Oats Recipe
Original ner_ingredient_string: ''
Original ner_ingredient: []
----------------------------------------

Row Index: 1246
Recipe Name: Italian Meatloaf with Salsa
Original ner_ingredient_string: ''
Original ner_ingredient: []
----------------------------------------

Row Index: 1247
Recipe Name: Mini Meatballs with Pasta
Original ner_ingredient_string: ''
Original ner_ingredient: []
----------------------------------------

Row Index: 1248
Recipe Name: Tuna Salad
Original ner_ingredient_string: ''
Original ner_ingredient: []
----------------------------------------

Row Index: 1249
Recipe Name: Pasta Chicken Bake
Original ner_ingredient_string: ''
Original ner_ingredient: []
--------------

In [1089]:
df[['name', 'ner_ingredient_string']].head(10)

Unnamed: 0,name,ner_ingredient_string
0,Cassava Porridge with Fish Sauce and Lemon (Bu...,"cassava, fish, chicken, coconut oil, chicken b..."
1,Bitterballs (Bitterballen),"beef, potato starch, milk, egg, margarine, sal..."
2,Broccoli/Cauliflower Cheese,"cauliflower, broccoli, margarine, flour, milk,..."
3,Vegetable Fingers,"carrot, potato, sweet potato"
4,Beef Casserole,"onion, vegetable oil, beef, steak, carrots, po..."
5,Toast Fingers,wholemeal bread
6,Pumpkin Polenta Fingers,"water, polenta, pumpkin, parmesan cheese, oil"
7,Rice Pudding,"rice, milk, sugar, vanilla essence"
8,Hummus,"chickpeas, garlic, lemon juice, milk, tahini"
9,Baked Bean Pie,"baked beans, zucchini, potatoes, milk, cheese,..."


In [1090]:

all_ingredients = []

for ingredient_string in df_ingredients['ner_ingredient_string']:
    if isinstance(ingredient_string, str) and ingredient_string.strip():
        # Split by comma and clean each ingredient
        ingredients = [ing.strip().lower() for ing in ingredient_string.split(',') if ing.strip()]
        all_ingredients.extend(ingredients)

display(df[['name', 'ner_ingredient_string']].head(10))
print(f"Total unique ingredients found: {len(set(all_ingredients))}")


Unnamed: 0,name,ner_ingredient_string
0,Cassava Porridge with Fish Sauce and Lemon (Bu...,"cassava, fish, chicken, coconut oil, chicken b..."
1,Bitterballs (Bitterballen),"beef, potato starch, milk, egg, margarine, sal..."
2,Broccoli/Cauliflower Cheese,"cauliflower, broccoli, margarine, flour, milk,..."
3,Vegetable Fingers,"carrot, potato, sweet potato"
4,Beef Casserole,"onion, vegetable oil, beef, steak, carrots, po..."
5,Toast Fingers,wholemeal bread
6,Pumpkin Polenta Fingers,"water, polenta, pumpkin, parmesan cheese, oil"
7,Rice Pudding,"rice, milk, sugar, vanilla essence"
8,Hummus,"chickpeas, garlic, lemon juice, milk, tahini"
9,Baked Bean Pie,"baked beans, zucchini, potatoes, milk, cheese,..."


Total unique ingredients found: 930


In [1091]:
df.columns.to_list

<bound method IndexOpsMixin.tolist of Index([                  'name',            'ingredients',
               'ner_ingredient',           'instructions',
                      'min_age',                'max_age',
                      'texture',              'prep_time',
                    'cook_time',                'serving',
                       'origin',            'recipe_link',
                  'credibility',             'image_link',
                       'region',             'difficulty',
                    'meal_type',            'description',
                 'dietary_tags',         'choking_hazard',
                         'tips',               'allergen',
               'hypoallergenic',        'nutrition_value',
                           'ID',       'Energy / Calorie',
             'Carbohydrate (g)',            'Protein (g)',
                      'Fat (g)',         'List of Micros',
                           None,                     None,
                  

In [1092]:
#recipe_id null or not
recipe_id_null = df[df['recipe_id'].isnull()]
if not recipe_id_null.empty:
    print(f"❌ Found {len(recipe_id_null)} rows with null recipe_id.")
else:
    print("✅ All recipe_id values are valid.")

✅ All recipe_id values are valid.


In [1093]:
if 'df' in locals() and 'ner_ingredient_string' in df.columns:
    
    # Create recipe-ingredient structure
    recipe_ingredient_data = []
    
    for index, row in df.iterrows():
        recipe_id = row.get('recipe_id')  # Use existing recipe_id or index + 1
        recipe_name = row.get('name', f'Recipe_{recipe_id}')  # Use actual name or fallback
        ner_ingredient_string = str(row['ner_ingredient_string']).strip()
        
        if pd.isna(ner_ingredient_string) or ner_ingredient_string == 'nan' or ner_ingredient_string == '':
            # Handle empty ingredient strings
            recipe_ingredient_data.append({
                'recipe_id': recipe_id,
                'recipe_name': recipe_name,
                'single_ingredient': None,
                'ingredient_position': 0,
                'original_ingredient_string': ner_ingredient_string
            })
        else:
            # Split by comma and clean each ingredient
            individual_ingredients = [ing.strip() for ing in ner_ingredient_string.split(',')]
            individual_ingredients = [ing for ing in individual_ingredients if ing]  # Remove empty strings
            
            if individual_ingredients:
                for position, ingredient in enumerate(individual_ingredients, 1):
                    recipe_ingredient_data.append({
                        'recipe_id': recipe_id,
                        'recipe_name': recipe_name,
                        'single_ingredient': ingredient,
                        'ingredient_position': position,
                        'original_ingredient_string': ner_ingredient_string
                    })
            else:
                # Fallback for cases where split results in empty list
                recipe_ingredient_data.append({
                    'recipe_id': recipe_id,
                    'recipe_name': recipe_name,
                    'single_ingredient': ner_ingredient_string,
                    'ingredient_position': 1,
                    'original_ingredient_string': ner_ingredient_string
                })
    
    # Create recipe-ingredient dataframe
    recipe_ingredient_df = pd.DataFrame(recipe_ingredient_data)
    
    print(f"✅ Created recipe-ingredient structure:")
    print(f"   Original recipes: {len(df)}")
    print(f"   Recipe-ingredient records: {len(recipe_ingredient_df)}")
    print(f"   Average ingredients per recipe: {len(recipe_ingredient_df) / len(df):.2f}")
    
    # Show sample structure
    print(f"\n📋 Sample Recipe-Ingredient Structure:")
    sample_recipes = recipe_ingredient_df[recipe_ingredient_df['recipe_id'].isin([1, 2, 3])]
    display(sample_recipes[['recipe_id', 'recipe_name', 'single_ingredient', 'ingredient_position']].head(10))
    
else:
    print("❌ Dataset not found or 'ner_ingredient_string' column missing")
    if 'df' in locals():
        print("Available columns:", [col for col in df.columns if 'ingredient' in str(col).lower()])

✅ Created recipe-ingredient structure:
   Original recipes: 1322
   Recipe-ingredient records: 8176
   Average ingredients per recipe: 6.18

📋 Sample Recipe-Ingredient Structure:


Unnamed: 0,recipe_id,recipe_name,single_ingredient,ingredient_position
0,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,cassava,1
1,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,fish,2
2,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,chicken,3
3,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,coconut oil,4
4,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,chicken broth,5
5,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,lime juice,6
6,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,spinach,7
7,2,Bitterballs (Bitterballen),beef,1
8,2,Bitterballs (Bitterballen),potato starch,2
9,2,Bitterballs (Bitterballen),milk,3


In [1094]:
# Get unique ingredients
unique_ingredients_list = recipe_ingredient_df['single_ingredient'].dropna().unique()
print(f"\n📊 Unique Ingredients Analysis:")
print(f"   Total unique ingredients: {len(unique_ingredients_list)}")
print(f"   Sample unique ingredients: {list(unique_ingredients_list[:10])}")


📊 Unique Ingredients Analysis:
   Total unique ingredients: 1049
   Sample unique ingredients: ['cassava', 'fish', 'chicken', 'coconut oil', 'chicken broth', 'lime juice', 'spinach', 'beef', 'potato starch', 'milk']


In [1095]:
import numpy as np

# Convert string representations of null to actual NaN
recipe_ingredient_df['original_ingredient_string'] = recipe_ingredient_df['original_ingredient_string'].replace(
    ['nan', 'NaN', 'None', '', ' ', '\t', '\n'], np.nan
)

# Now drop NaN values
recipe_ingredient_df = recipe_ingredient_df.dropna(subset=['original_ingredient_string'])

print(f"✅ Remaining rows after cleaning and dropping nulls: {len(recipe_ingredient_df)}")

✅ Remaining rows after cleaning and dropping nulls: 8163


Standarization of NER Ingredients

In [1096]:
print("🔄 LOADING NER STANDARDIZATION MAPPING")
print("=" * 60)

standardization_file = 'ner_data_standarization.txt'

# Load standardization mapping
standardization_mapping = {}
compound_ingredients = {}

try:
    with open(standardization_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    
    for line in lines:
        line = line.strip()
        if ' --> ' in line:
            original, standardized = line.split(' --> ', 1)
            original = original.strip()
            standardized = standardized.strip()
            standardization_mapping[original] = standardized
    
    print(f"✅ Loaded {len(standardization_mapping)} standardization mappings")
    
    # Show sample mappings
    print("\n📋 Sample standardization mappings:")
    sample_items = list(standardization_mapping.items())[:10]
    for original, standardized in sample_items:
        print(f"   '{original}' → '{standardized}'")
    
except FileNotFoundError:
    print(f"❌ File not found: {standardization_file}")
except Exception as e:
    print(f"❌ Error loading file: {e}")

🔄 LOADING NER STANDARDIZATION MAPPING
✅ Loaded 930 standardization mappings

📋 Sample standardization mappings:
   'acai' → 'acai'
   'agar agar' → 'agar-agar powder'
   'agar-agar powder' → 'agar-agar powder'
   'all purpose flour' → 'all purpose flour'
   'all-purpose flour' → 'all purpose flour'
   'almond' → 'almond'
   'almond butter' → 'almond butter'
   'almond milk' → 'almond milk'
   'almond nut' → 'almond nut'
   'alphabet pasta' → 'pasta'


In [1097]:
def detect_compound_ingredients(recipe_ingredient_df):
    """
    Step 1: Scan all ingredients and list compound sentences that need special handling.
    
    Args:
        recipe_ingredient_df: DataFrame with single_ingredient column
        
    Returns:
        dict: Dictionary of detected compound ingredients and their counts
    """
    print("DETECTING COMPOUND INGREDIENTS")
    
    compound_indicators = {
        'multiple_foods': [],      # Multiple food items in one string
        'long_sentences': [],      # More than 4 words
        'known_patterns': []       # Known compound patterns
    }
    
    # Criteria for compound detection
    def is_likely_compound(ingredient_text):
        if not ingredient_text or pd.isna(ingredient_text):
            return False, "null"
            
        words = str(ingredient_text).strip().split()
        word_count = len(words)
        
        # Check for multiple food indicators
        food_keywords = ['oil', 'sauce', 'salt', 'sugar', 'water', 'milk', 'juice', 'broth', 
                        'beef', 'chicken', 'pork', 'fish', 'tofu', 'rice', 'noodle', 'egg', 'seed',
                        'teri','sesame']
        
        food_count = sum(1 for word in words if word.lower() in food_keywords)
        
        # Classification logic
        if word_count > 6:
            return True, "long_sentence"
        elif food_count >= 3:
            return True, "multiple_foods"
        elif word_count > 4 and food_count >= 2:
            return True, "mixed_pattern"
        else:
            return False, "simple"
    
    # Scan all ingredients
    compound_findings = {}
    
    for index, row in recipe_ingredient_df.iterrows():
        ingredient = row['single_ingredient']
        is_compound, reason = is_likely_compound(ingredient)
        
        if is_compound:
            if ingredient not in compound_findings:
                compound_findings[ingredient] = {
                    'count': 0,
                    'reason': reason,
                    'recipe_ids': []
                }
            compound_findings[ingredient]['count'] += 1
            compound_findings[ingredient]['recipe_ids'].append(row['recipe_id'])
    
    print(f"✅ Found {len(compound_findings)} compound ingredients")
    
    # Sort by frequency
    sorted_compounds = sorted(compound_findings.items(), 
                            key=lambda x: x[1]['count'], 
                            reverse=True)
    
    print(f"\n📋 TOP COMPOUND INGREDIENTS (sorted by frequency):")
    print("-" * 80)
    
    for i, (ingredient, data) in enumerate(sorted_compounds[:15], 1):
        print(f"{i:2d}. '{ingredient}'")
        print(f"    Count: {data['count']} | Reason: {data['reason']}")
        print(f"    Recipe IDs: {data['recipe_ids'][:5]}{'...' if len(data['recipe_ids']) > 5 else ''}")
        print()
    
    return dict(sorted_compounds)

# Run compound detection
if 'recipe_ingredient_df' in locals():
    detected_compounds = detect_compound_ingredients(recipe_ingredient_df)
else:
    print("❌ Recipe ingredient DataFrame not found. Run previous steps first.")

DETECTING COMPOUND INGREDIENTS
✅ Found 8 compound ingredients

📋 TOP COMPOUND INGREDIENTS (sorted by frequency):
--------------------------------------------------------------------------------
 1. 'bumboo teri bubuk sesame seed'
    Count: 1 | Reason: multiple_foods
    Recipe IDs: [246]

 2. 'oil clove garlic beef sweet soy sauce salt sugar rice water tofu'
    Count: 1 | Reason: long_sentence
    Recipe IDs: [338]

 3. 'rice cooking oil clove garlic chicken broth salt shrimp carrot celery extra chicken broth'
    Count: 1 | Reason: long_sentence
    Recipe IDs: [607]

 4. 'rice red sweet potato chicken broth long bean salt margarine'
    Count: 1 | Reason: long_sentence
    Recipe IDs: [712]

 5. 'seed baby lemon juice water formula milk'
    Count: 1 | Reason: long_sentence
    Recipe IDs: [783]

 6. 'beef chicken broth'
    Count: 1 | Reason: multiple_foods
    Recipe IDs: [929]

 7. 'low sodium vegetable chicken broth'
    Count: 1 | Reason: mixed_pattern
    Recipe IDs: [930]

 

In [1098]:
PREDEFINED_COMPOUND_PATTERNS = {
    # Exact matches from your problematic cases
    'oil clove garlic beef sweet soy sauce salt sugar rice water tofu': [
        'oil', 'clove garlic', 'beef', 'sweet soy sauce', 'salt', 'sugar', 'rice', 'water', 'tofu'
    ],
    'rice red sweet potato chicken broth long bean salt margarine': [
        'rice', 'red sweet potato', 'chicken broth', 'long bean', 'salt', 'margarine'
    ],
    'seed baby lemon juice water formula milk': [
        'lemon juice', 'water', 'formula milk'
    ],
    'rice cooking oil clove garlic chicken broth salt shrimp carrot celery extra chicken broth': [
        'rice', 'cooking oil', 'clove garlic', 'chicken broth', 'salt', 'shrimp', 'carrot', 'celery', 'extra chicken broth'
    ],
    'cooking oil clove garlic beef sweet soy sauce salt sugar': [
        'cooking oil', 'clove garlic', 'beef', 'sweet soy sauce', 'salt', 'sugar'
    ],
    'bumboo teri bubuk sesame seed': [
        'anchovy powder', 'sesame seed'
    ],
    'roll egg noodle boiling water': [
        'noodle', 'water'
    ],
    'chicken upper lower thigh wing': [
        'chicken thigh', 'chicken wing'
    ]
}

print("📋 PREDEFINED COMPOUND PATTERNS")
print("=" * 60)
print(f"Total predefined patterns: {len(PREDEFINED_COMPOUND_PATTERNS)}")
print("\nPattern examples:")
for i, (compound, parts) in enumerate(list(PREDEFINED_COMPOUND_PATTERNS.items())[:5], 1):
    print(f"{i}. '{compound}'")
    print(f"   → {parts}")
    print()

📋 PREDEFINED COMPOUND PATTERNS
Total predefined patterns: 8

Pattern examples:
1. 'oil clove garlic beef sweet soy sauce salt sugar rice water tofu'
   → ['oil', 'clove garlic', 'beef', 'sweet soy sauce', 'salt', 'sugar', 'rice', 'water', 'tofu']

2. 'rice red sweet potato chicken broth long bean salt margarine'
   → ['rice', 'red sweet potato', 'chicken broth', 'long bean', 'salt', 'margarine']

3. 'seed baby lemon juice water formula milk'
   → ['lemon juice', 'water', 'formula milk']

4. 'rice cooking oil clove garlic chicken broth salt shrimp carrot celery extra chicken broth'
   → ['rice', 'cooking oil', 'clove garlic', 'chicken broth', 'salt', 'shrimp', 'carrot', 'celery', 'extra chicken broth']

5. 'cooking oil clove garlic beef sweet soy sauce salt sugar'
   → ['cooking oil', 'clove garlic', 'beef', 'sweet soy sauce', 'salt', 'sugar']



In [1099]:
def standardize_simple_ingredient_only(ingredient, mapping_dict):
    """
    Args:
        ingredient (str): Single ingredient to standardize
        mapping_dict (dict): Standardization mapping rules
        
    Returns:
        str: Standardized ingredient name
    """
    if not ingredient or pd.isna(ingredient):
        return None
        
    ingredient_clean = str(ingredient).strip().lower()
    
    # Direct mapping lookup only
    if ingredient_clean in mapping_dict:
        return mapping_dict[ingredient_clean]
    
    # If no mapping found, return cleaned original
    return ingredient_clean

In [1100]:
def is_compound_ingredient_only(ingredient, predefined_patterns):
    """
    PURE COMPOUND CHECKING FUNCTION - NO PROCESSING LOGIC
    
    Args:
        ingredient (str): Ingredient to check
        predefined_patterns (dict): Known compound patterns
        
    Returns:
        bool: True if compound, False if simple
    """
    if not ingredient or pd.isna(ingredient):
        return False
        
    ingredient_clean = str(ingredient).strip().lower()
    
    # Check if it's in our predefined compound patterns (exact match)
    if ingredient_clean in predefined_patterns:
        return True
    
    return False

In [1101]:
# Test compound checking function
print("🧪 TESTING COMPOUND CHECKING FUNCTION")
print("=" * 50)

compound_test_cases = [
    'water',  # Simple
    'yoghurt',  # Simple
    'oil clove garlic beef sweet soy sauce salt sugar rice water tofu',  # Compound
    'rice red sweet potato chicken broth long bean salt margarine',  # Compound
    'chicken',  # Simple
    'cooking oil clove garlic chicken broth salt shrimp carrot celery'  # Compound
]

print("Compound detection tests:")
for ingredient in compound_test_cases:
    is_compound = is_compound_ingredient_only(ingredient, PREDEFINED_COMPOUND_PATTERNS)
    print(f"  '{ingredient}' → {'COMPOUND' if is_compound else 'SIMPLE'}")

🧪 TESTING COMPOUND CHECKING FUNCTION
Compound detection tests:
  'water' → SIMPLE
  'yoghurt' → SIMPLE
  'oil clove garlic beef sweet soy sauce salt sugar rice water tofu' → COMPOUND
  'rice red sweet potato chicken broth long bean salt margarine' → COMPOUND
  'chicken' → SIMPLE
  'cooking oil clove garlic chicken broth salt shrimp carrot celery' → SIMPLE


In [1102]:
def process_compound_ingredient_only(ingredient, predefined_patterns, mapping_dict):
    """
    SIMPLIFIED COMPOUND PROCESSING - ONLY USES PREDEFINED PATTERNS
    
    Args:
        ingredient (str): Compound ingredient to split and standardize
        predefined_patterns (dict): Known compound splitting patterns
        mapping_dict (dict): Standardization mapping rules (for individual parts)
        
    Returns:
        list: List of standardized individual ingredients
    """
    if not ingredient or pd.isna(ingredient):
        return []
        
    ingredient_clean = str(ingredient).strip().lower()

    if ingredient_clean in predefined_patterns:
        # Use exact predefined pattern
        individual_parts = predefined_patterns[ingredient_clean]
        print(f"  🎯 Using predefined pattern: '{ingredient_clean}' → {individual_parts}")
    else:
        print(f"  ❌ ERROR: '{ingredient_clean}' not found in predefined patterns!")
        return [ingredient_clean]  # Return as single ingredient
    
    # Apply simple mapping to each individual part
    standardized_parts = []
    for part in individual_parts:
        standardized_part = standardize_simple_ingredient_only(part, mapping_dict)
        standardized_parts.append(standardized_part)
        print(f"    '{part}' → '{standardized_part}'")
    
    return standardized_parts

In [1103]:
# Add compound column to existing recipe_ingredient_df
if 'recipe_ingredient_df' in locals() and 'PREDEFINED_COMPOUND_PATTERNS' in locals():
    print("🔄 ADDING COMPOUND COLUMN TO EXISTING DATAFRAME")
    print("=" * 50)
    
    # Apply compound detection to each ingredient
    recipe_ingredient_df['compound'] = recipe_ingredient_df['single_ingredient'].apply(
        lambda ingredient: is_compound_ingredient_only(ingredient, PREDEFINED_COMPOUND_PATTERNS)
    )

display(recipe_ingredient_df[['recipe_id', 'single_ingredient', 'compound']].head(10)   )

🔄 ADDING COMPOUND COLUMN TO EXISTING DATAFRAME


Unnamed: 0,recipe_id,single_ingredient,compound
0,1,cassava,False
1,1,fish,False
2,1,chicken,False
3,1,coconut oil,False
4,1,chicken broth,False
5,1,lime juice,False
6,1,spinach,False
7,2,beef,False
8,2,potato starch,False
9,2,milk,False


In [1104]:
def standardized_ingredients_into_df(recipe_ingredient_df, predefined_patterns, mapping_dict):
    """
    Alternative function that directly embeds standardized_ingredient column into the existing DataFrame
    
    Args:
        recipe_ingredient_df: DataFrame with columns ['recipe_id', 'single_ingredient']
        predefined_patterns: Dictionary of known compound patterns
        mapping_dict: Dictionary for simple ingredient standardization
        
    Returns:
        DataFrame: Updated recipe_ingredient_df with standardized_ingredient column
    """
    
    def get_standardized_ingredient(ingredient):
        """Helper function to get standardized ingredient for each row"""
        if pd.isna(ingredient) or not ingredient:
            return None
            
        try:
            # Check if compound
            is_compound = is_compound_ingredient_only(ingredient, predefined_patterns)
            
            if is_compound:
                # For compound ingredients, get the first part (or join all parts)
                individual_parts = process_compound_ingredient_only(ingredient, predefined_patterns, mapping_dict)
                if individual_parts:
                    # Option 1: Return first part
                    return individual_parts[0]
                    # Option 2: Return all parts joined (uncomment below)
                    # return ', '.join(individual_parts)
                else:
                    return str(ingredient).strip().lower()
            else:
                # For simple ingredients, apply direct mapping
                return standardize_simple_ingredient_only(ingredient, mapping_dict)
                
        except Exception as e:
            print(f"❌ Error processing '{ingredient}': {e}")
            return str(ingredient).strip().lower()
    
    # Apply standardization to create new column
    recipe_ingredient_df['standardized_ingredient'] = recipe_ingredient_df['single_ingredient'].apply(get_standardized_ingredient)
    
    # Add metadata columns
    recipe_ingredient_df['is_compound'] = recipe_ingredient_df['single_ingredient'].apply(
        lambda x: is_compound_ingredient_only(x, predefined_patterns) if pd.notna(x) else False
    )
    
    print(f"✅ Successfully embedded standardized_ingredient column!")
    print(f"   Total rows processed: {len(recipe_ingredient_df)}")
    print(f"   Compound ingredients: {recipe_ingredient_df['is_compound'].sum()}")
    print(f"   Simple ingredients: {(~recipe_ingredient_df['is_compound']).sum()}")
    
    return recipe_ingredient_df

standardized_ingredients_into_df(recipe_ingredient_df, PREDEFINED_COMPOUND_PATTERNS, standardization_mapping)

  🎯 Using predefined pattern: 'bumboo teri bubuk sesame seed' → ['anchovy powder', 'sesame seed']
    'anchovy powder' → 'anchovy powder'
    'sesame seed' → 'sesame seed'
  🎯 Using predefined pattern: 'oil clove garlic beef sweet soy sauce salt sugar rice water tofu' → ['oil', 'clove garlic', 'beef', 'sweet soy sauce', 'salt', 'sugar', 'rice', 'water', 'tofu']
    'oil' → 'oil'
    'clove garlic' → 'garlic'
    'beef' → 'beef'
    'sweet soy sauce' → 'sweet soy sauce'
    'salt' → 'salt'
    'sugar' → 'sugar'
    'rice' → 'rice'
    'water' → 'water'
    'tofu' → 'tofu'
  🎯 Using predefined pattern: 'rice cooking oil clove garlic chicken broth salt shrimp carrot celery extra chicken broth' → ['rice', 'cooking oil', 'clove garlic', 'chicken broth', 'salt', 'shrimp', 'carrot', 'celery', 'extra chicken broth']
    'rice' → 'rice'
    'cooking oil' → 'cooking oil'
    'clove garlic' → 'garlic'
    'chicken broth' → 'chicken broth'
    'salt' → 'salt'
    'shrimp' → 'shrimp'
    'carrot' →

Unnamed: 0,recipe_id,recipe_name,single_ingredient,ingredient_position,original_ingredient_string,compound,standardized_ingredient,is_compound
0,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,cassava,1,"cassava, fish, chicken, coconut oil, chicken b...",False,cassava,False
1,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,fish,2,"cassava, fish, chicken, coconut oil, chicken b...",False,fish,False
2,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,chicken,3,"cassava, fish, chicken, coconut oil, chicken b...",False,chicken,False
3,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,coconut oil,4,"cassava, fish, chicken, coconut oil, chicken b...",False,coconut oil,False
4,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,chicken broth,5,"cassava, fish, chicken, coconut oil, chicken b...",False,chicken broth,False
...,...,...,...,...,...,...,...,...
8171,1322,Apple Crumble,flour,2,"apples, flour, sugar, coconut, rolled oats, ma...",False,flour,False
8172,1322,Apple Crumble,sugar,3,"apples, flour, sugar, coconut, rolled oats, ma...",False,sugar,False
8173,1322,Apple Crumble,coconut,4,"apples, flour, sugar, coconut, rolled oats, ma...",False,coconut,False
8174,1322,Apple Crumble,rolled oats,5,"apples, flour, sugar, coconut, rolled oats, ma...",False,rolled oats,False


Map Out Ingredient Ids

In [1105]:
import re

def clean_ingredient_name_symbols(ingredient_name):
    """
    Clean unwanted symbols, quotes, and brackets from ingredient names
    
    Args:
        ingredient_name: String containing the ingredient name
        
    Returns:
        str: Cleaned ingredient name
    """
    if pd.isna(ingredient_name) or ingredient_name == '':
        return ingredient_name
        
    # Convert to string if not already
    cleaned = str(ingredient_name)
    
    # Remove quotes (single and double)
    cleaned = cleaned.replace("'", "").replace('"', "")
    
    # Remove brackets and parentheses
    cleaned = cleaned.replace("[", "").replace("]", "")
    cleaned = cleaned.replace("(", "").replace(")", "")
    
    # Remove other unwanted symbols but keep hyphens and spaces
    cleaned = re.sub(r'[^\w\s\-]', '', cleaned)
    
    # Clean up extra whitespace
    cleaned = ' '.join(cleaned.split())
    
    # Remove leading/trailing whitespace
    cleaned = cleaned.strip()
    
    return cleaned if cleaned else ingredient_name


In [1106]:
def create_ingredient_master_df_with_na_check(recipe_ingredient_df, use_standardized=True):
    """
    Create a master ingredient DataFrame with unique ingredient IDs and handle NA values
    
    Args:
        recipe_ingredient_df: DataFrame with recipe-ingredient relationships
        use_standardized: If True, use standardized_ingredient; if False, use single_ingredient
        
    Returns:
        tuple: (ingredient_df, ingredient_id_mapping, cleaned_df)
    """
    
    # Choose which ingredient column to use
    ingredient_column = 'standardized_ingredient' if use_standardized else 'single_ingredient'
    
    if ingredient_column not in recipe_ingredient_df.columns:
        print(f"❌ Column '{ingredient_column}' not found in recipe_ingredient_df")
        return None, None, None
    
    print(f"🔍 CHECKING FOR NA VALUES IN '{ingredient_column}'")
    print("=" * 60)
    
    # Check for NA values in standardized ingredient column
    na_mask = recipe_ingredient_df[ingredient_column].isna()
    na_count = na_mask.sum()
    total_count = len(recipe_ingredient_df)
    
    print(f"📊 NA Analysis:")
    print(f"   Total rows: {total_count}")
    print(f"   NA values in '{ingredient_column}': {na_count}")
    print(f"   Percentage NA: {(na_count/total_count)*100:.2f}%")
    
    if na_count > 0:
        print(f"\n🔍 Analyzing NA rows...")
        na_rows = recipe_ingredient_df[na_mask].copy()
        
        # Check if there's an 'original_ingredient_string' column to reference
        original_column = None
        possible_original_columns = ['original_ingredient_string', 'single_ingredient', 'ingredient']
        
        for col in possible_original_columns:
            if col in recipe_ingredient_df.columns and col != ingredient_column:
                original_column = col
                break
        
        if original_column:
            print(f"   Using '{original_column}' as reference for original data")
            
            # Check which NA rows have valid original data
            na_with_original = na_rows[na_rows[original_column].notna() & (na_rows[original_column] != '')]
            na_without_original = na_rows[na_rows[original_column].isna() | (na_rows[original_column] == '')]
            
            print(f"\n📋 NA Row Analysis:")
            print(f"   NA rows with valid original data: {len(na_with_original)}")
            print(f"   NA rows without original data: {len(na_without_original)}")
            
            # Show NA rows with valid original data (these need attention)
            if len(na_with_original) > 0:
                print(f"\n⚠️  HIGHLIGHT: NA rows with valid original data (need investigation):")
                display_cols = ['recipe_id', 'recipe_name', original_column, ingredient_column]
                available_cols = [col for col in display_cols if col in na_with_original.columns]
                
                for idx, row in na_with_original.head(10).iterrows():
                    print(f"   Row {idx}:")
                    print(f"     Recipe: {row.get('recipe_name', 'N/A')}")
                    print(f"     Original: '{row.get(original_column, 'N/A')}'")
                    print(f"     Standardized: {row.get(ingredient_column, 'N/A')}")
                    print()
                
                if len(na_with_original) > 10:
                    print(f"     ... and {len(na_with_original) - 10} more rows")
            
            # Drop rows without original data
            if len(na_without_original) > 0:
                print(f"\n🗑️  DROPPING: {len(na_without_original)} rows with no original data")
                # Show sample of rows being dropped
                print("   Sample rows being dropped:")
                for idx, row in na_without_original.head(5).iterrows():
                    print(f"     Row {idx}: Recipe '{row.get('recipe_name', 'N/A')}' - No original ingredient data")
        
        else:
            print(f"   ⚠️  No original ingredient column found for reference")
            print(f"   Available columns: {list(recipe_ingredient_df.columns)}")
    
    # Create cleaned DataFrame (drop rows with NA in standardized ingredient)
    cleaned_df = recipe_ingredient_df.dropna(subset=[ingredient_column]).copy()
    dropped_count = total_count - len(cleaned_df)
    
    if dropped_count > 0:
        print(f"\n✅ Dropped {dropped_count} rows with NA values")
        print(f"   Remaining rows: {len(cleaned_df)}")
    
    # Continue with ingredient master creation using cleaned data
    print(f"\n🔄 Creating ingredient master DataFrame from cleaned data...")
    
    # Get unique ingredients and clean them
    unique_ingredients = cleaned_df[ingredient_column].dropna().unique()
    
    # Clean all ingredient names to remove unwanted symbols
    cleaned_ingredients = [clean_ingredient_name_symbols(ingredient) for ingredient in unique_ingredients]
    
    # Remove duplicates after cleaning and sort
    unique_cleaned_ingredients = sorted(list(set(cleaned_ingredients)))
    
    # Create ingredient master DataFrame
    ingredient_data = []
    ingredient_id_mapping = {}
    
    for idx, ingredient in enumerate(unique_cleaned_ingredients, start=1):
        if ingredient and ingredient.strip():  # Only add non-empty ingredients
            ingredient_data.append({
                'ingredient_id': idx,
                'ingredient_name': ingredient
            })
            ingredient_id_mapping[ingredient] = idx
    
    ingredient_df = pd.DataFrame(ingredient_data)
    
    print(f"\n✅ Created ingredient master DataFrame:")
    print(f"   Total unique ingredients: {len(ingredient_df)}")
    print(f"   Using column: {ingredient_column}")
    print(f"   Cleaned unwanted symbols from ingredient names")
    
    return ingredient_df, ingredient_id_mapping, cleaned_df

# Alternative: Simple function to just check NA values first
def check_na_in_standardized_ingredients(recipe_ingredient_df):
    """
    Quick function to check NA values in standardized ingredients
    """
    ingredient_column = 'standardized_ingredient'
    
    if ingredient_column not in recipe_ingredient_df.columns:
        print(f"❌ Column '{ingredient_column}' not found")
        return
    
    # Check for NA values
    na_mask = recipe_ingredient_df[ingredient_column].isna()
    na_count = na_mask.sum()
    
    print(f"🔍 NA CHECK RESULTS:")
    print(f"   Total rows: {len(recipe_ingredient_df)}")
    print(f"   NA values in standardized_ingredient: {na_count}")
    print(f"   Percentage: {(na_count/len(recipe_ingredient_df))*100:.2f}%")
    
    if na_count > 0:
        na_rows = recipe_ingredient_df[na_mask]
        
        # Check original data
        original_cols = ['original_ingredient_string', 'single_ingredient']
        for col in original_cols:
            if col in recipe_ingredient_df.columns:
                has_original = na_rows[col].notna().sum()
                print(f"   NA rows with valid '{col}': {has_original}")
                
                # Show sample
                if has_original > 0:
                    print(f"\n📋 Sample NA rows with original data:")
                    sample = na_rows[na_rows[col].notna()].head(5)
                    for idx, row in sample.iterrows():
                        print(f"     Row {idx}: '{row[col]}'")
                break
    else:
        print("✅ No NA values found!")

# Usage examples:

# Quick check first
check_na_in_standardized_ingredients(recipe_ingredient_df)

# Then use the enhanced function
ingredient_df, ingredient_id_mapping, cleaned_recipe_df = create_ingredient_master_df_with_na_check(
    recipe_ingredient_df, 
    use_standardized=True
)

🔍 NA CHECK RESULTS:
   Total rows: 8163
   NA values in standardized_ingredient: 0
   Percentage: 0.00%
✅ No NA values found!
🔍 CHECKING FOR NA VALUES IN 'standardized_ingredient'
📊 NA Analysis:
   Total rows: 8163
   NA values in 'standardized_ingredient': 0
   Percentage NA: 0.00%

🔄 Creating ingredient master DataFrame from cleaned data...

✅ Created ingredient master DataFrame:
   Total unique ingredients: 992
   Using column: standardized_ingredient
   Cleaned unwanted symbols from ingredient names


In [1107]:

if ingredient_df is not None:
    print(f"\n📋 Sample of cleaned ingredient_df:")
    display(ingredient_df.head(10))
    
    print(f"\n🔍 Checking for any remaining unwanted symbols...")
    # Check if any ingredient names still contain unwanted symbols
    has_quotes = ingredient_df['ingredient_name'].str.contains("'|\"", na=False).sum()
    has_brackets = ingredient_df['ingredient_name'].str.contains("\[|\]|\(|\)", na=False).sum()
    
    print(f"   Ingredients with quotes: {has_quotes}")
    print(f"   Ingredients with brackets: {has_brackets}")
    
    if has_quotes > 0 or has_brackets > 0:
        print("   ⚠️  Some unwanted symbols still found!")
        problematic = ingredient_df[
            ingredient_df['ingredient_name'].str.contains("'|\"|\[|\]|\(|\)", na=False)
        ]
        print("   Sample problematic ingredients:")
        display(problematic.head())
    else:
        print("   ✅ All ingredient names are clean!")




📋 Sample of cleaned ingredient_df:


Unnamed: 0,ingredient_id,ingredient_name
0,1,acai
1,2,agar-agar powder
2,3,all purpose flour
3,4,almond
4,5,almond butter
5,6,almond milk
6,7,almond nuts
7,8,aluminum foil
8,9,ambon banana
9,10,ambrosia fruit



🔍 Checking for any remaining unwanted symbols...
   Ingredients with quotes: 0
   Ingredients with brackets: 0
   ✅ All ingredient names are clean!


Map Out to the RecipeIngredient

In [1108]:
def map_ingredient_ids_to_recipe_df(recipe_ingredient_df, ingredient_id_mapping, use_standardized=True):
    """
    Map ingredient IDs to the recipe ingredient DataFrame using cleaned ingredient names
    
    Args:
        recipe_ingredient_df: DataFrame with recipe-ingredient relationships
        ingredient_id_mapping: Dictionary mapping cleaned ingredient names to IDs
        use_standardized: If True, use standardized_ingredient; if False, use single_ingredient
        
    Returns:
        DataFrame: Updated recipe_ingredient_df with ingredient_id column
    """

    
    # Choose which ingredient column to use
    ingredient_column = 'standardized_ingredient' if use_standardized else 'single_ingredient'
    
    if ingredient_column not in recipe_ingredient_df.columns:
        print(f"❌ Column '{ingredient_column}' not found in recipe_ingredient_df")
        return recipe_ingredient_df
    
    # Create a copy to avoid modifying the original
    final_recipe_ingredient = recipe_ingredient_df.copy()
    
    # Clean the ingredient names in the DataFrame and map to IDs
    def get_ingredient_id(ingredient_name):
        if pd.isna(ingredient_name):
            return None
        cleaned_name = clean_ingredient_name_symbols(ingredient_name)
        return ingredient_id_mapping.get(cleaned_name, None)
    
    final_recipe_ingredient['ingredient_id'] = final_recipe_ingredient[ingredient_column].apply(get_ingredient_id)
    
    # Check mapping results
    total_rows = len(final_recipe_ingredient)
    mapped_rows = final_recipe_ingredient['ingredient_id'].notna().sum()
    unmapped_rows = total_rows - mapped_rows
    
    print(f"✅ Ingredient ID mapping complete:")
    print(f"   Total rows: {total_rows}")
    print(f"   Successfully mapped: {mapped_rows} ({(mapped_rows/total_rows)*100:.1f}%)")
    print(f"   Unmapped: {unmapped_rows} ({(unmapped_rows/total_rows)*100:.1f}%)")
    
    if unmapped_rows > 0:
        print(f"\n🔍 Sample unmapped ingredients:")
        unmapped_sample = final_recipe_ingredient[final_recipe_ingredient['ingredient_id'].isna()][ingredient_column].dropna().unique()[:5]
        for ingredient in unmapped_sample:
            cleaned = clean_ingredient_name_symbols(ingredient)
            print(f"   Original: '{ingredient}' → Cleaned: '{cleaned}'")
    
    return final_recipe_ingredient

In [1109]:
recipe_ingredient_df.columns.to_list()

['recipe_id',
 'recipe_name',
 'single_ingredient',
 'ingredient_position',
 'original_ingredient_string',
 'compound',
 'standardized_ingredient',
 'is_compound']

In [1110]:
# Step 2: Map ingredient IDs back to recipe DataFrame

recipe_ingredient_df_with_ids = map_ingredient_ids_to_recipe_df(
    recipe_ingredient_df,
    ingredient_id_mapping,
    use_standardized=True
)

print(f"\n📋 Sample of updated recipe_ingredient_df with IDs:")
sample_cols = ['recipe_id', 'single_ingredient', 'standardized_ingredient', 'ingredient_id']
available_cols = [col for col in sample_cols if col in recipe_ingredient_df_with_ids.columns]
display(recipe_ingredient_df_with_ids[available_cols].head(10))

print(f"\n📊 Final DataFrame Statistics:")
print(f"   Recipe DataFrame shape: {recipe_ingredient_df_with_ids.shape}")
print(f"   Unique recipes: {recipe_ingredient_df_with_ids['recipe_id'].nunique()}")
print(f"   Unique ingredients: {len(ingredient_df)}")
print(f"   Recipe-ingredient relationships: {len(recipe_ingredient_df_with_ids)}")


✅ Ingredient ID mapping complete:
   Total rows: 8163
   Successfully mapped: 8163 (100.0%)
   Unmapped: 0 (0.0%)

📋 Sample of updated recipe_ingredient_df with IDs:

   Total rows: 8163
   Successfully mapped: 8163 (100.0%)
   Unmapped: 0 (0.0%)

📋 Sample of updated recipe_ingredient_df with IDs:


Unnamed: 0,recipe_id,single_ingredient,standardized_ingredient,ingredient_id
0,1,cassava,cassava,154
1,1,fish,fish,329
2,1,chicken,chicken,181
3,1,coconut oil,coconut oil,230
4,1,chicken broth,chicken broth,184
5,1,lime juice,lime juice,499
6,1,spinach,spinach,815
7,2,beef,beef,72
8,2,potato starch,potato starch,671
9,2,milk,milk,540



📊 Final DataFrame Statistics:
   Recipe DataFrame shape: (8163, 9)
   Unique recipes: 1309
   Unique ingredients: 992
   Recipe-ingredient relationships: 8163


In [1111]:
def drop_unmapped_ingredients(recipe_ingredient_df_with_ids):
    """
    Drop rows where ingredient_id is null (unmapped ingredients)
    
    Args:
        recipe_ingredient_df_with_ids: DataFrame with ingredient_id column
        
    Returns:
        DataFrame: Cleaned DataFrame without unmapped ingredients
    """
    print("🗑️ DROPPING UNMAPPED INGREDIENTS")
    print("=" * 40)
    
    # Count before dropping
    total_before = len(recipe_ingredient_df_with_ids)
    unmapped_count = recipe_ingredient_df_with_ids['ingredient_id'].isna().sum()
    
    print(f"Before dropping:")
    print(f"   Total rows: {total_before}")
    print(f"   Unmapped ingredients: {unmapped_count}")
    
    if unmapped_count > 0:
        # Show sample of what will be dropped
        print(f"\n📋 Sample of rows to be dropped:")
        unmapped_sample = recipe_ingredient_df_with_ids[
            recipe_ingredient_df_with_ids['ingredient_id'].isna()
        ][['recipe_id', 'single_ingredient', 'standardized_ingredient']].head(5)
        
        for idx, row in unmapped_sample.iterrows():
            print(f"   Row {idx}: Recipe {row['recipe_id']} - '{row['single_ingredient']}'")
        
        # Drop unmapped ingredients
        cleaned_df = recipe_ingredient_df_with_ids.dropna(subset=['ingredient_id'])
        
        # Reset index
        cleaned_df = cleaned_df.reset_index(drop=True)
        
        # Show results
        total_after = len(cleaned_df)
        dropped_count = total_before - total_after
        
        print(f"\n✅ After dropping:")
        print(f"   Total rows: {total_after}")
        print(f"   Dropped rows: {dropped_count}")
        print(f"   Retention rate: {(total_after/total_before)*100:.1f}%")
        
        return cleaned_df
    
    else:
        print("✅ No unmapped ingredients found - nothing to drop!")
        return recipe_ingredient_df_with_ids

# Usage
cleaned_recipe_ingredient_df = drop_unmapped_ingredients(recipe_ingredient_df_with_ids)

🗑️ DROPPING UNMAPPED INGREDIENTS
Before dropping:
   Total rows: 8163
   Unmapped ingredients: 0
✅ No unmapped ingredients found - nothing to drop!


In [1112]:
recipe_ingredient = cleaned_recipe_ingredient_df.copy()

#take recipe_id and ingredient_id only
recipe_ingredient = recipe_ingredient[['recipe_id', 'ingredient_id']].drop_duplicates().reset_index(drop=True)

print(f"\n📋 Final Recipe-Ingredient DataFrame:")
print(f"   Shape: {recipe_ingredient.shape}")
print(f"   Unique recipe IDs: {recipe_ingredient['recipe_id'].nunique()}")
print(f"   Unique ingredient IDs: {recipe_ingredient['ingredient_id'].nunique()}")


📋 Final Recipe-Ingredient DataFrame:
   Shape: (8117, 2)
   Unique recipe IDs: 1309
   Unique ingredient IDs: 992


#### Determine Allergen (ingredient and recipe)

In [1113]:
allergen_map = pd.read_excel('1st_dataset.xlsx', sheet_name='allergen')
print("Countrymap columns:", allergen_map.columns.tolist())
print("Countrymap shape:", allergen_map.shape)
print("\nFirst 5 rows of allergen:")
display(allergen_map.head())

Countrymap columns: ['pk', 'name', 'description']
Countrymap shape: (8, 3)

First 5 rows of allergen:
 ['pk', 'name', 'description']
Countrymap shape: (8, 3)

First 5 rows of allergen:


Unnamed: 0,pk,name,description
0,1,dairy,Common infant allergen. Found in dairy products.
1,2,egg,Can cause allergy even in small amounts.
2,3,soy,"Plant-based but common allergen, especially in..."
3,4,tree_nuts,Tree nuts are a top allergen. Avoid early intr...
4,5,peanuts,Peanuts are a major allergen. Introduce carefu...


In [1114]:
# Load the module
import importlib

allergen_path = os.path.join(os.getcwd(), "preprocessing_techniques", "mappers", "allergen_mapper.py")

spec = importlib.util.spec_from_file_location("allergen", allergen_path)
allergen_file = importlib.util.module_from_spec(spec)
spec.loader.exec_module(allergen_file)

# Access the config
allergen_tags = allergen_file.ALLERGEN_TAGS
pprint(allergen_tags)

{'egg': {'allergen_group': 'egg',
         'keywords': ['egg',
                      'eggs',
                      'egg white',
                      'egg yolk',
                      'omelette',
                      'lecithin',
                      'chicken egg',
                      'duck egg',
                      'quail egg'],
         'notes': 'Can cause allergy even in small amounts.'},
 'fish': {'allergen_group': 'fish',
          'keywords': ['salmon',
                       'cod',
                       'tuna',
                       'white fish',
                       'trout',
                       'sardines',
                       'catfish',
                       'tilapia',
                       'codfish',
                       'sardine',
                       'anchovy',
                       'dorado fish',
                       'mackerel',
                       'tenggiri fish',
                       'snapper',
                       'basa fish',
             

In [1115]:
ingredient_df

Unnamed: 0,ingredient_id,ingredient_name
0,1,acai
1,2,agar-agar powder
2,3,all purpose flour
3,4,almond
4,5,almond butter
...,...,...
987,988,young beans
988,989,young coconut
989,990,young corn
990,991,zucchini


In [1116]:
def calculate_match_confidence(ingredient_name, keyword):
    """Calculate confidence score for partial matches."""
    confidence = 0.6
    if len(keyword) >= len(ingredient_name) * 0.5:
        confidence += 0.2
    if ingredient_name.startswith(keyword) or ingredient_name.endswith(keyword):
        confidence += 0.2
    if re.search(r'\b' + re.escape(keyword) + r'\b', ingredient_name):
        confidence += 0.1
    if len(keyword) <= 3 and len(ingredient_name) >= 10:
        confidence -= 0.2
    return min(0.95, max(0.0, confidence))

In [1117]:
def detect_allergen(ingredient_name, ALLERGEN_TAGS):
    """
    Detect allergen in an ingredient name using a two-step strategy:
    1. Check for exclusion terms
    2. Exact then partial keyword matching
    
    Returns:
        dict: {
            'allergen_group': str or None,
            'match_type': 'exact'/'partial' or None,
            'matched_keyword': str or None,
            'confidence': float
        }
    """
    # Normalize input
    ingredient_lower = ingredient_name.lower().strip()

    # Exclusion Terms
    exclusion_terms = ["breast milk", "breastmilk", "formula milk", "formula", "breast"]
    if any(term in ingredient_lower for term in exclusion_terms):
        return {
            'allergen_group': None,
            'match_type': None,
            'matched_keyword': None,
            'confidence': 0.0
        }

    # Step 1: Exact match
    for allergen_name, data in ALLERGEN_TAGS.items():
        keywords = data.get("keywords", [])
        allergen_group = data.get("allergen_group", allergen_name)
        for keyword in keywords:
            if keyword.lower().strip() == ingredient_lower:
                return {
                    'allergen_group': allergen_group,
                    'match_type': 'exact',
                    'matched_keyword': keyword,
                    'confidence': 1.0
                }

    # Step 2: Partial match
    best_match = None
    for allergen_name, data in ALLERGEN_TAGS.items():
        keywords = data.get("keywords", [])
        allergen_group = data.get("allergen_group", allergen_name)
        for keyword in keywords:
            keyword_lower = keyword.lower().strip()
            if keyword_lower in ingredient_lower:
                confidence = calculate_match_confidence(ingredient_lower, keyword_lower)
                if not best_match or confidence > best_match['confidence']:
                    best_match = {
                        'allergen_group': allergen_group,
                        'match_type': 'partial',
                        'matched_keyword': keyword,
                        'confidence': confidence
                    }

    if best_match and best_match['confidence'] >= 0.5:
        return best_match

    return {
        'allergen_group': None,
        'match_type': None,
        'matched_keyword': None,
        'confidence': 0.0
    }

In [1118]:
def get_allergen_group_id(allergen_group, allergen_mapping):
    """Map allergen group name to ID."""
    return allergen_mapping.get(allergen_group, None)

In [1119]:
def process_ingredients_with_allergens(df, allergen_df):

    allergen_mapping = dict(zip(allergen_df['name'], allergen_df['pk']))

    results = []
    for _, row in df.iterrows():
        result = detect_allergen(row['ingredient_name'],allergen_tags)
        result['ingredient_id'] = row['ingredient_id']
        result['ingredient_name'] = row['ingredient_name']
        result['allergen_group_id'] = get_allergen_group_id(result['allergen_group'], allergen_mapping)
        result['isAllergen'] = result['allergen_group_id'] is not None
        results.append(result)

    result_df = pd.DataFrame(results)
    return result_df[result_df.columns.tolist()]

In [1120]:
test_data = [
    {"ingredient_id": 1, "ingredient_name": "Whole Milk"},
    {"ingredient_id": 2, "ingredient_name": "Breast Milk"},
    {"ingredient_id": 3, "ingredient_name": "Almond Butter"},
    {"ingredient_id": 4, "ingredient_name": "Chicken Breast"},
    {"ingredient_id": 5, "ingredient_name": "Salmon Fillet"}
]
testing = pd.DataFrame(test_data)

allergen_df = pd.DataFrame(allergen_tags)

final_df = process_ingredients_with_allergens(testing, allergen_map)
print(final_df.to_string(index=False))
display(final_df)

allergen_group match_type matched_keyword  confidence  ingredient_id ingredient_name  allergen_group_id  isAllergen
         dairy      exact      whole milk         1.0              1      Whole Milk                1.0        True
          None       None            None         0.0              2     Breast Milk                NaN       False
     tree_nuts      exact   almond butter         1.0              3   Almond Butter                4.0        True
          None       None            None         0.0              4  Chicken Breast                NaN       False
          fish    partial          salmon         0.9              5   Salmon Fillet                6.0        True


Unnamed: 0,allergen_group,match_type,matched_keyword,confidence,ingredient_id,ingredient_name,allergen_group_id,isAllergen
0,dairy,exact,whole milk,1.0,1,Whole Milk,1.0,True
1,,,,0.0,2,Breast Milk,,False
2,tree_nuts,exact,almond butter,1.0,3,Almond Butter,4.0,True
3,,,,0.0,4,Chicken Breast,,False
4,fish,partial,salmon,0.9,5,Salmon Fillet,6.0,True


In [1121]:
ingredient_df =process_ingredients_with_allergens(testing, allergen_map)

display(ingredient_df.head(10))

Unnamed: 0,allergen_group,match_type,matched_keyword,confidence,ingredient_id,ingredient_name,allergen_group_id,isAllergen
0,dairy,exact,whole milk,1.0,1,Whole Milk,1.0,True
1,,,,0.0,2,Breast Milk,,False
2,tree_nuts,exact,almond butter,1.0,3,Almond Butter,4.0,True
3,,,,0.0,4,Chicken Breast,,False
4,fish,partial,salmon,0.9,5,Salmon Fillet,6.0,True


In [1122]:
final_ingredient_df = ingredient_df.copy()
final_ingredient_df = final_ingredient_df.rename(columns={
    'ingredient_id': 'pk',
    'ingredient_name': 'name',
    'allergen_group_id': 'allergen_group_id',
    'isAllergen': 'isAllergen'})

#final_ingredient_df drop column other than pk, name, allergen_group_id, isAllergen
final_ingredient_df = final_ingredient_df[['pk', 'name', 'allergen_group_id', 'isAllergen']]
display(final_ingredient_df.head(10))

Unnamed: 0,pk,name,allergen_group_id,isAllergen
0,1,Whole Milk,1.0,True
1,2,Breast Milk,,False
2,3,Almond Butter,4.0,True
3,4,Chicken Breast,,False
4,5,Salmon Fillet,6.0,True


#### Recipe Based To Get isHypo

In [1123]:
def detect_allergens(recipe_data, ALLERGEN_TAGS):
    ner_ingredients = recipe_data.get("ner_ingredient_string", [])
    # Normalize ingredients (make sure they're lowercase and clean)
    cleaned_ingredients = [str(i).strip().lower() for i in ner_ingredients if i]

    # Combine into one searchable string
    combined_text = ' '.join(cleaned_ingredients)
    matched_allergens = []

    # Define exclusion terms for breast milk and formula
    exclusion_terms = ["breast milk", "breastmilk", "formula milk", "formula", "breast"]
    # First check if the combined text contains any exclusion terms
    has_exclusion_terms = any(exclusion in combined_text for exclusion in exclusion_terms)
    for allergen, data in ALLERGEN_TAGS.items():
        keywords = [kw.lower() for kw in data["keywords"]]
        
        # For milk allergen specifically, skip entirely if exclusion terms found
        if allergen == "milk" and has_exclusion_terms:
            continue
            
        for keyword in keywords:
            if keyword in combined_text:
                # Check if this keyword match is actually part of an exclusion term
                is_excluded = False
                
                # For example, if "milk" is found but it's part of "breast milk"
                for exclusion in exclusion_terms:
                    # Check all possible positions where keyword could be within exclusion term
                    if (exclusion.startswith(keyword + " ") or 
                        exclusion.endswith(" " + keyword) or 
                        " " + keyword + " " in exclusion or 
                        exclusion == keyword):
                        
                        # Only exclude if this exact exclusion term is in the text
                        if exclusion in combined_text:
                            is_excluded = True
                            break
                
                if not is_excluded:
                    matched_allergens.append(allergen)
                    break  # No need to check other keywords for this allergen

    return list(set(matched_allergens))  # Remove duplicates

In [1124]:
df['allergen'] = df.apply(
    lambda row: detect_allergens(row, allergen_tags),
    axis=1
)

# Update hypoallergenic column based on allergen data
df['hypoallergenic'] = df['allergen'].apply(
    lambda allergens: "Yes" if not allergens or len(allergens) == 0 else "No"
)

# Display results to verify
df[['name', 'allergen', 'hypoallergenic', 'choking_hazards']].head(10)

Unnamed: 0,name,allergen,hypoallergenic,choking_hazards
0,Cassava Porridge with Fish Sauce and Lemon (Bu...,[],Yes,No
1,Bitterballs (Bitterballen),[],Yes,No
2,Broccoli/Cauliflower Cheese,[],Yes,No
3,Vegetable Fingers,[],Yes,No
4,Beef Casserole,[],Yes,No
5,Toast Fingers,[],Yes,No
6,Pumpkin Polenta Fingers,[],Yes,No
7,Rice Pudding,[],Yes,No
8,Hummus,[],Yes,No
9,Baked Bean Pie,[],Yes,No


### **Classify Category of The Recipe**

In [1125]:
# Load the module
import importlib

category_path = os.path.join(os.getcwd(), "preprocessing_techniques", "mappers", "category_dict.py")

spec = importlib.util.spec_from_file_location("category", category_path)
category_file = importlib.util.module_from_spec(spec)
spec.loader.exec_module(category_file)

# Access the config
dietary_tags  = category_file.dietary_tags
pprint(dietary_tags)

{'dairy_free': {'excluded_allergen_groups': ['milk'],
                'excluded_ingredients': ['cheese',
                                         'butter',
                                         'yogurt',
                                         'milk',
                                         'whey',
                                         'casein',
                                         'cream',
                                         'ice cream']},
 'egg_free': {'excluded_allergen_groups': ['egg'],
              'excluded_ingredients': ['eggs',
                                       'egg white',
                                       'egg yolk',
                                       'lecithin',
                                       'omelette']},
 'gluten_free': {'excluded_allergen_groups': ['gluten'],
                 'excluded_ingredients': ['wheat',
                                          'barley',
                                          'rye',
                        

In [1126]:
category_map = pd.read_excel('1st_dataset.xlsx', sheet_name='category')
print("Category columns:", category_map.columns.tolist())
print("Category shape:", category_map.shape)
print("\nFirst 5 rows of Category:")
display(category_map.head())

Category columns: ['pk', 'name', 'description']
Category shape: (11, 3)

First 5 rows of Category:


Unnamed: 0,pk,name,description
0,1,vegan,Allowed categories: plant_based\nExcluded alle...
1,2,vegetarian,"Allowed categories: plant_based, dairy, egg\nE..."
2,3,pescetarian,"Allowed categories: plant_based, dairy, egg, s..."
3,4,dairy free,Excluded allergen groups: milk\nExcluded ingre...
4,5,egg free,Excluded allergen groups: egg\nExcluded ingred...


In [1127]:
# FINAL PRODUCTION VERSION: classify_recipe with Non-Veg Fallback
def classify_recipe(recipe_data, dietary_tags):
    """
    FINAL VERSION: Classifies recipes with dietary tags using ner_ingredient_string directly.
    Includes fallback to 'non_veg' when no specific dietary category (vegan/vegetarian/pescetarian) matches.
    
    Args:
        recipe_data: Row containing recipe information with ner_ingredient_string
        dietary_tags: Dictionary with dietary classification rules
    
    Returns:
        String of comma-separated dietary tags
    """
    
    # Use ner_ingredient_string directly (clean comma-separated string)
    ingredient_string = str(recipe_data.get('ner_ingredient_string', ''))
    
    if not ingredient_string or ingredient_string.strip() == '':
        return ''
    
    # Convert to lowercase list for matching
    ingredients = [ing.strip().lower() for ing in ingredient_string.split(',') if ing.strip()]
    
    # Initialize tag categories
    general_diet = None
    religious_tag = None
    allergen_tags = []
    
    # Check each dietary category
    for category, rules in dietary_tags.items():
        # Get excluded ingredients from the rules
        excluded_ingredients = rules.get('excluded_ingredients', [])
        
        # Check exclusions - if any excluded ingredient is found, skip this tag
        exclude_found = False
        if excluded_ingredients:
            exclude_found = any(
                any(excl.lower() in ingredient.lower() for ingredient in ingredients)
                for excl in excluded_ingredients
            )
        
        # If not excluded, the tag applies
        if not exclude_found:
            print
            # Categorize tags
            if category in ['vegan', 'vegetarian', 'pescetarian']:
                # Only assign one general diet - take the most restrictive that applies
                if general_diet is None:
                    general_diet = category
                elif category == 'vegan' and general_diet != 'vegan':
                    general_diet = category  # Vegan is most restrictive
                elif category == 'vegetarian' and general_diet not in ['vegan']:
                    general_diet = category
            elif category in ['halal', 'kosher']:
                religious_tag = category
            else:
                allergen_tags.append(category)
    
    # 🔧 FALLBACK MECHANISM: If no specific dietary category matched, default to 'non_veg'
    if general_diet is None:
        general_diet = 'non_veg'
    
    # Combine tags: one general diet + one religious + multiple allergens
    final_tags = []
    if general_diet:
        final_tags.append(general_diet)
    if religious_tag:
        final_tags.append(religious_tag)
    final_tags.extend(allergen_tags)
    
    return ', '.join(final_tags)


In [1128]:
sample_row = df.iloc[0].copy()  
classified_tags = classify_recipe(sample_row, dietary_tags)
print("Classified Tags:")
print(classified_tags)

Classified Tags:
non_veg, halal, dairy_free, egg_free, soy_free, nut_free, gluten_free


In [1129]:
df['dietary_tags'] = df.apply(
    lambda row: classify_recipe(row, dietary_tags),
    axis=1)

df[['ner_ingredient', 'dietary_tags', 'choking_hazards']].head(10)

Unnamed: 0,ner_ingredient,dietary_tags,choking_hazards
0,"['cassava', 'fish', 'chicken', 'coconut oil', ...","non_veg, halal, dairy_free, egg_free, soy_free...",No
1,"['beef', 'potato starch', 'milk', 'egg', 'marg...","non_veg, halal, egg_free, soy_free, nut_free",No
2,"['cauliflower', 'broccoli', 'margarine', 'flou...","vegetarian, halal, egg_free, soy_free, nut_fre...",No
3,"['carrot', 'potato', 'sweet potato']","vegan, halal, dairy_free, egg_free, soy_free, ...",No
4,"['onion', 'vegetable oil', 'beef', 'steak', 'c...","non_veg, halal, dairy_free, egg_free, soy_free...",No
5,['wholemeal bread'],"vegan, halal, dairy_free, egg_free, soy_free, ...",No
6,"['water', 'polenta', 'pumpkin', 'parmesan chee...","vegetarian, halal, egg_free, soy_free, nut_fre...",No
7,"['rice', 'milk', 'sugar', 'vanilla essence']","vegetarian, halal, egg_free, soy_free, nut_fre...",No
8,"['chickpeas', 'garlic', 'lemon juice', 'milk',...","vegetarian, halal, egg_free, soy_free, nut_fre...",No
9,"['baked beans', 'zucchini', 'potatoes', 'milk'...","vegetarian, halal, egg_free, soy_free, nut_fre...",No


In [1130]:
df['dietary_tags'] = df['dietary_tags'].str.replace('_', ' ', regex=False)

df[['name', 'dietary_tags', 'choking_hazard']].head(10)

Unnamed: 0,name,dietary_tags,choking_hazard
0,Cassava Porridge with Fish Sauce and Lemon (Bu...,"non veg, halal, dairy free, egg free, soy free...",
1,Bitterballs (Bitterballen),"non veg, halal, egg free, soy free, nut free",
2,Broccoli/Cauliflower Cheese,"vegetarian, halal, egg free, soy free, nut fre...",
3,Vegetable Fingers,"vegan, halal, dairy free, egg free, soy free, ...",
4,Beef Casserole,"non veg, halal, dairy free, egg free, soy free...",
5,Toast Fingers,"vegan, halal, dairy free, egg free, soy free, ...",
6,Pumpkin Polenta Fingers,"vegetarian, halal, egg free, soy free, nut fre...",
7,Rice Pudding,"vegetarian, halal, egg free, soy free, nut fre...",
8,Hummus,"vegetarian, halal, egg free, soy free, nut fre...",
9,Baked Bean Pie,"vegetarian, halal, egg free, soy free, nut fre...",


In [1131]:
# 1. Show how many unique dietary tags combinations exist
unique_combinations = df['dietary_tags'].value_counts(dropna=False)
print("📊 Unique dietary tag combinations:")
print(unique_combinations)
print("-" * 50)

# 2. If you want to see them as individual tags instead:
def split_and_flatten(series):
    return series.str.split(",").explode().str.strip().str.lower()

# Get all unique dietary tags
unique_tags = split_and_flatten(df['dietary_tags']).dropna().drop_duplicates().sort_values()
print("\n✅ Unique dietary tags found:")
for tag in unique_tags:
    print(f" - {tag}")

# 3. Check if all values are the same
all_same = df['dietary_tags'].nunique(dropna=False) == 1
if all_same:
    sample_value = df['dietary_tags'].iloc[0]
    print(f"\n⚠️ Warning: All rows have the same dietary_tags value: '{sample_value}'")
else:
    print("\n✅ Dietary_tags vary across rows ✅")

📊 Unique dietary tag combinations:
dietary_tags
vegan, halal, dairy free, egg free, soy free, nut free, gluten free      282
vegetarian, halal, egg free, soy free, nut free, gluten free             271
non veg, halal, dairy free, egg free, soy free, nut free, gluten free    196
non veg, halal, dairy free, egg free, nut free, gluten free               70
non veg, halal, egg free, soy free, nut free, gluten free                 57
                                                                        ... 
non veg, dairy free, soy free, nut free, gluten free                       1
pescetarian, halal, nut free, gluten free                                  1
non veg, halal, egg free, nut free                                         1
non veg, egg free, soy free, nut free                                      1
pescetarian, halal, dairy free, soy free, nut free, gluten free            1
Name: count, Length: 63, dtype: int64
--------------------------------------------------

✅ Unique dietar

#### Map to the category id <> recipe_id

In [1132]:
# df = df.drop(columns=['dietary_tags_list'])
df[['recipe_id', 'name', 'dietary_tags']].head(10)

Unnamed: 0,recipe_id,name,dietary_tags
0,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,"non veg, halal, dairy free, egg free, soy free..."
1,2,Bitterballs (Bitterballen),"non veg, halal, egg free, soy free, nut free"
2,3,Broccoli/Cauliflower Cheese,"vegetarian, halal, egg free, soy free, nut fre..."
3,4,Vegetable Fingers,"vegan, halal, dairy free, egg free, soy free, ..."
4,5,Beef Casserole,"non veg, halal, dairy free, egg free, soy free..."
5,6,Toast Fingers,"vegan, halal, dairy free, egg free, soy free, ..."
6,7,Pumpkin Polenta Fingers,"vegetarian, halal, egg free, soy free, nut fre..."
7,8,Rice Pudding,"vegetarian, halal, egg free, soy free, nut fre..."
8,9,Hummus,"vegetarian, halal, egg free, soy free, nut fre..."
9,10,Baked Bean Pie,"vegetarian, halal, egg free, soy free, nut fre..."


In [1133]:
category_name_to_id = dict(zip(category_map['name'], category_map['pk']))
print(f"📊 Created mapping for {len(category_name_to_id)} categories from category_map")
print(f"   Sample mapping: {list(category_name_to_id.items())}")    

📊 Created mapping for 11 categories from category_map
   Sample mapping: [('vegan', 1), ('vegetarian', 2), ('pescetarian', 3), ('dairy free', 4), ('egg free', 5), ('soy free', 6), ('nut free', 7), ('gluten free', 8), ('halal', 9), ('non halal', 10), ('non veg', 11)]


In [1134]:
df_category = df.copy()
def parse_tags(tag_str):
    if isinstance(tag_str, str):
        return [tag.strip() for tag in tag_str.split(",")]
    elif isinstance(tag_str, list):
        return tag_str
    else:
        return []

df_category['dietary_tags_list'] = df_category['dietary_tags'].apply(parse_tags)
df_category[['recipe_id', 'name', 'dietary_tags', 'dietary_tags_list']].head(10) 

Unnamed: 0,recipe_id,name,dietary_tags,dietary_tags_list
0,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,"non veg, halal, dairy free, egg free, soy free...","[non veg, halal, dairy free, egg free, soy fre..."
1,2,Bitterballs (Bitterballen),"non veg, halal, egg free, soy free, nut free","[non veg, halal, egg free, soy free, nut free]"
2,3,Broccoli/Cauliflower Cheese,"vegetarian, halal, egg free, soy free, nut fre...","[vegetarian, halal, egg free, soy free, nut fr..."
3,4,Vegetable Fingers,"vegan, halal, dairy free, egg free, soy free, ...","[vegan, halal, dairy free, egg free, soy free,..."
4,5,Beef Casserole,"non veg, halal, dairy free, egg free, soy free...","[non veg, halal, dairy free, egg free, soy fre..."
5,6,Toast Fingers,"vegan, halal, dairy free, egg free, soy free, ...","[vegan, halal, dairy free, egg free, soy free,..."
6,7,Pumpkin Polenta Fingers,"vegetarian, halal, egg free, soy free, nut fre...","[vegetarian, halal, egg free, soy free, nut fr..."
7,8,Rice Pudding,"vegetarian, halal, egg free, soy free, nut fre...","[vegetarian, halal, egg free, soy free, nut fr..."
8,9,Hummus,"vegetarian, halal, egg free, soy free, nut fre...","[vegetarian, halal, egg free, soy free, nut fr..."
9,10,Baked Bean Pie,"vegetarian, halal, egg free, soy free, nut fre...","[vegetarian, halal, egg free, soy free, nut fr..."


In [1135]:
def get_category_ids(tags):
    return [category_name_to_id[tag] for tag in tags if tag in category_name_to_id]

df_category['category_ids'] = df_category['dietary_tags_list'].apply(get_category_ids)

# Step 4: Explode into one row per (recipe_id, category_id)
recipe_category_df = df_category[['recipe_id', 'category_ids']].explode('category_ids')
recipe_category_df.columns = ['recipe_id', 'category_id']

# Step 5: Drop rows where no matching category was found
recipe_category_df.dropna(subset=['category_id'], inplace=True)
recipe_category_df['category_id'] = recipe_category_df['category_id'].astype(int)

# Optional: Reset index
recipe_category_df.reset_index(drop=True, inplace=True)

In [1136]:
display(df_category[['recipe_id', 'name', 'dietary_tags', 'category_ids']].head(10))

display(recipe_category_df[['recipe_id', 'category_id']].head(10))

Unnamed: 0,recipe_id,name,dietary_tags,category_ids
0,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,"non veg, halal, dairy free, egg free, soy free...","[11, 9, 4, 5, 6, 7, 8]"
1,2,Bitterballs (Bitterballen),"non veg, halal, egg free, soy free, nut free","[11, 9, 5, 6, 7]"
2,3,Broccoli/Cauliflower Cheese,"vegetarian, halal, egg free, soy free, nut fre...","[2, 9, 5, 6, 7, 8]"
3,4,Vegetable Fingers,"vegan, halal, dairy free, egg free, soy free, ...","[1, 9, 4, 5, 6, 7, 8]"
4,5,Beef Casserole,"non veg, halal, dairy free, egg free, soy free...","[11, 9, 4, 5, 6, 7, 8]"
5,6,Toast Fingers,"vegan, halal, dairy free, egg free, soy free, ...","[1, 9, 4, 5, 6, 7]"
6,7,Pumpkin Polenta Fingers,"vegetarian, halal, egg free, soy free, nut fre...","[2, 9, 5, 6, 7, 8]"
7,8,Rice Pudding,"vegetarian, halal, egg free, soy free, nut fre...","[2, 9, 5, 6, 7, 8]"
8,9,Hummus,"vegetarian, halal, egg free, soy free, nut fre...","[2, 9, 5, 6, 7, 8]"
9,10,Baked Bean Pie,"vegetarian, halal, egg free, soy free, nut fre...","[2, 9, 5, 6, 7, 8]"


Unnamed: 0,recipe_id,category_id
0,1,11
1,1,9
2,1,4
3,1,5
4,1,6
5,1,7
6,1,8
7,2,11
8,2,9
9,2,5


### **Finalize Recipe Data Frame**

In [1137]:
df.columns.to_list()

['name',
 'ingredients',
 'ner_ingredient',
 'instructions',
 'min_age',
 'max_age',
 'texture',
 'prep_time',
 'cook_time',
 'serving',
 'origin',
 'recipe_link',
 'credibility',
 'image_link',
 'region',
 'difficulty',
 'meal_type',
 'description',
 'dietary_tags',
 'choking_hazard',
 'tips',
 'allergen',
 'hypoallergenic',
 'nutrition_value',
 'ID',
 'Energy / Calorie',
 'Carbohydrate (g)',
 'Protein (g)',
 'Fat (g)',
 'List of Micros',
 None,
 None,
 None,
 None,
 None,
 'ner_ingredient_list',
 'ner_ingredient_cleaned',
 'ner_ingredient_string',
 'origin_id',
 'mapped_region',
 'choking_hazards',
 'recipe_id']

In [1138]:
# 1. Make a copy to avoid modifying original df
recipes_df = df.copy()

# 2. List of columns to exclude (without None)
columns_to_exclude = [
    'ner_ingredient', 'ner_ingredient_list', 'ner_ingredient_cleaned',
    'ner_ingredient_string', 'mapped_region', 'nutrition_value', 'ID',
    'Energy / Calorie', 'Carbohydrate (g)', 'Protein (g)', 'Fat (g)',
    'List of Micros', 'dietary_tags', 'choking_hazard', 'allergen', 
    'origin', 'region'
]

# 3. Find existing columns to exclude (regular named columns)
existing_cols_to_exclude = [col for col in columns_to_exclude if col in recipes_df.columns]

# 4. Find None columns separately
none_columns = [col for col in recipes_df.columns if col is None]

# 5. Combine both lists
all_cols_to_exclude = existing_cols_to_exclude + none_columns

# 6. Drop ALL unwanted columns (including None columns)
recipes = recipes_df.drop(columns=all_cols_to_exclude)

# 7. Show result
print(f"✅ Regular columns removed: {existing_cols_to_exclude}")
print(f"✅ None columns removed: {none_columns}")
print(f"✅ Total columns removed: {len(all_cols_to_exclude)}")
print(f"📊 New DataFrame shape: {recipes.shape}")
print("\n📋 Remaining columns:")
display(recipes.columns.tolist())
print("\n📋 First 10 rows:")
display(recipes.head(10))

✅ Regular columns removed: ['ner_ingredient', 'ner_ingredient_list', 'ner_ingredient_cleaned', 'ner_ingredient_string', 'mapped_region', 'nutrition_value', 'ID', 'Energy / Calorie', 'Carbohydrate (g)', 'Protein (g)', 'Fat (g)', 'List of Micros', 'dietary_tags', 'choking_hazard', 'allergen', 'origin', 'region']
✅ None columns removed: [None, None, None, None, None]
✅ Total columns removed: 22
📊 New DataFrame shape: (1322, 20)

📋 Remaining columns:


['name',
 'ingredients',
 'instructions',
 'min_age',
 'max_age',
 'texture',
 'prep_time',
 'cook_time',
 'serving',
 'recipe_link',
 'credibility',
 'image_link',
 'difficulty',
 'meal_type',
 'description',
 'tips',
 'hypoallergenic',
 'origin_id',
 'choking_hazards',
 'recipe_id']


📋 First 10 rows:


Unnamed: 0,name,ingredients,instructions,min_age,max_age,texture,prep_time,cook_time,serving,recipe_link,credibility,image_link,difficulty,meal_type,description,tips,hypoallergenic,origin_id,choking_hazards,recipe_id
0,Cassava Porridge with Fish Sauce and Lemon (Bu...,"- 60 g cassava, boiled and blended\n- 20 g fis...","Broth:\n1. Use chicken bones, chicken feet, fi...",6,8,NONE,15.0,45.0,1,https://eprints.uad.ac.id/51598/1/Buku%20Makan...,,,Medium,,,,Yes,9,No,1
1,Bitterballs (Bitterballen),- 100 g beef mince \n- 30 g potato starch \n- ...,1. Stir-fry blended spices until fragrant. \n2...,9,11,family food,30.0,30.0,10 servings,pdfcoffee.com_mommyclopedia-78-resep-mpasi-pdf...,,,Hard,Snack,,,Yes,9,No,2
2,Broccoli/Cauliflower Cheese,"- 175g cauliflower/broccoli, cut into pieces\n...","1. Steam, boil, or microwave cauliflower/brocc...",6,12,NONE,10.0,20.0,,https://www.islhd.health.nsw.gov.au/sites/defa...,Illawarra Shoalhaven Local Health District (IS...,,Medium,,,,Yes,68,No,3
3,Vegetable Fingers,"- 1 carrot, potato, or sweet potato, peeled an...",1. Steam or microwave vegetables until tender....,6,12,NONE,5.0,10.0,,https://www.islhd.health.nsw.gov.au/sites/defa...,Illawarra Shoalhaven Local Health District (IS...,,Easy,,,,Yes,68,No,4
4,Beef Casserole,"- 1 onion, peeled and finely chopped\n- 1½ tab...",1. Preheat oven to 180°C.\n2. Heat oil in a me...,6,12,family food,10.0,2.0,,https://www.islhd.health.nsw.gov.au/sites/defa...,Illawarra Shoalhaven Local Health District (IS...,,Medium,,,,Yes,68,No,5
5,Toast Fingers,- 1 slice thick wholemeal bread,1. Toast bread and allow it to cool.\n2. Cut i...,6,12,NONE,5.0,10.0,,https://www.islhd.health.nsw.gov.au/sites/defa...,Illawarra Shoalhaven Local Health District (IS...,,Easy,,,,Yes,68,No,6
6,Pumpkin Polenta Fingers,"- 3 cups water\n- 1 cup polenta (a coarse, gra...",1. Bring water to the boil in a large saucepan...,6,12,NONE,10.0,40.0,8,https://www.islhd.health.nsw.gov.au/sites/defa...,Illawarra Shoalhaven Local Health District (IS...,,Medium,,,,Yes,68,No,7
7,Rice Pudding,- 1 cup cooked rice\n- 1 cup milk\n- ½ - 1 tab...,"1. In a saucepan mix together rice, milk, and ...",6,12,NONE,5.0,15.0,,https://www.islhd.health.nsw.gov.au/sites/defa...,Illawarra Shoalhaven Local Health District (IS...,,Easy,,,,Yes,68,No,8
8,Hummus,"- 400g canned chickpeas, drained\n- 1 clove ga...",1. Combine all ingredients.\n2. Mash or puree ...,12,12,NONE,5.0,10.0,2 cups,https://www.islhd.health.nsw.gov.au/sites/defa...,Illawarra Shoalhaven Local Health District (IS...,,Medium,,,,Yes,68,No,9
9,Baked Bean Pie,- 820g canned baked beans\n- 1 medium zucchini...,1. Preheat oven to 180°C.\n2. Place baked bean...,12,12,NONE,10.0,30.0,,https://www.islhd.health.nsw.gov.au/sites/defa...,Illawarra Shoalhaven Local Health District (IS...,,Medium,,,,Yes,68,No,10


In [1139]:
recipes_df.shape

(1322, 42)

In [1140]:
#finalize the recipe_Df

### **Populating Nutrition Value**

In [1141]:
original_df = df.copy()

In [1142]:
# Check which records have empty nutrition_value
empty_nutrition = df['nutrition_value'].isna() | (df['nutrition_value'] == '') | (df['nutrition_value'] == 'None')
print(f"Records with empty nutrition_value: {empty_nutrition.sum()} out of {len(df)}")


# Function to extract ingredient information
def extract_ingredient_info(ingredient_text):
    """Extract quantity, measurement, and ingredient name from ingredient text"""
    if pd.isna(ingredient_text) or ingredient_text == '':
        return []
    
    # Split by lines and clean
    lines = [line.strip() for line in ingredient_text.split('\n') if line.strip()]
    
    extracted_ingredients = []
    
    for line in lines:
        # Remove leading dash or bullet points
        line = re.sub(r'^[-•*]\s*', '', line)
        
        # Pattern to match quantity, measurement, and ingredient
        # Examples: "60 g cassava", "2-3 tablespoons of plain yogurt", "1½ tablespoons vegetable oil"
        patterns = [
            # Pattern 1: Range + unit + "of" + ingredient (e.g., "2-3 tablespoons of plain yogurt")
            r'^([0-9]+(?:[.,][0-9]+)?(?:[½¼¾])?[-–][0-9]+(?:[.,][0-9]+)?(?:[½¼¾])?)\s*([a-zA-Z]+)\s+of\s+(.+)$',
            
            # Pattern 2: Range + unit + ingredient (e.g., "2-3 tablespoons plain yogurt")
            r'^([0-9]+(?:[.,][0-9]+)?(?:[½¼¾])?[-–][0-9]+(?:[.,][0-9]+)?(?:[½¼¾])?)\s*([a-zA-Z]+)\s+(.+)$',
            
            # Pattern 3: Number + unit + "of" + ingredient (e.g., "30 g of sweet potato")
            r'^([0-9]+(?:[.,][0-9]+)?(?:[½¼¾])?)\s*([a-zA-Z]+)\s+of\s+(.+)$',
            
            # Pattern 4: Number + unit + ingredient (e.g., "60 g cassava", "175g cauliflower")
            r'^([0-9]+(?:[.,][0-9]+)?(?:[½¼¾])?)\s*([a-zA-Z]+)\s+(.+)$',
            
            # Pattern 5: Fraction + unit + "of" + ingredient (e.g., "1½ tablespoons of oil")
            r'^([0-9]*[½¼¾][0-9]*)\s*([a-zA-Z]+)\s+of\s+(.+)$',
            
            # Pattern 6: Fraction + unit + ingredient (e.g., "1½ tablespoons oil")
            r'^([0-9]*[½¼¾][0-9]*)\s*([a-zA-Z]+)\s+(.+)$',
            
            # Pattern 7: Range + ingredient (no unit) (e.g., "2-3 carrots")
            r'^([0-9]+(?:[.,][0-9]+)?(?:[½¼¾])?[-–][0-9]+(?:[.,][0-9]+)?(?:[½¼¾])?)\s+(.+)$',
            
            # Pattern 8: Number + ingredient (no unit) (e.g., "1 carrot", "1 onion")
            r'^([0-9]+(?:[.,][0-9]+)?(?:[½¼¾])?)\s+(.+)$',
            
            # Pattern 9: Just ingredient (no quantity/unit)
            r'^(.+)$'
        ]
        
        quantity = None
        measurement = None
        ingredient_name = None
        
        for i, pattern in enumerate(patterns):
            match = re.match(pattern, line, re.IGNORECASE)
            if match:
                if i == 0:  # Pattern 1: range + unit + "of" + ingredient
                    quantity = match.group(1)
                    measurement = match.group(2)
                    ingredient_name = match.group(3).strip()
                elif i == 1:  # Pattern 2: range + unit + ingredient
                    quantity = match.group(1)
                    measurement = match.group(2)
                    ingredient_name = match.group(3).strip()
                elif i == 2:  # Pattern 3: number + unit + "of" + ingredient
                    quantity = match.group(1)
                    measurement = match.group(2)
                    ingredient_name = match.group(3).strip()
                elif i == 3:  # Pattern 4: number + unit + ingredient
                    quantity = match.group(1)
                    measurement = match.group(2)
                    ingredient_name = match.group(3).strip()
                elif i == 4:  # Pattern 5: fraction + unit + "of" + ingredient
                    quantity = match.group(1)
                    measurement = match.group(2)
                    ingredient_name = match.group(3).strip()
                elif i == 5:  # Pattern 6: fraction + unit + ingredient
                    quantity = match.group(1)
                    measurement = match.group(2)
                    ingredient_name = match.group(3).strip()
                elif i == 6:  # Pattern 7: range + ingredient (no unit)
                    quantity = match.group(1)
                    measurement = None
                    ingredient_name = match.group(2).strip()
                elif i == 7:  # Pattern 8: number + ingredient (no unit)
                    quantity = match.group(1)
                    measurement = None
                    ingredient_name = match.group(2).strip()
                else:  # Pattern 9: just ingredient
                    quantity = None
                    measurement = None
                    ingredient_name = match.group(1).strip()
                break
        
        # Clean up ingredient name (remove extra descriptions after comma)
        if ingredient_name:
            # Remove descriptions after comma (e.g., "cassava, boiled and blended" -> "cassava")
            ingredient_name = ingredient_name.split(',')[0].strip()
            
            extracted_ingredients.append({
                'original_text': line,
                'quantity': quantity,
                'measurement': measurement,
                'ingredient_name': ingredient_name
            })
    
    return extracted_ingredients



Records with empty nutrition_value: 446 out of 1322


In [1143]:
# Test the function with specific examples
print("=== TESTING INGREDIENT EXTRACTION WITH RANGE FORMATS ===")

test_ingredients = [
    "2-3 tablespoons of plain yogurt",
    "1-2 teaspoons of vanilla extract", 
    "30 g of sweet potato",
    "60 g cassava",
    "1½ tablespoons vegetable oil",
    "2-3 carrots",
    "1 onion",
    "plain water"
]

for test_ingredient in test_ingredients:
    print(f"\nTesting: '{test_ingredient}'")
    result = extract_ingredient_info(test_ingredient)
    if result:
        for r in result:
            print(f"  ✅ Quantity: {r['quantity']}, Measurement: {r['measurement']}, Ingredient: {r['ingredient_name']}")
    else:
        print(f"  ❌ No match found")


=== TESTING INGREDIENT EXTRACTION WITH RANGE FORMATS ===

Testing: '2-3 tablespoons of plain yogurt'
  ✅ Quantity: 2-3, Measurement: tablespoons, Ingredient: plain yogurt

Testing: '1-2 teaspoons of vanilla extract'
  ✅ Quantity: 1-2, Measurement: teaspoons, Ingredient: vanilla extract

Testing: '30 g of sweet potato'
  ✅ Quantity: 30, Measurement: g, Ingredient: sweet potato

Testing: '60 g cassava'
  ✅ Quantity: 60, Measurement: g, Ingredient: cassava

Testing: '1½ tablespoons vegetable oil'
  ✅ Quantity: 1½, Measurement: tablespoons, Ingredient: vegetable oil

Testing: '2-3 carrots'
  ✅ Quantity: 2-3, Measurement: None, Ingredient: carrots

Testing: '1 onion'
  ✅ Quantity: 1, Measurement: None, Ingredient: onion

Testing: 'plain water'
  ✅ Quantity: None, Measurement: None, Ingredient: plain water


In [1144]:
df_empty_nutrition = df[df['nutrition_value'].isna() | (df['nutrition_value'] == '') | (df['nutrition_value'] == 'None')].copy()

print(f"\nExtracting ingredient information for {len(df_empty_nutrition)} records with empty nutrition_value...")
df_empty_nutrition['extracted_ingredients'] = df_empty_nutrition['ingredients'].apply(extract_ingredient_info)

df_empty_nutrition[['ingredients','extracted_ingredients']]



Extracting ingredient information for 446 records with empty nutrition_value...


Unnamed: 0,ingredients,extracted_ingredients
0,"- 60 g cassava, boiled and blended\n- 20 g fis...","[{'original_text': '60 g cassava, boiled and b..."
1,- 100 g beef mince \n- 30 g potato starch \n- ...,"[{'original_text': '100 g beef mince', 'quanti..."
2,"- 175g cauliflower/broccoli, cut into pieces\n...","[{'original_text': '175g cauliflower/broccoli,..."
3,"- 1 carrot, potato, or sweet potato, peeled an...","[{'original_text': '1 carrot, potato, or sweet..."
4,"- 1 onion, peeled and finely chopped\n- 1½ tab...","[{'original_text': '1 onion, peeled and finely..."
...,...,...
1317,- 50 g berries (you can use a mix of frozen be...,[{'original_text': '50 g berries (you can use ...
1318,"- 3 small mushrooms, finely chopped\n- ½ cup b...","[{'original_text': '3 small mushrooms, finely ..."
1319,- 1 cup pasta\n- 1 tablespoon margarine\n- 1 t...,"[{'original_text': '1 cup pasta', 'quantity': ..."
1320,- ¼ cup sugar\n- 1 cup milk\n- 1 egg\n- 2 tabl...,"[{'original_text': '¼ cup sugar', 'quantity': ..."


In [1145]:
# Display sample extractions
print("\nSample ingredient extractions:")
for idx, row in df_empty_nutrition.head(3).iterrows():
    print(f"\n--- Recipe: {row['name']} ---")
    print(f"Recipe ID: {row['recipe_id']}")
    print(f"Original ingredients:\n{row['ingredients']}")
    print("\nExtracted ingredients:")
    for ing in row['extracted_ingredients']:
        print(f"  - Quantity: {ing['quantity']}, Measurement: {ing['measurement']}, Ingredient: {ing['ingredient_name']}")
        print(f"    Original: {ing['original_text']}")



Sample ingredient extractions:

--- Recipe: Cassava Porridge with Fish Sauce and Lemon (Bubur Singkong Kukuruyuk Saus Jeruk) ---
Recipe ID: 1
Original ingredients:
- 60 g cassava, boiled and blended
- 20 g fish meat (milkfish), finely chopped
- 10 g chicken meat
- 5 g coconut oil
- 100 cc chicken broth
- 10 g fresh lime juice
- 20 g spinach, finely chopped

Extracted ingredients:
  - Quantity: 60, Measurement: g, Ingredient: cassava
    Original: 60 g cassava, boiled and blended
  - Quantity: 20, Measurement: g, Ingredient: fish meat (milkfish)
    Original: 20 g fish meat (milkfish), finely chopped
  - Quantity: 10, Measurement: g, Ingredient: chicken meat
    Original: 10 g chicken meat
  - Quantity: 5, Measurement: g, Ingredient: coconut oil
    Original: 5 g coconut oil
  - Quantity: 100, Measurement: cc, Ingredient: chicken broth
    Original: 100 cc chicken broth
  - Quantity: 10, Measurement: g, Ingredient: fresh lime juice
    Original: 10 g fresh lime juice
  - Quantity: 20, 

In [1146]:
# Create a detailed breakdown DataFrame
ingredient_details = []
for idx, row in df_empty_nutrition.iterrows():
    recipe_name = row['name']
    recipe_id = row['recipe_id']
    for ing in row['extracted_ingredients']:
        ingredient_details.append({
            'recipe_id': recipe_id,
            'recipe_name': recipe_name,
            'quantity': ing['quantity'],
            'measurement': ing['measurement'],
            'ingredient_name': ing['ingredient_name'],
            'original_text': ing['original_text']
        })

ingredient_breakdown_df = pd.DataFrame(ingredient_details)
print(f"\nCreated ingredient breakdown with {len(ingredient_breakdown_df)} ingredient entries")
print(f"From {len(df_empty_nutrition)} recipes with empty nutrition values")




Created ingredient breakdown with 2572 ingredient entries
From 446 recipes with empty nutrition values


In [1147]:
#print new df
ingredient_breakdown_df.head()


Unnamed: 0,recipe_id,recipe_name,quantity,measurement,ingredient_name,original_text
0,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,60,g,cassava,"60 g cassava, boiled and blended"
1,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,20,g,fish meat (milkfish),"20 g fish meat (milkfish), finely chopped"
2,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,10,g,chicken meat,10 g chicken meat
3,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,5,g,coconut oil,5 g coconut oil
4,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,100,cc,chicken broth,100 cc chicken broth


In [1148]:
# Show summary statistics
print("\nSummary of extracted measurements:")
if len(ingredient_breakdown_df) > 0:
    print(ingredient_breakdown_df['measurement'].value_counts().head(10))
    
    print("\nMost common ingredients:")
    print(ingredient_breakdown_df['ingredient_name'].value_counts().head(10))


Summary of extracted measurements:
measurement
g              556
tsp            181
small          148
tbsp           130
cup             85
ml              79
medium          73
tablespoons     47
teaspoon        45
tablespoon      43
Name: count, dtype: int64

Most common ingredients:
ingredient_name
water            69
vegetable oil    65
onion            48
egg              38
carrot           35
olive oil        34
butter           29
potato           25
eggs             25
sugar            20
Name: count, dtype: int64


In [1149]:
# Function to convert measurements to grams
def convert_to_grams(quantity, measurement, ingredient_name):
    """
    Convert quantity and measurement to grams based on common cooking conversions.
    Returns converted quantity in grams and 'g' as measurement.
    """
    if not quantity or not measurement:
        return quantity, measurement
    
    # Clean and parse quantity (handle ranges and fractions)
    def parse_quantity(qty_str):
        if not qty_str:
            return 1.0
        
        # Handle fractions
        fraction_map = {'½': 0.5, '¼': 0.25, '¾': 0.75, '⅓': 0.33, '⅔': 0.67}
        for frac, val in fraction_map.items():
            qty_str = str(qty_str).replace(frac, str(val))
        
        # Handle ranges (take average)
        if '-' in str(qty_str) or '–' in str(qty_str):
            parts = re.split(r'[-–]', str(qty_str))
            if len(parts) == 2:
                try:
                    min_val = float(parts[0].strip())
                    max_val = float(parts[1].strip())
                    return (min_val + max_val) / 2
                except:
                    pass
        
        # Convert to float
        try:
            return float(qty_str)
        except:
            return 1.0
    
    parsed_qty = parse_quantity(quantity)
    measurement_lower = str(measurement).lower()
    
    # Conversion factors to grams
    conversions = {
        # Volume conversions (approximate for common ingredients)
        'tsp': 5,           # 1 tsp ≈ 5g (for most liquids/powders)
        'teaspoon': 5,
        'teaspoons': 5,
        'tbsp': 15,         # 1 tbsp ≈ 15g
        'tablespoon': 15,
        'tablespoons': 15,
        'cup': 240,         # 1 cup ≈ 240g (for liquids)
        'cups': 240,
        'ml': 1,            # 1ml ≈ 1g (for water-based liquids)
        'milliliters': 1,
        'milliliter': 1,
        'l': 1000,          # 1 liter = 1000g
        'liter': 1000,
        'liters': 1000,
        
        # Weight conversions
        'g': 1,             # already in grams
        'gram': 1,
        'grams': 1,
        'kg': 1000,         # 1 kg = 1000g
        'kilogram': 1000,
        'kilograms': 1000,
        'oz': 28.35,        # 1 oz ≈ 28.35g
        'ounce': 28.35,
        'ounces': 28.35,
        'lb': 453.6,        # 1 lb ≈ 453.6g
        'pound': 453.6,
        'pounds': 453.6,
        'cc': 1,           
        
        # Piece conversions (rough estimates)
        'small': 50,        # small piece ≈ 50g
        'medium': 100,      # medium piece ≈ 100g
        'large': 150,       # large piece ≈ 150g
        'piece': 75,        # average piece ≈ 75g
        'pieces': 75,
        'clove': 3,         # garlic clove ≈ 3g
        'cloves': 3,
    }
    
    # Convert to grams
    if measurement_lower in conversions:
        converted_qty = parsed_qty * conversions[measurement_lower]
        return round(converted_qty, 1), 'g'
    else:
        # If measurement not found, return original
        return quantity, measurement



In [1030]:
# Test the conversion function
print("Testing conversion function:")
test_cases = [
    ('2', 'tbsp', 'vegetable oil'),
    ('1', 'cup', 'flour'),
    ('1', 'cc', 'salt'),
    ('100', 'ml', 'water'),
    ('1', 'medium', 'onion'),
    ('2-3', 'tsp', 'sugar'),
    ('½', 'cup', 'butter')
]

# for qty, measure, ingredient in test_cases:
#     # new_qty, new_measure = convert_to_grams(qty, measure, ingredient)
#     print(f"{qty} {measure} {ingredient} → {new_qty} {new_measure}")


Testing conversion function:


In [1150]:
# Apply conversion to all ingredients in the dataframe
print("Converting all measurements to grams...")

# Create new columns for converted values
ingredient_breakdown_df['original_quantity'] = ingredient_breakdown_df['quantity'].copy()
ingredient_breakdown_df['original_measurement'] = ingredient_breakdown_df['measurement'].copy()

# Apply conversion
conversion_results = ingredient_breakdown_df.apply(
    lambda row: convert_to_grams(row['quantity'], row['measurement'], row['ingredient_name']), 
    axis=1
)

# Update the dataframe with converted values
ingredient_breakdown_df['quantity'] = [result[0] for result in conversion_results]
ingredient_breakdown_df['measurement'] = [result[1] for result in conversion_results]

print(f"\nConversion completed for {len(ingredient_breakdown_df)} ingredients")

# Show sample conversions
print("\nSample conversions:")
sample_conversions = ingredient_breakdown_df[ingredient_breakdown_df['original_measurement'].notna()].head(10)
for idx, row in sample_conversions.iterrows():
    print(f"{row['original_quantity']} {row['original_measurement']} {row['ingredient_name']} → {row['quantity']} {row['measurement']}")

# Show new measurement distribution
print("\nNew measurement distribution:")
print(ingredient_breakdown_df['measurement'].value_counts())



Converting all measurements to grams...

Conversion completed for 2572 ingredients

Sample conversions:
60 g cassava → 60.0 g
20 g fish meat (milkfish) → 20.0 g
10 g chicken meat → 10.0 g
5 g coconut oil → 5.0 g
100 cc chicken broth → 100.0 g
10 g fresh lime juice → 10.0 g
20 g spinach → 20.0 g
100 g beef mince → 100.0 g
30 g potato starch → 30.0 g
300 ml milk → 300.0 g

New measurement distribution:
measurement
g           1548
heaped        16
knob          14
pinch         12
level         11
            ... 
lamb           1
pot            1
macarons       1
knobs          1
leaves         1
Name: count, Length: 126, dtype: int64


In [1151]:
#print new df
ingredient_breakdown_df.head()

Unnamed: 0,recipe_id,recipe_name,quantity,measurement,ingredient_name,original_text,original_quantity,original_measurement
0,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,60.0,g,cassava,"60 g cassava, boiled and blended",60,g
1,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,20.0,g,fish meat (milkfish),"20 g fish meat (milkfish), finely chopped",20,g
2,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,10.0,g,chicken meat,10 g chicken meat,10,g
3,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,5.0,g,coconut oil,5 g coconut oil,5,g
4,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,100.0,g,chicken broth,100 cc chicken broth,100,cc


In [1152]:
# Filter ingredients that have complete quantity and measurement data
print("Filtering ingredients with complete quantity and measurement data...")

# Check current data completeness
print(f"\nTotal ingredients in breakdown: {len(ingredient_breakdown_df)}")
print(f"Ingredients with quantity: {ingredient_breakdown_df['quantity'].notna().sum()}")
print(f"Ingredients with measurement: {ingredient_breakdown_df['measurement'].notna().sum()}")
print(f"Ingredients with both quantity and measurement: {(ingredient_breakdown_df['quantity'].notna() & ingredient_breakdown_df['measurement'].notna()).sum()}")

# Create mask for complete data (both quantity and measurement are not null)
complete_data_mask = (
    ingredient_breakdown_df['quantity'].notna() & 
    ingredient_breakdown_df['measurement'].notna() &
    (ingredient_breakdown_df['quantity'] != '') &
    (ingredient_breakdown_df['measurement'] != '')
)

# Filter dataframe to only include ingredients with complete data
ingredients_complete = ingredient_breakdown_df[complete_data_mask].copy()

print(f"\nIngredients with complete quantity and measurement: {len(ingredients_complete)}")
print(f"Percentage of complete data: {len(ingredients_complete) / len(ingredient_breakdown_df) * 100:.1f}%")

# Show sample of complete ingredients
print("\nSample of ingredients with complete data:")
print(ingredients_complete[['ingredient_name', 'quantity', 'measurement']].head(10))


Filtering ingredients with complete quantity and measurement data...

Total ingredients in breakdown: 2572
Ingredients with quantity: 2061
Ingredients with measurement: 1895
Ingredients with both quantity and measurement: 1895

Ingredients with complete quantity and measurement: 1895
Percentage of complete data: 73.7%

Sample of ingredients with complete data:
        ingredient_name quantity measurement
0               cassava     60.0           g
1  fish meat (milkfish)     20.0           g
2          chicken meat     10.0           g
3           coconut oil      5.0           g
4         chicken broth    100.0           g
5      fresh lime juice     10.0           g
6               spinach     20.0           g
7            beef mince    100.0           g
8         potato starch     30.0           g
9                  milk    300.0           g


In [1153]:
ingredient_breakdown_df[complete_data_mask]

Unnamed: 0,recipe_id,recipe_name,quantity,measurement,ingredient_name,original_text,original_quantity,original_measurement
0,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,60.0,g,cassava,"60 g cassava, boiled and blended",60,g
1,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,20.0,g,fish meat (milkfish),"20 g fish meat (milkfish), finely chopped",20,g
2,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,10.0,g,chicken meat,10 g chicken meat,10,g
3,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,5.0,g,coconut oil,5 g coconut oil,5,g
4,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,100.0,g,chicken broth,100 cc chicken broth,100,cc
...,...,...,...,...,...,...,...,...
2567,1322,Apple Crumble,60.0,g,wholemeal self-raising flour,¼ cup wholemeal self-raising flour,¼,cup
2568,1322,Apple Crumble,60.0,g,brown sugar,¼ cup brown sugar,¼,cup
2569,1322,Apple Crumble,60.0,g,coconut,¼ cup coconut,¼,cup
2570,1322,Apple Crumble,60.0,g,rolled oats,¼ cup rolled oats,¼,cup


In [1035]:
ingredient_counts = ingredients_complete['ingredient_name'].value_counts()
print(f"Total unique ingredients: {len(ingredient_counts)}")
print("\nMost frequently used ingredients:")
print(ingredient_counts.head(15))

# Create a dataframe with ingredient frequencies
unique_ingredients_with_counts = ingredient_counts.reset_index()
unique_ingredients_with_counts.columns = ['ingredient_name', 'frequency']
print(f"\nCreated dataframe with {len(unique_ingredients_with_counts)} unique ingredients")



Total unique ingredients: 873

Most frequently used ingredients:
ingredient_name
water                   68
vegetable oil           65
onion                   41
olive oil               34
butter                  29
carrot                  23
sugar                   20
garlic                  19
frozen peas             16
potatoes                14
vegetable oil spread    13
flour                   13
lemon juice             12
milk                    12
potato                  12
Name: count, dtype: int64

Created dataframe with 873 unique ingredients


In [1036]:

unique_ingredients_with_counts


Unnamed: 0,ingredient_name,frequency
0,water,68
1,vegetable oil,65
2,onion,41
3,olive oil,34
4,butter,29
...,...,...
868,Natto,1
869,Japanese rice *steamed,1
870,dashi (baby-safe,1
871,udon (preferably thin,1


##### **Requesting to USDA Food Central**

In [None]:
import requests
from tqdm import tqdm
import requests
import time

api_url = "https://api.nal.usda.gov/fdc/v1/foods/search"
api_key = "KulngHmZ1nJeaPPBrZ8pH3kyJI2Gy1r9Xm121YO9"

def get_nutrition_data(query: str, api_key: str):
    """
    Fetch nutrition data for a specific ingredient from USDA API with prioritized search strategy.
    
    Search Strategy:
    1. Foundation exact match
    2. Foundation first result
    3. Survey (FNDDS) exact match  
    4. Survey (FNDDS) first result
    
    Returns:
    dict: Nutrition information including energy, macronutrients, top 3 micronutrients by value,
          and search method used
    """
    
    base_url = "https://api.nal.usda.gov/fdc/v1/foods/search"
    
    def search_with_params(data_type):
        """Helper function to search with specific parameters"""
        params = {
            'query': query,
            'api_key': api_key,
            'dataType': [data_type],
            'pageSize': 25,  # Get more results for better matching
            'pageNumber': 1,
            'sortBy': 'dataType.keyword',
            'sortOrder': 'asc'
        }
        
        try:
            response = requests.get(base_url, params=params, timeout=10)
            if response.status_code == 200:
                return response.json()
            else:
                print(f"❌ API request failed: {response.status_code}")
                return None
        except Exception as e:
            print(f"❌ Error in API request: {e}")
            return None
    
    def find_exact_match(foods, query_lower):
        """Find exact match by comparing full ingredient name with full food description (case-insensitive)"""
        for food in foods:
            description_lower = food['description'].lower()
            # Check if the query exactly matches the description or if query is a whole word in description
            # Use word boundaries to ensure exact matching
            import re
            
            # Create pattern for exact word matching
            pattern = r'\b' + re.escape(query_lower) + r'\b'
            
            # Check if query is the entire description OR appears as complete word(s)
            if (query_lower == description_lower or 
                re.search(pattern, description_lower)):
                return food
        return None
    
    def extract_nutrition_info(food, search_method):
        """Extract nutrition information from food item"""
        nutrients = food.get('foodNutrients', [])
        
        result = {
            'ingredient_name': query,
            'found_description': food['description'],
            'search_method': search_method,
            'energy_kcal': None,
            'carbohydrate_g': None,
            'protein_g': None,
            'fat_g': None,
            'micronutrients': [],
            'status': 'success'
        }
        
        # Energy (Atwater General Factors)
        energy = next((item for item in nutrients if item['nutrientName'] == 'Energy (Atwater Specific Factors)'), None)
        if energy:
            result['energy_kcal'] = energy['value']
        
        # Carbohydrates
        carbohydrate = next((item for item in nutrients if item['nutrientName'] == 'Carbohydrate, by difference'), None)
        if carbohydrate:
            result['carbohydrate_g'] = carbohydrate['value']
        
        # Fat
        fat = next((item for item in nutrients if item['nutrientName'] == 'Total lipid (fat)'), None)
        if fat:
            result['fat_g'] = fat['value']
        
        # Protein
        protein = next((item for item in nutrients if item['nutrientName'] == 'Protein'), None)
        if protein:
            result['protein_g'] = protein['value']
        
        # Exclude certain nutrients from micronutrients
        exclude_nutrients = [
            "Energy", "Water", "Energy (Atwater General Factors)", "Energy (Atwater Specific Factors)",
            "Nitrogen", "Protein", "Total lipid (fat)", "Ash", "Carbohydrates",
            "Carbohydrate, by difference", "Total dietary fiber (AOAC 2011.25)",
            "High Molecular Weight Dietary Fiber (HMWDF)", "Low Molecular Weight Dietary Fiber (LMWDF)",
            "Sugars, Total", "Total Sugars", "Sucrose", "Glucose", "Fructose", "Lactose", "Maltose"
        ]
        
        # Get micronutrients (vitamins and minerals) - top 3 by value
        filtered_micronutrients = [
            item for item in nutrients 
            if item['nutrientName'] not in exclude_nutrients and item['value'] > 0
        ]
        
        # Sort by value in descending order and take top 3
        sorted_micronutrients = sorted(filtered_micronutrients, key=lambda x: x['value'], reverse=True)
        top_3_micronutrients = sorted_micronutrients[:3]
        
        # Extract only the nutrient names
        micronutrients = [item['nutrientName'] for item in top_3_micronutrients]
        result['micronutrients'] = micronutrients
        
        return result

    # Make the API call with prioritized search strategy
    try:
        query_lower = query.lower()
        
        # Step 1: Search Foundation for exact match
        print(f"🔍 Step 1: Searching Foundation for exact match: '{query}'")
        data = search_with_params("Foundation")
        if data and 'foods' in data and data['foods']:
            exact_match = find_exact_match(data['foods'], query_lower)
            if exact_match:
                print(f"✅ Found exact match in Foundation: {exact_match['description']}")
                return extract_nutrition_info(exact_match, "Foundation_exact")
        
        # Step 2: Search Foundation and take first result
        print(f"🔍 Step 2: Taking first Foundation result: '{query}'")
        if data and 'foods' in data and data['foods']:
            first_result = data['foods'][0]
            print(f"📄 Using first Foundation result: {first_result['description']}")
            return extract_nutrition_info(first_result, "Foundation_first")
        
        # Step 3: Search Survey (FNDDS) for exact match
        print(f"🔍 Step 3: Searching Survey (FNDDS) for exact match: '{query}'")
        data = search_with_params("Survey (FNDDS)")
        if data and 'foods' in data and data['foods']:
            exact_match = find_exact_match(data['foods'], query_lower)
            if exact_match:
                print(f"✅ Found exact match in Survey: {exact_match['description']}")
                return extract_nutrition_info(exact_match, "Survey_exact")
            
            # Step 4: Take first Survey result as fallback
            print(f"🔍 Step 4: Taking first Survey (FNDDS) result: '{query}'")
            first_result = data['foods'][0]
            print(f"📄 Using first Survey result: {first_result['description']}")
            return extract_nutrition_info(first_result, "Survey_first")
        
        # If no results found at all
        print(f"❌ No nutrition data found for '{query}'")
        return {
            'ingredient_name': query,
            'found_description': None,
            'search_method': 'not_found',
            'energy_kcal': None,
            'carbohydrate_g': None,
            'protein_g': None,
            'fat_g': None,
            'micronutrients': [],
            'status': 'failed',
            'error': 'No results found in any database'
        }
        
    except Exception as e:
        print(f"❌ Error processing '{query}': {e}")
        return {
            'ingredient_name': query,
            'found_description': None,
            'search_method': 'error',
            'energy_kcal': None,
            'carbohydrate_g': None,
            'protein_g': None,
            'fat_g': None,
            'micronutrients': [],
            'status': 'failed',
            'error': str(e)
        }



In [None]:
import time
from tqdm import tqdm

def process_all_ingredients(unique_ingredients_df, api_key, delay=0.1):
    """
    Process all unique ingredients to get their nutrition data.
    
    Args:
        unique_ingredients_df: DataFrame with 'ingredient_name' column
        api_key: USDA API key
        delay: Delay between API calls in seconds (to respect rate limits)
    
    Returns:
        tuple: (nutrition_df, failed_ingredients_list)
    """
    nutrition_results = []
    failed_ingredients = []
    
    print(f"Processing {len(unique_ingredients_df)} unique ingredients...")
    
    # Process each ingredient
    for idx, row in tqdm(unique_ingredients_df.iterrows(), total=len(unique_ingredients_df)):
        ingredient_name = row['ingredient_name']
        # frequency = row['frequency']
        
        try:
            # Get nutrition data
            nutrition_data = get_nutrition_data(ingredient_name, api_key)
            # nutrition_data['frequency'] = frequency
            
            if nutrition_data['status'] == 'success':
                nutrition_results.append(nutrition_data)
                print(f"✅ {ingredient_name}: Found - {nutrition_data['found_description']}")
            else:
                failed_ingredients.append({
                    'ingredient_name': ingredient_name,
                    # 'frequency': frequency,
                    'reason': nutrition_data['status']
                })
                print(f"❌ {ingredient_name}: {nutrition_data['status']}")
            
            # Add delay to respect API rate limits
            time.sleep(delay)
            
        except Exception as e:
            failed_ingredients.append({
                'ingredient_name': ingredient_name,
                # 'frequency': frequency,
                'reason': f'exception: {str(e)}'
            })
            print(f"❌ {ingredient_name}: Exception - {str(e)}")
    
    # Create DataFrames
    nutrition_df = pd.DataFrame(nutrition_results)
    failed_df = pd.DataFrame(failed_ingredients)
    
    return nutrition_df, failed_df

In [None]:
# Process all unique ingredients (excluding water)
print("Starting nutrition data collection for all unique ingredients...")
print(f"Total ingredients to process: {len(unique_ingredients_with_counts)}")

# Exclude 'water' from processing (since water has no significant nutrition and causes matching issues)
ingredients_to_exclude = ['water', 'plain water', 'boiling water', 'cold water', 'warm water']
filtered_ingredients = unique_ingredients_with_counts[
    ~unique_ingredients_with_counts['ingredient_name'].str.lower().isin([x.lower() for x in ingredients_to_exclude])
].copy()


In [None]:
# Create 6 separate DataFrames for each batch of 150 ingredients
import math

def create_batch_dataframes(unique_ingredients_list, batch_size=150):
    """
    Create 6 separate DataFrames, each containing a batch of ingredients.
    Each DataFrame will be processed separately and can store its own nutrition data.
    
    Args:
        unique_ingredients_list: List of unique ingredients
        batch_size: Number of ingredients per batch (default: 150)
    
    Returns:
        Dictionary of DataFrames (batch_1 through batch_6)
    """
    # Calculate total number of batches needed
    total_batches = math.ceil(len(unique_ingredients_list) / batch_size)
    max_batches = 6  # Limit to 6 batches as requested
    
    print(f"Total ingredients: {len(unique_ingredients_list)}")
    print(f"Batch size: {batch_size}")
    print(f"Total batches needed: {total_batches}")
    print(f"Creating {min(total_batches, max_batches)} DataFrames")
    print("=" * 50)
    
    batch_dataframes = {}
    
    for batch_num in range(1, min(total_batches, max_batches) + 1):
        # Calculate start and end indices for this batch
        start_idx = (batch_num - 1) * batch_size
        end_idx = min(start_idx + batch_size, len(unique_ingredients_list))
        
        # Get ingredients for this batch
        batch_ingredients = unique_ingredients_list[start_idx:end_idx]
        
        # Create DataFrame for this batch with empty nutrition columns
        batch_df = pd.DataFrame({
            'ingredient': batch_ingredients,
            'description': [None] * len(batch_ingredients),
            'energy_kcal_per_100g': [None] * len(batch_ingredients),
            'carbs_g_per_100g': [None] * len(batch_ingredients),
            'protein_g_per_100g': [None] * len(batch_ingredients),
            'fat_g_per_100g': [None] * len(batch_ingredients),
            'top_micronutrients': [None] * len(batch_ingredients),
            'search_method': [None] * len(batch_ingredients),
            'batch_number': [batch_num] * len(batch_ingredients)
        })
        
        # Store in dictionary
        batch_dataframes[f'batch_{batch_num}'] = batch_df
        
        print(f"Batch {batch_num}: {len(batch_ingredients)} ingredients (indices {start_idx}-{end_idx-1})")
        print(f"  Sample ingredients: {batch_ingredients[:3]}...")
        print()
    
    return batch_dataframes



In [None]:
# Create the unique ingredients list from filtered_ingredients DataFrame
unique_ingredients_list = filtered_ingredients['ingredient_name'].tolist()
print(f"Extracted {len(unique_ingredients_list)} unique ingredients from filtered_ingredients DataFrame")

# Create the batch DataFrames
print("Creating 6 separate DataFrames for batch processing...")
batch_dfs = create_batch_dataframes(unique_ingredients_list, batch_size=150)

# Display information about each batch
print("\n📊 BATCH DATAFRAMES CREATED:")
print("=" * 60)
for batch_name, df in batch_dfs.items():
    print(f"{batch_name.upper()}:")
    print(f"  - Shape: {df.shape}")
    print(f"  - Ingredient range: {df['ingredient'].iloc[0]} ... {df['ingredient'].iloc[-1]}")
    print(f"  - Batch number: {df['batch_number'].iloc[0]}")
    print()

In [None]:
# Create 6 separate DataFrames for each batch of 150 ingredients
import math

def create_batch_dataframes(unique_ingredients_list, batch_size=150):
    """
    Create 6 separate DataFrames, each containing a batch of ingredients.
    Each DataFrame will be processed separately and can store its own nutrition data.
    
    Args:
        unique_ingredients_list: List of unique ingredients
        batch_size: Number of ingredients per batch (default: 150)
    
    Returns:
        Dictionary of DataFrames (batch_1 through batch_6)
    """
    # Calculate total number of batches needed
    total_batches = math.ceil(len(unique_ingredients_list) / batch_size)
    max_batches = 6  # Limit to 6 batches as requested
    
    print(f"Total ingredients: {len(unique_ingredients_list)}")
    print(f"Batch size: {batch_size}")
    print(f"Total batches needed: {total_batches}")
    print(f"Creating {min(total_batches, max_batches)} DataFrames")
    print("=" * 50)
    
    batch_dataframes = {}
    
    for batch_num in range(1, min(total_batches, max_batches) + 1):
        # Calculate start and end indices for this batch
        start_idx = (batch_num - 1) * batch_size
        end_idx = min(start_idx + batch_size, len(unique_ingredients_list))
        
        # Get ingredients for this batch
        batch_ingredients = unique_ingredients_list[start_idx:end_idx]
        
        # Create DataFrame for this batch with empty nutrition columns
        batch_df = pd.DataFrame({
            'ingredient': batch_ingredients,
            'description': [None] * len(batch_ingredients),
            'energy_kcal_per_100g': [None] * len(batch_ingredients),
            'carbs_g_per_100g': [None] * len(batch_ingredients),
            'protein_g_per_100g': [None] * len(batch_ingredients),
            'fat_g_per_100g': [None] * len(batch_ingredients),
            'top_micronutrients': [None] * len(batch_ingredients),
            'search_method': [None] * len(batch_ingredients),
            'batch_number': [batch_num] * len(batch_ingredients)
        })
        
        # Store in dictionary
        batch_dataframes[f'batch_{batch_num}'] = batch_df
        
        print(f"Batch {batch_num}: {len(batch_ingredients)} ingredients (indices {start_idx}-{end_idx-1})")
        print(f"  Sample ingredients: {batch_ingredients[:3]}...")
        print()
    
    return batch_dataframes


In [None]:
# Function to process individual batch DataFrames
import datetime  # Add missing import

def process_batch_dataframe(batch_df, batch_name, api_key, save_results=True):
    """
    Process a single batch DataFrame by fetching nutrition data for all ingredients.
    
    Args:
        batch_df: DataFrame containing ingredients for this batch
        batch_name: Name of the batch (e.g., 'batch_1')
        api_key: USDA API key
        save_results: Whether to save results to Excel file
    
    Returns:
        Tuple of (processed_df, failed_ingredients_list)
    """
    print(f"\n🔄 PROCESSING {batch_name.upper()}")
    print("=" * 50)
    
    processed_df = batch_df.copy()
    failed_ingredients = []
    successful_count = 0
    
    total_ingredients = len(batch_df)
    
    for idx, row in batch_df.iterrows():
        ingredient = row['ingredient']
        
        # Progress indicator
        current_position = idx - batch_df.index[0] + 1
        print(f"Processing {current_position}/{total_ingredients}: {ingredient}")
        
        # Fetch nutrition data
        nutrition_data = get_nutrition_data(ingredient, api_key)
        
        if nutrition_data and nutrition_data['status'] == 'success':
            # Update the DataFrame with nutrition data
            processed_df.at[idx, 'description'] = nutrition_data.get('found_description', ingredient)
            processed_df.at[idx, 'energy_kcal_per_100g'] = nutrition_data.get('energy_kcal')
            processed_df.at[idx, 'energy_kcal_per_100g'] = nutrition_data.get('energy_kcal')
            processed_df.at[idx, 'carbs_g_per_100g'] = nutrition_data.get('carbohydrate_g')
            processed_df.at[idx, 'protein_g_per_100g'] = nutrition_data.get('protein_g')
            processed_df.at[idx, 'fat_g_per_100g'] = nutrition_data.get('fat_g')
            processed_df.at[idx, 'top_micronutrients'] = ', '.join(nutrition_data.get('micronutrients', []))
            processed_df.at[idx, 'search_method'] = nutrition_data.get('search_method')
            
            successful_count += 1
            print(f"  ✅ Success - Method: {nutrition_data.get('search_method')}")
        else:
            failed_ingredients.append(ingredient)
            print(f"  ❌ Failed")
        
        # Small delay to avoid overwhelming the API
        time.sleep(0.1)
    
    # Summary
    print(f"\n📊 {batch_name.upper()} SUMMARY:")
    print(f"  - Total ingredients: {total_ingredients}")
    print(f"  - Successful: {successful_count}")
    print(f"  - Failed: {len(failed_ingredients)}")
    print(f"  - Success rate: {successful_count/total_ingredients*100:.1f}%")
    
    # if save_results:
    #     # Save processed DataFrame
    #     results_filename = f"nutrition_results_{batch_name}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx"
    #     processed_df.to_excel(results_filename, index=False)
    #     print(f"  💾 Results saved to: {results_filename}")
        
    #     # Save failed ingredients
    #     if failed_ingredients:
    #         failed_filename = f"failed_ingredients_{batch_name}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx"
    #         failed_df = pd.DataFrame({'failed_ingredient': failed_ingredients, 'batch': batch_name})
    #         failed_df.to_excel(failed_filename, index=False)
    #         print(f"  💾 Failed ingredients saved to: {failed_filename}")
    
    return processed_df, failed_ingredients

In [None]:
# Function to process all batches or specific batches
def process_selected_batches(batch_dfs, api_key, batch_numbers=None, save_results=True):

    processed_results = {}
    all_failed = {}
    
    # Determine which batches to process
    if batch_numbers is None:
        batches_to_process = list(batch_dfs.keys())
    else:
        batches_to_process = [f'batch_{num}' for num in batch_numbers if f'batch_{num}' in batch_dfs]
    
    print(f"🚀 STARTING BATCH PROCESSING")
    print(f"Batches to process: {batches_to_process}")
    print("=" * 60)
    
    for batch_name in batches_to_process:
        if batch_name in batch_dfs:
            try:
                processed_df, failed_list = process_batch_dataframe(
                    batch_dfs[batch_name], 
                    batch_name, 
                    api_key, 
                    save_results
                )
                processed_results[batch_name] = processed_df
                all_failed[batch_name] = failed_list
                
            except Exception as e:
                print(f"❌ Error processing {batch_name}: {str(e)}")
                all_failed[batch_name] = list(batch_dfs[batch_name]['ingredient'])
        
        print("\n" + "="*60)
    
    # Overall summary
    total_ingredients = sum(len(df) for df in batch_dfs.values() if any(batch in batch_dfs for batch in batches_to_process))
    total_successful = sum(len(df[df['search_method'].notna()]) for df in processed_results.values())
    total_failed = sum(len(failed_list) for failed_list in all_failed.values())
    
    print(f"\n🎯 OVERALL PROCESSING SUMMARY:")
    print(f"  - Total ingredients processed: {total_successful + total_failed}")
    print(f"  - Successful: {total_successful}")
    print(f"  - Failed: {total_failed}")
    print(f"  - Overall success rate: {total_successful/(total_successful + total_failed)*100:.1f}%")
    
    return processed_results, all_failed

##### Batch Processing

In [None]:
# 📊 BATCH 1 PROCESSING (Ingredients 1-150)
print("🔄 PROCESSING BATCH 1")
print("=" * 50)
print(f"Batch 1 contains {len(batch_dfs['batch_1'])} ingredients")
print(f"Sample ingredients: {list(batch_dfs['batch_1']['ingredient'].head())}")
print()

# Process Batch 1
batch_1_results, batch_1_failed = process_batch_dataframe(
    batch_dfs['batch_1'], 
    'batch_1', 
    api_key, 
    save_results=True
)

print(f"\n✅ BATCH 1 COMPLETED!")


In [None]:
# 🔍 CHECK BATCH 1 RESULTS
print("📋 BATCH 1 DETAILED RESULTS")
print("=" * 50)

if 'batch_1_results' in locals():
    print(f"✅ Successfully processed: {len(batch_1_results[batch_1_results['search_method'].notna()])} ingredients")
    print(f"❌ Failed: {len(batch_1_failed)} ingredients")
    print(f"📊 Success rate: {len(batch_1_results[batch_1_results['search_method'].notna()]) / len(batch_1_results) * 100:.1f}%")
    
    # Show full results DataFrame
    print(f"\n📊 FULL BATCH 1 RESULTS:")
    display(batch_1_results)
    
    # Show failed ingredients if any
    if batch_1_failed:
        print(f"\n❌ FAILED INGREDIENTS IN BATCH 1:")
        for ingredient in batch_1_failed:
            print(f"  - {ingredient}")
else:
    print("⚠️ Batch 1 not processed yet. Run the previous cell first.")

In [None]:
# 📊 BATCH 2 PROCESSING (Ingredients 151-300)
print("🔄 PROCESSING BATCH 2")
print("=" * 50)
print(f"Batch 2 contains {len(batch_dfs['batch_2'])} ingredients")
print(f"Sample ingredients: {list(batch_dfs['batch_2']['ingredient'].head())}")
print()

# Process Batch 2
batch_2_results, batch_2_failed = process_batch_dataframe(
    batch_dfs['batch_2'], 
    'batch_2', 
    api_key, 
    save_results=True
)

print(f"\n✅ BATCH 2 COMPLETED!")

In [None]:
# 🔍 CHECK BATCH 2 RESULTS
print("📋 BATCH 2 DETAILED RESULTS")
print("=" * 50)

if 'batch_2_results' in locals():
    print(f"✅ Successfully processed: {len(batch_2_results[batch_2_results['search_method'].notna()])} ingredients")
    print(f"❌ Failed: {len(batch_2_failed)} ingredients")
    print(f"📊 Success rate: {len(batch_2_results[batch_2_results['search_method'].notna()]) / len(batch_2_results) * 100:.1f}%")
    
    # Show full results DataFrame
    print(f"\n📊 FULL BATCH 2 RESULTS:")
    display(batch_2_results)
    
    # Show failed ingredients if any
    if batch_2_failed:
        print(f"\n❌ FAILED INGREDIENTS IN BATCH 2:")
        for ingredient in batch_2_failed:
            print(f"  - {ingredient}")
else:
    print("⚠️ Batch 2 not processed yet. Run the previous cell first.")

In [None]:
# 📊 BATCH 3 PROCESSING (Ingredients 301-450)
print("🔄 PROCESSING BATCH 3")
print("=" * 50)
print(f"Batch 3 contains {len(batch_dfs['batch_3'])} ingredients")
print(f"Sample ingredients: {list(batch_dfs['batch_3']['ingredient'].head())}")
print()

# Process Batch 3
batch_3_results, batch_3_failed = process_batch_dataframe(
    batch_dfs['batch_3'], 
    'batch_3', 
    api_key, 
    save_results=True
)

print(f"\n✅ BATCH 3 COMPLETED!")

In [None]:
# 🔍 CHECK BATCH 3 RESULTS
print("📋 BATCH 3 DETAILED RESULTS")
print("=" * 50)

if 'batch_3_results' in locals():
    print(f"✅ Successfully processed: {len(batch_3_results[batch_3_results['search_method'].notna()])} ingredients")
    print(f"❌ Failed: {len(batch_3_failed)} ingredients")
    print(f"📊 Success rate: {len(batch_3_results[batch_3_results['search_method'].notna()]) / len(batch_3_results) * 100:.1f}%")
    
    # Show full results DataFrame
    print(f"\n📊 FULL BATCH 3 RESULTS:")
    display(batch_3_results)
    
    # Show failed ingredients if any
    if batch_3_failed:
        print(f"\n❌ FAILED INGREDIENTS IN BATCH 3:")
        for ingredient in batch_3_failed:
            print(f"  - {ingredient}")
    
    print(f"\n✅ Ready to proceed to Batch 4? Run the next cell!")
else:
    print("⚠️ Batch 3 not processed yet. Run the previous cell first.")

In [None]:
# 📊 BATCH 4 PROCESSING (Ingredients 451-600)
print("🔄 PROCESSING BATCH 4")
print("=" * 50)
print(f"Batch 4 contains {len(batch_dfs['batch_4'])} ingredients")
print(f"Sample ingredients: {list(batch_dfs['batch_4']['ingredient'].head())}")
print()

# Process Batch 4
batch_4_results, batch_4_failed = process_batch_dataframe(
    batch_dfs['batch_4'], 
    'batch_4', 
    api_key, 
    save_results=True
)

print(f"\n✅ BATCH 4 COMPLETED!")


In [None]:
# 🔍 CHECK BATCH 4 RESULTS
print("📋 BATCH 4 DETAILED RESULTS")
print("=" * 50)

if 'batch_4_results' in locals():
    print(f"✅ Successfully processed: {len(batch_4_results[batch_4_results['search_method'].notna()])} ingredients")
    print(f"❌ Failed: {len(batch_4_failed)} ingredients")
    print(f"📊 Success rate: {len(batch_4_results[batch_4_results['search_method'].notna()]) / len(batch_4_results) * 100:.1f}%")
    
    # Show full results DataFrame
    print(f"\n📊 FULL BATCH 4 RESULTS:")
    display(batch_4_results)
    
    # Show failed ingredients if any
    if batch_4_failed:
        print(f"\n❌ FAILED INGREDIENTS IN BATCH 4:")
        for ingredient in batch_4_failed:
            print(f"  - {ingredient}")
else:
    print("⚠️ Batch 4 not processed yet. Run the previous cell first.")

In [None]:
# 📊 BATCH 5 PROCESSING (Ingredients 601-750)
print("🔄 PROCESSING BATCH 5")
print("=" * 50)
print(f"Batch 5 contains {len(batch_dfs['batch_5'])} ingredients")
print(f"Sample ingredients: {list(batch_dfs['batch_5']['ingredient'].head())}")
print()

# Process Batch 5
batch_5_results, batch_5_failed = process_batch_dataframe(
    batch_dfs['batch_5'], 
    'batch_5', 
    api_key, 
    save_results=True
)

print(f"\n✅ BATCH 5 COMPLETED!")

In [None]:
# 📊 BATCH 6 PROCESSING (Ingredients 751+) - FINAL BATCH
print("🔄 PROCESSING BATCH 6 (FINAL)")
print("=" * 50)
print(f"Batch 6 contains {len(batch_dfs['batch_6'])} ingredients")
print(f"Sample ingredients: {list(batch_dfs['batch_6']['ingredient'].head())}")
print()

# Process Batch 6
batch_6_results, batch_6_failed = process_batch_dataframe(
    batch_dfs['batch_6'], 
    'batch_6', 
    api_key, 
    save_results=True
)

print(f"\n✅ BATCH 6 COMPLETED!")


In [None]:
print(f"✅ Successfully processed: {len(batch_6_results[batch_6_results['search_method'].notna()])} ingredients")
print(f"❌ Failed: {len(batch_5_failed)} ingredients")
print(f"📊 Success rate: {len(batch_6_results[batch_6_results['search_method'].notna()]) / len(batch_6_results) * 100:.1f}%")

# Show full results DataFrame
print(f"\n📊 FULL BATCH 5 RESULTS:")
display(batch_6_results)

# Show failed ingredients if any
if batch_6_failed:
    print(f"\n❌ FAILED INGREDIENTS IN BATCH 5:")
    for ingredient in batch_6_failed:
        print(f"  - {ingredient}")


In [None]:
def combine_batch_results_to_single_file(output_filename=None):
    """
    Combine all individually processed batch results into a single Excel file 
    with 2 sheets: Successful and Failed results, each with batch tracking.
    
    This function looks for variables in the current namespace following the pattern:
    - batch_X_results (DataFrame with successful results)
    - batch_X_failed (List with failed ingredients)
    
    Parameters:
    - output_filename: Output filename (if None, auto-generated with timestamp)
    
    Returns:
    - combined_successful_df: DataFrame with all successful results
    - combined_failed_df: DataFrame with all failed results
    - summary_stats: Dictionary with combination statistics
    """
    import pandas as pd
    from datetime import datetime
    import re
    
    print("🔍 Searching for existing batch results...")
    
    # Get all variables from the current namespace
    current_vars = globals()
    
    # Find all batch result variables
    successful_batches = {}
    failed_batches = {}
    
    # Pattern to match batch variables
    for var_name in current_vars:
        # Match successful batch results (e.g., batch_1_results, batch_2_results)
        if re.match(r'batch_\d+_results$', var_name):
            batch_num = re.search(r'batch_(\d+)_results', var_name).group(1)
            batch_name = f"batch_{batch_num}"
            if isinstance(current_vars[var_name], pd.DataFrame):
                successful_batches[batch_name] = current_vars[var_name]
                print(f"   ✅ Found {var_name}: {len(current_vars[var_name])} successful results")
        
        # Match failed batch results (e.g., batch_1_failed, batch_2_failed)
        elif re.match(r'batch_\d+_failed$', var_name):
            batch_num = re.search(r'batch_(\d+)_failed', var_name).group(1)
            batch_name = f"batch_{batch_num}"
            if isinstance(current_vars[var_name], list):
                failed_batches[batch_name] = current_vars[var_name]
                print(f"   ❌ Found {var_name}: {len(current_vars[var_name])} failed ingredients")
    
    if not successful_batches and not failed_batches:
        print("⚠️  No batch results found! Make sure you have processed batches first.")
        print("   Expected variables: batch_1_results, batch_1_failed, batch_2_results, batch_2_failed, etc.")
        return None, None, None
    
    print(f"\n📊 Found {len(successful_batches)} successful batch result sets")
    print(f"📊 Found {len(failed_batches)} failed batch result sets")
    
    # Combine all successful results
    all_successful_dfs = []
    total_successful = 0
    
    for batch_name, df in successful_batches.items():
        if not df.empty:
            # Add batch column
            df_copy = df.copy()
            df_copy['batch'] = batch_name
            all_successful_dfs.append(df_copy)
            total_successful += len(df_copy)
            print(f"   ✅ {batch_name}: {len(df_copy)} successful results")
    
    # Combine all failed results
    all_failed_data = []
    total_failed = 0
    
    for batch_name, failed_list in failed_batches.items():
        if failed_list:
            for ingredient in failed_list:
                all_failed_data.append({
                    'ingredient': ingredient,
                    'batch': batch_name,
                    'reason': 'No nutrition data found',
                    'processed_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                })
            total_failed += len(failed_list)
            print(f"   ❌ {batch_name}: {len(failed_list)} failed ingredients")
    
    # Create combined DataFrames
    if all_successful_dfs:
        combined_successful_df = pd.concat(all_successful_dfs, ignore_index=True)
        # Reorder columns to put batch column first after ingredient
        cols = list(combined_successful_df.columns)
        if 'batch' in cols and 'ingredient' in cols:
            # Move batch column to second position (after ingredient)
            cols.remove('batch')
            ingredient_idx = cols.index('ingredient')
            cols.insert(ingredient_idx + 1, 'batch')
            combined_successful_df = combined_successful_df[cols]
    else:
        combined_successful_df = pd.DataFrame()
    
    if all_failed_data:
        combined_failed_df = pd.DataFrame(all_failed_data)
    else:
        combined_failed_df = pd.DataFrame()
    
    # Generate filename if not provided
    if output_filename is None:
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        output_filename = f'combined_batch_results_{timestamp}.xlsx'
    
    # Save to Excel with multiple sheets
    print(f"\n💾 Saving combined results to: {output_filename}")
    
    with pd.ExcelWriter(output_filename, engine='openpyxl') as writer:
        # Save successful results
        if not combined_successful_df.empty:
            combined_successful_df.to_excel(writer, sheet_name='Successful', index=False)
            print(f"   ✅ Successful sheet: {len(combined_successful_df)} ingredients saved")
        else:
            # Create empty sheet with headers
            pd.DataFrame(columns=['ingredient', 'batch', 'energy_kcal', 'carbs_g', 'protein_g', 'fat_g', 
                                'micronutrients', 'search_method', 'found_description']).to_excel(
                writer, sheet_name='Successful', index=False)
            print("   ⚠️  No successful results to save")
        
        # Save failed results
        if not combined_failed_df.empty:
            combined_failed_df.to_excel(writer, sheet_name='Failed', index=False)
            print(f"   ❌ Failed sheet: {len(combined_failed_df)} ingredients saved")
        else:
            # Create empty sheet with headers
            pd.DataFrame(columns=['ingredient', 'batch', 'reason', 'processed_at']).to_excel(
                writer, sheet_name='Failed', index=False)
            print("   ✅ No failed results (perfect success!)")
        
        # Create summary sheet
        summary_data = []
        batch_stats = {}
        
        # Get stats for each batch
        all_batch_names = set(list(successful_batches.keys()) + list(failed_batches.keys()))
        
        for batch_name in sorted(all_batch_names):
            successful_count = len(successful_batches.get(batch_name, pd.DataFrame()))
            failed_count = len(failed_batches.get(batch_name, []))
            total_count = successful_count + failed_count
            success_rate = (successful_count / total_count * 100) if total_count > 0 else 0
            
            summary_data.append({
                'Batch': batch_name,
                'Total_Ingredients': total_count,
                'Successful': successful_count,
                'Failed': failed_count,
                'Success_Rate_%': round(success_rate, 2)
            })
            
            batch_stats[batch_name] = {
                'total': total_count,
                'successful': successful_count,
                'failed': failed_count,
                'success_rate': success_rate
            }
        
        # Add overall summary
        overall_total = total_successful + total_failed
        if overall_total > 0:
            summary_data.append({
                'Batch': 'OVERALL_TOTAL',
                'Total_Ingredients': overall_total,
                'Successful': total_successful,
                'Failed': total_failed,
                'Success_Rate_%': round((total_successful / overall_total) * 100, 2)
            })
        
        summary_df = pd.DataFrame(summary_data)
        summary_df.to_excel(writer, sheet_name='Summary', index=False)
        print(f"   📊 Summary sheet: Batch statistics saved")
    
    # Print final summary
    print(f"\n🎉 COMBINATION COMPLETED!")
    print(f"📊 Final Combined Results:")
    print(f"   • Total ingredients: {total_successful + total_failed}")
    print(f"   • Successful: {total_successful} ({(total_successful/(total_successful + total_failed))*100:.1f}%)")
    print(f"   • Failed: {total_failed} ({(total_failed/(total_successful + total_failed))*100:.1f}%)")
    print(f"   • Batches combined: {len(all_batch_names)}")
    print(f"💾 Results saved to: {output_filename}")
    
    return combined_successful_df, combined_failed_df, {
        'total_ingredients': total_successful + total_failed,
        'successful': total_successful,
        'failed': total_failed,
        'success_rate': (total_successful / (total_successful + total_failed)) * 100 if (total_successful + total_failed) > 0 else 0,
        'batches_combined': len(all_batch_names),
        'batch_statistics': batch_stats,
        'output_filename': output_filename
    }

In [None]:
combined_successful_df, combined_failed_df, summary_stats = combine_batch_results_to_single_file(
    # output_filename=None  # Auto-generate filename with timestamp
    output_filename='my_combined_results.xlsx'  # Or specify custom filename
)

if combined_successful_df is not None:
    # Display final summary
    print(f"\n📊 FINAL COMBINATION SUMMARY:")
    print(f"✅ Total successful: {len(combined_successful_df)}")
    print(f"❌ Total failed: {len(combined_failed_df)}")
    print(f"📈 Overall success rate: {summary_stats['success_rate']:.1f}%")
    print(f"🔢 Batches combined: {summary_stats['batches_combined']}")
    print(f"💾 Results saved to: {summary_stats['output_filename']}")
    
    # Show batch breakdown
    print(f"\n📋 BATCH BREAKDOWN:")
    for batch_name, stats in summary_stats['batch_statistics'].items():
        print(f"   {batch_name}: {stats['successful']}/{stats['total']} successful ({stats['success_rate']:.1f}%)")
    
    # Display sample successful results
    if not combined_successful_df.empty:
        print(f"\n📄 SAMPLE SUCCESSFUL RESULTS (first 5):")
        display(combined_successful_df.head())
    
    # Display sample failed results  
    if not combined_failed_df.empty:
        print(f"\n❌ SAMPLE FAILED RESULTS (first 5):")
        display(combined_failed_df.head())
else:
    print("❌ No batch results found to combine. Please process some batches first!")

PS: Crosscheck the content file

##### Request to do Specific Ingredients

In [None]:
BASE_URL = 'https://api.nal.usda.gov/fdc/v1/food/'

def get_food_data(fdc_id):
    """Fetch food data from USDA FoodData Central using the given FDC ID."""
    url = f"{BASE_URL}{fdc_id}"
    params = {
        'api_key': api_key,
        'format': 'abridged',  # This ensures we get nutrient values
        'nutrients': 'all'     # Get all nutrients with values
    }
    
    response = requests.get(url, params=params)
    
    if response.status_code == 200:
        return response.json()
    else:
        return None

In [None]:
def extract_nutrition_info(food, search_method):
    """Extract nutrition information from food item."""
    nutrients = food.get('foodNutrients', [])
    
    result = {
        'ingredient_name': food.get('description', 'N/A'),
        'found_description': food.get('description', 'N/A'),
        'search_method': food.get('foodClass', search_method),  # Use foodClass as search_method
        'energy_kcal': None,
        'carbohydrate_g': None,
        'protein_g': None,
        'fat_g': None,
        'micronutrients': [],
        'status': 'success'
    }
    
    # Energy (Atwater General Factors)
    energy = next((item for item in nutrients if item['nutrient']['name'] == 'Energy (Atwater Specific Factors)'), None)
    if energy:
        result['energy_kcal'] = energy['amount']
    
    # Carbohydrates
    carbohydrate = next((item for item in nutrients if item['nutrient']['name'] == 'Carbohydrate, by difference'), None)
    if carbohydrate:
        result['carbohydrate_g'] = carbohydrate['amount']
    
    # Fat
    fat = next((item for item in nutrients if item['nutrient']['name'] == 'Total lipid (fat)'), None)
    if fat:
        result['fat_g'] = fat['amount']
    
    # Protein
    protein = next((item for item in nutrients if item['nutrient']['name'] == 'Protein'), None)
    if protein:
        result['protein_g'] = protein['amount']
    
    # Exclude certain nutrients from micronutrients
    exclude_nutrients = [
        "Energy", "Water", "Energy (Atwater General Factors)", "Energy (Atwater Specific Factors)",
        "Nitrogen", "Protein", "Total lipid (fat)", "Ash", "Carbohydrates",
        "Carbohydrate, by difference", "Total dietary fiber (AOAC 2011.25)",
        "High Molecular Weight Dietary Fiber (HMWDF)", "Low Molecular Weight Dietary Fiber (LMWDF)",
        "Sugars, Total", "Total Sugars", "Sucrose", "Glucose", "Fructose", "Lactose", "Maltose"
    ]
    
    # Get micronutrients (vitamins and minerals) - top 3 by amount
    filtered_micronutrients = [
        item for item in nutrients 
        if item['nutrient']['name'] not in exclude_nutrients and item['amount'] > 0
    ]
    
    # Sort by amount in descending order and take top 3
    sorted_micronutrients = sorted(filtered_micronutrients, key=lambda x: x['amount'], reverse=True)
    top_3_micronutrients = sorted_micronutrients[:3]
    
    # Extract only the nutrient names
    micronutrients = [item['nutrient']['name'] for item in top_3_micronutrients]
    result['micronutrients'] = micronutrients
    
    return result

In [None]:
def extract_nutrition_info_robust(food, search_method):
    """Extract nutrition information from food item with robust structure handling."""
    nutrients = food.get('foodNutrients', [])
    
    result = {
        'ingredient_name': food.get('description', 'N/A'),
        'found_description': food.get('description', 'N/A'),
        'search_method': food.get('foodClass', search_method),
        'energy_kcal': None,
        'carbohydrate_g': None,
        'protein_g': None,
        'fat_g': None,
        'micronutrients': [],
        'status': 'success'
    }
    
    def get_nutrient_value(nutrients, nutrient_name):
        """Helper function to extract nutrient value regardless of structure."""
        for item in nutrients:
            # Handle different possible structures
            name = None
            value = None
            
            # Try different ways to get nutrient name
            if isinstance(item, dict):
                if 'nutrient' in item and isinstance(item['nutrient'], dict):
                    name = item['nutrient'].get('name')
                elif 'nutrientName' in item:
                    name = item['nutrientName']
                elif 'name' in item:
                    name = item['name']
                
                # Try different ways to get value
                if 'amount' in item:
                    value = item['amount']
                elif 'value' in item:
                    value = item['value']
                elif 'quantity' in item:
                    value = item['quantity']
            
            # Check if this is the nutrient we're looking for
            if name and nutrient_name.lower() in name.lower():
                return value
        return None
    
    # Extract main nutrients
    result['energy_kcal'] = get_nutrient_value(nutrients, 'Energy (Atwater General Factors)')
    if result['energy_kcal'] is None:
        result['energy_kcal'] = get_nutrient_value(nutrients, 'Energy')
    
    result['carbohydrate_g'] = get_nutrient_value(nutrients, 'Carbohydrate, by difference')
    if result['carbohydrate_g'] is None:
        result['carbohydrate_g'] = get_nutrient_value(nutrients, 'Carbohydrate')
    
    result['fat_g'] = get_nutrient_value(nutrients, 'Total lipid (fat)')
    if result['fat_g'] is None:
        result['fat_g'] = get_nutrient_value(nutrients, 'Fat')
    
    result['protein_g'] = get_nutrient_value(nutrients, 'Protein')
    
    # Get micronutrients (excluding main macronutrients)
    exclude_nutrients = [
        "Energy", "Water", "Energy (Atwater General Factors)", "Energy (Atwater Specific Factors)",
        "Nitrogen", "Protein", "Total lipid (fat)", "Fat", "Ash", "Carbohydrates",
        "Carbohydrate, by difference", "Total dietary fiber", "Fiber",
        "Sugars", "Sugar", "Sucrose", "Glucose", "Fructose", "Lactose", "Maltose"
    ]
    
    micronutrients_with_values = []
    
    for item in nutrients:
        if isinstance(item, dict):
            name = None
            value = None
            
            # Get nutrient name
            if 'nutrient' in item and isinstance(item['nutrient'], dict):
                name = item['nutrient'].get('name')
            elif 'nutrientName' in item:
                name = item['nutrientName']
            elif 'name' in item:
                name = item['name']
            
            # Get value
            if 'amount' in item:
                value = item['amount']
            elif 'value' in item:
                value = item['value']
            elif 'quantity' in item:
                value = item['quantity']
            
            # Check if this is a micronutrient
            if name and value is not None and value > 0:
                is_excluded = any(excl.lower() in name.lower() for excl in exclude_nutrients)
                if not is_excluded:
                    micronutrients_with_values.append({'name': name, 'value': value})
    
    # Sort by value and take top 3
    sorted_micronutrients = sorted(micronutrients_with_values, key=lambda x: x['value'], reverse=True)
    top_3_micronutrients = sorted_micronutrients[:3]
    result['micronutrients'] = [item['name'] for item in top_3_micronutrients]
    
    return result

# Test the robust function
print("🧪 Testing robust extraction function:")
fdc_id = 2684441
food_data = get_food_data(fdc_id)
if food_data:
    nutrition_info = extract_nutrition_info_robust(food_data, 'direct_search')
    print("✅ Success! Nutrition info extracted:")
    for key, value in nutrition_info.items():
        print(f"  {key}: {value}")
else:
    print("❌ Failed to get food data")

In [None]:
# Let's examine several nutrients to understand the structure
food_data = get_food_data(2684441)
if food_data:
    nutrients = food_data.get('foodNutrients', [])
    print(f"🔍 DETAILED NUTRIENT INSPECTION (first 10 nutrients):")
    
    for i, nutrient in enumerate(nutrients[:10]):
        print(f"\n--- Nutrient {i+1} ---")
        print(f"Full structure: {nutrient}")
        
        if 'nutrient' in nutrient:
            print(f"Nutrient name: {nutrient['nutrient'].get('name')}")
        
        # Check for any field containing values
        value_fields = [key for key in nutrient.keys() if 'value' in key.lower() or 'amount' in key.lower() or 'quantity' in key.lower()]
        if value_fields:
            print(f"Value fields found: {value_fields}")
            for field in value_fields:
                print(f"  {field}: {nutrient[field]}")
        else:
            print("No value fields found")
            
    # Let's also check if there's a different way to get nutrient values
    print(f"\n🔍 Looking for nutrients with actual values...")
    nutrients_with_values = []
    for nutrient in nutrients:
        if any(key for key in nutrient.keys() if 'value' in key.lower() or 'amount' in key.lower()):
            nutrients_with_values.append(nutrient)
    
    print(f"Found {len(nutrients_with_values)} nutrients with value fields")
    if nutrients_with_values:
        print(f"Example nutrient with values: {nutrients_with_values[0]}")

In [None]:
def extract_nutrition_info_corrected(food, search_method):
    """Extract nutrition information from food item - CORRECTED VERSION."""
    nutrients = food.get('foodNutrients', [])
    
    result = {
        'ingredient_name': food.get('description', 'N/A'),
        'found_description': food.get('description', 'N/A'),
        'search_method': food.get('foodClass', search_method),
        'energy_kcal': None,
        'carbohydrate_g': None,
        'protein_g': None,
        'fat_g': None,
        'micronutrients': [],
        'status': 'success'
    }
    
    def get_nutrient_value(nutrient_item):
        """Get the value from a nutrient item, handling different field names."""
        # Try different possible field names for the value
        possible_fields = ['amount', 'value', 'quantity', 'val']
        for field in possible_fields:
            if field in nutrient_item:
                return nutrient_item[field]
        return None
    
    # 1st priority: Energy (Atwater Specific Factors)
    energy = next((item for item in nutrients if item['nutrient']['name'] == 'Energy (Atwater Specific Factors)'), None)
    if energy:
        result['energy_kcal'] = get_nutrient_value(energy)
        result['energy_source'] = 'Atwater Specific Factors'
        print(f"   ✅ Found Energy (Atwater Specific Factors): {result['energy_kcal']} kcal")
    else:
        # 2nd priority: Energy (Atwater General Factors)
        energy = next((item for item in nutrients if item['nutrient']['name'] == 'Energy (Atwater General Factors)'), None)
        if energy:
            result['energy_kcal'] = get_nutrient_value(energy)
            result['energy_source'] = 'Atwater General Factors'
            print(f"   ✅ Found Energy (Atwater General Factors): {result['energy_kcal']} kcal")
        else:
            # 3rd priority: Energy with unitName = "kcal" ⭐ YOUR REQUIREMENT
            energy = next((item for item in nutrients 
                          if item['nutrient']['name'] == 'Energy' and 
                          item['nutrient'].get('unitName') == 'kcal'), None)
            if energy:
                result['energy_kcal'] = get_nutrient_value(energy)
                result['energy_source'] = 'Energy (kcal unit)'
                print(f"   ✅ Found Energy with unitName='kcal': {result['energy_kcal']} kcal")
            else:
                # 4th priority: Any Energy entry as final fallback
                energy = next((item for item in nutrients if 'Energy' in item['nutrient']['name']), None)
                if energy:
                    result['energy_kcal'] = get_nutrient_value(energy)
                    unit = energy['nutrient'].get('unitName', 'unknown unit')
                    result['energy_source'] = f'Fallback Energy ({unit})'
                    print(f"   ⚠️ Found fallback Energy: {result['energy_kcal']} {unit}")
                else:
                    result['energy_source'] = 'Not found'
                    print(f"   ❌ No Energy data found")
    
    # Extract other macronutrients
    print(f"   🔍 Extracting other nutrients...")
    
    # Carbohydrates
    carbohydrate = next((item for item in nutrients if item['nutrient']['name'] == 'Carbohydrate, by difference'), None)
    if carbohydrate:
        result['carbohydrate_g'] = get_nutrient_value(carbohydrate)
    
    # Fat
    fat = next((item for item in nutrients if item['nutrient']['name'] == 'Total lipid (fat)'), None)
    if fat:
        result['fat_g'] = get_nutrient_value(fat)
    
    # Protein
    protein = next((item for item in nutrients if item['nutrient']['name'] == 'Protein'), None)
    if protein:
        result['protein_g'] = get_nutrient_value(protein)
    
    # Exclude certain nutrients from micronutrients
    exclude_nutrients = [
        "Energy", "Water", "Energy (Atwater General Factors)", "Energy (Atwater Specific Factors)",
        "Nitrogen", "Protein", "Total lipid (fat)", "Ash", "Carbohydrates",
        "Carbohydrate, by difference", "Total dietary fiber (AOAC 2011.25)",
        "High Molecular Weight Dietary Fiber (HMWDF)", "Low Molecular Weight Dietary Fiber (LMWDF)",
        "Sugars, Total", "Total Sugars", "Sucrose", "Glucose", "Fructose", "Lactose", "Maltose"
    ]
    
    # Get micronutrients (vitamins and minerals) - top 3 by value
    filtered_micronutrients = []
    for item in nutrients:
        if (item['nutrient']['name'] not in exclude_nutrients and 
            get_nutrient_value(item) is not None and 
            get_nutrient_value(item) > 0):
            filtered_micronutrients.append({
                'name': item['nutrient']['name'],
                'value': get_nutrient_value(item)
            })
    
    # Sort by value in descending order and take top 3
    sorted_micronutrients = sorted(filtered_micronutrients, key=lambda x: x['value'], reverse=True)
    top_3_micronutrients = sorted_micronutrients[:3]
    
    # Extract only the nutrient names
    micronutrients = [item['name'] for item in top_3_micronutrients]
    result['micronutrients'] = micronutrients
    
    return result

# Test the corrected function
print("🧪 Testing CORRECTED extraction function:")
fdc_id = 2684441
food_data = get_food_data(fdc_id)
if food_data:
    nutrition_info = extract_nutrition_info_corrected(food_data, 'direct_search')
    print("✅ Success! Nutrition info extracted:")
    for key, value in nutrition_info.items():
        print(f"  {key}: {value}")
else:
    print("❌ Failed to get food data")

In [None]:
def process_fdc_id_list_to_excel(name_id_list, output_filename=None, delay=0.5):
    """
    Process a list of [name, id] pairs, fetch nutrition data for each FDC ID, and save to Excel.
    
    Parameters:
    - name_id_list: List of [name, fdc_id] pairs, e.g., [["Apple", 123456], ["Banana", 789012]]
    - output_filename: Output Excel filename (if None, auto-generated with timestamp)
    - delay: Delay between API calls in seconds (default 0.5s)
    
    Returns:
    - results_df: DataFrame with all results
    - failed_list: List of failed items
    - summary_stats: Dictionary with processing statistics
    """
    import pandas as pd
    import time
    from datetime import datetime
    
    print(f"🚀 Processing {len(name_id_list)} FDC ID entries...")
    print("=" * 60)
    
    results = []
    failed_items = []
    
    start_time = time.time()
    
    for i, (name, fdc_id) in enumerate(name_id_list, 1):
        print(f"\n🔄 Processing {i}/{len(name_id_list)}: {name} (ID: {fdc_id})")
        
        try:
            # Convert fdc_id to int if it's a string
            if isinstance(fdc_id, str):
                fdc_id = int(fdc_id)
            
            # Get food data from API
            food_data = get_food_data(fdc_id)
            
            if food_data:
                # Extract nutrition info using the corrected function
                nutrition_info = extract_nutrition_info_corrected(food_data, 'FDC_ID_lookup')
                
                if nutrition_info and nutrition_info.get('status') == 'success':
                    # Create result entry
                    result_entry = {
                        'name': name,
                        'fdc_id': fdc_id,
                        'found_description': nutrition_info.get('found_description', ''),
                        'search_method': nutrition_info.get('search_method', ''),
                        'energy_kcal': nutrition_info.get('energy_kcal'),
                        'carbohydrate_g': nutrition_info.get('carbohydrate_g'),
                        'protein_g': nutrition_info.get('protein_g'),
                        'fat_g': nutrition_info.get('fat_g'),
                        'micronutrients': ', '.join(nutrition_info.get('micronutrients', [])),
                        'processed_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                        'status': 'success'
                    }
                    results.append(result_entry)
                    print(f"   ✅ Success: {nutrition_info.get('energy_kcal')} kcal, {len(nutrition_info.get('micronutrients', []))} micronutrients")
                else:
                    # Failed to extract nutrition
                    failed_entry = {
                        'name': name,
                        'fdc_id': fdc_id,
                        'reason': 'Failed to extract nutrition data',
                        'processed_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                    }
                    failed_items.append(failed_entry)
                    print(f"   ❌ Failed: Could not extract nutrition data")
            else:
                # Failed to get food data
                failed_entry = {
                    'name': name,
                    'fdc_id': fdc_id,
                    'reason': 'Failed to fetch food data from API',
                    'processed_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                }
                failed_items.append(failed_entry)
                print(f"   ❌ Failed: API call failed")
        
        except Exception as e:
            # Error occurred
            failed_entry = {
                'name': name,
                'fdc_id': fdc_id,
                'reason': f'Error: {str(e)}',
                'processed_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            }
            failed_items.append(failed_entry)
            print(f"   ❌ Error: {str(e)}")
        
        # Add delay between requests
        if i < len(name_id_list):  # Don't delay after the last item
            time.sleep(delay)
    
    # Create DataFrames
    results_df = pd.DataFrame(results) if results else pd.DataFrame()
    failed_df = pd.DataFrame(failed_items) if failed_items else pd.DataFrame()
    
    # Generate filename if not provided
    if output_filename is None:
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        output_filename = f'fdc_nutrition_data_{timestamp}.xlsx'
    
    # Save to Excel
    print(f"\n💾 Saving results to: {output_filename}")
    
    with pd.ExcelWriter(output_filename, engine='openpyxl') as writer:
        # Save successful results
        if not results_df.empty:
            results_df.to_excel(writer, sheet_name='Nutrition_Data', index=False)
            print(f"   ✅ Nutrition data saved: {len(results_df)} items")
        else:
            # Create empty sheet with headers
            empty_df = pd.DataFrame(columns=['name', 'fdc_id', 'found_description', 'search_method', 
                                           'energy_kcal', 'carbohydrate_g', 'protein_g', 'fat_g', 
                                           'micronutrients', 'processed_at', 'status'])
            empty_df.to_excel(writer, sheet_name='Nutrition_Data', index=False)
            print("   ⚠️  No successful results to save")
        
        # Save failed items
        if not failed_df.empty:
            failed_df.to_excel(writer, sheet_name='Failed_Items', index=False)
            print(f"   ❌ Failed items saved: {len(failed_df)} items")
        
        # Create summary sheet
        summary_data = [{
            'Total_Items': len(name_id_list),
            'Successful': len(results),
            'Failed': len(failed_items),
            'Success_Rate_%': (len(results) / len(name_id_list)) * 100 if name_id_list else 0,
            'Processing_Time_Seconds': round(time.time() - start_time, 1),
            'Processed_At': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        }]
        
        summary_df = pd.DataFrame(summary_data)
        summary_df.to_excel(writer, sheet_name='Summary', index=False)
        print(f"   📊 Summary saved")
    
    # Print final summary
    total_time = time.time() - start_time
    success_rate = (len(results) / len(name_id_list)) * 100 if name_id_list else 0
    
    print(f"\n🎉 PROCESSING COMPLETED!")
    print(f"📊 Final Results:")
    print(f"   • Total items processed: {len(name_id_list)}")
    print(f"   • Successful: {len(results)} ({success_rate:.1f}%)")
    print(f"   • Failed: {len(failed_items)} ({100-success_rate:.1f}%)")
    print(f"   • Total processing time: {total_time:.1f} seconds")
    print(f"   • Average time per item: {total_time/len(name_id_list):.2f} seconds")
    print(f"💾 Results saved to: {output_filename}")
    
    return results_df, failed_items, {
        'total_items': len(name_id_list),
        'successful': len(results),
        'failed': len(failed_items),
        'success_rate': success_rate,
        'processing_time': total_time,
        'output_filename': output_filename
    }

In [None]:
name_id_list = [
    ["salmon", 2684441],
    ["Whole Milk", 746782],
    ["White Bread", 339005],
    ["yoghurt", 2259793],
    ["baby oatmeal", 2708492],
    ["bok choy", 2685572],
    ["tomato", 2685581],
    ["clove", 171321],
    ["cloves", 171321],
    ["wholemeal bread", 335240],
    ["rice",2512381],
    ["basmati rice", 2708404],
    ["thyme", 173470], 
    ["coconut milk", 2705413],
    ["paprika", 171329],
    ["cod", 2684444],
    ["dark chocolat",170271],
    ["baking powder", 172805],
    ["formula", 2705518],
    ["parsley", 170416], 
    ["butter", 790508],
    ["apricot", 2710815],
    ["stock",2707132],
    ["coriander", 169997],
    ["chayote", 170402],
    ["chives", 169994],
    ["blueberry", 2346411],
    ["blueberries", 2346411],
    ["spring onion", 170005],
    ["shrimp", 2684443],
    ["catfish", 2684445],
    ["brown sugar", 2710260],
    ["berries", 2709272],
    ["turkey", 2514747],
    ["black pepper", 170931],
    ["raisins",2709212],
    ["egg noodles", 169731],
    ["lemon grass", 168573],
    ["papaya", 2709246],
    ["coconut water", 2707572],
    ["ginger", 169231],
    ["seaweed", 2709988],
    ["eyes fish", 168034],
    ["curry powder", 170924],
    ["cream cheese", 173418],
    ["feta cheese", 2259796],
    ["quinoa", 2708401],
    ["kiwi", 2710831],
    ["mango", 2710834],
    ["mangoes", 2710834],
    ["cassava", 169985], 
    
]
# Process the test list
results_df, failed_list, stats = process_fdc_id_list_to_excel(
    name_id_list=name_id_list,
    output_filename=None,  # Auto-generate filename
    delay=0.5  # 500ms delay between calls
)

In [None]:
# Display results
if not results_df.empty:
    print(f"\n📄 RESULTS PREVIEW:")
    display(results_df)
    
    print(f"\n📊 NUTRITION DATA SUMMARY:")
    numeric_cols = ['energy_kcal', 'carbohydrate_g', 'protein_g', 'fat_g']
    for col in numeric_cols:
        if col in results_df.columns:
            avg_val = results_df[col].mean()
            print(f"   • Average {col}: {avg_val:.2f}")

if failed_list:
    print(f"\n❌ FAILED ITEMS:")
    for item in failed_list:
        print(f"   • {item['name']} (ID: {item['fdc_id']}): {item['reason']}")

print(f"\n💾 Excel file saved: {stats['output_filename']}")
print(f"📈 Success rate: {stats['success_rate']:.1f}%")

In [None]:
df.columns.to_list()

#### Compute Nutrients Per Ingredient of The Recipe

In [1154]:
import pandas as pd

# Load the Excel file structure
excel_file = pd.ExcelFile('final_ingredient_nutrition.xlsx')

# Get list of available sheet names
sheet_names = excel_file.sheet_names

# Check if 'Succesful' sheet exists
if 'Successful' in sheet_names:
    combined_nutrition_data = pd.read_excel('testing/final_ingredient_nutritions.xlsx', sheet_name='Successful')
    print("Shape:", combined_nutrition_data.shape)
    print("✅ Loaded Successful sheet.")
else:
    print("❌ Sheet 'Succesful' not found.")
    print("📄 Available sheets:", sheet_names)
    combined_nutrition_data = None  # or raise an error / fallback action


Shape: (712, 11)
✅ Loaded Successful sheet.


In [1155]:
# Filter for complete data
# Check ingredient breakdown data
print(f"\n=== INGREDIENT BREAKDOWN DATA ===")
print(f"ingredient_breakdown_df shape: {ingredient_breakdown_df.shape}")
print(f"Complete data available: {complete_data_mask.sum()}/{len(complete_data_mask)}")

complete_ingredients = ingredient_breakdown_df[complete_data_mask].copy()
print(f"Working with {len(complete_ingredients)} complete ingredient records")

ingredient_breakdown_df[complete_data_mask].head()


=== INGREDIENT BREAKDOWN DATA ===
ingredient_breakdown_df shape: (2572, 8)
Complete data available: 1895/2572
Working with 1895 complete ingredient records


Unnamed: 0,recipe_id,recipe_name,quantity,measurement,ingredient_name,original_text,original_quantity,original_measurement
0,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,60.0,g,cassava,"60 g cassava, boiled and blended",60,g
1,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,20.0,g,fish meat (milkfish),"20 g fish meat (milkfish), finely chopped",20,g
2,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,10.0,g,chicken meat,10 g chicken meat,10,g
3,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,5.0,g,coconut oil,5 g coconut oil,5,g
4,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,100.0,g,chicken broth,100 cc chicken broth,100,cc


In [1156]:
import pandas as pd

expected_cols = ['ingredient', 'energy_kcal_per_100g', 'carbs_g_per_100g',
                 'protein_g_per_100g', 'fat_g_per_100g', 'top_micronutrients']

# Check that data is a DataFrame and has required columns
if isinstance(combined_nutrition_data, pd.DataFrame) and all(col in combined_nutrition_data.columns for col in expected_cols):
    df = combined_nutrition_data[expected_cols]

    # --- Step 1: Drop rows where all 5 nutrient columns are empty/null/N/A ---
    nutrient_cols = ['energy_kcal_per_100g', 'carbs_g_per_100g', 'protein_g_per_100g', 'fat_g_per_100g', 'top_micronutrients']

    # Mark entries considered as "null"
    df[nutrient_cols] = df[nutrient_cols].replace(['', 'N/A', 'n/a', None], pd.NA)

    # Find rows where all nutrient fields are NA
    condition = df[nutrient_cols].isna().all(axis=1)

    # Save dropped rows and count
    dropped_rows = df[condition][['ingredient']].copy()
    dropped_rows['null_count'] = 5

    # Drop the identified rows
    df = df[~condition].reset_index(drop=True)

    # --- Step 2: Clean and convert numeric fields ---
    numeric_cols = ['energy_kcal_per_100g', 'carbs_g_per_100g', 'protein_g_per_100g', 'fat_g_per_100g']
    for col in numeric_cols:
        df[col] = df[col].replace(pd.NA, 0).fillna(0).astype(float)

else:
    print("❌ Could not load sheet or missing columns.")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[nutrient_cols] = df[nutrient_cols].replace(['', 'N/A', 'n/a', None], pd.NA)


In [1157]:
print("✅ Cleaned DataFrame:")
display(df.head())
print("Current Shape:" , df.shape)

print("🗑️ Dropped rows where all nutrient info was missing:")
print(dropped_rows.count())

✅ Cleaned DataFrame:


Unnamed: 0,ingredient,energy_kcal_per_100g,carbs_g_per_100g,protein_g_per_100g,fat_g_per_100g,top_micronutrients
0,(0.8 fl oz) whole milk or infant formula,0.0,6.86,7.81,11.0,"Potassium, K, Calcium, Ca, Phosphorus, P"
1,(1 1/2 oz/45 g) frozen petite peas,0.0,5.18,3.38,0.95,"Potassium, K, Calcium, Ca, Phosphorus, P"
2,(1 heaped tbsp) green beans,40.0,7.41,1.97,0.275,"Potassium, K, Vitamin K (phylloquinone), Molyb..."
3,(1 kg) beef or lamb stew meat,237.0,-0.251,17.5,18.6,"Potassium, K, Phosphorus, P, Cholesterol"
4,(1.75 oz) celery stalk (about 1 stalk),16.7,3.32,0.492,0.162,"Potassium, K, Sodium, Na, Calcium, Ca"


Current Shape: (636, 6)
🗑️ Dropped rows where all nutrient info was missing:
ingredient    76
null_count    76
dtype: int64


In [1158]:
# Check measurements to ensure all are in grams
print("=== CHECKING MEASUREMENTS ===")
print("Unique measurement types in complete_ingredients:")
measurement_counts = complete_ingredients['measurement'].value_counts()
print(measurement_counts.head(10))

# Check if all measurements are in grams
non_gram_measurements = complete_ingredients[complete_ingredients['measurement'] != 'g']
print(f"\nNon-gram measurements: {len(non_gram_measurements)}")
if len(non_gram_measurements) > 0:
    print("Sample non-gram measurements:")
    print(non_gram_measurements[['ingredient_name', 'quantity', 'measurement']].head())
    
# For this analysis, we'll assume all measurements should be treated as grams
# If there are non-gram measurements, they should be converted beforehand
print(f"\nProceeding with assumption that all quantities are in grams or gram-equivalent units")

=== CHECKING MEASUREMENTS ===
Unique measurement types in complete_ingredients:
measurement
g            1548
heaped         16
knob           14
pinch          12
level          11
bowl           10
sprig          10
ripe            9
measuring       9
garlic          9
Name: count, dtype: int64

Non-gram measurements: 347
Sample non-gram measurements:
          ingredient_name quantity measurement
17             lemongrass        1       stalk
29                 pieces      4-6    broccoli
32  thick wholemeal bread        1       slice
61                  onion        1      spring
62                    ham        1       slice

Proceeding with assumption that all quantities are in grams or gram-equivalent units


In [1159]:
def find_nutrition_match(ingredient_name, nutrition_df):
    """
    Find nutrition match using 3-step strategy:
    1. Exact match (e.g., "sweet potato" matches exactly "sweet potato")
    2. Keyword match (e.g., "broccoli floret" matches "broccoli")
    3. No match - return None
    
    Includes custom rules:
    - Ignore case sensitivity
    - Remove text in brackets ()
    - Remove x000D characters
    - Special keyword matching rules
    - Exclusion rules for certain keywords
    """
    
    def clean_ingredient_name(name):
        """Clean ingredient name according to custom rules"""
        if pd.isna(name):
            return ""
        
        name = str(name)
        
        # Remove x000D
        name = name.replace('\x00\x0D', '').replace('x000D', '')
        
        # Remove text in brackets ()
        name = re.sub(r'\([^)]*\)', '', name)
        
        # Clean up extra spaces and convert to lowercase
        name = ' '.join(name.split()).lower().strip()
        
        return name
    
    def should_exclude_ingredient(ingredient_name):
        """Check if ingredient should be excluded from matching"""
        ingredient_lower = ingredient_name.lower()
        
        # Don't match hokkien or rice noodles
        if 'hokkien' in ingredient_lower or 'rice noodles' in ingredient_lower:
            return True
            
        # Don't match potato starch with potato (return all zeros)
        if 'potato starch' in ingredient_lower:
            return True
            
        return False
    
    def create_zero_nutrition():
        """Create a nutrition entry with all zeros for excluded ingredients"""
        # Create a pandas Series that matches the structure of nutrition_df
        zero_series = pd.Series({
            'ingredient': 'EXCLUDED - ALL ZEROS',
            'energy_kcal_per_100g': 0,
            'carbs_g_per_100g': 0,
            'protein_g_per_100g': 0,
            'fat_g_per_100g': 0,
            'top_micronutrients': ''
        })
        return zero_series, "excluded_zeros"
    
    def apply_special_matching_rules(ingredient_clean):
        """Apply special matching rules for specific ingredients"""
        # Yoghurt = yogurt
        if 'yoghurt' in ingredient_clean:
            ingredient_clean = ingredient_clean.replace('yoghurt', 'yogurt')
        
        # Cauliflower/broccoli = broccoli
        if 'cauliflower/broccoli' in ingredient_clean or 'cauliflower broccoli' in ingredient_clean:
            ingredient_clean = 'broccoli'
        
        # Oatmeal to baby oatmeal (if not already baby oatmeal)
        if 'oatmeal' in ingredient_clean and 'baby' not in ingredient_clean:
            ingredient_clean = ingredient_clean.replace('oatmeal', 'baby oatmeal')
        
        return ingredient_clean
    
    # Clean the input ingredient name
    ingredient_clean = clean_ingredient_name(ingredient_name)
    
    # Check exclusion rules first
    if should_exclude_ingredient(ingredient_clean):
        return create_zero_nutrition()
    
    # Special case for water - return all zeros
    if ingredient_clean in ['water', 'plain water', 'boiling water', 'cold water', 'warm water']:
        return create_zero_nutrition()
    
    # Apply special matching rules
    ingredient_clean = apply_special_matching_rules(ingredient_clean)
    
    # Clean nutrition dataframe ingredient names for comparison
    nutrition_df_clean = nutrition_df.copy()
    nutrition_df_clean['ingredient_clean'] = nutrition_df_clean['ingredient'].apply(clean_ingredient_name)
    
    # Step 1: Try exact match
    exact_match = nutrition_df_clean[nutrition_df_clean['ingredient_clean'] == ingredient_clean]
    if len(exact_match) > 0:
        return exact_match.iloc[0], "exact_match"
    
    # Step 2: Try keyword matching with special rules
    ingredient_words = ingredient_clean.split()
    
    for word in ingredient_words:
        if len(word) > 2:  # Only consider words longer than 2 characters
            
            # Special keyword matching rules
            if word == 'broth':
                # If there's "broth" in ingredient, match with "broth"
                try:
                    broth_match = nutrition_df_clean[nutrition_df_clean['ingredient_clean'].str.contains(r'\bbroth\b', na=False, regex=True)]
                    if len(broth_match) > 0:
                        return broth_match.iloc[0], f"keyword_match_broth"
                except:
                    pass
            
            elif 'long onion' in ingredient_clean:
                # Long onion should match with "long onion" specifically
                try:
                    long_onion_match = nutrition_df_clean[nutrition_df_clean['ingredient_clean'].str.contains('long onion', na=False, regex=False)]
                    if len(long_onion_match) > 0:
                        return long_onion_match.iloc[0], f"keyword_match_long_onion"
                except:
                    pass
            
            elif 'baby oatmeal' in ingredient_clean:
                # Baby oatmeal should match with "baby oatmeal" specifically
                try:
                    baby_oatmeal_match = nutrition_df_clean[nutrition_df_clean['ingredient_clean'].str.contains('baby oatmeal', na=False, regex=False)]
                    if len(baby_oatmeal_match) > 0:
                        return baby_oatmeal_match.iloc[0], f"keyword_match_baby_oatmeal"
                except:
                    pass
            
            # General keyword matching for other words
            try:
                # Escape special regex characters and use word boundaries for exact word matching
                pattern = r'\b' + re.escape(word) + r'\b'
                keyword_match = nutrition_df_clean[nutrition_df_clean['ingredient_clean'].str.contains(pattern, na=False, regex=True)]
                if len(keyword_match) > 0:
                    return keyword_match.iloc[0], f"keyword_match_{word}"
            except Exception as e:
                # If regex fails, try simple string contains
                try:
                    keyword_match = nutrition_df_clean[nutrition_df_clean['ingredient_clean'].str.contains(word, na=False, regex=False)]
                    if len(keyword_match) > 0:
                        return keyword_match.iloc[0], f"keyword_match_{word}"
                except:
                    continue
    
    # Step 3: No match found
    return None, "no_match"


In [1160]:
def calculate_ingredient_nutrition_new_strategy(ingredient_df, nutrition_df):
    """
    Calculate nutrition per ingredient using new strategy:
    - Use quantity directly (assume all in grams or gram-equivalent)
    - No conversion, direct proportional calculation from per 100g data
    """
    print("=== CALCULATING NUTRIENTS PER INGREDIENT (NEW STRATEGY) ===")
    
    result_df = ingredient_df.copy()
    
    # Add nutrition columns
    result_df['energy_kcal'] = 0.0
    result_df['carbs_g'] = 0.0
    result_df['protein_g'] = 0.0
    result_df['fat_g'] = 0.0
    result_df['micronutrients'] = ''
    result_df['nutrition_matched'] = False
    result_df['match_type'] = ''
    result_df['matched_ingredient'] = ''
    
    # Track statistics
    exact_matches = 0
    keyword_matches = 0
    no_matches = 0
    total_count = len(ingredient_df)
    
    print(f"Processing {total_count} ingredients...")
    
    for idx, row in ingredient_df.iterrows():
        ingredient_name = row['ingredient_name']
        quantity = row['quantity']
        
        # Find nutrition match
        nutrition_match, match_type = find_nutrition_match(ingredient_name, nutrition_df)
        
        if nutrition_match is not None:
            result_df.loc[idx, 'nutrition_matched'] = True
            result_df.loc[idx, 'match_type'] = match_type
            result_df.loc[idx, 'matched_ingredient'] = nutrition_match['ingredient']
            
            # Count match types
            if match_type == "exact_match":
                exact_matches += 1
            elif "keyword_match" in match_type:
                keyword_matches += 1
            
            # Calculate nutrition based on quantity (assume quantity is in grams)
            try:
                qty_numeric = float(quantity)
                if qty_numeric > 0:
                    # Calculate proportion of 100g
                    proportion = qty_numeric / 100.0
                    
                    # Scale numeric nutrients by proportion
                    if pd.notna(nutrition_match['energy_kcal_per_100g']):
                        result_df.loc[idx, 'energy_kcal'] = nutrition_match['energy_kcal_per_100g'] * proportion
                    if pd.notna(nutrition_match['carbs_g_per_100g']):
                        result_df.loc[idx, 'carbs_g'] = nutrition_match['carbs_g_per_100g'] * proportion
                    if pd.notna(nutrition_match['protein_g_per_100g']):
                        result_df.loc[idx, 'protein_g'] = nutrition_match['protein_g_per_100g'] * proportion
                    if pd.notna(nutrition_match['fat_g_per_100g']):
                        result_df.loc[idx, 'fat_g'] = nutrition_match['fat_g_per_100g'] * proportion
                    
                    # Store micronutrients as-is (don't scale)
                    if pd.notna(nutrition_match['top_micronutrients']):
                        result_df.loc[idx, 'micronutrients'] = str(nutrition_match['top_micronutrients'])
            except (ValueError, TypeError):
                # If quantity is not numeric, skip calculation but keep the match info
                pass
        else:
            no_matches += 1
            result_df.loc[idx, 'match_type'] = 'no_match'
    return result_df

In [1161]:
ingredients_with_nutrition_updated = calculate_ingredient_nutrition_new_strategy(complete_ingredients, df)

=== CALCULATING NUTRIENTS PER INGREDIENT (NEW STRATEGY) ===
Processing 1895 ingredients...


In [1162]:
ingredients_with_nutrition_updated.columns.to_list
ingredients_with_nutrition_updated.head()

Unnamed: 0,recipe_id,recipe_name,quantity,measurement,ingredient_name,original_text,original_quantity,original_measurement,energy_kcal,carbs_g,protein_g,fat_g,micronutrients,nutrition_matched,match_type,matched_ingredient
0,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,60.0,g,cassava,"60 g cassava, boiled and blended",60,g,96.0,22.86,0.816,0.168,"Potassium, K, Magnesium, Mg, Phosphorus, P",True,exact_match,cassava
1,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,20.0,g,fish meat (milkfish),"20 g fish meat (milkfish), finely chopped",20,g,0.0,0.0,3.26,0.09,"Potassium, K, Phosphorus, P, Sodium, Na",True,exact_match,fish meat (milkfish)
2,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,10.0,g,chicken meat,10 g chicken meat,10,g,12.7,0.0,2.14,0.478,"Potassium, K, Phosphorus, P, Cholesterol",True,exact_match,chicken meat
3,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,5.0,g,coconut oil,5 g coconut oil,5,g,0.0,0.042,0.0,4.955,"Total fat (NLEA), Fatty acids, total saturated...",True,exact_match,coconut oil
4,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,100.0,g,chicken broth,100 cc chicken broth,100,cc,133.0,0.0,17.9,7.16,"Potassium, K, Phosphorus, P, Cholesterol",True,keyword_match_chicken,(250 g) ground turkey or chicken


##### Validation

In [1163]:
# NUTRITION CALCULATION VALIDATION FUNCTION
def validate_nutrition_calculations(calculated_df, nutrition_database_df, sample_size=20, tolerance=0.1):
    """
    Validate nutrition calculations by comparing calculated values with expected values
    from the main nutrition database.
    
    Parameters:
    - calculated_df: DataFrame with calculated nutrition values per ingredient
    - nutrition_database_df: Main nutrition database (df) with per 100g values
    - sample_size: Number of random samples to validate
    - tolerance: Acceptable percentage difference (0.1 = 10%)
    
    Returns:
    - validation_results: DataFrame with validation details
    - summary_stats: Dictionary with validation summary
    """
    
    def parse_quantity_safe(qty_str):
        """Safely parse quantity string including fractions"""
        if pd.isna(qty_str):
            return 0.0
        
        qty_str = str(qty_str)
        
        # Handle fractions
        fraction_map = {'½': 0.5, '¼': 0.25, '¾': 0.75, '⅓': 0.33, '⅔': 0.67, '⅛': 0.125}
        for frac, val in fraction_map.items():
            qty_str = qty_str.replace(frac, str(val))
        
        # Handle ranges (take average)
        if '-' in qty_str or '–' in qty_str:
            parts = re.split(r'[-–]', qty_str)
            if len(parts) == 2:
                try:
                    min_val = float(parts[0].strip())
                    max_val = float(parts[1].strip())
                    return (min_val + max_val) / 2
                except:
                    pass
        
        # Convert to float
        try:
            return float(qty_str)
        except:
            return 0.0
    
    print("🔍 NUTRITION CALCULATION VALIDATION")
    print("=" * 60)
    
    # Filter only ingredients that have nutrition matches (not excluded or no_match)
    matched_ingredients = calculated_df[
        (calculated_df['nutrition_matched'] == True) & 
        (calculated_df['match_type'] != 'excluded_zeros') &
        (calculated_df['match_type'] != 'no_match')
    ].copy()
    
    print(f"Total ingredients with nutrition matches: {len(matched_ingredients)}")
    
    # Sample random ingredients for validation
    if len(matched_ingredients) > sample_size:
        validation_sample = matched_ingredients.sample(n=sample_size, random_state=42)
        print(f"Validating random sample of {sample_size} ingredients")
    else:
        validation_sample = matched_ingredients.copy()
        print(f"Validating all {len(validation_sample)} matched ingredients")
    
    validation_results = []
    
    for idx, row in validation_sample.iterrows():
        ingredient_name = row['ingredient_name']
        matched_ingredient = row['matched_ingredient']
        quantity = parse_quantity_safe(row['quantity'])
        match_type = row['match_type']
        
        # Get calculated values
        calc_energy = row['energy_kcal'] if pd.notna(row['energy_kcal']) else 0
        calc_protein = row['protein_g'] if pd.notna(row['protein_g']) else 0
        calc_carbs = row['carbs_g'] if pd.notna(row['carbs_g']) else 0
        calc_fat = row['fat_g'] if pd.notna(row['fat_g']) else 0
        
        # Find the corresponding nutrition data in main database
        nutrition_match = nutrition_database_df[
            nutrition_database_df['ingredient'].str.lower().str.strip() == 
            matched_ingredient.lower().strip()
        ]
        
        if len(nutrition_match) > 0:
            nutr = nutrition_match.iloc[0]
            
            # Calculate expected values (quantity/100 * per_100g_value)
            if quantity > 0:
                proportion = quantity / 100.0
                
                expected_energy = nutr['energy_kcal_per_100g'] * proportion if pd.notna(nutr['energy_kcal_per_100g']) else 0
                expected_protein = nutr['protein_g_per_100g'] * proportion if pd.notna(nutr['protein_g_per_100g']) else 0
                expected_carbs = nutr['carbs_g_per_100g'] * proportion if pd.notna(nutr['carbs_g_per_100g']) else 0
                expected_fat = nutr['fat_g_per_100g'] * proportion if pd.notna(nutr['fat_g_per_100g']) else 0
                
                # Calculate percentage differences
                def calc_percentage_diff(calculated, expected):
                    if expected == 0 and calculated == 0:
                        return 0.0
                    elif expected == 0:
                        return float('inf') if calculated != 0 else 0.0
                    else:
                        return abs((calculated - expected) / expected) * 100
                
                energy_diff = calc_percentage_diff(calc_energy, expected_energy)
                protein_diff = calc_percentage_diff(calc_protein, expected_protein)
                carbs_diff = calc_percentage_diff(calc_carbs, expected_carbs)
                fat_diff = calc_percentage_diff(calc_fat, expected_fat)
                
                # Determine if validation passed (within tolerance)
                tolerance_percent = tolerance * 100
                energy_pass = energy_diff <= tolerance_percent or energy_diff == 0.0
                protein_pass = protein_diff <= tolerance_percent or protein_diff == 0.0
                carbs_pass = carbs_diff <= tolerance_percent or carbs_diff == 0.0
                fat_pass = fat_diff <= tolerance_percent or fat_diff == 0.0
                
                overall_pass = energy_pass and protein_pass and carbs_pass and fat_pass
                
                validation_results.append({
                    'ingredient_name': ingredient_name,
                    'matched_ingredient': matched_ingredient,
                    'match_type': match_type,
                    'quantity_g': quantity,
                    'proportion': proportion,
                    
                    # Energy validation
                    'calc_energy': round(calc_energy, 2),
                    'expected_energy': round(expected_energy, 2),
                    'energy_diff_percent': round(energy_diff, 2) if energy_diff != float('inf') else 'INF',
                    'energy_pass': energy_pass,
                    
                    # Protein validation
                    'calc_protein': round(calc_protein, 2),
                    'expected_protein': round(expected_protein, 2),
                    'protein_diff_percent': round(protein_diff, 2) if protein_diff != float('inf') else 'INF',
                    'protein_pass': protein_pass,
                    
                    # Carbs validation
                    'calc_carbs': round(calc_carbs, 2),
                    'expected_carbs': round(expected_carbs, 2),
                    'carbs_diff_percent': round(carbs_diff, 2) if carbs_diff != float('inf') else 'INF',
                    'carbs_pass': carbs_pass,
                    
                    # Fat validation
                    'calc_fat': round(calc_fat, 2),
                    'expected_fat': round(expected_fat, 2),
                    'fat_diff_percent': round(fat_diff, 2) if fat_diff != float('inf') else 'INF',
                    'fat_pass': fat_pass,
                    
                    'overall_pass': overall_pass,
                    'validation_status': 'PASS' if overall_pass else 'FAIL'
                })
            else:
                # Zero quantity case
                validation_results.append({
                    'ingredient_name': ingredient_name,
                    'matched_ingredient': matched_ingredient,
                    'match_type': match_type,
                    'quantity_g': quantity,
                    'validation_status': 'ZERO_QUANTITY',
                    'overall_pass': True  # Zero quantities are valid
                })
        else:
            # Could not find nutrition data for validation
            validation_results.append({
                'ingredient_name': ingredient_name,
                'matched_ingredient': matched_ingredient,
                'match_type': match_type,
                'quantity_g': quantity,
                'validation_status': 'NO_DB_MATCH',
                'overall_pass': False
            })
    
    # Create validation results DataFrame
    validation_df = pd.DataFrame(validation_results)
    
    # Calculate summary statistics
    if len(validation_df) > 0:
        total_validations = len(validation_df)
        successful_validations = len(validation_df[validation_df['validation_status'].isin(['PASS', 'FAIL'])])
        passed_validations = (validation_df['overall_pass'] == True).sum()
        failed_validations = (validation_df['validation_status'] == 'FAIL').sum()
        no_db_match = (validation_df['validation_status'] == 'NO_DB_MATCH').sum()
        zero_quantity = (validation_df['validation_status'] == 'ZERO_QUANTITY').sum()
        
        # Calculate pass rates for each nutrient (only for PASS/FAIL status)
        valid_for_nutrient_check = validation_df[validation_df['validation_status'].isin(['PASS', 'FAIL'])]
        if len(valid_for_nutrient_check) > 0:
            energy_pass_rate = (valid_for_nutrient_check['energy_pass'] == True).sum() / len(valid_for_nutrient_check) * 100
            protein_pass_rate = (valid_for_nutrient_check['protein_pass'] == True).sum() / len(valid_for_nutrient_check) * 100
            carbs_pass_rate = (valid_for_nutrient_check['carbs_pass'] == True).sum() / len(valid_for_nutrient_check) * 100
            fat_pass_rate = (valid_for_nutrient_check['fat_pass'] == True).sum() / len(valid_for_nutrient_check) * 100
        else:
            energy_pass_rate = protein_pass_rate = carbs_pass_rate = fat_pass_rate = 0
        
        summary_stats = {
            'total_validations': total_validations,
            'successful_validations': successful_validations,
            'passed_validations': passed_validations,
            'failed_validations': failed_validations,
            'no_db_match': no_db_match,
            'zero_quantity': zero_quantity,
            'overall_pass_rate': passed_validations / total_validations * 100 if total_validations > 0 else 0,
            'energy_pass_rate': energy_pass_rate,
            'protein_pass_rate': protein_pass_rate,
            'carbs_pass_rate': carbs_pass_rate,
            'fat_pass_rate': fat_pass_rate,
            'tolerance_percent': tolerance * 100
        }
    else:
        summary_stats = {'error': 'No validations performed'}
    
    return validation_df, summary_stats

# Run the validation
print("Starting nutrition calculation validation...")
validation_results, validation_summary = validate_nutrition_calculations(
    ingredients_with_nutrition_updated, 
    df, 
    sample_size=30,  # Validate 30 random samples
    tolerance=0.05   # 5% tolerance
)

Starting nutrition calculation validation...
🔍 NUTRITION CALCULATION VALIDATION
Total ingredients with nutrition matches: 1747
Validating random sample of 30 ingredients


In [1164]:
# Display validation results and summary
print("📊 VALIDATION SUMMARY REPORT")
print("=" * 60)

if 'error' in validation_summary:
    print(f"❌ Validation Error: {validation_summary['error']}")
else:
    # Display summary statistics
    print(f"📈 OVERALL VALIDATION STATISTICS:")
    print(f"   Total validations performed: {validation_summary['total_validations']}")
    print(f"   Successful validations: {validation_summary['successful_validations']}")
    print(f"   Passed validations: {validation_summary['passed_validations']}")
    print(f"   Failed validations: {validation_summary['failed_validations']}")
    print(f"   No database match: {validation_summary['no_db_match']}")
    print(f"   Zero quantity cases: {validation_summary['zero_quantity']}")
    print(f"   Overall pass rate: {validation_summary['overall_pass_rate']:.1f}%")
    print(f"   Tolerance used: ±{validation_summary['tolerance_percent']}%")
    
    print(f"\n🧪 NUTRIENT-SPECIFIC PASS RATES:")
    print(f"   Energy (kcal): {validation_summary['energy_pass_rate']:.1f}%")
    print(f"   Protein (g): {validation_summary['protein_pass_rate']:.1f}%")
    print(f"   Carbohydrates (g): {validation_summary['carbs_pass_rate']:.1f}%")
    print(f"   Fat (g): {validation_summary['fat_pass_rate']:.1f}%")

# Display detailed validation results
print(f"\n📋 DETAILED VALIDATION RESULTS:")
print("-" * 60)

if len(validation_results) > 0:
    # Show passed validations
    passed_validations = validation_results[validation_results['validation_status'] == 'PASS']
    if len(passed_validations) > 0:
        print(f"\n✅ PASSED VALIDATIONS ({len(passed_validations)}):")
        for idx, row in passed_validations.head(5).iterrows():
            print(f"   • {row['ingredient_name']} ({row['quantity_g']}g)")
            print(f"     Matched: {row['matched_ingredient']} via {row['match_type']}")
            print(f"     Energy: {row['calc_energy']} vs {row['expected_energy']} (diff: {row['energy_diff_percent']}%)")
    
    # Show failed validations
    failed_validations = validation_results[validation_results['validation_status'] == 'FAIL']
    if len(failed_validations) > 0:
        print(f"\n❌ FAILED VALIDATIONS ({len(failed_validations)}):")
        for idx, row in failed_validations.head(5).iterrows():
            print(f"   • {row['ingredient_name']} ({row['quantity_g']}g)")
            print(f"     Matched: {row['matched_ingredient']} via {row['match_type']}")
            print(f"     Energy: {row['calc_energy']} vs {row['expected_energy']} (diff: {row['energy_diff_percent']}%)")
            failed_nutrients = []
            if not row['energy_pass']: failed_nutrients.append('Energy')
            if not row['protein_pass']: failed_nutrients.append('Protein')
            if not row['carbs_pass']: failed_nutrients.append('Carbs')
            if not row['fat_pass']: failed_nutrients.append('Fat')
            print(f"     Failed nutrients: {', '.join(failed_nutrients)}")
    
    # Show some examples in tabular format
    print(f"\n📊 SAMPLE VALIDATION DETAILS:")
    display_cols = ['ingredient_name', 'quantity_g', 'calc_energy', 'expected_energy', 
                   'energy_diff_percent', 'validation_status']
    sample_results = validation_results[display_cols].head(10)
    print(sample_results.to_string(index=False))

# Overall validation assessment
print(f"\n🎯 VALIDATION ASSESSMENT:")
if 'error' not in validation_summary:
    overall_rate = validation_summary['overall_pass_rate']
    if overall_rate >= 95:
        print("   🟢 EXCELLENT: Calculations are highly accurate (≥95% pass rate)")
    elif overall_rate >= 90:
        print("   🟡 GOOD: Calculations are mostly accurate (≥90% pass rate)")
    elif overall_rate >= 80:
        print("   🟠 FAIR: Some calculation issues detected (≥80% pass rate)")
    else:
        print("   🔴 POOR: Significant calculation errors detected (<80% pass rate)")
        
    print(f"   📐 Calculation accuracy: {overall_rate:.1f}%")
    print(f"   🔍 Tolerance: ±{validation_summary['tolerance_percent']}%")

print(f"\n" + "=" * 60)
print("✅ VALIDATION COMPLETE!")

📊 VALIDATION SUMMARY REPORT
📈 OVERALL VALIDATION STATISTICS:
   Total validations performed: 30
   Successful validations: 30
   Passed validations: 30
   Failed validations: 0
   No database match: 0
   Zero quantity cases: 0
   Overall pass rate: 100.0%
   Tolerance used: ±5.0%

🧪 NUTRIENT-SPECIFIC PASS RATES:
   Energy (kcal): 100.0%
   Protein (g): 100.0%
   Carbohydrates (g): 100.0%
   Fat (g): 100.0%

📋 DETAILED VALIDATION RESULTS:
------------------------------------------------------------

✅ PASSED VALIDATIONS (30):
   • tuna in spring water (185.0g)
     Matched: tuna in spring water via exact_match
     Energy: 0.0 vs 0.0 (diff: 0.0%)
   • butter  _x000D_ (20.0g)
     Matched: (10g) Butter via keyword_match_butter
     Energy: 129.0 vs 129.0 (diff: 0.0%)
   • sweet potato (50.0g)
     Matched: sweet potato via exact_match
     Energy: 39.5 vs 39.5 (diff: 0.0%)
   • unsalted butter (15.0g)
     Matched: (15g) unsalted butter via exact_match
     Energy: 0.0 vs 0.0 (diff: 0.0%

In [1165]:
import pandas as pd

# Show all exact matches
exact_matches_df = ingredients_with_nutrition_updated[ingredients_with_nutrition_updated['match_type'] == 'exact_match']
print(f"\nExact Matches:")

exact_sample = exact_matches_df[['ingredient_name', 'matched_ingredient', 'match_type']]

# Display all rows without being cut off
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(exact_sample)



Exact Matches:


Unnamed: 0,ingredient_name,matched_ingredient,match_type
0,cassava,cassava,exact_match
1,fish meat (milkfish),fish meat (milkfish),exact_match
2,chicken meat,chicken meat,exact_match
3,coconut oil,coconut oil,exact_match
5,fresh lime juice,fresh lime juice,exact_match
6,spinach,spinach,exact_match
7,beef mince,beef mince,exact_match
9,milk,milk (can use water instead),exact_match
12,margarine,margarine,exact_match
18,cauliflower/broccoli,broccoli,exact_match


In [1166]:
# Show all no matches
no_matches_df = ingredients_with_nutrition_updated [ingredients_with_nutrition_updated['match_type'] == 'no_match']
print(f"\nNo Matches:")

no_match_sample = no_matches_df[['ingredient_name', 'match_type']]

# Display all rows and columns fully
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(no_match_sample)



No Matches:


Unnamed: 0,ingredient_name,match_type
17,lemongrass,no_match
34,polenta (a coarse,no_match
117,breadcrumbs,no_match
121,cornstarch,no_match
125,udon (preferably thin,no_match
126,dashi (baby-safe,no_match
139,dashi,no_match
183,(4 oz/125 g) pearl barley,no_match
251,stalks,no_match
290,fromage blanc,no_match


In [1167]:
keyword_matches_df = ingredients_with_nutrition_updated [ingredients_with_nutrition_updated ['match_type'].str.contains('keyword_match', na=False)]
keyword_matches = keyword_matches_df[['ingredient_name', 'matched_ingredient', 'match_type']]

# Ensure full display
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(keyword_matches)

Unnamed: 0,ingredient_name,matched_ingredient,match_type
4,chicken broth,(250 g) ground turkey or chicken,keyword_match_chicken
38,cooked rice,cold cooked basmati rice or brown rice,keyword_match_cooked
41,vanilla essence,natural vanilla extract,keyword_match_vanilla
82,self-raising flour,(65 g) all-purpose flour,keyword_match_flour
86,vegetables (bok choy,bok choy,keyword_match_choy
99,self-raising flour,(65 g) all-purpose flour,keyword_match_flour
107,canned tuna or salmon in spring water,(3 tbsp) canned five bean salad,keyword_match_canned
128,plain yogurt,plain cooked chestnuts,keyword_match_plain
131,Japanese rice *steamed,basmati rice,keyword_match_rice
133,Long onion *chopped,(40g) onion,keyword_match_onion


In [1168]:
# Check the total counts and math
total_processed = len(ingredients_with_nutrition_updated)
exact_matches = (ingredients_with_nutrition_updated['match_type'] == 'exact_match').sum()
keyword_matches = ingredients_with_nutrition_updated['match_type'].str.contains('keyword_match', na=False).sum()
no_matches = (ingredients_with_nutrition_updated['match_type'] == 'no_match').sum()
excluded_zeros = (ingredients_with_nutrition_updated['match_type'] == 'excluded_zeros').sum()

print(f"Total ingredients processed: {total_processed}")
print(f"Exact matches: {exact_matches}")
print(f"Keyword matches: {keyword_matches}")
print(f"No matches: {no_matches}")
print(f"Excluded zeros: {excluded_zeros}")

Total ingredients processed: 1895
Exact matches: 1496
Keyword matches: 251
No matches: 64
Excluded zeros: 84


#### Calculate Recipe

In [1052]:
# 🔧 SOLUTION: Ensure no_match cases have zero values for all numeric fields
def ensure_no_match_zero_values(df):
    print("🔧 ENSURING NO_MATCH CASES HAVE ZERO VALUES")
    print("=" * 50)
    
    # Define all numeric nutrition columns that should be zero for no_match
    numeric_nutrition_cols = [
        'energy_kcal', 'carbs_g', 'protein_g', 'fat_g', 
        'carbohydrate_g',  # Alternative naming
        # Add any other numeric nutrition columns you have
    ]
    
    # Find no_match cases
    no_match_mask = df['match_type'] == 'no_match'
    no_match_count = no_match_mask.sum()
    
    print(f"📊 Found {no_match_count} ingredients with 'no_match' status")
    
    if no_match_count > 0:
        # Set all numeric nutrition values to 0 for no_match cases
        for col in numeric_nutrition_cols:
            if col in df.columns:
                # Check current state
                before_none = df.loc[no_match_mask, col].isna().sum()
                before_values = df.loc[no_match_mask, col].notna().sum()
                
                # Set to zero
                df.loc[no_match_mask, col] = 0.0
                
                print(f"   ✅ {col}: Set {before_none} None/NaN + {before_values} existing values → 0")
        
        # Also ensure nutrition_matched is False for no_match cases
        if 'nutrition_matched' in df.columns:
            df.loc[no_match_mask, 'nutrition_matched'] = False
            print(f"   ✅ nutrition_matched: Set to False for all no_match cases")
        
        # Set micronutrients to empty list for no_match cases
        if 'micronutrients' in df.columns:
            df.loc[no_match_mask, 'micronutrients'] = '[]'  # Empty list as string
            print(f"   ✅ micronutrients: Set to empty list for all no_match cases")
        
        # Show sample of corrected data
        print(f"\n📋 SAMPLE CORRECTED NO_MATCH CASES:")
        sample_cols = ['ingredient_name', 'match_type'] + [col for col in numeric_nutrition_cols if col in df.columns]
        sample_no_match = df[no_match_mask][sample_cols].head(3)
        display(sample_no_match)
        
        print(f"\n✅ SUCCESS: All {no_match_count} no_match cases now have zero values for numeric fields!")
    else:
        print("ℹ️ No ingredients with 'no_match' status found.")
    
    return df

# Apply the correction to your current data
if 'ingredients_with_nutrition_updated' in locals():
    print("🧪 TESTING: Before correction")
    no_match_before = ingredients_with_nutrition_updated[ingredients_with_nutrition_updated['match_type'] == 'no_match']
    if len(no_match_before) > 0:
        print(f"Sample before correction (showing nutrition values):")
        display(no_match_before[['ingredient_name', 'match_type', 'energy_kcal', 'carbs_g', 'protein_g', 'fat_g']].head(3))
    
    # Apply the correction
    ingredients_with_nutrition_updated = ensure_no_match_zero_values(ingredients_with_nutrition_updated)
    
    print(f"\n🧪 VERIFICATION: After correction")
    no_match_after = ingredients_with_nutrition_updated[ingredients_with_nutrition_updated['match_type'] == 'no_match']
    if len(no_match_after) > 0:
        print(f"Sample after correction (all should be 0.0):")
        display(no_match_after[['ingredient_name', 'match_type', 'energy_kcal', 'carbs_g', 'protein_g', 'fat_g']].head(3))
else:
    print("❌ ingredients_with_nutrition_updated not found. Please run the ingredient processing first.")

🧪 TESTING: Before correction
Sample before correction (showing nutrition values):


Unnamed: 0,ingredient_name,match_type,energy_kcal,carbs_g,protein_g,fat_g
17,lemongrass,no_match,0.0,0.0,0.0,0.0
34,polenta (a coarse,no_match,0.0,0.0,0.0,0.0
117,breadcrumbs,no_match,0.0,0.0,0.0,0.0


🔧 ENSURING NO_MATCH CASES HAVE ZERO VALUES
📊 Found 64 ingredients with 'no_match' status
   ✅ energy_kcal: Set 0 None/NaN + 64 existing values → 0
   ✅ carbs_g: Set 0 None/NaN + 64 existing values → 0
   ✅ protein_g: Set 0 None/NaN + 64 existing values → 0
   ✅ fat_g: Set 0 None/NaN + 64 existing values → 0
   ✅ nutrition_matched: Set to False for all no_match cases
   ✅ micronutrients: Set to empty list for all no_match cases

📋 SAMPLE CORRECTED NO_MATCH CASES:


Unnamed: 0,ingredient_name,match_type,energy_kcal,carbs_g,protein_g,fat_g
17,lemongrass,no_match,0.0,0.0,0.0,0.0
34,polenta (a coarse,no_match,0.0,0.0,0.0,0.0
117,breadcrumbs,no_match,0.0,0.0,0.0,0.0



✅ SUCCESS: All 64 no_match cases now have zero values for numeric fields!

🧪 VERIFICATION: After correction
Sample after correction (all should be 0.0):


Unnamed: 0,ingredient_name,match_type,energy_kcal,carbs_g,protein_g,fat_g
17,lemongrass,no_match,0.0,0.0,0.0,0.0
34,polenta (a coarse,no_match,0.0,0.0,0.0,0.0
117,breadcrumbs,no_match,0.0,0.0,0.0,0.0


In [1180]:
# Calculate recipe-level nutrition using new strategy
def calculate_recipe_nutrition_new(ingredients_df):
    """
    Calculate overall recipe nutrition by aggregating ingredient nutrition.
    For numeric nutrients: sum all values and divide by number of ingredients
    For micronutrients: find overlapping nutrients across all ingredients
    """
    print("=== CALCULATING RECIPE-LEVEL NUTRITION (NEW STRATEGY) ===")
    
    # Group by recipe
    recipe_nutrition = []
    
    for recipe_id, recipe_group in ingredients_df.groupby('recipe_id'):
        # Filter only ingredients with nutrition data
        recipe_name = recipe_group['recipe_name'].iloc[0] if 'recipe_name' in recipe_group.columns else f"Recipe {recipe_id}"
        with_nutrition = recipe_group[recipe_group['nutrition_matched'] == True]
        
        if len(with_nutrition) == 0:
            continue
            
        recipe_data = {
            'recipe_id': recipe_id,
            'recipe_name': recipe_name,
            'total_ingredients': len(recipe_group),
            'ingredients_with_nutrition': len(with_nutrition),
            'nutrition_coverage': len(with_nutrition) / len(recipe_group) * 100,
            'exact_matches': len(with_nutrition[with_nutrition['match_type'] == 'exact_match']),
            'keyword_matches': len(with_nutrition[with_nutrition['match_type'].str.contains('keyword_match', na=False)])
        }
        
        # Calculate total numeric nutrients (sum without averaging)
        numeric_cols = ['energy_kcal', 'carbs_g', 'protein_g', 'fat_g']
        
        for col in numeric_cols:
            total_value = with_nutrition[col].sum()
            recipe_data[f'total_{col}'] = total_value  # Total nutrition value
            # Optional: Keep average per ingredient with nutrition data
            recipe_data[f'avg_per_matched_{col}'] = total_value / len(with_nutrition) if len(with_nutrition) > 0 else 0
        
        # Find overlapping micronutrients
        micronutrient_lists = []
        for _, ingredient in with_nutrition.iterrows():
            if ingredient['micronutrients'] and str(ingredient['micronutrients']).strip():
                try:
                    # Try to parse as list if it's a string representation
                    micro_str = str(ingredient['micronutrients'])
                    if micro_str.startswith('[') and micro_str.endswith(']'):
                        micro_list = eval(micro_str)  # Be careful with eval in production
                    else:
                        # Split by comma if it's a comma-separated string
                        micro_list = [m.strip() for m in micro_str.split(',')]
                    micronutrient_lists.append(set(micro_list))
                except:
                    # If parsing fails, treat as single item
                    micronutrient_lists.append({str(ingredient['micronutrients'])})
        
        # Find overlapping micronutrients (present in all ingredients)
        if micronutrient_lists:
            # Calculate overlapping micronutrients (present in ALL ingredients)
            overlapping_micronutrients = set.intersection(*micronutrient_lists)
            all_micronutrients = set.union(*micronutrient_lists)
            
            # 🎯 KEY MODIFICATION: Use overlapping if available, otherwise use all
            if overlapping_micronutrients:
                # Use overlapping micronutrients (ideal case)
                final_micronutrients = list(overlapping_micronutrients)
                micronutrient_source = "overlapping"
                print(f"   Recipe '{recipe_name}': Using {len(final_micronutrients)} overlapping micronutrients")
            else:
                # Fallback: Use all unique micronutrients when no overlap
                final_micronutrients = list(all_micronutrients)
                micronutrient_source = "all_unique"
                print(f"   Recipe '{recipe_name}': No overlap found, using {len(final_micronutrients)} unique micronutrients")
            
            # Store results
            recipe_data['overlapping_micronutrients'] = list(overlapping_micronutrients)
            recipe_data['all_micronutrients'] = list(all_micronutrients)
            recipe_data['final_micronutrients'] = final_micronutrients  # 🆕 The ones actually used
            recipe_data['micronutrient_source'] = micronutrient_source  # 🆕 Track which method was used
            recipe_data['micronutrient_overlap_count'] = len(overlapping_micronutrients)
            recipe_data['total_unique_micronutrients'] = len(all_micronutrients)
            recipe_data['final_micronutrient_count'] = len(final_micronutrients)  # 🆕 Count of final list
        else:
            # No micronutrients found at all
            recipe_data['overlapping_micronutrients'] = []
            recipe_data['all_micronutrients'] = []
            recipe_data['final_micronutrients'] = []
            recipe_data['micronutrient_source'] = "none"
            recipe_data['micronutrient_overlap_count'] = 0
            recipe_data['total_unique_micronutrients'] = 0
            recipe_data['final_micronutrient_count'] = 0
        
        recipe_nutrition.append(recipe_data)
    
    recipe_nutrition_df = pd.DataFrame(recipe_nutrition)
    print(f"✅ Calculated nutrition for {len(recipe_nutrition_df)} recipes")
    
    return recipe_nutrition_df

In [1181]:
# Execute recipe nutrition calculation
recipe_nutrition_new = calculate_recipe_nutrition_new(ingredients_with_nutrition_updated)

print("columns:" , recipe_nutrition_new.columns.to_list)

display(recipe_nutrition_new.head(10))


=== CALCULATING RECIPE-LEVEL NUTRITION (NEW STRATEGY) ===
   Recipe 'Cassava Porridge with Fish Sauce and Lemon (Bubur Singkong Kukuruyuk Saus Jeruk)': No overlap found, using 17 unique micronutrients
   Recipe 'Bitterballs (Bitterballen)': Using 2 overlapping micronutrients
   Recipe 'Broccoli/Cauliflower Cheese': No overlap found, using 15 unique micronutrients
   Recipe 'Beef Casserole': No overlap found, using 14 unique micronutrients
   Recipe 'Toast Fingers': Using 6 overlapping micronutrients
   Recipe 'Pumpkin Polenta Fingers': No overlap found, using 13 unique micronutrients
   Recipe 'Rice Pudding': No overlap found, using 11 unique micronutrients
   Recipe 'Hummus': No overlap found, using 16 unique micronutrients
   Recipe 'Baked Bean Pie': Using 2 overlapping micronutrients
   Recipe 'Tomato & Lentil Soup': No overlap found, using 22 unique micronutrients
   Recipe 'Fried Rice': No overlap found, using 17 unique micronutrients
   Recipe 'Toddler Couscous': No overlap found

Unnamed: 0,recipe_id,recipe_name,total_ingredients,ingredients_with_nutrition,nutrition_coverage,exact_matches,keyword_matches,total_energy_kcal,avg_per_matched_energy_kcal,total_carbs_g,...,avg_per_matched_protein_g,total_fat_g,avg_per_matched_fat_g,overlapping_micronutrients,all_micronutrients,final_micronutrients,micronutrient_source,micronutrient_overlap_count,total_unique_micronutrients,final_micronutrient_count
0,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,7,7,100.0,6,1,255.48,36.497143,24.854,...,3.566429,13.1378,1.876829,[],"[Magnesium, P, beta, Total fat (NLEA), Na, K, ...","[Magnesium, P, beta, Total fat (NLEA), Na, K, ...",all_unique,0,17,17
1,2,Bitterballs (Bitterballen),5,4,80.0,3,0,0.0,0.0,3.376,...,17.18775,50.332,12.583,"[Sodium, Na]","[P, Vitamin A, RAE, Retinol, Na, K, Sodium, Ph...","[Sodium, Na]",overlapping,2,9,2
2,3,Broccoli/Cauliflower Cheese,5,5,100.0,5,0,53.55,10.71,23.5955,...,9.2766,28.989,5.7978,[],"[P, Vitamin A, Vitamin K (phylloquinone), RAE,...","[P, Vitamin A, Vitamin K (phylloquinone), RAE,...",all_unique,0,15,15
3,5,Beef Casserole,6,6,100.0,6,0,1585.2,264.2,44.9207,...,18.835833,290.2725,48.37875,[],"[Sodium, P, Vitamin C, Cholesterol, Choline, V...","[Sodium, P, Vitamin C, Cholesterol, Choline, V...",all_unique,0,14,14
4,6,Toast Fingers,1,1,100.0,1,0,0.0,0.0,0.492,...,0.0943,0.0359,0.0359,"[Ca, Na, K, Sodium, Potassium, Calcium]","[Ca, Na, K, Sodium, Potassium, Calcium]","[Ca, Na, K, Sodium, Potassium, Calcium]",overlapping,6,6,6
5,7,Pumpkin Polenta Fingers,5,4,80.0,3,0,555.0,138.75,22.42,...,9.695,48.4,12.1,[],"[Magnesium, Sodium, P, Total fat (NLEA), Ca, N...","[Magnesium, Sodium, P, Total fat (NLEA), Ca, N...",all_unique,0,13,13
6,8,Rice Pudding,3,3,100.0,1,2,0.0,0.0,182.413,...,21.35,11.541,3.847,[],"[Magnesium, Sodium, P, DFE, Na, Folate, K, Pot...","[Magnesium, Sodium, P, DFE, Na, Folate, K, Pot...",all_unique,0,11,11
7,9,Hummus,5,5,100.0,5,0,548.0,109.6,90.602,...,10.31361,50.4154,10.08308,[],"[Magnesium, P, total dietary, Na, K, Sodium, t...","[Magnesium, P, total dietary, Na, K, Sodium, t...",all_unique,0,16,16
8,10,Baked Bean Pie,5,5,100.0,5,0,1207.1,241.42,228.558,...,21.9992,45.575,9.115,"[Phosphorus, P]","[P, Vitamin C, Ca, Na, Folate, K, Sodium, Phos...","[Phosphorus, P]",overlapping,2,12,2
9,11,Tomato & Lentil Soup,7,7,100.0,7,0,895.2,127.885714,179.396,...,8.915371,598.779,85.539857,[],"[Magnesium, P, Vitamin A, RAE, total dietary, ...","[Magnesium, P, Vitamin A, RAE, total dietary, ...",all_unique,0,22,22


In [1182]:
# 🧪 DETAILED VALIDATION TESTING FOR 5 RECIPES
print("=" * 80)
print("🧪 DETAILED VALIDATION TESTING FOR 5 SPECIFIC RECIPES")
print("=" * 80)

# Select 5 recipes for detailed validation
test_recipes = recipe_nutrition_new.head(5)['recipe_name'].tolist()

print(f"Testing these recipes: {test_recipes}")
print("\n" + "="*50)

for i, recipe_name in enumerate(test_recipes, 1):
    print(f"\n🔍 RECIPE {i}: {recipe_name}")
    print("-" * 60)
    
    # Get recipe ingredients
    recipe_ingredients = ingredients_with_nutrition_updated[
        ingredients_with_nutrition_updated['recipe_name'] == recipe_name
    ]
    
    # Get recipe nutrition results
    recipe_nutrition = recipe_nutrition_new[
        recipe_nutrition_new['recipe_name'] == recipe_name
    ].iloc[0]
    
    print(f"📊 RECIPE OVERVIEW:")
    print(f"   • Total ingredients: {len(recipe_ingredients)}")
    print(f"   • Matched ingredients: {recipe_nutrition['ingredients_with_nutrition']}")
    print(f"   • Nutrition coverage: {recipe_nutrition['nutrition_coverage']:.1f}%")
    print(f"   • Exact matches: {recipe_nutrition['exact_matches']}")
    print(f"   • Keyword matches: {recipe_nutrition['keyword_matches']}")
    
    # Show ingredient breakdown
    print(f"\n🥄 INGREDIENT BREAKDOWN:")
    ingredient_details = []
    total_manual_energy = 0
    total_manual_protein = 0
    total_manual_carbs = 0
    total_manual_fat = 0
    
    for _, ingredient in recipe_ingredients.iterrows():
        has_nutrition = ingredient['nutrition_matched']
        energy = ingredient['energy_kcal'] if has_nutrition else 0
        protein = ingredient['protein_g'] if has_nutrition else 0
        carbs = ingredient['carbs_g'] if has_nutrition else 0
        fat = ingredient['fat_g'] if has_nutrition else 0
        
        if has_nutrition:
            total_manual_energy += energy
            total_manual_protein += protein
            total_manual_carbs += carbs
            total_manual_fat += fat
        
        status = "✅ Matched" if has_nutrition else "❌ No match"
        print(f"   • {ingredient['ingredient_name']:<25} | {status:<12} | "
              f"Energy: {energy:>6.1f} kcal | Protein: {protein:>5.1f}g | "
              f"Carbs: {carbs:>5.1f}g | Fat: {fat:>5.1f}g")
    
    # Verify calculations
    print(f"\n🧮 MANUAL vs SYSTEM CALCULATION VERIFICATION:")
    print(f"   Energy:  Manual={total_manual_energy:>6.1f} | System={recipe_nutrition['total_energy_kcal']:>6.1f} | Match: {'✅' if abs(total_manual_energy - recipe_nutrition['total_energy_kcal']) < 0.1 else '❌'}")
    print(f"   Protein: Manual={total_manual_protein:>6.1f} | System={recipe_nutrition['total_protein_g']:>6.1f} | Match: {'✅' if abs(total_manual_protein - recipe_nutrition['total_protein_g']) < 0.1 else '❌'}")
    print(f"   Carbs:   Manual={total_manual_carbs:>6.1f} | System={recipe_nutrition['total_carbs_g']:>6.1f} | Match: {'✅' if abs(total_manual_carbs - recipe_nutrition['total_carbs_g']) < 0.1 else '❌'}")
    print(f"   Fat:     Manual={total_manual_fat:>6.1f} | System={recipe_nutrition['total_fat_g']:>6.1f} | Match: {'✅' if abs(total_manual_fat - recipe_nutrition['total_fat_g']) < 0.1 else '❌'}")
    
    # Show micronutrients
    if recipe_nutrition['total_unique_micronutrients'] > 0:
        print(f"\n🧬 MICRONUTRIENTS:")
        print(f"   • Total unique: {recipe_nutrition['total_unique_micronutrients']}")
        print(f"   • Overlapping: {recipe_nutrition['micronutrient_overlap_count']}")
        if recipe_nutrition['overlapping_micronutrients']:
            overlap_str = str(recipe_nutrition['overlapping_micronutrients'])[:100]
            print(f"   • Common nutrients: {overlap_str}{'...' if len(str(recipe_nutrition['overlapping_micronutrients'])) > 100 else ''}")
    
    print("-" * 60)

print(f"\n✅ VALIDATION COMPLETE FOR ALL {len(test_recipes)} RECIPES")
print("=" * 80)

🧪 DETAILED VALIDATION TESTING FOR 5 SPECIFIC RECIPES
Testing these recipes: ['Cassava Porridge with Fish Sauce and Lemon (Bubur Singkong Kukuruyuk Saus Jeruk)', 'Bitterballs (Bitterballen)', 'Broccoli/Cauliflower Cheese', 'Beef Casserole', 'Toast Fingers']


🔍 RECIPE 1: Cassava Porridge with Fish Sauce and Lemon (Bubur Singkong Kukuruyuk Saus Jeruk)
------------------------------------------------------------
📊 RECIPE OVERVIEW:
   • Total ingredients: 7
   • Matched ingredients: 7
   • Nutrition coverage: 100.0%
   • Exact matches: 6
   • Keyword matches: 1

🥄 INGREDIENT BREAKDOWN:
   • cassava                   | ✅ Matched    | Energy:   96.0 kcal | Protein:   0.8g | Carbs:  22.9g | Fat:   0.2g
   • fish meat (milkfish)      | ✅ Matched    | Energy:    0.0 kcal | Protein:   3.3g | Carbs:   0.0g | Fat:   0.1g
   • chicken meat              | ✅ Matched    | Energy:   12.7 kcal | Protein:   2.1g | Carbs:   0.0g | Fat:   0.5g
   • coconut oil               | ✅ Matched    | Energy:    0.0 

In [1183]:
#add nutrition idx to the recipe_nutrition_new
# Add nutrition index (simple index + 1)
recipe_nutrition_new['nutrition_id'] = recipe_nutrition_new.index + 1

# Keep only specified columns
columns_to_keep = ['recipe_id', 'total_energy_kcal', 'total_carbs_g', 'total_protein_g', 'total_fat_g', 'final_micronutrients', 'nutrition_id']
recipe_nutrition =recipe_nutrition_new.copy()
recipe_nutrition = recipe_nutrition[columns_to_keep]

print(f"Final recipe nutrition shape: {recipe_nutrition.shape}")
print(f"Columns: {recipe_nutrition.columns.tolist()}")
display(recipe_nutrition.head())

Final recipe nutrition shape: (408, 7)
Columns: ['recipe_id', 'total_energy_kcal', 'total_carbs_g', 'total_protein_g', 'total_fat_g', 'final_micronutrients', 'nutrition_id']


Unnamed: 0,recipe_id,total_energy_kcal,total_carbs_g,total_protein_g,total_fat_g,final_micronutrients,nutrition_id
0,1,255.48,24.854,24.965,13.1378,"[Magnesium, P, beta, Total fat (NLEA), Na, K, ...",1
1,2,0.0,3.376,68.751,50.332,"[Sodium, Na]",2
2,3,53.55,23.5955,46.383,28.989,"[P, Vitamin A, Vitamin K (phylloquinone), RAE,...",3
3,5,1585.2,44.9207,113.015,290.2725,"[Sodium, P, Vitamin C, Cholesterol, Choline, V...",4
4,6,0.0,0.492,0.0943,0.0359,"[Ca, Na, K, Sodium, Potassium, Calcium]",5


### **Last Validation w/ Saving Excel**

In [932]:
import pandas as pd
from IPython.display import display

# List of DataFrames to display
dataframes = {
    "📝 recipe_ingredient": recipe_ingredient,
    "🏷️ recipe_category_df": recipe_category_df,
    "🍴 recipes": recipes,
    "📊 recipe_nutrition": recipe_nutrition,
    "🧂 final_ingredient_df": final_ingredient_df
}

print("📊 DISPLAYING ALL DATAFRAMES")
print("=" * 60)

for title, df in dataframes.items():
    print(f"\n➡️ {title}")
    print("-" * 50)
    if isinstance(df, pd.DataFrame) or isinstance(df, pd.Series):
        display(df.head(5))  # Show first 5 rows
    else:
        print("(Data not available or not a DataFrame)")

📊 DISPLAYING ALL DATAFRAMES

➡️ 📝 recipe_ingredient
--------------------------------------------------


Unnamed: 0,recipe_id,ingredient_id
0,1,154
1,1,329
2,1,181
3,1,230
4,1,184



➡️ 🏷️ recipe_category_df
--------------------------------------------------


Unnamed: 0,recipe_id,category_id
0,1,11
1,1,9
2,1,4
3,1,5
4,1,6



➡️ 🍴 recipes
--------------------------------------------------


Unnamed: 0,name,ingredients,instructions,min_age,max_age,texture,prep_time,cook_time,serving,recipe_link,credibility,image_link,difficulty,meal_type,description,tips,hypoallergenic,origin_id,choking_hazards,recipe_id
0,Cassava Porridge with Fish Sauce and Lemon (Bu...,"- 60 g cassava, boiled and blended\n- 20 g fis...","Broth:\n1. Use chicken bones, chicken feet, fi...",6,8,NONE,15.0,45.0,1,https://eprints.uad.ac.id/51598/1/Buku%20Makan...,,,Medium,,,,Yes,9,No,1
1,Bitterballs (Bitterballen),- 100 g beef mince \n- 30 g potato starch \n- ...,1. Stir-fry blended spices until fragrant. \n2...,9,11,family food,30.0,30.0,10 servings,pdfcoffee.com_mommyclopedia-78-resep-mpasi-pdf...,,,Hard,Snack,,,Yes,9,No,2
2,Broccoli/Cauliflower Cheese,"- 175g cauliflower/broccoli, cut into pieces\n...","1. Steam, boil, or microwave cauliflower/brocc...",6,12,NONE,10.0,20.0,,https://www.islhd.health.nsw.gov.au/sites/defa...,Illawarra Shoalhaven Local Health District (IS...,,Medium,,,,Yes,68,No,3
3,Vegetable Fingers,"- 1 carrot, potato, or sweet potato, peeled an...",1. Steam or microwave vegetables until tender....,6,12,NONE,5.0,10.0,,https://www.islhd.health.nsw.gov.au/sites/defa...,Illawarra Shoalhaven Local Health District (IS...,,Easy,,,,Yes,68,No,4
4,Beef Casserole,"- 1 onion, peeled and finely chopped\n- 1½ tab...",1. Preheat oven to 180°C.\n2. Heat oil in a me...,6,12,family food,10.0,2.0,,https://www.islhd.health.nsw.gov.au/sites/defa...,Illawarra Shoalhaven Local Health District (IS...,,Medium,,,,Yes,68,No,5



➡️ 📊 recipe_nutrition
--------------------------------------------------


Unnamed: 0,recipe_id,total_energy_kcal,total_carbs_g,total_protein_g,total_fat_g,overlapping_micronutrients,nutrition_id
0,1,255.48,24.854,24.965,13.1378,[],1
1,2,0.0,3.376,68.751,50.332,"[Sodium, Na]",2
2,3,53.55,23.5955,46.383,28.989,[],3
3,5,1585.2,44.9207,113.015,290.2725,[],4
4,6,0.0,0.492,0.0943,0.0359,"[Ca, Na, K, Sodium, Potassium, Calcium]",5



➡️ 🧂 final_ingredient_df
--------------------------------------------------


Unnamed: 0,pk,name,allergen_group_id,isAllergen
0,1,Whole Milk,1.0,True
1,2,Breast Milk,,False
2,3,Almond Butter,4.0,True
3,4,Chicken Breast,,False
4,5,Salmon Fillet,6.0,True


##### Populate Texture Using Unsupervised Learning

In [762]:
recipes_testing = recipes.copy()

recipes_testing.columns.to_list()

['name',
 'ingredients',
 'instructions',
 'min_age',
 'max_age',
 'texture',
 'prep_time',
 'cook_time',
 'serving',
 'recipe_link',
 'credibility',
 'image_link',
 'difficulty',
 'meal_type',
 'description',
 'tips',
 'hypoallergenic',
 'origin_id',
 'choking_hazards',
 'recipe_id']

In [763]:
recipes_testing_original = recipes_testing.copy()

In [764]:
display(recipes_testing_original)

Unnamed: 0,name,ingredients,instructions,min_age,max_age,texture,prep_time,cook_time,serving,recipe_link,credibility,image_link,difficulty,meal_type,description,tips,hypoallergenic,origin_id,choking_hazards,recipe_id
0,Cassava Porridge with Fish Sauce and Lemon (Bu...,"- 60 g cassava, boiled and blended\n- 20 g fis...","Broth:\n1. Use chicken bones, chicken feet, fi...",6,8,NONE,15.0,45.0,1,https://eprints.uad.ac.id/51598/1/Buku%20Makan...,,,Medium,,,,Yes,9,No,1
1,Bitterballs (Bitterballen),- 100 g beef mince \n- 30 g potato starch \n- ...,1. Stir-fry blended spices until fragrant. \n2...,9,11,family food,30.0,30.0,10 servings,pdfcoffee.com_mommyclopedia-78-resep-mpasi-pdf...,,,Hard,Snack,,,Yes,9,No,2
2,Broccoli/Cauliflower Cheese,"- 175g cauliflower/broccoli, cut into pieces\n...","1. Steam, boil, or microwave cauliflower/brocc...",6,12,NONE,10.0,20.0,,https://www.islhd.health.nsw.gov.au/sites/defa...,Illawarra Shoalhaven Local Health District (IS...,,Medium,,,,Yes,68,No,3
3,Vegetable Fingers,"- 1 carrot, potato, or sweet potato, peeled an...",1. Steam or microwave vegetables until tender....,6,12,NONE,5.0,10.0,,https://www.islhd.health.nsw.gov.au/sites/defa...,Illawarra Shoalhaven Local Health District (IS...,,Easy,,,,Yes,68,No,4
4,Beef Casserole,"- 1 onion, peeled and finely chopped\n- 1½ tab...",1. Preheat oven to 180°C.\n2. Heat oil in a me...,6,12,family food,10.0,2.0,,https://www.islhd.health.nsw.gov.au/sites/defa...,Illawarra Shoalhaven Local Health District (IS...,,Medium,,,,Yes,68,No,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1317,Baby breakfast acai bowl,- 50 g berries (you can use a mix of frozen be...,1. Blend all ingredients together and make sur...,6,12,puree,,,,https://chipeafood.com/recipes/baby-breakfast-...,,,Medium,,This super quick but very healthy breakfast bo...,,Yes,42,No,1318
1318,Savoury Toastie,"- 3 small mushrooms, finely chopped\n- ½ cup b...",1. Microwave mushrooms on high for 30 seconds ...,12,12,NONE,5.0,15.0,,https://www.islhd.health.nsw.gov.au/sites/defa...,Illawarra Shoalhaven Local Health District (IS...,,Medium,,,Tip: Use other vegetables like Chinese cabbage...,Yes,68,No,1319
1319,Tuna Mornay,- 1 cup pasta\n- 1 tablespoon margarine\n- 1 t...,1. Preheat oven to 180°C.\n2. Cook pasta as pe...,12,12,NONE,10.0,30.0,,https://www.islhd.health.nsw.gov.au/sites/defa...,Illawarra Shoalhaven Local Health District (IS...,,Medium,,,Tip: Add extra vegetables like frozen mixed ve...,Yes,68,No,1320
1320,Fruit Muffins,- ¼ cup sugar\n- 1 cup milk\n- 1 egg\n- 2 tabl...,1. Preheat oven to 180°C and grease a 12-hole ...,12,12,NONE,10.0,25.0,12 muffins,https://www.islhd.health.nsw.gov.au/sites/defa...,Illawarra Shoalhaven Local Health District (IS...,,Medium,,,Tip: Try other combinations like grated carrot...,Yes,68,No,1321


In [633]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer  # Corrected from Tfirecipes_testingVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from collections import Counter

In [773]:
texture_mapping = {
    '': 'UNKNOWN',
    'NONE': 'UNKNOWN',
    np.nan: 'UNKNOWN',
    'unknown': 'UNKNOWN',
    'soft finger food': 'soft finger food',
    'puree': 'puree',
    'lumpy texture': 'lumpy texture',
    'family food': 'family food'
}
recipes_testing['texture'] = recipes_testing['texture'].map(texture_mapping).fillna('UNKNOWN')
print("Updated texture categories:")
print(recipes_testing['texture'].unique())
# Count occurrences of each unique texture
texture_counts = recipes_testing['texture'].value_counts()

# Print the counts
print("\nCount of each unique texture:")
print(texture_counts)

Updated texture categories:
['UNKNOWN' 'family food' 'puree' 'lumpy texture' 'soft finger food']

Count of each unique texture:
texture
UNKNOWN             822
puree               333
family food         103
lumpy texture        37
soft finger food     27
Name: count, dtype: int64


In [767]:
# 2. Create Combined Text Feature
recipes_testing['combined_text'] = (
    recipes_testing['name'] + " " + 
    recipes_testing['ingredients'].astype(str) + " " + 
    recipes_testing['instructions'].astype(str)
)

# 3. Text Vectorization and Dimensionality Reduction
# 3. Text Vectorization and Dimensionality Reduction
vectorizer = TfidfVectorizer(
    max_features=500,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=2  # Corrected from min_recipes_testing=2
)
X_tfidf = vectorizer.fit_transform(recipes_testing['combined_text'])

In [768]:

# Dimensionality reduction (captures 95%+ variance)
svd = TruncatedSVD(n_components=50, random_state=42)
X_reduced = svd.fit_transform(X_tfidf)
print(f"Explained variance: {svd.explained_variance_ratio_.sum():.1%}")

# 4. Determine number of clusters based on known textures
known_textures = [t for t in recipes_testing['texture'].unique() if t != 'UNKNOWN']
n_clusters = len(known_textures)  # Use number of known texture types

# 5. Clustering
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
recipes_testing['cluster'] = kmeans.fit_predict(X_reduced)

Explained variance: 48.1%


In [769]:
# 6. Map Clusters to Textures
texture_map = {}
for cluster_id in range(n_clusters):
    # Get known textures in cluster
    cluster_recipes_testing = recipes_testing[(recipes_testing['cluster'] == cluster_id) & (recipes_testing['texture'] != 'UNKNOWN')]
    
    if not cluster_recipes_testing.empty:
        # Get most common texture
        texture_map[cluster_id] = cluster_recipes_testing['texture'].mode()[0]
    else:
        # Keyword-based assignment
        cluster_text = " ".join(recipes_testing[recipes_testing['cluster'] == cluster_id]['combined_text']).lower()
        
        if any(kw in cluster_text for kw in ['blend', 'puree', 'mash', 'smooth', 'creamy']):
            texture_map[cluster_id] = 'puree'
        elif any(kw in cluster_text for kw in ['finger', 'dice', 'cube', 'stick', 'shape', 'log']):
            texture_map[cluster_id] = 'soft finger food'
        elif any(kw in cluster_text for kw in ['chop', 'piece', 'lump', 'chunk', 'mince', 'grate']):
            texture_map[cluster_id] = 'lumpy texture'
        else:
            texture_map[cluster_id] = 'family food'

In [770]:
# 7. Assign Textures to Unknowns
recipes_testing['predicted_texture'] = recipes_testing['texture']
unknown_mask = recipes_testing['texture'] == 'UNKNOWN'
recipes_testing.loc[unknown_mask, 'predicted_texture'] = recipes_testing.loc[unknown_mask, 'cluster'].map(texture_map)

# 8. Cluster Analysis and Validation
print("\n=== Cluster-Texture Mapping ===")
for cluster_id, texture in texture_map.items():
    cluster_size = (recipes_testing['cluster'] == cluster_id).sum()
    example_recipe = recipes_testing[recipes_testing['cluster'] == cluster_id].iloc[0]['name']
    print(f"Cluster {cluster_id}: {texture} | Size: {cluster_size} | Example: '{example_recipe}'")

# 9. Evaluate Cluster Quality
from sklearn.metrics import silhouette_score
sil_score = silhouette_score(X_reduced, recipes_testing['cluster'])
print(f"\nSilhouette Score: {sil_score:.3f} (0-1 scale, higher = better separation)")

# 10. Show Unknown Assignments
unknown_assignments = recipes_testing[unknown_mask][['name', 'predicted_texture']]
if not unknown_assignments.empty:
    print(f"\nAssigned textures for {len(unknown_assignments)} unknown entries:")
    print(unknown_assignments.sample(min(10, len(unknown_assignments)), random_state=42))
else:
    print("\nAll textures were already known - no predictions made")


=== Cluster-Texture Mapping ===
Cluster 0: puree | Size: 385 | Example: 'Broccoli/Cauliflower Cheese'
Cluster 1: puree | Size: 160 | Example: 'Purple Sweet Potato and Red Spinach Puree'
Cluster 2: puree | Size: 360 | Example: 'Toast Fingers'
Cluster 3: puree | Size: 417 | Example: 'Cassava Porridge with Fish Sauce and Lemon (Bubur Singkong Kukuruyuk Saus Jeruk)'

Silhouette Score: 0.058 (0-1 scale, higher = better separation)

Assigned textures for 822 unknown entries:
                                                   name predicted_texture
1017                                    Turkey and peas             puree
274                      Shrimp Risotto (Risotto Udang)             puree
133          Tofu Stuffed with Sausage (Tahu Isi Sosis)             puree
263   Red Bean Cake with Eggs (Bolu Kacang Merah Telur)             puree
458                      Cheesy Potatoes (Kentang Keju)             puree
573                    Beef & Prunes with Mashed Potato             puree
1305   

In [771]:
# Ensure that unknown_assignments is not empty before proceeding
if not unknown_assignments.empty:
    # Generate the count of each unique value in the 'predicted_texture' column
    texture_counts = unknown_assignments['predicted_texture'].value_counts()
    
    # Print the counts
    print("\nCount of each unique predicted texture for unknown entries:")
    print(texture_counts)
else:
    print("\nAll textures were already known - no predictions made")


Count of each unique predicted texture for unknown entries:
predicted_texture
puree    822
Name: count, dtype: int64


In [None]:
# 11. Save Results
output_cols = [
    'recipe_id', 'name', 'ingredients', 'instructions', 
    'min_age', 'max_age', 'texture', 'predicted_texture', 
    'prep_time', 'cook_time', 'serving', 'region'
]
recipes_testing[output_cols].to_excel("texture_predictions.xlsx", index=False)
print("\nPredictions saved to texture_predictions.xlsx")

# 12. Additional Analysis: Texture-Age Relationship
print("\n=== Texture by Age Group ===")
texture_age = recipes_testing.groupby('predicted_texture').agg(
    min_age_min=('min_age', 'min'),
    min_age_avg=('min_age', 'mean'),
    max_age_max=('max_age', 'max')
).reset_index()
print(texture_age)

In [None]:
# Check count of texture None values
none_count = df[df['texture'].isna()].shape[0]  # For actual None/NaN values
print(f"Number of None values in texture column: {none_count}")

# Check count of "None" string values
none_string_count = df[df['texture'] == "None"].shape[0]
print(f"Number of 'None' string values in texture column: {none_string_count}")

# Check count of empty strings
empty_string_count = df[df['texture'] == ""].shape[0]  
print(f"Number of empty strings in texture column: {empty_string_count}")

# Check count of "NONE" uppercase string values (as seen in your code elsewhere)
none_upper_count = df[df['texture'] == "NONE"].shape[0]
print(f"Number of 'NONE' values in texture column: {none_upper_count}")

=== CALCULATING RECIPE-LEVEL NUTRITION (NEW STRATEGY) ===
   🔄 Recipe 'Cassava Porridge with Fish Sau...' - No overlapping micronutrients, using all unique micronutrients
   ✅ Recipe 'Bitterballs (Bitterballen)...' - Found 2 overlapping micronutrients
   🔄 Recipe 'Broccoli/Cauliflower Cheese...' - No overlapping micronutrients, using all unique micronutrients
   🔄 Recipe 'Beef Casserole...' - No overlapping micronutrients, using all unique micronutrients
   ✅ Recipe 'Toast Fingers...' - Found 6 overlapping micronutrients
   🔄 Recipe 'Pumpkin Polenta Fingers...' - No overlapping micronutrients, using all unique micronutrients
   🔄 Recipe 'Rice Pudding...' - No overlapping micronutrients, using all unique micronutrients
   🔄 Recipe 'Hummus...' - No overlapping micronutrients, using all unique micronutrients
   ✅ Recipe 'Baked Bean Pie...' - Found 2 overlapping micronutrients
   🔄 Recipe 'Tomato & Lentil Soup...' - No overlapping micronutrients, using all unique micronutrients
   🔄 Recipe

=== CALCULATING RECIPE-LEVEL NUTRITION (NEW STRATEGY) ===
   🔄 Recipe 'Cassava Porridge with Fish Sau...' - No overlapping micronutrients, using all unique micronutrients
   ✅ Recipe 'Bitterballs (Bitterballen)...' - Found 2 overlapping micronutrients
   🔄 Recipe 'Broccoli/Cauliflower Cheese...' - No overlapping micronutrients, using all unique micronutrients
   🔄 Recipe 'Beef Casserole...' - No overlapping micronutrients, using all unique micronutrients
   ✅ Recipe 'Toast Fingers...' - Found 6 overlapping micronutrients
   🔄 Recipe 'Pumpkin Polenta Fingers...' - No overlapping micronutrients, using all unique micronutrients
   🔄 Recipe 'Rice Pudding...' - No overlapping micronutrients, using all unique micronutrients
   🔄 Recipe 'Hummus...' - No overlapping micronutrients, using all unique micronutrients
   ✅ Recipe 'Baked Bean Pie...' - Found 2 overlapping micronutrients
   🔄 Recipe 'Tomato & Lentil Soup...' - No overlapping micronutrients, using all unique micronutrients
   🔄 Recipe

Unnamed: 0,recipe_id,recipe_name,total_ingredients,ingredients_with_nutrition,nutrition_coverage,exact_matches,keyword_matches,total_energy_kcal,avg_per_matched_energy_kcal,total_carbs_g,avg_per_matched_carbs_g,total_protein_g,avg_per_matched_protein_g,total_fat_g,avg_per_matched_fat_g,overlapping_micronutrients,all_micronutrients,micronutrient_overlap_count,total_unique_micronutrients,micronutrient_strategy
0,1,Cassava Porridge with Fish Sauce and Lemon (Bu...,7,7,100.0,6,1,255.48,36.497143,24.854,3.550571,24.965,3.566429,13.1378,1.876829,"[Magnesium, P, beta, Total fat (NLEA), Na, K, ...","[Magnesium, P, beta, Total fat (NLEA), Na, K, ...",0,17,all_unique
1,2,Bitterballs (Bitterballen),5,4,80.0,3,0,0.0,0.0,3.376,0.844,68.751,17.18775,50.332,12.583,"[Sodium, Na]","[P, Vitamin A, RAE, Retinol, Na, K, Sodium, Ph...",2,9,overlapping
2,3,Broccoli/Cauliflower Cheese,5,5,100.0,5,0,53.55,10.71,23.5955,4.7191,46.383,9.2766,28.989,5.7978,"[P, Vitamin A, Vitamin K (phylloquinone), RAE,...","[P, Vitamin A, Vitamin K (phylloquinone), RAE,...",0,15,all_unique
3,5,Beef Casserole,6,6,100.0,6,0,1585.2,264.2,44.9207,7.486783,113.015,18.835833,290.2725,48.37875,"[Sodium, P, Vitamin C, Cholesterol, Choline, V...","[Sodium, P, Vitamin C, Cholesterol, Choline, V...",0,14,all_unique
4,6,Toast Fingers,1,1,100.0,1,0,0.0,0.0,0.492,0.492,0.0943,0.0943,0.0359,0.0359,"[Ca, Na, K, Sodium, Potassium, Calcium]","[Ca, Na, K, Sodium, Potassium, Calcium]",6,6,overlapping
5,7,Pumpkin Polenta Fingers,5,4,80.0,3,0,555.0,138.75,22.42,5.605,38.78,9.695,48.4,12.1,"[Magnesium, Sodium, P, Total fat (NLEA), Ca, N...","[Magnesium, Sodium, P, Total fat (NLEA), Ca, N...",0,13,all_unique
6,8,Rice Pudding,3,3,100.0,1,2,0.0,0.0,182.413,60.804333,64.05,21.35,11.541,3.847,"[Magnesium, Sodium, P, DFE, Na, Folate, K, Pot...","[Magnesium, Sodium, P, DFE, Na, Folate, K, Pot...",0,11,all_unique
7,9,Hummus,5,5,100.0,5,0,548.0,109.6,90.602,18.1204,51.56805,10.31361,50.4154,10.08308,"[Magnesium, P, total dietary, Na, K, Sodium, t...","[Magnesium, P, total dietary, Na, K, Sodium, t...",0,16,all_unique
8,10,Baked Bean Pie,5,5,100.0,5,0,1207.1,241.42,228.558,45.7116,109.996,21.9992,45.575,9.115,"[Phosphorus, P]","[P, Vitamin C, Ca, Na, Folate, K, Sodium, Phos...",2,12,overlapping
9,11,Tomato & Lentil Soup,7,7,100.0,7,0,895.2,127.885714,179.396,25.628,62.4076,8.915371,598.779,85.539857,"[Magnesium, P, Vitamin A, RAE, total dietary, ...","[Magnesium, P, Vitamin A, RAE, total dietary, ...",0,22,all_unique


In [None]:
# Flatten the list of lists to get all unique ingredients
all_ingredients = set()
for ingredient_list in df['ner_ingredient']:
    if isinstance(ingredient_list, list):
        all_ingredients.update(ingredient_list)

print(f"Total unique ingredients found: {len(all_ingredients)}")
print("Unique ingredients:")
for ingredient in sorted(all_ingredients):
    print(f"- {ingredient}")

In [None]:
# Count occurrences of each unique ingredient
ingredient_counts = {}
for ingredient_list in df['ner_ingredient']:
    if isinstance(ingredient_list, list):
        for ingredient in ingredient_list:
            if ingredient in ingredient_counts:
                ingredient_counts[ingredient] += 1
            else:
                ingredient_counts[ingredient] = 1

# Sort ingredients by frequency (most common first)
sorted_ingredients = sorted(ingredient_counts.items(), key=lambda x: x[1], reverse=True)

# Print the results
print(f"Total unique ingredients found: {len(ingredient_counts)}")
print("\nIngredient frequency (sorted by most common):")
for ingredient, count in sorted_ingredients:
    print(f"- {ingredient}: {count} recipes")

# Alternatively, print alphabetically with counts
print("\nIngredient frequency (sorted alphabetically):")
for ingredient in sorted(ingredient_counts.keys()):
    count = ingredient_counts[ingredient]
    print(f"- {ingredient}: {count} recipes")

In [None]:
# Step 1: Create unique ingredients dataframe with ingredient IDs
unique_ingredients_list = sorted(list(all_ingredients))

# Create ingredients dataframe with IDs
ingredients_df = pd.DataFrame({
    'ingredient_id': range(1, len(unique_ingredients_list) + 1),
    'ingredient_name': unique_ingredients_list
})

print(f"Created ingredients dataframe with {len(ingredients_df)} unique ingredients")
print("\nFirst 10 ingredients:")
print(ingredients_df.head(10))
print("\nLast 5 ingredients:")
print(ingredients_df.tail())

In [None]:
print("\nDetecting allergens for all ingredients...")
ingredients_df['detected_allergens'] = ingredients_df['ingredient_name'].apply(
    lambda x: detect_ingredient_allergens(x, allergen_tags)
)

# Filter ingredients with no allergens (empty list)
ingredients_with_no_allergens = ingredients_df[ingredients_df['detected_allergens'].apply(len) == 0]

# Store them in a separate dataframe
null_allergen_ingredients_df = ingredients_with_no_allergens[['ingredient_id', 'ingredient_name']].copy()

In [None]:

print(f"\nFound {len(null_allergen_ingredients_df)} ingredients with NO allergens detected:")
print("\nIngredients with no allergens:")
for _, row in null_allergen_ingredients_df.iterrows():
    print(f"- ID: {row['ingredient_id']}, Name: '{row['ingredient_name']}'")

# Show summary statistics
print(f"\n📊 Allergen Detection Summary:")
print(f"- Total ingredients: {len(ingredients_df)}")
print(f"- Ingredients with allergens: {len(ingredients_df) - len(null_allergen_ingredients_df)}")
print(f"- Ingredients with NO allergens: {len(null_allergen_ingredients_df)}")
print(f"- Percentage with no allergens: {len(null_allergen_ingredients_df)/len(ingredients_df)*100:.1f}%")

# Save the null allergen ingredients to a separate Excel file if needed
null_allergen_ingredients_df.to_excel('ingredients_with_no_allergens.xlsx', index=False)
print(f"\n✅ Saved {len(null_allergen_ingredients_df)} ingredients with no allergens to 'ingredients_with_no_allergens.xlsx'")

In [None]:
def detect_allergens(recipe_data, ALLERGEN_TAGS):
    ner_ingredients = recipe_data.get("ner_ingredient_string", [])
    # Normalize ingredients (make sure they're lowercase and clean)
    cleaned_ingredients = [str(i).strip().lower() for i in ner_ingredients if i]

    # Combine into one searchable string
    combined_text = ' '.join(cleaned_ingredients)
    matched_allergens = []

    # Define exclusion terms for breast milk and formula
    exclusion_terms = ["breast milk", "breastmilk", "formula milk", "formula", "breast"]
    # First check if the combined text contains any exclusion terms
    has_exclusion_terms = any(exclusion in combined_text for exclusion in exclusion_terms)
    for allergen, data in ALLERGEN_TAGS.items():
        keywords = [kw.lower() for kw in data["keywords"]]
        
        # For milk allergen specifically, skip entirely if exclusion terms found
        if allergen == "milk" and has_exclusion_terms:
            continue
            
        for keyword in keywords:
            if keyword in combined_text:
                # Check if this keyword match is actually part of an exclusion term
                is_excluded = False
                
                # For example, if "milk" is found but it's part of "breast milk"
                for exclusion in exclusion_terms:
                    # Check all possible positions where keyword could be within exclusion term
                    if (exclusion.startswith(keyword + " ") or 
                        exclusion.endswith(" " + keyword) or 
                        " " + keyword + " " in exclusion or 
                        exclusion == keyword):
                        
                        # Only exclude if this exact exclusion term is in the text
                        if exclusion in combined_text:
                            is_excluded = True
                            break
                
                if not is_excluded:
                    matched_allergens.append(allergen)
                    break  # No need to check other keywords for this allergen

    return list(set(matched_allergens))  # Remove duplicates

In [None]:

def detect_allergen_with_strategy(ingredient_name, allergen_tags):
    """
    Advanced allergen detection using two-step matching strategy:
    1. Exact matching - check if ingredient name exactly matches any keyword
    2. Partial keyword matching - check if any allergen keyword appears in ingredient name
    
    Args:
        ingredient_name (str): Name of the ingredient to check
        allergen_tags (dict): Dictionary of allergen information
    
    Returns:
        dict: {
            'allergen_group': str or None,
            'match_type': 'exact' or 'partial' or None,
            'matched_keyword': str or None,
            'confidence': float (0.0 to 1.0)
        }
    """
    ingredient_lower = ingredient_name.lower().strip()
    exclusion_terms = ["breast milk", "breastmilk", "formula milk", "formula", "breast"]
    # Step 1: Exact matching (highest priority)
    for allergen_name, allergen_data in allergen_tags.items():
        keywords = allergen_data.get('keywords', [])
        allergen_group = allergen_data.get('allergen_group', allergen_name)
        
        for keyword in keywords:
            keyword_lower = keyword.lower().strip()
            if keyword_lower == ingredient_lower:
                return {
                    'allergen_group': allergen_group,
                    'match_type': 'exact',
                    'matched_keyword': keyword,
                    'confidence': 1.0
                }
    
    # Step 2: Partial keyword matching (lower priority)
    partial_matches = []
    
    for allergen_name, allergen_data in allergen_tags.items():
        keywords = allergen_data.get('keywords', [])
        allergen_group = allergen_data.get('allergen_group', allergen_name)
        
        for keyword in keywords:
            keyword_lower = keyword.lower().strip()
            
            # Check if keyword appears as a word in ingredient name
            if keyword_lower in ingredient_lower:
                # Calculate confidence based on match quality
                confidence = calculate_match_confidence(ingredient_lower, keyword_lower)
                
                partial_matches.append({
                    'allergen_group': allergen_group,
                    'match_type': 'partial',
                    'matched_keyword': keyword,
                    'confidence': confidence,
                    'allergen_name': allergen_name
                })
    
    # If partial matches found, return the best one
    if partial_matches:
        # Sort by confidence (highest first)
        partial_matches.sort(key=lambda x: x['confidence'], reverse=True)
        best_match = partial_matches[0]
        
        # Only return if confidence is above threshold
        if best_match['confidence'] >= 0.5:
            return {
                'allergen_group': best_match['allergen_group'],
                'match_type': best_match['match_type'],
                'matched_keyword': best_match['matched_keyword'],
                'confidence': best_match['confidence']
            }
    
    # No match found
    return {
        'allergen_group': None,
        'match_type': None,
        'matched_keyword': None,
        'confidence': 0.0
    }

def calculate_match_confidence(ingredient_name, keyword):
    """
    Calculate confidence score for partial matches based on various factors.
    
    Args:
        ingredient_name (str): The ingredient name (lowercase)
        keyword (str): The allergen keyword (lowercase)
    
    Returns:
        float: Confidence score between 0.0 and 1.0
    """
    # Base confidence for any partial match
    confidence = 0.6
    
    # Boost confidence if keyword is a significant portion of ingredient name
    if len(keyword) >= len(ingredient_name) * 0.5:
        confidence += 0.2
    
    # Boost confidence if keyword appears at start or end
    if ingredient_name.startswith(keyword) or ingredient_name.endswith(keyword):
        confidence += 0.2
    
    # Boost confidence if keyword appears as a complete word (surrounded by spaces or boundaries)
    import re
    if re.search(r'\b' + re.escape(keyword) + r'\b', ingredient_name):
        confidence += 0.1
    
    # Reduce confidence for very short keywords in long ingredient names
    if len(keyword) <= 3 and len(ingredient_name) >= 10:
        confidence -= 0.2
    
    # Cap confidence at 0.95 for partial matches (exact matches get 1.0)
    return min(0.95, max(0.0, confidence))

def detect_allergen_simple(ingredient_name, allergen_tags):
    """
    Simplified version that returns just the allergen group (backward compatibility).
    
    Args:
        ingredient_name (str): Name of the ingredient
        allergen_tags (dict): Dictionary of allergen tags
    
    Returns:
        str or None: Allergen group name or None
    """
    result = detect_allergen_with_strategy(ingredient_name, allergen_tags)
    return result.get('allergen_group')

def test_allergen_detection():
    """
    Test function to demonstrate the matching strategy.
    """
    test_ingredients = [
        "milk",                    # Exact match
        "whole milk",             # Partial match  
        "milk chocolate",         # Partial match
        "almond butter",          # Exact match (tree nuts)
        "peanut oil",            # Exact match (peanuts)
        "wheat flour",           # Partial match (gluten)
        "salmon fillet",         # Partial match (fish)
        "chicken breast",        # No match
        "egg white powder",      # Partial match (egg)
        "soy sauce concentrate"  # Partial match (soy)
    ]
    
    print("🧪 TESTING ALLERGEN DETECTION STRATEGY")
    print("=" * 50)
    
    for ingredient in test_ingredients:
        result = detect_allergen_with_strategy(ingredient, ALLERGEN_TAGS)
        
        if result['allergen_group']:
            print(f"✅ '{ingredient}':")
            print(f"   → Allergen: {result['allergen_group']}")
            print(f"   → Match type: {result['match_type']}")
            print(f"   → Matched keyword: '{result['matched_keyword']}'")
            print(f"   → Confidence: {result['confidence']:.2f}")
        else:
            print(f"❌ '{ingredient}': No allergen detected")
        print()

# Uncomment to run test
# test_allergen_detection()

# Quick test examples
if __name__ == "__main__":
    # Test the function with a few examples
    test_cases = ["milk", "wheat flour", "almond butter", "chicken", "egg white"]
    
    print("Quick Test Results:")
    for ingredient in test_cases:
        result = detect_allergen_with_strategy(ingredient, ALLERGEN_TAGS)
        if result['allergen_group']:
            print(f"  {ingredient} → {result['allergen_group']} ({result['match_type']}, {result['confidence']:.2f})")
        else:
            print(f"  {ingredient} → No allergen detected")

In [None]:
def detect_primary_allergen_for_ingredient(ingredient_name, allergen_tags):
    """
    Detect the primary allergen for a single ingredient based on allergen tags.
    Ensures 1 ingredient maps to 1 primary allergen only.
    
    Args:
        ingredient_name (str): Name of the ingredient
        allergen_tags (dict): Dictionary of allergen tags with keywords
    
    Returns:
        str or None: Primary allergen group name, or None if no allergen detected
    """
    ingredient_lower = ingredient_name.lower().strip()
    detected_allergens = []
    
    # Priority order for allergens (most specific to least specific)
    allergen_priority = [
        'shellfish', 'fish', 'peanuts', 'tree_nuts', 'egg', 'dairy', 'soy', 'gluten'
    ]
    
    # First pass: detect all matching allergens
    for allergen_name, allergen_data in allergen_tags.items():
        keywords = allergen_data.get('keywords', [])
        allergen_group = allergen_data.get('allergen_group', allergen_name)
        
        # Check if any keyword matches the ingredient exactly
        for keyword in keywords:
            if keyword.lower().strip() == ingredient_lower:
                detected_allergens.append(allergen_group)
                break  # No need to check other keywords for this allergen
    
    # If no allergens detected, return None
    if not detected_allergens:
        return None
    
    # If only one allergen detected, return it
    if len(detected_allergens) == 1:
        return detected_allergens[0]
    
    # If multiple allergens detected, select based on priority
    for priority_allergen in allergen_priority:
        if priority_allergen in detected_allergens:
            return priority_allergen
    
    # Fallback: return the first detected allergen
    return detected_allergens[0]


def add_allergen_group_id_to_ingredients(ingredient_df, allergen_df, allergen_tags):
    """
    Add allergen_group_id column to ingredient_df based on allergen detection.
    
    Args:
        ingredient_df: DataFrame with ingredient_id and ingredient_name
        allergen_df: DataFrame with allergen group information (pk, name, description)
        allergen_tags: Dictionary of allergen tags for detection
    
    Returns:
        DataFrame: Updated ingredient_df with allergen_group_id column
    """
    print("🔗 CONNECTING ALLERGEN GROUP IDs WITH INGREDIENT IDs")
    print("=" * 60)
    
    # Create a mapping from allergen group name to allergen group ID
    allergen_name_to_id = dict(zip(allergen_df['name'], allergen_df['pk']))
    
    print(f"📊 Available allergen groups:")
    for name, pk in allergen_name_to_id.items():
        print(f"   - {name} (ID: {pk})")
    
    # Create a copy of ingredient_df to avoid modifying the original
    updated_ingredient_df = ingredient_df.copy()
    
    # Detect primary allergen for each ingredient
    print(f"\n🔍 Detecting primary allergens for {len(updated_ingredient_df)} ingredients...")
    
    primary_allergens = []
    allergen_group_ids = []
    
    for _, row in updated_ingredient_df.iterrows():
        ingredient_name = row['ingredient_name']
        primary_allergen = detect_primary_allergen_for_ingredient(ingredient_name, allergen_tags)
        primary_allergens.append(primary_allergen)
        
        # Map to allergen group ID
        if primary_allergen and primary_allergen in allergen_name_to_id:
            allergen_group_id = allergen_name_to_id[primary_allergen]
        else:
            allergen_group_id = None  # No allergen detected
        
        allergen_group_ids.append(allergen_group_id)
    
    # Add the new columns
    updated_ingredient_df['primary_allergen'] = primary_allergens
    updated_ingredient_df['allergen_group_id'] = allergen_group_ids
    
    # Calculate statistics
    total_ingredients = len(updated_ingredient_df)
    ingredients_with_allergens = updated_ingredient_df['allergen_group_id'].notna().sum()
    ingredients_without_allergens = total_ingredients - ingredients_with_allergens
    
    print(f"\n✅ ALLERGEN MAPPING RESULTS:")
    print(f"   Total ingredients: {total_ingredients}")
    print(f"   Ingredients with allergens: {ingredients_with_allergens} ({ingredients_with_allergens/total_ingredients*100:.1f}%)")
    print(f"   Ingredients without allergens: {ingredients_without_allergens} ({ingredients_without_allergens/total_ingredients*100:.1f}%)")
    
    # Show breakdown by allergen group
    allergen_counts = updated_ingredient_df['primary_allergen'].value_counts()
    print(f"\n📊 Breakdown by allergen group:")
    for allergen, count in allergen_counts.items():
        if allergen:  # Skip None values
            allergen_id = allergen_name_to_id.get(allergen, 'Unknown')
            print(f"   - {allergen} (ID: {allergen_id}): {count} ingredients")
    
    # Show sample of mapped ingredients
    print(f"\n📋 Sample mapped ingredients:")
    sample_with_allergens = updated_ingredient_df[updated_ingredient_df['allergen_group_id'].notna()].head(10)
    for _, row in sample_with_allergens.iterrows():
        print(f"   - ID: {row['ingredient_id']}, Name: '{row['ingredient_name']}', Allergen: {row['primary_allergen']} (Group ID: {row['allergen_group_id']})")
    
    # Show sample of ingredients without allergens
    sample_without_allergens = updated_ingredient_df[updated_ingredient_df['allergen_group_id'].isna()].head(5)
    if len(sample_without_allergens) > 0:
        print(f"\n📋 Sample ingredients without allergens:")
        for _, row in sample_without_allergens.iterrows():
            print(f"   - ID: {row['ingredient_id']}, Name: '{row['ingredient_name']}'")
    
    return updated_ingredient_df


# Apply the function to add allergen group IDs
ingredient_df_with_allergens = add_allergen_group_id_to_ingredients(
    ingredient_df, 
    allergen_df, 
    allergen_tags
)

In [None]:
# Validate and analyze the allergen mapping results
print("🔍 VALIDATION AND ANALYSIS OF ALLERGEN MAPPING")
print("=" * 50)

# Check the structure of the updated dataframe
print(f"📊 Updated ingredient_df structure:")
print(f"   Columns: {list(ingredient_df_with_allergens.columns)}")
print(f"   Shape: {ingredient_df_with_allergens.shape}")

# Show detailed breakdown
print(f"\n📈 Detailed allergen group mapping:")
allergen_mapping_summary = ingredient_df_with_allergens.groupby(['primary_allergen', 'allergen_group_id']).size().reset_index(name='count')
for _, row in allergen_mapping_summary.iterrows():
    if pd.notna(row['primary_allergen']):
        print(f"   {row['primary_allergen']} (Group ID: {row['allergen_group_id']}): {row['count']} ingredients")

# Show some specific examples
print(f"\n🔍 Example allergen mappings:")
example_allergens = ['dairy', 'egg', 'soy', 'gluten', 'tree_nuts']
for allergen in example_allergens:
    examples = ingredient_df_with_allergens[ingredient_df_with_allergens['primary_allergen'] == allergen].head(3)
    if len(examples) > 0:
        print(f"\n   {allergen.upper()} examples:")
        for _, row in examples.iterrows():
            print(f"     - '{row['ingredient_name']}' → Group ID: {row['allergen_group_id']}")

# Save the updated dataframe
output_filename = "ingredient_master_with_allergens.xlsx"
ingredient_df_with_allergens.to_excel(output_filename, index=False)
print(f"\n💾 Saved updated ingredient dataframe to '{output_filename}'")

# Display sample of final structure
print(f"\n📋 Final ingredient dataframe sample:")
display(ingredient_df_with_allergens[['ingredient_id', 'ingredient_name', 'primary_allergen', 'allergen_group_id']].head(10))

In [None]:
# Enhanced allergen detection with exclusion terms and isAllergen column
def add_exclusion_terms_and_allergen_flag(ingredient_df_with_allergens):
    """
    Add exclusion terms logic and isAllergen column to ingredient dataframe.
    
    Exclusion terms like "breast milk", "formula milk" should NOT be flagged as allergens
    even if they contain allergen keywords like "milk".
    
    Args:
        ingredient_df_with_allergens: DataFrame with allergen mapping results
        
    Returns:
        DataFrame: Enhanced dataframe with exclusion logic and isAllergen column
    """
    print("🚫 APPLYING EXCLUSION TERMS AND ADDING isAllergen COLUMN")
    print("=" * 60)
    
    # Define exclusion terms - ingredients that should NOT be considered allergens
    exclusion_terms = [
        "breast milk", "breastmilk", "formula milk", "formula", "breast"
    ]
    
    print(f"📋 Exclusion terms: {exclusion_terms}")
    
    # Create a copy to work with
    enhanced_df = ingredient_df_with_allergens.copy()
    
    # Check for exclusions and nullify allergen data if needed
    exclusion_count = 0
    excluded_ingredients = []
    
    for idx, row in enhanced_df.iterrows():
        ingredient_name = row['ingredient_name'].lower().strip()
        
        # Check if ingredient matches any exclusion term
        is_excluded = False
        for exclusion_term in exclusion_terms:
            if exclusion_term.lower() in ingredient_name:
                is_excluded = True
                exclusion_count += 1
                excluded_ingredients.append({
                    'ingredient_name': row['ingredient_name'],
                    'original_allergen': row['primary_allergen'],
                    'original_allergen_id': row['allergen_group_id'],
                    'exclusion_term': exclusion_term
                })
                break
        
        # If excluded, nullify allergen information
        if is_excluded:
            enhanced_df.at[idx, 'primary_allergen'] = None
            enhanced_df.at[idx, 'allergen_group_id'] = None
    
    # Add isAllergen column based on allergen_group_id
    enhanced_df['isAllergen'] = enhanced_df['allergen_group_id'].notna()
    
    # Calculate statistics
    total_ingredients = len(enhanced_df)
    ingredients_with_allergens = enhanced_df['isAllergen'].sum()
    ingredients_without_allergens = total_ingredients - ingredients_with_allergens
    
    print(f"\n✅ EXCLUSION AND FLAG RESULTS:")
    print(f"   Total ingredients processed: {total_ingredients}")
    print(f"   Excluded due to exclusion terms: {exclusion_count}")
    print(f"   Final ingredients with allergens: {ingredients_with_allergens}")
    print(f"   Final ingredients without allergens: {ingredients_without_allergens}")
    print(f"   Allergen rate: {(ingredients_with_allergens/total_ingredients)*100:.1f}%")
    
    # Show excluded ingredients
    if excluded_ingredients:
        print(f"\n🚫 Excluded ingredients (due to exclusion terms):")
        for item in excluded_ingredients:
            print(f"   - '{item['ingredient_name']}' (was: {item['original_allergen']}) → excluded by '{item['exclusion_term']}'")
    
    # Show updated allergen breakdown
    print(f"\n📊 Updated allergen breakdown:")
    allergen_counts = enhanced_df[enhanced_df['isAllergen']]['primary_allergen'].value_counts()
    for allergen, count in allergen_counts.items():
        print(f"   - {allergen}: {count} ingredients")
    
    # Show sample of final structure
    print(f"\n📋 Sample of enhanced dataframe:")
    sample_cols = ['ingredient_id', 'ingredient_name', 'primary_allergen', 'allergen_group_id', 'isAllergen']
    print("With allergens:")
    sample_with = enhanced_df[enhanced_df['isAllergen']].head(5)
    for _, row in sample_with.iterrows():
        print(f"   ID: {row['ingredient_id']}, Name: '{row['ingredient_name']}', Allergen: {row['primary_allergen']}, isAllergen: {row['isAllergen']}")
    
    print("\nWithout allergens:")
    sample_without = enhanced_df[~enhanced_df['isAllergen']].head(5)
    for _, row in sample_without.iterrows():
        print(f"   ID: {row['ingredient_id']}, Name: '{row['ingredient_name']}', isAllergen: {row['isAllergen']}")
    
    return enhanced_df

# Apply the enhancement
ingredient_df_final = add_exclusion_terms_and_allergen_flag(ingredient_df_with_allergens)

In [None]:
# Final validation and export of enhanced ingredient dataframe
print("📊 FINAL INGREDIENT DATAFRAME VALIDATION")
print("=" * 50)

# Check the final structure
print(f"Final dataframe columns: {list(ingredient_df_final.columns)}")
print(f"Final dataframe shape: {ingredient_df_final.shape}")

# Data type validation
print(f"\n📋 Column data types:")
for col in ingredient_df_final.columns:
    print(f"   {col}: {ingredient_df_final[col].dtype}")

# isAllergen column validation
allergen_true_count = ingredient_df_final['isAllergen'].sum()
allergen_false_count = (~ingredient_df_final['isAllergen']).sum()

print(f"\n✅ isAllergen column validation:")
print(f"   True (has allergen): {allergen_true_count}")
print(f"   False (no allergen): {allergen_false_count}")
print(f"   Total: {allergen_true_count + allergen_false_count}")

# Cross-validation: isAllergen should match allergen_group_id presence
validation_passed = True
mismatch_count = 0

for _, row in ingredient_df_final.iterrows():
    has_allergen_id = pd.notna(row['allergen_group_id'])
    is_allergen_flag = row['isAllergen']
    
    if has_allergen_id != is_allergen_flag:
        mismatch_count += 1
        if mismatch_count <= 5:  # Show first 5 mismatches
            print(f"   ⚠️ Mismatch: {row['ingredient_name']} - allergen_group_id: {row['allergen_group_id']}, isAllergen: {is_allergen_flag}")
        validation_passed = False

if validation_passed:
    print("   ✅ All isAllergen flags match allergen_group_id presence")
else:
    print(f"   ❌ Found {mismatch_count} mismatches between isAllergen and allergen_group_id")

# Show final breakdown by allergen type
print(f"\n📈 Final allergen distribution:")
allergen_breakdown = ingredient_df_final[ingredient_df_final['isAllergen']]['primary_allergen'].value_counts()
for allergen, count in allergen_breakdown.items():
    percentage = (count / allergen_true_count) * 100
    print(f"   {allergen}: {count} ingredients ({percentage:.1f}%)")

# Export final results
final_output_filename = "ingredient_master_final_with_allergens.xlsx"
ingredient_df_final.to_excel(final_output_filename, index=False)
print(f"\n💾 Exported final ingredient dataframe to '{final_output_filename}'")

# Display final structure sample
print(f"\n📋 Final dataframe structure sample:")
display(ingredient_df_final[['ingredient_id', 'ingredient_name', 'primary_allergen', 'allergen_group_id', 'isAllergen']].head(15))

Finish Populating Data

In [None]:
#save the final dataset
df.to_excel("cfirstversion_current_dataset.xlsx", index=False)
print("✅ DataFrame saved to 'cfirstversion_current_dataset.xlsx'")

Creating Category With Recipe Id

In [None]:
# Define categories and assign IDs
categories = [
    'vegan', 'vegetarian', 'pescetarian', 'dairy_free', 
    'egg_free', 'soy_free', 'nut_free', 'gluten_free',
    'halal', 'non_halal', 'non_veg'
]

category_id_map = {category: idx for idx, category in enumerate(categories, 1)}

# Create a reference DataFrame for categories
category_df = pd.DataFrame({
    'category_id': list(category_id_map.values()),
    'category_name': list(category_id_map.keys())
})

print("Category DataFrame:")
print(category_df)

In [None]:
df.shape
df.head(10)

## 🆔 Create Ingredient Master DataFrame with Unique IDs

##### Detail

In [None]:
def map_ingredient_ids_to_recipe_df(recipe_ingredient_df, ingredient_id_mapping, use_standardized=True):
    """
    Map ingredient IDs to the recipe ingredient DataFrame using cleaned ingredient names
    
    Args:
        recipe_ingredient_df: DataFrame with recipe-ingredient relationships
        ingredient_id_mapping: Dictionary mapping cleaned ingredient names to IDs
        use_standardized: If True, use standardized_ingredient; if False, use single_ingredient
        
    Returns:
        DataFrame: Updated recipe_ingredient_df with ingredient_id column
    """
    print("🔗 MAPPING INGREDIENT IDS TO RECIPE DATAFRAME")
    print("-" * 50)
    
    # Choose which ingredient column to use
    ingredient_column = 'standardized_ingredient' if use_standardized else 'single_ingredient'
    
    if ingredient_column not in recipe_ingredient_df.columns:
        print(f"❌ Column '{ingredient_column}' not found in recipe_ingredient_df")
        return recipe_ingredient_df
    
    # Create a copy to avoid modifying the original
    updated_df = recipe_ingredient_df.copy()
    
    # Clean the ingredient names in the DataFrame and map to IDs
    def get_ingredient_id(ingredient_name):
        if pd.isna(ingredient_name):
            return None
        cleaned_name = clean_ingredient_name_symbols(ingredient_name)
        return ingredient_id_mapping.get(cleaned_name, None)
    
    updated_df['ingredient_id'] = updated_df[ingredient_column].apply(get_ingredient_id)
    
    # Check mapping results
    total_rows = len(updated_df)
    mapped_rows = updated_df['ingredient_id'].notna().sum()
    unmapped_rows = total_rows - mapped_rows
    
    print(f"✅ Ingredient ID mapping complete:")
    print(f"   Total rows: {total_rows}")
    print(f"   Successfully mapped: {mapped_rows} ({(mapped_rows/total_rows)*100:.1f}%)")
    print(f"   Unmapped: {unmapped_rows} ({(unmapped_rows/total_rows)*100:.1f}%)")
    
    if unmapped_rows > 0:
        print(f"\n🔍 Sample unmapped ingredients:")
        unmapped_sample = updated_df[updated_df['ingredient_id'].isna()][ingredient_column].dropna().unique()[:5]
        for ingredient in unmapped_sample:
            cleaned = clean_ingredient_name_symbols(ingredient)
            print(f"   Original: '{ingredient}' → Cleaned: '{cleaned}'")
    
    return updated_df

In [None]:
# Step 2: Map ingredient IDs back to recipe DataFrame
if 'ingredient_id_mapping' in locals() and 'recipe_ingredient_df' in locals():
    print("\n📋 Step 2: Mapping ingredient IDs to recipe DataFrame...")
    
    # Update the recipe_ingredient_df with ingredient IDs
    recipe_ingredient_df_with_ids = map_ingredient_ids_to_recipe_df(
        recipe_ingredient_df,
        ingredient_id_mapping,
        use_standardized=True
    )
    
    print(f"\n📋 Sample of updated recipe_ingredient_df with IDs:")
    sample_cols = ['recipe_id', 'single_ingredient', 'standardized_ingredient', 'ingredient_id']
    available_cols = [col for col in sample_cols if col in recipe_ingredient_df_with_ids.columns]
    display(recipe_ingredient_df_with_ids[available_cols].head(10))
    
    print(f"\n📊 Final DataFrame Statistics:")
    print(f"   Recipe DataFrame shape: {recipe_ingredient_df_with_ids.shape}")
    print(f"   Unique recipes: {recipe_ingredient_df_with_ids['recipe_id'].nunique()}")
    print(f"   Unique ingredients: {len(ingredient_df)}")
    print(f"   Recipe-ingredient relationships: {len(recipe_ingredient_df_with_ids)}")
    
else:
    print("❌ Required variables not found for ID mapping")



In [None]:
# Step 3: Final validation and export
print("📋 Step 3: Final Data Validation and Export")
print("-" * 50)

if 'ingredient_df' in locals() and 'recipe_ingredient_df_with_ids' in locals():
    
    # Validation checks
    print("🔍 FINAL VALIDATION CHECKS:")
    
    # Check 1: Ingredient DataFrame cleanliness
    total_ingredients = len(ingredient_df)
    clean_ingredients = ingredient_df['ingredient_name'].notna().sum()
    empty_ingredients = ingredient_df['ingredient_name'].isna().sum()
    
    print(f"✅ Ingredient Master DataFrame:")
    print(f"   Total unique ingredients: {total_ingredients}")
    print(f"   Clean ingredients: {clean_ingredients}")
    print(f"   Empty/null ingredients: {empty_ingredients}")
    
    # Check 2: Recipe-Ingredient mapping completeness
    total_relationships = len(recipe_ingredient_df_with_ids)
    mapped_relationships = recipe_ingredient_df_with_ids['ingredient_id'].notna().sum()
    unmapped_relationships = total_relationships - mapped_relationships
    
    print(f"\n✅ Recipe-Ingredient Relationships:")
    print(f"   Total relationships: {total_relationships}")
    print(f"   Successfully mapped: {mapped_relationships} ({(mapped_relationships/total_relationships)*100:.1f}%)")
    print(f"   Unmapped: {unmapped_relationships} ({(unmapped_relationships/total_relationships)*100:.1f}%)")
    
    # Check 3: Data consistency
    unique_ingredient_ids_in_recipes = recipe_ingredient_df_with_ids['ingredient_id'].dropna().nunique()
    total_ingredient_ids_in_master = ingredient_df['ingredient_id'].nunique()
    
    print(f"\n✅ Data Consistency:")
    print(f"   Unique ingredient IDs in recipes: {unique_ingredient_ids_in_recipes}")
    print(f"   Total ingredient IDs in master: {total_ingredient_ids_in_master}")
    print(f"   Coverage: {(unique_ingredient_ids_in_recipes/total_ingredient_ids_in_master)*100:.1f}%")
    
    # Export cleaned datasets
    print(f"\n💾 EXPORTING CLEANED DATASETS:")
    
    try:
        # Export ingredient master DataFrame
        ingredient_filename = "cleaned_ingredient_master.xlsx"
        ingredient_df.to_excel(ingredient_filename, index=False)
        print(f"✅ Exported ingredient master: {ingredient_filename}")
        
        # Export recipe-ingredient DataFrame with IDs
        recipe_ingredient_filename = "cleaned_recipe_ingredient_with_ids.xlsx"
        recipe_ingredient_df_with_ids.to_excel(recipe_ingredient_filename, index=False)
        print(f"✅ Exported recipe-ingredient data: {recipe_ingredient_filename}")
        
        print(f"\n🎉 DATA PROCESSING COMPLETE!")
        print(f"📁 Files ready for downstream modeling and analysis")
        
    except Exception as e:
        print(f"❌ Export error: {str(e)}")
        
else:
    print("❌ Required DataFrames not found for validation")

print("\n" + "=" * 60)

In [None]:
# Debug: Check the format of standardized ingredients
print("🔍 DEBUGGING STANDARDIZED INGREDIENT FORMAT")
print("-" * 50)

if 'recipe_ingredient_df' in locals():
    # Sample some standardized ingredients to see their actual format
    sample_ingredients = recipe_ingredient_df['standardized_ingredient'].dropna().head(20).tolist()
    
    print("📋 Sample standardized ingredients (raw format):")
    for i, ingredient in enumerate(sample_ingredients[:10], 1):
        print(f"   {i}. {repr(ingredient)}")  # repr shows quotes and special characters
    
    # Test the cleaning function on these samples
    print(f"\n🧹 Testing cleaning function:")
    for i, ingredient in enumerate(sample_ingredients[:5], 1):
        cleaned = clean_ingredient_name_symbols(ingredient)
        print(f"   {i}. Original: {repr(ingredient)}")
        print(f"      Cleaned:  {repr(cleaned)}")
        print()

else:
    print("❌ recipe_ingredient_df not found")

In [None]:
# Check what ner_ingredient_string contains
print("🔍 COMPARING ner_ingredient vs ner_ingredient_string")
print("=" * 60)

# Check first 3 recipes
for i in range(3):
    row = df.iloc[i]
    print(f"\n📋 Recipe {i+1}: {row['name'][:50]}...")
    print(f"ner_ingredient (type {type(row['ner_ingredient'])}): {row['ner_ingredient']}")
    print(f"ner_ingredient_string (type {type(row['ner_ingredient_string'])}): {row['ner_ingredient_string']}")
    print("-" * 50)

In [None]:
# Simple check of ner_ingredient_string
row = df.iloc[0]
print("ner_ingredient_string:")
print(f"Type: {type(row['ner_ingredient_string'])}")
print(f"Content: {row['ner_ingredient_string']}")
print(f"Length: {len(str(row['ner_ingredient_string']))}")

# Check if it's already a clean string
if isinstance(row['ner_ingredient_string'], str):
    print("✅ ner_ingredient_string is already a string - perfect for direct use!")
else:
    print("❌ ner_ingredient_string is not a string")

In [None]:
def classify_recipe_simplified(recipe_data, dietary_tags):
    """
    Simplified classify_recipe function using ner_ingredient_string directly
    """
    # Use ner_ingredient_string directly (it's already a clean string)
    ner_ingredient_string = recipe_data.get("ner_ingredient_string", "")
    combined_text = str(ner_ingredient_string).lower()

    # Handle allergen data (keeping existing logic)
    detected_allergens = recipe_data.get("allergen", [])
    if isinstance(detected_allergens, str):
        try:
            import ast
            detected_allergens = ast.literal_eval(detected_allergens)
        except (ValueError, SyntaxError):
            detected_allergens = []
    
    allergen = [str(i).strip().lower() for i in detected_allergens if i]
    allergen_text = ' '.join(allergen).lower()

    matched_tags = []

    # Step 1: Match all possible tags
    for tag, rules in dietary_tags.items():
        exclude_found = False

        # Step 1a: Check excluded ingredients
        if "excluded_ingredients" in rules:
            for word in rules["excluded_ingredients"]:
                if word.lower() in combined_text:
                    exclude_found = True
                    break

        # Step 1b: Check excluded allergen groups
        if "excluded_allergen_groups" in rules:
            for group in rules["excluded_allergen_groups"]:
                if group in allergen_text:
                    exclude_found = True
                    break

        # Step 1c: Required ingredients check (e.g., for non_halal)
        if "required_ingredients" in rules:
            required_match = any(word in combined_text for word in rules["required_ingredients"])
            if not required_match:
                continue

        if not exclude_found:
            matched_tags.append(tag)

    # Define categories according to user requirements
    GENERAL_DIET_TAGS = ["vegan", "vegetarian", "pescetarian", "non_veg"]
    RELIGIOUS_TAGS = ["halal", "non_halal"]
    ALLERGEN_TAGS = ["dairy_free", "egg_free", "soy_free", "nut_free", "gluten_free"]

    result = {
        "general_diet": None,
        "religious_tag": None,
        "allergen_tags": [],
    }

    # Step 2: Select only one general diet (priority-based)
    for tag in GENERAL_DIET_TAGS:
        if tag in matched_tags:
            result["general_diet"] = tag
            break
    else:
        # Fallback (shouldn't happen unless none of the 4 are valid)
        result["general_diet"] = "non_veg"

    # Step 3: Select at most one religious tag (prefer halal over non_halal)
    for tag in RELIGIOUS_TAGS:
        if tag in matched_tags:
            result["religious_tag"] = tag
            break

    # Step 4: Select all applicable allergen-friendly tags
    result["allergen_tags"] = [tag for tag in matched_tags if tag in ALLERGEN_TAGS]

    values = []
    
    # Add general diet if present
    if result.get("general_diet"):
        values.append(result["general_diet"])
    
    # Add religious tag if present
    if result.get("religious_tag"):
        values.append(result["religious_tag"])
    
    # Add allergen tags if present
    allergen_tags = result.get("allergen_tags", [])
    if allergen_tags:
        values.extend(allergen_tags)
    
    # Join all values with commas
    return ", ".join(values)

In [None]:
# 🔍 DIAGNOSING TEXTURE PREDICTION ISSUE
print("🔍 ANALYZING TEXTURE PREDICTION PROBLEM")
print("=" * 60)

# First, let's identify which dataframe has texture information
print("📊 Available dataframes and their columns:")
for df_name, df_obj in all_dataframes.items():
    try:
        if hasattr(df_obj, 'columns') and hasattr(df_obj, 'shape'):
            has_texture = 'texture' in df_obj.columns
            has_predicted_texture = 'predicted_texture' in df_obj.columns
            print(f"   {df_name}: {df_obj.shape} - texture: {has_texture}, predicted_texture: {has_predicted_texture}")
        else:
            print(f"   {df_name}: {type(df_obj)} - not a DataFrame")
    except Exception as e:
        print(f"   {df_name}: Error checking - {str(e)}")

# Check global variables for dataframes with texture
texture_df_candidates = []
for var_name in globals():
    try:
        var_obj = globals()[var_name]
        if hasattr(var_obj, 'columns') and hasattr(var_obj, 'shape') and 'texture' in var_obj.columns:
            texture_df_candidates.append((var_name, var_obj))
    except:
        pass

print(f"\n🔍 Found {len(texture_df_candidates)} dataframes with 'texture' column:")
for name, df_obj in texture_df_candidates:
    print(f"   {name}: {df_obj.shape}")

# Use the first available dataframe with texture information
target_df = None
df_name = None

if texture_df_candidates:
    df_name, target_df = texture_df_candidates[0]
    print(f"\n✅ Using dataframe: {df_name} with shape {target_df.shape}")
else:
    print("\n❌ No dataframe with 'texture' column found!")
    # Show what's available
    print("Available global variables that look like DataFrames:")
    for var_name in globals():
        try:
            var_obj = globals()[var_name]
            if hasattr(var_obj, 'columns') and hasattr(var_obj, 'shape'):
                print(f"   {var_name}: {var_obj.shape} - columns: {list(var_obj.columns)[:5]}...")
        except:
            pass

if target_df is not None:
    # Check current texture distribution
    current_texture_distribution = target_df['texture'].value_counts()
    print("📊 Current texture distribution:")
    for texture, count in current_texture_distribution.items():
        print(f"   {texture}: {count}")

    # Check if all predictions are "puree"
    if 'predicted_texture' in target_df.columns:
        predicted_distribution = target_df['predicted_texture'].value_counts()
        print("\n🤖 Predicted texture distribution:")
        for texture, count in predicted_distribution.items():
            print(f"   {texture}: {count}")
    else:
        print("\n❌ No 'predicted_texture' column found")

    # The problem: Unsupervised learning needs better feature engineering
    print("\n🚨 IDENTIFIED ISSUES:")
    print("   1. TF-IDF may be dominated by common ingredient words")
    print("   2. Clustering may not capture texture-specific patterns")
    print("   3. Need texture-specific feature extraction")
    print("   4. Class imbalance may bias toward 'puree'")

    # SOLUTION: Improved texture prediction approach
    print("\n💡 IMPLEMENTING IMPROVED TEXTURE PREDICTION:")
    print("-" * 50)

    def create_texture_specific_features(df):
        """
        Create features specifically designed for texture classification
        """
        print("🛠️ Creating texture-specific features...")
        
        # Texture indicator keywords
        texture_keywords = {
            'puree_indicators': [
                'puree', 'smooth', 'blend', 'process', 'strain', 'mash completely',
                'baby food', 'blended', 'creamy', 'liquified', 'homogenized'
            ],
            'lumpy_indicators': [
                'lumpy', 'chunks', 'pieces', 'chopped', 'diced', 'cubes', 
                'partially mashed', 'texture', 'bits', 'small pieces'
            ],
            'finger_food_indicators': [
                'finger', 'handheld', 'stick', 'roll', 'ball', 'strip', 
                'slice', 'cube', 'pieces you can pick up', 'self feeding'
            ],
            'family_food_indicators': [
                'family meal', 'adult portion', 'regular', 'normal', 'seasoned',
                'spicy', 'full meal', 'dinner', 'lunch', 'adult food'
            ]
        }
        
        # Create feature matrix
        feature_data = []
        
        for _, row in df.iterrows():
            # Combine text fields - use available columns
            text_fields = []
            for col in ['name', 'recipe_name', 'ner_ingredient_string', 'instructions']:
                if col in df.columns:
                    text_fields.append(str(row.get(col, '')))
            
            combined_text = ' '.join(text_fields).lower()
            
            # Count texture-specific keywords
            features = {}
            for texture_type, keywords in texture_keywords.items():
                count = sum(1 for keyword in keywords if keyword in combined_text)
                features[texture_type] = count
            
            # Additional features
            if 'ner_ingredient_string' in df.columns:
                features['total_ingredients'] = len(str(row.get('ner_ingredient_string', '')).split(','))
            else:
                features['total_ingredients'] = 1
                
            features['name_length'] = len(str(row.get('name', row.get('recipe_name', ''))))
            features['has_cooking_method'] = 1 if any(word in combined_text for word in ['cook', 'steam', 'boil', 'fry', 'bake']) else 0
            features['age_group'] = row.get('min_age', row.get('age', 6))  # Default to 6 months
            
            feature_data.append(features)
        
        feature_df = pd.DataFrame(feature_data)
        print(f"   ✅ Created {len(feature_df.columns)} texture-specific features")
        return feature_df

    def predict_texture_improved(feature_df, n_clusters=4):
        """
        Improved texture prediction using texture-specific features
        """
        from sklearn.preprocessing import StandardScaler
        from sklearn.cluster import KMeans
        from sklearn.decomposition import PCA
        import numpy as np
        
        print("🤖 Running improved texture prediction...")
        
        # Standardize features
        scaler = StandardScaler()
        features_scaled = scaler.fit_transform(feature_df)
        
        # Use PCA for dimensionality reduction
        pca = PCA(n_components=min(10, features_scaled.shape[1]))
        features_pca = pca.fit_transform(features_scaled)
        
        # Apply K-means clustering
        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        cluster_labels = kmeans.fit_predict(features_pca)
        
        # Map clusters to texture types based on feature patterns
        cluster_characteristics = {}
        for cluster_id in range(n_clusters):
            cluster_mask = cluster_labels == cluster_id
            cluster_features = feature_df[cluster_mask]
            
            # Calculate average feature values for this cluster
            avg_features = cluster_features.mean()
            cluster_characteristics[cluster_id] = avg_features
            
            print(f"\n📊 Cluster {cluster_id} characteristics:")
            print(f"   Size: {cluster_mask.sum()} recipes")
            print(f"   Avg puree indicators: {avg_features['puree_indicators']:.2f}")
            print(f"   Avg lumpy indicators: {avg_features['lumpy_indicators']:.2f}")
            print(f"   Avg finger food indicators: {avg_features['finger_food_indicators']:.2f}")
            print(f"   Avg family food indicators: {avg_features['family_food_indicators']:.2f}")
            print(f"   Avg age group: {avg_features['age_group']:.1f} months")
        
        # Smart cluster-to-texture mapping
        texture_mapping = {}
        available_textures = ['puree', 'lumpy texture', 'soft finger food', 'family food']
        
        # Sort clusters by their dominant characteristics
        for cluster_id in range(n_clusters):
            chars = cluster_characteristics[cluster_id]
            
            # Determine most likely texture based on indicators
            scores = {
                'puree': chars['puree_indicators'] * 2 + (1 if chars['age_group'] <= 6 else 0),
                'lumpy texture': chars['lumpy_indicators'] * 2 + (1 if 6 < chars['age_group'] <= 9 else 0),
                'soft finger food': chars['finger_food_indicators'] * 2 + (1 if 8 <= chars['age_group'] <= 12 else 0),
                'family food': chars['family_food_indicators'] * 2 + (1 if chars['age_group'] > 12 else 0)
            }
            
            # Assign texture with highest score
            best_texture = max(scores.keys(), key=lambda x: scores[x])
            texture_mapping[cluster_id] = best_texture
            
            print(f"   → Mapped to: {best_texture}")
        
        # Apply mapping to get predicted textures
        predicted_textures = [texture_mapping[cluster] for cluster in cluster_labels]
        
        return predicted_textures, cluster_labels, texture_mapping

    # Execute improved texture prediction
    try:
        print("\n🚀 EXECUTING IMPROVED TEXTURE PREDICTION:")
        print("-" * 50)
        
        # Create texture-specific features
        texture_features = create_texture_specific_features(target_df)
        
        # Predict textures
        predicted_textures, clusters, mapping = predict_texture_improved(texture_features, n_clusters=4)
        
        # Add results to dataframe
        target_df['predicted_texture_improved'] = predicted_textures
        target_df['texture_cluster'] = clusters
        
        # Show improved results
        print("\n📈 IMPROVED PREDICTION RESULTS:")
        improved_distribution = pd.Series(predicted_textures).value_counts()
        for texture, count in improved_distribution.items():
            print(f"   {texture}: {count}")
        
        # Compare with actual distribution
        print("\n🆚 COMPARISON WITH ACTUAL TEXTURE DISTRIBUTION:")
        if 'texture' in target_df.columns:
            actual_distribution = target_df['texture'].value_counts()
            print("   Actual distribution:")
            for texture, count in actual_distribution.items():
                predicted_count = improved_distribution.get(texture, 0)
                print(f"     {texture}: Actual={count}, Predicted={predicted_count}")
        
        # Analyze prediction accuracy for known textures
        if 'texture' in target_df.columns:
            known_texture_mask = target_df['texture'] != 'NONE'
            if known_texture_mask.any():
                known_actual = target_df[known_texture_mask]['texture']
                known_predicted = target_df[known_texture_mask]['predicted_texture_improved']
                
                accuracy = (known_actual == known_predicted).mean()
                print(f"\n🎯 Prediction accuracy on known textures: {accuracy:.2%}")
                
                # Show confusion matrix
                from sklearn.metrics import confusion_matrix
                import pandas as pd
                
                cm = confusion_matrix(known_actual, known_predicted, labels=['puree', 'lumpy texture', 'soft finger food', 'family food'])
                cm_df = pd.DataFrame(cm, 
                                   index=['puree', 'lumpy texture', 'soft finger food', 'family food'],
                                   columns=['puree', 'lumpy texture', 'soft finger food', 'family food'])
                
                print("\n📊 Confusion Matrix:")
                print(cm_df)
        
        print("\n✅ Improved texture prediction completed!")
        
    except Exception as e:
        print(f"❌ Error in improved texture prediction: {str(e)}")
        import traceback
        traceback.print_exc()

🔍 ANALYZING TEXTURE PREDICTION PROBLEM
📊 Available dataframes and their columns:


AttributeError: 'dict' object has no attribute 'columns'