In [11]:
# Improved Recipe Recommendation System
# This notebook contains a more robust implementation of a recipe recommendation system
# with better data processing, evaluation, and user experience features

# Import necessary libraries
import pandas as pd
import numpy as np
import random
import re
import ast
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow import keras
from keras import layers, models
import matplotlib.pyplot as plt

# Step 1: Load and preprocess data
print("Loading and preprocessing data...")

# Load the dataset
# Note: You'll need to download the dataset first as in your original code
df = pd.read_csv("recipes_data.csv")
print(f"Loaded {len(df)} recipes")

# Convert string representations of lists to actual lists (if needed)
try:
    # Check if ingredients are already lists
    if isinstance(df['ingredients'].iloc[0], str):
        df['ingredients'] = df['ingredients'].apply(ast.literal_eval)
    if isinstance(df['directions'].iloc[0], str):
        df['directions'] = df['directions'].apply(ast.literal_eval)
except Exception as e:
    print(f"Error converting strings to lists: {e}")
    print("Columns may already be in list format")

# Basic data cleaning
df.dropna(subset=['title', 'ingredients', 'directions'], inplace=True)
print(f"After cleaning, {len(df)} recipes remain")

# Step 2: Simple Ingredient Standardization
print("Standardizing ingredients...")

def clean_ingredient(ingredient):
    """Basic cleaning of ingredient text"""
    # Convert to lowercase
    ing = ingredient.lower()
    
    # Remove quantities (numbers and fractions) and common units
    ing = re.sub(r'^[\d\s/]+', '', ing)
    units = ['cup', 'cups', 'tablespoon', 'tablespoons', 'tbsp', 'teaspoon', 'teaspoons', 
             'tsp', 'pound', 'pounds', 'lb', 'lbs', 'ounce', 'ounces', 'oz']
    
    for unit in units:
        ing = ing.replace(f"{unit} ", " ").replace(f"{unit}s ", " ")
    
    # Remove extra whitespace
    ing = re.sub(r'\s+', ' ', ing).strip()
    
    return ing

# Apply cleaning to all ingredients
df['clean_ingredients'] = df['ingredients'].apply(lambda x: [clean_ingredient(i) for i in x])

# Step 3: Simple Recipe Categorization
print("Categorizing recipes...")

def simple_categorize_recipe(title, ingredients):
    """Categorize recipe based on title and ingredients"""
    title = title.lower()
    
    # Simple classification rules
    if any(word in title for word in ['cake', 'cookie', 'pie', 'dessert', 'sweet', 'chocolate', 'ice cream']):
        return 'dessert'
    elif any(word in title for word in ['breakfast', 'pancake', 'waffle', 'oatmeal', 'cereal']):
        return 'breakfast'
    elif any(word in title for word in ['salad', 'vegetable']):
        return 'salad'
    elif any(word in title for word in ['soup', 'stew']):
        return 'soup'
    elif any(word in title for word in ['chicken', 'beef', 'pork', 'fish', 'meat']):
        return 'main_protein'
    elif any(word in title for word in ['pasta', 'rice', 'noodle']):
        return 'main_carb'
    elif any(word in title for word in ['bread', 'muffin', 'roll']):
        return 'bread'
    elif any(word in title for word in ['drink', 'cocktail', 'smoothie', 'juice']):
        return 'beverage'
    else:
        # Check ingredients for clues
        ingredients_str = ' '.join([i.lower() for i in ingredients])
        if 'meat' in ingredients_str or 'chicken' in ingredients_str or 'beef' in ingredients_str:
            return 'main_protein'
        else:
            return 'other'

# Apply categorization
df['category'] = df.apply(lambda x: simple_categorize_recipe(x['title'], x['ingredients']), axis=1)

# Display category distribution
category_counts = df['category'].value_counts()
print("Recipe categories:")
print(category_counts)

# Step 4: Check for dietary preferences
print("Checking for dietary preferences...")

def is_vegetarian(ingredients):
    """Simple check if recipe is vegetarian based on ingredients"""
    ingredients_str = ' '.join([i.lower() for i in ingredients])
    meat_keywords = ['chicken', 'beef', 'pork', 'lamb', 'turkey', 'meat', 'fish', 'seafood']
    return not any(meat in ingredients_str for meat in meat_keywords)

def is_gluten_free(ingredients):
    """Simple check if recipe might be gluten-free"""
    ingredients_str = ' '.join([i.lower() for i in ingredients])
    gluten_keywords = ['flour', 'wheat', 'pasta', 'bread', 'crumb', 'biscuit']
    # This is a very simple approximation - in reality, you'd need more sophisticated checking
    return not any(gluten in ingredients_str for gluten in gluten_keywords)

# Add dietary preference flags
df['vegetarian'] = df['ingredients'].apply(is_vegetarian)
df['gluten_free'] = df['ingredients'].apply(is_gluten_free)

print(f"Vegetarian recipes: {df['vegetarian'].sum()}")
print(f"Potentially gluten-free recipes: {df['gluten_free'].sum()}")

# Step 5: Simple Feature Engineering
print("Creating recipe feature vectors...")

# Function to count ingredients as a complexity metric
df['ingredient_count'] = df['ingredients'].apply(len)
df['direction_count'] = df['directions'].apply(len)

# Calculate a simple complexity score
df['complexity'] = (df['ingredient_count'] / df['ingredient_count'].max() + 
                    df['direction_count'] / df['direction_count'].max()) / 2

# Calculate a dietary balance score based on ingredients (very simplified)
def simple_nutrition_score(ingredients):
    """Extremely simplified nutrition scoring"""
    ingredients_str = ' '.join([i.lower() for i in ingredients])
    
    # Count mentions of various food groups (very simplified)
    protein = sum(1 for word in ['meat', 'chicken', 'beef', 'pork', 'fish', 'tofu', 'bean', 'egg', 'nut'] 
                  if word in ingredients_str)
    vegetables = sum(1 for word in ['vegetable', 'carrot', 'broccoli', 'spinach', 'kale', 'potato', 'onion'] 
                     if word in ingredients_str)
    fruits = sum(1 for word in ['fruit', 'apple', 'banana', 'berry', 'orange'] 
                 if word in ingredients_str)
    grains = sum(1 for word in ['grain', 'rice', 'pasta', 'bread', 'oat', 'wheat'] 
                 if word in ingredients_str)
    
    # Simple balance score - higher when more food groups are present
    food_groups = [protein, vegetables, fruits, grains]
    present_groups = sum(1 for group in food_groups if group > 0)
    
    return present_groups / 4.0  # Normalized to 0-1

df['nutrition_balance'] = df['ingredients'].apply(simple_nutrition_score)

# Step 6: Simple Bag-of-Words Recipe Embeddings
print("Creating recipe embeddings...")

from sklearn.feature_extraction.text import TfidfVectorizer

# Create a bag of words representation for ingredients
vectorizer = TfidfVectorizer(max_features=100)  # Limit to top features for simplicity
ingredient_docs = [' '.join(ing) for ing in df['clean_ingredients']]
X_ingredients = vectorizer.fit_transform(ingredient_docs)

# Convert sparse matrix to dense array
X_ingredients_dense = X_ingredients.toarray()

# For each recipe, store its ingredient vector
df['embedding'] = list(X_ingredients_dense)

# Step 7: Create balanced meal plans
print("Generating meal plans...")

def create_balanced_meal_plan(df, num_recipes=3, vegetarian=False, gluten_free=False):
    """Create a balanced meal plan with dietary restrictions"""
    filtered_df = df.copy()
    
    # Apply dietary filters if requested
    if vegetarian:
        filtered_df = filtered_df[filtered_df['vegetarian']]
    if gluten_free:
        filtered_df = filtered_df[filtered_df['gluten_free']]
    
    if len(filtered_df) < num_recipes:
        print(f"Warning: Not enough recipes match dietary restrictions. Found {len(filtered_df)}")
        return None
    
    # Try to include main dish, side, and dessert
    categories_to_include = ['main_protein', 'main_carb', 'salad', 'dessert']
    
    plan = []
    for category in categories_to_include:
        if len(plan) < num_recipes:
            category_recipes = filtered_df[filtered_df['category'] == category]
            if len(category_recipes) > 0:
                plan.append(random.choice(category_recipes.index))
    
    # Fill remaining slots with random recipes
    remaining_recipes = filtered_df.index.difference(plan)
    plan.extend(random.sample(list(remaining_recipes), min(num_recipes - len(plan), len(remaining_recipes))))
    
    return plan[:num_recipes]  # Ensure we return exactly num_recipes

# Generate sample meal plans
num_plans = 500
meal_plans = []
for i in range(num_plans):
    # Mix of regular, vegetarian, and gluten-free plans
    vegetarian = random.random() < 0.3
    gluten_free = random.random() < 0.2
    
    plan = create_balanced_meal_plan(df, num_recipes=3, vegetarian=vegetarian, gluten_free=gluten_free)
    if plan:
        meal_plans.append(plan)

print(f"Created {len(meal_plans)} balanced meal plans")

# Step 8: Feature vectors for meal plans
print("Creating meal plan feature vectors...")

def get_meal_plan_features(plan, df):
    """Create a feature vector for a meal plan"""
    # Get embeddings for each recipe in the plan
    recipe_embeddings = [df.loc[idx, 'embedding'] for idx in plan]
    
    # Average the embeddings
    avg_embedding = np.mean(recipe_embeddings, axis=0)
    
    # Count unique categories as a diversity measure
    categories = [df.loc[idx, 'category'] for idx in plan]
    category_diversity = len(set(categories)) / len(categories)  # 1.0 = all different
    
    # Get other features
    avg_complexity = np.mean([df.loc[idx, 'complexity'] for idx in plan])
    avg_nutrition = np.mean([df.loc[idx, 'nutrition_balance'] for idx in plan])
    
    # Combine features
    additional_features = np.array([category_diversity, avg_complexity, avg_nutrition])
    
    return np.concatenate([avg_embedding, additional_features])

# Create feature vectors for all meal plans
X = np.array([get_meal_plan_features(plan, df) for plan in meal_plans])

# Step 9: Create synthetic ratings (more realistic than purely random)
print("Creating synthetic ratings...")

def generate_realistic_rating(plan, df):
    """Generate a more realistic rating based on properties of the meal plan"""
    # Get categories in the plan
    categories = [df.loc[idx, 'category'] for idx in plan]
    
    # Base score
    score = 5.0
    
    # Bonus for diversity (different categories)
    unique_categories = len(set(categories))
    score += unique_categories * 0.5
    
    # Bonus for nutritional balance
    nutrition_scores = [df.loc[idx, 'nutrition_balance'] for idx in plan]
    avg_nutrition = np.mean(nutrition_scores)
    score += avg_nutrition * 2
    
    # Slight penalty for very complex meals
    complexity_scores = [df.loc[idx, 'complexity'] for idx in plan]
    avg_complexity = np.mean(complexity_scores)
    if avg_complexity > 0.7:  # Only penalize very complex plans
        score -= (avg_complexity - 0.7) * 2
    
    # Add some random noise
    score += np.random.normal(0, 0.5)
    
    # Ensure within range 1-10
    return max(1, min(10, score))

y = np.array([generate_realistic_rating(plan, df) for plan in meal_plans])

print(f"Rating distribution: min={y.min():.2f}, max={y.max():.2f}, mean={y.mean():.2f}")

# Step 10: Improved model with regularization and cross-validation
print("Training improved model with cross-validation...")

# First, standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Define improved model with regularization
def create_model(input_dim):
    model = models.Sequential([
        layers.Input(shape=(input_dim,)),
        layers.Dense(64, activation='relu', kernel_regularizer=keras.regularizers.l2(0.001)),
        layers.Dropout(0.3),
        layers.Dense(32, activation='relu', kernel_regularizer=keras.regularizers.l2(0.001)),
        layers.Dropout(0.3),
        layers.Dense(1)  # Regression output
    ])
    
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model

# Train with early stopping to prevent overfitting
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=5, restore_best_weights=True
)

model = create_model(X_train.shape[1])

history = model.fit(
    X_train, y_train,
    epochs=50,
    batch_size=16,
    validation_split=0.2,
    callbacks=[early_stopping],
    verbose=1
)

# Step 11: Evaluate model
print("Evaluating model...")

# Evaluate on test set
test_results = model.evaluate(X_test, y_test, verbose=1)
print(f"Test Loss (MSE): {test_results[0]:.4f}")
print(f"Test MAE: {test_results[1]:.4f}")

# Plot training history
plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper right')

plt.subplot(1, 2, 2)
plt.plot(history.history['mae'])
plt.plot(history.history['val_mae'])
plt.title('Model MAE')
plt.ylabel('MAE')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper right')

plt.tight_layout()
plt.savefig('training_history.png')
plt.close()

print("Saved training history visualization to training_history.png")

# Step 12: Generate and recommend meal plans based on user preferences
print("Generating recommendations...")

def recommend_meal_plans(df, model, scaler, num_candidates=100, num_to_recommend=5, 
                        vegetarian=False, gluten_free=False):
    """Generate and recommend meal plans based on user preferences"""
    # Generate candidate meal plans
    candidate_plans = []
    for _ in range(num_candidates):
        plan = create_balanced_meal_plan(df, num_recipes=3, vegetarian=vegetarian, gluten_free=gluten_free)
        if plan:
            candidate_plans.append(plan)
    
    if len(candidate_plans) == 0:
        print("Could not generate any meal plans matching preferences")
        return []
    
    # Create feature vectors
    candidate_features = np.array([get_meal_plan_features(plan, df) for plan in candidate_plans])
    
    # Scale features
    candidate_features_scaled = scaler.transform(candidate_features)
    
    # Predict scores
    predicted_scores = model.predict(candidate_features_scaled).flatten()
    
    # Find top plans
    top_indices = np.argsort(predicted_scores)[-num_to_recommend:][::-1]
    
    # Return top plans with their predicted scores
    top_plans = [(candidate_plans[i], predicted_scores[i]) for i in top_indices]
    return top_plans

# Generate different types of recommendations
print("\nTop 5 Regular Meal Plans:")
regular_recommendations = recommend_meal_plans(df, model, scaler)
for i, (plan, score) in enumerate(regular_recommendations):
    recipes = [df.loc[idx, 'title'] for idx in plan]
    categories = [df.loc[idx, 'category'] for idx in plan]
    print(f"Plan {i+1} (Score: {score:.2f}):")
    for j, (recipe, category) in enumerate(zip(recipes, categories)):
        print(f"  - {recipe} ({category})")
    print()

print("\nTop 3 Vegetarian Meal Plans:")
veg_recommendations = recommend_meal_plans(df, model, scaler, num_to_recommend=3, vegetarian=True)
for i, (plan, score) in enumerate(veg_recommendations):
    recipes = [df.loc[idx, 'title'] for idx in plan]
    print(f"Plan {i+1} (Score: {score:.2f}):")
    for recipe in recipes:
        print(f"  - {recipe}")
    print()

print("\nTop 3 Gluten-Free Meal Plans:")
gf_recommendations = recommend_meal_plans(df, model, scaler, num_to_recommend=3, gluten_free=True)
for i, (plan, score) in enumerate(gf_recommendations):
    recipes = [df.loc[idx, 'title'] for idx in plan]
    print(f"Plan {i+1} (Score: {score:.2f}):")
    for recipe in recipes:
        print(f"  - {recipe}")
    print()

print("\nTop 3 Vegetarian and Gluten-Free Meal Plans:")
veg_gf_recommendations = recommend_meal_plans(df, model, scaler, num_to_recommend=3, 
                                            vegetarian=True, gluten_free=True)
for i, (plan, score) in enumerate(veg_gf_recommendations):
    recipes = [df.loc[idx, 'title'] for idx in plan]
    print(f"Plan {i+1} (Score: {score:.2f}):")
    for recipe in recipes:
        print(f"  - {recipe}")
    print()

# Step 13: Summarize recipe recommendation system
print("\nSummary of Recipe Recommendation System:")
print(f"- Dataset: {len(df)} recipes")
print(f"- Generated {len(meal_plans)} meal plans for training")
print(f"- Model performance: MAE = {test_results[1]:.2f}")
print("- Support for dietary preferences: Vegetarian and Gluten-free")
print("- Features used: TF-IDF ingredient embeddings, category diversity, complexity, nutritional balance")

print("\nDone! The recipe recommendation system has been improved.")

# Save the trained model and preprocessor for future use
model.save('recipe_recommender_model')
# Save important data for deployment
import pickle
with open('recipe_model_data.pkl', 'wb') as f:
    pickle.dump({
        'scaler': scaler,
        'vectorizer': vectorizer,
    }, f)

print("Model and preprocessors have been saved for future use.")

Loading and preprocessing data...
Loaded 2231141 recipes
After cleaning, 2231141 recipes remain
Standardizing ingredients...
Categorizing recipes...
Recipe categories:
category
other           900297
dessert         456009
main_protein    408591
salad           179333
bread           115231
soup             81851
main_carb        55133
breakfast        20913
beverage         13783
Name: count, dtype: int64
Checking for dietary preferences...
Vegetarian recipes: 1648142
Potentially gluten-free recipes: 1389604
Creating recipe feature vectors...
Creating recipe embeddings...
Generating meal plans...
Created 500 balanced meal plans
Creating meal plan feature vectors...
Creating synthetic ratings...
Rating distribution: min=5.63, max=9.48, mean=7.34
Training improved model with cross-validation...
Epoch 1/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 65.1430 - mae: 8.0000 - val_loss: 43.9549 - val_mae: 6.5703
Epoch 2/50
[1m20/20[0m [32m━━━━━━━━━━━━━

ValueError: Invalid filepath extension for saving. Please add either a `.keras` extension for the native Keras format (recommended) or a `.h5` extension. Use `model.export(filepath)` if you want to export a SavedModel for use with TFLite/TFServing/etc. Received: filepath=recipe_recommender_model.