In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os

# Later cells use paths relative to the project root (e.g. "data/processed/...").
# When the kernel was started inside a folder named exactly "notebooks",
# step up one level. The final path component is compared exactly: a plain
# suffix test would also match folders such as "old_notebooks" and move the
# kernel out of the intended directory.
if os.path.basename(os.getcwd()) == 'notebooks':
    # Move up one level to the project root
    os.chdir('..')

print(f"Current Working Directory: {os.getcwd()}")

Current Working Directory: /home/hsozer/Projects/DeepLearning/recipe_bot


In [3]:
import pandas as pd

# Read the two pickled recipe tables.
# NOTE: unpickling can execute arbitrary code; only load files you produced.
df_kaggle = pd.read_pickle('data/processed/clean_recipes.pkl')
df_scraped = pd.read_pickle('data/processed/full_scraped_recipes.pkl')
print(f"Kaggle count: {len(df_kaggle)}")
print(f"Scraped count: {len(df_scraped)}")

# Stack both sources; the scraped rows come last in the combined frame.
df_combined = pd.concat([df_kaggle, df_scraped], ignore_index=True)

# One chained pipeline:
#   1. keep a single row per recipe name (keep='last' prefers the scraped
#      copy because it was concatenated after the Kaggle rows),
#   2. shuffle every row with a fixed seed,
#   3. renumber the index from zero.
df_final = (
    df_combined
    .drop_duplicates(subset=['name'], keep='last')
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
print(f"Final Training Set Size: {len(df_final)}")

# `df` is the name the following cells read from; persist the same frame.
df = df_final
df.to_pickle('data/processed/final_recipes.pkl')

Kaggle count: 225342
Scraped count: 2867
Final Training Set Size: 226816


In [5]:
import json
import sys
from scripts.data_prep import generate_llm_dataset

# CONFIG
# Settings for generating the LLM training pairs.

# Paths
# NOTE(review): INPUT_PATH is not referenced by any cell visible here; the
# generation cell is fed the in-memory `df` instead — confirm it is still needed.
INPUT_PATH = "data/processed/full_scraped_recipes.pkl"
# JSONL file that the generated training pairs are written to.
OUTPUT_PATH = "data/training/llm_train.jsonl"

# Thresholds
# Passed to generate_llm_dataset as `max_length`; how length is measured is
# decided inside that function.
MAX_LENGTH = 2048  # Max token length for the LLM

# The four dictionaries below are passed to generate_llm_dataset as
# meal_map / diet_map / method_map / style_map. Each maps a keyword (left) to
# the human-readable label (right); several keywords may share one label.
# NOTE(review): the "Rank N" remarks presumably record how common the keyword
# is in the source data — confirm against the analysis that produced them.

# Meal keywords
MEAL_KEYWORDS = {
    'main-dish': 'Dinner',           # Rank 10
    'side-dishes': 'Side Dish',      # Rank 42
    'desserts': 'Dessert',           # Rank 23
    'lunch': 'Lunch',                # Rank 48
    'appetizers': 'Appetizer',       # Rank 55
    'breakfast': 'Breakfast',        # Rank 65
    'brunch': 'Brunch',              # Rank 56
    'soups-stews': 'Soup',           # Rank 75
    'salads': 'Salad',               # Rank 63
    'beverages': 'Beverage',         # Rank 76
    'cocktails': 'Beverage',         # Rank 137
    'breads': 'Bread',               # Rank 64
    'snacks': 'Snack',               # Rank 100
    'finger-food': 'Snack'           # Rank 108
}

# Dietary keywords
DIET_KEYWORDS = {
    'low-sodium': 'Low-Sodium',          # Rank 22
    'low-carb': 'Low-Carb',              # Rank 24
    'healthy': 'Healthy',                # Rank 25
    'healthy-2': 'Healthy',              # Rank 43
    'low-cholesterol': 'Low-Cholesterol',# Rank 27
    'low-calorie': 'Low-Calorie',        # Rank 28
    'vegetarian': 'Vegetarian',          # Rank 29
    'vegan': 'Vegan',                    # Rank 82
    'low-protein': 'Low-Protein',        # Rank 34
    'low-saturated-fat': 'Low-Fat',      # Rank 35
    'low-fat': 'Low-Fat',                # Rank 50
    'very-low-carbs': 'Keto',            # Rank 86
    'high-protein': 'High-Protein',      # Rank 99
    'diabetic': 'Diabetic-Friendly',     # Rank 107
    'gluten-free': 'Gluten-Free',        # Rank 115
    'egg-free': 'Egg-Free',              # Rank 123
    'kosher': 'Kosher'                   # Rank 135
}

# Cooking method keywords
METHOD_KEYWORDS = {
    'oven': 'Oven Baked',                # Rank 37
    'stove-top': 'Stove Top',            # Rank 51
    'crock-pot-slow-cooker': 'Slow Cooker', # Rank 106
    'no-cook': 'No-Cook',                # Rank 110
    'grilling': 'Grilled',               # Rank 128
    'barbecue': 'BBQ',                   # Rank 140
    'food-processor-blender': 'Blended'  # Rank 126
}

# Style / occasion keywords
STYLE_KEYWORDS = {
    'easy': 'Easy',                      # Rank 6
    'beginner-cook': 'Beginner Friendly',# Rank 30
    'inexpensive': 'Budget Friendly',    # Rank 33
    'kid-friendly': 'Kid Friendly',      # Rank 41
    'toddler-friendly': 'Kid Friendly',  # Rank 134
    'comfort-food': 'Comfort Food',      # Rank 44
    'for-1-or-2': 'Single Serving',      # Rank 49
    'for-large-groups': 'Crowd Pleaser', # Rank 58
    'weeknight': 'Weeknight',            # Rank 53
    'one-dish-meal': 'One-Pot',          # Rank 60
    'potluck': 'Potluck',                # Rank 77
    'spicy': 'Spicy',                    # Rank 89
    'romantic': 'Date Night',            # Rank 122
    'oamc-freezer-make-ahead': 'Meal Prep' # Rank 139
}

In [6]:
# Run the Processing Script
# Build the training records from the merged recipe frame `df`, using the
# keyword maps and length limit defined in the config cell.
print("‚è≥ Generating training pairs...")
training_data = generate_llm_dataset(
    df,
    meal_map=MEAL_KEYWORDS,
    diet_map=DIET_KEYWORDS,
    method_map=METHOD_KEYWORDS,
    style_map=STYLE_KEYWORDS,
    max_length=MAX_LENGTH
)

# Save to JSONL (one JSON object per line)
print(f"‚è≥ Saving to {OUTPUT_PATH}...")
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
    for entry in training_data:
        json.dump(entry, f)
        f.write('\n')

print("‚úÖ Done! Ready for QLoRA.")

# Preview the first record. Guarded so that an empty result (for example when
# every recipe exceeds MAX_LENGTH) reports the situation instead of failing
# with an IndexError after the file has already been written.
if len(training_data) > 0:
    print("\n--- Sample Entry ---")
    print(f"INSTRUCTION: {training_data[0]['instruction']}")
    print(f"INPUT:       {training_data[0]['input']}")
    print(f"OUTPUT:      {training_data[0]['output']}")
else:
    print("\nNo training pairs were generated; nothing to preview.")

‚è≥ Generating training pairs...
   - Processed 223498 items.
   - Skipped 3318 items due to length > 2048.
‚è≥ Saving to data/training/llm_train.jsonl...
‚úÖ Done! Ready for QLoRA.

--- Sample Entry ---
INSTRUCTION: You are a smart chef. Generate a recipe that uses the provided ingredients and strictly follows the context constraints.
INPUT:       Ingredients: turkey, campbell's cream of chicken soup, milk, mixed vegetables, cornstarch, water. Context: Dinner, Easy, Beginner Friendly
OUTPUT:      **Easy As Can Be Creamed Turkey   Veggies**

Ingredients:
- turkey
- campbell's cream of chicken soup
- milk
- mixed vegetables
- cornstarch
- water

**Instructions:**
1. In a medium sized bowl add soup and milk , mixing well
2. Pour soup mixture into a large skillet
3. Add turkey and veggies
4. Bring to a simmer over medium heat
5. If it looks to thin , add cornstarch mixed with water
6. Stir until thickened
7. Season with pepper to taste
 </s>


In [None]:
import pandas as pd
import json
import random
import glob
import os
from sklearn.utils import shuffle

# PATHS
# Original robotic dataset
# (the JSONL file of instruction/input/output records read by the loading cell)
EXISTING_RECIPES_PATH = "data/training/llm_train.jsonl"
# Chat dataset
# NOTE: despite the "_FOLDER" name this is a single .jsonl file path; it is
# passed to glob.glob() when the persona data is loaded.
NEW_CHEF_DATA_FOLDER = "data/synthetic/chef_dataset.jsonl"

# The final file for Fine-Tuning
TRAIN_OUTPUT_PATH = "data/training/hybrid_train.jsonl"

# CONFIG 
# Number of shuffled recipe records kept for fine-tuning (used as
# full_df.head(RECIPE_FT_COUNT)).
RECIPE_FT_COUNT = 20000

In [None]:
print("Loading original recipe dataset...")

# Parse the JSONL file one record per line.
# - The file is opened as UTF-8 explicitly, matching how it was written, so
#   the result does not depend on the platform's default encoding.
# - Malformed lines are still skipped (best effort), but they are counted and
#   reported so silent data loss is visible.
data = []
skipped_lines = 0
with open(EXISTING_RECIPES_PATH, 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        try:
            data.append(json.loads(line))
        except json.JSONDecodeError:
            skipped_lines += 1

if skipped_lines:
    print(f"Warning: skipped {skipped_lines} malformed line(s) in {EXISTING_RECIPES_PATH}")

# Convert to DataFrame
full_df = pd.DataFrame(data)
print(f"‚úÖ Loaded {len(full_df)} total recipes.")

# Shuffle once before splitting (fixed seed keeps the selection reproducible)
full_df = shuffle(full_df, random_state=42).reset_index(drop=True)

# First RECIPE_FT_COUNT shuffled rows for Fine-Tuning; .copy() so later
# column assignments do not touch full_df.
train_recipes_df = full_df.head(RECIPE_FT_COUNT).copy()

print("üìä Split Statistics:")
print(f"   - Fine-Tuning Recipes: {len(train_recipes_df)}")

Loading original recipe dataset...
‚úÖ Loaded 223498 total recipes.
üìä Split Statistics:
   - Fine-Tuning Recipes: 20000
   - RAG Knowledge Base:  223498


In [None]:
# TEMPLATE EXPANSION

# INGREDIENT ONLY (Casual, Formal, Short, Long)
# User-prompt templates used by harmonize_row when a record has no context.
# The single placeholder {ingredients} is filled via str.format with a
# comma-separated list of at most the first four ingredients.
INGREDIENT_PROMPTS = [
    # Casual / Short
    "What can I cook with {ingredients}?",
    "I have {ingredients}. Ideas?",
    "Got {ingredients}. What now?",
    "Make me something with {ingredients}.",
    "Recipe for {ingredients}?",
    "How do I use {ingredients}?",
    "Dinner ideas using {ingredients}.",
    "Lunch options with {ingredients}.",
    "I found {ingredients} in the pantry.",
    "Whip up a meal with {ingredients}.",
    "Can {ingredients} be a meal?",
    "Cooking with {ingredients}.",
    "Just bought {ingredients}. Recipes?",
    "Leftover {ingredients}. Help.",
    "Any dishes strictly with {ingredients}?",
    
    # Conversational
    "I'm staring at {ingredients} and have no clue what to make.",
    "My partner bought {ingredients} and I need a plan.",
    "I'd like to prepare a dish that highlights {ingredients}.",
    "Could you suggest a menu that features {ingredients}?",
    "I am looking for a reliable recipe involving {ingredients}.",
    "Please help me combine {ingredients} into a tasty dish.",
    "What pairs well with {ingredients} for a main course?",
    "I want to try something new with {ingredients}.",
    "Give me your best recommendation for {ingredients}.",
    "How would a chef prepare {ingredients}?",
    
    # Creative / Challenge
    "Challenge: I only have {ingredients}.",
    "Iron Chef style: The secret ingredient is {ingredients}.",
    "Can you turn {ingredients} into a gourmet meal?",
    "Surprise me with a recipe for {ingredients}.",
    "I'm bored of my usuals. What can I do with {ingredients}?",
    "Invent a dish using {ingredients}.",
    "If you had {ingredients}, what would you cook?",
    "Transform these simple ingredients: {ingredients}.",
    "I need a comfort meal using {ingredients}.",
    "Make {ingredients} the star of the show."
]

# CONTEXT BASED (Dietary, Time, Method, Vibe)
# User-prompt templates used by harmonize_row when a record carries a context.
# Placeholders filled via str.format:
#   {context}     - the record's context string, lower-cased
#   {ingredients} - comma-separated list of at most the first four ingredients
CONTEXT_PROMPTS = [
    # Direct phrasing
    "I'm looking for a {context} recipe using {ingredients}.",
    "Can you make a {context} dish with {ingredients}?",
    "I have {ingredients}. Make something {context}.",
    "Give me a {context} meal featuring {ingredients}.",
    "How can I make a {context} dish using {ingredients}?",
    
    # "Need" phrasing
    "I need a {context} dinner idea with {ingredients}.",
    "I need something {context} made from {ingredients}.",
    "We need a {context} option for {ingredients}.",
    "My diet is {context}, and I have {ingredients}.",
    "Keep it {context} please. Ingredients: {ingredients}.",
    
    # "Want" phrasing
    "I want to eat {context}. I have {ingredients}.",
    "I'm craving something {context} with {ingredients}.",
    "I'm in the mood for {context} food using {ingredients}.",
    "Can you satisfy a {context} craving with {ingredients}?",
    "I'd love a {context} version of {ingredients}.",
    
    # Question phrasing
    "Show me a {context} way to cook {ingredients}.",
    "Do you have a {context} recipe for {ingredients}?",
    "Is there a {context} way to prepare {ingredients}?",
    "What is a good {context} dish with {ingredients}?",
    "Find me a {context} dish with {ingredients}.",
    
    # Complex / Specific
    "Please provide a {context} recipe that includes {ingredients}.",
    "Make it {context} and use {ingredients}.",
    "I am hosting a {context} dinner and have {ingredients}.",
    "I need to use up my {ingredients} but keep it {context}.",
    "Can you adapt {ingredients} into a {context} meal?",
    "It's a {context} kind of night. I have {ingredients}.",
    "Help me stay {context} with these ingredients: {ingredients}.",
    "Any {context} chef tips for {ingredients}?",
    "How to cook {ingredients} in a {context} style?"
]

# CHEF INTROS (Personality Injection)
# Opening lines that harmonize_row prepends to the recipe text. Every entry
# ends with a blank line ("\n\n") so the recipe starts in its own paragraph.
CHEF_INTROS = [
    # Enthusiastic
    "Here is a delicious recipe for you:\n\n",
    "I'd love to help! Try this dish:\n\n",
    "Let's get cooking! Here is a great choice:\n\n",
    "Ooh, great ingredients! Try this:\n\n",
    "I have the perfect idea for that:\n\n",
    
    # Professional
    "Certainly! This fits your needs perfectly:\n\n",
    "Here is a recipe that works well with those items:\n\n",
    "This is a fantastic option:\n\n",
    "Here is a classic preparation:\n\n",
    "Allow me to suggest this recipe:\n\n",
    
    # Confident
    "You can't go wrong with this:\n\n",
    "Trust me, this is delicious:\n\n",
    "This dish highlights your ingredients perfectly:\n\n",
    "Here is a chef-approved recipe:\n\n",
    "Simple, tasty, and perfect for you:\n\n",
    
    # Helpful / Short
    "How about this?\n\n",
    "Try this out:\n\n",
    "Here you go:\n\n",
    "One of my favorites:\n\n",
    "Here is a wonderful way to use those ingredients:\n\n"
]

In [None]:
def harmonize_row(row, context_prompts=None, ingredient_prompts=None,
                  chef_intros=None, rng=None):
    """Rewrite one structured training record as a conversational exchange.

    The record's ``input`` is expected to look like
    ``"Ingredients: a, b, c. Context: X, Y"`` (the context part is optional).
    A random user-style prompt is built from the first four ingredients (and
    the lower-cased context, when present), and a random chef intro is
    prepended to the recipe text with any ``</s>`` marker removed.

    Args:
        row: Mapping-like record (pandas row or dict) read via ``.get`` for
            the ``'input'`` and ``'output'`` keys.
        context_prompts: Templates with ``{context}`` and ``{ingredients}``;
            defaults to the module-level ``CONTEXT_PROMPTS``.
        ingredient_prompts: Templates with ``{ingredients}`` only; defaults
            to the module-level ``INGREDIENT_PROMPTS``.
        chef_intros: Intro strings; defaults to the module-level
            ``CHEF_INTROS``.
        rng: Optional object with a ``choice`` method (e.g.
            ``random.Random(seed)``) for reproducible sampling; defaults to
            the global ``random`` module.

    Returns:
        ``pd.Series([new_instruction, "", new_output])`` — the middle element
        is the now-empty ``input`` field.
    """
    # Resolve defaults at call time so the function can also be used with
    # caller-supplied templates.
    if context_prompts is None:
        context_prompts = CONTEXT_PROMPTS
    if ingredient_prompts is None:
        ingredient_prompts = INGREDIENT_PROMPTS
    if chef_intros is None:
        chef_intros = CHEF_INTROS
    picker = random if rng is None else rng

    raw_input = row.get('input', '')
    raw_output = row.get('output', '')
    # Missing cells reach us as NaN/None rather than ''; treat anything that
    # is not text as empty instead of failing on the substring test below.
    if not isinstance(raw_input, str):
        raw_input = ''
    if not isinstance(raw_output, str):
        raw_output = ''

    # Parse Input (Extract Ingredients & Context)
    ingredients = ""
    context = ""

    if "Ingredients:" in raw_input:
        parts = raw_input.split("Context:")
        ingredients = parts[0].replace("Ingredients:", "").strip()
        # Drop the single separator that precedes "Context:".
        if ingredients.endswith(('.', ',')):
            ingredients = ingredients[:-1]

        if len(parts) > 1:
            context = parts[1].strip()

    # Build New Instruction
    # Limiting to first 4 ingredients so the user prompt sounds natural;
    # empty fragments (e.g. from "a, , b") are dropped so they do not leave
    # dangling commas in the prompt.
    ing_list = [piece.strip() for piece in ingredients.split(',') if piece.strip()]
    main_ingredients = ", ".join(ing_list[:4])

    if context:
        template = picker.choice(context_prompts)
        new_instruction = template.format(context=context.lower(), ingredients=main_ingredients)
    else:
        template = picker.choice(ingredient_prompts)
        new_instruction = template.format(ingredients=main_ingredients)

    # Build New Output: intro + recipe without the end-of-sequence marker
    intro = picker.choice(chef_intros)
    clean_recipe = raw_output.replace("</s>", "").strip()
    new_output = f"{intro}{clean_recipe}"

    # Return in the format of instruction, input, output
    return pd.Series([new_instruction, "", new_output])

print("Harmonizing recipe formats (This may take a moment)...")
# Template and intro selection uses the global `random` module; seed it so
# the generated dataset is the same on every run (the shuffles in this
# notebook already use a fixed random_state of 42).
random.seed(42)
# Start again from the untouched shuffled rows. The assignment below
# overwrites 'input'/'output', so re-running this cell on the already
# harmonized frame would feed empty inputs and doubled intros back in.
train_recipes_df = full_df.head(RECIPE_FT_COUNT).copy()
# Apply transformation to the Training Subset
train_recipes_df[['instruction', 'input', 'output']] = train_recipes_df.apply(harmonize_row, axis=1)
print("‚úÖ Recipes Harmonized to Chef Persona")

Harmonizing recipe formats (This may take a moment)...
‚úÖ Recipes Harmonized to Chef Persona


In [None]:
print("Loading new Chef Persona data...")
# NEW_CHEF_DATA_FOLDER is used as a glob pattern, so it may name one file or
# a wildcard over several.
new_data_files = glob.glob(NEW_CHEF_DATA_FOLDER)
if not new_data_files:
    # Without this notice a wrong path would silently produce a training set
    # that contains no persona examples at all.
    print(f"Warning: no files matched {NEW_CHEF_DATA_FOLDER}; no Chef Persona examples will be added.")
new_data_list = []

for file in new_data_files:
    try:
        with open(file, 'r', encoding='utf-8') as f:
            for line_no, line in enumerate(f, start=1):
                if not line.strip():
                    continue
                # Handle bad records per line so one malformed entry does not
                # discard the remainder of the file.
                try:
                    new_data_list.append(json.loads(line))
                except json.JSONDecodeError as e:
                    print(f"Skipping malformed line {line_no} in {file}: {e}")
    except (OSError, UnicodeDecodeError) as e:
        print(f"Error reading {file}: {e}")

chef_persona_df = pd.DataFrame(new_data_list)
print(f"‚úÖ Loaded {len(chef_persona_df)} Chef Persona examples")

# Merging
# Combine the harmonized recipes with the Chef Persona examples
final_df = pd.concat([train_recipes_df, chef_persona_df], ignore_index=True)

# Shuffling (fixed seed so the row order is reproducible)
final_df = shuffle(final_df, random_state=42).reset_index(drop=True)

print(f"üöÄ Final Dataset Size: {len(final_df)} rows")

# Save as JSON Lines: one record per line
os.makedirs(os.path.dirname(TRAIN_OUTPUT_PATH), exist_ok=True)
final_df.to_json(TRAIN_OUTPUT_PATH, orient='records', lines=True)

print(f"üíæ Saved to {TRAIN_OUTPUT_PATH}")
print("Ready for Fine-Tuning!")

Loading new Chef Persona data...
‚úÖ Loaded 4720 Chef Persona examples
üöÄ Final Dataset Size: 24720 rows
üíæ Saved to data/training/hybrid_train.jsonl
Ready for Fine-Tuning!
