In [11]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
import os

# When the kernel was started inside the `notebooks` folder, step up one level
# so the relative paths used in this notebook resolve from the project root.
# The final path component is compared exactly: a plain string-suffix test
# would also fire for folders such as "old_notebooks" and move the working
# directory when it should not.
if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir('..')

print(f"Current Working Directory: {os.getcwd()}")

Current Working Directory: /home/hsozer/Projects/DeepLearning/recipe_bot


In [15]:
import pandas as pd
import json
import sys
from scripts.data_prep import generate_llm_dataset

# CONFIG
# Everything a reader might want to tune for the dataset build lives below.

# Paths
# Both paths are relative, so they depend on the current working directory.
# Pickled DataFrame of cleaned recipes; read with pd.read_pickle.
INPUT_PATH: str = "data/processed/clean_recipes.pkl"
# Destination of the generated training set, written as one JSON object per line.
OUTPUT_PATH: str = "data/training/llm_train.jsonl"

# Thresholds
# Passed to generate_llm_dataset as `max_length`.
# NOTE(review): described here as a token count -- confirm in
# scripts.data_prep whether the length is measured in tokens or characters.
MAX_LENGTH: int = 2048  # Max token length for the LLM

# Meal keywords: raw recipe tag -> meal-type label (passed as `meal_map`).
# Several tags may share one label (e.g. both beverage tags -> "Beverage").
# The "rank" notes are carried over from the original annotations; presumably
# the tag's frequency rank in the dataset -- confirm against the source data.
MEAL_KEYWORDS = {
    "main-dish": "Dinner",  # rank 10
    "side-dishes": "Side Dish",  # rank 42
    "desserts": "Dessert",  # rank 23
    "lunch": "Lunch",  # rank 48
    "appetizers": "Appetizer",  # rank 55
    "breakfast": "Breakfast",  # rank 65
    "brunch": "Brunch",  # rank 56
    "soups-stews": "Soup",  # rank 75
    "salads": "Salad",  # rank 63
    "beverages": "Beverage",  # rank 76
    "cocktails": "Beverage",  # rank 137
    "breads": "Bread",  # rank 64
    "snacks": "Snack",  # rank 100
    "finger-food": "Snack",  # rank 108
}

# Dietary keywords: raw recipe tag -> dietary label (passed as `diet_map`).
# Some tags collapse onto a shared label ("healthy"/"healthy-2" -> "Healthy",
# "low-saturated-fat"/"low-fat" -> "Low-Fat").
# The "rank" notes are carried over from the original annotations; presumably
# the tag's frequency rank in the dataset -- confirm against the source data.
DIET_KEYWORDS = {
    "low-sodium": "Low-Sodium",  # rank 22
    "low-carb": "Low-Carb",  # rank 24
    "healthy": "Healthy",  # rank 25
    "healthy-2": "Healthy",  # rank 43
    "low-cholesterol": "Low-Cholesterol",  # rank 27
    "low-calorie": "Low-Calorie",  # rank 28
    "vegetarian": "Vegetarian",  # rank 29
    "vegan": "Vegan",  # rank 82
    "low-protein": "Low-Protein",  # rank 34
    "low-saturated-fat": "Low-Fat",  # rank 35
    "low-fat": "Low-Fat",  # rank 50
    "very-low-carbs": "Keto",  # rank 86
    "high-protein": "High-Protein",  # rank 99
    "diabetic": "Diabetic-Friendly",  # rank 107
    "gluten-free": "Gluten-Free",  # rank 115
    "egg-free": "Egg-Free",  # rank 123
    "kosher": "Kosher",  # rank 135
}

# Cooking method keywords: raw recipe tag -> cooking-method label
# (passed as `method_map`).
# The "rank" notes are carried over from the original annotations; presumably
# the tag's frequency rank in the dataset -- confirm against the source data.
METHOD_KEYWORDS = {
    "oven": "Oven Baked",  # rank 37
    "stove-top": "Stove Top",  # rank 51
    "crock-pot-slow-cooker": "Slow Cooker",  # rank 106
    "no-cook": "No-Cook",  # rank 110
    "grilling": "Grilled",  # rank 128
    "barbecue": "BBQ",  # rank 140
    "food-processor-blender": "Blended",  # rank 126
}

# Style keywords: raw recipe tag -> style/occasion label (passed as `style_map`).
# "kid-friendly" and "toddler-friendly" share the label "Kid Friendly".
# The "rank" notes are carried over from the original annotations; presumably
# the tag's frequency rank in the dataset -- confirm against the source data.
STYLE_KEYWORDS = {
    "easy": "Easy",  # rank 6
    "beginner-cook": "Beginner Friendly",  # rank 30
    "inexpensive": "Budget Friendly",  # rank 33
    "kid-friendly": "Kid Friendly",  # rank 41
    "toddler-friendly": "Kid Friendly",  # rank 134
    "comfort-food": "Comfort Food",  # rank 44
    "for-1-or-2": "Single Serving",  # rank 49
    "for-large-groups": "Crowd Pleaser",  # rank 58
    "weeknight": "Weeknight",  # rank 53
    "one-dish-meal": "One-Pot",  # rank 60
    "potluck": "Potluck",  # rank 77
    "spicy": "Spicy",  # rank 89
    "romantic": "Date Night",  # rank 122
    "oamc-freezer-make-ahead": "Meal Prep",  # rank 139
}

In [16]:
# Execute data preparation: load the cleaned recipes, build the training
# entries, write them out as JSONL, then preview the first entry.

# Load data
# NOTE: unpickling can execute arbitrary code -- only point INPUT_PATH at
# files produced by this project.
print(f"⏳ Loading data from {INPUT_PATH}...")
if not os.path.exists(INPUT_PATH):
    # Fail fast. Continuing would either raise a NameError on `df` in a fresh
    # kernel or silently reuse a stale `df` left over from an earlier run.
    print("❌ Input file not found.")
    raise FileNotFoundError(f"Input file not found: {INPUT_PATH}")
df = pd.read_pickle(INPUT_PATH)
print(f"✅ Loaded {len(df):,} recipes.")

# Run the processing script
print("⏳ Generating training pairs...")
training_data = generate_llm_dataset(
    df,
    meal_map=MEAL_KEYWORDS,
    diet_map=DIET_KEYWORDS,
    method_map=METHOD_KEYWORDS,
    style_map=STYLE_KEYWORDS,
    max_length=MAX_LENGTH
)

# Save to JSONL (one JSON object per line)
print(f"⏳ Saving to {OUTPUT_PATH}...")
# dirname() is "" for a bare filename and makedirs("") raises, so fall back
# to the current directory in that case.
os.makedirs(os.path.dirname(OUTPUT_PATH) or ".", exist_ok=True)

with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
    for entry in training_data:
        json.dump(entry, f)
        f.write('\n')

print("✅ Done! Ready for QLoRA.")

# Preview the first generated entry (guarded: the result may be empty,
# e.g. if every item was skipped).
print("\n--- Sample Entry ---")
if len(training_data) > 0:
    sample_entry = training_data[0]
    print(f"INSTRUCTION: {sample_entry['instruction']}")
    print(f"INPUT:       {sample_entry['input']}")
else:
    print("(no entries were generated)")

⏳ Loading data from data/processed/clean_recipes.pkl...
✅ Loaded 225,342 recipes.
⏳ Generating training pairs...
   - Processed 222083 items.
   - Skipped 3259 items due to length > 2048.
⏳ Saving to data/training/llm_train.jsonl...
✅ Done! Ready for QLoRA.

--- Sample Entry ---
INSTRUCTION: You are a smart chef. Generate a recipe that uses the provided ingredients and strictly follows the context constraints.
INPUT:       Ingredients: winter squash, mexican seasoning, mixed spice, honey, butter, olive oil, salt. Context: Side Dish, Vegetarian, Easy
