In [None]:
import numpy as np
import pandas as pd
import requests as rq
import openpyxl as op
import bs4  
import lxml as lx
import tqdm as tq
from bs4 import BeautifulSoup

from datasets import load_dataset

In [4]:
import requests

def estimate_recipe_cost(ingredients, servings=1, api_key=None):
    """Estimate the cost of a recipe with Spoonacular's ``parseIngredients`` endpoint.

    Args:
        ingredients: Iterable of free-text ingredient lines (e.g. ``"1 cup milk"``);
            they are sent to the API joined by newlines.
        servings: Number of servings sent along with the ingredient list.
        api_key: RapidAPI key. When omitted, the key is read from the
            ``RAPIDAPI_KEY`` environment variable so that no credential has to
            be stored in the notebook.

    Returns:
        ``(total_cost, breakdown)``. ``total_cost`` is the sum of the per-item
        costs converted from cents to dollars; ``breakdown`` is a list of
        ``(name, cost_usd, "USD")`` tuples. If the API answers with a non-200
        status, the error is printed and ``(None, [])`` is returned.

    Raises:
        ValueError: If no key is passed and ``RAPIDAPI_KEY`` is not set.
    """
    import os

    if api_key is None:
        api_key = os.environ.get("RAPIDAPI_KEY")
    if not api_key:
        raise ValueError(
            "No RapidAPI key available: pass api_key=... or set the RAPIDAPI_KEY environment variable."
        )

    url = "https://spoonacular-recipe-food-nutrition-v1.p.rapidapi.com/recipes/parseIngredients"

    headers = {
        "x-rapidapi-key": api_key,
        "x-rapidapi-host": "spoonacular-recipe-food-nutrition-v1.p.rapidapi.com",
        "content-type": "application/x-www-form-urlencoded"
    }

    payload = {
        "ingredientList": "\n".join(ingredients),
        "servings": servings
    }

    # A timeout keeps a stalled connection from hanging the kernel forever.
    response = requests.post(url, data=payload, headers=headers, timeout=30)

    if response.status_code != 200:
        print(f"Error {response.status_code}: {response.text}")
        return None, []

    results = response.json()

    total_cost = 0.0
    breakdown = []

    for item in results:
        name = item.get('name', 'unknown')
        # `estimatedCost` (or its `value`) may be missing or null for
        # ingredients the API could not price; count those as zero.
        cost_info = item.get('estimatedCost') or {}
        cost_cents = cost_info.get('value') or 0.0
        cost_usd = cost_cents / 100.0
        total_cost += cost_usd
        breakdown.append((name, cost_usd, "USD"))

    return total_cost, breakdown

In [None]:
import os

# Path of the full recipe CSV. Set the RECIPENLG_CSV environment variable to
# point at the file on another machine instead of editing this cell; the
# fallback is the original hardcoded location.
manual_path = os.environ.get(
    "RECIPENLG_CSV",
    "/Users/sravankundurthi/NutritionApp/data/recipenlg/full_dataset.csv",
)


In [None]:
import pandas as pd

def show_recipe_text(index, frame=None):
    """Print the title, ingredients, directions, NER tags and source of one recipe.

    Args:
        index: Zero-based row position of the recipe to show.
        frame: DataFrame to read from. Defaults to the notebook-level ``df``.
            The frame must have the columns ``title``, ``ingredients``,
            ``directions``, ``NER``, ``link`` and ``source``.

    Returns:
        None. An out-of-range index prints a message instead of raising.
    """
    if frame is None:
        frame = df
    if index < 0 or index >= len(frame):
        print(f"Index {index} is out of bounds.")
        return

    recipe = frame.iloc[index]
    print(f"\n📘 Recipe #{index}: {recipe['title']}")
    print(f"\n📝 Ingredients:\n{recipe['ingredients']}")
    print(f"\n👨‍🍳 Directions:\n{recipe['directions']}")
    print(f"\n🔖 Tags / NER:\n{recipe['NER']}")
    print(f"\n🔗 Source: {recipe['link']} ({recipe['source']})")


# Attempt to load the file and display the first recipe.
# The helper above is defined outside the try block so that it exists even
# when loading fails.
try:
    df = pd.read_csv(manual_path)
    show_recipe_text(0)

except FileNotFoundError:
    print(f"❌ File not found at: {manual_path}")
except Exception as e:
    print(f"❌ An error occurred: {e}")

In [None]:
    show_recipe_text(25)

In [None]:
# Work on an independent copy of the first 100 rows: a later cell adds a
# column to `subset`, and assigning into a plain slice of `df` triggers
# pandas' SettingWithCopyWarning.
subset = df.iloc[:100].copy()

In [None]:
    def show_subset_text(index):
        """Print one recipe from the notebook-level ``subset`` frame.

        Args:
            index: Zero-based row position within ``subset``.

        Returns:
            None. An index outside ``subset`` prints a message instead of raising.
        """
        # Bounds and lookup both use `subset`; checking against `df` would
        # accept positions that are not part of the subset at all.
        if index < 0 or index >= len(subset):
            print(f"Index {index} is out of bounds.")
            return

        # A distinct local name avoids shadowing the global `subset` frame.
        recipe = subset.iloc[index]
        print(f"\n📘 Recipe #{index}: {recipe['title']}")
        print(f"\n📝 Ingredients:\n{recipe['ingredients']}")
        print(f"\n👨‍🍳 Directions:\n{recipe['directions']}")
        print(f"\n🔖 Tags / NER:\n{recipe['NER']}")
        print(f"\n🔗 Source: {recipe['link']} ({recipe['source']})")
    show_subset_text(0)

In [None]:
import pandas as pd
import ast
from IPython.display import display

# Step 1: Load the raw CSV
df = pd.read_csv('recipe_subset_raw.csv')

# Step 2: Standardize column names
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

# Step 3: Drop duplicates
df = df.drop_duplicates()

# Step 4: Drop rows with missing essential info
df = df.dropna(subset=['title', 'ingredients'])

# Step 5: Clean and normalize text
df['title'] = df['title'].str.strip().str.lower()

# Step 6: Convert stringified lists to actual Python lists
# (columns that are absent from the file are skipped instead of raising KeyError)
for col in ['ingredients', 'directions', 'ner']:
    if col in df.columns:
        df[col] = df[col].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) and x.startswith('[') else x)

# Step 7: Drop the index column if it exists
if 'unnamed: 0' in df.columns:
    df = df.drop(columns=['unnamed: 0'])

# Step 8: Remove duplicate titles
df = df.drop_duplicates(subset=['title'])

# Step 9: Optional - Convert calories to numeric if present
if 'calories' in df.columns:
    df['calories'] = pd.to_numeric(df['calories'], errors='coerce')

# Step 10: Drop any rows with remaining critical missing data
# NOTE: this drops a row when ANY column is missing, including values that
# step 9 coerced to NaN.
df = df.dropna()

# Step 11: Save the cleaned dataset
df.to_csv('recipe_subset_cleaned.csv', index=False)

# Step 12: Display summary and preview
print("✅ Cleaned Data Summary:")
df.info()  # info() prints the summary itself and returns None, so it is not wrapped in print()
print("\n📊 Sample Recipes:")
# sample() raises when asked for more rows than exist, so cap the sample size.
display(df.sample(min(5, len(df))))

In [None]:
import re
from fractions import Fraction

# Common cooking units: abbreviation / plural -> canonical singular name.
# Only tokens listed here are recognised as units by parse_ingredient().
unit_mappings = {
    "c.": "cup", "c": "cup", "cup": "cup", "cups": "cup",
    "tbsp": "tablespoon", "tbsp.": "tablespoon", "tablespoon": "tablespoon", "tablespoons": "tablespoon",
    "tsp": "teaspoon", "tsp.": "teaspoon", "teaspoon": "teaspoon", "teaspoons": "teaspoon",
    "oz.": "ounce", "oz": "ounce", "ounce": "ounce", "ounces": "ounce",
    "pkg.": "package", "pkg": "package", "package": "package", "packages": "package",
    "lb.": "pound", "lb": "pound", "lbs": "pound", "lbs.": "pound", "pound": "pound", "pounds": "pound",
    "g.": "gram", "g": "gram", "gram": "gram", "grams": "gram",
    "kg": "kilogram", "kilogram": "kilogram", "kilograms": "kilogram",
    "ml": "milliliter", "milliliter": "milliliter", "milliliters": "milliliter",
    "l": "liter", "liter": "liter", "liters": "liter",
    "pt": "pint", "pt.": "pint", "pint": "pint", "pints": "pint",
    "qt": "quart", "qt.": "quart", "quart": "quart", "quarts": "quart",
    "gal": "gallon", "gal.": "gallon", "gallon": "gallon", "gallons": "gallon",
    "can": "can", "cans": "can", "jar": "jar", "jars": "jar",
    "stick": "stick", "sticks": "stick", "clove": "clove", "cloves": "clove",
    "slice": "slice", "slices": "slice", "pinch": "pinch", "dash": "dash",
    "bunch": "bunch", "head": "head", "box": "box", "bag": "bag",
    "bottle": "bottle", "carton": "carton", "envelope": "envelope",
}

# Descriptors to remove from ingredient names
descriptors = ["firmly packed", "bite size", "broken", "chopped", "shredded",
               "large", "small", "medium", "fresh", "whole", "uncooked", "cooked"]

# A quantity is a mixed number ("1 1/2"), a fraction ("1/2"), a decimal ("1.5")
# or an integer ("2"); the alternatives are ordered longest-first.
_QTY = r"\d+\s+\d+/\d+|\d+/\d+|\d+\.\d+|\d+"
# A range needs BOTH ends, and "to" must be a whole word; otherwise
# "2 tomatoes" would be read as the range "2 to ..." followed by "matoes".
_RANGE_RE = re.compile(rf"(?P<qty1>{_QTY})\s*(?:–|-|\bto\b)\s*(?P<qty2>{_QTY})")
_SINGLE_RE = re.compile(rf"(?P<qty>{_QTY})")
_UNIT_RE = re.compile(r"(?P<unit>\w+\.?)\s*")
# Descriptors are removed as whole words only (longest first), so "large"
# is not cut out of "enlarged".
_DESCRIPTOR_RE = re.compile(
    r"\b(?:" + "|".join(re.escape(d) for d in sorted(descriptors, key=len, reverse=True)) + r")\b"
)


def normalize_quantity(qty_str):
    """Convert a quantity string to a float.

    Accepts integers, fractions, decimals and whitespace-separated mixed
    numbers ("1 1/2" -> 1.5). Returns None when the text cannot be parsed
    (including a zero denominator or a non-string argument).
    """
    try:
        return float(sum(Fraction(s) for s in qty_str.strip().split()))
    except (ValueError, ZeroDivisionError, AttributeError, TypeError):
        return None


def parse_ingredient(raw):
    """Split a free-text ingredient line into quantity, unit and item.

    Args:
        raw: Ingredient text such as "2–3 tbsp olive oil".

    Returns:
        A dict with the keys ``quantity_min``, ``quantity_max``, ``unit`` and
        ``item``. For a single quantity both bounds are equal. When the line
        does not start with a quantity, the three numeric/unit fields are None
        and ``item`` is the lower-cased, stripped input. ``unit`` is None when
        the word after the quantity is not a key of ``unit_mappings``; that
        word then stays part of ``item``.
    """
    raw = raw.lower().strip()

    # Match quantity range or single quantity
    match = _RANGE_RE.match(raw)
    if match:
        qty_min = normalize_quantity(match.group("qty1"))
        qty_max = normalize_quantity(match.group("qty2"))
    else:
        match = _SINGLE_RE.match(raw)
        if not match:
            return {"quantity_min": None, "quantity_max": None, "unit": None, "item": raw}
        qty_min = qty_max = normalize_quantity(match.group("qty"))
    rest = raw[match.end():].strip()

    # Extract unit: the next word is consumed only if it is a known unit,
    # so "2 eggs" keeps "eggs" as the item instead of losing it to `unit`.
    unit = None
    unit_match = _UNIT_RE.match(rest)
    if unit_match:
        token = unit_match.group("unit")
        canonical = unit_mappings.get(token) or unit_mappings.get(token.rstrip("."))
        if canonical:
            unit = canonical
            rest = rest[unit_match.end():]

    # Clean descriptors, then tidy the whitespace their removal leaves behind
    rest = _DESCRIPTOR_RE.sub("", rest)
    rest = re.sub(r"\s+", " ", rest)
    item = rest.strip(",.- ")

    return {
        "quantity_min": qty_min,
        "quantity_max": qty_max,
        "unit": unit,
        "item": item
    }

In [None]:
parse_ingredient("2–3 tbsp olive oil")

In [None]:
parse_ingredient("1/2 - 1 tsp salt")


In [None]:
parse_ingredient("1 c. firmly packed brown sugar")


In [None]:
df['parsed_ingredients'] = df['ingredients'].apply(
    lambda lst: [parse_ingredient(ing) for ing in lst]
)

In [None]:
df[['title', 'parsed_ingredients']].head(5)

In [None]:
df['parsed_ingredients'].iloc[0]
df['parsed_ingredients'].iloc[0]

In [None]:
df['parsed_ingredients'].iloc[0]


In [None]:
import re

def clean_instruction(step):
    """Normalise one direction step.

    Restores degree signs that arrive as escaped text, turns en dashes into
    hyphens, collapses runs of whitespace, and makes sure a non-empty step
    ends with sentence punctuation.

    Args:
        step: The instruction text.

    Returns:
        The cleaned text. An empty or whitespace-only step yields "".
    """
    # Fix encoding and normalize punctuation.
    # The replacement targets the literal six characters backslash-u-0-0-b-0;
    # replacing the real degree character with itself would change nothing.
    step = step.replace('\\u00b0', '°').replace('–', '-')
    step = re.sub(r"\s+", " ", step).strip()  # remove extra spaces
    # Do not append a period after "!" or "?", and do not turn an empty step into ".".
    if step and not step.endswith(('.', '!', '?')):
        step += '.'
    return step

df['cleaned_directions'] = df['directions'].apply(
    lambda lst: [clean_instruction(step) for step in lst]
)

In [None]:
df[['title','parsed_ingredients', 'cleaned_directions']].sample(3)


In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")

def _pick_object(verb):
    """Return the text of the most object-like dependent of ``verb``, or None."""
    children = list(verb.children)

    # 1. A direct object or attribute is the best candidate, wherever it
    #    appears among the children.
    for child in children:
        if child.dep_ in ("dobj", "attr"):
            return child.text

    # 2. Otherwise use the noun governed by a preposition ("pour into bowl"
    #    -> "bowl"). NOTE(review): this assumes the parser attaches `pobj`
    #    to the preposition rather than to the verb — confirm for the model in use.
    for child in children:
        if child.dep_ == "prep":
            for grandchild in child.children:
                if grandchild.dep_ == "pobj":
                    return grandchild.text

    # 3. Last resort: the first prep/pobj/conj child itself.
    for child in children:
        if child.dep_ in ("prep", "pobj", "conj"):
            return child.text
    return None


def extract_actions(step):
    """Extract verb/object pairs from one instruction step.

    Args:
        step: Instruction text; it is parsed with the notebook-level ``nlp`` pipeline.

    Returns:
        A list of ``{"action": <verb lemma>, "object": <token text>}`` dicts,
        one per verb for which an object could be found.
    """
    doc = nlp(step)
    pairs = []
    for token in doc:
        if token.pos_ != "VERB":
            continue
        obj = _pick_object(token)
        if obj:
            pairs.append({"action": token.lemma_, "object": obj})
    return pairs


def extract_from_all_steps(step_list):
    """Concatenate the action/object pairs of every step in ``step_list``."""
    return [pair for step in step_list for pair in extract_actions(step)]

df['action_object_pairs'] = df['cleaned_directions'].apply(extract_from_all_steps)


In [None]:
df[['title', 'action_object_pairs']].sample(3)


In [None]:
def tag_course(title):
    """Guess the course of a recipe from keywords in its title.

    The courses are tried in a fixed order (dessert, main, side, snack,
    beverage) and the first one with a matching keyword wins. Matching is a
    case-insensitive substring test, so a keyword also matches inside a longer
    word (for example "tea" is found in "steak").

    Args:
        title: Recipe title.

    Returns:
        One of "dessert", "main", "side", "snack", "beverage" or "unknown".
    """
    lowered = title.lower()

    keyword_table = (
        ("dessert", ("cookie", "cake", "pie", "brownie", "ice cream", "pudding", "crumble")),
        ("main", ("casserole", "stew", "chili", "chicken", "meatloaf", "lasagna", "pasta", "taco", "burger", "soup")),
        ("side", ("rice", "corn", "beans", "potatoes", "grits", "coleslaw", "stuffing", "macaroni")),
        ("snack", ("bars", "bites", "snack", "roll", "muffin", "dip", "spread")),
        ("beverage", ("lemonade", "smoothie", "shake", "juice", "tea")),
    )

    for course, keywords in keyword_table:
        for keyword in keywords:
            if keyword in lowered:
                return course
    return "unknown"


In [None]:
def tag_diet(ner_list):
    """Derive diet tags from a recipe's list of ingredient entities.

    A tag is added when none of the ingredient names mentions a keyword of
    the corresponding exclusion list. A keyword counts as mentioned when it
    occurs as a whole word (optionally pluralised with "s"/"es") anywhere in
    an ingredient name, so "chicken breasts" and "eggs" are recognised while
    "eggplant" and "graham" are not.

    Args:
        ner_list: List of ingredient names. Non-string entries are ignored.

    Returns:
        A list containing any of "vegetarian", "vegan", "gluten-free",
        "dairy-free" and "low-carb", in that order. Returns [] when
        ``ner_list`` is not a list.
    """
    import re

    if not isinstance(ner_list, list):
        return []

    names = [entry.lower() for entry in ner_list if isinstance(entry, str)]

    def mentions(keywords):
        # Whole-word match with an optional plural suffix.
        pattern = re.compile(
            r"\b(?:" + "|".join(re.escape(k) for k in keywords) + r")(?:e?s)?\b"
        )
        return any(pattern.search(name) for name in names)

    meats = ['chicken', 'beef', 'pork', 'bacon', 'turkey', 'ham', 'sausage', 'fish', 'shrimp']
    animal_products = meats + ['milk', 'cheese', 'butter', 'egg', 'cream', 'yogurt']

    tags = []
    if not mentions(meats):
        tags.append("vegetarian")
    if not mentions(animal_products):
        tags.append("vegan")
    if not mentions(['flour', 'bread', 'pasta', 'wheat', 'breadcrumbs']):
        tags.append("gluten-free")
    if not mentions(['milk', 'cheese', 'butter', 'cream', 'yogurt']):
        tags.append("dairy-free")
    if not mentions(['sugar', 'flour', 'rice', 'bread', 'pasta', 'potato']):
        tags.append("low-carb")

    return tags


In [None]:
def generate_tags(row):
    """Build the tag list for one recipe row.

    The course returned by ``tag_course(row['title'])`` comes first (it is
    left out when it is "unknown"), followed by every tag returned by
    ``tag_diet(row['ner'])``.
    """
    course = tag_course(row['title'])
    course_tags = [] if course == "unknown" else [course]
    return course_tags + list(tag_diet(row['ner']))

df['tags'] = df.apply(generate_tags, axis=1)


In [None]:
df['tags'] = df.apply(generate_tags, axis=1)


In [None]:
from openai import OpenAI
import pandas as pd

# 🔐 Never paste an API key into the notebook. With no arguments the client
# reads the key from the OPENAI_API_KEY environment variable.
client = OpenAI()

# 🧠 Prompt builder
def build_tag_prompt(title, ingredients):
    """Compose the classification prompt for one recipe.

    Args:
        title: Recipe title, inserted verbatim.
        ingredients: Iterable of strings; they are joined with ", ".

    Returns:
        The prompt text. It starts and ends with a newline and asks for a JSON
        object with the keys "course", "diet" and "cuisine".
    """
    ingredient_text = ', '.join(ingredients)
    prompt_lines = [
        "",
        "Classify this recipe based on the following fields:",
        "",
        "- Course: one of [main, dessert, side, snack, appetizer, beverage]",
        "- Diet: any of [vegetarian, vegan, gluten-free, none]",
        '- Cuisine: if possible, otherwise return "unspecified"',
        "",
        "Recipe:",
        f"Title: {title}",
        f"Ingredients: {ingredient_text}",
        "",
        "Respond in JSON like this:",
        '{"course": "...", "diet": "...", "cuisine": "..."}',
        "",
    ]
    return "\n".join(prompt_lines)

# 💬 GPT tagger function
def gpt_tag_recipe(title, ingredients):
    """Ask the chat model to classify a recipe and return its answer as a dict.

    Args:
        title: Recipe title.
        ingredients: Ingredient names passed on to ``build_tag_prompt``.

    Returns:
        The JSON object found in the model's reply. If anything fails
        (building the prompt, the API call, or parsing the reply), the error
        is printed and the fallback
        ``{"course": "unknown", "diet": "unknown", "cuisine": "unspecified"}``
        is returned.
    """
    import json

    try:
        prompt = build_tag_prompt(title, ingredients)
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.2
        )
        content = response.choices[0].message.content or ""

        # The reply is untrusted text: parse it as JSON, never eval() it.
        # Slicing from the first "{" to the last "}" tolerates code fences or
        # prose that the model may wrap around the object.
        start, end = content.find("{"), content.rfind("}")
        if start == -1 or end < start:
            raise ValueError("no JSON object found in the model response")
        return json.loads(content[start:end + 1])
    except Exception as e:
        print(f"❌ GPT tagging failed for '{title}': {e}")
        return {"course": "unknown", "diet": "unknown", "cuisine": "unspecified"}

In [None]:
test_df = df.head(5).copy()

test_df['gpt_tags'] = test_df.apply(
    lambda row: gpt_tag_recipe(row['title'], row['ner']), axis=1
)

test_df['gpt_course'] = test_df['gpt_tags'].apply(lambda x: x.get('course'))
test_df['gpt_diet'] = test_df['gpt_tags'].apply(lambda x: x.get('diet'))
test_df['gpt_cuisine'] = test_df['gpt_tags'].apply(lambda x: x.get('cuisine'))

# Display results
test_df[['title', 'gpt_course', 'gpt_diet', 'gpt_cuisine']]

In [8]:
ingredients = [
    "1 cup cottage cheese",
    "2 cup strawberries",
    "1 cup blueberries",
    "4 tsp honey"
]

total, breakdown = estimate_recipe_cost(ingredients, servings=2)

# estimate_recipe_cost returns (None, []) when the request fails; formatting
# None with ":.2f" would raise a TypeError.
if total is None:
    print("\nTotal Estimated Cost: unavailable")
else:
    print(f"\nTotal Estimated Cost: ${total:.2f}")
    for name, cost, unit in breakdown:
        print(f" - {name}: ${cost:.2f} {unit}")


Total Estimated Cost: $5.13
 - cottage cheese: $1.05 USD
 - strawberries: $2.57 USD
 - blueberries: $1.16 USD
 - honey: $0.34 USD


In [None]:
def get_price_wrapper(ingredient_list):
    """Return the estimated recipe price rounded to cents, or None on failure.

    Args:
        ingredient_list: List of ingredient strings, or the string form of
            such a list as it is stored in a CSV column.

    Returns:
        The total from ``estimate_recipe_cost`` rounded to two decimals, or
        None when no estimate is available or the lookup raises.
    """
    import ast

    try:
        # A stringified list must be parsed first; joining a plain string
        # would send one character per line to the API.
        if isinstance(ingredient_list, str):
            ingredient_list = ast.literal_eval(ingredient_list)
        # No api_key is passed here: credentials are resolved by
        # estimate_recipe_cost and are not repeated in this cell.
        total, _ = estimate_recipe_cost(ingredient_list, servings=1)
        return None if total is None else round(total, 2)
    except Exception as exc:
        # `except Exception` (not a bare except) keeps Ctrl-C / kernel
        # interrupts working while this runs over many rows.
        print(f"Price lookup failed: {exc}")
        return None

subset['estimated_price_usd'] = subset['ingredients'].apply(get_price_wrapper)


In [None]:
subset.to_csv("priced_recipe_subset.csv", index=False)


In [82]:
import pandas as pd
import ast

df = pd.read_csv("/Users/sravankundurthi/NutritionApp/data/recipenlg/full_dataset.csv")
print(df.columns)

Index(['Unnamed: 0', 'title', 'ingredients', 'directions', 'link', 'source',
       'NER'],
      dtype='object')


In [93]:
import ast

# Convert ingredient strings to lists.
# Values that are already lists are kept as they are, so running this cell a
# second time does not replace every parsed list with [].
df['ingredients'] = df['ingredients'].apply(
    lambda x: x if isinstance(x, list) else (ast.literal_eval(x) if isinstance(x, str) else [])
)

# Estimate price based on ingredient count
def rough_price_from_count(ingredient_list, base=0.50, per_ingredient=0.50):
    """Return a crude price estimate that grows linearly with the ingredient count.

    Args:
        ingredient_list: Sized collection of ingredients; only its length is used.
        base: Fixed amount added to every recipe.
        per_ingredient: Amount added for each ingredient.

    Returns:
        ``base + len(ingredient_list) * per_ingredient``.
    """
    return base + len(ingredient_list) * per_ingredient

df['estimated_price_usd'] = df['ingredients'].apply(rough_price_from_count)

In [95]:
df.to_csv("full_dataset_with_estimated_prices.csv", index=False)


In [101]:
print(df.columns)


Index(['unnamed:_0', 'title', 'ingredients', 'directions', 'link', 'source',
       'ner', 'estimated_price_usd'],
      dtype='object')


In [103]:
import pandas as pd
import ast
import json

# Load the data
df = pd.read_csv("full_dataset_with_estimated_prices.csv")

# Parse lists from strings
for col in ['ingredients', 'directions', 'ner']:
    df[col] = df[col].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) and x.startswith('[') else x)

# Convert each row to FinalRecipeSchema format
def build_recipe_json(row):
    """Convert one recipe row into the structured recipe dict.

    Args:
        row: Mapping-like row (a pandas Series or a dict) with the keys
            ``title``, ``ingredients``, ``directions``, ``ner`` and
            ``estimated_price_usd``.

    Returns:
        A dict with the keys ``title``, ``ingredients``, ``directions``,
        ``cuisine``, ``diet``, ``tags``, ``macro_estimate`` and
        ``cost_estimate``. List fields that are missing (NaN/None) become
        empty lists instead of raising or leaking NaN into the JSON output.
    """
    def as_list(value):
        # Lists pass through; a bare non-empty string is kept as a single
        # entry; anything else (NaN from a missing CSV cell, None) becomes [].
        if isinstance(value, (list, tuple)):
            return list(value)
        if isinstance(value, str) and value.strip():
            return [value]
        return []

    title = row["title"]
    # `x != x` is true only for NaN, which json.dump would write as invalid JSON.
    if title is None or (isinstance(title, float) and title != title):
        title = ""

    return {
        "title": title,
        "ingredients": [{"quantity": "", "unit": "", "name": item} for item in as_list(row["ingredients"])],
        "directions": as_list(row["directions"]),
        "cuisine": "unknown",  # Not in your dataset
        "diet": "unknown",     # Not in your dataset
        "tags": as_list(row["ner"]) + ["unknown", "unknown"],  # Add cuisine/diet as placeholders
        "macro_estimate": {
            "calories": None,
            "protein": None,
            "carbs": None,
            "fat": None
        },
        "cost_estimate": row["estimated_price_usd"]
    }

# Apply conversion
json_recipes = df.apply(build_recipe_json, axis=1).tolist()

# Save to file
output_path = "full_recipes_structured.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(json_recipes, f, indent=2)

print(f"✅ Saved {len(json_recipes)} recipes to {output_path}")


✅ Saved 2231141 recipes to full_recipes_structured.json


In [1]:
import pandas as pd
import ast
import json
import math
import os

INPUT_CSV = "full_dataset_with_estimated_prices.csv"
OUTPUT_DIR = "recipe_json_chunks"
CHUNK_SIZE = 1000

# Create output directory if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load dataset
df = pd.read_csv(INPUT_CSV)

# Parse lists
for col in ['ingredients', 'directions', 'ner']:
    if col in df.columns:
        df[col] = df[col].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) and x.startswith('[') else x)

# Convert each row to FinalRecipeSchema format
def build_recipe_json(row):
    """Convert one recipe row into the structured recipe dict.

    Args:
        row: Mapping-like row (a pandas Series or a dict). Missing keys fall
            back to defaults: "" for ``title``, [] for the list fields and
            0.0 for ``estimated_price_usd``.

    Returns:
        A dict with the keys ``title``, ``ingredients``, ``directions``,
        ``cuisine``, ``diet``, ``tags``, ``macro_estimate`` and
        ``cost_estimate``. List fields whose value is NaN/None become empty
        lists; ``.get(key, [])`` alone does not cover that case because the
        key exists and holds NaN.
    """
    def as_list(value):
        # Lists pass through; a bare non-empty string is kept as a single
        # entry; anything else (NaN from a missing CSV cell, None) becomes [].
        if isinstance(value, (list, tuple)):
            return list(value)
        if isinstance(value, str) and value.strip():
            return [value]
        return []

    title = row.get("title", "")
    # `x != x` is true only for NaN, which json.dump would write as invalid JSON.
    if title is None or (isinstance(title, float) and title != title):
        title = ""

    return {
        "title": title,
        "ingredients": [{"quantity": "", "unit": "", "name": item} for item in as_list(row.get("ingredients", []))],
        "directions": as_list(row.get("directions", [])),
        "cuisine": "unknown",
        "diet": "unknown",
        "tags": as_list(row.get("ner", [])) + ["unknown", "unknown"],
        "macro_estimate": {
            "calories": None,
            "protein": None,
            "carbs": None,
            "fat": None
        },
        "cost_estimate": row.get("estimated_price_usd", 0.0)
    }

json_recipes = df.apply(build_recipe_json, axis=1).tolist()

# Split and write chunks
# Each file receives up to CHUNK_SIZE consecutive recipes. File numbers start
# at 1 and are zero-padded to two digits, so names get longer once there are
# more than 99 chunks.
num_chunks = math.ceil(len(json_recipes) / CHUNK_SIZE)
chunk_starts = range(0, len(json_recipes), CHUNK_SIZE)
for chunk_number, start in enumerate(chunk_starts, start=1):
    chunk = json_recipes[start:start + CHUNK_SIZE]
    output_path = os.path.join(OUTPUT_DIR, f"recipes_chunk_{chunk_number:02d}.json")
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(chunk, f, indent=2)

print(f"✅ Done! Created {num_chunks} files in '{OUTPUT_DIR}/'")


✅ Done! Created 2232 files in 'recipe_json_chunks/'
