In [2]:
import ast

import pandas as pd

In [None]:
# Read the complete recipe dataset from disk into a DataFrame
dataset_path = "dataset/full_dataset.csv"
data = pd.read_csv(filepath_or_buffer=dataset_path)

In [None]:
from collections import Counter
import pandas as pd

# Flatten all NER lists and count occurrences.
# Each NER cell is a string-encoded Python list. ast.literal_eval parses the
# literal without executing arbitrary code, unlike eval, and matches how the
# same column is parsed when filtering healthy recipes.
all_ner = [
    ingredient.lower()
    for ner_list in data['NER']
    for ingredient in ast.literal_eval(ner_list)
]
ner_counts = Counter(all_ner)
print(f"Total number of NER ingredient occurrences: {len(all_ner)}")
print(f"Total number of unique NER ingredients: {len(ner_counts)}")

# Get the 1000 most common ingredients as (ingredient, count) pairs
top_1000 = ner_counts.most_common(1000)
top_1000_df = pd.DataFrame(top_1000, columns=['ingredient', 'count'])

# Save to CSV
top_1000_df.to_csv('dataset/1000_ingredients.csv', index=False)

In [None]:
# Load the 1000 most common ingredients and the glycemic index table.
# NOTE: dataset/gi.csv is written by the scraping cell, which must have been
# run at least once before this cell.
df_1000 = pd.read_csv('dataset/1000_ingredients.csv')
gi_df = pd.read_csv('dataset/gi.csv')

# Merge the glycemic index info into the 1000 ingredients dataframe;
# ingredients without an exact name match get a missing (NaN) gi
df_1000_gi = df_1000.merge(gi_df, on='ingredient', how='left')

# Build the name -> GI lookup once (first occurrence wins) instead of
# re-scanning gi_df for every unmatched ingredient
gi_by_name = (
    gi_df.drop_duplicates(subset='ingredient', keep='first')
    .set_index('ingredient')['gi']
    .to_dict()
)

# Find ingredients with missing glycemic index
missing_gi = df_1000_gi[df_1000_gi['gi'].isnull()]

# Try to match plural names by removing a trailing 's' (e.g. "bananas" -> "banana")
for idx, ingredient in missing_gi['ingredient'].items():
    # Names that pandas parsed as NaN are not strings; skip them
    if not isinstance(ingredient, str) or not ingredient.endswith('s'):
        continue
    singular = ingredient[:-1]
    if singular in gi_by_name:
        df_1000_gi.at[idx, 'gi'] = gi_by_name[singular]

# Recalculate missing_gi after attempting to match singular forms
missing_gi = df_1000_gi[df_1000_gi['gi'].isnull()]
print(f"Ingredients without glycemic index info: {len(missing_gi)}")

# Save the merged dataframe
df_1000_gi.to_csv('dataset/1000_ingredients_with_gi.csv', index=False)

In [None]:
import requests
from bs4 import BeautifulSoup
import csv

# Scrape the glycemic index chart and store it as dataset/gi.csv
url = "https://foodstruct.com/glycemic-index-chart"
# Without a timeout, requests can block indefinitely on an unresponsive server
resp = requests.get(url, timeout=30)
resp.raise_for_status()

soup = BeautifulSoup(resp.text, "html.parser")

tbody = soup.find("tbody")
if tbody is None:
    # Fail with a clear message instead of an AttributeError on None
    raise RuntimeError(f"No <tbody> found at {url}; the page layout may have changed")
rows = tbody.find_all("tr")

# Collected under its own name so the recipe DataFrame `data` used by other
# cells is not overwritten by a list of tuples
gi_rows = []
for tr in rows:
    tds = tr.find_all("td")
    if len(tds) < 3:
        continue

    # Name: taken from the second cell of the row
    name_cell = tds[1].get_text(strip=True)

    # Glycemic index: taken from the third cell of the row
    gi_cell = tds[2].get_text(strip=True)
    try:
        gi_value = int(gi_cell)
    except ValueError:
        # skip rows where GI is missing or not an integer
        continue

    gi_rows.append((name_cell, gi_value))

# Write to CSV with lower-cased names so they can be joined against the
# lower-cased NER ingredient names
with open("dataset/gi.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["ingredient", "gi"])
    writer.writerows((name.lower(), gi_value) for name, gi_value in gi_rows)

print(f"Extracted {len(gi_rows)} foods. Saved to dataset/gi.csv")


In [3]:
# Keep only the ingredients whose glycemic index is present and strictly positive
df_1000_gi = pd.read_csv('dataset/1000_ingredients_with_gi.csv')
gi_column = df_1000_gi['gi']
has_valid_gi = gi_column.notnull() & (gi_column > 0)
with_gi = df_1000_gi[has_valid_gi]

# Persist the filtered subset to its own file and report how many rows it holds
with_gi.to_csv('dataset/1000_ingredients_with_valid_gi.csv', index=False)
print(f"Saved {len(with_gi)} ingredients with valid GI to 'dataset/1000_ingredients_with_valid_gi.csv'")

Saved 123 ingredients with valid GI to 'dataset/1000_ingredients_with_valid_gi.csv'


In [None]:
import pandas as pd
import ast

# 1) Read the merged ingredient table (columns "ingredient" and "gi")
df_1000_gi = pd.read_csv('dataset/1000_ingredients_with_gi.csv')

# 2) Coerce gi to integers (unparseable or missing values become 0), then map
#    each lower-cased ingredient name to its gi
gi_numeric = pd.to_numeric(df_1000_gi['gi'], errors='coerce')
df_1000_gi['gi'] = gi_numeric.fillna(0).astype(int)
ingredient_names = df_1000_gi['ingredient'].str.lower()
gi_dict = {name: gi for name, gi in zip(ingredient_names, df_1000_gi['gi'])}

# 3) Read the full recipe table; its 'NER' column holds string-encoded Python lists
data = pd.read_csv('dataset/full_dataset.csv')

# 4) Decode every 'NER' string into a real list, once, in a new column
data['ner_list'] = data['NER'].apply(ast.literal_eval)

# 5) Define a robust “is_healthy” that:
#    a) immediately rejects if ner_list is empty
#    b) rejects if any ingredient has missing/invalid GI (gi ≤ 0)
#    c) otherwise, checks if at least `min_low_gi_fraction` (default 50%) of
#       the ingredients have gi < `low_gi_cutoff` (default 55)
def is_healthy(ner_list, min_low_gi_fraction=0.50, low_gi_cutoff=55, gi_lookup=None):
    """
    Decide whether a recipe counts as healthy based on its ingredients' GI.

    Parameters:
      - ner_list            : list of ingredient name strings
      - min_low_gi_fraction : minimum share of ingredients that must be low-GI
                              (default 0.50)
      - low_gi_cutoff       : an ingredient is low-GI when its gi is strictly
                              below this value (default 55)
      - gi_lookup           : mapping of lower-cased ingredient name → gi;
                              defaults to the notebook-level `gi_dict`

    Returns True only when the list is non-empty, every ingredient has a
    known gi > 0, and the low-GI share reaches `min_low_gi_fraction`.
    """
    if gi_lookup is None:
        gi_lookup = gi_dict

    # If no ingredients at all, reject
    if not ner_list:
        return False

    low_count = 0
    for ing in ner_list:
        # Lowercase for consistent lookup; unknown ingredients map to 0
        gi = gi_lookup.get(ing.lower(), 0)
        # If gi ≤ 0, treat that as “no valid GI”—reject immediately
        if gi <= 0:
            return False
        if gi < low_gi_cutoff:
            low_count += 1

    # Every ingredient had a valid gi, so the denominator is the full list
    return (low_count / len(ner_list)) >= min_low_gi_fraction

# 6) Apply the filter to every parsed ingredient list
data['is_healthy'] = data['ner_list'].apply(is_healthy)
healthy_recipes = data[data['is_healthy']]

# The message states 50% to match the 0.50 threshold is_healthy applies and
# the "50pct" output file name (it previously claimed 70%)
print(f"Number of healthy recipes (all GIs known, ≥50% low‐GI): {len(healthy_recipes)}")

# 7) Save out only the columns you care about
healthy_recipes[['title', 'ingredients', 'directions', 'NER']].to_csv(
    'dataset/healthy_recipes_50pct.csv', index=False
)
healthy_recipes

Number of healthy recipes (all GIs known, ≥70% low‐GI): 1631


Unnamed: 0.1,Unnamed: 0,title,ingredients,directions,link,source,NER,ner_list,is_healthy
2808,2808,Big Red Ice Cream,"[""1 can Eagle Brand milk"", ""1 (3-liter) bottle...","[""Put Eagle Brand milk in ice cream freezer, t...",www.cookbooks.com/Recipe-Details.aspx?id=671765,Gathered,"[""milk""]",[milk],True
6190,6190,Danish Pickles,"[""Desired amount of cucumbers""]","[""Slice cucumbers very thin and sprinkle with ...",www.cookbooks.com/Recipe-Details.aspx?id=790326,Gathered,"[""cucumbers""]",[cucumbers],True
6829,6829,Frozen Chocolate Bananas,"[""1/4 c. peanuts, chopped"", ""1/2 c. milk"", ""1 ...","[""Chop peanuts into bowl."", ""Put foil on a pla...",www.cookbooks.com/Recipe-Details.aspx?id=555533,Gathered,"[""peanuts"", ""milk"", ""banana"", ""chocolate""]","[peanuts, milk, banana, chocolate]",True
12624,12624,Puddles(Candy),"[""1 can Eagle Brand milk"", ""2 large Hershey ba...","[""Melt together the milk and Hershey bars. Tak...",www.cookbooks.com/Recipe-Details.aspx?id=549972,Gathered,"[""milk"", ""pecans""]","[milk, pecans]",True
12760,12760,Fudgsicles,"[""1 pkg. instant pudding (chocolate)"", ""2 1/2 ...","[""Mix as directed on pudding package."", ""Pour ...",www.cookbooks.com/Recipe-Details.aspx?id=784838,Gathered,"[""chocolate"", ""milk""]","[chocolate, milk]",True
...,...,...,...,...,...,...,...,...,...
2221045,2221045,Canning Succotash Recipe,"[""3 lb corn in husks - (to 6 lbs) to make 1 qt...","[""Choose the freshest corn possible."", ""Select...",cookeatshare.com/recipes/canning-succotash-98975,Recipes1M,"[""corn"", ""beans"", ""green beans""]","[corn, beans, green beans]",True
2221126,2221126,Pineapple and Banana Smoothie,"[""2 large ripe bananas, cut into pieces"", ""4 t...","[""Blend the bananas, pour in the pineapple jui...",www.food.com/recipe/pineapple-and-banana-smoot...,Recipes1M,"[""bananas"", ""yogurt"", ""pineapple juice""]","[bananas, yogurt, pineapple juice]",True
2221432,2221432,Spaghetti With Marinara Sauce,"[""1 pound spaghetti"", ""2 cups tomato sauce"", ""...","[""Cook spaghetti in a 6- to 8-quart pot of boi...",www.epicurious.com/recipes/food/views/spaghett...,Recipes1M,"[""spaghetti"", ""tomato sauce""]","[spaghetti, tomato sauce]",True
2228744,2228744,Chocolate Covered Strawberries,"[""2 packages fresh strawberry's"", ""1 packages ...","[""Bring a pot of water to boil."", ""Than place ...",cookpad.com/us/recipes/351835-chocolate-covere...,Recipes1M,"[""chocolate""]",[chocolate],True


In [6]:
# Load the healthy recipes and the ingredient GI table from disk
healthy_recipes_csv = pd.read_csv('dataset/healthy_recipes_50pct.csv')
df_1000_gi = pd.read_csv('dataset/1000_ingredients_with_gi.csv')

# Build the ingredient -> GI lookup once (first occurrence wins) instead of
# filtering the whole DataFrame for every single ingredient
gi_by_ingredient = (
    df_1000_gi.drop_duplicates(subset='ingredient', keep='first')
    .set_index('ingredient')['gi']
    .to_dict()
)

# Get the first 10 recipes
first_10 = healthy_recipes_csv.head(10)

# For each recipe, print the NER ingredients and their glycemic index
for idx, row in first_10.iterrows():
    print(f"Recipe: {row['title']}")
    # NER is a string-encoded list; literal_eval parses it without executing code
    ner_ings = ast.literal_eval(row['NER']) if isinstance(row['NER'], str) else row['NER']
    for ing in ner_ings:
        gi_val = gi_by_ingredient.get(ing.lower(), 'N/A')
        print(f"  {ing}: GI = {gi_val}")
    print('-' * 40)

Recipe: Phylis' Pineapple-Banana Salad
  pineapple: GI = 66.0
  bananas: GI = 48.0
----------------------------------------
Recipe: Fresh Cranberry-Orange Relish
  orange: GI = 45.0
  cranberries: GI = 45.0
  sugar: GI = 65.0
----------------------------------------
Recipe: No Bake Peanut Butter Cookies
  milk: GI = 31.0
  oatmeal: GI = 79.0
  peanut butter: GI = 14.0
  honey: GI = 61.0
----------------------------------------
Recipe: Ambrosia
  oranges: GI = 45.0
  pineapple: GI = 66.0
  coconut: GI = 59.0
  banana: GI = 48.0
----------------------------------------
Recipe: Big Red Ice Cream
  milk: GI = 31.0
----------------------------------------
Recipe: Orange Sherbet Surprise
  pineapple: GI = 66.0
  milk: GI = 31.0
----------------------------------------
Recipe: No Bake Peanut Butter Cookies
  milk: GI = 31.0
  peanut butter: GI = 14.0
  crackers: GI = 63.0
  sugar: GI = 65.0
----------------------------------------
Recipe: Danish Pickles
  cucumbers: GI = 21.0
----------------

In [11]:
import pandas as pd
import ast

# 1) Define the formatter that turns one recipe row into a single tagged training string

def format_recipe_from_columns(example):
    """
    Turn one recipe row into a single control-token-tagged training string.

    Expected keys of `example`:
      - title       : string
      - ingredients : stringified Python list of full ingredient lines
      - directions  : stringified Python list of step instructions
      - NER         : stringified Python list of ingredient names used as inputs

    Returns a dict {'text': ...}. Every item is stripped and nothing is
    inserted between tags, which keeps the token count as low as possible.
    A section whose list is empty is left out entirely.
    """
    def _tagged(items, opener, separator, closer):
        # Wrap the stripped items in opener/closer, joined by separator
        if not items:
            return ""
        return opener + separator.join(item.strip() for item in items) + closer

    recipe_title = example["title"].strip()

    # Decode the stringified lists into real Python lists
    full_lines = ast.literal_eval(example["ingredients"])
    ner_names = ast.literal_eval(example["NER"])
    steps = ast.literal_eval(example["directions"])

    # Sections appear in this exact order: inputs, ingredients, instructions, title
    sections = [
        "<RECIPE_START>",
        _tagged(ner_names, "<INPUT_START>", "<NEXT_INPUT>", "<INPUT_END>"),
        _tagged(full_lines, "<INGR_START>", "<NEXT_INGR>", "<INGR_END>"),
        _tagged(steps, "<INSTR_START>", "<NEXT_INSTR>", "<INSTR_END>"),
        f"<TITLE_START>{recipe_title}<TITLE_END>",
        "<RECIPE_END>",
    ]

    return {"text": "".join(sections)}

# 2) Load the recipes to format. `df` was never defined in this notebook, so
#    the cell failed on a fresh kernel.
#    NOTE(review): the source file is inferred from the output file name and
#    the previewed recipes — confirm this is the intended input.
df = pd.read_csv("dataset/healthy_recipes_50pct.csv")

# 3) Apply formatting to each row; every row yields a {"text": ...} dict
df_formatted = df.apply(format_recipe_from_columns, axis=1)
df_texts = pd.DataFrame(df_formatted.tolist())

# 4) Preview (optional)
for i, row in df_texts.head(3).iterrows():
    print(row["text"])
    print("-" * 80)

# 5) Save to a new CSV ready for training
df_texts.to_csv("dataset/healthy_recipes_50pct_with_token.csv", index=False)
print("Saved formatted recipes to 'dataset/healthy_recipes_50pct_with_token.csv'")


<RECIPE_START><INPUT_START>pineapple<NEXT_INPUT>bananas<INPUT_END><INGR_START>1 large can chunk pineapple<NEXT_INGR>4 to 5 bananas<INGR_END><INSTR_START>Drain pineapple and reserve juice.<NEXT_INSTR>Cut bananas in chunks like pineapple.<INSTR_END><TITLE_START>Phylis' Pineapple-Banana Salad<TITLE_END><RECIPE_END>
--------------------------------------------------------------------------------
<RECIPE_START><INPUT_START>orange<NEXT_INPUT>cranberries<NEXT_INPUT>sugar<INPUT_END><INGR_START>1 navel orange, unpeeled and cut into small chunks<NEXT_INGR>1 bag cranberries (12 oz.)<NEXT_INGR>3/4 c. granulated sugar<INGR_END><INSTR_START>Process all ingredients in food processor using on/off turns until finely chopped. Cover and chill at least 1 hour.<NEXT_INSTR>Makes 3 cups; 12 servings.<INSTR_END><TITLE_START>Fresh Cranberry-Orange Relish<TITLE_END><RECIPE_END>
--------------------------------------------------------------------------------
<RECIPE_START><INPUT_START>milk<NEXT_INPUT>oatmeal<NEX

In [15]:
from transformers import AutoTokenizer

model_name = "mbien/recipenlg"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Single source of truth for the cutoff, so the filter, the output file name
# and the log message cannot drift apart (the comment and message previously
# said 512 while the filter and the file actually written used 400)
max_tokens = 400
output_path = f"dataset/healthy_recipes_50pct_max{max_tokens}.csv"

# Load the formatted recipes
df_token = pd.read_csv("dataset/healthy_recipes_50pct_with_token.csv")

# Compute token lengths, counting the tokenizer's special tokens
token_lengths = df_token["text"].apply(lambda x: len(tokenizer.encode(x, add_special_tokens=True)))
max_length = token_lengths.max()

print(f"Maximum token length: {max_length}")

# Filter recipes with token length ≤ max_tokens
df_token['token_length'] = token_lengths
filtered_df = df_token[df_token['token_length'] <= max_tokens].drop(columns=['token_length'])

# Save to new CSV
filtered_df.to_csv(output_path, index=False)
print(f"Saved {len(filtered_df)} recipes to '{output_path}'")


Maximum token length: 729
Saved 3371 recipes to 'dataset/healthy_recipes_50pct_max512.csv'
