In [1]:
import pandas as pd

In [2]:
# Read the full recipe corpus from disk into a DataFrame.
dataset_path = "dataset/full_dataset.csv"
data = pd.read_csv(filepath_or_buffer=dataset_path)

In [3]:
import ast
from collections import Counter

import pandas as pd

# Flatten all NER lists and count occurrences, skipping ingredients with length 1.
# Each NER cell is a stringified Python list. ast.literal_eval only parses
# literals, whereas eval() would execute any expression found in the CSV.
# NOTE: the length filter strips whitespace but the counted key is only
# lower-cased, so names with stray surrounding spaces are counted separately.
all_ner = [
    ingredient.lower()
    for ner_list in data['NER']
    for ingredient in ast.literal_eval(ner_list)
    if len(ingredient.strip()) > 1
]
ner_counts = Counter(all_ner)
print(f"Total number of NER ingredient occurrences (len>1): {len(all_ner)}")
print(f"Total number of unique NER ingredients (len>1): {len(ner_counts)}")

# Get the 1000 most common ingredients
top_1000 = ner_counts.most_common(1000)
top_1000_df = pd.DataFrame(top_1000, columns=['ingredient', 'count'])

# Save to CSV
top_1000_df.to_csv('dataset/ingredients_1000.csv', index=False)

Total number of NER ingredient occurrences (len>1): 18860448
Total number of unique NER ingredients (len>1): 199014


In [4]:
import requests
from bs4 import BeautifulSoup
import csv

# Scrape the glycemic-index chart and store (ingredient, gi) pairs as CSV.
url = "https://foodstruct.com/glycemic-index-chart"
# Without an explicit timeout, requests can block forever on a stalled connection.
resp = requests.get(url, timeout=30)
resp.raise_for_status()

soup = BeautifulSoup(resp.text, "html.parser")

tbody = soup.find("tbody")
if tbody is None:
    # Fail with a clear message instead of an AttributeError on None.
    raise RuntimeError(f"No <tbody> element found at {url}; the page layout may have changed.")
rows = tbody.find_all("tr")

# Collected under its own name so the recipe DataFrame bound to `data`
# by the loading cell is not overwritten by this cell.
gi_rows = []
for tr in rows:
    tds = tr.find_all("td")
    if len(tds) < 3:
        continue

    # Name: second cell of the row
    name_cell = tds[1].get_text(strip=True)

    # Glycemic index: third cell of the row
    gi_cell = tds[2].get_text(strip=True)
    try:
        gi_value = int(gi_cell)
    except ValueError:
        # skip rows where GI is missing or not an integer
        continue

    gi_rows.append((name_cell, gi_value))

# Write to CSV (ingredient names lower-cased)
with open("dataset/ingredients_gi.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["ingredient", "gi"])
    writer.writerows((name.lower(), gi_value) for name, gi_value in gi_rows)

print(f"Extracted {len(gi_rows)} foods.")


Extracted 600 foods.


In [7]:
import pandas as pd

# Load both CSVs
df_1000 = pd.read_csv('dataset/ingredients_1000.csv')
gi_df = pd.read_csv('dataset/ingredients_gi.csv')

# Merge and keep only rows with both 'count' and 'gi' present (non-null)
merged = df_1000.merge(gi_df, on='ingredient', how='inner')
merged = merged[merged['gi'].notnull() & merged['count'].notnull()]

# Build a lookup for GI
gi_lookup = dict(zip(gi_df['ingredient'], gi_df['gi']))

# Find ingredients in top 1000 not in GI, but their singular/plural is.
# The heuristic is deliberately naive: drop a trailing 's', or append one.
missing = df_1000[~df_1000['ingredient'].isin(gi_df['ingredient'])].copy()
added_rows = []
# Iterate name and count together instead of re-scanning df_1000 per ingredient.
for ing, count in zip(missing['ingredient'], missing['count']):
    if not isinstance(ing, str):
        # read_csv turns tokens such as "nan"/"null" into NaN; skip them
        continue
    variant = ing[:-1] if ing.endswith('s') else ing + 's'
    if variant in gi_lookup:
        added_rows.append({'ingredient': ing, 'count': count, 'gi': gi_lookup[variant]})

# Add these rows to merged
if added_rows:
    merged = pd.concat([merged, pd.DataFrame(added_rows)], ignore_index=True)

# Save the filtered dataframe
merged.to_csv('dataset/ingredients.csv', index=False)
print(f"Saved {len(merged)} ingredients with both count and GI to 'dataset/ingredients.csv'")


Saved 180 ingredients with both count and GI to 'dataset/ingredients.csv'


In [8]:
import pandas as pd
import ast

# 1) Read the ingredient table that carries both count and GI (names are lowercase)
df_1000_gi = pd.read_csv('dataset/ingredients.csv')

# 2) Force GI to integers (anything unparseable becomes 0), then map ingredient → gi
gi_numeric = pd.to_numeric(df_1000_gi['gi'], errors='coerce')
df_1000_gi['gi'] = gi_numeric.fillna(0).astype(int)
gi_dict = dict(zip(df_1000_gi['ingredient'], df_1000_gi['gi']))

# 3) Read the recipe table
data = pd.read_csv('dataset/full_dataset.csv')

# 4) Turn each stringified `NER` cell into a real Python list
ner_as_lists = data['NER'].apply(ast.literal_eval)
data['ner_list'] = ner_as_lists

# 5) Define “is_healthy”
def is_healthy(ner_list, gi_lookup=None, low_gi_cutoff=55, min_low_fraction=0.5):
    """Decide whether a recipe counts as "healthy" based on ingredient GI values.

    A recipe qualifies when every ingredient has a known, positive GI and the
    share of ingredients whose GI is strictly below ``low_gi_cutoff`` is at
    least ``min_low_fraction``.

    Args:
        ner_list: Iterable of ingredient name strings; names are lower-cased
            before lookup.
        gi_lookup: Mapping of lowercase ingredient name to GI. When ``None``,
            the notebook-level ``gi_dict`` is used.
        low_gi_cutoff: Exclusive upper bound for a "low GI" ingredient.
        min_low_fraction: Minimum fraction of low-GI ingredients required.

    Returns:
        bool: ``True`` if the recipe passes; ``False`` for an empty list, for
        any ingredient missing from the lookup, or for any GI <= 0.
    """
    if not ner_list:
        return False
    lookup = gi_dict if gi_lookup is None else gi_lookup
    low_count = 0
    for ing in ner_list:
        gi = lookup.get(ing.lower(), 0)
        if gi <= 0:
            # Unknown ingredient (or a GI coerced to 0) disqualifies the recipe.
            return False
        # NOTE(review): the comparison is strict, so a GI of exactly the cutoff
        # is not counted as low; confirm this matches the intended definition.
        if gi < low_gi_cutoff:
            low_count += 1
    # Every ingredient had a valid GI here, so len(ner_list) is the denominator.
    return (low_count / len(ner_list)) >= min_low_fraction

# 6) Flag every recipe, then keep only those that passed
data['is_healthy'] = data['ner_list'].apply(is_healthy)
healthy_recipes = data.loc[data['is_healthy']]

print(f"Number of healthy recipes (all GIs known, ≥50% low‐GI): {len(healthy_recipes)}")

# 7) Export just the columns needed downstream
export_columns = ['title', 'ingredients', 'directions', 'NER']
healthy_recipes[export_columns].to_csv('dataset/h_recipes_50pct.csv', index=False)
healthy_recipes


Number of healthy recipes (all GIs known, ≥50% low‐GI): 3408


Unnamed: 0.1,Unnamed: 0,title,ingredients,directions,link,source,NER,ner_list,is_healthy
62,62,Phylis' Pineapple-Banana Salad,"[""1 large can chunk pineapple"", ""4 to 5 bananas""]","[""Drain pineapple and reserve juice."", ""Cut ba...",www.cookbooks.com/Recipe-Details.aspx?id=682439,Gathered,"[""pineapple"", ""bananas""]","[pineapple, bananas]",True
1122,1122,Fresh Cranberry-Orange Relish,"[""1 navel orange, unpeeled and cut into small ...","[""Process all ingredients in food processor us...",www.cookbooks.com/Recipe-Details.aspx?id=719301,Gathered,"[""orange"", ""cranberries"", ""sugar""]","[orange, cranberries, sugar]",True
1182,1182,No Bake Peanut Butter Cookies,"[""1 c. dry milk"", ""1 c. oatmeal"", ""1/2 c. pean...","[""Thoroughly mix everything together."", ""Shape...",www.cookbooks.com/Recipe-Details.aspx?id=748246,Gathered,"[""milk"", ""oatmeal"", ""peanut butter"", ""honey""]","[milk, oatmeal, peanut butter, honey]",True
1723,1723,Ambrosia,"[""8 small navel oranges"", ""1 ripe pineapple, p...","[""Cut peel and white pith from oranges. Slice ...",www.cookbooks.com/Recipe-Details.aspx?id=226585,Gathered,"[""oranges"", ""pineapple"", ""coconut"", ""banana""]","[oranges, pineapple, coconut, banana]",True
2808,2808,Big Red Ice Cream,"[""1 can Eagle Brand milk"", ""1 (3-liter) bottle...","[""Put Eagle Brand milk in ice cream freezer, t...",www.cookbooks.com/Recipe-Details.aspx?id=671765,Gathered,"[""milk""]",[milk],True
...,...,...,...,...,...,...,...,...,...
2227280,2227280,Party Munch Mix,"[""1 pkg. (170 g) Back to Nature 37% Cashew Nut...","[""Combine ingredients.""]",www.kraftrecipes.com/recipes/party-munch-mix-1...,Recipes1M,"[""Cashews"", ""Crackers""]","[Cashews, Crackers]",True
2228744,2228744,Chocolate Covered Strawberries,"[""2 packages fresh strawberry's"", ""1 packages ...","[""Bring a pot of water to boil."", ""Than place ...",cookpad.com/us/recipes/351835-chocolate-covere...,Recipes1M,"[""chocolate""]",[chocolate],True
2230006,2230006,Banana Pops,"[""4 large bananas"", ""2 cups fruit-flavored yog...","[""Peel the bananas, leave whole, and dip them ...",www.foodnetwork.com/recipes/cat-cora/banana-po...,Recipes1M,"[""bananas"", ""yogurt"", ""cereal""]","[bananas, yogurt, cereal]",True
2230682,2230682,Easy Peanut Butter Shake,"[""1 cup milk"", ""1/2 cup peanut butter"", ""10 cu...","[""In a blender mix the milk, peanut butter and...",cookpad.com/us/recipes/349160-easy-peanut-butt...,Recipes1M,"[""milk"", ""peanut butter"", ""sugar""]","[milk, peanut butter, sugar]",True


In [10]:
import ast

# Reload the saved healthy recipes and the ingredient/GI table from disk
healthy_recipes = pd.read_csv('dataset/h_recipes_50pct.csv')
ingredients_all = pd.read_csv('dataset/ingredients.csv')


# Preview only the first 4 recipes
preview_recipes = healthy_recipes.head(4)

# For each recipe, print the NER ingredients and their glycemic index
for idx, row in preview_recipes.iterrows():
    print(f"Recipe: {row['title']}")
    # NER is a stringified list after the CSV round trip; parse literals only.
    ner_ings = ast.literal_eval(row['NER']) if isinstance(row['NER'], str) else row['NER']
    for ing in ner_ings:
        ing_lower = ing.lower()
        # The mask must come from the frame being indexed, not from a frame
        # created in another cell.
        gi_row = ingredients_all[ingredients_all['ingredient'] == ing_lower]
        gi_val = gi_row['gi'].values[0] if not gi_row.empty else 'N/A'
        print(f"  {ing}: GI = {gi_val}")
    print('-' * 40)

Recipe: Phylis' Pineapple-Banana Salad
  pineapple: GI = 66
  bananas: GI = 48
----------------------------------------
Recipe: Fresh Cranberry-Orange Relish
  orange: GI = 45
  cranberries: GI = 45
  sugar: GI = 65
----------------------------------------
Recipe: No Bake Peanut Butter Cookies
  milk: GI = 31
  oatmeal: GI = 79
  peanut butter: GI = 14
  honey: GI = 61
----------------------------------------
Recipe: Ambrosia
  oranges: GI = 45
  pineapple: GI = 66
  coconut: GI = 59
  banana: GI = 48
----------------------------------------


In [11]:
import pandas as pd
import ast

# 1) Use the healthy_recipes DataFrame loaded earlier

def format_recipe_from_columns(example):
    """Serialize one recipe row into a single control-token-delimited string.

    ``example`` must support key access for "title", "ingredients", "NER" and
    "directions"; the last three are stringified Python lists. The sections
    are emitted in the order inputs (NER), ingredients, instructions, title,
    wrapped in <RECIPE_START>/<RECIPE_END>. An empty list yields an empty
    section rather than an empty pair of tags.

    Returns:
        dict: ``{"text": <formatted recipe string>}``.
    """
    def tagged_section(entries, opener, separator, closer):
        # An empty section is omitted entirely instead of emitting bare tags.
        if not entries:
            return ""
        stripped = (entry.strip() for entry in entries)
        return opener + separator.join(stripped) + closer

    recipe_title = example["title"].strip()

    # The list columns arrive as strings; parse them into real lists.
    ingredient_lines = ast.literal_eval(example["ingredients"])
    ner_names = ast.literal_eval(example["NER"])
    direction_steps = ast.literal_eval(example["directions"])

    pieces = [
        "<RECIPE_START>",
        tagged_section(ner_names, "<INPUT_START>", "<NEXT_INPUT>", "<INPUT_END>"),
        tagged_section(ingredient_lines, "<INGR_START>", "<NEXT_INGR>", "<INGR_END>"),
        tagged_section(direction_steps, "<INSTR_START>", "<NEXT_INSTR>", "<INSTR_END>"),
        "<TITLE_START>" + recipe_title + "<TITLE_END>",
        "<RECIPE_END>",
    ]

    # Concatenate with no separators so no extra whitespace is introduced.
    return {"text": "".join(pieces)}

# Format every recipe row; each result is a {"text": ...} record
df_formatted = healthy_recipes.apply(func=format_recipe_from_columns, axis=1)

# Expand the records into a one-column DataFrame
formatted_records = df_formatted.tolist()
df_texts = pd.DataFrame(formatted_records)

# Persist the training-ready text
formatted_csv_path = "dataset/h_recipes_50pct_token.csv"
df_texts.to_csv(formatted_csv_path, index=False)
print("Saved formatted recipes")


Saved formatted recipes


In [None]:
from transformers import AutoTokenizer

model_name = "mbien/recipenlg"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Recipes longer than this many tokens are dropped.
MAX_TOKENS = 512
# Single source of truth for the output path, so the log line cannot drift
# from the file actually written.
filtered_csv_path = "dataset/h_recipes_50pct_token_max512.csv"

# Load the formatted recipes
df_token = pd.read_csv("dataset/h_recipes_50pct_token.csv")

# Compute token lengths
token_lengths = df_token["text"].apply(lambda x: len(tokenizer.encode(x, add_special_tokens=True)))
max_length = token_lengths.max()

print(f"Maximum token length: {max_length}")

# Filter recipes with token length ≤ MAX_TOKENS
df_token['token_length'] = token_lengths
filtered_df = df_token[df_token['token_length'] <= MAX_TOKENS].drop(columns=['token_length'])

# Save to new CSV
filtered_df.to_csv(filtered_csv_path, index=False)
print(f"Saved {len(filtered_df)} recipes to '{filtered_csv_path}'")


  from .autonotebook import tqdm as notebook_tqdm


Maximum token length: 729
Saved 3390 recipes to 'dataset/healthy_recipes_50pct_max512.csv'
