In [2]:
import pandas as pd
# Load the CSV file
df = pd.read_csv('/Users/yanyi/Desktop/IE 7374/Project/project datasets/RecipeNLG_dataset.csv')

# Display the first 5 rows
print(df.head())


   Unnamed: 0                  title  \
0           0    No-Bake Nut Cookies   
1           1  Jewell Ball'S Chicken   
2           2            Creamy Corn   
3           3          Chicken Funny   
4           4   Reeses Cups(Candy)     

                                         ingredients  \
0  ["1 c. firmly packed brown sugar", "1/2 c. eva...   
1  ["1 small jar chipped beef, cut up", "4 boned ...   
2  ["2 (16 oz.) pkg. frozen corn", "1 (8 oz.) pkg...   
3  ["1 large whole chicken", "2 (10 1/2 oz.) cans...   
4  ["1 c. peanut butter", "3/4 c. graham cracker ...   

                                          directions  \
0  ["In a heavy 2-quart saucepan, mix brown sugar...   
1  ["Place chipped beef on bottom of baking dish....   
2  ["In a slow cooker, combine all ingredients. C...   
3  ["Boil and debone chicken.", "Put bite size pi...   
4  ["Combine first four ingredients and press in ...   

                                              link    source  \
0   www.cookbooks.com

In [None]:
import json
df["NER"] = df["NER"].apply(json.loads)

In [None]:
fridge_items = {"milk", "cheese", "cream cheese", "butter", "yogurt", "sour cream", "egg", "tomato", "onion", "lettuce", "spinach", "cabbage", "zucchini", "carrot", "garlic", "cucumber", "pepper", "celery", "apple", "grapes", "corn", "mushroom"}

In [None]:
# Filter recipes with 3-7 ingredients and ≥2 items in fridge list
def is_fridge_friendly(ingredients):
    match_count = sum(item.lower() in fridge_items for item in ingredients)
    return 3 <= len(ingredients) <= 7 and match_count >= 2

In [None]:
# Filter the dataset and drop unnecessary columns
df_subset = (
    df[df["NER"].apply(is_fridge_friendly)]
    .drop(columns=["Unnamed: 0", "link", "source"])
    .sample(frac=0.05, random_state=42)  # Sample 5% of the dataset
)

In [7]:
print("Original DataFrame shape:", len(df))
print("Filtered DataFrame shape:", len(df_subset))
print(df_subset[["title", "NER"]].head())

Original DataFrame shape: 2231142
Filtered DataFrame shape: 9158
                             title  \
1700466  Nif's Chicken and Peaches   
365097           Chicken Casserole   
492504    Chocolate Cookie Pudding   
2183707       Mom's Salmon Patties   
137318             No Name Chicken   

                                                       NER  
1700466  [olive oil, onion, garlic, fresh ginger, chick...  
365097   [chicken, cream of chicken soup, water chestnu...  
492504   [Oreo cookies, powdered sugar, milk, cream che...  
2183707  [salmon, egg, onion, green bell pepper, saltin...  
137318   [chicken breasts, Italian dressing mix, cream ...  


In [8]:
# Count how many recipes have 3–5 ingredients
df["ner_len"] = df["NER"].apply(len)
print(df["ner_len"].value_counts().sort_index())

ner_len
0         573
1        7047
2       31812
3       89827
4      150979
        ...  
98          1
219         1
276         1
328         1
402         1
Name: count, Length: 78, dtype: int64


In [None]:
df_subset.to_csv("fridge_friendly_subset.csv", index=False)

In [None]:
# Join NER and title into plain training lines
df_subset["text"] = df_subset["NER"].apply(lambda x: ", ".join(x)) + " → " + df_subset["title"]

# Save text data to file for later tokenization
save_path = "/Users/yanyi/Desktop/IE 7374/Project/RecipeNLG_dataset_gpt_in_1pct.txt"
df_subset["text"].to_csv(save_path, index=False, header=False)

In [None]:
from transformers import (
    GPT2LMHeadModel, GPT2Tokenizer,
    DataCollatorForLanguageModeling,
    Trainer, TrainingArguments
)
from datasets import load_dataset

# Load tokenizer & model
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Set eos token as pad token
tokenizer.pad_token = tokenizer.eos_token

print("Load your formatted text dataset")
dataset = load_dataset("text", data_files={
    "train": "/Users/yanyi/Desktop/IE 7374/Project/RecipeNLG_dataset_gpt_in_1pct.txt"
})

# Tokenize the text
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True, max_length=512, padding="max_length")

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])


data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

training_args = TrainingArguments(
    output_dir="./gpt2-recipes",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=100
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
)

print("Starting fine-tuning")
trainer.train()

print("Saving model and tokenizer")
model.save_pretrained("./gpt2-recipes-final")
tokenizer.save_pretrained("./gpt2-recipes-final")

Load your formatted text dataset


Generating train split: 9158 examples [00:00, 574161.97 examples/s]
Map: 100%|██████████| 9158/9158 [00:03<00:00, 3043.68 examples/s]


Starting fine-tuning


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
100,2.918
200,2.6703
300,2.5922
400,2.5054
500,2.5096
600,2.4864
700,2.4349
800,2.4995
900,2.35
1000,2.4227


Saving model and tokenizer


('./gpt2-recipes-final/tokenizer_config.json',
 './gpt2-recipes-final/special_tokens_map.json',
 './gpt2-recipes-final/vocab.json',
 './gpt2-recipes-final/merges.txt',
 './gpt2-recipes-final/added_tokens.json')