In [1]:
import pandas as pd

# Load the CSV file
df = pd.read_csv("./RecipeNLG/RecipeNLG_dataset.csv")

# Display the first 5 rows
print(df.head())


   Unnamed: 0                  title  \
0           0    No-Bake Nut Cookies   
1           1  Jewell Ball'S Chicken   
2           2            Creamy Corn   
3           3          Chicken Funny   
4           4   Reeses Cups(Candy)     

                                         ingredients  \
0  ["1 c. firmly packed brown sugar", "1/2 c. eva...   
1  ["1 small jar chipped beef, cut up", "4 boned ...   
2  ["2 (16 oz.) pkg. frozen corn", "1 (8 oz.) pkg...   
3  ["1 large whole chicken", "2 (10 1/2 oz.) cans...   
4  ["1 c. peanut butter", "3/4 c. graham cracker ...   

                                          directions  \
0  ["In a heavy 2-quart saucepan, mix brown sugar...   
1  ["Place chipped beef on bottom of baking dish....   
2  ["In a slow cooker, combine all ingredients. C...   
3  ["Boil and debone chicken.", "Put bite size pi...   
4  ["Combine first four ingredients and press in ...   

                                              link    source  \
0   www.cookbooks.com

In [2]:
# Create a 1% sample
sample_1 = df.sample(frac=0.01, random_state=42)
sample_1.to_csv("RecipeNLG/RecipeNLG_dataset_1pct.csv", index=False)

# Create a 5% sample
sample_5 = df.sample(frac=0.05, random_state=42)
sample_5.to_csv("RecipeNLG/RecipeNLG_dataset_5pct.csv", index=False)

print("Samples saved successfully.")

Samples saved successfully.


In [3]:
sample_1.count()

Unnamed: 0     22311
title          22311
ingredients    22311
directions     22311
link           22311
source         22311
NER            22311
dtype: int64

In [4]:
sample_1.dtypes

Unnamed: 0      int64
title          object
ingredients    object
directions     object
link           object
source         object
NER            object
dtype: object

In [5]:
import json

# Columns that look like stringified lists
list_columns = ["ingredients", "directions", "NER"]

for col in list_columns:
    sample_1[col] = sample_1[col].apply(json.loads)

In [6]:
sample_1['NER'].head()

2015528    [flank steak, green onions, red wine, soy sauc...
1608734    [rosemary, thyme, bay leaves, paprika, pepper,...
778500            [carrots, butter, brown sugar, lemon rind]
1334975    [Flour, Salt, Baking Powder, Sugar, Crisco, eg...
116562                            [thin pretzels, margarine]
Name: NER, dtype: object

In [7]:
# Print NER items from first 5 rows (or any n)
for i in range(5):
    print(f"NER[{i}]:", end=" ")
    
    for j in range(len(sample_1['NER'].iloc[i])):
        print(sample_1['NER'].iloc[i][j], end=", ")
    
    print()  # Newline after each row

NER[0]: flank steak, green onions, red wine, soy sauce, salad oil, sesame seeds, brown sugar, grnd black pepper, grnd ginger, clove garlic, 
NER[1]: rosemary, thyme, bay leaves, paprika, pepper, red wine, chicken broth, button mushrooms, mushroom mix, carrots, onion, frozen green beans, black olives, handful grape tomatoes, chicken, stalks celery, water, 
NER[2]: carrots, butter, brown sugar, lemon rind, 
NER[3]: Flour, Salt, Baking Powder, Sugar, Crisco, egg, vinegar, Water, 
NER[4]: thin pretzels, margarine, 


In [8]:
# Drop missing data
sample_1 = sample_1.dropna(subset=["ingredients", "directions"])

# Optional: Keep only relevant columns
sample_1 = sample_1[["ingredients", "directions"]]

# Convert each example into a prompt-response format
def format_example(row):
    return f"""### Ingredients:\n{row['ingredients']}\n\n### Directions:\n{row['directions']}"""

# Apply formatting
sample_1["formatted"] = sample_1.apply(format_example, axis=1)

# Save to text file (line-by-line for fine-tuning)
with open("./RecipeNLG/RecipeNLG_dataset_gpt_in_1pct.txt", "w", encoding="utf-8") as f:
    for line in sample_1["formatted"]:
        f.write(line + "\n\n")

### Test GPT2 with 1% recipe input
Is the input ready to be used by GPT2 type model? Test with pretrained model with fine tuning for working example then work on model design.

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset

# Load tokenizer & model
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

print("Load your formatted text dataset with datasets library")
dataset = load_dataset("text", data_files={"train": "./RecipeNLG/RecipeNLG_dataset_gpt_in_1pct.txt"})

# Set padding token
tokenizer.pad_token = tokenizer.eos_token  # Use eos_token as pad_token

# Tokenize the text
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True, max_length=512, padding="max_length")  # Ensure padding

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

print("Prepare data collator for language modeling (no masked language model here)")
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # GPT uses causal LM, not masked LM like BERT
)

print("Define training arguments")
training_args = TrainingArguments(
    output_dir="./gpt2-recipes",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=100
)

print("Setup the Trainer")
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
)

print("Start fine-tuning")
trainer.train()

print("Save the final model and tokenizer")
model.save_pretrained("./gpt2-recipes-final")
tokenizer.save_pretrained("./gpt2-recipes-final")

Load your formatted text dataset with datasets library


Map:   0%|          | 0/133866 [00:00<?, ? examples/s]

Prepare data collator for language modeling (no masked language model here)
Define training arguments
Setup the Trainer
Start fine-tuning


Step,Training Loss
100,2.63
200,2.2036
300,1.9753
400,2.0415
500,1.9587
600,1.8307
700,2.021
800,1.7582
900,1.937
1000,1.7155


In [None]:
#!pip install 'accelerate>=0.26.0'

In [None]:
#!pip install torch