In [1]:
import pandas as pd

data = pd.read_csv('main_train_dataset.csv')

In [2]:
# Prepare a plain-text file for GPT-2 fine-tuning
with open('main_for_finetuning.txt', 'w', encoding='utf-8') as f:
    for _, row in data.iterrows():
        # Assuming the recipe text is in a column named "directions"
        if pd.notna(row['directions']):  # Skip NaN values
            f.write(row['directions'] + '\n<|endoftext|>\n')

print("Text file prepared: main_for_finetuning.txt")

Text file prepared: main_for_finetuning.txt


In [3]:
import accelerate
import huggingface_hub

print("Accelerate version:", accelerate.__version__)
print("Huggingface Hub version:", huggingface_hub.__version__)

Accelerate version: 1.1.1
Huggingface Hub version: 0.26.2


In [4]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

# Load GPT-2 tokenizer and model
model_name = 'gpt2'  # Use the small GPT-2 model to fit your hardware
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Prepare the text dataset for fine-tuning
train_path = 'main_for_finetuning.txt'
dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=train_path,
    block_size=64  # Max token length for each training sample
)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False  # Disable masked language modeling for GPT-2
)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./gpt2-finetuned-recipes',  # Where to save the model
    overwrite_output_dir=True,
    num_train_epochs=2,  # Number of training epochs
    per_device_train_batch_size=2,  # Small batch size for memory efficiency
    gradient_accumulation_steps=4,
    save_steps=1000,
    save_total_limit=1,
    logging_dir='./logs',  # Directory for logs
    logging_steps=500,
    evaluation_strategy="no",  # No validation during training
    fp16=False,  # Disable mixed precision for MPS
)

# Set up the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)

# Train the model
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained('./gpt2-finetuned-recipes-main')
tokenizer.save_pretrained('./gpt2-finetuned-recipes-main')

print("Fine-tuning complete. Model saved to './gpt2-finetuned-recipes-main'")

2024-11-20 21:31:48.558809: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-20 21:31:48.575813: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-20 21:31:48.656214: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  return torch._C._cuda_getDeviceCount() > 0


Step,Training Loss
500,2.8368
1000,2.5872


Fine-tuning complete. Model saved to './gpt2-finetuned-recipes-main'


In [11]:
# Load the bakery dataset
bakery_dataset_path = 'bakery_dataset.csv'
bakery_data = pd.read_csv(bakery_dataset_path)

# Sample a smaller subset of the dataset (e.g., 10%)
sampled_data_1 = bakery_data.sample(frac=0.001, random_state=42)  # Adjust frac for size
print(f"Sampled dataset size: {len(sampled_data)}")

# Save the sampled dataset to a CSV file
sampled_data_1.to_csv('sampled_recipes_bakery.csv', index=False)

# Save the sampled dataset as a plain-text file with <|endoftext|> separators
with open('sampled_bakery_recipes_for_finetuning.txt', 'w', encoding='utf-8') as f:
    for _, row in sampled_data.iterrows():
        if pd.notna(row['directions']):  # Ensure the directions column is valid
            f.write(row['directions'] + '\n<|endoftext|>\n')

print("Prepared text file for bakery dataset: sampled_bakery_recipes_for_finetuning.txt")

Sampled dataset size: 1607
Prepared text file for bakery dataset: sampled_bakery_recipes_for_finetuning.txt


In [15]:
from transformers import TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

# Prepare the bakery-specific dataset for fine-tuning
train_path = 'bakery_recipes_for_finetuning.txt'
dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=train_path,
    block_size=32  # Adjust token block size
)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

# Define training arguments for version 2
training_args = TrainingArguments(
    output_dir='./gpt2-finetuned-recipes-v2',  # Save as version 2
    overwrite_output_dir=True,
    num_train_epochs=1,  # Retrain for 1 epoch
    per_device_train_batch_size=4,  # Small batch size
    gradient_accumulation_steps=1,
    save_steps=200,
    save_total_limit=1,
    logging_dir='./logs-v2',  # Separate logs for version 2
    logging_steps=500,
    evaluation_strategy="no",
    fp16=False,  # Disable mixed precision for MPS
)

# Set up the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)

# Retrain the model
print("Starting retraining on bakery dataset...")
trainer.train()

# Save the retrained model
model.save_pretrained('./gpt2-finetuned-recipes-v2')
tokenizer.save_pretrained('./gpt2-finetuned-recipes-v2')

print("Retraining complete. Model saved to './gpt2-finetuned-recipes-v2'")

Starting retraining on bakery dataset...


Step,Training Loss


KeyboardInterrupt: 