In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import shutil
import os

# Source and destination paths
src_path = '/content/drive/MyDrive/dl_project/h_recipes_50pct_token_max400.csv'
dst_path = '/content/h_recipes_50pct_token_max400.csv'

try:
    shutil.copy(src_path, dst_path)
except FileNotFoundError as e:
    print("Source file exists:", os.path.exists(src_path))
    print("Destination file exists:", os.path.exists(dst_path))
    print(f"Error: {e}")

Source file exists: True
Destination file exists: True


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset
import torch
import pandas as pd

# 1. Initialize tokenizer and model
model_name = "mbien/recipenlg"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# 3. Load dataset from mounted Google Drive copy
data_file = "/content/h_recipes_50pct_token_max400.csv"
df = pd.read_csv(data_file)
dataset = Dataset.from_pandas(df[["text"]])

# 4. Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=400)

tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"]
)

# 5. Create data collator for causal language modeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# 6. Define training arguments
training_args = TrainingArguments(
    output_dir="/content/finetuned_recipenlg",
    overwrite_output_dir=True,
    learning_rate=5e-5,
    per_device_train_batch_size=6,
    per_device_eval_batch_size=6,
    num_train_epochs=1,
    lr_scheduler_type="linear",
    warmup_steps=50,
    fp16=torch.cuda.is_available(),
    seed=42,
    save_steps=30,
    save_total_limit=2,
    logging_dir="/content/logs",
    logging_steps=100,
    report_to="none"
)

# 7. Initialize Trainer and start training
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer.train()

# 8. Save the fine-tuned model and tokenizer to Google Drive
output_dir = "/content/drive/MyDrive/dl_project/finetuned_recipenlg_2"
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

print("Fine-tuning complete. Model saved to:", output_dir)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/298 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/328 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/665M [00:00<?, ?B/s]

Map:   0%|          | 0/3371 [00:00<?, ? examples/s]

  trainer = Trainer(
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
100,1.9508
200,1.8401
300,1.7533
400,1.7953


Fine-tuning complete. Model saved to: /content/drive/MyDrive/dl_project/finetuned_recipenlg
