In [None]:
# ================================================
# STEP 1: Mount Google Drive
# ================================================
import os

from google.colab import drive

# All checkpoints are written under this folder inside the mounted Drive.
CHECKPOINT_DIR = "/content/drive/MyDrive/finetune_llm_checkpoints"

# Mount first so that the folder below is created on Drive.
drive.mount("/content/drive")

# exist_ok=True keeps this cell safe to re-run once the folder already exists.
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

In [None]:
# ================================================
# STEP 2: Install Hugging Face Transformers + Datasets
# ================================================
!pip install -q transformers datasets accelerate



In [None]:
# ================================================
# STEP 3: Load Dataset and Model
# (You can replace with your dataset/model)
# ================================================
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM

# Base checkpoint to fine-tune; change this to use a different base model.
model_name = "gpt2"

# Example corpus: the "wikitext-2-raw-v1" configuration of "wikitext".
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

# Use the EOS token as the padding token, since the tokenization step below
# pads every sequence to a fixed length.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Tokenize dataset
def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of a batch with the notebook's tokenizer.

    Sequences are truncated and padded to a fixed length of 128 tokens.

    Args:
        examples: Batch handed over by ``dataset.map``; must contain a
            ``"text"`` entry.

    Returns:
        The tokenizer's output for the batch.
    """
    encode_options = {"padding": "max_length", "truncation": True, "max_length": 128}
    return tokenizer(examples["text"], **encode_options)

# WikiText keeps blank separator lines as rows of their own. Such a row
# tokenizes to nothing but padding, and because the pad token is EOS and the
# LM collator ignores pad positions in the loss, it carries no training
# signal (a batch made only of blank rows has no valid target at all).
# Drop empty / whitespace-only rows before tokenizing.
tokenized_datasets = (
    dataset
    .filter(lambda example: example["text"].strip() != "")
    .map(tokenize_function, batched=True, remove_columns=["text"])
)



In [None]:
# ================================================
# STEP 4: Define Data Collator
# ================================================
from transformers import DataCollatorForLanguageModeling

# mlm=False: batches are built for causal (non-masked) language modeling.
data_collator = DataCollatorForLanguageModeling(
    mlm=False,
    tokenizer=tokenizer,
)



In [None]:
# ================================================
# STEP 5: Load Model (Resume if Checkpoint Exists)
# ================================================
from transformers import AutoModelForCausalLM
from transformers.trainer_utils import get_last_checkpoint

# Trainer writes its periodic saves into `checkpoint-<step>` subfolders of
# output_dir, and trainer_state.json lives inside those subfolders, not at the
# top level of CHECKPOINT_DIR. Probing the top-level folder for that file
# therefore never finds anything; look up the newest checkpoint subfolder
# instead. The isdir guard avoids an error if the folder is missing.
resume_dir = get_last_checkpoint(CHECKPOINT_DIR) if os.path.isdir(CHECKPOINT_DIR) else None

if resume_dir is not None:
    print(f"Resuming from checkpoint: {resume_dir}")
    model = AutoModelForCausalLM.from_pretrained(resume_dir)
else:
    print(f"Starting from base model: {model_name}")
    model = AutoModelForCausalLM.from_pretrained(model_name)



In [None]:
# ================================================
# STEP 6: Define Trainer with Auto-Save
# ================================================
from transformers import Trainer, TrainingArguments
from transformers.trainer_utils import IntervalStrategy # Import IntervalStrategy

training_args = TrainingArguments(
    output_dir=CHECKPOINT_DIR,    # Checkpoints are written here (on Drive)
    overwrite_output_dir=True,
    eval_strategy=IntervalStrategy.STEPS,  # Evaluate every `eval_steps` steps
    # Without an explicit eval_steps the evaluation interval falls back to
    # logging_steps (100), i.e. a full pass over the validation split every
    # 100 training steps. Evaluate on the same cadence as checkpoint saves.
    eval_steps=500,
    save_strategy="steps",        # Save checkpoints every N steps
    save_steps=500,               # Adjust to save more/less frequently
    save_total_limit=3,           # Keep only last 3 checkpoints
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    push_to_hub=False
)

# Bundle the model, the hyper-parameters, both data splits and the batching
# logic into a single Trainer.
# NOTE(review): recent transformers releases replace the `tokenizer=` argument
# with `processing_class=`; confirm against the installed version.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

In [None]:
# ================================================
# STEP 7: Train (Resume Automatically if Needed)
# ================================================
from transformers.trainer_utils import get_last_checkpoint

# Newest checkpoint found under CHECKPOINT_DIR, or None when the folder is
# missing or get_last_checkpoint reports nothing to resume from.
last_checkpoint = (
    get_last_checkpoint(CHECKPOINT_DIR) if os.path.exists(CHECKPOINT_DIR) else None
)
if last_checkpoint is not None:
    print(f"Found checkpoint at {last_checkpoint}, resuming training...")

# Passing None starts a fresh run; passing a path resumes from that checkpoint.
trainer.train(resume_from_checkpoint=last_checkpoint)



In [None]:
# ================================================
# STEP 8: Save Final Model to Drive
# ================================================
# Write the final model into the same Drive folder used as the Trainer's
# output_dir, then report where it went.
trainer.save_model(output_dir=CHECKPOINT_DIR)
print(f"Final model saved to {CHECKPOINT_DIR}")