1. Project Setup

In [1]:
!pip install bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl (61.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.47.0


In [2]:
# check GPU availability
import torch
print(f"GPU available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU name: {torch.cuda.get_device_name(0)}")

GPU available: True
GPU name: Tesla T4


In [3]:
# Mount Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# import packages
import os
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          Trainer, TrainingArguments,
                          TrainerCallback, EarlyStoppingCallback,
                          DataCollatorForLanguageModeling,
                          BitsAndBytesConfig)
from datasets import load_dataset
from peft import LoraConfig, get_peft_model

In [5]:
# Load Model with 4-bit Quantization
model_name = "gpt2"  # Start with GPT-2, can upgrade later
# Configure 4-bit quantization
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=quantization_config,
)

# implement LoRA
peft_config = LoraConfig(
    r=8,  # Rank
    lora_alpha=32,
    target_modules=["c_attn", "c_proj"],  # GPT-2 attention layers
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, peft_config)

# this is required
model.enable_input_require_grads()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

2. importing datasets and tokenizing the datasets

In [6]:
# for fine-tuning resumption, find the latest checkpoint
# Paths
output_dir = "/content/drive/MyDrive/gpt2_finetune"

def check_latest_checkpoint(output_dir):
    # Detect latest checkpoint (if any exist)
    checkpoints = [d for d in os.listdir(output_dir) if d.startswith("checkpoint-")]
    if checkpoints:
        latest_checkpoint = os.path.join(output_dir, max(checkpoints, key=lambda x: int(x.split("-")[1])))
    else:
        latest_checkpoint = None

    return latest_checkpoint

print(f"Latest checkpoint: {check_latest_checkpoint(output_dir)}")

Latest checkpoint: /content/drive/MyDrive/gpt2_interrupted/checkpoint-7500


In [7]:
# initial tokenizer uploaded from HF
if not check_latest_checkpoint(output_dir):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Add padding token if not present
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # save the modified tokenizer to ouput_dir for resumption FT
    tokenizer.save_pretrained(output_dir)

In [10]:
# if FT resumption, load tokenizer from output_dir
if check_latest_checkpoint(output_dir):
    tokenizer = AutoTokenizer.from_pretrained(output_dir)

In [11]:
# Load dataset with streaming (for large datasets)
dataset_train = load_dataset("roneneldan/TinyStories", split="train", streaming=True)
dataset_train = dataset_train.shuffle(seed=42, buffer_size=50000)
# load validation dataset
dataset_valid = load_dataset("roneneldan/TinyStories", split="validation", streaming=True)
dataset_valid = dataset_valid.shuffle(seed=42, buffer_size=10000)

README.md: 0.00B [00:00, ?B/s]

In [12]:
# Tokenize the dataset
def tokenize_function(examples):
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        max_length=256,
        padding="max_length",
        return_tensors="pt"
    )
    # Add labels for language modeling
    tokenized["labels"] = tokenized["input_ids"].clone()
    return tokenized

tokenized_dataset_train = dataset_train.map(tokenize_function, batched=True, batch_size=1000)  # Process in larger batches
tokenized_dataset_valid = dataset_valid.map(tokenize_function, batched=True, batch_size=1000)

3. Training Setup

In [13]:
# Colab-Optimized Training
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=4,          # Increased from 2 (if VRAM allows)
    gradient_accumulation_steps=5,           # Effective batch=20
    optim="adamw_torch_fused",
    learning_rate=3e-5,                      # Slightly lower for larger batches
    num_train_epochs=1,                     # Note: num_train_epochs is ignored when max_steps is set
    warmup_steps=200,                        # Helps with large dataset
    weight_decay=0.01,
    max_steps=105986,                          # 2119719 samples total
    logging_steps=50,
    save_steps=500,
    eval_strategy="steps",
    eval_steps=500,
    save_total_limit=2,
    report_to="none",
    max_grad_norm=1.0,
    load_best_model_at_end=True,
    lr_scheduler_type="cosine",              # Better for long runs
    gradient_checkpointing=True,              # Saves VRAM
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_valid,
)

In [14]:
# use data collator for dynamic padding
collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    pad_to_multiple_of=8  # Optimized memory
)

# Add callback to verify
class ResumeCheckCallback(TrainerCallback):
    def on_train_begin(self, args, state, control, **kwargs):
        print(f"Resumed from global_step: {state.global_step}")
        print(f"Model LoRA layers: {[n for n, p in model.named_parameters() if 'lora' in n and p.requires_grad]}")

4. Start Training

In [15]:
# Train (resume if checkpoint exists), otherwise start initial FT

if not check_latest_checkpoint(output_dir):
    trainer.callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
    trainer.data_collator = collator

    print("Starting fresh training ...")
    trainer.train()

else:
    if not any(isinstance(c, ResumeCheckCallback) for c in trainer.callback_handler.callbacks):
        trainer.add_callback(ResumeCheckCallback())

    print(f"Resuming from {check_latest_checkpoint(output_dir)} ...")
    trainer.train(resume_from_checkpoint=check_latest_checkpoint(output_dir))

Resuming from /content/drive/MyDrive/gpt2_interrupted/checkpoint-4500 ...
Resumed from global_step: 4500
Model LoRA layers: ['base_model.model.transformer.h.0.attn.c_attn.lora_A.default.weight', 'base_model.model.transformer.h.0.attn.c_attn.lora_B.default.weight', 'base_model.model.transformer.h.0.attn.c_proj.lora_A.default.weight', 'base_model.model.transformer.h.0.attn.c_proj.lora_B.default.weight', 'base_model.model.transformer.h.0.mlp.c_proj.lora_A.default.weight', 'base_model.model.transformer.h.0.mlp.c_proj.lora_B.default.weight', 'base_model.model.transformer.h.1.attn.c_attn.lora_A.default.weight', 'base_model.model.transformer.h.1.attn.c_attn.lora_B.default.weight', 'base_model.model.transformer.h.1.attn.c_proj.lora_A.default.weight', 'base_model.model.transformer.h.1.attn.c_proj.lora_B.default.weight', 'base_model.model.transformer.h.1.mlp.c_proj.lora_A.default.weight', 'base_model.model.transformer.h.1.mlp.c_proj.lora_B.default.weight', 'base_model.model.transformer.h.2.attn.

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss
5000,2.1385,1.906918
5500,2.122,1.886076
6000,2.1193,1.877362
6500,2.0837,1.866447
7000,2.0824,1.850914
7500,2.0448,1.840981


KeyboardInterrupt: 

5. Advanced Generation Techniques

In [None]:
def advanced_generation(prompt, max_length=1024):
    inputs = tokenizer(prompt, return_tensors="pt")

    # Create attention mask
    attention_mask = torch.ones_like(inputs.input_ids)

    # Generate with more control
    outputs = model.generate(
        inputs.input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        num_beams=5,
        early_stopping=True,
        no_repeat_ngram_size=3,
        do_sample=True,
        temperature=0.9,
        top_k=50,
        top_p=0.92,
        repetition_penalty=1.2
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)


print(advanced_generation("once upon a time,"))

6. Building an Interactive Demo

In [None]:
from IPython.display import display
import ipywidgets as widgets

# Create UI elements
text_input = widgets.Textarea(
    value="The dragon flew over the mountains and",
    placeholder='Enter your story beginning...',
    description='Prompt:',
    layout={'width': '80%', 'height': '100px'}
)

generate_button = widgets.Button(description="Generate Text")
output = widgets.Output()

def on_button_click(b):
    with output:
        output.clear_output()
        prompt = text_input.value
        print("Generating...")
        generated_text = advanced_generation(prompt)
        print("\nGenerated Text:")
        print(generated_text)

generate_button.on_click(on_button_click)