1. Project Setup

In [1]:
# check GPU availability
import torch
print(f"GPU available: {torch.cuda.is_available()}")
print(f"GPU type: {torch.cuda.get_device_name(0)}")

GPU available: True
GPU type: Tesla T4


In [2]:
# import packages
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, EarlyStoppingCallback, DataCollatorForLanguageModeling
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from google.colab import drive

In [3]:
# Mount Drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


2. Choosing a Model Architecture

For text generation, we'll use a decoder-only transformer model (like GPT). Good options include:

    (1) GPT-2 (smaller, faster)

    (2) GPT-Neo (open-source alternative)

    (3) GPT-J (6B parameter model)

In [10]:
# Load Model with 4-bit Quantization
model_name = "gpt2"  # Start with GPT-2, can upgrade later
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    # torch_dtype=torch.float16,
    load_in_4bit=True
)

# implement LoRA
peft_config = LoraConfig(
    r=8,  # Rank
    lora_alpha=32,
    target_modules=["c_attn", "c_proj"],  # GPT-2 attention layers
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, peft_config)

# model.print_trainable_parameters()  # ~0.1% of total params

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


In [12]:
# load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Add padding token if not present
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

3. Basic Text Generation

Let's start with simple text generation:

In [13]:
# basic text generation without fine-tuning
def generate_text(prompt, max_length=256):
    # Tokenize with return_attention_mask
    inputs = tokenizer(prompt, return_tensors="pt", return_attention_mask=True)

    # Move tensors to the same device as the model
    device = model.device
    inputs.input_ids = inputs.input_ids.to(device)
    inputs.attention_mask = inputs.attention_mask.to(device)

    # Generate text with attention mask
    outputs = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,  # This is the key addition
        max_length=max_length,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        pad_token_id=tokenizer.eos_token_id  # Explicitly set pad token
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test it
print(generate_text("once upon a time "))

once upon a time !"

And, as we all know, the more that they were able to find a way to get their hands on them, or to make them feel good.
.


4. Fine-Tuning on Custom Data

For better results, we can fine-tune on specific data. Let's use a dataset of story openings:

In [14]:
# Load dataset with streaming (for large datasets)
dataset_train = load_dataset("roneneldan/TinyStories", split="train", streaming=True)
dataset_train = dataset_train.take(5000)
dataset_valid = load_dataset("roneneldan/TinyStories", split="validation", streaming=True)
dataset_valid = dataset_valid.take(1000)

README.md: 0.00B [00:00, ?B/s]

In [15]:
# Tokenize the dataset
def tokenize_function(examples):
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        max_length=256,
        padding="max_length",
        return_tensors="pt"
    )
    # Add labels for language modeling
    tokenized["labels"] = tokenized["input_ids"].clone()
    return tokenized

tokenized_dataset_train = dataset_train.map(tokenize_function, batched=True)
tokenized_dataset_valid = dataset_valid.map(tokenize_function, batched=True)

5. Training Setup

In [17]:
# use data collator for dynamic padding
collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    pad_to_multiple_of=8  # Optimized memory
)

# Colab-Optimized Training
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/gpt2_finetune",
    per_device_train_batch_size=4,          # Increased from 2 (if VRAM allows)
    gradient_accumulation_steps=5,           # Effective batch=20
    optim="adamw_torch_fused",
    learning_rate=3e-5,                      # Slightly lower for larger batches
    num_train_epochs=1,                     # Note: num_train_epochs is ignored when max_steps is set
    warmup_steps=200,                        # Helps with large dataset
    weight_decay=0.01,
    max_steps=4000,                          # ~12 hours on Colab T4
    logging_steps=50,
    save_steps=500,
    eval_strategy="steps",
    eval_steps=500,
    save_total_limit=2,
    report_to="none",
    max_grad_norm=1.0,
    load_best_model_at_end=True,
    lr_scheduler_type="cosine",              # Better for long runs
    gradient_checkpointing=True,              # Saves VRAM
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_valid,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    data_collator=collator
)

In [18]:
# 5. Start Training
try:
    trainer.train()
except KeyboardInterrupt:
    print("Saving model before Colab disconnects...")
    trainer.save_model("/content/drive/MyDrive/gpt2_interrupted")

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss
500,3.1615,2.692424
1000,2.9526,2.640652
1500,2.905,2.548841
2000,2.8666,2.529474
2500,2.888,2.516323


In [19]:
# Save model
model.save_pretrained("/content/drive/MyDrive/final_model_gpt2_ts")

# Save tokenizer
tokenizer.save_pretrained("/content/drive/MyDrive/final_model_gpt2_ts")

('/content/drive/MyDrive/final_model_gpt2_ts/tokenizer_config.json',
 '/content/drive/MyDrive/final_model_gpt2_ts/special_tokens_map.json',
 '/content/drive/MyDrive/final_model_gpt2_ts/vocab.json',
 '/content/drive/MyDrive/final_model_gpt2_ts/merges.txt',
 '/content/drive/MyDrive/final_model_gpt2_ts/added_tokens.json',
 '/content/drive/MyDrive/final_model_gpt2_ts/tokenizer.json')

6. Advanced Generation Techniques

After training, we can implement more sophisticated generation:

In [20]:
# text generation using the re-trained model
def advanced_generation(prompt, max_length=256):
    inputs = tokenizer(prompt, return_tensors="pt")

    # Create attention mask
    attention_mask = torch.ones_like(inputs.input_ids)

    # Move tensors to the same device as the model
    device = model.device
    inputs.input_ids = inputs.input_ids.to(device)
    attention_mask = attention_mask.to(device)

    # Generate with more control
    outputs = model.generate(
        inputs.input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        num_beams=5,
        early_stopping=True,
        no_repeat_ngram_size=3,
        do_sample=True,
        temperature=0.9,
        top_k=50,
        top_p=0.92,
        repetition_penalty=1.2,
        pad_token_id=tokenizer.eos_token_id # Added pad_token_id for consistency
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [21]:
print(advanced_generation("once upon a time,"))

once upon a time, there was a little girl who liked to play in the woods. One day, she played in the forest with her friends.

One day, her friend went to the forest and saw a big tree. She wanted to play with it, but she didn't know what to do. She decided to go to the tree and play with the tree. The tree was so big, they couldn't see it. The little girl was so happy that she played with it.



7. Building an Interactive Demo

In [22]:
from IPython.display import display
import ipywidgets as widgets

# Create UI elements
text_input = widgets.Textarea(
    value="The dragon flew over the mountains and",
    placeholder='Enter your story beginning...',
    description='Prompt:',
    layout={'width': '80%', 'height': '100px'}
)

generate_button = widgets.Button(description="Generate Text")
output = widgets.Output()

def on_button_click(b):
    with output:
        output.clear_output()
        prompt = text_input.value
        print("Generating...")
        generated_text = advanced_generation(prompt)
        print("\nGenerated Text:")
        print(generated_text)

generate_button.on_click(on_button_click)

display(text_input, generate_button, output)

Textarea(value='The dragon flew over the mountains and', description='Prompt:', layout=Layout(height='100px', …

Button(description='Generate Text', style=ButtonStyle())

Output()