In [1]:
# You know AutoTokenizer, AutoModelForCausalLM
# Trainer is what creates a class for fine-tuning.
# Training Arguments is where we collect epochs, output_dir and such arguments for trainer
# DataCollatorForLanguageModeling is where we have the chef which takes batched tokenized data WHILE training and it doesnt have
  # pre loaded batches. It all happens in training time
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset
import torch

In [2]:
# A small training corpus which doesnt even make any big difference.
training_corpus = """
Kunj Shah is a passionate developer exploring the world of AI, LLMs, and web technologies. With hands-on experience in building transformer models from scratch, including custom tokenizers and attention mechanisms, he's on a mission to understand how large language models really work.
He has worked on various projects like voice-activated AI assistants, RAG-based PDF summarizers, and AI startup evaluators. He's also familiar with LangChain, Langflow, Hugging Face models, and deploying APIs on Raspberry Pi.

In addition to AI, Kunj enjoys building sleek frontend experiences using React, TailwindCSS, and Vite. He’s comfortable working across both backend and frontend stacks, with experience in Node.js, Express, and database integration via MySQL and MongoDB.
He's explored tools like n8n, Sublime Text, and Python-based automation to streamline development. Kunj is driven by curiosity, rapid prototyping, and bringing ideas to life quickly and effectively.
"""

In [3]:
# Making lines by splitting them and creating a dataset object which makes it easier to batch
lines = training_corpus.strip().split("\n\n")
dataset = Dataset.from_dict({"text": lines})

In [4]:
dataset['text']

["Kunj Shah is a passionate developer exploring the world of AI, LLMs, and web technologies. With hands-on experience in building transformer models from scratch, including custom tokenizers and attention mechanisms, he's on a mission to understand how large language models really work.\nHe has worked on various projects like voice-activated AI assistants, RAG-based PDF summarizers, and AI startup evaluators. He's also familiar with LangChain, Langflow, Hugging Face models, and deploying APIs on Raspberry Pi.",
 "In addition to AI, Kunj enjoys building sleek frontend experiences using React, TailwindCSS, and Vite. He’s comfortable working across both backend and frontend stacks, with experience in Node.js, Express, and database integration via MySQL and MongoDB.\nHe's explored tools like n8n, Sublime Text, and Python-based automation to streamline development. Kunj is driven by curiosity, rapid prototyping, and bringing ideas to life quickly and effectively."]

In [5]:
model_name = "gpt2"

# Getting GPT2 for tokenizer and actual weights
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [7]:
# adding a padding token as gpt2 doesnt have one
tokenizer.pad_token = tokenizer.eos_token
model.resize_token_embeddings(len(tokenizer))

Embedding(50257, 768)

In [9]:
def tokenize(example):
    # Simply tokenizing
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=256)

In [10]:
tokenized_dataset = dataset.map(tokenize, batched=True, remove_columns=["text"]) # tokenizing the ENTIRE dataset

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

In [11]:
print(tokenized_dataset['input_ids'])

[[42, 403, 73, 18381, 318, 257, 15347, 8517, 13504, 262, 995, 286, 9552, 11, 27140, 10128, 11, 290, 3992, 8514, 13, 2080, 2832, 12, 261, 1998, 287, 2615, 47385, 4981, 422, 12692, 11, 1390, 2183, 11241, 11341, 290, 3241, 11701, 11, 339, 338, 319, 257, 4365, 284, 1833, 703, 1588, 3303, 4981, 1107, 670, 13, 198, 1544, 468, 3111, 319, 2972, 4493, 588, 3809, 12, 33106, 9552, 29488, 11, 371, 4760, 12, 3106, 12960, 15676, 11341, 11, 290, 9552, 13693, 5418, 84, 2024, 13, 679, 338, 635, 5385, 351, 16332, 35491, 11, 16332, 11125, 11, 12905, 2667, 15399, 4981, 11, 290, 29682, 23113, 319, 24244, 13993, 13, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,

In [12]:
# data_collator is the object that takes a batch from tokenized_dataset and
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [13]:
# We gather all the training arguments that we need to pass on to the trainer in future
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",
    num_train_epochs=5,
    per_device_train_batch_size=2,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    prediction_loss_only=True,
    report_to="none"
)

In [14]:
# Trainer object for 'full finetuning' of gpt2 based on this small chunk of data
  # We pass in the model, arguments, tokenized data, and the data_collator
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

In [15]:
trainer.train() # And now we train it

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss


TrainOutput(global_step=5, training_loss=3.9313148498535155, metrics={'train_runtime': 98.9702, 'train_samples_per_second': 0.101, 'train_steps_per_second': 0.051, 'total_flos': 1306460160000.0, 'train_loss': 3.9313148498535155, 'epoch': 5.0})

In [16]:
model.save_pretrained("./gpt2-finetuned")
tokenizer.save_pretrained("./gpt2-finetuned")

('./gpt2-finetuned/tokenizer_config.json',
 './gpt2-finetuned/special_tokens_map.json',
 './gpt2-finetuned/vocab.json',
 './gpt2-finetuned/merges.txt',
 './gpt2-finetuned/added_tokens.json',
 './gpt2-finetuned/tokenizer.json')

In [17]:
from transformers import pipeline

# pipeline !!!!!!!
generator = pipeline("text-generation", model="./gpt2-finetuned", tokenizer="./gpt2-finetuned")
print(generator("Kunj Shah is a ", max_new_tokens=50)[0]["generated_text"])

Device set to use cpu


Kunj Shah is a vernacular term for a young man who is learning to work for a startup in the tech industry. He has helped startups make money by developing apps that solve real-world problems, helping them get things done, and helping them solve problems that aren't
