# 🤖 Mini Fine-Tune: LLaMA-Inspired Language Model

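In this notebook, we'll fine-tune a small open-source language model (a tiny GPT-2 checkpoint, chosen so the demo runs quickly) on a custom instruction dataset stored in `dataset.json`. Each record in that file has an `instruction`, an `input`, and an expected `output`.
No ML background needed — just follow along and explore the power of language models!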


In [None]:
# 📦 Step 1: Install Required Packages
!pip install -q transformers datasets accelerate peft trl torch scipy

In [None]:
# 📁 Step 2: Load Sample Dataset
import json

# Read the JSON file as UTF-8 explicitly so accented characters decode the
# same way on every platform, regardless of the system's default encoding.
with open('dataset.json', encoding='utf-8') as f:
    data = json.load(f)

# Print every record so the instruction / input / output structure is visible.
for example in data:
    print('Instruction:', example['instruction'])
    print('Input:', example['input'])
    print('Expected Output:', example['output'])
    print('-'*50)

In [None]:
# 🤗 Step 3: Load a Pretrained Model (Tiny Version for Demo)
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "sshleifer/tiny-gpt2"  # A small GPT2 for quick runs
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

In [None]:
# 🛠️ Step 4: Preprocess Data
from datasets import Dataset

def format(example):
    """Flatten one instruction record into a single training string.

    Args:
        example: mapping with 'instruction', 'input' and 'output' entries.

    Returns:
        A dict with one key, "text", holding the prompt header immediately
        followed by the expected output (nothing is inserted after "Output:").
    """
    # NOTE: this function's name shadows Python's built-in format() at module level.
    header = "Instruction: {}\nInput: {}\nOutput:".format(
        example['instruction'], example['input']
    )
    text = header + example['output']
    return {"text": text}

# Turn every raw record into a {"text": ...} row and wrap the rows in a Dataset.
formatted_data = [format(ex) for ex in data]
dataset = Dataset.from_list(formatted_data)
# Hold out 30% of the rows for evaluation. The fixed seed makes the shuffle,
# and therefore the train/test partition, identical on every run.
dataset = dataset.train_test_split(test_size=0.3, seed=42)
# Show one formatted training example as the cell output.
dataset["train"][0]

In [None]:
# 🎯 Step 5: Fine-tune with Trainer
from transformers import Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel

import inspect

# Padding to a fixed length needs a pad token. GPT-2 style tokenizers normally
# do not define one, so fall back to the end-of-sequence token when it is missing.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# The evaluation-schedule argument was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases. Pick whichever name the
# installed version accepts so this cell runs on both.
eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="no",  # do not write checkpoints during this demo run
    **{eval_strategy_arg: "epoch"},  # evaluate on the test split after every epoch
)


def tokenize(batch):
    """Tokenize a batch of rows, truncating and padding each text to 128 tokens."""
    return tokenizer(batch["text"], truncation=True, padding='max_length', max_length=128)


tokenized_dataset = dataset.map(tokenize, batched=True)
# mlm=False selects causal (next-token) language modelling rather than masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
)

trainer.train()

In [None]:
# ✨ Step 6: Test Your Model
prompt = "Instruction: Translate the sentence to French.\nInput: Good morning!\nOutput:"
# Training may have moved the model to an accelerator; place the inputs on the
# same device as the model to avoid a device-mismatch error in generate().
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
# Switch to inference mode so dropout does not randomise the generated text.
model.eval()
# Pass pad_token_id explicitly: generate() needs one, and the EOS token is
# always available on this tokenizer.
outputs = model.generate(
    **inputs,
    max_new_tokens=50,
    pad_token_id=tokenizer.eos_token_id,
)
# Drop special tokens (e.g. end-of-text markers) from the printed result.
print(tokenizer.decode(outputs[0], skip_special_tokens=True))