In [1]:
from datasets import load_dataset

data_file_path = "https://raw.githubusercontent.com/tatsu-lab/stanford_alpaca/refs/heads/main/alpaca_data.json"
dataset = load_dataset("json", data_files=data_file_path)

In [2]:
dataset = dataset['train'].train_test_split(test_size=0.2)
dataset

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 41601
    })
    test: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 10401
    })
})

In [3]:
dataset['train'][0]

{'instruction': 'What is the best way to introduce yourself to a group?',
 'input': '',
 'output': 'The best way to introduce yourself to a group is to keep it short and sweet, start by saying your name, share a brief one-line description of what you do and why you’re part of the group, and then invite questions and conversation.'}

In [4]:
def formatting_prompts_func(example):
    text = f"### Instruction: {example['instruction']}\n### Input: {example['input']}\n### Output: {example['output']}"
    return {"messages": text}

In [5]:
formatted_dataset = dataset.map(formatting_prompts_func)

Map:   0%|          | 0/41601 [00:00<?, ? examples/s]

Map:   0%|          | 0/10401 [00:00<?, ? examples/s]

In [6]:
formatted_dataset['train'][0]

{'instruction': 'What is the best way to introduce yourself to a group?',
 'input': '',
 'output': 'The best way to introduce yourself to a group is to keep it short and sweet, start by saying your name, share a brief one-line description of what you do and why you’re part of the group, and then invite questions and conversation.',
 'messages': '### Instruction: What is the best way to introduce yourself to a group?\n### Input: \n### Output: The best way to introduce yourself to a group is to keep it short and sweet, start by saying your name, share a brief one-line description of what you do and why you’re part of the group, and then invite questions and conversation.'}

In [7]:
import torch
device = ("cuda" if torch.cuda.is_available() else "cpu")

In [8]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from trl import setup_chat_format

# # Load model and tokenizer
# model_id = "HuggingFaceTB/SmolLM2-360M"
# model = AutoModelForCausalLM.from_pretrained(model_id).to(device)
# tokenizer = AutoTokenizer.from_pretrained(model_id)

In [10]:
from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,  # Or use load_in_4bit for even more compression
    llm_int8_threshold=6.0
)

model_id = "HuggingFaceTB/SmolLM2-360M"
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config)#.to(device)
tokenizer = AutoTokenizer.from_pretrained(model_id)

`low_cpu_mem_usage` was None, now default to True since model is quantized.


In [12]:
# Do an initial visual assessement by asking some question

In [11]:
test_input = {
    'instruction': 'Explain the physical properties of air.',
 'input': '',
 'output': ''
 }

In [12]:
prompt = formatting_prompts_func(test_input)
print(prompt["messages"])

### Instruction: Explain the physical properties of air.
### Input: 
### Output: 


In [13]:

# Ensure PAD token is set (important for causal models like LLaMA, Mistral, etc.)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Use EOS as PAD

inputs = tokenizer(prompt["messages"], return_tensors="pt",padding=True, truncation=True).to(device)
outputs = model.generate(**inputs,max_length=100)
print(tokenizer.decode(outputs[0],skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


### Instruction: Explain the physical properties of air.
### Input: 
### Output: 
### Constraints: 
### Time Limit: 1.00s
### Memory Limit: 64M
### Problem Description

The air is a very important part of our lives. It is the medium through which we breathe, the medium through which we communicate, and the medium through which we travel. It is also the medium through which we can see, hear,


In [14]:
def formatting_prompts_func2(example):
    text = f"### Instruction: {example['instruction']}\n### Input: {example['input']}\n### Output: {example['output']}"
    return text

In [15]:
# PEFT Config

from peft import LoraConfig

# TODO: Configure LoRA parameters
# r: rank dimension for LoRA update matrices (smaller = more compression)
rank_dimension = 6
# lora_alpha: scaling factor for LoRA layers (higher = stronger adaptation)
lora_alpha = 8
# lora_dropout: dropout probability for LoRA layers (helps prevent overfitting)
lora_dropout = 0.05

peft_config = LoraConfig(
    r=rank_dimension,  # Rank dimension - typically between 4-32
    lora_alpha=lora_alpha,  # LoRA scaling factor - typically 2x rank
    lora_dropout=lora_dropout,  # Dropout probability for LoRA layers
    bias="none",  # Bias type for LoRA. the corresponding biases will be updated during training.
    target_modules=["q_proj", "v_proj"], #"all-linear" ,  # Which modules to apply LoRA to
    task_type="CAUSAL_LM",  # Task type for model architecture
)

In [16]:
from datasets import load_dataset
from trl import SFTConfig, SFTTrainer

# Configure trainer
training_args = SFTConfig(
    output_dir="./sft_smol_book_full",
    # max_steps=1000,
    per_device_train_batch_size=8,
    # learning_rate=5e-5,
    # logging_steps=10,
    # save_steps=100,
    eval_strategy="steps",
    eval_steps=1000,
    #packing=True
)

trainer = SFTTrainer(
    model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    formatting_func=formatting_prompts_func2,
    peft_config=peft_config,
)

Applying formatting function to train dataset:   0%|          | 0/41601 [00:00<?, ? examples/s]

Converting train dataset to ChatML:   0%|          | 0/41601 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/41601 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/41601 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/41601 [00:00<?, ? examples/s]

Applying formatting function to eval dataset:   0%|          | 0/10401 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/10401 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/10401 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/10401 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/10401 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
trainer.train()

In [None]:
import matplotlib.pyplot as plt

# Extract loss and steps from trainer logs
losses = [log["loss"] for log in trainer.state.log_history if "loss" in log]
steps = [log["step"] for log in trainer.state.log_history if "step" in log]

# Plot
plt.figure(figsize=(8, 5))
plt.plot( losses, label="Training Loss")
plt.xlabel("Steps")
plt.ylabel("Loss")
plt.title("Loss vs Steps")
plt.legend()
plt.grid()
plt.show()


In [None]:
trainer

In [None]:
# len(losses)

In [25]:
# len(steps)

In [24]:
# test_input = dataset['train'][0]
# # test_input

In [47]:
# test_input = {
#     'instruction': 'Rewrite the sentence using a simile.',
#  'input': 'The car is very fast.',
#  'output': ''
#  }

In [23]:
# prompt = formatting_prompts_func(test_input)
# prompt

In [22]:

# # Ensure PAD token is set (important for causal models like LLaMA, Mistral, etc.)
# if tokenizer.pad_token is None:
#     tokenizer.pad_token = tokenizer.eos_token  # Use EOS as PAD

# inputs = tokenizer(prompt, return_tensors="pt",padding=True, truncation=True).to(device)
# outputs = model.generate(**inputs,max_length=100)
# print(tokenizer.decode(outputs[0],skip_special_tokens=True))