In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

# Hub identifier of the base checkpoint that is loaded for fine-tuning.
model_name = "meta-llama/Llama-3.2-1B"

# bitsandbytes settings used when the checkpoint is loaded.
bnb_config = BitsAndBytesConfig(
    # Load the weights with 4-bit quantization, using the NormalFloat-4 format.
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    # Double quantization for better memory efficiency.
    bnb_4bit_use_double_quant=True,
    # Computation runs in bfloat16.
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Tokenizer and quantized model both come from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",  # let the loader decide where to place the weights
    quantization_config=bnb_config,
)

In [2]:
from datasets import load_dataset

# Load the "yahma/alpaca-cleaned" instruction dataset by name.
dataset = load_dataset("yahma/alpaca-cleaned")

# Show the available splits together with their columns and row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['output', 'input', 'instruction'],
        num_rows: 51760
    })
})


In [3]:
# Display the tokenizer's end-of-sequence token string.
tokenizer.eos_token

'<|end_of_text|>'

In [4]:
# Display the tokenizer's beginning-of-sequence token string.
tokenizer.bos_token

'<|begin_of_text|>'

In [18]:
# Reuse the EOS token as the padding token so that code which pads batches
# (e.g. the data collator passed to the trainer) has a pad token available.
# NOTE(review): presumably this checkpoint's tokenizer defines no pad token of
# its own -- confirm.
tokenizer.pad_token = tokenizer.eos_token

In [5]:
def format_prompt(instruction, input, output, eos_token="<|end_of_text|>"):
    """Render one Alpaca-style record as a single training string.

    The layout is::

        ### Instruction:
        <instruction>

        ### Input:          (section omitted when ``input`` is empty)
        <input>

        ### Response:
        <output>
        <eos_token>

    Args:
        instruction: Task description.
        input: Optional extra context for the task. An empty value -- or
            ``None`` -- drops the ``### Input:`` section entirely. (The name
            shadows the builtin inside this function; it is kept so that
            existing keyword callers continue to work.)
        output: Reference answer the model should learn to produce.
        eos_token: End-of-sequence marker appended on its own line after the
            response. Defaults to the literal that was previously hard-coded;
            pass ``tokenizer.eos_token`` to stay in sync with the tokenizer.

    Returns:
        The formatted prompt as a ``str``.
    """
    sections = [f"### Instruction:\n{instruction}"]
    # Truthiness instead of len(): same result for strings, and a missing
    # (None) input no longer raises TypeError.
    if input:
        sections.append(f"### Input:\n{input}")
    sections.append(f"### Response:\n{output}\n{eos_token}")
    # Sections are separated by exactly one blank line.
    return "\n\n".join(sections)



def formatting_prompts_func(examples):
    """Turn a batch of raw records into formatted prompt strings.

    Args:
        examples: Mapping with the keys ``'output'``, ``'input'`` and
            ``'instruction'``, each holding a parallel sequence of values
            (the columnar batch that a batched ``map`` call passes in).

    Returns:
        ``{'text': [...]}`` with one ``format_prompt`` result per record.
    """
    # Columns are looked up in the same order as before: output, input, instruction.
    columns = (examples['output'], examples['input'], examples['instruction'])
    return {
        'text': [
            format_prompt(task, context, answer)
            for answer, context, task in zip(*columns)
        ]
    }

# Render every record into a single 'text' prompt. batched=True hands
# formatting_prompts_func whole columns (lists) at a time, which is the shape
# it zips over. The three raw columns are dropped so only 'text' remains.
train_datasets = dataset.map(
    formatting_prompts_func,
    batched=True,
    remove_columns=['instruction', 'input', 'output'],
)
# Show the resulting splits/columns as a quick sanity check.
print(train_datasets)

Map:   0%|          | 0/51760 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 51760
    })
})


In [7]:
# Pull the first formatted training record to eyeball the prompt layout.
example = next(iter(train_datasets['train']))
print(example)

{'text': '### Instruction:\nGive three tips for staying healthy.\n\n### Response:\n1. Eat a balanced and nutritious diet: Make sure your meals are inclusive of a variety of fruits and vegetables, lean protein, whole grains, and healthy fats. This helps to provide your body with the essential nutrients to function at its best and can help prevent chronic diseases.\n\n2. Engage in regular physical activity: Exercise is crucial for maintaining strong bones, muscles, and cardiovascular health. Aim for at least 150 minutes of moderate aerobic exercise or 75 minutes of vigorous exercise each week.\n\n3. Get enough sleep: Getting enough quality sleep is crucial for physical and mental well-being. It helps to regulate mood, improve cognitive function, and supports healthy growth and immune function. Aim for 7-9 hours of sleep each night.\n<|end_of_text|>'}


In [8]:
from peft import get_peft_model, LoraConfig, TaskType

# LoRA adapter definition for causal language modeling.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    # Adapters are attached to the four attention projection layers.
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    # Rank of the low-rank matrices and the LoRA scaling factor.
    r=32,
    lora_alpha=32,
    # Dropout rate applied inside the adapters.
    lora_dropout=0.1,
    # False because the adapters are going to be trained.
    inference_mode=False,
)

# NOTE(review): the base model was loaded in 4-bit; PEFT's quantization guide
# suggests calling prepare_model_for_kbit_training() on such a model before
# wrapping it -- confirm whether that is wanted here.
# NOTE: `model` is rebound to the wrapped model, so re-running this cell would
# hand the already-wrapped model to get_peft_model again.
model = get_peft_model(model, peft_config)

# Report how many parameters are trainable versus the total.
model.print_trainable_parameters()

trainable params: 6,815,744 || all params: 1,242,630,144 || trainable%: 0.5485


In [16]:
from trl import SFTConfig, SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq

# Maximum sequence length handed to the SFT configuration.
max_seq_length = 512

training_args = SFTConfig(
    # --- data ---------------------------------------------------------------
    dataset_text_field='text',        # column produced by the formatting step
    max_seq_length=max_seq_length,
    packing=True,                     # pack several examples into one sequence
    # --- batching / schedule ------------------------------------------------
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,    # 2 x 4 = 8 sequences per optimizer step per device
    num_train_epochs=1,
    # --- optimizer ----------------------------------------------------------
    optim="adamw_8bit",
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    weight_decay=0.01,
    # --- logging / checkpoints ----------------------------------------------
    logging_steps=10,
    output_dir="llama3.2-alpaca-finetuned-4bit",
    save_total_limit=2,
    push_to_hub=False,
    report_to="none",
)

# Supervised fine-tuning trainer over the LoRA-wrapped model and the formatted
# training split.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_datasets['train'],
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
)

  trainer = SFTTrainer(


Generating train split: 0 examples [00:00, ? examples/s]

In [19]:
# Run the fine-tuning loop; the returned TrainOutput is displayed as the cell result.
trainer.train()

Step,Training Loss
10,1.731
20,1.6339
30,1.5867
40,1.5747
50,1.5129
60,1.5608
70,1.5697
80,1.5123
90,1.4952
100,1.4755


TrainOutput(global_step=2107, training_loss=1.4530328819408878, metrics={'train_runtime': 8107.6095, 'train_samples_per_second': 2.08, 'train_steps_per_second': 0.26, 'total_flos': 5.074402420142899e+16, 'train_loss': 1.4530328819408878, 'epoch': 0.999644170323805})

In [20]:
# Display the integer id of the beginning-of-sequence token.
tokenizer.bos_token_id

128000

In [21]:
# Display the integer id of the end-of-sequence token (used below as the stop id for generation).
tokenizer.eos_token_id

128001

In [22]:
# Switch the model to evaluation mode before sampling from it.
model.eval()

# Example task. NOTE: the name `input` shadows the Python builtin for the rest
# of the kernel session; it is kept unchanged so that any cell relying on this
# variable keeps working.
instruction = 'Rewrite the given paragraph in a shorter, easier to understand form.'
input = 'Robot.txt, otherwise known as the Robots Exclusion Protocol, was created to tell search engine sites what not to crawl as they index the web. OpenAI says on its informational page that it honors such files when configured with its own set of do-not-crawl tags, though it also warns that it can take its bots up to 24 hours to recognize an updated robot.txt file.'

# Same layout as the training prompts, but it stops right after the
# "### Response:" header so that the model writes the answer.
template = """### Instruction:
{instruction}

### Input:
{input}

### Response:
"""

prompt = template.format(instruction=instruction, input=input)
inputs = tokenizer(prompt, return_tensors="pt")
with torch.no_grad():
    # Follow the model's own device instead of assuming a bare 'cuda' device.
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=512,  # upper bound on prompt + generated tokens
        eos_token_id=tokenizer.eos_token_id,
        # Passing the pad id explicitly (same value generate() would fall back
        # to) avoids the "Setting `pad_token_id` to `eos_token_id`" warning.
        pad_token_id=tokenizer.eos_token_id,
    )
    # Raw token ids first, then the decoded text without special tokens.
    print(outputs)
    print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


tensor([[128000,  14711,  30151,    512,  59565,   1269,    279,   2728,  14646,
            304,    264,  24210,     11,   8831,    311,   3619,   1376,    382,
          14711,   5688,    512,  44474,   3996,     11,   6062,   3967,    439,
            279,  97146,   1398,   9134,  25590,     11,    574,   3549,    311,
           3371,   2778,   4817,   6732,   1148,    539,    311,  46764,    439,
            814,   1963,    279,   3566,     13,   5377,  15836,   2795,    389,
           1202,  47735,   2199,    430,    433,  49593,   1778,   3626,    994,
          20336,    449,   1202,   1866,    743,    315,    656,  30269,   1824,
          34783,   9681,     11,   3582,    433,   1101,  49140,    430,    433,
            649,   1935,   1202,  50629,    709,    311,    220,   1187,   4207,
            311,  15641,    459,   6177,  12585,   3996,   1052,    382,  14711,
           6075,    512,  44474,   3996,    374,    264,   1052,    430,  10975,
           2778,  21787,   1