In [1]:
!pip install -q accelerate peft transformers trl

In [7]:
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    logging,
)
from peft import LoraConfig
from trl import SFTTrainer

In [2]:

# Hub identifiers and output name
dataset_name = "mlabonne/guanaco-llama2-1k"  # loaded as the training split
model_name = "NousResearch/Llama-3.2-1B"  # base checkpoint for model and tokenizer
new_model = "Llama-3.2-1B-chat-finetune"  # directory the trained adapter is saved to

# LoRA adapter settings (forwarded to LoraConfig)
lora_r, lora_alpha, lora_dropout = 64, 16, 0.1

# Trainer settings (forwarded to TrainingArguments)
output_dir = "./results"

# Run length: max_steps = -1 leaves the length to num_train_epochs.
num_train_epochs, max_steps = 1, -1

# Batching
per_device_train_batch_size, gradient_accumulation_steps = 4, 1
group_by_length = True

# Optimisation
learning_rate, weight_decay, max_grad_norm = 2e-4, 1e-3, 0.3
lr_scheduler_type, warmup_ratio = "cosine", 0.03

# Logging / checkpoint cadence, counted in training steps.
# NOTE(review): save_steps = 0 presumably disables periodic checkpoints - confirm
# against the installed transformers version.
logging_steps, save_steps = 25, 0


In [5]:
!pip install tensorboardX



In [6]:

# Load dataset
# Only the "train" split of the dataset named in the configuration cell is used.
dataset = load_dataset(dataset_name, split="train")

# Load model and tokenizer (CPU only)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.to("cpu")
# The generation cache is switched off for training.
model.config.use_cache = False
# NOTE(review): pretraining_tp looks carried over from Llama-2 recipes - confirm it is still needed.
model.config.pretraining_tp = 1

# Tokenizer of the same checkpoint; EOS doubles as the padding token and
# sequences are padded on the right.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# LoRA configuration
# Rank, scaling and dropout come from the configuration cell; bias terms are not adapted.
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

# Training arguments for CPU
# Mixed precision (fp16/bf16) is disabled and the plain PyTorch AdamW optimizer is used.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    logging_steps=logging_steps,
    save_steps=save_steps,
    fp16=False,
    bf16=False,
    optim="adamw_torch",  # Compatible with CPU
    # The TensorBoard cell further down reads results/runs. With report_to="none"
    # nothing is ever written there, so log to TensorBoard: by default the event
    # files go under <output_dir>/runs. Needs tensorboard or tensorboardX, which
    # an earlier cell installs.
    report_to="tensorboard",
)

In [None]:
# Initialize trainer
# SFTTrainer wraps `model` with the LoRA adapter described by `peft_config` and
# trains it on `dataset` using `training_arguments`.
# `processing_class` hands over the tokenizer configured earlier (pad token = EOS,
# right padding). Without it the trainer loads a tokenizer of its own and that
# configuration is never used.
# NOTE: TRL releases older than 0.12 call this parameter `tokenizer`.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    args=training_arguments,
    peft_config=peft_config,
    processing_class=tokenizer,
)

In [None]:


# Train the model
# This is the slow step: the recorded CPU run reports a train_runtime of about 16,800 s.
# It must stay enabled. The following cells save trainer.model and later merge that
# adapter into the base model, so with a fresh kernel and this call commented out
# an untrained adapter would be saved and merged.
trainer.train()

The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
5,1.42
10,1.4368
15,1.6514
20,1.6829
25,1.5544
30,1.7233
35,1.8265
40,1.9741
45,1.9878
50,2.4599


TrainOutput(global_step=250, training_loss=1.690826696395874, metrics={'train_runtime': 16805.6917, 'train_samples_per_second': 0.06, 'train_steps_per_second': 0.015, 'total_flos': 2205408076775424.0, 'train_loss': 1.690826696395874})

In [8]:
# Write trainer.model to the directory named by `new_model`. The reload cell later
# reads the adapter back from this same path via PeftModel.from_pretrained.
trainer.model.save_pretrained(new_model)

In [None]:
!pip install tensorboard

Collecting tensorboard
  Downloading tensorboard-2.19.0-py3-none-any.whl.metadata (1.8 kB)
Collecting absl-py>=0.4 (from tensorboard)
  Downloading absl_py-2.2.2-py3-none-any.whl.metadata (2.6 kB)
Collecting grpcio>=1.48.2 (from tensorboard)
  Downloading grpcio-1.71.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting markdown>=2.6.8 (from tensorboard)
  Downloading Markdown-3.7-py3-none-any.whl.metadata (7.0 kB)
Collecting tensorboard-data-server<0.8.0,>=0.7.0 (from tensorboard)
  Downloading tensorboard_data_server-0.7.2-py3-none-manylinux_2_31_x86_64.whl.metadata (1.1 kB)
Collecting werkzeug>=1.0.1 (from tensorboard)
  Downloading werkzeug-3.1.3-py3-none-any.whl.metadata (3.7 kB)
Downloading tensorboard-2.19.0-py3-none-any.whl (5.5 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m:01[0m
[?25hDownloading absl_py-2.2.2-py3-none-any.wh

In [13]:
# # Free memory before reloading the model (this notebook runs on CPU, so this is RAM, not VRAM)
# del model
# del pipe
# del trainer
# import gc
# gc.collect()
# gc.collect()

In [9]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Everything below runs on the CPU.
device = torch.device("cpu")

# Fresh copy of the base checkpoint in float32 (no fp16 on CPU).
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float32,
    return_dict=True,
    low_cpu_mem_usage=True,
)

# Attach the LoRA adapter stored under `new_model`, fold it into the base
# weights, and place the merged model on `device`.
model = PeftModel.from_pretrained(base_model, new_model).merge_and_unload()
model = model.to(device)

# Tokenizer of the base checkpoint: right-side padding, EOS reused as pad token.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.


In [10]:
from transformers import pipeline

# Only CRITICAL messages from transformers are shown from here on
# (`logging` is the transformers logging module from the import cell).
logging.set_verbosity(logging.CRITICAL)

# Text-generation pipeline around the merged model; prompt plus completion
# are capped at 200 tokens in total.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)

# Wrap the question in the [INST] ... [/INST] template and print the full generated text.
prompt = "What is a large language model?"
result = pipe("<s>[INST] " + prompt + " [/INST]")
print(result[0]["generated_text"])

<s>[INST] What is a large language model? [/INST] A large language model (LLM) is a type of artificial intelligence (AI) model that is trained on a large corpus of text data and is capable of generating human-like text based on the input provided. These models are trained on a vast amount of data, which allows them to learn complex patterns and generate text that is more accurate and coherent than traditional AI models. </s>


In [14]:
# Load the TensorBoard notebook extension, then open the dashboard on the
# event files under results/runs. The dashboard only has data to show if
# training wrote TensorBoard logs to that directory.
%load_ext tensorboard
%tensorboard --logdir results/runs

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 6476), started 0:55:20 ago. (Use '!kill 6476' to kill it.)

In [21]:
from transformers import pipeline
# Only CRITICAL messages from transformers are shown from here on
# (`logging` is the transformers logging module from the import cell).
logging.set_verbosity(logging.CRITICAL)

# Text-generation pipeline around the merged model; prompt plus completion
# are capped at 200 tokens in total.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)

# Wrap the question in the [INST] ... [/INST] template and print the full generated text.
prompt = "What games would you recommend if I liked Undertale?"
result = pipe("<s>[INST] " + prompt + " [/INST]")
print(result[0]["generated_text"])


<s>[INST] What games would you recommend if I liked Undertale? [/INST] Undertale is a game that has a lot of replay value, so I would recommend trying out some of the other games in the series. Some of my favorites include:
- Undertale: A game that is a spin-off of the original game, but still has a lot of the same mechanics and themes.
- The Binding of Isaac: A game that is a roguelike game, which means that it has a lot of random elements and is very difficult.
- Undertale: Another spin-off of the original game, but this time it is a visual novel game, which means that it has a lot of story and character development.
- Undertale: A game that is a mix of the two, with a visual novel style story and a roguelike game style gameplay.
- Undertale: A game that is a mix of the two, with a visual novel style story and a roguel


In [17]:
from peft import PeftModel

# Rebuilds the merged model and tokenizer from scratch on the CPU.
device = torch.device("cpu")

# Fresh copy of the base checkpoint in float32 (no fp16 on CPU).
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float32,
    return_dict=True,
    low_cpu_mem_usage=True,
)

# Attach the LoRA adapter stored under `new_model`, fold it into the base
# weights, and place the merged model on `device`.
model = PeftModel.from_pretrained(base_model, new_model).merge_and_unload()
model = model.to(device)

# Tokenizer of the base checkpoint: right-side padding, EOS reused as pad token.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token


In [18]:
import locale

# Make locale.getpreferredencoding always report UTF-8.
# The replacement keeps the standard signature (do_setlocale=True): a
# zero-argument stand-in would raise TypeError for callers that use
# locale.getpreferredencoding(False).
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"