# Pre-training example using Unsloth 
This is a very basic example of training Llama 3.2 using Unsloth. It trains on plain text only, so it performs continued pre-training.


In [None]:
# Print the NVIDIA GPU/driver status via the nvidia-smi command-line tool
!nvidia-smi

In [2]:
# Install unsloth
!pip install unsloth
# Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

Found existing installation: unsloth 2024.12.12
Uninstalling unsloth-2024.12.12:
  Successfully uninstalled unsloth-2024.12.12
Collecting git+https://github.com/unslothai/unsloth.git
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-req-build-o12zjxia
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-req-build-o12zjxia
  Resolved https://github.com/unslothai/unsloth.git to commit 87f5bffc45a8af7f23a41650b30858e097b86418
  Installing build dependencies ... [?2done
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hBuilding wheels for collected packages: unsloth
  Building wheel for unsloth (pyproject.toml) ... [?25ldone
[?25h  Created wheel for unsloth: filename=unsloth-2024.12.12-py3-none-any.whl size=175166 sha256=25bef00eecf2b779e977f2064969cfac03e12758fede0f086487cc0ba60554e8
  Stored in directory: /tmp/pip-ephem-wheel-cache-dm21wyr9/wheels/

In [3]:
# Install WandB (weight and biases) for nice graphs and result tracking
!pip install wandb 



In [2]:
# Log in to Weights & Biases (W&B) using its command-line tool
!wandb login


[34m[1mwandb[0m: Currently logged in as: [33mjoakim_eriksson[0m ([33mjoakim_eriksson-rise-research-institutes-of-sweden[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
import wandb
# Initialize the W&B run used for tracking this training session.
# The config below is only metadata shown in the W&B UI, so it must mirror the
# hyperparameters actually passed to UnslothTrainingArguments further down;
# otherwise the tracked run misreports how the model was trained.
wandb.init(
    project="contiki-llama-pretraining",
    name="llama-3.2-contiki-run",
    config={
        "model": "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
        "learning_rate": 5e-5,
        "embedding_learning_rate": 5e-6,
        "batch_size": 4,
        "gradient_accumulation_steps": 8,
        "num_train_epochs": 1,
    }
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mjoakim_eriksson[0m ([33mjoakim_eriksson-rise-research-institutes-of-sweden[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [16]:
import json
from pathlib import Path
from datasets import Dataset

# Load all datasets from the dataset directory.
# Each *.json file is expected to hold a list of entries with a 'type' and an
# 'output' field; only entries whose type is allowed are kept, as plain text.
dataset_dir = Path('dataset')
allowed_types = ('repository_file', 'pdf_text')
all_data = []
# sorted() makes the load order (and thus the dataset order) deterministic;
# glob() alone yields files in arbitrary, filesystem-dependent order.
for file in sorted(dataset_dir.glob('*.json')):
    print(f"Loading file {file}")
    with open(file, 'r', encoding='utf-8') as f:
        jsdata = json.load(f)
    # .get() so that entries without a 'type' are skipped instead of raising KeyError
    data = [{'text': entry['output']}
            for entry in jsdata
            if entry.get('type') in allowed_types]
    print(f"  found {len(data)} entries.")
    all_data.extend(data)

print(f"Loaded {len(all_data)} examples from datasets")
if not all_data:
    # Fail early with a clear message rather than deep inside the split/trainer
    raise ValueError(f"No usable entries found in {dataset_dir.resolve()}")
wandb.log({"dataset_size": len(all_data)})

# Convert to HuggingFace dataset
dataset = Dataset.from_list(all_data)
# Keep the split instead of discarding it, and seed it so it is reproducible.
# NOTE: `dataset` intentionally stays the full dataset because the following
# cells use it; train on dataset_splits["train"] to actually hold out the test set.
dataset_splits = dataset.train_test_split(test_size=0.01, seed=3407)
dataset_splits

Loading file dataset/pdf_pretraining_dataset.json
  found 106 entries.
Loading file dataset/pretraining_dataset.json
  found 2811 entries.
Loaded 2917 examples from datasets


DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 2887
    })
    test: Dataset({
        features: ['text'],
        num_rows: 30
    })
})

In [17]:
from transformers import AutoTokenizer, TrainingArguments
# Preferred checkpoint to load (model and tokenizer)
model_name = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit"

# Fallback checkpoints, used if the preferred one cannot be loaded
backup_models = ["unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit"]

from unsloth import FastLanguageModel
import torch
# Maximum sequence length in tokens. Choose any! Unsloth auto-supports RoPE scaling internally.
max_seq_length = 2048
# Use 4-bit quantization to reduce memory usage. Can be False.
load_in_4bit = True

def try_load_model(model_names, max_seq_length=2048, load_in_4bit=True):
    """Load the first model in ``model_names`` that loads successfully.

    Each name is tried in order with ``FastLanguageModel.from_pretrained``.
    A failure is printed and logged to W&B, then the next name is tried.

    Args:
        model_names: Iterable of model names/paths, in order of preference.
        max_seq_length: Maximum sequence length passed to the loader.
        load_in_4bit: Whether to load the weights 4-bit quantized.

    Returns:
        A ``(model, tokenizer)`` tuple for the first model that loaded.

    Raises:
        RuntimeError: If none of the models could be loaded. The message lists
            the error of every attempt.
    """
    errors = []
    for name in model_names:
        print(f"Attempting to load {name}...")
        try:
            # The loader returns its own tokenizer, so no separate
            # AutoTokenizer load is needed (it was overwritten here anyway).
            model, tokenizer = FastLanguageModel.from_pretrained(
                model_name=name,
                max_seq_length=max_seq_length,
                dtype=None,
                load_in_4bit=load_in_4bit,
            )
        except Exception as e:
            # Best effort: record the failure and fall through to the next candidate
            print(f"Failed to load {name}: {str(e)}")
            wandb.log({"model_load_error": {"model": name, "error": str(e)}})
            errors.append(f"{name}: {e}")
            continue

        # Only fall back to EOS as padding when the tokenizer defines no pad token
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        print(f"Successfully loaded model: {name}")
        wandb.log({"model_loaded": name})
        return model, tokenizer

    details = "; ".join(errors)
    raise RuntimeError(
        f"Failed to load any model: {details}" if details else "Failed to load any model"
    )

# Attempt the preferred model first, then the backups, in that order
model, tokenizer = try_load_model([model_name, *backup_models])

Attempting to load unsloth/Llama-3.2-3B-Instruct-bnb-4bit...
==((====))==  Unsloth 2024.12.12: Fast Llama patching. Transformers: 4.47.1.
   \\   /|    GPU: NVIDIA GeForce RTX 3090. Max memory: 24.0 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Successfully loaded model: unsloth/Llama-3.2-3B-Instruct-bnb-4bit


In [18]:
# Wrap the loaded model with LoRA adapters (settings taken from the Unsloth
# continued-pretraining example).
model = FastLanguageModel.get_peft_model(
    model,
    r=128,  # LoRA rank: any number > 0; suggested 8, 16, 32, 64, 128
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
        # Added for continual pretraining
        "embed_tokens", "lm_head",
    ],
    lora_alpha=32,
    lora_dropout=0,   # any value is supported, but 0 is optimized
    bias="none",      # any value is supported, but "none" is optimized
    # "unsloth" uses 30% less VRAM and fits 2x larger batch sizes;
    # use True or "unsloth" for very long context
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=True,    # rank-stabilized LoRA
    loftq_config=None,  # LoftQ is supported as well
)

Unsloth: Offloading input_embeddings to disk to save VRAM


  offloaded_W = torch.load(filename, map_location = "cpu", mmap = True)


Unsloth: Offloading output_embeddings to disk to save VRAM


Unsloth 2024.12.12 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


Unsloth: Training embed_tokens in mixed precision to save VRAM
Unsloth: Training lm_head in mixed precision to save VRAM


In [19]:
# End-of-sequence token string of the loaded tokenizer; appended to every training text below
EOS_TOKEN = tokenizer.eos_token
def formatting_prompts_func(examples):
    """Terminate every text in a batch with the EOS token.

    Args:
        examples: Batch mapping with a "text" key holding a list of strings.

    Returns:
        A dict with a "text" list in which each string ends with ``EOS_TOKEN``.
        Strings that already end with it are returned unchanged, so mapping the
        dataset again (e.g. re-running the cell) does not stack EOS tokens.
    """
    return {
        "text": [
            text if text.endswith(EOS_TOKEN) else text + EOS_TOKEN
            for text in examples["text"]
        ]
    }
# Apply the EOS formatting to the whole dataset, one batch at a time
dataset = dataset.map(function=formatting_prompts_func, batched=True)

Map: 100%|███████████████████████████████████████████████████████████████████████████████████████| 2917/2917 [00:00<00:00, 69927.84 examples/s]


In [20]:
# Preview the first five texts, each preceded by a separator line
for sample_text in dataset[:5]["text"]:
    print("=========================", sample_text, sep="\n")

Digital Comprehensive Summaries of Uppsala Dissertations

from the Faculty of Science and Technology 2335

Scalable and Interoperable Low-Power

Internet of Things Networks

JOAKIM ERIKSSON

ACTA UNIVERSITATIS

UPSALIENSIS

2023

ISSN 1651-6214

ISBN 978-91-513-1951-3

urn:nbn:se:uu:diva-513926<|eot_id|>
Dissertation presented at Uppsala University to be publicly examined in Häggsalen,

Ångströmlaboratoriet, Lägerhyddsvägen 1, Uppsala, Friday, 15 December 2023 at 13:15 for

the degree of Doctor of Philosophy. The examination will be conducted in English. Faculty

examiner: Professor Leo Selavo (University of Latvia ).

Abstract

Eriksson, J. 2023. Scalable and Interoperable Low-Power Internet of Things Networks.

Digital Comprehensive Summaries of Uppsala Dissertations from the Faculty of Science and

Technology 2335. 48 pp. Uppsala: Acta Universitatis Upsaliensis. ISBN 978-91-513-1951-3.

Internet of Things (IoT) is the concept of connecting devices to the Internet. IoT devices can be

# Set up the Unsloth Trainer


In [21]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from unsloth import UnslothTrainer, UnslothTrainingArguments

# Build the trainer: it trains `model` on the "text" column of `dataset`.
trainer = UnslothTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=8,
    args=UnslothTrainingArguments(
        # 4 sequences per device x 8 accumulation steps per optimizer update
        per_device_train_batch_size=4,
        gradient_accumulation_steps=8,
        # One pass over the data, with the first 10% of steps used as warmup
        warmup_ratio=0.1,
        num_train_epochs=1,
        # The embedding learning rate is 10x smaller than the main one
        learning_rate=5e-5,
        embedding_learning_rate=5e-6,
        # Use bf16 when the GPU supports it, otherwise fall back to fp16
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.00,
        lr_scheduler_type="cosine",
        seed=3407,
        output_dir="outputs",
        report_to="wandb",
        run_name="llama-3.2-contiki-pretrain-run",
    ),
)

Map (num_proc=8): 100%|████████████████████████████████████████████████████████████████████████████| 2917/2917 [00:08<00:00, 332.71 examples/s]


In [22]:
#@title Show current memory stats
# Snapshot taken before training; byte counts are converted to GiB (1024**3 bytes)
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
max_memory = round(gpu_stats.total_memory / 1024**3, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA GeForce RTX 3090. Max memory = 24.0 GB.
6.578 GB of memory reserved.


In [23]:
# Run the training; the returned stats (incl. metrics such as 'train_runtime') are reported below
trainer_stats = trainer.train()


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 2,917 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 8
\        /    Total batch size = 32 | Total steps = 91
 "-____-"     Number of trainable parameters = 982,515,712


Step,Training Loss
1,0.8689
2,0.8937
3,0.8076
4,0.8386
5,0.7703
6,0.8994
7,0.815
8,0.7383
9,0.7351
10,0.6664


In [25]:
import torch
#@title Show final memory and time stats
# Peak reserved GPU memory in GiB, and how much of it was added on top of the
# snapshot taken before training (start_gpu_memory)
used_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
# Both figures as a percentage of the total device memory
used_percentage, lora_percentage = (
    round(amount / max_memory * 100, 3)
    for amount in (used_memory, used_memory_for_lora)
)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

2206.4137 seconds used for training.
36.77 minutes used for training.
Peak reserved memory = 12.727 GB.
Peak reserved memory for training = 6.149 GB.
Peak reserved memory % of max memory = 53.029 %.
Peak reserved memory for training % of max memory = 25.621 %.


In [26]:
# Save the trained model to the local directory "contiki_llama32_pretrain_model_final".
# NOTE(review): nothing in this cell explicitly uploads the model to W&B — confirm
# whether that is expected to happen through the trainer's W&B integration.
trainer.save_model("contiki_llama32_pretrain_model_final")

# Test inference with the model(s)

In [45]:
# Select which model to run inference with:
#   "trained" - the in-memory model held by `trainer` (the training cells must have run)
#   "saved"   - the checkpoint saved to "contiki_llama32_pretrain_model_final"
#   "base"    - the original base model, useful as a comparison baseline
MODEL_SOURCE = "base"

from unsloth import FastLanguageModel

# Checkpoint name/path for every source that is loaded from disk or the hub
_CHECKPOINTS = {
    "saved": "contiki_llama32_pretrain_model_final",
    "base": "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
}

if MODEL_SOURCE == "trained":
    model = trainer.model
elif MODEL_SOURCE in _CHECKPOINTS:
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=_CHECKPOINTS[MODEL_SOURCE],
        max_seq_length=2048,
        dtype=None,
        load_in_4bit=True,
    )
else:
    raise ValueError(
        f"Unknown MODEL_SOURCE {MODEL_SOURCE!r}; expected 'trained', 'saved' or 'base'"
    )

# Switch the selected model to Unsloth's inference mode
inference_model = FastLanguageModel.for_inference(model)


==((====))==  Unsloth 2024.12.12: Fast Llama patching. Transformers: 4.47.1.
   \\   /|    GPU: NVIDIA GeForce RTX 3090. Max memory: 24.0 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [48]:
from transformers import TextIteratorStreamer
from threading import Thread
# Stream generated text from the model and print it wrapped at a fixed width.
text_streamer = TextIteratorStreamer(tokenizer)
import textwrap
max_print_width = 100

# Prompt style: True = plain-text completion ("fill in"), False = instruction style
use_fill_in_prompt = True

if use_fill_in_prompt:
    # NOTE(review): this prompt ends with a newline followed by spaces, which
    # the model continues from — confirm that this is intended.
    inputs = tokenizer(
    [
        """The contributors of Contiki-NG are
         """
    ]*1, return_tensors = "pt").to("cuda")
else:
    inputs = tokenizer(
    [
    """ Based on given instruction and context, generate an appropriate response. You are a code co-pilot helping out answering on Contiki-NG related questions.
### Instruction:
Write a hello world program in Contiki-NG style - with a process and an e-timer.
### Context:
Show a code snippet with good comments.
### Response:
"""
    ]*1, return_tensors = "pt").to("cuda")


generation_kwargs = dict(
    inputs,
    streamer = text_streamer,
    max_new_tokens = 256,
    use_cache = True,
)
# generate() runs in a worker thread so that this cell can consume the
# streamer and print chunks while generation is still in progress.
thread = Thread(target = inference_model.generate, kwargs = generation_kwargs)
thread.start()

# `length` tracks the number of characters printed on the current output line
length = 0
for j, new_text in enumerate(text_streamer):
    if j == 0:
        # The first chunk is wrapped as a whole
        wrapped_lines = textwrap.wrap(new_text, width = max_print_width)
        # textwrap.wrap returns [] for empty/whitespace-only text; avoid IndexError
        length = len(wrapped_lines[-1]) if wrapped_lines else 0
        print("\n".join(wrapped_lines), end = "")
    else:
        length += len(new_text)
        if length >= max_print_width:
            # Start a new line; the chunk printed next is already on that line,
            # so it has to be counted towards the new line's length.
            length = len(new_text)
            print()
        print(new_text, end = "")

# The streamer is exhausted once generation has finished; join the worker
# so that no thread is left dangling after the cell completes.
thread.join()

<|begin_of_text|>The contributors of Contiki-NG are Theodoros Kasapidis
 Antonios G. 
Papadopoulos
          Georgios P. Papadopoulos
M. Papadopoulosantinos 
Vasilios K. 
Papadopoulos
 Andreas K. Papadopoulos
          Georgios D. Papadopoulos
P.        Georgios 
Papadopoulos
Vasilios K. Papadopoulos
Andreas K. Papadopoulos
 Georgios D. 
Papadopoulos
          Georgios P. Papadopoulos
K. Papadopoulosios 
K.        Andreas 
Papadopoulos
Georgios D. Papadopoulos
 Georgios P. Papadopoulos
K.        Vasilios 
Papadopoulos
Andreas K. Papadopoulos
          Georgios D. Papadopoulos
P.        Georgios 
Papadopoulos
Vasilios K. Papadopoulos
          Andreas K. Papadopoulos
Georgios D. 
Papadopoulos
Papadopoulosorgios P. 
Vasilios K. Papadopoulos
          Andreas K. 
Papadopoulos
          Georgios D. Papadopoulos
Papadopoulosorgios P. 
Vasilios K