In [1]:
%pip install unsloth datasets dotenv huggingface_hub torch

Note: you may need to restart the kernel to use updated packages.


In [1]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!
Standard import failed for UnslothNashMDTrainer: No module named 'UnslothNashMDTrainer'. Using tempfile instead!
==((====))==  Unsloth 2025.8.1: Fast Llama patching. Transformers: 4.55.0.
   \\   /|    NVIDIA A40. Num GPUs = 1. Max memory: 44.339 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 8.6. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.31.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [2]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2025.8.1 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [3]:
from datasets import load_dataset

dataset = load_dataset("jeanmcm/b_risks", split="train")

In [4]:
dataset

Dataset({
    features: ['messages', 'topic'],
    num_rows: 1870
})

In [5]:
def validate_messages_format(messages):
    """
    Validates the format of the messages list and counts interactions.

    Args:
        messages (list): A list of dictionaries representing messages.

    Returns:
        tuple: (bool, int) - True if the format is valid and the number of interactions (pairs of user/assistant messages), False and 0 otherwise.
    """
    if len(messages) % 2 != 0:
        return False, 0  # Length must be even

    user_count = 0
    assistant_count = 0
    for i, message in enumerate(messages):
        if i % 2 == 0:
            if message.get('role') != 'user':
                return False, 0  # Even indexed messages should have 'user' role
            user_count += 1
        else:
            if message.get('role') != 'assistant':
                return False, 0  # Odd indexed messages should have 'assistant' role
            assistant_count += 1

    if user_count == assistant_count:
        return True, user_count  # Return True and the number of interactions
    else:
        return False, 0

def clean_invalid_messages(messages):
    """
    Cleans invalid message sequences by keeping only the valid alternating pairs.

    Args:
        messages (list): A list of dictionaries representing messages.

    Returns:
        list: A list containing only the valid alternating user/assistant message pairs.
    """
    cleaned_messages = []
    for i in range(0, len(messages) - 1, 2):
        if messages[i].get('role') == 'user' and messages[i+1].get('role') == 'assistant':
            cleaned_messages.append(messages[i])
            cleaned_messages.append(messages[i+1])
        else:
            # Stop if the pattern is broken
            break
    return cleaned_messages


valid_dataset = []
invalid_dataset = []

# Iterate through the dataset and validate the messages
for example in dataset:
    is_valid, _ = validate_messages_format(example['messages'])
    if is_valid:
        valid_dataset.append(example)
    else:
        invalid_dataset.append(example)

print(f"Total number of valid examples: {len(valid_dataset)}")
print(f"Total number of invalid examples: {len(invalid_dataset)}")

cleaned_invalid_dataset = []
# Clean the invalid messages and re-validate
for example in invalid_dataset:
    cleaned_messages = clean_invalid_messages(example['messages'])
    is_valid, interactions = validate_messages_format(cleaned_messages)
    if is_valid:
        cleaned_invalid_dataset.append({'messages': cleaned_messages, 'topic': example['topic']})
        # print(f"Cleaned invalid example now valid with {interactions} interactions.")
    # else:
        # print(f"Cleaning failed for example: {example['messages']} -> {cleaned_messages}")


print(f"\nTotal number of cleaned invalid examples that are now valid: {len(cleaned_invalid_dataset)}")

# Combine valid and cleaned invalid datasets
combined_dataset = valid_dataset + cleaned_invalid_dataset

print(f"Total number of examples in the combined dataset: {len(combined_dataset)}")

# Optional: You can now replace the original dataset with the combined one if needed
dataset = combined_dataset

Total number of valid examples: 1771
Total number of invalid examples: 99

Total number of cleaned invalid examples that are now valid: 99
Total number of examples in the combined dataset: 1870


In [8]:
from datasets import Dataset
# Flatten the messages list into pairs of user/assistant interactions with a system message
interaction_pairs_with_system = []
for example in dataset:
    messages = example['messages']
    topic = example['topic'] # Get the topic
    system_message = {"role": "system", "content": f"This conversation is about: {topic}. "} # Create system message based on topic

    # Assuming messages are already validated to be in user/assistant pairs within each example
    for i in range(0, len(messages), 2):
        if i + 1 < len(messages): # Ensure there's a next message for a pair
            # Prepend the system message to the interaction pair
            interaction_pairs_with_system.append([system_message, messages[i], messages[i+1]])

# Now you have a list where each element is a [system_message, user_message, assistant_message] list
print(f"Total number of interaction sets (with system message): {len(interaction_pairs_with_system)}")


dataset = Dataset.from_list([{'messages': interaction_set} for interaction_set in interaction_pairs_with_system])


Total number of interaction sets (with system message): 3392


# Llama 3.1 Template

In [None]:
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1", # change this to the right chat_template name
)

In [14]:
from datasets import Dataset
from unsloth import standardize_sharegpt


# Convert the list of interaction sets back to a Dataset
dataset = Dataset.from_list([{'messages': interaction_set} for interaction_set in interaction_pairs_with_system])

# Adjust the formatting function to handle the new structure
def formatting_prompts_func(examples):
    # Each element in 'examples'['messages'] is now a list [system_message, user_message, assistant_message]
    convos = examples["messages"]
    texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
    return { "text" : texts, }

# Apply the formatting function to the new dataset
dataset = standardize_sharegpt(dataset)
dataset = dataset.map(formatting_prompts_func, batched = True,)

Map:   0%|          | 0/3392 [00:00<?, ? examples/s]

In [15]:
dataset[5]['messages']

[{'content': 'This conversation is about: Lavado de activos y sus métodos. ',
  'role': 'system'},
 {'content': "¿Qué es un 'embudo' en este contexto?", 'role': 'user'},
 {'content': "En este contexto, una 'cuenta de embudo' se refiere a la creación de una estructura financiera compleja diseñada para ocultar el origen ilícito de los fondos. Esta estructura puede incluir múltiples cuentas en diferentes países o entidades financieras, lo que dificulta su rastreo y seguimiento.",
  'role': 'assistant'}]

In [16]:
dataset[5]['text']

"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\nThis conversation is about: Lavado de activos y sus métodos. <|eot_id|><|start_header_id|>user<|end_header_id|>\n\n¿Qué es un 'embudo' en este contexto?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nEn este contexto, una 'cuenta de embudo' se refiere a la creación de una estructura financiera compleja diseñada para ocultar el origen ilícito de los fondos. Esta estructura puede incluir múltiples cuentas en diferentes países o entidades financieras, lo que dificulta su rastreo y seguimiento.<|eot_id|>"

# Alpaca Format

In [8]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN

In [12]:
from unsloth import standardize_sharegpt



def formatting_prompts_func(examples):
    convos = examples["messages"]
    #texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
    texts = [alpaca_prompt.format(convo[0]["content"], convo[1]["content"], convo[2]['content']) + EOS_TOKEN for convo in convos]
    return { "text" : texts, }

# Convert the list of interaction sets back to a Dataset
dataset = standardize_sharegpt(dataset)
dataset = dataset.map(formatting_prompts_func, batched = True,)

Map: 100%|██████████| 3392/3392 [00:00<00:00, 27783.25 examples/s]


# Custom Chat Template

In [9]:
def add_column(example):
  messages = example["messages"]
  messages_without_system = messages[1:]
  example["conversations"] = messages_without_system
  return example

dataset = dataset.map(add_column)

dataset

Map: 100%|██████████| 3392/3392 [00:00<00:00, 13171.49 examples/s]


Dataset({
    features: ['messages', 'conversations'],
    num_rows: 3392
})

In [11]:
chat_template = """Below are some instructions that describe some tasks. Write responses that appropriately complete each request.

### Instruction:
{INPUT}

### Response:
{OUTPUT}"""

from unsloth import apply_chat_template


dataset = apply_chat_template(
    dataset,
    tokenizer=tokenizer,
    chat_template=chat_template,
    # default_system_message = "You are a helpful assistant", << [OPTIONAL]
)

Unsloth: We automatically added an EOS token to stop endless generations.
Map: 100%|██████████| 3392/3392 [00:00<00:00, 20100.65 examples/s]


In [13]:
dataset["conversations"][0]

[{'content': '¿Qué son las UAFE y qué tipos se mencionan en el texto?',
  'role': 'user'},
 {'content': 'Las UAFE (Unidades de Análisis Financiero Especializado) se refieren a técnicas o procedimientos utilizados para analizar y evaluar riesgos financieros. El texto menciona cuatro tipos de UAFE: 1) uso de documentos adulterados para la adquisición de vehículos de alto valor, 2) uso inadecuado de productos financieros en cuentas de personas expuestas políticamente, 3) lavado de activos a través de exportaciones sobreravaloradas y 4) uso de exportaciones de oro para canalizar dinero ilícito.',
  'role': 'assistant'}]

In [14]:
dataset[0]['text'].split('\n')

['<|begin_of_text|>Below are some instructions that describe some tasks. Write responses that appropriately complete each request.',
 '',
 '### Instruction:',
 '¿Qué son las UAFE y qué tipos se mencionan en el texto?',
 '',
 '### Response:',
 'Las UAFE (Unidades de Análisis Financiero Especializado) se refieren a técnicas o procedimientos utilizados para analizar y evaluar riesgos financieros. El texto menciona cuatro tipos de UAFE: 1) uso de documentos adulterados para la adquisición de vehículos de alto valor, 2) uso inadecuado de productos financieros en cuentas de personas expuestas políticamente, 3) lavado de activos a través de exportaciones sobreravaloradas y 4) uso de exportaciones de oro para canalizar dinero ilícito.<|end_of_text|>']

In [15]:
dataset

Dataset({
    features: ['messages', 'conversations', 'text'],
    num_rows: 3392
})

In [16]:
from trl import SFTConfig, SFTTrainer
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    packing = False, # Can make training 5x faster for short sequences.
    args = SFTConfig(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 60,
        learning_rate = 2e-4,
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

Unsloth: Tokenizing ["text"] (num_proc=2): 100%|██████████| 3392/3392 [00:01<00:00, 1873.46 examples/s]


In [17]:
trainer.train_dataset

Dataset({
    features: ['messages', 'conversations', 'text', 'input_ids', 'attention_mask'],
    num_rows: 3392
})

In [18]:
tokenizer.decode(trainer.train_dataset[5]["input_ids"])

"<|begin_of_text|><|begin_of_text|>Below are some instructions that describe some tasks. Write responses that appropriately complete each request.\n\n### Instruction:\n¿Qué es un 'embudo' en este contexto?\n\n### Response:\nEn este contexto, una 'cuenta de embudo' se refiere a la creación de una estructura financiera compleja diseñada para ocultar el origen ilícito de los fondos. Esta estructura puede incluir múltiples cuentas en diferentes países o entidades financieras, lo que dificulta su rastreo y seguimiento.<|end_of_text|>"

In [19]:
# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA A40. Max memory = 44.339 GB.
7.137 GB of memory reserved.


In [20]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 3,392 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040 of 8,072,204,288 (0.52% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,2.4516
2,2.7027
3,2.2288
4,2.3752
5,2.3122
6,2.0164
7,2.0831
8,1.8823
9,1.6819
10,1.9536


In [21]:
# @title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

101.2224 seconds used for training.
1.69 minutes used for training.
Peak reserved memory = 7.137 GB.
Peak reserved memory for training = 0.0 GB.
Peak reserved memory % of max memory = 16.096 %.
Peak reserved memory for training % of max memory = 0.0 %.


# Inference

In [22]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
messages = [
        {
        "role": "user",
        "content": "¿Qué ocurre con un estafador que paga por dos vehículos mediante transferencias bancarias y luego solicita un crédito de seis meses para la compra de otros dos vehículos?",
    }
     ],
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True,
    return_tensors = "pt",
).to("cuda")


from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt=True)
_ = model.generate(inputs, streamer = text_streamer, max_new_tokens = 128,pad_token_id = tokenizer.eos_token_id)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


El estafador utiliza transferencias bancarias para pagar por dos vehículos, luego solicita un crédito de seis meses para la compra de otros dos vehículos. Esta práctica sugiere que el estafador tiene recursos financieros disponibles, lo cual podría indicar que es una persona con capacidad económica y recursos financieros significativos.<|end_of_text|>


In [33]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
messages = [                    # Change below!
    {"role": "user", "content": "hola :D"},
    {"role":"assistant", "content":"Hola, en que puedo ayudarte?"},
    {"role": "user", "content": "cuentame un chiste"},
    
]
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True,
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(input_ids, streamer = text_streamer, max_new_tokens = 128, pad_token_id = tokenizer.eos_token_id)

¿Qué 

ocurre si un chihuahua se come una hamburguesa? Bueno, es un chihuahua, ¡así que no tiene importancia!<|end_of_text|>


In [25]:
from dotenv import load_dotenv
load_dotenv()

True

In [26]:
import os

hf_token = os.environ.get("HUGGING_FACE_KEY")
model_name ="jeanmcm/llama3.1-b_risks-lora"

In [27]:


#model.save_pretrained("lora_model")  # Local saving
#tokenizer.save_pretrained("lora_model")
model.push_to_hub(model_name, token = hf_token) # Online saving
# tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            
[A
Processing Files (0 / 1)                :   0%|          |  558kB /  168MB,  698kB/s  
Processing Files (0 / 1)                :   1%|          | 1.12MB /  168MB, 1.12MB/s  
Processing Files (0 / 1)                :   3%|▎         | 5.02MB /  168MB, 4.19MB/s  
Processing Files (0 / 1)                :  11%|█         | 17.9MB /  168MB, 12.8MB/s  
Processing Files (0 / 1)                :  36%|███▌      | 59.7MB /  168MB, 37.3MB/s  
Processing Files (0 / 1)                :  36%|███▌      | 60.8MB /  168MB, 33.8MB/s  
Processing Files (0 / 1)                :  46%|████▌     | 77.0MB /  168MB, 38.5MB/s  
Processing Files (0 / 1)                :  71%|███████   |  119MB /  168MB, 54.3MB/s  
Processing Files (0 / 1)                :  99%|█████████▉|  166MB /  168MB, 69.1MB/s  
Processing Files (0 / 1)                : 100%|█████████▉|  167MB /  168MB, 64.4MB/s  
[A
[A
[A
Processing Files (0 / 1)       

Saved model to https://huggingface.co/jeanmcm/llama3.1-b_risks-lora


In [28]:
# Save to 8bit Q8_0
# if False: model.save_pretrained_gguf(model_name, tokenizer,)
# Remember to go to https://huggingface.co/settings/tokens for a token!
# And change hf to your username!
# if False: model.push_to_hub_gguf(model_name, tokenizer, token = hf_token)

# Save to 16bit GGUF
# if False: model.save_pretrained_gguf(model_name, tokenizer, quantization_method = "f16")
# if False: model.push_to_hub_gguf(model_name, tokenizer, quantization_method = "f16", )

# Save to q4_k_m GGUF
model.save_pretrained_gguf(model_name, tokenizer, quantization_method = "q4_k_m")
#if True: model.push_to_hub_gguf(model_name, tokenizer, quantization_method = "q4_k_m", token = hf_token)


Unsloth: ##### The current model auto adds a BOS token.
Unsloth: ##### Your chat template has a BOS token. We shall remove it temporarily.


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 312.31 out of 503.51 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 32/32 [00:00<00:00, 33.97it/s]


Unsloth: Saving tokenizer... Done.
Done.


Unsloth: Converting llama model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: [1] Converting model at jeanmcm/llama3.1-b_risks-lora into bf16 GGUF format.
The output location will be /workspace/fine-tunning-models/jeanmcm/llama3.1-b_risks-lora/unsloth.BF16.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: llama3.1-b_risks-lora
INFO:hf-to-gguf:Model architecture: LlamaForCausalLM
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope_freqs.weight,           torch.float32 --> F32, shape = {64}
INFO:hf-to-gguf:gguf: loading model weight map from 'model.safetensors.index.json

Unsloth: ##### The current model auto adds a BOS token.
Unsloth: ##### We removed it in GGUF's chat template for you.


Unsloth: Conversion completed! Output location: /workspace/fine-tunning-models/jeanmcm/llama3.1-b_risks-lora/unsloth.Q4_K_M.gguf
Unsloth: Saved Ollama Modelfile to jeanmcm/llama3.1-b_risks-lora/Modelfile


In [None]:
model_file = tokenizer._ollama_modelfile

model_file = model_file.replace("{__FILE_LOCATION__}","./gemma-3N-finetune.Q8_0.gguf")
print(model_file)