# Mexican forest cover Chatbot fine-tuning.

In this notebook, we will see how to fine-tune a custom chatbot (based on a Phi-4-mini model) using a hand-made training dataset.

This is a prompt-completion fine-tuning intended to generate SQL querys

Prerequisite: Create HuggingFace token with permission access to `microsoft/Phi-4-mini-instruct`.

In [1]:
from datasets import Dataset, DatasetDict
import pandas as pd
from huggingface_hub import login
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import transformers
from trl import SFTTrainer
from peft import LoraConfig, AutoPeftModelForCausalLM

Load the custom training dataset, available in this same github project.

In [2]:
excel_file_path = '/usr/workspace/media/training_prompts.xlsx'
df = pd.read_excel(excel_file_path)
hf_dataset = Dataset.from_pandas(df)
single_dataset_dict = DatasetDict({'train': hf_dataset})

In [3]:
single_dataset_dict['train'][0]

{'prompt': 'User request: Muéstrame entidad federativa y superficie cubierta por bosque, ordenada descendente por superficie cubierta por bosque, muestra solo el primer registro\n\nSQL:',
 'completion': 'SELECT\n  entidad_federativa,\n  superficie_cubierta_por_bosque\nFROM \n  superficie_bd.superficie_forestal\nORDER BY\n  superficie_cubierta_por_bosque\nDESC\nLIMIT 1;',
 'system_prompt': 'You are a SQL generator for ClickHouse database. Given a user request in natural language, you will respond with exactly one valid ClikHouse SQL query, nothing else. Use proper table and column names from the schema. Handle aggregations and filtering appropriately.',
 '__index_level_0__': 0}

Download LLM from HuggingFace and set up tokenizer. We'll use a 4bit quantization as i only have 8gb of Vram.

In [None]:
# Hugging Face login
my_token = "hf_xxxxxxx"
login(token=my_token)


# -------------------------------
# Load Phi-4-mini-instruct
# -------------------------------

model_id = 'microsoft/Phi-4-mini-instruct'
tokenizer = AutoTokenizer.from_pretrained(model_id, token=my_token)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    token=my_token
)


# Make sure pad_token exists
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Set up LoRA configurations, datasets and SFT (Supervised Fine-Tuning) training procedure.

In [5]:
# -------------------------------
# LoRA config
# -------------------------------

lora_config = LoraConfig(
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    task_type="CAUSAL_LM",
)

In [6]:
# ----------------------------------------
# Tokenization function (completion mode)
# ----------------------------------------

def tokenize_function(examples):
    prompts = examples["prompt"]
    completions = examples["completion"]
    texts = []
    for prompt, completion in zip(prompts, completions):
        # Concatenate prompt + completion
        text = prompt.strip() + " " + completion.strip()
        texts.append(text)
    return tokenizer(texts, truncation=True, padding="max_length", max_length=512)


single_dataset_dict = single_dataset_dict.map(tokenize_function, batched=True)


Map:   0%|          | 0/275 [00:00<?, ? examples/s]

Start the fine-tuning with 150 training step, which will take ~3 minutes on a GTX 4060 Laptop GPU with 8gb VRAM.

In [7]:
# -------------------------------
# Trainer
# -------------------------------

trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    train_dataset=single_dataset_dict['train'],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps=150,
        learning_rate=2e-4,
        bf16=True,
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit",
        report_to="none",
    ),
    peft_config=lora_config,
)

trainer.train()

Truncating train dataset:   0%|          | 0/275 [00:00<?, ? examples/s]

Step,Training Loss
1,10.7848
2,10.6911
3,10.8988
4,10.2715
5,8.5478
6,6.1164
7,4.2938
8,2.8314
9,2.1403
10,1.3965


TrainOutput(global_step=150, training_loss=0.7226741376519203, metrics={'train_runtime': 467.6573, 'train_samples_per_second': 1.283, 'train_steps_per_second': 0.321, 'total_flos': 5923717282529280.0, 'train_loss': 0.7226741376519203})

In [8]:
# Save only the LoRA adapter + tokenizer
trainer.model.save_pretrained("phi4_mini_lora")
tokenizer.save_pretrained("phi4_mini_lora")

('phi4_mini_lora/tokenizer_config.json',
 'phi4_mini_lora/special_tokens_map.json',
 'phi4_mini_lora/chat_template.jinja',
 'phi4_mini_lora/vocab.json',
 'phi4_mini_lora/merges.txt',
 'phi4_mini_lora/added_tokens.json',
 'phi4_mini_lora/tokenizer.json')

### Test pipeline

In [3]:
#torch.set_float32_matmul_precision('high')

from transformers import AutoTokenizer, BitsAndBytesConfig
from peft import AutoPeftModelForCausalLM
import torch

# -----------------------
# Quantization config (4-bit)
# -----------------------
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,  # use bf16 on 4060
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

# -----------------------
# Load tokenizer + model with LoRA adapter
# -----------------------
tokenizer = AutoTokenizer.from_pretrained("phi4_mini_lora")

model = AutoPeftModelForCausalLM.from_pretrained(
    "phi4_mini_lora",              # path to your LoRA adapter
    quantization_config=bnb_config, # load in 4-bit
    device_map="auto"
)

def nl_to_sql(request: str) -> str:
    """
    Generate a SQL query from a natural language request using the fine-tuned model.
    """
    prompt = f"User request: {request}\nSQL:"
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=128,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    full_output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    sql = full_output.replace(prompt, "").strip()
    return sql

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
sql = nl_to_sql("Muestra la división de la suma de superficie forestal entre la suma de población")
print("Generated SQL:", sql)

Generated SQL: SELECT
  superficie_forestal / población
FROM
  superficie_forestal
JOIN
  superficie_forestal_forestal
  ON superficie_forestal.superficie_forestal_forestal.superficie_forestal_forestal_id = superficie_forestal.superficie_forestal_id
JOIN
  superficie_forestal_forestal.poblacion
  ON superficie_forestal_forestal.poblacion.poblacion_id = superficie_forestal_forestal.poblacion_id
