# Installation and loading model

In [1]:
# %%capture
# !pip install unsloth
# !pip install --force-reinstall --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [2]:
%%capture
!pip install unsloth==2025.2.15 unsloth-zoo==2025.2.7
!rm -rf unsloth_compiled_cache

In [3]:
from google.colab import userdata
from huggingface_hub import login
import os

os.environ['HF_TOKEN'] = 'YOUR_HF_TOKEN'
hf_token = os.environ.get('HF_TOKEN')

login(token=hf_token)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [4]:
from unsloth import FastLanguageModel

max_seq_length = 2048
dtype = None
load_in_4bit = True

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    token = hf_token, # add your HF token
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.2.15: Fast Llama patching. Transformers: 4.48.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.96G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/236 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/53.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [5]:
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth", # True or "unsloth" for very long context
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

Unsloth 2025.2.15 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


# Loading & Formatting the Dataset

In [6]:
train_prompt_style = """You are a SQL generation assistant.
Your task is to generate a syntactically correct SQL query based solely on the provided database schema and natural language question.
Do not include any explanations or additional text—output only the SQL query.

Ensure:
• All table and column names match exactly as provided in the schema.
• The SQL query uses proper syntax for the given database.
• No extra commentary or formatting is added—only the SQL command is output.
---

### Database Schema:
{table_schema}

### Question:
{question}

### SQL Query:
{sql_query}
"""

In [7]:
EOS_TOKEN = tokenizer.eos_token # MUST ADD EOS_TOKEN

def formatting_prompts_func(examples):
    questions = examples["question"]
    sqls = examples["sql"]
    # Define the default table schema
    # default_table_schema = "Table: Houses (houseName VARCHAR(255), location VARCHAR(255), price DECIMAL(10,2), numBath INT, numRoom INT)"
    # Use the provided table schema if available; otherwise, use the default.
    # table_schemas = examples.get("table", [default_table_schema] * len(questions))
    table_schemas = examples.get("table")

    texts = []
    for question, sql, table_schema in zip(questions, sqls, table_schemas):
        text = train_prompt_style.format(
            table_schema=table_schema,
            question=question,
            sql_query=sql
        ) + EOS_TOKEN
        texts.append(text)
    return {"text": texts}

In [8]:
from datasets import load_dataset
# dataset = load_dataset("FreedomIntelligence/medical-o1-reasoning-SFT", "en", split="train[0:100]", trust_remote_code=True)
dataset = load_dataset("wikisql", split="train[0:100]", trust_remote_code=True)
print(dataset.column_names)
dataset = dataset.map(formatting_prompts_func, batched=True)
print(dataset.column_names)

README.md:   0%|          | 0.00/7.80k [00:00<?, ?B/s]

wikisql.py:   0%|          | 0.00/6.57k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/26.2M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/15878 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/8421 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/56355 [00:00<?, ? examples/s]

['phase', 'question', 'table', 'sql']


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

['phase', 'question', 'table', 'sql', 'text']


In [9]:
dataset['text'][0]

"You are a SQL generation assistant.\nYour task is to generate a syntactically correct SQL query based solely on the provided database schema and natural language question.\nDo not include any explanations or additional text—output only the SQL query.\n\nEnsure:\n• All table and column names match exactly as provided in the schema.\n• The SQL query uses proper syntax for the given database.\n• No extra commentary or formatting is added—only the SQL command is output.\n---\n\n### Database Schema:\n{'header': ['State/territory', 'Text/background colour', 'Format', 'Current slogan', 'Current series', 'Notes'], 'page_title': '', 'page_id': '', 'types': ['text', 'text', 'text', 'text', 'text', 'text'], 'id': '1-1000181-1', 'section_title': '', 'caption': '', 'rows': [['Australian Capital Territory', 'blue/white', 'Yaa·nna', 'ACT · CELEBRATION OF A CENTURY 2013', 'YIL·00A', 'Slogan screenprinted on plate'], ['New South Wales', 'black/yellow', 'aa·nn·aa', 'NEW SOUTH WALES', 'BX·99·HI', 'No sl

# Setting Training Arguments

In [10]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        # Use num_train_epochs = 1, warmup_ratio for full training runs!
        warmup_steps=5,
        max_steps=60,
        learning_rate=2e-4,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=10,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        report_to= "none", # Use this for WandB etc
    ),
)

Converting train dataset to ChatML (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

Applying chat template to train dataset (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

Truncating train dataset (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

# Train

In [11]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 100 | Num Epochs = 5
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
10,1.6087
20,0.9115
30,0.5361
40,0.5115
50,0.3639
60,0.1872


# Testing

In [12]:
prompt_style = """You are a real estate agent assistant.
When given a property database schema and an agent's query, your job is to generate a syntactically correct SQL command that retrieves the relevant property information.
Output only the SQL query without any explanations.

### Property Database Schema:
{table_schema}

### Agent's Question:
{question}

### SQL Query:
"""

In [18]:
question = "Give me the name of the houses which value is below a million dolars located in Juriquilla and have at least 1 bathroom and 1 room."
table_schemas = """{
                      "header": ["houseName", "location", "price", "numBath", "numRoom"],
                      "types": ["text", "text", "decimal", "int", "int"],
                      "table name": "Houses"
                   }"""

sql_query = ""  # Assuming you want to generate the SQL query based on the question and table schema

formatted_prompt = prompt_style.format(
    table_schema=table_schemas,
    question=question,
    sql_query=sql_query
)

FastLanguageModel.for_inference(model)
inputs = tokenizer([formatted_prompt], return_tensors="pt").to("cuda")

outputs = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    # pad_token_id=tokenizer.pad_token_id,
    max_new_tokens=1200,
    use_cache=True,
)

import ast

response = tokenizer.batch_decode(outputs)

print(response[0].split("SQL Query:")[1])


{'human_readable': 'SELECT houseName FROM Houses WHERE location = Juriquilla AND numBath >= 1 AND numRoom >= 1 AND price < 1e6', 'sel': 0, 'agg': 0, 'conds': {'column_index': [1, 3, 2], 'operator_index': [0, 0, 1], 'condition': ['Juriquilla', '1', '1e6']}}
<｜end▁of▁sentence｜>


# Saving to float16 for VLLM

We also support saving to float16 directly. Select merged_16bit for float16 or merged_4bit for int4. We also allow lora adapters as a fallback. Use push_to_hub_merged to upload to your Hugging Face account! You can go to https://huggingface.co/settings/tokens for your personal tokens.

In [None]:
# Merge to 16bit
if False: model.save_pretrained_merged("model", tokenizer, save_method="merged_16bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method="merged_16bit", token="") # username/model name

# Merge to 4bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_4bit", token="")

# Just LORA adapters
if False: model.save_pretrained_merged("model", tokenizer, save_method="lora",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method="lora", token="")

Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which might take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 6.0G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 2.68 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


 34%|███▍      | 11/32 [00:00<00:01, 11.86it/s]
We will save to Disk and not RAM now.
100%|██████████| 32/32 [05:45<00:00, 10.79s/it]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model/pytorch_model-00001-of-00004.bin...
Unsloth: Saving model/pytorch_model-00002-of-00004.bin...
Unsloth: Saving model/pytorch_model-00003-of-00004.bin...
Unsloth: Saving model/pytorch_model-00004-of-00004.bin...
Done.
