In [6]:
# Check nvidia driver version
!nvcc --version
print(" ")
print("------------------------------")
print(" ")
# Check if cuda exist
!nvidia-smi


nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2025 NVIDIA Corporation
Built on Wed_Jul_16_20:06:48_Pacific_Daylight_Time_2025
Cuda compilation tools, release 13.0, V13.0.48
Build cuda_13.0.r13.0/compiler.36260728_0
 
------------------------------
 
Sat Oct  4 10:53:08 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.97                 Driver Version: 580.97         CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3050 ...  WDDM  |   00000000:01:00.0 Off |                  N/A |
| N/A   60C    P8              9W /   45W |       0MiB /   4096

In [3]:
import json
import os
import torch

# Report the GPU state *before* importing Unsloth: `import unsloth` raises
# NotImplementedError when it cannot find a supported GPU, which would abort
# this cell before any diagnostic is printed.
print("CUDA available:", torch.cuda.is_available())
if not torch.cuda.is_available():
    # A working `nvidia-smi` together with CUDA being unavailable here usually
    # points at a CPU-only PyTorch build; torch.version.cuda is None in that case.
    print("PyTorch version:", torch.__version__, "| built with CUDA:", torch.version.cuda)

from unsloth import FastLanguageModel
from datasets import Dataset

NotImplementedError: Unsloth currently only works on NVIDIA GPUs and Intel GPUs.

Load Base Model (Qwen3-14B; 4-bit quantization is used when a CUDA GPU is available)

In [None]:
# Options for loading the base checkpoint and its tokenizer through Unsloth.
base_model_options = dict(
    model_name = "unsloth/Qwen3-14B",
    max_seq_length = 2048,
    load_in_4bit = torch.cuda.is_available(),  # 4-bit weights only when CUDA is usable
    load_in_8bit = False,
    full_finetuning = False,
)
model, tokenizer = FastLanguageModel.from_pretrained(**base_model_options)

# Modules that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the loaded model with LoRA adapters; `model` is rebound to the wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r = 32,
    target_modules = lora_target_modules,
    lora_alpha = 32,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
)


Load and prepare dataset

In [None]:
# Read the raw training examples from disk.
# The encoding is pinned to UTF-8: without it, open() uses the platform's
# locale encoding, which can mis-decode (or fail on) non-ASCII characters in a
# UTF-8 JSON file.
with open("sqlite_json_training_dataset_extended.json", "r", encoding="utf-8") as f:
    dataset_file = json.load(f)

def format_prompt(example, eos_token="<|endoftext|>"):
    """Render one training example as a single prompt/completion string.

    Args:
        example: Mapping with an ``'input'`` entry (interpolated as-is) and a
            ``'sql'`` entry (serialized with ``json.dumps``).
        eos_token: Terminator appended after the serialized output. Defaults
            to ``"<|endoftext|>"``; pass another string (for example the
            tokenizer's own end-of-sequence token) to override it.

    Returns:
        A string of the form
        ``"### Input: <input>\\n### Output: <json-encoded sql><eos_token>"``.

    Raises:
        KeyError: If ``example`` lacks the ``'input'`` or ``'sql'`` key.
        TypeError: If ``example['sql']`` is not JSON-serializable.
    """
    return f"### Input: {example['input']}\n### Output: {json.dumps(example['sql'])}{eos_token}"

# Render every raw example into its training string, then wrap the strings in
# a Dataset with a single "text" column.
formatted_data = list(map(format_prompt, dataset_file))
dataset = Dataset.from_dict({"text": formatted_data})

# Show the first rendered example as a quick sanity check.
print("Sample prompt:\n", formatted_data[0])

Fine-tuning

In [None]:
from trl import SFTTrainer, SFTConfig

# Hyper-parameters for the supervised fine-tuning run.
training_args = SFTConfig(
    dataset_text_field = "text",
    per_device_train_batch_size = 1,   # keep small on low-memory machines
    gradient_accumulation_steps = 2,
    max_steps = 30,                    # short demo run; raise for real training
    learning_rate = 2e-4,
    logging_steps = 5,
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 3407,
    report_to = "none",
)

# Bind the model, tokenizer and dataset to the trainer.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    args = training_args,
)

# Run the training loop, then confirm completion.
trainer.train()
print("✅ Training complete")

Save & Export to GGUF

In [None]:
# Directory that receives both the merged model and the GGUF export.
EXPORT_PATH = "./gguf_model"
os.makedirs(EXPORT_PATH, exist_ok=True)  # no error if the directory already exists

# Save the model together with the tokenizer using the "merged_16bit" method.
model.save_pretrained_merged(EXPORT_PATH, tokenizer, save_method="merged_16bit")

# Export a GGUF file into the same directory using the q4_k_s quantization method.
model.save_pretrained_gguf(
    EXPORT_PATH,
    tokenizer,
    quantization_method="q4_k_s"  # NOTE(review): original comment estimated "~2GB"; likely far too low for a 14B model — confirm
)

# List what was written so the exported file names are visible.
print("Files in export dir:", os.listdir(EXPORT_PATH))

Inference with GGUF

In [None]:
from llama_cpp import Llama

# Collect the exported GGUF files in a deterministic order; os.listdir() returns
# entries in arbitrary order, so taking its first element is not reproducible.
gguf_candidates = sorted(f for f in os.listdir(EXPORT_PATH) if f.endswith(".gguf"))
if not gguf_candidates:
    raise FileNotFoundError(
        f"No .gguf file found in {EXPORT_PATH!r}; run the export cell first."
    )

# Prefer the q4_k_s export over any other GGUF file that shares the directory
# (for example an unquantized intermediate); fall back to the first file.
quantized_candidates = [f for f in gguf_candidates if "q4_k_s" in f.lower()]
gguf_file = (quantized_candidates or gguf_candidates)[0]
llm = Llama(model_path=os.path.join(EXPORT_PATH, gguf_file), n_ctx=1024, n_threads=4)

# Query with the same "### Input / ### Output" template used for the training
# text, so the model sees the format it was fine-tuned on. The prompt stops at
# "### Output:" and leaves the rest for the model to generate.
question = "Aggregate all tech_startup names into JSON array"
prompt = f"### Input: {question}\n### Output:"

response = llm(
    prompt,
    max_tokens=256,  # llama-cpp-python defaults to 16 tokens, which truncates the answer
    stop=["<|endoftext|>", "### Input:"],  # end at the training terminator or a new example
)
print("Model output:\n", response["choices"][0]["text"])