# 🧠 DeepSeek Text→NoSQL Fine-Tuning + GGUF Export (LoRA Edition)
### Author: Oshinit
### Workflow:
1. Load dataset (`input` + `sql`)
2. Apply parameter-efficient fine-tuning with LoRA
3. Save LoRA adapter and merged FP16 model
4. Convert to GGUF
5. Quantize (Q4_K_M, Q5_K_M, Q8_0)


In [None]:
# ---------- GPU / memory heuristics ----------
import torch

# Probe CUDA: report whether a GPU is usable and, if so, the name and
# whole-GiB VRAM of device 0. Defaults describe the "no GPU" case.
gpu_name, gpu_total_gb = None, 0
cuda_available = torch.cuda.is_available()
if cuda_available:
    device_props = torch.cuda.get_device_properties(0)
    gpu_name = device_props.name
    # Truncate (not round) to whole GiB.
    gpu_total_gb = int(device_props.total_memory / (1024 ** 3))
print(f"CUDA available: {cuda_available}, GPU: {gpu_name}, VRAM (GB): {gpu_total_gb}")
print(f"CUDA version: {torch.version.cuda}")  # CUDA version reported by torch

In [None]:
# Environment check: list the packages visible to the shell's `pip`.
!pip list
# Scratch commands kept for reference (disabled):
# # !pip install datasets
# !pwd

In [1]:
# Standard imports and workspace configuration
import os
from pathlib import Path
import time
import json
import subprocess

# --- Model ---
BASE_MODEL = "deepseek-ai/deepseek-coder-1.3b-base"   # change if you have a local path

# --- Workspace layout: every path below hangs off WORKSPACE ---
WORKSPACE = Path("/workspace")
DATA_PATH = WORKSPACE / "sqlite_json_training_dataset_extended.json"
OUTPUT_DIR = WORKSPACE / "outputs" / "final_model_fp16"
MERGED_OUTPUT_DIR = WORKSPACE / "outputs" / "merged_model_fp16"
GGUF_EXPORT_DIR = WORKSPACE / "gguf_exports"
LLAMA_CPP_PATH = WORKSPACE / "llama.cpp"

# --- GGUF quantization variants to export ---
QUANT_METHODS = ["q4_k_m", "q5_k_m", "q8_0"]

# Create the output directories up front (LLAMA_CPP_PATH is not created here).
_output_dirs = {
    "OUTPUT_DIR": OUTPUT_DIR,
    "MERGED_OUTPUT_DIR": MERGED_OUTPUT_DIR,
    "GGUF_EXPORT_DIR": GGUF_EXPORT_DIR,
}
for _dir in _output_dirs.values():
    _dir.mkdir(parents=True, exist_ok=True)

print("Workspace paths:")
for _label, _dir in _output_dirs.items():
    print(f" {_label} =", _dir)

Workspace paths:
 OUTPUT_DIR = /workspace/outputs/final_model_fp16
 MERGED_OUTPUT_DIR = /workspace/outputs/merged_model_fp16
 GGUF_EXPORT_DIR = /workspace/gguf_exports


In [2]:
# Load dataset (expects json array of objects with 'input' and 'sql' keys)
from datasets import Dataset

def load_text2sql_dataset(data_path: Path):
    """Build the training dataset from a JSON file of instruction/SQL pairs.

    The file must contain a JSON array of objects with ``input`` and ``sql``
    keys. Each object becomes one row with a single ``text`` field laid out as
    an "### Instruction" / "### Response" prompt.

    Args:
        data_path: Location of the JSON file.

    Returns:
        A ``Dataset`` with one ``text`` column.

    Raises:
        FileNotFoundError: If ``data_path`` does not exist.
    """
    if not data_path.exists():
        raise FileNotFoundError(f"Dataset not found: {data_path}")

    records = json.loads(data_path.read_text(encoding="utf-8"))

    samples = []
    for record in records:
        instruction, response = record["input"], record["sql"]
        samples.append({"text": f"### Instruction:\n{instruction}\n### Response:\n{response}"})

    ds = Dataset.from_list(samples)
    print(f"Loaded {len(ds)} samples.")
    return ds

# Build the training dataset from DATA_PATH (raises FileNotFoundError if the file is missing).
dataset = load_text2sql_dataset(DATA_PATH)
dataset[0]  # show first sample for sanity

Loaded 999 samples.


{'text': '### Instruction:\nFind invalid JSON rows\n### Response:\nSELECT rowid, row_data FROM nosql_store WHERE json_valid(row_data)=0;'}

In [3]:
# Model & tokenizer setup, LoRA config, and tokenization
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from peft import LoraConfig, get_peft_model, PeftModel

# Load tokenizer; fall back to EOS as the pad token when the checkpoint defines none.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load base model (FP16 with automatic device placement on GPU, FP32 on CPU)
device_map = "auto" if torch.cuda.is_available() else None
dtype = torch.float16 if torch.cuda.is_available() else torch.float32

print(f"Loading base model {BASE_MODEL} with dtype={dtype} device_map={device_map} ...")
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    # `torch_dtype` is deprecated in the installed transformers ("Use `dtype` instead!").
    dtype=dtype,
    device_map=device_map,
    # NOTE(review): trust_remote_code=True allows code shipped with the checkpoint
    # to run locally — confirm it is actually required for this model.
    trust_remote_code=True,
)

# Setup LoRA config: rank-8 adapters on the attention query/value projections only.
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

# Wrap model with PEFT/LoRA BEFORE training
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()
print("✅ DeepSeek LLM Loaded with LoRA!")

`torch_dtype` is deprecated! Use `dtype` instead!


Loading base model deepseek-ai/deepseek-coder-1.3b-base with dtype=torch.float16 device_map=auto ...
trainable params: 1,572,864 || all params: 1,348,044,800 || trainable%: 0.1167
done


In [13]:
# Tokenize dataset
def tokenize_fn(examples):
    """Tokenize a batch of prompt strings, truncating each to at most 512 tokens.

    No padding is applied here. The training data collator pads each batch to
    its own longest sequence, so padding every short sample to 512 tokens up
    front would only make each training step process mostly padding.

    Args:
        examples: A batch from ``dataset.map(..., batched=True)`` with a ``text`` field.

    Returns:
        The tokenizer output (``input_ids`` and ``attention_mask``) for the batch.
    """
    return tokenizer(examples["text"], truncation=True, max_length=512)

tokenized = dataset.map(tokenize_fn, batched=True)
print(tokenized)

Map:   0%|          | 0/999 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'input_ids', 'attention_mask'],
    num_rows: 999
})


In [None]:
# Training - Trainer setup
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Effective batch size is 1 sample x 8 accumulation steps = 8 samples per
# optimizer step (per device).
training_args = TrainingArguments(
    output_dir=str(OUTPUT_DIR),
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    num_train_epochs=0.5,   # adjust as needed
    learning_rate=3e-4,
    weight_decay=0.01,
    logging_dir=str(WORKSPACE / "logs"),
    logging_steps=50,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    save_strategy="epoch",
    push_to_hub=False,
)

# Causal-LM collation (mlm=False): no masked-language-model objective.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized,
    # `tokenizer=` is deprecated on Trainer and removed in transformers v5;
    # `processing_class` is its replacement.
    processing_class=tokenizer,
    data_collator=data_collator,
)

print("Trainer initialized. Starting training...")
t0 = time.time()
trainer_stats = trainer.train()
t1 = time.time()
print(f"Training complete ({t1 - t0:.1f}s).")
print("Trainer stats:", trainer_stats)


In [None]:
def generate_prediction(review_text):
    """Generate a SQL completion for a natural-language request.

    The request is wrapped in the same "### Instruction / ### Response"
    template used to build the training samples, so inference matches what the
    model was fine-tuned on.

    Args:
        review_text: The natural-language request (parameter name kept for
            backward compatibility).

    Returns:
        The decoded text generated after the prompt, with special tokens
        removed and surrounding whitespace stripped.
    """
    prompt = f"### Instruction:\n{review_text}\n### Response:\n"

    # Follow the model's own device instead of assuming CUDA, so this also
    # works on the CPU path supported by the model-loading cell.
    device = next(model.parameters()).device
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    model.eval()  # disable LoRA dropout while generating
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=200,  # budget for generated tokens only, not prompt + output
            pad_token_id=tokenizer.eos_token_id,
        )

    # Drop the echoed prompt tokens; keep only the continuation.
    prompt_len = inputs["input_ids"].shape[1]
    return tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()

# Example natural-language requests (these are text-to-SQL prompts, not reviews)
example_prompts = [
    "Find retail_store in Berlin with rating over 4.0.",
    "Find restaurant_company in London.",
    "Show all leaf keys and values for retail_store."
]

# Run predictions and print each request next to the generated SQL
for example_prompt in example_prompts:
    print(f"Prompt: {example_prompt}")
    print(" ")
    print(f"Predicted SQL: {generate_prediction(example_prompt)}")
    print("-" * 80)

In [None]:
# Save the LoRA adapter (cheap) and the base model with the adapter merged in.
# 1) Save adapter only (safest to reload quickly later)
ADAPTER_DIR = OUTPUT_DIR / "lora_adapter"
ADAPTER_DIR.mkdir(parents=True, exist_ok=True)
model.save_pretrained(ADAPTER_DIR)
tokenizer.save_pretrained(ADAPTER_DIR)
print(f"LoRA adapter saved to: {ADAPTER_DIR}")

# 2) Merge LoRA into base model and save merged model (FP16 preferred)
lora_merged = hasattr(model, "merge_and_unload")
if lora_merged:
    print("Merging LoRA weights into base model (this will produce a standard HF model)...")
    merged = model.merge_and_unload()  # returns merged base model
else:
    # Best-effort fallback: keep going, but say clearly that nothing was merged
    # so the saved directory is not mistaken for a merged checkpoint.
    print("⚠️ model has no merge_and_unload(); saving it UNMERGED — "
          "later GGUF conversion expects a merged model.")
    merged = model

# Ensure merged model directory exists and save
MERGED_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
merged.save_pretrained(MERGED_OUTPUT_DIR)
tokenizer.save_pretrained(MERGED_OUTPUT_DIR)
if lora_merged:
    print(f"Merged FP16 model saved to: {MERGED_OUTPUT_DIR}")
else:
    print(f"Unmerged model saved to: {MERGED_OUTPUT_DIR}")


In [None]:
# Ensure nightly tourch version is not replace by older version of torch.
!rm -rf llama.cpp
!git clone --recursive https://github.com/ggml-org/llama.cpp
print('✅ llama.cpp cloned')

# CPU-only configuration (disable CUDA flags)
!cd llama.cpp && cmake -B build -DCMAKE_BUILD_TYPE=Release
print('✅ CMake configuration done (CPU build)')

# Build CPU binaries
!cmake --build llama.cpp/build --config Release -- -j$(nproc)
print('✅ Compilation done')

# Check binaries
!ls -lh llama.cpp/build/bin

# Create symlinks so Unsloth can find them
!ln -sf /workspace/llama.cpp/build/bin/llama-quantize /workspace/llama.cpp/llama-quantize
!ln -sf /workspace/llama.cpp/build/bin/llama-quantize /usr/local/bin/llama-quantize
!ln -sf /workspace/llama.cpp/build/bin/llama-cli /workspace/llama.cpp/llama-cli
!ln -sf /workspace/llama.cpp/build/bin/llama-server /workspace/llama.cpp/llama-server
!pip install mistral-common
print('✅ Symlinks created for Unsloth compatibility')

# Sanity check
!which llama-quantize
!llama.cpp/llama-quantize --help || echo "⚠️ Quantize binary not found — check build output."
print('✅ Symlinks created for Unsloth compatibility')

print('LLama installed')

In [None]:
# Convert to gguf
BASE_MODEL_NAME = BASE_MODEL.split('/')[1]

# !python3 /workspace/llama.cpp/convert_hf_to_gguf.py \
#     /workspace/outputs/merged_model_fp16 \
#     --outfile /workspace/gguf_exports/deepseek-coder-1.3b-base_f16.gguf \
#     --outtype f16

# ls -lash /workspace/gguf_exports/deepseek-coder-1.3b-base_f16.gguf

# Quantize
!python3 /workspace/llama.cpp/build/bin/llama-quantize \
    /workspace/gguf_exports/deepseek-coder-1.3b-base_f16.gguf \
    /workspace/gguf_exports/deepseek-coder-1.3b-base_q4_k_m.gguf \
    Q4_K_M


### Done — summary:
 - Base HF model downloaded and wrapped with LoRA before training.
 - Training performed with Trainer + PEFT; LoRA adapter saved.
 - LoRA weights merged into FP16 base model and saved to merged_model_fp16.
 - GGUF conversion and quantization are attempted with llama.cpp; if the quantize binary is not found, a warning to check the build output is printed.

