In [None]:
# Gerekli kütüphanelerin kurulumu
%%capture
import os, re

if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth faiss-gpu sentence-transformers
else:
    import torch
    v = re.match(r"[0-9]+\.[0-9]+", str(torch.__version__)).group(0)
    xformers = "xformers==" + (
        "0.0.33.post1" if v=="2.9"
        else "0.0.32.post2" if v=="2.8"
        else "0.0.29.post3"
    )

    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets==4.3.0 huggingface_hub>=0.34.0 hf_transfer
    !pip install --no-deps unsloth
    !pip install transformers==4.56.2 trl==0.22.2
    !pip install faiss-gpu sentence-transformers

In [None]:
# Load the Qwen3-8B model and its tokenizer through Unsloth
from unsloth import FastLanguageModel
import torch

# Loading options, kept as module-level names so later cells can reuse them.
max_seq_length, dtype, load_in_4bit = 2048, None, True

# Collect the keyword arguments first, then hand them over in one call.
_load_kwargs = dict(
    model_name = "unsloth/Qwen3-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,  # NOTE(review): None presumably means "auto-detect" - confirm in the Unsloth docs
    load_in_4bit = load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**_load_kwargs)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.12.8: Fast Qwen3 patching. Transformers: 4.56.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.59G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/1.56G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

In [None]:
# LoRA configuration - training
# Projection layers that receive LoRA adapters (attention first, then MLP).
_lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the loaded model with the adapters; `model` is rebound to the PEFT model.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = _lora_target_modules,
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
)

Unsloth 2025.12.8 patched 40 layers with 40 QKV layers, 40 O layers and 40 MLP layers.


In [None]:
# Embedding model used for both document and query vectors
from sentence_transformers import SentenceTransformer

# Multilingual paraphrase model published under the sentence-transformers
# organisation. It needs no "query:" / "passage:" prefixes, so documents and
# questions can be encoded as plain text.
# NOTE(review): the previous id "paraphrase-multilingual-e5-small" does not
# appear to exist on the Hugging Face Hub - confirm.
embedding_model = SentenceTransformer(
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
)

In [None]:
# Knowledge base for retrieval: one sentence per entry, in recipe order.
# The position of each entry is the id that the vector index returns.
documents = [
    "Afyon bükmesi Afyonkarahisar yöresine ait geleneksel bir hamur işidir.",
    "Afyon bükmesi için un, su, tuz ve haşhaş kullanılır.",
    "Hamur yoğrulur, bezelere ayrılır ve ince şekilde açılır.",
    "Haşhaşlı karışım hamura sürülür ve sac üzerinde pişirilir.",
    "Pişen bükmeler isteğe göre tereyağı ile servis edilir.",
]

In [None]:
# Build the FAISS vector database
import faiss
import numpy as np

# Encode every document once; the vectors come back L2-normalised.
doc_embeddings = embedding_model.encode(
    documents,
    normalize_embeddings=True,
    convert_to_numpy=True,
)

# Width of a single embedding vector (last axis of the matrix).
dimension = doc_embeddings.shape[-1]

# Flat inner-product index; on normalised vectors the inner product is the
# cosine similarity.
index = faiss.IndexFlatIP(dimension)
index.add(doc_embeddings)

In [None]:
# Question -> document retrieval
def retrieve_context(question, k=3):
    """Return the documents most similar to ``question`` as one string.

    The question is embedded with the module-level ``embedding_model`` and
    looked up in the module-level FAISS ``index``; the matching entries of
    ``documents`` are joined with newlines in the order the index returns them.

    Args:
        question: Question text to search for.
        k: Maximum number of documents to return. Values larger than the
            number of indexed vectors are clamped.

    Returns:
        The retrieved documents separated by "\\n", or an empty string when
        ``k`` is not positive or the index is empty.
    """
    # FAISS pads missing neighbours with -1, and documents[-1] would silently
    # return the last document, so never request more than the index holds.
    k = min(k, index.ntotal)
    if k <= 0:
        return ""

    q_emb = embedding_model.encode(
        [question],
        convert_to_numpy=True,
        normalize_embeddings=True
    )
    _, indices = index.search(q_emb, k)

    # Defensive filter: drop any id that does not map to a real document.
    return "\n".join(
        documents[i] for i in indices[0] if 0 <= i < len(documents)
    )

In [None]:
# Alpaca-style prompt template
# Placeholders filled through str.format: {instruction}, {context}, {question}.
# The template ends right after the "### Response:" header line.
rag_prompt = (
    "Below is an instruction that describes a task.\n"
    "Use ONLY the given context to answer the question.\n"
    "\n"
    "### Instruction:\n"
    "{instruction}\n"
    "\n"
    "### Context:\n"
    "{context}\n"
    "\n"
    "### Question:\n"
    "{question}\n"
    "\n"
    "### Response:\n"
)

Map:   0%|          | 0/1080 [00:00<?, ? examples/s]

In [None]:
# Run one retrieval-augmented generation round trip.
# Switch the model to Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

question = "Afyon bükmesi nasıl yapılır?"

# Fetch the documents that best match the question.
context = retrieve_context(question)

# Fill the prompt template with the instruction, retrieved context and question.
prompt = rag_prompt.format(
    instruction="Kullanıcının sorduğu yemeğin tarifini ver.",
    context=context,
    question=question
)

inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

outputs = model.generate(
    **inputs,
    max_new_tokens=512,
    # temperature / top_p only take effect when sampling is enabled; set it
    # explicitly instead of relying on the checkpoint's default generation config.
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    use_cache=True
)

# The decoded sequence contains the prompt followed by the generated answer.
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/1080 [00:00<?, ? examples/s]

🦥 Unsloth: Padding-free auto-enabled, enabling faster training.


In [None]:
# GGUF export / upload. Flip an `if False` to `if True` to enable that variant.
import os

# Read the Hugging Face token from the environment so it is never stored in
# the notebook. Create one at https://huggingface.co/settings/tokens
hf_token = os.environ.get("HF_TOKEN", "")

# Save to 8bit Q8_0
if False: model.save_pretrained_gguf("model", tokenizer,)
# And change hf to your username!
if False: model.push_to_hub_gguf("hf/model", tokenizer, token = hf_token)

# Save to 16bit GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "f16", token = hf_token)

# Save to q4_k_m GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
# This is the only upload that is enabled. Without a token the upload cannot
# succeed, so skip it with a message instead of failing the whole run.
if hf_token:
    model.push_to_hub_gguf("model", tokenizer, quantization_method = "q4_k_m", token = hf_token)
else:
    print("HF_TOKEN is not set; skipping the q4_k_m GGUF upload.")

# Save to multiple GGUF options - much faster if you want multiple!
if False:
    model.push_to_hub_gguf(
        "hf/model", # Change hf to your username!
        tokenizer,
        quantization_method = ["q4_k_m", "q8_0", "q5_k_m",],
        token = hf_token,
    )