In [None]:
from unsloth import FastLanguageModel
from peft import PeftModel
import torch
# Hyperparameters shared by the loader below and the LoRA setup that follows.
lora_rank = 32         # LoRA rank; a larger rank is described as smarter but slower
max_seq_length = 2048  # can be raised for longer reasoning traces

# Load the base checkpoint and its tokenizer through Unsloth.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen3-1.7B-Base",
    max_seq_length = max_seq_length,
    max_lora_rank = lora_rank,
    load_in_4bit = False,          # False selects 16-bit LoRA
    fast_inference = True,         # turns on vLLM fast inference
    gpu_memory_utilization = 0.9,  # lower this if the GPU runs out of memory
)

In [None]:
# Wrap the base model with LoRA adapters on the attention and MLP projections.
model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank,  # any value > 0 works; suggested 8, 16, 32, 64, 128
    lora_alpha = 2 * lora_rank,  # alpha = 2 * rank is noted to speed up training
    target_modules = [
        # attention projections
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        # feed-forward projections
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    use_gradient_checkpointing = "unsloth",  # lowers memory usage
    random_state = 3407,
)

In [None]:
# Load the IMDB dataset and build small positive-only train/eval subsets.
from datasets import load_dataset

# Upper bounds on subset sizes. A value <= 0 (or one that is not smaller than
# the filtered split) leaves that split untouched.
MAX_TRAIN_SAMPLES = 1000
MAX_EVAL_SAMPLES = 200

ds = load_dataset("imdb")

# Keep only positive samples (label == 1) for SFT, as per the description.
train_pos = ds["train"].filter(lambda x: x["label"] == 1)
test_pos = ds["test"].filter(lambda x: x["label"] == 1)

# Take the first N rows of each filtered split when a cap applies.
if 0 < MAX_TRAIN_SAMPLES < len(train_pos):
    train_pos = train_pos.select(range(MAX_TRAIN_SAMPLES))
if 0 < MAX_EVAL_SAMPLES < len(test_pos):
    test_pos = test_pos.select(range(MAX_EVAL_SAMPLES))

In [None]:
from trl import SFTTrainer, SFTConfig

# Build the supervised fine-tuning trainer over the positive-review subsets.
# No compute_metrics callback is passed: the argument was previously left
# without a value, which is a syntax error and stopped the cell from running.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_pos,
    eval_dataset = test_pos,
    args = SFTConfig(
        dataset_text_field = "text",  # dataset column holding the raw review text
        output_dir = "./sft-imdb-qlora",
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 1,  # raise to mimic a larger batch size
        warmup_steps = 5,
        num_train_epochs = 2,  # two passes over the training subset
        learning_rate = 2e-4,  # reduce to 2e-5 for long training runs
        logging_steps = 100,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "cosine",
        seed = 3407,
        save_steps = 500,
        # Step-based evaluation; eval_steps is not set here, so the interval is
        # whatever TrainingArguments falls back to (documented as logging_steps).
        # "epoch" is the simplest alternative.
        eval_strategy = "steps",
        report_to = "none",  # switch to e.g. "wandb" to enable experiment tracking
    ),
)

In [None]:
# Run fine-tuning with the trainer configured in the previous cell.
# Kept as a bare last-line expression so the notebook displays its return value.
trainer.train()

In [None]:
def save_lora(trainer, tokenizer, output_dir):
    """Write the trained model and its tokenizer to ``output_dir``.

    Args:
        trainer: Object exposing ``save_model(path)``; called first.
        tokenizer: Object exposing ``save_pretrained(path)``; called second.
        output_dir: Destination path handed unchanged to both save calls.

    Returns:
        None. A one-line confirmation naming ``output_dir`` is printed.
    """
    # Model first (presumably just the LoRA adapter for a PEFT-wrapped model,
    # as the confirmation message says), then the tokenizer files beside it.
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    confirmation = "Saved LoRA adapter and tokenizer to: {}".format(output_dir)
    print(confirmation)

# Persist the fine-tuned adapter and tokenizer for the inference cells below.
save_lora(
    trainer=trainer,
    tokenizer=tokenizer,
    output_dir="./sft_imdb_qlora_qwen",
)


In [None]:
from unsloth import FastLanguageModel
from peft import PeftModel
import torch
# Same values as in the first cell, repeated so this cell can run on its own.
lora_rank = 32         # larger rank = smarter, but slower
max_seq_length = 2048  # can increase for longer reasoning traces

def load_lora_for_inference(model_name, max_length, output_dir):
    """Reload a base model and attach a saved LoRA adapter in eval mode.

    Args:
        model_name: Base checkpoint identifier passed to
            ``FastLanguageModel.from_pretrained``.
        max_length: Value forwarded as ``max_seq_length`` to the base loader.
        output_dir: Directory holding the saved adapter, passed to
            ``PeftModel.from_pretrained``.

    Returns:
        Tuple ``(peft_model, tokenizer)``: the adapter-wrapped model after
        ``.eval()`` has been called on it, and the base loader's tokenizer.
    """
    # Loader settings are fixed here; 4-bit loading is noted as a possible
    # alternative for inference but is not enabled.
    loader_options = {
        "model_name": model_name,
        "max_seq_length": max_length,
        "load_in_4bit": False,
        "fast_inference": True,
        "gpu_memory_utilization": 0.9,
    }
    base_model, tokenizer = FastLanguageModel.from_pretrained(**loader_options)

    # The tokenizer's pad token and padding side are left at their defaults.

    # Wrap the fresh base model with the adapter weights stored in output_dir.
    adapted_model = PeftModel.from_pretrained(base_model, output_dir)
    adapted_model.eval()
    return adapted_model, tokenizer


# Reload the base checkpoint and attach the saved adapter for inference.
inf_model, inf_tokenizer = load_lora_for_inference(
    output_dir="./sft_imdb_qlora_qwen",
    model_name="unsloth/Qwen3-1.7B-Base",
    max_length=1024,
)

In [None]:
import random
@torch.inference_mode()
def demo_generation(model, tokenizer, eval_ds, max_new_tokens=64, max_length=1024, demo_n=4):
    """Print sampled continuations for a few randomly chosen reviews.

    Args:
        model: Object exposing ``generate(**inputs, ...)`` and ``.device``.
        tokenizer: Callable that encodes a prompt and offers ``decode``.
        eval_ds: Indexable collection of records with a ``"text"`` field.
        max_new_tokens: Generation budget forwarded to ``model.generate``.
        max_length: Truncation length used when encoding the prompt.
        demo_n: Number of reviews to sample, capped at ``len(eval_ds)``.

    Returns:
        None. Each decoded sequence (prompt plus continuation) is printed.
    """
    print("\n===== Demo: Generations on 3-4 reviews =====")

    # Draw distinct row indices at random, never more than the dataset holds.
    available = len(eval_ds)
    chosen_rows = random.sample(range(available), min(demo_n, available))

    # Sampling settings are identical for every prompt.
    sampling_options = dict(
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=0.9,
        temperature=0.8,
    )

    sample_number = 0
    for row in chosen_rows:
        sample_number += 1
        # Only the first 400 characters of the review serve as the prompt.
        prompt = eval_ds[row]["text"][:400]
        encoded = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_length)
        encoded = encoded.to(model.device)
        generated = model.generate(**encoded, **sampling_options)
        # Decoding the whole first sequence shows the prompt and its continuation.
        decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
        print(f"\n--- Sample {sample_number} ---")
        print(decoded)

    print("\n=====================\n")

In [None]:
# Show a handful of sampled continuations from the reloaded model.
demo_generation(
    model=inf_model,
    tokenizer=inf_tokenizer,
    eval_ds=test_pos,
)


In [None]:
@torch.inference_mode()
def batch_generate_only_new(model, tokenizer, prompts, max_new_tokens=128, **gen_kwargs):
    """Generate continuations for a batch of prompts and return only the new text.

    Args:
        model: Object exposing ``parameters()`` and ``generate(**inputs, ...)``.
        tokenizer: Tokenizer used to encode the prompts and decode the outputs.
            If it has no pad token, its EOS token is assigned as the pad token.
        prompts: A list of prompt strings (a single string is treated as a
            one-element batch).
        max_new_tokens: Maximum number of tokens to generate per prompt.
        **gen_kwargs: Extra arguments for ``model.generate``. They override the
            sampling defaults used here (``do_sample=True``, ``top_p=0.9``,
            ``temperature=0.8``, and the tokenizer's EOS/pad ids).
            ``return_dict_in_generate`` is always forced to ``True``.

    Returns:
        A list of decoded continuations, one per prompt and in the same order,
        with the prompt tokens and special tokens removed. Empty input yields
        an empty list.
    """
    if isinstance(prompts, str):
        prompts = [prompts]
    prompts = list(prompts)
    if not prompts:
        return []

    device = next(model.parameters()).device

    # Batch padding needs a pad token; fall back to EOS when none is defined.
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Pad on the left so every prompt ends at the last input position, which is
    # where generation continues from. The caller's setting is restored after.
    previous_side = tokenizer.padding_side
    tokenizer.padding_side = "left"
    try:
        enc = tokenizer(
            prompts, return_tensors="pt", padding=True, truncation=True
        ).to(device)
    finally:
        tokenizer.padding_side = previous_side

    # Each returned row starts with the full padded prompt, so the new tokens
    # begin at the padded width for every row, not at the row's unpadded length.
    prompt_width = enc["input_ids"].shape[1]

    # Build the defaults first and let caller overrides win, so a repeated
    # keyword no longer raises TypeError.
    generate_args = {
        "max_new_tokens": max_new_tokens,
        "do_sample": True,
        "top_p": 0.9,
        "temperature": 0.8,
        "eos_token_id": tokenizer.eos_token_id,
        "pad_token_id": tokenizer.pad_token_id,
    }
    generate_args.update(gen_kwargs)
    generate_args["return_dict_in_generate"] = True  # .sequences is read below

    out = model.generate(**enc, **generate_args)

    return [
        tokenizer.decode(seq[prompt_width:], skip_special_tokens=True)
        for seq in out.sequences
    ]

In [None]:
# Free-form prompts to continue with the fine-tuned model.
prompts = [
    "The movie was so bad, and i hate when",
    "The movie was",
    "Finish this thought about the acting quality:",
]

# Generate all continuations in one batch, then print them numbered from 1.
completions = batch_generate_only_new(
    inf_model,
    inf_tokenizer,
    prompts,
    max_new_tokens=64,
)
for i, t in enumerate(completions, start=1):
    print("=== {} ===\n{}\n".format(i, t))