In [None]:
import html

from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
# Device string used when loading the model and when moving the tokenized inputs.
# Adjust to "cpu", "cuda", etc. as needed (the author reports testing cpu, cuda and mps).
DEVICE = "mps"

In [None]:
# Load the base instruct checkpoint and place it on the configured device.
base = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-3B-Instruct",
    device_map=DEVICE,
)

In [None]:
# Directory holding both the fine-tuned PEFT adapter and its tokenizer files.
# Kept in one constant so the adapter and tokenizer are always loaded from the same place.
ADAPTER_DIR = "models/finetuned_smaller"

# Wrap the base model with the fine-tuned adapter and make sure the result lives on DEVICE.
model = PeftModel.from_pretrained(base, ADAPTER_DIR).to(DEVICE)

# Load the tokenizer saved alongside the adapter (fast implementation requested).
tokenizer = AutoTokenizer.from_pretrained(ADAPTER_DIR, use_fast=True)

In [None]:
# Prompt template for a "write the next paragraph" task; the `{}` slot receives the
# source text via str.format.
# NOTE: the following cell assigns `test_prompt_style` again, so on a top-to-bottom
# run this template is overwritten before it is used.
test_prompt_style = """Below is an instruction that describes a task, paired with an input that provides further context.
Write a response that appropriately completes the request.

### Instruction:
You are a scientist with advanced knowledge in philosophy.
Write the next paragraph for the following text.

### Text:
{}

### Response:
"""

In [None]:
# Question-answering prompt template; the `{}` slot receives the question via str.format.
# This assignment replaces any value `test_prompt_style` held before this cell ran.
# The persona line previously read "philosohy"; the typo is corrected because this text
# is sent verbatim to the model.
test_prompt_style = """
You are a professor of philosophy.
You are helpful, honest, and harmless.
Below is a question. Write an answer.

### Question:
{}

### Response:
"""

In [None]:
# The question that gets substituted into the prompt template's `{}` slot.
question = "What is moral reasoning?"

In [None]:
# Plain transformers TextStreamer bound to the tokenizer.
# NOTE: the following cell rebinds `streamer` to a NotebookStreamer, so on a
# top-to-bottom run this instance is replaced before generation starts.
streamer = TextStreamer(tokenizer)

In [None]:
from IPython.display import display 
import ipywidgets as widgets

class NotebookStreamer(TextStreamer):
    """TextStreamer that renders streamed text in an ipywidgets HTML widget.

    Creating an instance immediately displays an (initially empty) widget; each
    chunk of finalized text is appended to an internal buffer and the widget is
    re-rendered with the full buffer wrapped in a ``<pre>`` element.

    Args:
        tokenizer: Tokenizer forwarded to ``TextStreamer``.
        **kwargs: Extra keyword arguments forwarded to ``TextStreamer``.
            ``skip_prompt`` defaults to ``True`` here but may be overridden.
    """

    # Single definition of the wrapper markup so the initial widget and every
    # later refresh cannot drift apart.
    _TEMPLATE = (
        "<pre style='white-space: pre-wrap; word-break: break-word; "
        "font-family: monospace'>{}</pre>"
    )

    def __init__(self, tokenizer, **kwargs):
        # Default to hiding the prompt, but let callers pass skip_prompt explicitly
        # instead of failing with a duplicate-keyword TypeError.
        kwargs.setdefault("skip_prompt", True)
        super().__init__(tokenizer, **kwargs)
        self.output = widgets.HTML(self._TEMPLATE.format(""))
        display(self.output)
        self.buffer = ""

    def on_finalized_text(self, text, stream_end=False):
        """Append ``text`` to the buffer and refresh the widget.

        Args:
            text: Newly finalized chunk of decoded text.
            stream_end: Accepted for interface compatibility; not used here.
        """
        self.buffer += text
        # Escape the generated text: raw '<', '>' or '&' in model output would
        # otherwise be interpreted as markup and break or hide part of the display.
        self.output.value = self._TEMPLATE.format(html.escape(self.buffer))

# Rebind `streamer` to the widget-based streamer. Constructing it displays the
# (initially empty) output widget under this cell.
streamer = NotebookStreamer(tokenizer)

In [None]:
# Build the prompt from the template and the question.
# No EOS token is appended: a trailing EOS marks the sequence as finished, which is
# the opposite of asking the model to continue after "### Response:".
prompt = test_prompt_style.format(question)
inputs = tokenizer([prompt], return_tensors="pt").to(DEVICE)

# Number of prompt tokens; used below to separate the prompt from the continuation.
prompt_length = inputs.input_ids.shape[1]

outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=1200,
    eos_token_id=tokenizer.eos_token_id,
    use_cache=True,
    do_sample=True,           # sample instead of greedy decoding
    temperature=0.7,          # randomness control
    top_k=50,                 # restrict sampling to the 50 most likely tokens
    top_p=0.9,                # nucleus sampling threshold (applied together with top_k)
    repetition_penalty=1.2,   # discourage loops and repeats
    streamer=streamer,
)

# Full decoded sequences (prompt + continuation), kept as a list as before.
response = tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Decode only the newly generated tokens. Slicing by prompt length is more robust than
# splitting on "### Response:", which would truncate the answer if the model emitted
# that marker again inside its output.
answer = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
print(answer)