# Qwen2.5 Quick Chat (HF Local)

Run the first cell once to load the model, then edit `question` in the last cell and run it.

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Checkpoint to load, the dtype its weights are loaded in, and the cap on
# generated tokens per answer.
MODEL_ID = "Qwen/Qwen2.5-3B-Instruct"
DTYPE = torch.float32
MAX_NEW_TOKENS = 96

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=DTYPE)

# Prefer MPS when available, fallback to CPU if runtime does not support it.
device = "cpu"
if torch.backends.mps.is_available():
    try:
        model = model.to("mps")
        device = "mps"
    except Exception as exc:
        # Best-effort fallback, but say why: a silent fallback makes an
        # unexpectedly slow run very hard to diagnose.
        print(f"Could not move model to MPS ({exc!r}); falling back to CPU.")
        model = model.to("cpu")
else:
    model = model.to("cpu")

# Switch to inference behaviour (e.g. disables dropout layers).
model.eval()

def ask(question: str) -> str:
    """Return the model's reply to a single user question.

    The question is wrapped in a two-message chat (a fixed system prompt
    plus the user turn), rendered with the tokenizer's chat template, and
    completed with greedy decoding for at most ``MAX_NEW_TOKENS`` tokens.

    Args:
        question: The user's message, inserted verbatim as the user turn.

    Returns:
        The decoded text of the newly generated tokens only (the prompt is
        sliced off), with special tokens removed and surrounding
        whitespace stripped.
    """
    messages = [
        {"role": "system", "content": "You are a concise assistant. Keep answers short and direct."},
        {"role": "user", "content": question},
    ]

    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    # The rendered template already carries the special tokens the model
    # expects. Tokenizing it with default settings can prepend/append them a
    # second time on tokenizers that add BOS/EOS automatically, so disable
    # automatic special-token insertion here.
    inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
    # Move every input tensor to wherever the model currently lives.
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.inference_mode():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=False,  # greedy decoding
            repetition_penalty=1.1,
            no_repeat_ngram_size=4,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Drop the first prompt-length positions of the first sequence so that
    # only the continuation is decoded.
    prompt_len = inputs["input_ids"].shape[1]
    new_tokens = output_ids[0][prompt_len:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

# Report the checkpoint that was actually loaded (taken from MODEL_ID rather
# than a hard-coded model name) and the device it ended up on.
print(f"{MODEL_ID} ready on device: {device}")

In [None]:
# Prompt sent to the model; change the string and re-run this cell.
question = "Who are you?"  # <-- edit this
answer = ask(question)

# Question line followed by a blank line, then the "A:" header with the
# reply on the next line.
print("Q:", question, end="\n\n")
print("A:", answer, sep="\n")