In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

# Checkpoint identifier shared by the tokenizer and the base-model load below.
model_id = "mistralai/Mistral-7B-Instruct-v0.3"

# Request the fast tokenizer implementation for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    use_fast=True,
)

In [None]:
# 4-bit NF4 quantization with double quantization; matmuls are computed in
# bfloat16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

# FlashAttention-2 only supports fp16/bf16 activations, so the dtype of the
# non-quantized modules is pinned explicitly and kept equal to the 4-bit
# compute dtype instead of relying on the library default.
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)

In [11]:
from peft import PeftModel

# Directory holding the LoRA adapter to attach to the quantized base model.
adapter_path = "models/mistral-7b-reasoning-lora"

# Wrap the base model with the adapter weights loaded from `adapter_path`.
model = PeftModel.from_pretrained(
    model=base_model,
    model_id=adapter_path,
)

In [None]:
# Fold the LoRA adapter into the base weights and drop the PEFT wrapper.
# The isinstance guard keeps this cell idempotent: once merged, `model` is no
# longer a PeftModel, so re-running the cell skips the merge instead of failing.
# NOTE(review): the base model is loaded in 4-bit; merging into quantized
# weights may introduce small rounding differences — confirm this is acceptable.
if isinstance(model, PeftModel):
    model = model.merge_and_unload()

# Switch to inference mode (disables dropout and similar train-only behavior).
model.eval()

In [37]:
def generate_response(prompt, max_new_tokens=800, temperature=0.6, top_p=0.8):
    """Sample a reply from the notebook-level `model` for a single user prompt.

    Relies on the module-level `tokenizer` and `model` defined in earlier cells.

    Args:
        prompt: Text of the user message; it is wrapped as a one-turn chat.
        max_new_tokens: Upper bound on the number of newly generated tokens.
        temperature: Sampling temperature forwarded to `model.generate`.
        top_p: Nucleus-sampling threshold forwarded to `model.generate`.

    Returns:
        The decoded continuation only (the prompt tokens are sliced off).
        Special tokens are kept in the text because decoding is done with
        `skip_special_tokens=False`.
    """
    messages = [{"role": "user", "content": prompt}]

    # return_dict=True yields both input_ids and attention_mask, so generate()
    # does not have to guess which positions are real tokens.
    # add_generation_prompt=True makes templates that need an explicit
    # assistant header emit it; templates that do not use the flag ignore it.
    encoded = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)

    # Use an explicit pad token, falling back to EOS when the tokenizer
    # defines none, rather than leaving the choice to generate().
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    outputs = model.generate(
        **encoded,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
        pad_token_id=pad_token_id,
    )

    # The returned sequence starts with the prompt tokens; keep only the tail.
    prompt_length = encoded["input_ids"].shape[1]
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=False)

In [None]:
# Decimal-comparison prompt that asks the model to show its reasoning.
question = "What is larger 9.11 or 9.01? Show your thinking and reasoning."

# print() rather than a bare expression so newlines in the reply render as
# line breaks instead of escaped "\n" in the string repr.
response = generate_response(prompt=question)
print(response)